def update_articles():
    """Fetch every known article URL and store a fresh parse of each.

    URLs come from ``get_all_article_urls``; pages that fail to parse
    (``load_article`` returns ``None``) are skipped, everything else is
    persisted via ``Articles.save_entry``.
    """
    articles = Articles()
    all_urls = get_all_article_urls(articles)
    total = len(all_urls)  # hoisted: reused in every log call below
    # Lazy %-style logging args: the message is only formatted when the
    # log level is actually enabled (was eager `'...' % len(all_urls)`).
    log.info('Got all %s urls; storing to database', total)
    for i, url in enumerate(all_urls):
        log.debug('Woo: %d/%d is %s', i + 1, total, url)
        parsed_article = load_article(url)
        if parsed_article is None:
            # Unparseable page: skip rather than store a bad entry.
            continue
        articles.save_entry(parsed_article, url)
class ArticleResource:
    """Falcon resource exposing the stored history of one article."""

    def __init__(self):
        """Create the backing article store."""
        self.articles = Articles()

    def on_get(self, req, resp, news_id):
        """GET handler: respond with the history for *news_id*.

        A malformed query string surfaces as HTTP 400 Bad Request.
        """
        try:
            params = ArticlesParams.from_req(req)
            body = self.articles.load_article_history(news_id, params)
        except ValueError as e:
            raise falcon.HTTPBadRequest('bad request', 'invalid query: ' + str(e))
        resp.body = body
class SearchResource:
    """Falcon resource answering search queries over modified news."""

    def __init__(self):
        """Create the backing article store."""
        self.articles = Articles()

    def on_get(self, req, resp):
        """GET handler: respond with news matching the search params.

        A malformed query string surfaces as HTTP 400 Bad Request.
        """
        try:
            params = SearchParams.from_req(req)
            body = self.articles.load_modified_news(params)
        except ValueError as e:
            raise falcon.HTTPBadRequest('bad request', 'invalid query: ' + str(e))
        resp.body = body
class NewsResource:
    """Falcon resource listing modified news, optionally per publisher."""

    def __init__(self):
        """Create the backing article store."""
        self.articles = Articles()

    def on_get(self, req, resp, publisher_code=None):
        """GET handler: respond with modified news for *publisher_code*
        (or all publishers when it is ``None``).

        A malformed query string surfaces as HTTP 400 Bad Request.
        """
        try:
            params = NewsParams.from_req(req, publisher_code)
            body = self.articles.load_modified_news(params)
        except ValueError as e:
            raise falcon.HTTPBadRequest('bad request', 'invalid query: ' + str(e))
        resp.body = body
def crawl_all():
    """Crawl every configured feed, then refresh already-stored URLs.

    Generator-based coroutine (``yield from`` style).  Phase one fans out
    over all parsers' feeds as they complete, parsing and saving each
    not-yet-visited URL.  Phase two re-parses URLs already in the database
    that phase one did not touch.  Individual page failures are logged and
    skipped so one bad page cannot abort the whole crawl.
    """
    articles = Articles()
    visited = set()
    coroutines = [parser.feed_urls() for parser in parsers]
    for coroutine in asyncio.as_completed(coroutines):
        urls = list(map(canonicalize_url, (yield from coroutine)))
        if not urls:  # empty feed — nothing to do (was `len(urls) < 1`)
            continue
        # All URLs from one feed share a domain, so the first URL
        # identifies the parser for the whole batch.
        parser = get_parser(urls[0])
        log.info('Got {} URLs for {}'.format(len(urls), parser.domain))
        to_get = [parser(x).parse() for x in urls if x not in visited]
        # In-place update; `visited = visited.union(urls)` rebuilt the
        # entire set on every feed (accidentally quadratic overall).
        visited.update(urls)
        for get_page in asyncio.as_completed(to_get):
            try:
                page = yield from get_page
                articles.save_entry(page)
            except Exception as e:
                # Best-effort crawl: record the failure, keep going.
                log.error(e)
    # Phase two: refresh DB entries that no feed listed this run.
    urls = get_existing_urls(articles)
    to_get = [get_parser(x)(x).parse() for x in urls if x not in visited]
    log.info("updating {} existing unvisited URLs".format(len(to_get)))
    for get_page in asyncio.as_completed(to_get):
        try:
            page = yield from get_page
            articles.save_entry(page)
        except Exception as e:
            log.error(e)
def __init__(self):
    """Bind a fresh ``Articles`` store to this instance."""
    self.articles = Articles()