Code Example #1
def update_articles():
    articles = Articles()
    all_urls = get_all_article_urls(articles)
    log.info('Got all %d urls; storing to database', len(all_urls))
    for i, url in enumerate(all_urls):
        log.debug('Woo: %d/%d is %s', i + 1, len(all_urls), url)
        parsed_article = load_article(url)
        # load_article() returns None when a page cannot be parsed;
        # skip it rather than storing a bad entry.
        if parsed_article is None:
            continue
        articles.save_entry(parsed_article, url)
Code Example #2
import falcon


class ArticleResource:
    def __init__(self):
        self.articles = Articles()

    def on_get(self, req, resp, news_id):
        # A malformed query string is the client's mistake: surface it
        # as a 400 rather than letting it bubble up as a 500.
        try:
            resp.body = self.articles.load_article_history(
                news_id, ArticlesParams.from_req(req))
        except ValueError as e:
            raise falcon.HTTPBadRequest('bad request',
                                        'invalid query: ' + str(e))
Code Example #3
import falcon


class SearchResource:
    def __init__(self):
        self.articles = Articles()

    def on_get(self, req, resp):
        # Same validation pattern as above: a bad query becomes a 400.
        try:
            resp.body = self.articles.load_modified_news(
                SearchParams.from_req(req))
        except ValueError as e:
            raise falcon.HTTPBadRequest('bad request',
                                        'invalid query: ' + str(e))
Code Example #4
import falcon


class NewsResource:
    def __init__(self):
        self.articles = Articles()

    def on_get(self, req, resp, publisher_code=None):
        # publisher_code is optional so one resource can serve both the
        # all-publishers route and the per-publisher route.
        try:
            resp.body = self.articles.load_modified_news(
                NewsParams.from_req(req, publisher_code))
        except ValueError as e:
            raise falcon.HTTPBadRequest('bad request',
                                        'invalid query: ' + str(e))
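
For context, resources like these are mounted on a Falcon application with add_route. A minimal wiring sketch follows; the URI templates are assumptions for illustration, only the resource classes come from the examples above.

import falcon

# Hypothetical route templates; adjust to the real URL scheme.
app = falcon.API()
app.add_route('/articles/{news_id}', ArticleResource())
app.add_route('/search', SearchResource())
app.add_route('/news', NewsResource())
app.add_route('/news/{publisher_code}', NewsResource())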
Code Example #5
import asyncio


@asyncio.coroutine
def crawl_all():
    articles = Articles()
    visited = set()
    # Kick off every parser's feed fetch concurrently.
    coroutines = [parser.feed_urls() for parser in parsers]
    for coroutine in asyncio.as_completed(coroutines):
        urls = list(map(canonicalize_url, (yield from coroutine)))
        if len(urls) < 1:
            continue
        # Each feed yields URLs from a single domain, so the first URL
        # identifies the parser for the whole batch.
        parser = get_parser(urls[0])
        log.info('Got {} URLs for {}'.format(len(urls), parser.domain))
        to_get = [parser(x).parse() for x in urls if x not in visited]
        visited = visited.union(urls)
        for get_page in asyncio.as_completed(to_get):
            try:
                page = yield from get_page
                articles.save_entry(page)
            except Exception as e:
                log.error(e)
    # Refresh previously stored URLs that no feed mentioned this run.
    urls = get_existing_urls(articles)
    to_get = [get_parser(x)(x).parse() for x in urls if x not in visited]
    log.info('updating {} existing unvisited URLs'.format(len(to_get)))
    for get_page in asyncio.as_completed(to_get):
        try:
            page = yield from get_page
            articles.save_entry(page)
        except Exception as e:
            log.error(e)
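
crawl_all is written as an old-style generator coroutine (yield from), so a caller drives it to completion on an event loop. A minimal sketch, assuming it runs as a standalone job:

import asyncio

# Run the generator-based coroutine to completion, then clean up.
loop = asyncio.get_event_loop()
loop.run_until_complete(crawl_all())
loop.close()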
Code Example #6
def __init__(self):
    self.articles = Articles()
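
This constructor is repeated verbatim in every resource class above. If the aim is to remove that duplication, one minimal sketch is to lift it into a shared parent; the base-class name here is hypothetical.

class ArticlesResourceBase:
    # Hypothetical base class: gives every subclass its own Articles
    # handle without restating __init__.
    def __init__(self):
        self.articles = Articles()


class SearchResource(ArticlesResourceBase):
    def on_get(self, req, resp):
        ...  # body unchanged from Code Example #3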