Example #1
def crawl_articles():
    """Crawls and saves new articles in the DB."""
    rss_target_keys = RssTargetModel.all(
        RssTargetModel.query(RssTargetModel.enabled == True), keys_only=True)
    count = len(rss_target_keys)
    logging.info("Starting crawling with %d targets.", count)
    for rss_target_key in rss_target_keys:
        _crawl_articles(key_to_urlsafe(rss_target_key))
    return {"targets": count}
Example #2
def _crawl_articles(rss_target_usafe):
    """Crawls one RSS target, saves its new articles, and pairs each of them."""
    rss_target = RssTargetModel.get(rss_target_usafe)
    rss_crawler = RssCrawler([rss_target], limit=ARTICLES_PER_TARGET)
    articles_dict = rss_crawler.crawl_targets()
    articles = sum(articles_dict.values(), [])
    count = len(articles)
    logging.info("Saving %d articles into DB.", count)
    article_keys = ArticleModel.put_multi(articles)
    for article_key in article_keys:
        pair_article(key_to_urlsafe(article_key))
    return {"articles": count}
Example #3
def clean_articles():
    """Cleans all outdated articles."""
    delta = datetime.timedelta(days=ARTICLES_MAX_AGE)
    min_date = datetime.datetime.utcnow() - delta
    logging.info("Collecting articles older than %s for removal...", min_date)

    # Datastore query filters require an equality comparison against None;
    # `is None` would not build a filter node.
    query_filters_list = [
        (ArticleModel.published < min_date, ),
        (ArticleModel.published == None, ArticleModel.created_at < min_date),
    ]
    article_keys = _get_matching_keys(*query_filters_list, model=ArticleModel)

    for article_key in article_keys:
        _clean_article_biases(key_to_urlsafe(article_key))

    articles_count = len(article_keys)
    logging.info("Removing %d articles.", articles_count)
    ArticleModel.remove_multi(article_keys)
    return {"articles": articles_count}
Example #4
    def get_related_articles(main_article_key, meta_func=None):
        """Returns a list of unique related articles given a main one and looking over
        all the biased pairs containing it.

        Args:
            main_article_key (Key): The queried main article's key.
            meta_func (callable): If provided, each entry's `meta` field will be
                the return value of calling `meta_func` on that pair of articles.
        Returns:
            dict_values: Dictionaries containing the related article plus the
                pair's `created_at`, `score` and `meta` values.
        """
        related_articles = {}

        complementary = {"left": "right", "right": "left"}
        for side in complementary:
            side_field = getattr(BiasPairModel, side)
            query = BiasPairModel.query(side_field == main_article_key)
            pairs = list(query.fetch())

            for pair in pairs:
                # Keep unique related articles only (choose the newest one if
                # duplicates are found).
                article_key = getattr(pair, complementary[side])
                usafe = key_to_urlsafe(article_key)
                seen_date = related_articles.get(usafe, {}).get("created_at")
                if seen_date and pair.created_at <= seen_date:
                    continue

                meta = meta_func(pair) if meta_func else None
                related_article = {
                    "created_at": pair.created_at,
                    "score": pair.score,
                    "meta": meta,
                }
                if not seen_date:
                    article = article_key.get()
                    related_articles[usafe] = {"article": article}
                related_articles[usafe].update(related_article)

        return related_articles.values()
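A possible call site, shown as a plain call (in the project this is a method on a model class); the `agreement` attribute read by the meta_func is purely illustrative:

related = get_related_articles(
    main_article_key,
    meta_func=lambda pair: {"agreement": getattr(pair, "agreement", None)},
)
for entry in related:
    print(entry["article"], entry["score"], entry["meta"])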
Example #5
    def post(self):
        data = request.get_json()
        link = data.get("link", "").strip() if data else None
        if not link:
            abort(400, message="Article 'link' not supplied.")

        link = strip_article_link(link)
        try:
            site, site_info = ArticleModel.get_site_info(link)
        except Exception as exc:
            abort(403, message=base.exc_to_str(exc))

        article = _extract_article(link, site, site_info)
        article_key = article.put()
        article_usafe = key_to_urlsafe(article_key)
        try:
            _pair_article(article_usafe)
        except Exception as exc:
            logging.exception(
                "Couldn't pair article with urlsafe '%s' due to: %s",
                article_usafe, exc)
        return self._make_response("article", article)
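strip_article_link is assumed to normalize the submitted URL. A minimal sketch that drops the query string and fragment (the real helper may keep some parameters):

from urllib.parse import urlsplit, urlunsplit

def strip_article_link(link):
    """Normalizes an article URL by dropping its query string and fragment."""
    parts = urlsplit(link)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))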
Example #6
def crawl_articles(args):
    """Crawls and possibly saves articles into DB."""
    if args.target:
        query = RssTargetModel.source_name == args.target
    else:
        query = RssTargetModel.enabled == True
    rss_query = RssTargetModel.query(query)
    rss_targets = RssTargetModel.all(rss_query, order=False)
    logging.info(
        "Crawling %d targets into Datastore: %s", len(rss_targets), DATASTORE_NAMESPACE
    )

    for rss_target in rss_targets:
        rss_crawler = RssCrawler([rss_target], limit=args.limit)
        articles_dict = rss_crawler.crawl_targets()
        articles = sum(articles_dict.values(), [])
        for article in articles:
            print(json.dumps(article.to_dict(), indent=4, default=_json_serializer))
        if args.save:
            logging.info("Saving these %d shown article(s).", len(articles))
            article_keys = ArticleModel.put_multi(articles)
            for article_key in article_keys:
                pair_article(key_to_urlsafe(article_key))
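The args namespace this CLI entry point reads (target, limit, save) suggests argparse wiring along these lines; the flag names and defaults are assumptions:

import argparse

parser = argparse.ArgumentParser(description="Crawl RSS targets for articles.")
parser.add_argument("--target", help="crawl a single RSS source by name")
parser.add_argument("--limit", type=int, default=None,
                    help="maximum number of articles per target")
parser.add_argument("--save", action="store_true",
                    help="persist the shown articles into the DB")
crawl_articles(parser.parse_args())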