Esempio n. 1
0
def tokenize_article(articleid=None):
    """Print the sentences SpacyParser extracts from one stored article.

    When ``articleid`` is given, the ``Article`` collection is queried for
    documents whose ``article_id`` equals ``int(articleid)``; otherwise every
    article whose ``body`` is not the empty string is a candidate. One
    document is picked at random from the matches, its body is split into
    sentences, and each sentence is printed followed by a rule of dashes.

    Args:
        articleid: Article identifier, or ``None`` to sample from all
            articles with a body. The value is passed through ``int()``, so
            anything ``int()`` rejects raises from there.

    Returns:
        None. All output goes to stdout. If no document matches, a message
        is printed and the function returns early.
    """
    if articleid is not None:
        articles = mongo.db.Article.find({"article_id": int(articleid)})
    else:
        print("grabbing random article...")
        articles = mongo.db.Article.find({"body": {"$ne": ""}})

    # Count once and reuse the value: each count() call queries the server
    # again, and a single snapshot keeps the emptiness check and the random
    # upper bound consistent with each other.
    total = articles.count()
    if not total:
        if articleid is not None:
            print("Article id {} not found".format(articleid))
        else:
            print("No articles found")
        return

    article = articles[random.randint(0, total - 1)]

    # Imported at call time rather than at module load.
    from spacyparser import SpacyParser

    parser = SpacyParser()
    sentences = parser.sentences(article["body"])

    separator = "-" * 80
    print("Processing with {}".format(parser.__class__.__name__))
    print("Sentences for Article {}".format(article["article_id"]))
    print(separator)
    for sentence in sentences:
        print(sentence)
        print(separator)
Esempio n. 2
0
def process(articles, query_db=True, update_all=False):
    """Summarize each article and insert or refresh its SummaryReview document.

    For every article the body is sanitized, split into sentences with
    ``SpacyParser`` and summarized via ``summarize(..., count=3)``. The
    result of ``summary_indices`` is stored under ``summary["Bot"]``.

    Per article, one of the following happens:

    * No usable body (neither on the article nor on the matching ``Article``
      document): the article is skipped and counted as invalid.
    * No existing review: a new ``SummaryReview`` document is inserted.
    * The stored sentences differ from the new ones, or ``update_all`` is
      true: ``invalid`` is reset to ``[]``, ``summary`` is reset to only the
      ``Bot`` entry, ``sentences`` is replaced, ``updated_at`` is stamped and
      ``tokens_valid`` is set to ``False``.
    * Otherwise only ``summary["Bot"]`` is refreshed.

    Args:
        articles: Iterable of mappings providing ``article_id``, ``headline``
            and ``url``, and optionally ``body``. When ``query_db`` is false
            each mapping is itself used (and mutated) as the review document,
            so it must also carry ``_id``.
        query_db: When true, look up the existing review in
            ``SummaryReview`` by ``article_id``.
        update_all: When true, reset every existing review even if its
            sentences are unchanged.

    Returns:
        None. A summary of the counters is printed to stdout.
    """
    from spacyparser import SpacyParser

    parser = SpacyParser()
    summar = Summarizer(parser)

    num_added = 0
    num_updated = 0
    num_invalid_body = 0
    for article in articles:
        article_id = article["article_id"]
        article_headline = article["headline"]
        article_url = article["url"]
        body = article.get("body")

        if query_db:
            review = mongo.db.SummaryReview.find_one({"article_id": article_id})
        else:
            review = article

        if not body:
            # Fall back to the stored Article. A missing document, a missing
            # "body" field and an empty body are all treated as "no body"
            # rather than letting a KeyError abort the whole run.
            art = mongo.db.Article.find_one({"article_id": article_id})
            body = art.get("body") if art else None
            if not body:
                print("Article {} does not have a body, skipping".format(article_id))
                num_invalid_body += 1
                continue

        # sanitize step
        body = sanitize(body)
        sentences = parser.sentences(body)
        summary = summarize(article_headline, body, count=3, summarizer=summar)
        bot_indices = summary_indices(sentences, summary)

        # NOTE(review): Collection.insert/update are the legacy PyMongo calls
        # (removed in PyMongo 4); kept so the required driver version does
        # not change.
        if review is None:
            mongo.db.SummaryReview.insert(
                {
                    "article_id": article_id,
                    "headline": article_headline,
                    "url": article_url,
                    "sentences": sentences,
                    "summary": {"Bot": bot_indices},
                }
            )

            num_added += 1
            continue

        # Votes and flags refer to sentence positions, so they are dropped
        # whenever the new sentences do not match the stored ones.
        sentences_changed = (
            "sentences" not in review
            or len(review["sentences"]) != len(sentences)
            or any(old != new for old, new in zip(review["sentences"], sentences))
        )

        if sentences_changed or update_all:
            review["invalid"] = []
            review["summary"] = {"Bot": bot_indices}
            review["sentences"] = sentences
            review["updated_at"] = datetime.utcnow()
            review["tokens_valid"] = False
            num_updated += 1
        else:
            # Keep the other summary entries; only the Bot entry is refreshed.
            review.setdefault("summary", {})["Bot"] = bot_indices

        mongo.db.SummaryReview.update({"_id": review["_id"]}, review)

    print("-" * 80)
    print("Articles fetched:\n")
    print("\tNumber added: {}".format(num_added))
    print("\tNumber updated: {}".format(num_updated))
    print("\tNumber invalid body: {}".format(num_invalid_body))