Example 1
def content_categories(content):
    if 'categories' not in content:
        content['categories'] = classify_text(content['text'])
        _content.update({'_id': bson.ObjectId(content['id'])},
                        {'$set': {
                            'categories': content['categories']
                        }})
    return content['categories']
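Each of these helpers follows the same compute-once, cache-in-Mongo pattern: check for the field on the document, compute it only if it is missing, and persist it with a $set update so later calls read the cached value. A minimal sketch of the setup they assume, with _content as a pymongo collection (the connection details and the classify_text stub below are assumptions for illustration, not from the source):

import bson
import pymongo

# Assumed setup: one Mongo document per article; host, database, and
# collection names here are placeholders.
_content = pymongo.MongoClient('localhost', 27017)['app']['content']

def classify_text(text):
    # Hypothetical stand-in for the real classifier behind content_categories.
    return ['politics'] if 'election' in text.lower() else ['general']

content = _content.find_one()        # assumes a document with a 'text' field exists
content['id'] = str(content['_id'])  # the helpers read content['id'] as a string
print content_categories(content)    # computed once, then served from the cache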
Example 2
def content_entities(content):
    if 'entities' not in content:
        content['entities'] = get_entities(content['text'])
        _content.update({'_id': bson.ObjectId(content['id'])},
                        {'$set': {
                            'entities': content['entities']
                        }})
    return content['entities']
Example 3
def content_keywords(content):
    if 'keywords' not in content:
        content['keywords'] = [
            x for x in get_keywords(content['text']) if x['count'] > 2
        ]
        _content.update({'_id': bson.ObjectId(content['id'])},
                        {'$set': {
                            'keywords': content['keywords']
                        }})
    return content['keywords']
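The filter in Example 3 implies that get_keywords returns one dict per term with at least a 'count' key. The real implementation is not shown in the source; a hypothetical version with that return shape, built on a plain word-frequency count, might look like this:

import re
from collections import Counter

def get_keywords(text):
    # Hypothetical sketch: tokenize, count, and emit {'word', 'count'} dicts
    # in the shape that content_keywords filters on (x['count'] > 2).
    words = re.findall(r"[a-z']+", text.lower())
    return [{'word': w, 'count': n} for w, n in Counter(words).most_common()]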
Example 4
def content_stakeholders(content):
    if 'stakeholders' not in content:
        entities = content_entities(content)
        kwargs = {'credentials': get_twitter_credentials()}
        stakeholder_list = find_stakeholder_twitter_users(entities, **kwargs)
        content['stakeholders'] = stakeholder_list
        _content.update({'_id': bson.ObjectId(content['id'])},
                        {'$set': {
                            'stakeholders': content['stakeholders']
                        }})
    return content['stakeholders']
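Example 4 builds on Example 2: content_entities runs (and caches) first, and the Twitter credentials are forwarded through a kwargs dict, which is equivalent to passing credentials= directly. Neither helper appears in the source, so the stubs below are pure assumptions meant only to show the calling convention:

def get_twitter_credentials():
    # Stub: the real helper presumably loads API keys from config or env.
    return {'api_key': '...', 'api_secret': '...'}

def find_stakeholder_twitter_users(entities, credentials=None):
    # Stub: the real helper maps entity names to Twitter accounts.
    return [{'entity': e, 'handle': None} for e in entities]

kwargs = {'credentials': get_twitter_credentials()}
find_stakeholder_twitter_users(['ACLU'], **kwargs)  # same as credentials=...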
Example 5
def all_the_content(content, article_database_ref, reload_pundits=False):
    """

    :param content: this is the mongo object containing our content up to now
    :param reload_pundits: if true, pundits are re-scraped every time
    :return: returns keywords, entities, and newpundits, as well as storing them in the mongo object for the article
    """

    reload_pundits = True  # NOTE: hardcoded override; the argument above is ignored

    article = newspaper.Article(content['url'])
    article.download()
    article.parse()
    article.nlp()

    print "HERE ARE THE NEWSPAPER KEYWORDS", article.keywords

    # Keyword/entity extraction is disabled (see the commented-out blocks
    # below); empty placeholders are returned while newspaper's own
    # article.keywords drive the pundit match.
    content['keywords'] = ""
    content['entities'] = ""

    # if not 'keywords' in content:
    #     content['keywords'] = [x for x in get_keywords(content['text'])
    #         if x['count'] > 2]
    #     _content.update({'_id': bson.ObjectId(content['id'])},
    #         {'$set': {'keywords': content['keywords']}})
    #
    # if not 'entities' in content:
    #     content['entities'] = get_entities(content['text'])
    #     _content.update({'_id': bson.ObjectId(content['id'])},
    #         {'$set': {'entities': content['entities']}})

    if 'newpundits' not in content or reload_pundits:

        snippets, ratios = pundits.keyword_match(article_database_ref,
                                                 article.keywords)
        content['newpundits'] = snippets

        _content.update({'_id': bson.ObjectId(content['id'])},
                        {'$set': {
                            'newpundits': content['newpundits']
                        }})

    if not content['newpundits']:
        print "nothing to see here!"
        failed_snippet = {'name': "#shambles",
                          'text': "we can't seem to find anything."}
        content['newpundits'] = [[failed_snippet]]
    else:
        print "HERE ARE NEW PUNDITS:", content['newpundits']

    return content['keywords'], content['entities'], content['newpundits']
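A hedged usage sketch for Example 5, reusing the pymongo setup assumed in Example 1 and treating article_database_ref as a collection of past articles (that shape is a guess; pundits.keyword_match is not shown in the source):

content = _content.find_one({'url': {'$exists': True}})
content['id'] = str(content['_id'])

# Assumed shape for the pundit-matching database handle.
article_db = pymongo.MongoClient('localhost', 27017)['app']['articles']

keywords, entities, newpundits = all_the_content(content, article_db)
print newpundits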