Example #1
0
def entities_from_mc_or_cliff(stories_id):
    """Return the entities mentioned in a story, sorted by frequency.

    Prefers the cached CLIFF annotation stored in MediaCloud; if the story
    has not been annotated there, fetches the story text and runs CLIFF on
    it directly.

    :param stories_id: MediaCloud story id
    :return: list of ``{'type', 'name', 'frequency'}`` dicts (types are
             ORGANIZATION, PERSON, LOCATION), sorted by descending frequency
    """
    entities = []
    # get entities from MediaCloud, or from CLIFF if not in MC
    cliff_results = cached_story_raw_cliff_results(stories_id)[0]['cliff']
    if (cliff_results == 'story is not annotated') or (cliff_results == "story does not exist"):
        story = mc.story(stories_id, text=True)
        cliff_results = cliff.parse_text(story['story_text'])
    # clean up for reporting
    for org in cliff_results['results']['organizations']:
        entities.append({
            'type': 'ORGANIZATION',
            'name': org['name'],
            'frequency': org['count']
        })
    for person in cliff_results['results']['people']:
        entities.append({
            'type': 'PERSON',
            'name': person['name'],
            'frequency': person['count']
        })
    # places don't have frequency set correctly, so we need to sum them.
    # Count mentions in a single pass; the previous version re-scanned the
    # whole mention list once per unique place name (O(n^2)).
    place_counts = {}
    for place in cliff_results['results']['places']['mentions']:
        place_counts[place['name']] = place_counts.get(place['name'], 0) + 1
    entities += [{'type': 'LOCATION', 'name': name, 'frequency': count}
                 for name, count in place_counts.items()]
    # sort smartly: most frequently mentioned entities first
    return sorted(entities, key=itemgetter('frequency'), reverse=True)
Example #2
0
def entities_from_mc_or_cliff(stories_id):
    """Return the entities mentioned in a story, sorted by frequency.

    Prefers the cached CLIFF annotation stored in MediaCloud; if the story
    has not been annotated there, fetches the story text and runs CLIFF on
    it directly.

    :param stories_id: MediaCloud story id
    :return: list of ``{'type', 'name', 'frequency'}`` dicts (types are
             ORGANIZATION, PERSON, LOCATION), sorted by descending frequency
    """
    entities = []
    # get entities from MediaCloud, or from CLIFF if not in MC
    cliff_results = cached_story_raw_cliff_results(stories_id)[0]['cliff']
    if (cliff_results == u'"story is not annotated"') or (cliff_results == u"story does not exist"):
        story = mc.story(stories_id, text=True)
        cliff_results = cliff.parseText(story['story_text'])
    # clean up for reporting
    for org in cliff_results['results']['organizations']:
        entities.append({
            'type': 'ORGANIZATION',
            'name': org['name'],
            'frequency': org['count']
        })
    for person in cliff_results['results']['people']:
        entities.append({
            'type': 'PERSON',
            'name': person['name'],
            'frequency': person['count']
        })
    # places don't have frequency set correctly, so we need to sum them.
    # Count mentions in a single pass; the previous version re-scanned the
    # whole mention list once per unique place name (O(n^2)).
    place_counts = {}
    for place in cliff_results['results']['places']['mentions']:
        place_counts[place['name']] = place_counts.get(place['name'], 0) + 1
    entities += [{'type': 'LOCATION', 'name': name, 'frequency': count}
                 for name, count in place_counts.items()]
    # sort smartly: most frequently mentioned entities first
    return sorted(entities, key=itemgetter('frequency'), reverse=True)
Example #3
0
def nyt_themes_from_mc_or_labeller(stories_id):
    """Return NYT theme labels for a story, preferring cached MediaCloud data.

    Falls back to running the labeller over the raw story text when the
    cached annotation is missing.
    """
    raw = cached_story_raw_theme_results(stories_id)
    themes = raw['nytlabels']
    if themes == u'"story is not annotated"':
        # not annotated in MediaCloud - label the story text on the fly
        story = mc.story(stories_id, text=True)
        themes = predict_news_labels(story['story_text'])
    return themes
Example #4
0
def nyt_themes_from_mc_or_labeller(stories_id):
    """Return NYT theme labels for a story, preferring cached MediaCloud data.

    Falls back to running the labeller over the raw story text when the
    cached annotation is missing.
    """
    cached = cached_story_raw_theme_results(stories_id)
    if cached['nytlabels'] != 'story is not annotated':
        return cached['nytlabels']
    # not annotated in MediaCloud - label the story text on the fly
    story = mc.story(stories_id, text=True)
    return predict_news_labels(story['story_text'])
Example #5
0
def story_subreddit_shares_csv(stories_id):
    """Stream a CSV of Reddit submission counts per subreddit for a story."""
    story = mc.story(stories_id)
    url_submissions = pushshift.reddit_url_submissions_by_subreddit(
        story['url'])
    filename = 'story-{}-subreddit'.format(stories_id)
    return csv.stream_response(url_submissions,
                               ['name', 'value'],
                               filename,
                               column_names=['subreddit', 'submissions'])
Example #6
0
def story_subreddit_shares(stories_id):
    """Return JSON with the total and per-subreddit Reddit submission counts."""
    story = mc.story(stories_id)
    by_subreddit = pushshift.reddit_url_submissions_by_subreddit(
        story['url'])
    # treat a missing result set as zero total submissions
    total = 0
    if by_subreddit is not None:
        total = sum(r['value'] for r in by_subreddit)
    return jsonify({
        'total': total,
        'subreddits': by_subreddit
    })
Example #7
0
def story_top_image(stories_id):
    """Return JSON with the top image and all images parsed from a story's HTML."""
    story = mc.story(stories_id)
    # use the tool key so anyone can see these images
    raw_html = apicache.story_raw_1st_download(TOOL_API_KEY, stories_id)
    parsed = newspaper.Article(url=story['url'])
    parsed.set_html(raw_html)
    parsed.parse()
    return jsonify({
        'top': parsed.top_image,
        'all': list(parsed.images),
    })
def _cached_story_raw_1st_download(api_key, stories_id):
    """Return the raw first-download file content for a story.

    NOTE(review): `api_key` is not used in the body - presumably it exists so
    a caching decorator at the definition site keys results per API key;
    confirm against how this helper is wrapped.
    """
    raw_story = mc.story(stories_id, raw_1st_download=True)
    return raw_story['raw_first_download_file']