Example #1
0
def _matching_ratio(topics_id, query_clause):
    """Return counts of stories matching query_clause vs. all stories in the topic."""
    # only wrap the clause with the user's filters when one was actually supplied
    sub_query_clause = apicache.add_to_user_query(query_clause) if query_clause else None
    total = apicache.topic_story_count(user_mediacloud_key(), topics_id)
    matching = apicache.topic_story_count(user_mediacloud_key(), topics_id, q=sub_query_clause)
    return {'count': matching['count'], 'total': total['count']}
Example #2
0
def media_type_story_counts(topics_id):
    """Count topic stories tagged with each media-type tag, with percent of total."""
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # overall story count for the topic (denominator for pct)
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    tag_story_counts = []
    # one count per media-type tag, matching on media_id tags
    for media_tag in media_type_tags:
        count = topic_story_count(user_mediacloud_key(), topics_id,
                                  q="tags_id_media:{}".format(media_tag['tags_id']))['count']
        # protect against div by zero on empty topics
        pct = float(count) / float(total_stories) if total_stories > 0 else 0
        tag_story_counts.append({
            'label': media_tag['label'],
            'tags_id': media_tag['tags_id'],
            'count': count,
            'pct': pct,
        })

    return jsonify({'story_counts': tag_story_counts})
def media_type_coverage(topics_id):
    """Report how many topic stories come from media tagged with any media-type tag."""
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # total story count for the topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # a single query matching stories from media carrying any of the media-type tags
    all_tags_ids = [str(t['tags_id']) for t in media_type_tags]
    query_clause = "tags_id_media:({})".format(" ".join(all_tags_ids))
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
Example #4
0
def story_counts(topics_id):
    """Return filtered vs. total story counts for a topic (public or logged-in access)."""
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = topic_story_count(local_key, topics_id, timespans_id=None, q=None)  # unfiltered count
    filtered = topic_story_count(local_key, topics_id)  # force a count with just the query
    return jsonify({'counts': {'count': filtered['count'], 'total': total['count']}})
Example #5
0
def _public_safe_topic_story_count(topics_id, q):
    """Count stories matching q vs. all topic stories, honoring public-topic access rules."""
    if access_public_topic(topics_id):
        key = TOOL_API_KEY
    elif is_user_logged_in():
        key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = topic_story_count(key, topics_id, q=add_to_user_query(None))
    # force a count with just the query
    matching = topic_story_count(key, topics_id, q=add_to_user_query(q))
    return jsonify({'counts': {'count': matching['count'], 'total': total['count']}})
Example #6
0
def story_counts(topics_id):
    """Return filtered vs. unfiltered story counts for a topic."""
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    # total ignores every filter; filtered respects the request's current filters
    total = apicache.topic_story_count(local_key, topics_id,
                                       timespans_id=None, snapshots_id=None, q=None, foci_id=None)
    filtered = apicache.topic_story_count(local_key, topics_id)
    return jsonify({'counts': {'count': filtered['count'], 'total': total['count']}})
def retweet_partisanship_coverage(topics_id):
    """Report how many topic stories come from media with a 2016 retweet-partisanship tag.

    Returns JSON: {'counts': {'count': tagged stories, 'total': all topic stories}}.
    """
    # TODO: add in overall timespan id here so it works in different snapshots
    partisanship_tags = cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # count the stories in any media in tagged as partisan
    # NOTE(review): assumes each tag['media_ids'] is a list of *strings* — str.join raises
    # TypeError on ints; the sibling implementation wraps ids in str() — verify upstream shape
    tag_media_ids = [" ".join(tag['media_ids']) for tag in partisanship_tags]
    all_media_ids = " ".join(tag_media_ids)
    media_ids_query_clause = "media_id:({})".format(all_media_ids)
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=media_ids_query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
Example #8
0
def _public_safe_topic_story_count(topics_id, q):
    """Count stories matching q vs. all topic stories, honoring public-topic access rules."""
    if access_public_topic(topics_id):
        key = TOOL_API_KEY
    elif is_user_logged_in():
        key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = apicache.topic_story_count(key, topics_id, q=apicache.add_to_user_query(None))
    # force a count with just the query
    matching = apicache.topic_story_count(key, topics_id, q=apicache.add_to_user_query(q))
    return jsonify({'counts': {'count': matching['count'], 'total': total['count']}})
Example #9
0
def _public_safe_topic_story_count(topics_id, q):
    """Count stories matching q against the total stories in the topic."""
    total = apicache.topic_story_count(
        user_mediacloud_key(), topics_id, q=apicache.add_to_user_query(None))
    # force a count with just the query
    matching = apicache.topic_story_count(
        user_mediacloud_key(), topics_id, q=apicache.add_to_user_query(q))
    return jsonify({'counts': {'count': matching['count'], 'total': total['count']}})
Example #10
0
def story_counts_by_snapshot(topics_id):
    """For each snapshot of a topic, count total, spidered, and seeded stories.

    Returns JSON keyed by snapshots_id:
        {snapshots_id: {'total': int, 'spidered': int, 'seeded': int}}
    """
    user_mc = user_mediacloud_client(user_mediacloud_key())
    snapshots = user_mc.topicSnapshotList(topics_id)
    counts = {}
    for s in snapshots:
        # get the count of stories in the overall timespan for this snapshot
        timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=s['snapshots_id'], foci_id=None)
        try:
            total = timespans[0]['story_count']
        except mediacloud.error.MCException:
            total = 0
        except IndexError:  # this doesn't have any timespans (ie. it failed to generate correctly)
            total = 0
        # search by tag to find out how many stories were spidered
        spidered = 0
        try:
            spidered = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                                  snapshots_id=s['snapshots_id'], foci_id=None,
                                                  timespans_id=timespans[0]['timespans_id'],
                                                  q="* AND tags_id_stories:{}".format(TAG_SPIDERED_STORY))['count']
        except mediacloud.error.MCException:
            spidered = 0
        except IndexError:
            # bug fix: this handler previously reset `total` (copy-paste error);
            # a missing timespan should leave `spidered` at 0 instead
            spidered = 0
        seeded = total - spidered
        counts[s['snapshots_id']] = {'total': total, 'spidered': spidered, 'seeded': seeded}
    return jsonify(counts)
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """Return the top `num_countries` countries by geo-tag story counts for a topic.

    Each entry carries label, geo_tag (raw tag), tags_id, count, and pct
    (the country's share of all topic stories).
    """
    tag_country_counts = []

    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']

    # get the top countries by the story tag counts with the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, GEO_TAG_SET, GEO_SAMPLE_SIZE, timespan_query)

    # make sure the geo tag is in the geo_tags whitelist (is a country);
    # dict membership test directly — no need to materialize keys() into a list
    country_tag_counts = [r for r in top_geo_tags if
                          int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3]
    country_tag_counts = country_tag_counts[:num_countries]

    # for each country, set up the requisite info for UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            # story_tag_count / total story per topic count; guard against div by zero
            'pct': float(tag['count']) / float(total_stories) if total_stories > 0 else 0,
        })
    return tag_country_counts
Example #12
0
def get_top_themes_by_story_tag_counts(topics_id, num_themes):
    """Return the top `num_themes` NYT-label themes by story tag counts for a topic.

    Each entry carries label, geo_tag (raw tag), tags_id, count, and pct
    (the theme's share of all topic stories).
    """
    nyt_counts = []

    # get the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])

    # get the top themes by the story counts with the overall timespan
    top_nyt_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, NYT_LABELS_TAG_SET_ID,
                                            TAG_COUNT_SAMPLE_SIZE, timespan_query)
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']

    top_nyt_tags = top_nyt_tags[:num_themes]
    # for each theme, set up the requisite info for UI
    for tag in top_nyt_tags:
        nyt_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            # story_tag_count / total story per topic count; guard against div by zero
            'pct': float(tag['count']) / float(total_stories) if total_stories > 0 else 0,
        })

    return nyt_counts
Example #13
0
def media_type_coverage(topics_id):
    """Report how many topic stories come from media tagged with any media-type tag."""
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # overall story count for the topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # stories from media carrying any of the media-type tags, in one query
    id_list = [str(t['tags_id']) for t in media_type_tags]
    query_clause = "tags_id_media:({})".format(" ".join(id_list))
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id,
                                           q=query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
Example #14
0
def media_type_story_counts(topics_id):
    """Count topic stories tagged with each media-type tag, with percent of total.

    Returns JSON: {'story_counts': [{'label', 'tags_id', 'count', 'pct'}, ...]}
    """
    tag_story_counts = []
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # make a count for each tag based on media_id
    for tag in media_type_tags:
        query_clause = "tags_id_media:{}".format(tag['tags_id'])
        tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=query_clause)['count']
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            # bug fix: protect against division by zero on empty topics,
            # matching the sibling implementation of this endpoint
            'pct': float(tagged_story_count) / float(total_stories) if total_stories > 0 else 0,
        })

    return jsonify({'story_counts': tag_story_counts})
Example #15
0
def retweet_partisanship_coverage(topics_id):
    """Report how many topic stories come from media with a retweet-partisanship tag."""
    partisanship_tags = _cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # total stories in the topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # one query covering media tagged with any partisanship bucket
    id_strings = [str(t['tags_id']) for t in partisanship_tags]
    tags_ids_query_clause = "tags_id_media:({})".format(" ".join(id_strings))
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id,
                                           q=tags_ids_query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
def retweet_partisanship_story_counts(topics_id):
    """Count topic stories in each 2016 retweet-partisanship bucket.

    Returns JSON: {'story_counts': [...]} ordered left -> center -> right.
    """
    # TODO: add in overall timespan id here so it works in different snapshots
    tag_story_counts = []
    partisanship_tags = _cached_media_tags(
        TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # grab the total stories
    try:
        total_stories = topic_story_count(user_mediacloud_key(),
                                          topics_id)['count']
    except mediacloud.error.MCException:
        total_stories = 0
    # make a count for each tag
    for tag in partisanship_tags:
        try:
            tagged_story_count = topic_story_count(user_mediacloud_key(),
                                                   topics_id,
                                                   q=tag['query'])['count']
            # bug fix: guard the division explicitly — the old ZeroDivisionError
            # handler threw away a successfully-fetched count when the topic was empty
            pct = float(tagged_story_count) / float(total_stories) if total_stories > 0 else 0
        except mediacloud.error.MCException:
            tagged_story_count = 0
            pct = 0
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            'pct': pct
        })
    # order them in the way a person would expect (left to center to right);
    # these are the five partisanship-quintile tags_id values
    ordered_bucket_tags_ids = [9360520, 9360521, 9360522, 9360523, 9360524]
    ordered_tag_story_counts = [
        [t for t in tag_story_counts if t['tags_id'] == tags_id][0]
        for tags_id in ordered_bucket_tags_ids
    ]
    return jsonify({'story_counts': ordered_tag_story_counts})
Example #17
0
def nyt_theme_coverage(topics_id):
    """Report how many topic stories are tagged with any of the top NYT themes.

    Reads `numThemes` from the request args. Returns {'counts': ...} JSON from
    topic_tag_coverage, or an error payload when coverage cannot be computed.
    """
    # removed an unused `total_stories` lookup here — it was a dead API call;
    # topic_tag_coverage already returns both count and total
    num_themes = int(request.args['numThemes'])

    nyt_top_themes = get_top_themes_by_story_tag_counts(topics_id, num_themes)
    tag_list = [i['tags_id'] for i in nyt_top_themes]
    query_nyt_tags = "({})".format(" ".join(map(str, tag_list)))
    coverage = topic_tag_coverage(topics_id, query_nyt_tags)   # gets count and total

    if coverage is None:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify(coverage)
Example #18
0
def story_counts(topics_id):
    """Return filtered vs. unfiltered story counts, optionally scoped by media sources/collections."""
    query = request.form['keywords'] if 'keywords' in request.form else ''
    # for preview information in subtopics and platforms - scope by media source info
    collections = _parse_collection_ids(request.args)
    sources = _parse_media_ids(request.args)
    merged_args = {}
    has_sources = sources not in [None, ''] and len(sources) > 0
    has_collections = collections not in [None, ''] and len(collections) > 0
    if has_sources or has_collections:
        query = concatenate_query_for_solr(query, sources, collections)
        merged_args = {'q': query}
    filtered = apicache.topic_story_count(user_mediacloud_key(), topics_id, **merged_args)
    # the total ignores every filter
    total = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                       timespans_id=None, snapshots_id=None,
                                       foci_id=None, q=None)
    return jsonify({'counts': {'count': filtered['count'], 'total': total['count']}})
Example #19
0
def _add_story_counts_to_foci(topics_id, focal_sets):
    """Annotate each focus in `focal_sets` with its story count and return the JSON.

    Pairs each focus with the timespan matching the topic's base snapshot timespan.
    Returns a JSON error payload when the base timespan cannot be determined.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan with each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
    except ValueError as e:
        # bug fix: Python 3 exceptions have no `.message` attribute; use str(e)
        return json_error_response(str(e))
    # now find the story count in each focus in this
    for fs in focal_sets:
        timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, fs['foci'])
        # timespans[idx] corresponds to fs['foci'][idx]
        for idx, timespan in enumerate(timespans):
            focus = fs['foci'][idx]
            foci_story_count = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                                          snapshots_id=snapshots_id,
                                                          timespans_id=timespan['timespans_id'],
                                                          q=q,
                                                          foci_id=focus['foci_id'])['count']
            focus['story_count'] = foci_story_count
    return jsonify(focal_sets)
Example #20
0
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """Return the top `num_countries` countries by geo-tag story counts for a topic."""
    # total stories in the topic (denominator for pct)
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']

    # top countries by story tag counts, using the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = next(iter([t for t in timespans if t['period'] == "overall"]))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id,
                                            GEO_TAG_SET, GEO_SAMPLE_SIZE,
                                            timespan_query)

    # keep only geo tags that are actually countries (in the geonames whitelist)
    country_tag_counts = [
        r for r in top_geo_tags
        if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3.keys()
    ][:num_countries]

    # package the requisite info for the UI
    tag_country_counts = []
    for country_tag in country_tag_counts:
        tag_country_counts.append({
            'label': country_tag['label'],
            'geo_tag': country_tag['tag'],
            'tags_id': country_tag['tags_id'],
            'count': country_tag['count'],
            # story_tag_count / total story per topic count
            'pct': float(country_tag['count']) / float(total_stories),
        })
    return tag_country_counts
Example #21
0
def stream_story_list_csv(user_key, filename, topics_id, **kwargs):
    """Stream a CSV of all stories in a topic as an attachment Response.

    kwargs may include fb_data (include Facebook collection dates), as_attachment,
    and any topic story filters; request args supply snapshot/timespan/focus/q/sort.
    """
    user_mc = user_mediacloud_client(user_key)
    topic = user_mc.topic(topics_id)
    has_twitter_data = topic['ch_monitor_id'] is not None

    # as_attachment = kwargs['as_attachment'] if 'as_attachment' in kwargs else True
    include_fb_date = kwargs['fb_data'] if 'fb_data' in kwargs else False
    all_stories = []
    params = kwargs.copy()

    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
        'q': request.args['q'] if 'q' in request.args else None,
        'sort': request.args['sort'] if 'sort' in request.args else None,
    }
    params.update(merged_args)

    story_count = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                             snapshots_id=params['snapshots_id'], timespans_id=params['timespans_id'],
                                             foci_id=params['foci_id'], q=params['q'])
    # bug fix: log the numeric count, not the whole response dict
    logger.info("Total stories to download: {}".format(story_count['count']))

    if 'as_attachment' in params:
        del params['as_attachment']
    if 'fb_data' in params:
        del params['fb_data']
    if 'q' in params:
        # bug fix: compare the query *value*; the old code tested the literal
        # string 'q' against the sentinel list, which was always True
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics

    # determine which props the user actually wants to download
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'themes', 'subtopics',
        'inlink_count', 'facebook_share_count',
        # removed media metadata here because it takes too long to query for it
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    if has_twitter_data:
        props.append('simple_tweet_count')
    if include_fb_date:
        props.append('facebook_collection_date')
    props += ['outlink_count', 'media_inlink_count', 'media_id', 'media_name', 'media_url']

    if include_fb_date:
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        local_mc = user_admin_mediacloud_client()
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topics_id, limit=100, link_id=link_id)

            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False

        # NOTE(review): all_stories is never populated above, so this loop is a
        # no-op as written — presumably stories were meant to be fetched first; verify
        for s in all_stories:
            for fb_item in all_fb_count:
                if int(fb_item['stories_id']) == int(s['stories_id']):
                    s['facebook_collection_date'] = fb_item['facebook_api_collect_date']

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)
Example #22
0
def stream_story_list_csv(user_key, topic, **kwargs):
    """Stream a CSV of all stories in `topic` as an attachment Response.

    kwargs toggles: media_metadata, story_tags, reddit_submissions, fb_data,
    as_attachment; request args supply snapshot/timespan/focus/q/sort filters.
    """
    filename = topic['name']+'-stories'
    has_twitter_data = (topic['ch_monitor_id'] is not None) and (topic['ch_monitor_id'] != 0)

    # as_attachment = kwargs['as_attachment'] if 'as_attachment' in kwargs else True
    include_media_metadata = ('media_metadata' in kwargs) and (kwargs['media_metadata'] is True)
    include_story_tags = ('story_tags' in kwargs) and (kwargs['story_tags'] is True)
    include_reddit_submissions = ('reddit_submissions' in kwargs) and (kwargs['reddit_submissions'] is True)
    include_fb_date = kwargs['fb_data'] if 'fb_data' in kwargs else False
    all_stories = []
    params = kwargs.copy()

    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
        'q': request.args['q'] if 'q' in request.args else None,
        'sort': request.args['sort'] if 'sort' in request.args else None,
    }
    params.update(merged_args)

    story_count = apicache.topic_story_count(user_mediacloud_key(), topic['topics_id'],
                                             snapshots_id=params['snapshots_id'], timespans_id=params['timespans_id'],
                                             foci_id=params['foci_id'], q=params['q'])
    logger.info("Total stories to download: {}".format(story_count['count']))

    if 'as_attachment' in params:
        del params['as_attachment']
    if 'fb_data' in params:
        del params['fb_data']
    if 'q' in params:
        # bug fix: compare the query *value*; the old code tested the literal
        # string 'q' against the sentinel list, which was always True
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics

    # determine which props the user actually wants to download
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated', 'inlink_count',
        'facebook_share_count',
    ]
    if has_twitter_data:
        props.append('simple_tweet_count')
    if include_reddit_submissions:
        props.append('reddit_submissions')
    if include_fb_date:
        props.append('facebook_collection_date')
    if include_story_tags:
        props += ['themes', 'subtopics']
    props += ['outlink_count', 'media_inlink_count', 'media_id', 'media_name', 'media_url']
    if include_media_metadata:
        props += ['media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type']

    if include_fb_date:
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        local_mc = user_admin_mediacloud_client()
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topic['topics_id'], limit=100, link_id=link_id)

            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False

        # NOTE(review): all_stories is never populated above, so this loop is a
        # no-op as written — presumably stories were meant to be fetched first; verify
        for s in all_stories:
            for fb_item in all_fb_count:
                if int(fb_item['stories_id']) == int(s['stories_id']):
                    s['facebook_collection_date'] = fb_item['facebook_api_collect_date']

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_list_by_page_as_csv_row(user_key, topic['topics_id'], props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)