def _matching_ratio(topics_id, query_clause):
    """Compare stories matching query_clause against all stories in the topic.

    :param topics_id: the topic to count stories in
    :param query_clause: optional extra query to AND into the user's query
    :return: dict with 'count' (matching stories) and 'total' (all stories)
    """
    total = apicache.topic_story_count(user_mediacloud_key(), topics_id)
    # only build a sub-query when a clause was actually supplied
    sub_query_clause = apicache.add_to_user_query(query_clause) if query_clause else None
    matching = apicache.topic_story_count(user_mediacloud_key(), topics_id, q=sub_query_clause)
    return {'count': matching['count'], 'total': total['count']}
def media_type_story_counts(topics_id):
    """Return JSON story counts (and pct of total) for each media-type tag in a topic."""
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total stories once, up front
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    tag_story_counts = []
    # count stories per media-type tag, matching on the media's tags
    for tag in media_type_tags:
        tagged_story_count = topic_story_count(
            user_mediacloud_key(), topics_id,
            q="tags_id_media:{}".format(tag['tags_id']))['count']
        # protect against div by zero when the topic has no stories
        pct = float(tagged_story_count) / float(total_stories) if total_stories > 0 else 0
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            'pct': pct,
        })
    return jsonify({'story_counts': tag_story_counts})
def media_type_coverage(topics_id):
    """Return JSON coverage: topic stories from media with any media-type tag vs. total."""
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # build one OR'd clause covering every media-type tag id
    id_strings = [str(tag['tags_id']) for tag in media_type_tags]
    query_clause = "tags_id_media:({})".format(" ".join(id_strings))
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
def story_counts(topics_id):
    """Return filtered vs. total story counts for a topic, honoring public-topic access."""
    # pick an API key: tool key for public topics, the user's key if logged in
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    # total ignores the timespan/query filters; filtered applies the current ones
    total = topic_story_count(local_key, topics_id, timespans_id=None, q=None)
    filtered = topic_story_count(local_key, topics_id)
    results = {'count': filtered['count'], 'total': total['count']}
    return jsonify({'counts': results})
def _public_safe_topic_story_count(topics_id, q):
    """Return matching vs. total story counts for a topic, honoring public-topic access.

    :param topics_id: the topic to count stories in
    :param q: extra query clause to AND into the user's query for the 'matching' count
    :return: JSON {'counts': {'count': matching, 'total': total}} or an error payload
    """
    # The two access branches previously duplicated the counting logic verbatim;
    # select the key once, then count once (matches the story_counts() pattern).
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = topic_story_count(local_key, topics_id, q=add_to_user_query(None))
    # force a count with just the query
    matching = topic_story_count(local_key, topics_id, q=add_to_user_query(q))
    return jsonify({'counts': {'count': matching['count'], 'total': total['count']}})
def story_counts(topics_id):
    """Return filtered vs. unfiltered story counts for a topic, honoring public access."""
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    # total clears every filter; filtered uses whatever filters are in effect
    total = apicache.topic_story_count(local_key, topics_id,
                                       timespans_id=None, snapshots_id=None,
                                       q=None, foci_id=None)
    filtered = apicache.topic_story_count(local_key, topics_id)
    payload = {'count': filtered['count'], 'total': total['count']}
    return jsonify({'counts': payload})
def retweet_partisanship_coverage(topics_id):
    """Return JSON coverage: topic stories from media in any 2016 partisanship bucket vs. total."""
    # TODO: add in overall timespan id here so it works in different snapshots
    partisanship_tags = cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # one media_id clause covering every media source in any partisanship bucket
    # (assumes each tag's 'media_ids' holds strings — " ".join would fail on ints; TODO confirm)
    per_tag_ids = [" ".join(tag['media_ids']) for tag in partisanship_tags]
    media_ids_query_clause = "media_id:({})".format(" ".join(per_tag_ids))
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id,
                                           q=media_ids_query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
def _public_safe_topic_story_count(topics_id, q):
    """Return matching vs. total story counts for a topic, honoring public-topic access.

    :param topics_id: the topic to count stories in
    :param q: extra query clause to AND into the user's query for the 'matching' count
    :return: JSON {'counts': {'count': matching, 'total': total}} or an error payload
    """
    # The two access branches previously duplicated the counting logic verbatim;
    # select the key once, then count once (matches the story_counts() pattern).
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    # force a count with just the query
    total = apicache.topic_story_count(local_key, topics_id, q=apicache.add_to_user_query(None))
    matching = apicache.topic_story_count(local_key, topics_id, q=apicache.add_to_user_query(q))
    return jsonify({'counts': {'count': matching['count'], 'total': total['count']}})
def _public_safe_topic_story_count(topics_id, q):
    """Return JSON with stories matching q vs. all stories in the topic."""
    # force a count with just the query (None means no extra user clause)
    total = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                       q=apicache.add_to_user_query(None))
    matching = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                          q=apicache.add_to_user_query(q))
    counts = {'count': matching['count'], 'total': total['count']}
    return jsonify({'counts': counts})
def story_counts_by_snapshot(topics_id):
    """For each snapshot of a topic, report total, spidered, and seeded story counts.

    :return: JSON mapping snapshots_id -> {'total', 'spidered', 'seeded'}
    """
    user_mc = user_mediacloud_client(user_mediacloud_key())
    snapshots = user_mc.topicSnapshotList(topics_id)
    counts = {}
    for s in snapshots:
        # get the count of stories in the overall timespan for this snapshot
        timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=s['snapshots_id'],
                                                        foci_id=None)
        try:
            total = timespans[0]['story_count']
        except (mediacloud.error.MCException, IndexError):
            # IndexError: snapshot has no timespans (ie. it failed to generate correctly)
            total = 0
        # search by tag to find out how many stories were spidered
        # BUG FIX: the old IndexError handler here assigned `total = 0` (a clobber that
        # only happened to be harmless); guard on empty timespans explicitly instead
        spidered = 0
        if timespans:
            try:
                spidered = apicache.topic_story_count(
                    user_mediacloud_key(), topics_id,
                    snapshots_id=s['snapshots_id'], foci_id=None,
                    timespans_id=timespans[0]['timespans_id'],
                    q="* AND tags_id_stories:{}".format(TAG_SPIDERED_STORY))['count']
            except mediacloud.error.MCException:
                spidered = 0
        seeded = total - spidered
        counts[s['snapshots_id']] = {'total': total, 'spidered': spidered, 'seeded': seeded}
    return jsonify(counts)
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """Return the top `num_countries` countries geo-tagged on a topic's stories.

    :return: list of dicts with label, geo_tag, tags_id, count, and pct of total stories
    """
    tag_country_counts = []
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # get the top countries by the story tag counts with the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id,
                                            GEO_TAG_SET, GEO_SAMPLE_SIZE, timespan_query)
    # make sure the geo tag is in the geo_tags whitelist (is a country)
    # (membership test directly on the dict — no need to materialize .keys() as a list)
    country_tag_counts = [r for r in top_geo_tags
                          if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3]
    country_tag_counts = country_tag_counts[:num_countries]
    # for each country, set up the requisite info for UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            # story_tag_count / total story per topic count; guard against div by zero
            'pct': float(tag['count']) / float(total_stories) if total_stories > 0 else 0,
        })
    return tag_country_counts
def get_top_themes_by_story_tag_counts(topics_id, num_themes):
    """Return the top `num_themes` NYT-label themes tagged on a topic's stories.

    :return: list of dicts with label, geo_tag, tags_id, count, and pct of total stories
    """
    # (removed unused local `user_mc_key` — it was assigned and never read)
    nyt_counts = []
    # get overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    # get the top themes by the story counts with the overall timespan
    top_nyt_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id,
                                            NYT_LABELS_TAG_SET_ID, TAG_COUNT_SAMPLE_SIZE,
                                            timespan_query)
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    top_nyt_tags = top_nyt_tags[:num_themes]
    # for each theme, set up the requisite info for UI
    for tag in top_nyt_tags:
        nyt_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            # story_tag_count / total story per topic count; guard against div by zero
            'pct': float(tag['count']) / float(total_stories) if total_stories > 0 else 0,
        })
    return nyt_counts
def media_type_coverage(topics_id):
    """Return JSON coverage of topic stories from media tagged with any media-type tag."""
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # count stories from media holding any of the media-type tags, in one query
    id_list = [str(tag['tags_id']) for tag in media_type_tags]
    query_clause = "tags_id_media:({})".format(" ".join(id_list))
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id,
                                           q=query_clause)['count']
    coverage = {'count': tagged_story_count, 'total': total_stories}
    return jsonify({'counts': coverage})
def media_type_story_counts(topics_id):
    """Return JSON story counts (and pct of total) for each media-type tag in a topic."""
    tag_story_counts = []
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # make a count for each tag based on media_id
    for tag in media_type_tags:
        query_clause = "tags_id_media:{}".format(tag['tags_id'])
        tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id,
                                               q=query_clause)['count']
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            # BUG FIX: protect against div by zero when the topic has no stories
            # (the sibling version of this function already did this)
            'pct': float(tagged_story_count) / float(total_stories) if total_stories > 0 else 0,
        })
    return jsonify({'story_counts': tag_story_counts})
def retweet_partisanship_coverage(topics_id):
    """Return JSON coverage: topic stories from media with a 2016 partisanship tag vs. total."""
    partisanship_tags = _cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # OR together every partisanship tag id into a single media-tag clause
    id_strings = [str(t['tags_id']) for t in partisanship_tags]
    tags_ids_query_clause = "tags_id_media:({})".format(" ".join(id_strings))
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id,
                                           q=tags_ids_query_clause)['count']
    coverage = {'count': tagged_story_count, 'total': total_stories}
    return jsonify({'counts': coverage})
def retweet_partisanship_story_counts(topics_id):
    """Return JSON story counts per 2016 partisanship quintile, ordered left→center→right."""
    # TODO: add in overall timespan id here so it works in different snapshots
    partisanship_tags = _cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # grab the total stories
    try:
        total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    except mediacloud.error.MCException:
        total_stories = 0
    # make a count for each tag
    tag_story_counts = []
    for tag in partisanship_tags:
        try:
            tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id,
                                                   q=tag['query'])['count']
            pct = float(tagged_story_count) / float(total_stories)
        except (ZeroDivisionError, mediacloud.error.MCException):
            tagged_story_count = 0
            pct = 0
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            'pct': pct,
        })
    # order them in the way a person would expect (left to center to right)
    quintile_order = (9360520, 9360521, 9360522, 9360523, 9360524)
    ordered_tag_story_counts = [
        [t for t in tag_story_counts if t['tags_id'] == tid][0]
        for tid in quintile_order
    ]
    return jsonify({'story_counts': ordered_tag_story_counts})
def nyt_theme_coverage(topics_id):
    """Return JSON coverage of topic stories tagged with the top N NYT themes.

    N comes from the 'numThemes' request argument.
    """
    # (removed a topic_story_count call whose result was assigned to an unused
    # local `total_stories` — a wasted API round-trip)
    num_themes = int(request.args['numThemes'])
    nyt_top_themes = get_top_themes_by_story_tag_counts(topics_id, num_themes)
    tag_list = [i['tags_id'] for i in nyt_top_themes]
    query_nyt_tags = "({})".format(" ".join(map(str, tag_list)))
    coverage = topic_tag_coverage(topics_id, query_nyt_tags)  # gets count and total
    if coverage is None:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify(coverage)
def story_counts(topics_id):
    """Return preview story counts (scoped by keywords/media sources) vs. topic total."""
    query = request.form['keywords'] if 'keywords' in request.form else ''
    # for preview information in subtopics and platforms - scope by media source info
    collections = _parse_collection_ids(request.args)
    sources = _parse_media_ids(request.args)
    merged_args = {}
    if ((sources not in [None, ''] and len(sources) > 0)
            or (collections not in [None, ''] and len(collections) > 0)):
        query = concatenate_query_for_solr(query, sources, collections)
        merged_args = {'q': query}
    filtered = apicache.topic_story_count(user_mediacloud_key(), topics_id, **merged_args)
    # total clears every filter for the unscoped baseline
    total = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                       timespans_id=None, snapshots_id=None,
                                       foci_id=None, q=None)
    counts = {'count': filtered['count'], 'total': total['count']}
    return jsonify({'counts': counts})
def _add_story_counts_to_foci(topics_id, focal_sets):
    """Annotate each focus in focal_sets with its story count and return them as JSON.

    :param topics_id: the topic the focal sets belong to
    :param focal_sets: list of focal-set dicts, each with a 'foci' list
    :return: JSON of focal_sets with 'story_count' added to each focus, or an error response
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan with each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
    except ValueError as e:
        # BUG FIX: Python 3 exceptions have no `.message` attribute; use str(e)
        return json_error_response(str(e))
    # now find the story count in each focus of each focal set
    for fs in focal_sets:
        timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, fs['foci'])
        # timespans are returned positionally aligned with fs['foci']
        for timespan, focus in zip(timespans, fs['foci']):
            focus['story_count'] = apicache.topic_story_count(
                user_mediacloud_key(), topics_id,
                snapshots_id=snapshots_id,
                timespans_id=timespan['timespans_id'],
                q=q,
                foci_id=focus['foci_id'])['count']
    return jsonify(focal_sets)
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """Return the top `num_countries` countries geo-tagged on a topic's stories.

    :return: list of dicts with label, geo_tag, tags_id, count, and pct of total stories
    """
    tag_country_counts = []
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # get the top countries by the story tag counts with the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, GEO_TAG_SET,
                                            GEO_SAMPLE_SIZE, timespan_query)
    # make sure the geo tag is in the geo_tags whitelist (is a country)
    country_tag_counts = [
        r for r in top_geo_tags
        if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3
    ]
    country_tag_counts = country_tag_counts[:num_countries]
    # for each country, set up the requisite info for UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            # story_tag_count / total story per topic count; guard against div by zero
            'pct': float(tag['count']) / float(total_stories) if total_stories > 0 else 0,
        })
    return tag_country_counts
def stream_story_list_csv(user_key, filename, topics_id, **kwargs):
    """Stream all stories in a topic to the client as a CSV attachment download.

    :param user_key: media cloud API key used to page through the stories
    :param filename: base name for the download (timestamped before use)
    :param topics_id: the topic to export
    Optional kwargs: fb_data (include facebook collection dates), as_attachment,
    plus any topicStoryList filter arguments.
    """
    user_mc = user_mediacloud_client(user_key)
    topic = user_mc.topic(topics_id)
    has_twitter_data = topic['ch_monitor_id'] is not None
    # as_attachment = kwargs['as_attachment'] if 'as_attachment' in kwargs else True
    include_fb_date = kwargs['fb_data'] if 'fb_data' in kwargs else False
    all_stories = []
    params = kwargs.copy()
    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
        'q': request.args['q'] if 'q' in request.args else None,
        'sort': request.args['sort'] if 'sort' in request.args else None,
    }
    params.update(merged_args)
    story_count = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                             snapshots_id=params['snapshots_id'],
                                             timespans_id=params['timespans_id'],
                                             foci_id=params['foci_id'],
                                             q=params['q'])
    # BUG FIX: log the numeric count, not the whole result dict
    logger.info("Total stories to download: {}".format(story_count['count']))
    if 'as_attachment' in params:
        del params['as_attachment']
    if 'fb_data' in params:
        del params['fb_data']
    if 'q' in params:
        # BUG FIX: the old code tested the literal string 'q' against the sentinel
        # list (always True), making this normalization a no-op; test the value
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics
    # determine which props the user actually wants to download
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'themes', 'subtopics',
        'inlink_count', 'facebook_share_count',
        # removed media metadata here because it takes too long to query for it
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country',
        # 'media_media_type'
    ]
    if has_twitter_data:
        props.append('simple_tweet_count')
    if include_fb_date:
        props.append('facebook_collection_date')
    props += ['outlink_count', 'media_inlink_count', 'media_id', 'media_name', 'media_url']
    if include_fb_date:
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        local_mc = user_admin_mediacloud_client()
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topics_id, limit=100, link_id=link_id)
            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False
        # now iterate through each list and set up the fb collection date
        # NOTE(review): all_stories is always empty at this point, so these dates are
        # never attached to anything — looks like dead code; confirm before relying on it
        for s in all_stories:
            for fb_item in all_fb_count:
                if int(fb_item['stories_id']) == int(s['stories_id']):
                    s['facebook_collection_date'] = fb_item['facebook_api_collect_date']
    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)
def stream_story_list_csv(user_key, topic, **kwargs):
    """Stream all stories in a topic to the client as a CSV attachment download.

    :param user_key: media cloud API key used to page through the stories
    :param topic: topic dict (uses 'name', 'topics_id', 'ch_monitor_id')
    Optional kwargs: media_metadata, story_tags, reddit_submissions, fb_data,
    as_attachment, plus any topicStoryList filter arguments.
    """
    filename = topic['name'] + '-stories'
    has_twitter_data = (topic['ch_monitor_id'] is not None) and (topic['ch_monitor_id'] != 0)
    # as_attachment = kwargs['as_attachment'] if 'as_attachment' in kwargs else True
    include_media_metadata = ('media_metadata' in kwargs) and (kwargs['media_metadata'] is True)
    include_story_tags = ('story_tags' in kwargs) and (kwargs['story_tags'] is True)
    include_reddit_submissions = ('reddit_submissions' in kwargs) and \
        (kwargs['reddit_submissions'] is True)
    include_fb_date = kwargs['fb_data'] if 'fb_data' in kwargs else False
    all_stories = []
    params = kwargs.copy()
    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
        'q': request.args['q'] if 'q' in request.args else None,
        'sort': request.args['sort'] if 'sort' in request.args else None,
    }
    params.update(merged_args)
    story_count = apicache.topic_story_count(user_mediacloud_key(), topic['topics_id'],
                                             snapshots_id=params['snapshots_id'],
                                             timespans_id=params['timespans_id'],
                                             foci_id=params['foci_id'],
                                             q=params['q'])
    logger.info("Total stories to download: {}".format(story_count['count']))
    if 'as_attachment' in params:
        del params['as_attachment']
    if 'fb_data' in params:
        del params['fb_data']
    if 'q' in params:
        # BUG FIX: the old code tested the literal string 'q' against the sentinel
        # list (always True), making this normalization a no-op; test the value
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics
    # determine which props the user actually wants to download
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'inlink_count', 'facebook_share_count',
    ]
    if has_twitter_data:
        props.append('simple_tweet_count')
    if include_reddit_submissions:
        props.append('reddit_submissions')
    if include_fb_date:
        props.append('facebook_collection_date')
    if include_story_tags:
        props += ['themes', 'subtopics']
    props += ['outlink_count', 'media_inlink_count', 'media_id', 'media_name', 'media_url']
    if include_media_metadata:
        props += ['media_pub_country', 'media_pub_state', 'media_language',
                  'media_about_country', 'media_media_type']
    if include_fb_date:
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        local_mc = user_admin_mediacloud_client()
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topic['topics_id'], limit=100,
                                                          link_id=link_id)
            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False
        # now iterate through each list and set up the fb collection date
        # NOTE(review): all_stories is always empty at this point, so these dates are
        # never attached to anything — looks like dead code; confirm before relying on it
        for s in all_stories:
            for fb_item in all_fb_count:
                if int(fb_item['stories_id']) == int(s['stories_id']):
                    s['facebook_collection_date'] = fb_item['facebook_api_collect_date']
    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_list_by_page_as_csv_row(user_key, topic['topics_id'], props,
                                                         **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)