def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20  # args come in as strings
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))
def collection_update(collection_id):
    user_mc = user_admin_mediacloud_client()
    label = request.form['name']
    description = request.form['description']
    static = request.form['static'] if 'static' in request.form else None
    show_on_stories = request.form['showOnStories'] if 'showOnStories' in request.form else None
    show_on_media = request.form['showOnMedia'] if 'showOnMedia' in request.form else None
    formatted_name = format_name_from_label(label)
    source_ids = []
    if len(request.form['sources[]']) > 0:
        source_ids = [int(sid) for sid in request.form['sources[]'].split(',')]
    # first update the collection
    updated_collection = user_mc.updateTag(collection_id, formatted_name, label, description,
                                           is_static=(static == 'true'),
                                           show_on_stories=(show_on_stories == 'true'),
                                           show_on_media=(show_on_media == 'true'))
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id']) for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remove = list(set(existing_source_ids) - set(source_ids))
    source_ids_to_add = [sid for sid in source_ids if sid not in existing_source_ids]
    # logger.debug(existing_source_ids)
    # logger.debug(source_ids_to_add)
    # logger.debug(source_ids_to_remove)
    # then go through and tag all the sources specified with the new collection id
    tags_to_add = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD) for sid in source_ids_to_add]
    tags_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE) for sid in source_ids_to_remove]
    tags = tags_to_add + tags_to_remove
    if len(tags) > 0:
        user_mc.tagMedia(tags)
    apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(updated_collection['tag'])
def get_top_themes_by_story_tag_counts(topics_id, num_themes):
    nyt_counts = []
    # get the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    # get the top themes by story counts within the overall timespan
    top_nyt_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, NYT_LABELS_TAG_SET_ID,
                                            TAG_COUNT_SAMPLE_SIZE, timespan_query)
    # get the total stories for the topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    top_nyt_tags = top_nyt_tags[:num_themes]
    # for each theme, set up the requisite info for the UI
    for tag in top_nyt_tags:
        nyt_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # story tag count / total story count for the topic
        })
    return nyt_counts
def topic_words(topics_id):
    sample_size = request.args['sample_size'] if 'sample_size' in request.args else WORD_COUNT_SAMPLE_SIZE
    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY, topics_id, sample_size=sample_size,
                                             snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if is_user_logged_in() and ('withTotals' in request.args) and (request.args['withTotals'] == "true"):
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size,
                                            timespans_id=overall_timespan['timespans_id'], foci_id=None, q=None)
    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size)
    }
    return jsonify(response)
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    tag_country_counts = []
    # get the total stories for the topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # get the top countries by story tag counts within the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, GEO_TAG_SET, GEO_SAMPLE_SIZE,
                                            timespan_query)
    # make sure the geo tag is in the geo_tags whitelist (i.e. is a country)
    country_tag_counts = [r for r in top_geo_tags
                          if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3]
    country_tag_counts = country_tag_counts[:num_countries]
    # for each country, set up the requisite info for the UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # story tag count / total story count for the topic
        })
    return tag_country_counts
def media_type_coverage(topics_id):
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total story count
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # count the stories in any media source tagged with a media_type tag
    tags_ids = " ".join(str(tag['tags_id']) for tag in media_type_tags)
    query_clause = "tags_id_media:({})".format(tags_ids)
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
def _public_safe_topic_story_count(topics_id, q):
    if access_public_topic(topics_id):
        total = apicache.topic_story_count(TOOL_API_KEY, topics_id, q=apicache.add_to_user_query(None))
        # force a count with just the query
        matching = apicache.topic_story_count(TOOL_API_KEY, topics_id, q=apicache.add_to_user_query(q))
    elif is_user_logged_in():
        total = apicache.topic_story_count(user_mediacloud_key(), topics_id, q=apicache.add_to_user_query(None))
        # force a count with just the query
        matching = apicache.topic_story_count(user_mediacloud_key(), topics_id, q=apicache.add_to_user_query(q))
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify({'counts': {'count': matching['count'], 'total': total['count']}})
def api_collection_details(collection_id):
    add_in_sources = False
    if ('getSources' in request.args) and (request.args['getSources'] == 'true'):
        add_in_sources = True
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    if add_in_sources:
        media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
        info['sources'] = media_in_collection
    analytics_db.increment_count(analytics_db.TYPE_COLLECTION, collection_id,
                                 analytics_db.ACTION_SOURCE_MGR_VIEW)
    return jsonify({'results': info})
def api_collection_source_representation_csv(collection_id):
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    source_representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id)
    props = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(source_representation, props, filename)
def collection_wordcount_csv(collection_id):
    solr_q = 'tags_id_media:' + str(collection_id)
    solr_fq = None
    # add in the publish_date clause if there is one
    if ('q' in request.args) and (len(request.args['q']) > 0):
        solr_fq = request.args['q']
    return stream_wordcount_csv(user_mediacloud_key(), 'wordcounts-Collection-' + collection_id, solr_q, solr_fq)
def api_collection_sources_csv(collection_id):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)  # not cached because props can change often
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    file_prefix = "Collection {} ({}) - sources ".format(collection_id, collection['tag'])
    properties_to_include = SOURCE_LIST_CSV_EDIT_PROPS
    return csv.download_media_csv(all_media, file_prefix, properties_to_include)
def api_metadata_values(tag_sets_id):
    '''
    Source metadata is encoded in various tag sets - this returns the set and the list of available
    tags you can use
    '''
    data = tags_in_tag_set(user_mediacloud_key(), tag_sets_id, False, True)  # use the file-based cache here
    return jsonify(data)
def _find_overall_timespan(topics_id, snapshots_id):
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    for timespan in selected_snapshot_timespans:
        if timespan['period'] == 'overall':
            return timespan
    raise RuntimeError('Missing overall timespan in snapshot {} (topic {})!'.format(snapshots_id, topics_id))
def media(topics_id, media_id):
    user_mc = user_admin_mediacloud_client()
    combined_media_info = apicache.topic_media_list(user_mediacloud_key(), topics_id, media_id=media_id)['media'][0]
    media_info = user_mc.media(media_id)
    for key in list(media_info.keys()):
        if key not in combined_media_info:
            combined_media_info[key] = media_info[key]
    return jsonify(combined_media_info)
def media_words_csv(topics_id, media_id):
    query = apicache.add_to_user_query('media_id:' + media_id)
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size=ngram_size, q=query,
                                              num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS,
                                              sample_size=WORD_COUNT_DOWNLOAD_SAMPLE_SIZE)
    return csv.stream_response(word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-media-{}-sampled-ngrams-{}-word'.format(topics_id, media_id, ngram_size))
def media_type_story_counts(topics_id):
    tag_story_counts = []
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total story count
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # make a count for each tag, based on media_id
    for tag in media_type_tags:
        query_clause = "tags_id_media:{}".format(tag['tags_id'])
        tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=query_clause)['count']
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            'pct': float(tagged_story_count) / float(total_stories)
        })
    return jsonify({'story_counts': tag_story_counts})
def _collection_source_story_split_historical_counts(collection_id):
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster
    pool = Pool(processes=HISTORICAL_COUNT_POOL_SIZE)
    results = pool.map(_source_story_split_count_worker, jobs)  # blocks until they are all done
    pool.terminate()  # extra safe garbage collection
    return results
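# A minimal sketch of the worker the pool above maps over, assuming it mirrors
# _source_story_split_count_job defined later in this module; the body is an assumption,
# not the confirmed implementation.
def _source_story_split_count_worker(job):
    source = job['media']  # each job dict wraps one media source
    q = "media_id:{}".format(source['media_id'])
    # assumed: fetch 360 days of split story counts for this one source
    return apicache.split_story_count(user_mediacloud_key(), q, 360)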
def topic_media(topics_id):
    if access_public_topic(topics_id):
        media_list = topic_media_list(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None, foci_id=None,
                                      sort=None, limit=None, link_id=None)
    elif is_user_logged_in():
        media_list = topic_media_list(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify(media_list)
def cached_media_tags(tag_sets_id):
    partisanship_tags = cached_tags_in_tag_set(tag_sets_id)
    for tag in partisanship_tags:
        media = media_with_tag(user_mediacloud_key(), tag['tags_id'], True)  # cache this list
        media_ids = [str(m['media_id']) for m in media]  # as strs so we can concat into a query str later with .join
        tag['media'] = media
        tag['media_ids'] = media_ids
        tag['media_query'] = "media_id:({})".format(" ".join(media_ids))
    return partisanship_tags
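# Illustrative usage only (the tag-set constant is an assumption borrowed from elsewhere in this
# module): each tag's prebuilt 'media_query' can be passed straight into a story-count call.
#   for tag in cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016):
#       count = topic_story_count(user_mediacloud_key(), topics_id, q=tag['media_query'])['count']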
def media_source_words(media_id):
    solr_q = 'media_id:' + str(media_id)
    solr_fq = None
    if ('q' in request.args) and (len(request.args['q']) > 0):
        solr_fq = request.args['q']
    info = {
        'wordcounts': word_count(user_mediacloud_key(), solr_q, solr_fq)
    }
    return jsonify({'results': info})
def remove_sources_from_collection(collection_id):
    source_ids_to_remove = [int(s) for s in request.form['sources[]'].split(',')]
    user_mc = user_admin_mediacloud_client()
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id']) for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remain = list(set(existing_source_ids) - set(source_ids_to_remove))
    media_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE)
                       for sid in source_ids_to_remove]
    media_to_remain = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD)
                       for sid in source_ids_to_remain]  # do I need to run similar for TAG_ACTION_REMOVE?
    current_media = media_to_remove + media_to_remain
    results = {}  # guard so we don't return an unbound name when there is nothing to tag
    if len(current_media) > 0:
        results = user_mc.tagMedia(current_media)
    apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(results)
def topic_split_story_count(topics_id):
    if access_public_topic(topics_id):
        results = apicache.topic_split_story_counts(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None,
                                                    foci_id=None, q=None)
    elif is_user_logged_in():
        results = apicache.topic_split_story_counts(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify({'results': results})
def api_metadata_values(tag_sets_id):
    '''
    Source metadata is encoded in various tag sets - this returns the set and the list of available
    tags you can use
    '''
    data = tags_in_tag_set(user_mediacloud_key(), tag_sets_id, False, True)  # use the file-based cache here
    data['short_list'] = get_metadata_defaults(tag_sets_id)
    return jsonify(data)
def api_collection_sources(collection_id):
    int(collection_id)  # validation: raises a ValueError if the id isn't numeric
    results = {'tags_id': collection_id}
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    results['sources'] = media_in_collection
    return jsonify(results)
def _cached_tag_coverage_pct(query, tag_sets_id):
    user_mc = user_mediacloud_client()
    story_count = source_story_count(user_mediacloud_key(), query)
    tagged_story_counts = user_mc.storyTagCount(solr_query=query, tag_sets_id=tag_sets_id)
    # sum tagged articles because there are different versions
    tagged_sum = sum([tag_info['count'] for tag_info in tagged_story_counts])
    # compute coverage ratio (protect against div by zero)
    ratio = float(tagged_sum) / float(story_count) if story_count > 0 else 0
    return ratio
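# Worked example for the ratio above: 150 tagged stories out of a story_count of 200 gives
# 150 / 200 = 0.75 coverage; a story_count of 0 short-circuits to 0 instead of raising.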
def media_stories(topics_id, media_id):
    sort = validated_sort(request.args.get('sort'))
    limit = request.args.get('limit')
    stories = apicache.topic_story_list(user_mediacloud_key(), topics_id, media_id=media_id, sort=sort, limit=limit)
    return jsonify(stories)
def topic_timespan_list(topics_id, snapshots_id):
    ignored_snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id, foci_id)
    # add the focal_set type to the timespan so we can use that in the client (ie. decide what to show or not
    # based on what type of focal_set this timespan is part of)
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    for t in timespans:
        for fs in focal_sets:
            for f in fs['foci']:
                if f['foci_id'] == t['foci_id']:
                    t['focal_set'] = fs
                    t['focus'] = f
                    break
    return jsonify({'list': timespans})
def api_collection_set(tag_sets_id):
    '''
    Return a list of all the (public only or public and private, depending on user role) collections
    in a tag set. Not cached because this can change, and load time isn't terrible.
    :param tag_sets_id: the tag set to query for public collections
    :return: dict of info and list of collections in it
    '''
    if user_has_auth_role(ROLE_MEDIA_EDIT):
        info = apicache.tag_set_with_private_collections(user_mediacloud_key(), tag_sets_id)
    else:
        info = apicache.tag_set_with_public_collections(user_mediacloud_key(), tag_sets_id)
    add_user_favorite_flag_to_collections(info['tags'])
    # rename to make more sense here
    info['collections'] = info['tags']
    del info['tags']
    return jsonify(info)
def collection_source_split_stories(collection_id):
    collections_query = "tags_id_media:{}".format(collection_id)
    # optionally exclude spidered stories (story tag 8875452) from the partial count
    exclude_spidered_stories = " tags_id_media:{} AND NOT tags_id_stories:{}".format(str(collection_id), 8875452) \
        if 'separate_spidered' in request.args else collections_query
    interval = 'day'  # default, and not currently passed to the calls below
    all_results = apicache.last_year_split_story_count(user_mediacloud_key(), collections_query)
    # same as all_results if request.args doesn't ask to exclude spidered stories
    non_spidered_results = apicache.last_year_split_story_count(user_mediacloud_key(), exclude_spidered_stories)
    all_stories = {
        'total_story_count': all_results['total_story_count'],
        'list': all_results['counts'],
    }
    partial_stories = {
        'total_story_count': non_spidered_results['total_story_count'],
        'list': non_spidered_results['counts'],
    }
    return jsonify({'results': {'all_stories': all_stories, 'partial_stories': partial_stories,
                                'interval': interval}})
def api_collection_set(tag_sets_id):
    """
    Return a list of all the (public only or public and private, depending on user role) collections
    in a tag set. Not cached because this can change, and load time isn't terrible.
    :param tag_sets_id: the tag set to query for public collections
    :return: dict of info and list of collections in it
    """
    if user_has_auth_role(ROLE_MEDIA_EDIT):
        info = apicache.tag_set_with_private_collections(user_mediacloud_key(), tag_sets_id)
    else:
        info = apicache.tag_set_with_public_collections(user_mediacloud_key(), tag_sets_id)
    add_user_favorite_flag_to_collections(info['tags'])
    # rename to make more sense here, sorted by label (falling back to tag name)
    for t in info['tags']:
        t['sort_key'] = t['label'] if t['label'] else t['tag']
    info['collections'] = sorted(info['tags'], key=itemgetter('sort_key'))
    del info['tags']
    return jsonify(info)
def api_collections_by_ids():
    collection_ids = request.args['coll[]'].split(',')
    sources_list = []
    for tags_id in collection_ids:
        all_media = media_with_tag(user_mediacloud_key(), tags_id)
        info = [{'media_id': m['media_id'], 'name': m['name'], 'url': m['url'],
                 'public_notes': m['public_notes']} for m in all_media]
        add_user_favorite_flag_to_sources(info)
        sources_list += info
    return jsonify({'results': sources_list})
def topic_stories(topics_id):
    if access_public_topic(topics_id):
        stories = topic_story_list(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None, foci_id=None,
                                   q=None)
    elif is_user_logged_in():
        stories = topic_story_list(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify(stories)
def topic_stories_csv(topics_id):
    as_attachment = True
    fb_data = False
    if 'attach' in request.args:
        as_attachment = int(request.args['attach']) == 1  # args come in as strings, so compare as ints
    if 'fbData' in request.args:
        fb_data = int(request.args['fbData']) == 1
    user_mc = user_admin_mediacloud_client()
    topic = user_mc.topic(topics_id)
    return stream_story_list_csv(user_mediacloud_key(), topic['name'] + '-stories', topics_id,
                                 as_attachment=as_attachment, fb_data=fb_data)
def retweet_partisanship_story_counts(topics_id):
    # TODO: add in overall timespan id here so it works in different snapshots
    tag_story_counts = []
    partisanship_tags = _cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # grab the total story count
    try:
        total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    except mediacloud.error.MCException:
        total_stories = 0
    # make a count for each tag
    for tag in partisanship_tags:
        try:
            tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=tag['query'])['count']
            pct = float(tagged_story_count) / float(total_stories)
        except (ZeroDivisionError, mediacloud.error.MCException):
            tagged_story_count = 0
            pct = 0
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            'pct': pct
        })
    # order them the way a person would expect (left to center to right)
    ordered_tag_story_counts = [
        [t for t in tag_story_counts if t['tags_id'] == 9360520][0],
        [t for t in tag_story_counts if t['tags_id'] == 9360521][0],
        [t for t in tag_story_counts if t['tags_id'] == 9360522][0],
        [t for t in tag_story_counts if t['tags_id'] == 9360523][0],
        [t for t in tag_story_counts if t['tags_id'] == 9360524][0],
    ]
    return jsonify({'story_counts': ordered_tag_story_counts})
def base_snapshot_timespan(topics_id):
    # find the timespan matching this one in the base snapshot (ie. with no foci_id)
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    base_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                  snapshots_id=snapshots_id, foci_id=None)
    timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)  # the selected timespan
    for t in base_snapshot_timespans:
        if apicache.is_timespans_match(timespan, t):
            return t
    raise ValueError("Can't find a timespan in the base snapshot matching the one specified")
def topic_media(topics_id):
    if access_public_topic(topics_id):
        media_list = apicache.topic_media_list(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None,
                                               foci_id=None, sort=None, limit=None, link_id=None)
    elif is_user_logged_in():
        media_list = apicache.topic_media_list(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify(media_list)
def story_counts(topics_id):
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = topic_story_count(local_key, topics_id, timespans_id=None, q=None)
    filtered = topic_story_count(local_key, topics_id)  # force a count with just the query
    return jsonify({'counts': {'count': filtered['count'], 'total': total['count']}})
def url_sharing_focal_set(topics_id, snapshots_id):
    """
    Return the focal_set that is marked as the auto-generated "URL Sharing" one.
    :param topics_id:
    :param snapshots_id:
    :return: a focal set, or None if the topic doesn't have one
    """
    focal_sets = topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    url_sharing_focal_sets = [fs for fs in focal_sets if is_url_sharing_focal_set(fs)]
    return url_sharing_focal_sets[0] if len(url_sharing_focal_sets) > 0 else None
def story_counts(topics_id):
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = apicache.topic_story_count(local_key, topics_id, timespans_id=None, snapshots_id=None, q=None,
                                       foci_id=None)
    filtered = apicache.topic_story_count(local_key, topics_id)
    return jsonify({'counts': {'count': filtered['count'], 'total': total['count']}})
def api_collection_sources_csv(collection_id):
    user_mc = user_admin_mediacloud_client()
    # info = user_mc.tag(int(collection_id))
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    for src in all_media:
        for tag in src['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                format_metadata_fields(src, tag['tag_sets_id'], tag['tag'])
    file_prefix = "Collection_Sourcelist_Template_for_" + collection_id + "_"
    what_type_download = COLLECTIONS_TEMPLATE_PROPS_EDIT
    return csv.download_media_csv(all_media, file_prefix, what_type_download)
def api_media_source_split_stories(media_id):
    media_query = 'media_id:' + str(media_id)
    # optionally exclude spidered stories (story tag 8875452) from the partial count
    exclude_spidered_stories = " media_id:{} AND NOT tags_id_stories:{}".format(str(media_id), 8875452) \
        if 'separate_spidered' in request.args else media_query
    health = _cached_media_source_health(user_mediacloud_key(), media_id)
    all_results = apicache.last_year_split_story_count(user_mediacloud_key(), media_query)
    # same as all_results if request.args doesn't ask to exclude spidered stories
    non_spidered_results = apicache.last_year_split_story_count(user_mediacloud_key(), exclude_spidered_stories)
    all_stories = {
        'total_story_count': all_results['total_story_count'],
        'health': health,
        'list': all_results['counts'],
    }
    partial_stories = {
        'total_story_count': non_spidered_results['total_story_count'],
        'health': health,
        'list': non_spidered_results['counts'],
    }
    return jsonify({'results': {'all_stories': all_stories, 'partial_stories': partial_stories}})
def topic_media_csv(topics_id):
    sort = validated_sort(request.args.get('sort'))
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return _stream_media_list_csv(user_mediacloud_key(), 'media-for-topic-' + topics_id, topics_id, sort=sort,
                                  snapshots_id=snapshots_id, timespans_id=timespans_id, foci_id=foci_id, q=q)
def collection_source_split_stories(collection_id):
    q = "tags_id_media:{}".format(collection_id)
    results = apicache.last_year_split_story_count(user_mediacloud_key(), q)
    interval = 'day'  # default, and not currently passed to the call above
    return jsonify({
        'results': {
            'list': results['counts'],
            'total_story_count': results['total_story_count'],
            'interval': interval
        }
    })
def topic_focal_set_list(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    include_story_counts = request.args.get('includeStoryCounts')
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    # now mark the ones that are the magically added URL sharing platform ones
    for fs in focal_sets:
        fs['is_url_sharing'] = is_url_sharing_focal_set(fs)
    if include_story_counts and (include_story_counts == '1'):
        _add_story_counts_to_foci(topics_id, focal_sets)
    return jsonify(focal_sets)
def _generate_network_of_frames(topics_id, timespans_id, num_of_sources, out_name, top_media_sort,
                                remove_media_list=None, remove_word_list=None, generate_word_lists=False,
                                include_media_list=None, media_attribs=None, num_words=None):
    # avoid mutable default arguments
    if remove_media_list is None:
        remove_media_list = []
    if remove_word_list is None:
        remove_word_list = []
    # media_attribs can specify attributes on the media source that should be added to the node as attributes
    if include_media_list is None:
        media_sources_md = topic_media_list(user_mediacloud_key(), topics_id, timespans_id=timespans_id,
                                            limit=num_of_sources + len(remove_media_list),
                                            sort=top_media_sort)['media']
    else:
        media_sources_md = include_media_list
    for r in remove_media_list:
        media_sources_md = _remove_media_source(r, media_sources_md)
    top_words = _build_top_words(media_sources_md, topics_id, timespans_id, remove_word_list, num_words)
    top_words = _clean_top_words(top_words, remove_word_list)
    frame_network = _build_network(top_words, media_sources_md, media_attribs)
    _export_gexf_network(frame_network, '%s.gexf' % out_name)
    _export_d3_network(frame_network, '%s' % out_name)
    if generate_word_lists:
        with open('%s.txt' % out_name, 'w', encoding="utf-8") as wl:
            all_words = []
            media_sources = {ms['media_id']: ms['name'] for ms in media_sources_md}
            for ms in top_words:
                wl.write("\n\n{} (media id: {}):\n".format(media_sources[ms], ms))
                for w in top_words[ms]:
                    all_words.append(w['term'])
                    wl.write("- {} ({})\n".format(w['term'], w['count']))
                wl.write("\n")
    linefeed = chr(10)  # linefeed = \n
    s = linefeed.join(nx.generate_gexf(frame_network))
    return s
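# Illustrative only - a hypothetical call to the generator above; the ids, output name, and sort
# value are made up, and top_media_sort must be whatever sort keys topic_media_list accepts:
#   gexf_xml = _generate_network_of_frames(1234, 5678, num_of_sources=50, out_name='frame-network',
#                                          top_media_sort='inlink', generate_word_lists=True)
# This writes frame-network.gexf, a d3 JSON export, and (with generate_word_lists=True)
# frame-network.txt, then returns the GEXF XML as a string.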
def topic_tag_coverage(topics_id, tags_id):
    '''
    Useful for seeing how many stories in the topic are tagged with a specific tag
    '''
    if isinstance(tags_id, list):  # doesn't respect duck-typing, but quick fix
        tags_id_str = "({})".format(" ".join([str(tid) for tid in tags_id]))
    else:
        tags_id_str = str(tags_id)
    # respect any query filter the user has set
    query_with_tag = add_to_user_query("tags_id_stories:{}".format(tags_id_str))
    # now get the counts
    if access_public_topic(topics_id):
        total = topic_story_count(TOOL_API_KEY, topics_id)
        tagged = topic_story_count(TOOL_API_KEY, topics_id, q=query_with_tag)  # force a count with just the query
    elif is_user_logged_in():
        total = topic_story_count(user_mediacloud_key(), topics_id)
        tagged = topic_story_count(user_mediacloud_key(), topics_id, q=query_with_tag)  # force a count with just the query
    else:
        return None
    return {'counts': {'count': tagged['count'], 'total': total['count']}}
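# For illustration: a scalar tags_id of 9360520 yields q="tags_id_stories:9360520", while a list
# like [9360520, 9360521] yields q="tags_id_stories:(9360520 9360521)", which Solr treats as an OR
# across the tag ids (example ids borrowed from the partisanship tags used elsewhere in this module).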
def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info to find the matching timespan within each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, focal_sets_id)
    except ValueError as e:
        return json_error_response(str(e))  # Exception.message is gone in Python 3
    # collect the story split counts for each focus
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_split_story_counts(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id,
                                                 timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)
def topic_tag_coverage(topics_id, tags_id):
    '''
    Useful for seeing how many stories in the topic are tagged with a specific tag
    '''
    # respect any query filter the user has set
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    query_with_tag = add_to_user_query("tags_id_stories:{}".format(tags_id))
    # now get the counts
    if access_public_topic(topics_id):
        total = topic_story_count(TOOL_API_KEY, topics_id)
        tagged = topic_story_count(TOOL_API_KEY, topics_id, q=query_with_tag)  # force a count with just the query
    elif is_user_logged_in():
        total = topic_story_count(user_mediacloud_key(), topics_id)
        tagged = topic_story_count(user_mediacloud_key(), topics_id, q=query_with_tag)  # force a count with just the query
    else:
        return None
    return {'counts': {'count': tagged['count'], 'total': total['count']}}
def _source_story_split_count_job(info):
    source = info['media']
    q = "media_id:{}".format(source['media_id'])
    split_stories = apicache.split_story_count(user_mediacloud_key(), q, 360)
    source_data = {
        'media_id': source['media_id'],
        'media_name': source['name'],
        'media_url': source['url'],
        'total_story_count': split_stories['total_story_count'],
        'splits_over_time': split_stories['counts'],
    }
    return source_data
def get_topic_story_links_csv(topics_id):
    user_mc = user_mediacloud_client()
    topic = user_mc.topic(topics_id)
    # page through results for the timespan
    # note: props is built here but not currently passed along to the csv streamer below
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'themes', 'subtopics',
        'inlink_count', 'facebook_share_count', 'outlink_count', 'media_inlink_count',
        'media_id', 'media_name', 'media_url',
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    return stream_story_link_list_csv(user_mediacloud_key(), topic['name'] + '-stories', topics_id)