def create_nyt_theme_focal_set(topics_id):
    """Create a boolean-query subtopic set on a topic with one focus per posted NYT theme tag."""
    user_mc = user_mediacloud_client()
    # grab the focalSetName and focalSetDescription and then make one
    set_name = request.form['focalSetName']
    set_description = request.form['focalSetDescription']
    themes = json.loads(request.form['data[]'])
    technique = FOCAL_TECHNIQUE_BOOLEAN_QUERY  # is this right?
    created_set = user_mc.topicFocalSetDefinitionCreate(topics_id, set_name, set_description, technique)
    if 'focal_set_definitions_id' not in created_set:
        return json_error_response('Unable to create the subtopic set')
    # now make the foci in it - one for each country
    definitions_id = created_set['focal_set_definitions_id']
    for theme in themes:
        user_mc = user_mediacloud_client()
        user_mc.topicFocusDefinitionCreate(
            topics_id,
            name=theme['label'],
            description="Stories about {}".format(theme['label']),
            query="tags_id_stories:{}".format(theme['tags_id']),
            focal_set_definitions_id=definitions_id,
        )
    return {'success': True}
def media_search(search_str, tags_id=None, **kwargs):
    """Search media sources whose name matches search_str, optionally limited to a tag."""
    client = user_mediacloud_client()
    return client.mediaList(name_like=search_str, tags_id=tags_id,
                            rows=MAX_SOURCES, sort="num_stories", **kwargs)
def _cached_collection_source_representation(mc_api_key, collection_id, sample_size=1000, fq=''):
    """Estimate how much each source contributes to a collection by sampling random stories.

    Returns one dict per media source with its story count and story_pct of the
    sample, sorted ascending by story count.
    """
    # have to respect the api here here because only some folks can see private collections
    user_mc = user_mediacloud_client(mc_api_key)
    sampled_stories = user_mc.storyList('tags_id_media:{}'.format(collection_id), fq,
                                        rows=sample_size, sort=mc.SORT_RANDOM)
    by_media = {}
    for story in sampled_stories:
        media_id = story['media_id']
        if media_id not in by_media:
            by_media[media_id] = {
                'media_id': media_id,
                'media_name': story['media_name'],
                'media_url': story['media_url'],
                'sample_size': sample_size,
                'stories': 0,
            }
        by_media[media_id]['stories'] += 1
    for info in by_media.values():
        info['story_pct'] = float(info['stories']) / float(sample_size)
    return sorted(by_media.values(), key=lambda info: info['stories'])
def _mc_client(admin=False):
    """Return the logged-in user's client (admin variant on request), else the tool client."""
    if not is_user_logged_in():
        return mc
    return user_admin_mediacloud_client() if admin else user_mediacloud_client()
def _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size, query):
    """Count story tags matching the query.

    topics_id isn't needed here because the timespans_id is embedded in the
    query argument; user_mc_key is in the signature so the cache keys correctly.
    """
    client = user_mediacloud_client()
    return client.storyTagCount(query, tag_sets_id=tag_sets_id)
def get_topic_platforms(topics_id):
    """Return the available platform definitions for a topic, with any configured
    seed queries merged in.

    :return: JSON {'results': [platform dicts]}
    """
    user_mc = user_mediacloud_client()
    available_platforms = _available_platforms()
    topic = user_mc.topic(topics_id)
    # and add in the open web query, which isn't stored in topic_seed_queries for historical reasons :-(
    if topic_has_seed_query(topic):
        for item in available_platforms:
            if (item['platform'] == PLATFORM_OPEN_WEB) and (item['source'] == PLATFORM_SOURCE_MEDIA_CLOUD):
                # copy every field of the reconstructed web seed query onto this platform entry
                real_web_query = platform_for_web_seed_query(topic)
                for key in real_web_query:
                    item[key] = real_web_query[key]
                break
    # now fill in with any seed queries that have been created
    for seed_query in topic['topic_seed_queries']:
        # find the single platform entry matching this seed query's platform+source
        match = [p for p in available_platforms
                 if (p['platform'] == seed_query['platform']) and (p['source'] == seed_query['source'])]
        if len(match) == 1:
            match[0]['query'] = seed_query['query']
            match[0]['topic_seed_queries_id'] = seed_query['topic_seed_queries_id']
    return jsonify({'results': available_platforms})
def cached_media_with_tag_page(tags_id, max_media_id):
    """Fetch one 100-row page of media sources carrying a tag.

    We page rather than fetch the full list because memcache has a 1MB cache
    upper limit, and some of the collections have TONS of sources.
    """
    client = user_mediacloud_client()
    return client.mediaList(tags_id=tags_id, last_media_id=max_media_id, rows=100)
def story_counts_by_snapshot(topics_id):
    """Return, per snapshot of a topic, the total / spidered / seeded story counts.

    :return: JSON {snapshots_id: {'total': n, 'spidered': n, 'seeded': n}}
    """
    user_mc = user_mediacloud_client(user_mediacloud_key())
    snapshots = user_mc.topicSnapshotList(topics_id)
    counts = {}
    for s in snapshots:
        # get the count of stories in the overall timespan for this snapshot
        timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=s['snapshots_id'], foci_id=None)
        try:
            total = timespans[0]['story_count']
        except mediacloud.error.MCException:
            total = 0
        except IndexError:
            # this doesn't have any snapshots (ie. it failed to generate correctly)
            total = 0
        # search by tag to find out how many stories were spidered
        spidered = 0
        try:
            spidered = apicache.topic_story_count(
                user_mediacloud_key(), topics_id,
                snapshots_id=s['snapshots_id'], foci_id=None,
                timespans_id=timespans[0]['timespans_id'],
                q="* AND tags_id_stories:{}".format(TAG_SPIDERED_STORY))['count']
        except mediacloud.error.MCException:
            spidered = 0
        except IndexError:
            # no timespans for this snapshot; leave spidered at 0
            # (bug fix: this branch previously reset `total` instead of `spidered`,
            # clobbering the total computed above)
            spidered = 0
        seeded = total - spidered
        counts[s['snapshots_id']] = {'total': total, 'spidered': spidered, 'seeded': seeded}
    return jsonify(counts)
def get_topic_media_links_csv(topics_id):
    """Stream a CSV of the media links for a topic (paged through internally)."""
    client = user_mediacloud_client()
    topic = client.topic(topics_id)
    return stream_media_link_list_csv(user_mediacloud_key(), topic['name'] + '-stories', topics_id)
def api_collection_source_representation_csv(collection_id):
    """Stream the per-source representation breakdown of a collection as a CSV download."""
    client = user_mediacloud_client()
    tag_info = client.tag(collection_id)
    representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id)
    columns = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = tag_info['label'] + "-source sentence counts.csv"
    return csv.stream_response(representation, columns, filename)
def api_collection_sources_csv(collection_id):
    """Download the full source list of a collection as a CSV file."""
    client = user_mediacloud_client()
    tag_info = client.tag(collection_id)
    # not cached because props can change often
    sources = media_with_tag(user_mediacloud_key(), collection_id)
    prefix = "Collection {} ({}) - sources ".format(collection_id, tag_info['tag'])
    return csv.download_media_csv(sources, prefix, SOURCE_LIST_CSV_EDIT_PROPS)
def mc_client(admin=False):
    """Return the user's client handler (admin variant on request), or the tool client if not logged in."""
    if is_user_logged_in():
        return user_admin_mediacloud_client() if admin else user_mediacloud_client()
    return mc
def topic_favorites():
    """Return the logged-in user's favorite topics, each flagged isFavorite."""
    client = user_mediacloud_client()
    favorite_ids = user_db.get_users_lists(user_name(), 'favoriteTopics')
    topics = []
    for tid in favorite_ids:
        topic = client.topic(tid)
        topic['isFavorite'] = True
        topics.append(topic)
    return jsonify({'topics': topics})
def api_collection_sources_csv(collection_id):
    """Serve a CSV listing every media source in the given collection."""
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    all_media = media_with_tag(user_mediacloud_key(), collection_id)  # not cached because props can change often
    file_prefix = "Collection {} ({}) - sources ".format(collection_id, collection['tag'])
    return csv.download_media_csv(all_media, file_prefix, SOURCE_LIST_CSV_EDIT_PROPS)
def add_retweet_partisanship_to_topic(topics_id, focal_set_name, focal_set_description):
    """Create a retweet-partisanship focal set on a topic, with one boolean-query
    focus per 2016-election partisanship quintile tag.

    :return: {'success': True} on success, or a JSON error response on failure
    """
    user_mc = user_mediacloud_client()
    focal_technique = FOCAL_TECHNIQUE_BOOLEAN_QUERY
    new_focal_set = user_mc.topicFocalSetDefinitionCreate(
        topics_id, focal_set_name, focal_set_description, focal_technique)
    if 'focal_set_definitions_id' not in new_focal_set:
        return json_error_response('Unable to create the subtopic set')
    # now make the foci in it - one for each partisanship quintile
    partisanship_tags = _cached_media_tags(TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    for tag in partisanship_tags:
        name = tag['label']
        description = "Media sources that were retweeted more often during the 2016 US election " \
                      "season by people on the {}".format(tag['label'])
        query = tag['query']
        focal_set_definitions_id = new_focal_set['focal_set_definitions_id']
        # create a new boolean query subtopic based on the tag sets
        new_focus = user_mc.topicFocusDefinitionCreate(
            topics_id, name=name, description=description, query=query,
            focal_set_definitions_id=focal_set_definitions_id)
        # bail out on the first focus that fails to create
        if (len(new_focus) == 0) or ('focus_definitions_id' not in new_focus[0]):
            return json_error_response('Unable to create the {} subtopic'.format(name))
    return {'success': True}
def _cached_topic_story_list(user_mc_key, topics_id, **kwargs):
    """
    Internal helper - don't call this; call topic_story_list instead. This needs
    user_mc_key in the function signature to make sure the caching is keyed correctly.
    """
    return user_mediacloud_client(user_mc_key).topicStoryList(topics_id, **kwargs)
def topic_focus_definition_update_or_create(topics_id):
    """Update an existing focus definition (when foci_id is posted) or create a
    new one, first creating its focal set when the placeholder id is posted.

    :return: JSON of the updated/created focus definition
    """
    user_mc = user_mediacloud_client()
    name = request.form['focusName']
    description = request.form['focusDescription']
    query = request.form['keywords']
    # update if it has an id, create if new
    if 'foci_id' in request.form:
        # you can't change the focal set a focus is in
        foci_id = request.form['foci_id']
        focus = user_mc.topicFocusDefinitionUpdate(topics_id, foci_id, name=name,
                                                   description=description, query=query)
    else:
        # if new focal set, then create that first
        # bug fix: compare ints with == (the original used `is`, whose result for
        # arbitrary ints is implementation-dependent)
        if int(request.form['focalSetDefinitionId']) == NEW_FOCAL_SET_PLACEHOLDER_ID:
            # bug fix: use separate variables for the focal set's name/description
            # instead of clobbering the focus's name/description (matches the
            # corrected sibling implementation of this endpoint)
            fs_name = request.form['focalSetName']
            fs_description = request.form['focalSetDescription']
            focal_technique = request.form['focalTechnique']
            new_focal_set = user_mc.topicFocalSetDefinitionCreate(topics_id, fs_name, fs_description,
                                                                  focal_technique)
            focal_set_definitions_id = new_focal_set['focal_set_definitions_id']
        else:
            focal_set_definitions_id = request.form['focalSetDefinitionId']
        # create focus, pointing at focal set
        focus = user_mc.topicFocusDefinitionCreate(topics_id, name=name, description=description,
                                                   query=query,
                                                   focal_set_definitions_id=focal_set_definitions_id)
    return jsonify(focus)
def _topic_snapshot_list(topic):
    """Assemble snapshot metadata for a topic: the sorted snapshot list, the
    snapshot-generation job status, and the latest version number.

    Works for public topics (tool client) or logged-in users; returns {} otherwise.
    Side effect: sets topic['seed_query_story_count'].
    """
    if access_public_topic(topic['topics_id']):
        local_mc = mc
        api_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_mc = user_mediacloud_client()
        api_key = user_mediacloud_key()
    else:
        return {}  # prob something smarter we can do here
    snapshots = local_mc.topicSnapshotList(topic['topics_id'])
    snapshots = sorted(snapshots, key=itemgetter('snapshots_id'))
    # add in any missing version numbers
    for idx in range(0, len(snapshots)):
        if snapshots[idx]['note'] in [None, '']:
            snapshots[idx]['note'] = idx + ARRAY_BASE_ONE
    # seed_query story count
    topic['seed_query_story_count'] = _topic_seed_story_count(topic)
    # add foci_count for display
    snapshots = _add_snapshot_foci_count(api_key, topic['topics_id'], snapshots)
    snapshots = sorted(snapshots, key=lambda d: d['snapshot_date'])
    # extra stuff
    snapshot_status = mc.topicSnapshotGenerateStatus(topic['topics_id'])['job_states']  # need to know if one is running
    latest = snapshots[-1] if len(snapshots) > 0 else None
    return {
        'list': snapshots,
        'jobStatus': snapshot_status,
        'latestVersion': latest['note'] if latest else 1,
    }
def topic_update_platform(topics_id, platform_id):
    """Update one platform's seed query on a topic.

    Open-web updates edit the topic directly; other platforms are 'updated' by
    removing the existing seed query and adding a new one.

    :return: dict with 'success' flag and the (possibly new) platform 'id'
    """
    user_mc = user_mediacloud_client()
    channel = request.form['platform_channel'] if 'platform_channel' in request.form else None
    source = request.form['platform_source'] if 'platform_source' in request.form else None
    query = request.form['platform_query'] if 'platform_query' in request.form else None
    platform = request.form['platform_type']
    result = {}
    if platform == PLATFORM_OPEN_WEB:
        # here we need to parse the sources and collections out of the 'channel'
        sources, collections = parse_open_web_media_from_channel(channel)
        user_mc.topicUpdate(topics_id, media_ids=sources, media_tags_ids=collections,
                            solr_seed_query=query)
        result['success'] = 1
        result['id'] = platform_id  # web_shim_ui
    else:
        result = user_mc.topicRemoveSeedQuery(topics_id, topic_seed_queries_id=platform_id)
        # Fake an update operation here by removing and then adding again
        if platform == PLATFORM_REDDIT:
            # TODO update this merge with correct info from Jason/Pushshift library
            query = "{} AND {}".format(query, channel)
        result = user_mc.topicAddSeedQuery(topics_id, platform, source, query)
        if 'topic_seed_query' in result:
            result['success'] = 1
            result['id'] = result['topic_seed_query']['topic_seed_queries_id']
        else:
            # bug fix: the original indexed result['topic_seed_query'] unconditionally,
            # raising KeyError on failure instead of reporting success=0
            result['success'] = 0
    return result  # topic_seed_queries_id
def favorite_collections():
    """Return the user's favorited collections, each flagged isFavorite."""
    client = user_mediacloud_client()
    favorite_ids = db.get_users_lists(user_name(), 'favoriteCollections')
    collections = []
    for tag_id in favorite_ids:
        coll = client.tag(tag_id)
        coll['isFavorite'] = True
        collections.append(coll)
    return jsonify({'list': collections})
def topic_focus_definition_update_or_create(topics_id):
    """Update a focus definition when foci_id is posted; otherwise create one,
    first creating a new focal set when the placeholder id is posted.

    :return: JSON of the updated/created focus definition
    """
    user_mc = user_mediacloud_client()
    name = request.form['focusName']
    description = request.form['focusDescription']
    query = request.form['keywords']
    # update if it has an id, create if new
    if 'foci_id' in request.form:
        # you can't change the focal set a focus is in
        foci_id = request.form['foci_id']
        focus = user_mc.topicFocusDefinitionUpdate(topics_id, foci_id, name=name,
                                                   description=description, query=query)
    else:
        # if new focal set, then create that first
        # bug fix: compare ints with == — the original used `is`, whose result for
        # arbitrary ints is implementation-dependent (only small ints are interned)
        if int(request.form['focalSetDefinitionId']) == NEW_FOCAL_SET_PLACEHOLDER_ID:
            fs_name = request.form['focalSetName']
            fs_description = request.form['focalSetDescription']
            focal_technique = request.form['focalTechnique']
            new_focal_set = user_mc.topicFocalSetDefinitionCreate(topics_id, fs_name, fs_description,
                                                                  focal_technique)
            focal_set_definitions_id = new_focal_set['focal_set_definitions_id']
        else:
            focal_set_definitions_id = request.form['focalSetDefinitionId']
        # create focus, pointing at focal set
        focus = user_mc.topicFocusDefinitionCreate(topics_id, name=name, description=description,
                                                   query=query,
                                                   focal_set_definitions_id=focal_set_definitions_id)
    return jsonify(focus)
def favorite_sources():
    """Return the user's favorited media sources, each flagged isFavorite."""
    client = user_mediacloud_client()
    favorite_ids = db.get_users_lists(user_name(), 'favoriteSources')
    sources = []
    for media_id in favorite_ids:
        src = client.media(media_id)
        src['isFavorite'] = True
        sources.append(src)
    return jsonify({'list': sources})
def cached_topic_timespan_list(topics_id, snapshots_id=None, foci_id=None):
    """List the timespans of a topic snapshot, optionally restricted to a focus."""
    client = user_mediacloud_client()
    return client.topicTimespanList(topics_id, snapshots_id=snapshots_id, foci_id=foci_id)
def _topic_snapshot_list(topic):
    """Assemble the snapshot list for a topic: sorted, with version numbers filled
    in, per-snapshot platform seed queries formatted, foci counts added, plus the
    generation job status and the latest version number.

    Side effect: sets topic['seed_query_story_count'].
    """
    local_mc = user_mediacloud_client()
    api_key = user_mediacloud_key()
    snapshots = local_mc.topicSnapshotList(topic['topics_id'])
    snapshots = sorted(snapshots, key=itemgetter('snapshots_id'))
    # add in any missing version numbers
    for idx in range(0, len(snapshots)):
        if snapshots[idx]['note'] in [None, '']:
            snapshots[idx]['note'] = idx + ARRAY_BASE_ONE
    # format any web seed queries as platforms objects
    for s in snapshots:
        platforms = []
        if (s['seed_queries'] is not None) and ('topic' in s['seed_queries']):
            # snapshot carries its own seed queries: include the web one plus the rest
            p = platform_for_web_seed_query(s['seed_queries'])
            platforms.append(p)
            platforms += s['seed_queries']['topic_seed_queries']
        else:
            # fall back to the topic-level web seed query, if the topic has one
            if topic_has_seed_query(topic):
                p = platform_for_web_seed_query(topic)
                platforms.append(p)
        s['platform_seed_queries'] = platforms
    # add foci_count for display
    snapshots = _add_snapshot_foci_count(api_key, topic['topics_id'], snapshots)
    snapshots = sorted(snapshots, key=lambda d: d['snapshot_date'])
    # extra stuff
    snapshot_status = mc.topicSnapshotGenerateStatus(topic['topics_id'])['job_states']  # need to know if one is running
    latest = snapshots[-1] if len(snapshots) > 0 else None
    topic['seed_query_story_count'] = _topic_seed_story_count(topic)
    return {
        'list': snapshots,
        'jobStatus': snapshot_status,
        'latestVersion': latest['note'] if latest else 1,
    }
def topic_create():
    """Create a topic from posted form fields and return its summary.

    Seeds with the US Top Online collection (see HACK note below).
    :return: topic summary JSON, or a JSON error response on failure
    """
    user_mc = user_mediacloud_client()
    name = request.form['name']
    description = request.form['description']
    solr_seed_query = request.form['solr_seed_query']
    start_date = request.form['start_date']
    end_date = request.form['end_date']
    optional_args = {
        # pass through only when supplied and not the literal string 'null'
        'max_iterations': request.form['max_iterations'] if 'max_iterations' in request.form and request.form['max_iterations'] != 'null' else None,
        # default to the user's profile story limit when not supplied
        'max_stories': request.form['max_stories'] if 'max_stories' in request.form and request.form['max_stories'] != 'null' else flask_login.current_user.profile['limits']['max_topic_stories'],
    }
    try:
        topic_result = user_mc.topicCreate(name=name, description=description,
                                           solr_seed_query=solr_seed_query,
                                           start_date=start_date, end_date=end_date,
                                           media_tags_ids=[COLLECTION_US_TOP_ONLINE],  # HACK: can't save without one of these in place (for now)
                                           **optional_args,
                                           )['topics'][0]
        topics_id = topic_result['topics_id']
        logger.info("Created new topic \"{}\" as {}".format(name, topics_id))
        # if this includes any of the US-centric collections, add the retweet partisanship subtopic by default
        # client will either make a empty snapshot, or a spidering one
        return topic_summary(topics_id)
    except mediacloud.error.MCException as e:
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(e.message, e.status_code)
    except Exception as e:
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(str(e), 500)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    """Download a CSV of a collection's sources filtered by feed-health status.

    source_type selects the filter: 'review', 'remove', 'unscrapeable',
    'working', or anything else for the unfiltered list.
    """
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    list_type = str(source_type).lower()
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _media_list_edit_job.map(media_in_collection)
    if list_type == 'review':
        # active feeds, nothing in the last 90 days, but stories within the year
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                          and m['num_stories_last_year'] > 0]
    elif list_type == 'remove':
        # active feeds, no stories all year, and the latest scrape job failed
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                          and m['num_stories_last_year'] == 0
                          and m['latest_scrape_job.state'] == 'failed']
    elif list_type == 'unscrapeable':
        # no active feeds, yet stories are still arriving
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] == 0 and m['num_stories_90'] > 0]
    elif list_type == 'working':
        # active feeds and stories within the year
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0]
    else:
        filtered_media = media_info_in_collection
    file_prefix = "Collection {} ({}) - sources feed {}".format(collection_id, collection['tag'], source_type)
    properties_to_include = SOURCE_FEED_LIST_CSV_PROPS
    return csv.download_media_csv(filtered_media, file_prefix, properties_to_include)
def cached_entities(user_mediacloud_key, stories_id):
    """Extract named entities from a story's CoreNLP annotations.

    Returns a list of {'type', 'name', 'words', 'frequency'} dicts sorted by
    frequency (descending), or None when the story has not been annotated.

    NOTE(review): the `user_mediacloud_key` parameter shadows the module-level
    helper of the same name and appears to exist only to key the cache — the
    client below is created without it; confirm that is intended.
    """
    user_mc = user_mediacloud_client()
    nlp_results = user_mc.storyCoreNlpList(story_id_list=[stories_id])
    if nlp_results[0]['corenlp'] == "story is not annotated":
        return None
    story_nlp = nlp_results[0]['corenlp']['_']['corenlp']
    # set up for entity counting
    entities = []
    current_entity_words = []  # entities can be split across multiple consecutive words
    current_entity_type = None
    # iterate through the words collecting any named entities
    for sentence in story_nlp['sentences']:
        for token in sentence['tokens']:
            if (len(token['ne']) > 1):
                # token belongs to a (possibly multi-word) named entity
                current_entity_type = token['ne']
                current_entity_words.append(token['word'])
            else:
                # found a non-entity, so check if the preceeding word(s) were an entity and add it to the mix
                if current_entity_type is not None:
                    entities.append({'type': current_entity_type,
                                     'name': " ".join(current_entity_words),
                                     'words': len(current_entity_words)})
                    current_entity_words = []
                    current_entity_type = None
    # turn the lists into counts
    unique_entities = {}
    for entity in entities:
        # dedupe on type+name+word-count; count repeat occurrences as frequency
        unique_key = entity['type'] + entity['name'] + str(entity['words'])
        if unique_key in unique_entities.keys():
            unique_entities[unique_key]['frequency'] += 1
        else:
            unique_entities[unique_key] = entity
            unique_entities[unique_key]['frequency'] = 1
    unique_entities = list(unique_entities.values())
    unique_entities = sorted(unique_entities, key=itemgetter('frequency'), reverse=True)
    return unique_entities
def topic_favorites():
    """Fetch the user's favorite topics and mark each one with isFavorite."""
    user_mc = user_mediacloud_client()
    ids = db.get_users_lists(user_name(), 'favoriteTopics')
    favorites = [user_mc.topic(topic_id) for topic_id in ids]
    for topic in favorites:
        topic['isFavorite'] = True
    # topic['detailInfo'] = get_topic_info_per_snapshot_timespan(topic['topics_id'])
    return jsonify({'topics': favorites})
def sorted_public_topic_list():
    """Return up to 51 public topics sorted case-insensitively by name.

    Needs to support logged in or not.
    """
    client = user_mediacloud_client() if is_user_logged_in() else mc
    topics = client.topicList(public=True, limit=51)['topics']
    return sorted(topics, key=lambda t: t['name'].lower())
def _cached_tag_page(mc_api_key, tag_sets_id, last_tags_id, rows, public_only):
    """Fetch one page of tags in a tag set.

    User agnostic here because the list of tags in a collection only changes
    for users based on public_only.
    """
    client = user_mediacloud_client(mc_api_key)
    return client.tagList(tag_sets_id=tag_sets_id, last_tags_id=last_tags_id,
                          rows=rows, public_only=public_only)
def sorted_public_topic_list():
    """Return all public topics sorted case-insensitively by name (logged in or not)."""
    if is_user_logged_in():
        client = user_mediacloud_client()
    else:
        client = mc
    return sorted(client.topicList(public=True)['topics'], key=lambda t: t['name'].lower())
def topic_favorites():
    """Look up the user's favorite topics and flag each as a favorite."""
    client = user_mediacloud_client()
    favorite_ids = user_db.get_users_lists(user_name(), 'favoriteTopics')
    results = []
    for topics_id in favorite_ids:
        t = client.topic(topics_id)
        t['isFavorite'] = True
        # t['detailInfo'] = get_topic_info_per_snapshot_timespan(t['topics_id'])
        results.append(t)
    return jsonify({'topics': results})
def _cached_top_tags(q, fq, tag_sets_id, sample_size=None):
    """Count top story tags for a query; POSTed so long queries work."""
    client = user_mediacloud_client()
    return client.storyTagCount(q, fq, tag_sets_id=tag_sets_id,
                                limit=sample_size, http_method='POST')
def topic_provider_stories_csv(topics_id):
    """Stream matching topic stories as CSV, dropping link_id so the helper can page."""
    optional_args = _parse_stories_optional_arguments()
    client = user_mediacloud_client()
    topic = client.topic(topics_id)
    optional_args.pop('link_id')  # we do this to make sure this helper can page through the results
    return stream_story_list_csv(user_mediacloud_key(), 'stories', topic, **optional_args)
def topic_create():
    """Create a new topic from posted form fields, seeding it with any posted
    sources/collections; adds the retweet-partisanship subtopics automatically
    for topics seeded with US-centric collections.

    :return: topic summary JSON, or a JSON error response on failure
    """
    user_mc = user_mediacloud_client()
    name = request.form['name']
    description = request.form['description']
    solr_seed_query = request.form['solr_seed_query']
    start_date = request.form['start_date']
    end_date = request.form['end_date']
    optional_args = {
        'is_public': request.form['is_public'] if 'is_public' in request.form else None,
        'is_logogram': request.form['is_logogram'] if 'is_logogram' in request.form else None,
        'ch_monitor_id': request.form['ch_monitor_id'] if len(request.form['ch_monitor_id']) > 0 and request.form['ch_monitor_id'] != 'null' else None,
        'max_iterations': request.form['max_iterations'] if 'max_iterations' in request.form else None,
        # default to the user's profile story limit when not supplied
        'max_stories': request.form['max_stories'] if 'max_stories' in request.form and request.form['max_stories'] != 'null' else flask_login.current_user.profile['max_topic_stories'],
    }
    # parse out any sources and collections to add
    media_ids_to_add = ids_from_comma_separated_str(request.form['sources[]'])
    tag_ids_to_add = ids_from_comma_separated_str(request.form['collections[]'])
    try:
        topic_result = user_mc.topicCreate(name=name, description=description,
                                           solr_seed_query=solr_seed_query,
                                           start_date=start_date, end_date=end_date,
                                           media_ids=media_ids_to_add,
                                           media_tags_ids=tag_ids_to_add,
                                           **optional_args)['topics'][0]
        topics_id = topic_result['topics_id']
        logger.info("Created new topic \"{}\" as {}".format(name, topics_id))
        # if this includes any of the US-centric collections, add the retweet partisanship subtopic by default
        if set(tag_ids_to_add).intersection(US_COLLECTIONS):
            add_retweet_partisanship_to_topic(
                topic_result['topics_id'],
                'Retweet Partisanship',
                'Subtopics driven by our analysis of Twitter followers of Trump and Clinton during the 2016 election season. Each media soure is scored based on the ratio of retweets of their stories in those two groups.'
            )
        # client will either make a empty snapshot, or a spidering one
        return topic_summary(topics_id)
    except mediacloud.error.MCException as e:
        # bug fix: this clause must precede the generic Exception handler;
        # the original listed `except Exception` first, making this unreachable
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(e.message, e.status_code)
    except Exception as e:
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(str(e), 500)
def _cached_tag_coverage_pct(query, tag_sets_id):
    """Fraction of stories matching `query` that carry a tag from the given tag set."""
    client = user_mediacloud_client()
    total_stories = source_story_count(user_mediacloud_key(), query)
    tag_counts = client.storyTagCount(solr_query=query, tag_sets_id=tag_sets_id)
    # sum tagged articles because there are different versions
    tagged_total = sum(info['count'] for info in tag_counts)
    # compute coverage ratio (protect against div by zero)
    if total_stories > 0:
        return float(tagged_total) / float(total_stories)
    return 0
def _cached_split_story_counts(q='*', fq=''):
    """Daily split story counts for an arbitrary query.

    Sources are open to everyone, so no user-specific cache is needed.
    """
    client = user_mediacloud_client()
    return client.storyCount(solr_query=q, solr_filter=fq, split=True, split_period='day')
def _cached_topic_story_count(user_mc_key, topics_id, **kwargs):
    '''
    Internal helper - don't call this; call topic_story_count instead. This needs
    user_mc_key in the function signature to make sure the caching is keyed correctly.
    '''
    client = mc if user_mc_key == TOOL_API_KEY else user_mediacloud_client()
    return client.topicStoryCount(topics_id, **kwargs)
def _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size, query):
    """Count story tags matching the query, excluding tags flagged as bad themes.

    topics_id isn't needed because the timespans_id is embedded in the query
    argument; user_mc_key is in the signature so the cache keys correctly.
    """
    user_mc = user_mediacloud_client()
    # we don't need ot use topics_id here because the timespans_id is in the query argument
    tag_counts = user_mc.storyTagCount(query, tag_sets_id=tag_sets_id)
    # bug fix: the original called tag_counts.remove(t) while iterating tag_counts,
    # which skips the element after each removal; build a filtered list instead
    return [t for t in tag_counts if not is_bad_theme(t['tags_id'])]
def story_info(stories_id):
    """Return one story's details (full text when ?text=true) plus its media source."""
    user_mc = user_mediacloud_client()
    admin_mc = user_admin_mediacloud_client()
    if stories_id in [None, 'NaN']:
        return jsonify({'error': 'bad value'})
    wants_text = 'text' in request.args and request.args['text'] == 'true'
    # fetching story text requires the admin client's permissions
    story = admin_mc.story(stories_id, text=True) if wants_text else user_mc.story(stories_id)
    story["media"] = user_mc.media(story["media_id"])
    return jsonify({'info': story})
def api_topics_preview_story_sample():
    """Preview a random sample of stories matching a prospective topic seed query.

    :return: JSON list of sampled stories
    """
    user_mc = user_mediacloud_client()
    # bug fix: the original had mismatched parentheses — the tags_ids conditional
    # sat outside the closed concatenate_query_for_solr(...) call (a SyntaxError)
    solr_query = concatenate_query_for_solr(
        solr_seed_query=request.form['q'],
        media_ids=ids_from_comma_separated_str(request.form['sources[]'])
        if 'sources[]' in request.form else None,
        tags_ids=ids_from_comma_separated_str(request.form['collections[]'])
        if 'collections[]' in request.form else None)
    fq = concatenate_solr_dates(start_date=request.form['start_date'],
                                end_date=request.form['end_date'])
    num_stories = request.form['rows']
    story_count_result = user_mc.storyList(solr_query=solr_query, solr_filter=fq,
                                           sort=user_mc.SORT_RANDOM, rows=num_stories)
    return jsonify(story_count_result)
def _cached_last_year_split_story_count(q='*'):
    """Daily story counts for the trailing 365 days (through yesterday), with
    missing dates filled in and a total_story_count added.

    Sources are open to everyone, so no user-specific cache is needed.
    """
    client = user_mediacloud_client()
    today = datetime.date.today()
    start_date = today - datetime.timedelta(365)
    end_date = today - datetime.timedelta(1)  # yesterday
    fq = client.publish_date_query(start_date, end_date)
    results = client.storyCount(solr_query=q, solr_filter=fq, split=True, split_period='day')
    results['counts'] = add_missing_dates_to_split_story_counts(results['counts'], start_date, end_date)
    results['total_story_count'] = sum(r['count'] for r in results['counts'])
    return results
def _cached_sentence_list(mc_api_key, q, fq, rows, include_stories=True):
    """Fetch up to `rows` matching sentences, optionally attaching each one's story."""
    # need to get an admin client with the tool key so they have sentence read permissions
    tool_mc = user_admin_mediacloud_client(mc_api_key)
    sentences = tool_mc.sentenceList(q, fq)[:rows]
    story_ids = [str(s['stories_id']) for s in sentences]
    if include_stories and (len(story_ids) > 0):
        # this is the fastest way to get a list of stories by id
        stories = user_mediacloud_client().storyList("stories_id:({})".format(" ".join(story_ids)))
        by_id = {s['stories_id']: s for s in stories}  # quick lookup table by stories_id
        for sentence in sentences:
            sentence['story'] = by_id[sentence['stories_id']]
    return sentences
def _remove_word_source_from_network(ms_name, word_list):
    """Delete the media source matching ms_name from word_list, but only when the
    name matches exactly one source."""
    client = user_mediacloud_client()
    matches = client.mediaList(name_like=ms_name)
    if len(matches) == 0:
        logger.debug('No match for %s.' % ms_name)
    elif len(matches) > 1:
        logger.debug('Multiple matches for Media Source. No action taken.')
    else:
        try:
            del word_list[matches[0]['media_id']]
        except KeyError:
            logger.debug('Media Source not present in list.')
def does_user_have_a_running_topic():
    """Return the user's admin-permission topics that are currently queued or running."""
    client = user_mediacloud_client()
    active_topics = []
    link_id = None
    while True:
        page = client.topicList(link_id=link_id, limit=100)
        active_topics.extend(
            t for t in page['topics']
            if t['state'] in ['running', 'queued'] and t['user_permission'] in ['admin'])
        if 'next' not in page['link_ids']:
            break
        link_id = page['link_ids']['next']
    return jsonify(active_topics)
def create_nyt_theme_focal_set(topics_id):
    """Build a subtopic set from posted NYT theme tags: one boolean-query focus per theme."""
    client = user_mediacloud_client()
    # grab the focalSetName and focalSetDescription and then make one
    fs_name = request.form['focalSetName']
    fs_desc = request.form['focalSetDescription']
    tags = json.loads(request.form['data[]'])
    result = client.topicFocalSetDefinitionCreate(topics_id, fs_name, fs_desc,
                                                  FOCAL_TECHNIQUE_BOOLEAN_QUERY)  # is this right?
    if 'focal_set_definitions_id' not in result:
        return json_error_response('Unable to create the subtopic set')
    # now make the foci in it - one for each country
    for t in tags:
        client = user_mediacloud_client()
        client.topicFocusDefinitionCreate(
            topics_id,
            name=t['label'],
            description="Stories about {}".format(t['label']),
            query="tags_id_stories:{}".format(t['tags_id']),
            focal_set_definitions_id=result['focal_set_definitions_id'])
    return {'success': True}
def api_sources_name_exists():
    '''Check if source with name/url exists already
    :return: boolean indicating if source with this name exists or not (case insensive check)
    '''
    mc = user_mediacloud_client()
    search_str = request.args['searchStr']
    id = int(request.args['id']) if 'id' in request.args else None
    matching_sources = mc.mediaList(name_like=search_str)[:MAX_SOURCES]
    if id:
        # NOTE(review): besides excluding the source being edited (by id), this
        # also excludes every source whose name exactly equals search_str, so an
        # exact-name collision reports nameInUse=False when an id is supplied —
        # compare api_collections_name_exists, which filters only by id; confirm
        # this difference is intended
        matching_source_names = [s['name'].lower().strip() for s in matching_sources
                                 if s['media_id'] != id
                                 and s['name'].strip().lower() != search_str.strip().lower()]
    else:
        matching_source_names = [s['name'].lower().strip() for s in matching_sources]
    name_in_use = search_str.lower() in matching_source_names
    return jsonify({'nameInUse': name_in_use})
def api_collection_details(collection_id):
    """Return collection metadata (and its sources when getSources=true)."""
    include_sources = ('getSources' in request.args) and (request.args['getSources'] == 'true')
    client = user_mediacloud_client()
    info = client.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    if include_sources:
        info['sources'] = media_with_tag(user_mediacloud_key(), collection_id)
    analytics_db.increment_count(analytics_db.TYPE_COLLECTION, collection_id,
                                 analytics_db.ACTION_SOURCE_MGR_VIEW)
    return jsonify({'results': info})
def api_collections_name_exists():
    """Check if a collection with this label already exists (case-insensitive).

    Query args: ``searchStr`` (the label to check) and optional ``id`` (the
    tags_id of the collection being edited, which is excluded from the check).

    :return: JSON ``{'nameInUse': bool}``
    """
    mc = user_mediacloud_client()
    search_str = request.args['searchStr']
    tags_id = int(request.args['id']) if 'id' in request.args else None
    matching_collections = mc.tagList(name_like=search_str)[:MAX_SOURCES]
    if tags_id:
        # exclude the collection being edited so it can keep its own name
        matching_collections_names = [c['label'].lower().strip() for c in matching_collections
                                      if c['tags_id'] != tags_id]
    else:
        matching_collections_names = [c['label'].lower().strip() for c in matching_collections]
    name_in_use = search_str.lower() in matching_collections_names
    return jsonify({'nameInUse': name_in_use})
def story(topics_id, stories_id):
    """Return detail about one story within a topic, as JSON.

    Merges the topic-specific story info with the fields from a regular story
    API call, and resolves geoname tags via CLIFF where possible.

    :return: JSON story info, or ``{'status': 'Error', ...}`` if not logged in
    """
    if not is_user_logged_in():
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    local_mc = user_mediacloud_client()
    story_topic_info = apicache.topic_story_list(user_mediacloud_key(), topics_id, stories_id=stories_id)
    story_topic_info = story_topic_info['stories'][0]
    try:
        # add in other fields from the regular (non-topic) story call
        story_info = local_mc.story(stories_id)
        story_topic_info.update(story_info)
        for tag in story_info['story_tags']:
            if tag['tag_sets_id'] == tag_util.GEO_TAG_SET:
                # tag names appear to be "geonames_<id>" - strip the 9-char prefix
                geonames_id = int(tag['tag'][9:])
                try:
                    tag['geoname'] = _cached_geoname(geonames_id)
                except Exception as e:
                    # query to CLIFF failed :-( handle it gracefully
                    logger.exception(e)
                    tag['geoname'] = {}
    except MCException:
        # the story only exists within the topic; return what we have
        logger.warning("Story {} wasn't found in a regular story API call, but is in topic {}".format(
            stories_id, topics_id
        ))
    return jsonify(story_topic_info)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    """Download a CSV of a collection's sources, filtered by feed health.

    :param source_type: one of 'review', 'remove', 'unscrapeable', 'working';
        any other value returns all sources unfiltered
    :return: streamed CSV response
    """
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    status = str(source_type).lower()  # renamed from `type`, which shadowed the builtin
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _fetch_collection_source_feed_info(media_in_collection)
    if status == 'review':
        # has active feeds and stories in the last year, but nothing in 90 days
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                          and m['num_stories_last_year'] > 0]
    elif status == 'remove':
        # active feeds but no stories at all, and the last scrape failed
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                          and m['num_stories_last_year'] == 0
                          and m['latest_scrape_job.state'] == 'failed']
    elif status == 'unscrapeable':
        # no active feeds yet stories still arrive (e.g. via other ingest paths)
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] == 0 and m['num_stories_90'] > 0]
    elif status == 'working':
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0]
    else:
        filtered_media = media_info_in_collection
    file_prefix = "Collection {} ({}) - sources feed {}".format(collection_id, collection['tag'], source_type)
    properties_to_include = SOURCE_FEED_LIST_CSV_PROPS
    return csv.download_media_csv(filtered_media, file_prefix, properties_to_include)
def _cached_topic_split_story_counts(user_mc_key, topics_id, **kwargs):
    """Internal helper - don't call this; call topic_split_story_counts instead.

    This needs user_mc_key in the function signature to make sure the caching
    is keyed correctly.
    """
    # tool-level requests use the shared client; everyone else gets a user-scoped one
    local_mc = mc if user_mc_key == TOOL_API_KEY else user_mediacloud_client()
    results = local_mc.topicStoryCount(topics_id, split=True, **kwargs)
    # annotate the split results with the overall total
    results['total_story_count'] = sum(c['count'] for c in results['counts'])
    return results
def _cached_collection_source_representation(mc_api_key, collection_id, sample_size=1000, fq=''):
    """Estimate how much of a collection's content each member source contributes.

    Samples ``sample_size`` random stories from the collection and counts how
    many came from each media source.

    :param mc_api_key: in the signature so caching is keyed per-user, because
        only some folks can see private collections
    :param sample_size: number of random stories to sample (default 1000)
    :param fq: optional extra solr filter query applied to the sample
    :return: list of per-source dicts (media_id, media_name, media_url,
        sample_size, stories, story_pct) sorted by ascending story count
    """
    user_mc = user_mediacloud_client(mc_api_key)
    stories = user_mc.storyList('tags_id_media:{}'.format(collection_id), fq,
                                rows=sample_size, sort=mc.SORT_RANDOM)
    media_representation = {}
    for s in stories:
        if s['media_id'] not in media_representation:
            media_representation[s['media_id']] = {
                'media_id': s['media_id'],
                'media_name': s['media_name'],
                'media_url': s['media_url'],
                'sample_size': sample_size,
                'stories': 0
            }
        media_representation[s['media_id']]['stories'] += 1
    # turn raw counts into a fraction of the sample
    for media_id in media_representation:
        media_representation[media_id]['story_pct'] = \
            float(media_representation[media_id]['stories']) / float(sample_size)
    return sorted(list(media_representation.values()), key=operator.itemgetter('stories'))
def create_media_type_focal_set(topics_id):
    """Create a media-type focal set on a topic, with one focus per media type.

    Reads ``focalSetName`` and ``focalSetDescription`` from the POSTed form.

    :return: ``{'success': True}`` on success, or a JSON error response if the
        focal set definition could not be created
    """
    user_mc = user_mediacloud_client()
    # pull the name and description for the new set out of the submitted form
    focal_set_name = request.form['focalSetName']
    focal_set_description = request.form['focalSetDescription']
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    focal_technique = FOCAL_TECHNIQUE_BOOLEAN_QUERY
    new_focal_set = user_mc.topicFocalSetDefinitionCreate(topics_id, focal_set_name,
                                                          focal_set_description, focal_technique)
    if 'focal_set_definitions_id' not in new_focal_set:
        return json_error_response('Unable to create the subtopic set')
    # one focus per media type, each matching stories from sources of that type
    focus_def_results = [
        user_mc.topicFocusDefinitionCreate(
            topics_id,
            name=tag['label'],
            description="Stories from {} sources".format(tag['label']),
            query="tags_id_media:{}".format(tag['tags_id']),
            focal_set_definitions_id=new_focal_set['focal_set_definitions_id'],
        )
        for tag in media_type_tags
    ]
    return {'success': True}
def cached_source_story_count(query):
    """Return the number of stories matching `query`.

    Sources are open to everyone, so no need for a user-specific cache.
    """
    client = user_mediacloud_client()
    count_result = client.storyCount(query)
    return count_result['count']
def _tag_set_info(user_mc_key, tag_sets_id):
    """Fetch one tag set by id.

    NOTE(review): `user_mc_key` is unused in the body - presumably it keeps any
    caching keyed per-user, as in the other cached helpers; confirm against the
    cache decorator.
    """
    client = user_mediacloud_client()
    return client.tagSet(tag_sets_id)
def collection_search(search_str, public_only, tag_sets_id_list):
    """Search collections (tags) by name within the given tag sets."""
    client = user_mediacloud_client()
    matching_collections = client.tagList(tag_sets_id_list, public_only=public_only,
                                          name_like=search_str)
    return matching_collections
def media_search(search_str, tags_id=None, **kwargs):
    """Search media sources by name.

    :param search_str: name substring to match against source names
    :param tags_id: optionally restrict results to sources with this tag
    :param kwargs: passed straight through to ``mediaList`` so callers can add
        extra filters without changing this helper
    :return: list of matching media, up to MAX_SOURCES, sorted by story count
    """
    mc = user_mediacloud_client()
    return mc.mediaList(name_like=search_str, tags_id=tags_id, rows=MAX_SOURCES,
                        sort="num_stories", **kwargs)
def _cached_timeperiod_story_count(q='*', time_period=QUERY_LAST_MONTH):
    """Fetch split story counts for an arbitrary query over a timeframe.

    Sources are open to everyone, so no need for a user-specific cache.
    """
    client = user_mediacloud_client()
    return client.storyCount(solr_query=q, solr_filter=time_period)
def get_topic_media_links_csv(topics_id):
    """Stream a CSV of the media-to-media links within a topic."""
    user_mc = user_mediacloud_client()
    user_mc = user_mediacloud_client()
    topic = user_mc.topic(topics_id)
    # page through results for timespan
    return stream_media_link_list_csv(user_mediacloud_key(), topic['name'] + '-stories', topics_id)