def _collection_source_story_split_historical_counts(collection_id):
    """Fetch per-source story-split counts for every source in a collection.

    :param collection_id: tags_id of the collection whose sources are queried
    :return: list of per-source count results (fully materialized, not a generator)
    """
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster
    # return [_source_story_split_count_job(j) for j in jobs]
    # materialize the generator returned by .map so what is returned is real data;
    # list(...) is the idiomatic form of [d for d in ...]
    return list(_source_story_split_count_job.map(jobs))
def collection_update(collection_id):
    """Update a collection's metadata and reconcile its member sources.

    Reads name/description/flags and a comma-separated 'sources[]' id list from
    the request form, updates the tag itself, then adds/removes source tags so
    the collection matches the submitted source list.

    :param collection_id: tags_id of the collection to update
    :return: JSON of the updated tag
    """
    user_mc = user_admin_mediacloud_client()
    label = '{}'.format(request.form['name'])
    description = request.form['description']
    # optional flags: .get() returns None when the key is absent (idiomatic form
    # of the "x if k in form else None" pattern)
    static = request.form.get('static')
    show_on_stories = request.form.get('showOnStories')
    show_on_media = request.form.get('showOnMedia')
    formatted_name = format_name_from_label(label)
    source_ids = []
    if len(request.form['sources[]']) > 0:
        source_ids = [int(sid) for sid in request.form['sources[]'].split(',')]
    # first update the collection
    updated_collection = user_mc.updateTag(collection_id, formatted_name, label, description,
                                           is_static=(static == 'true'),
                                           show_on_stories=(show_on_stories == 'true'),
                                           show_on_media=(show_on_media == 'true'))
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id'])
                           for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remove = list(set(existing_source_ids) - set(source_ids))
    source_ids_to_add = [sid for sid in source_ids if sid not in existing_source_ids]
    # then go through and tag all the sources specified with the new collection id
    tags_to_add = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD)
                   for sid in source_ids_to_add]
    tags_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE)
                      for sid in source_ids_to_remove]
    tags = tags_to_add + tags_to_remove
    if len(tags) > 0:
        user_mc.tagMedia(tags)
    return jsonify(updated_collection['tag'])
def api_collection_sources_csv(collection_id):
    """Stream a CSV of every source in a collection, using the editable property set."""
    user_mc = user_mediacloud_client()
    # not cached because props can change often
    collection = user_mc.tag(collection_id)
    sources = media_with_tag(user_mediacloud_key(), collection_id)
    prefix = "Collection {} ({}) - sources ".format(collection_id, collection['tag'])
    return csv.download_media_csv(sources, prefix, SOURCE_LIST_CSV_EDIT_PROPS)
def collection_update(collection_id):
    """Update a collection's metadata, reconcile its sources, and invalidate caches.

    Reads name/description/flags and a comma-separated 'sources[]' id list from
    the request form, updates the tag, adds/removes source tags as needed, then
    invalidates the cached source representation for the collection.

    :param collection_id: tags_id of the collection to update
    :return: JSON of the updated tag
    """
    user_mc = user_admin_mediacloud_client()
    label = '{}'.format(request.form['name'])
    description = request.form['description']
    # optional flags: .get() returns None when the key is absent (idiomatic form
    # of the "x if k in form else None" pattern)
    static = request.form.get('static')
    show_on_stories = request.form.get('showOnStories')
    show_on_media = request.form.get('showOnMedia')
    formatted_name = format_name_from_label(label)
    source_ids = []
    if len(request.form['sources[]']) > 0:
        source_ids = [int(sid) for sid in request.form['sources[]'].split(',')]
    # first update the collection
    updated_collection = user_mc.updateTag(collection_id, formatted_name, label, description,
                                           is_static=(static == 'true'),
                                           show_on_stories=(show_on_stories == 'true'),
                                           show_on_media=(show_on_media == 'true'))
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id'])
                           for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remove = list(set(existing_source_ids) - set(source_ids))
    source_ids_to_add = [sid for sid in source_ids if sid not in existing_source_ids]
    # then go through and tag all the sources specified with the new collection id
    tags_to_add = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD)
                   for sid in source_ids_to_add]
    tags_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE)
                      for sid in source_ids_to_remove]
    tags = tags_to_add + tags_to_remove
    if len(tags) > 0:
        user_mc.tagMedia(tags)
    # membership may have changed, so drop the cached source list
    apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(updated_collection['tag'])
def api_collection_sources(collection_id):
    """Return the sources in a collection as JSON, flagged with user favorites."""
    int(collection_id)  # fail fast: raises ValueError for a non-numeric id
    sources = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(sources)
    return jsonify({'tags_id': collection_id, 'sources': sources})
def _cached_media_with_sentence_counts(user_mc_key, tag_sets_id):
    """Sample sentences for a collection and attach per-source counts/percentages.

    :param user_mc_key: API key this (cached) call is scoped to — previously
        ignored in favor of user_mediacloud_key(); now actually used so the
        cache key matches the data fetched
    :param tag_sets_id: tags_id of the collection to sample
    :return: media dicts, each with 'sentence_count' and 'sentence_pct' added
    """
    sample_size = 2000  # kind of arbitrary
    # list all sources first
    sources_by_id = {int(c['media_id']): c for c in media_with_tag(user_mc_key, tag_sets_id)}
    sentences = mc.sentenceList('*', 'tags_id_media:' + str(tag_sets_id),
                                rows=sample_size, sort=mc.SORT_RANDOM)
    # sum the number of sentences per media source
    sentence_counts = {media_id: 0 for media_id in sources_by_id}
    if 'docs' in sentences['response']:
        for sentence in sentences['response']['docs']:
            media_id = sentence['media_id']
            if (media_id is not None) and (int(media_id) in sentence_counts):  # safety check
                # float increment so sentence_pct below is a true ratio
                sentence_counts[int(media_id)] += 1.
    # add in sentence count info to media info
    for media_id, count in sentence_counts.items():
        sources_by_id[media_id]['sentence_count'] = count
        sources_by_id[media_id]['sentence_pct'] = count / sample_size
    return sources_by_id.values()
def api_metadata_download(collection_id):
    """Stream a CSV summarizing metadata-tag coverage across a collection's sources.

    For each metadata tag set seen on any source, counts how many sources carry
    a tag from it ('tagged') and how many do not ('notTagged').
    """
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    # index rows by tag_sets_id: each tag becomes one dict lookup instead of an
    # O(n) scan of the accumulated list (dict preserves first-seen order, so the
    # CSV row order is unchanged)
    items_by_id = {}
    for media_source in all_media:
        for tag in media_source['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                item = items_by_id.get(tag['tag_sets_id'])
                if item is not None:
                    item['tagged'] += 1
                else:
                    items_by_id[tag['tag_sets_id']] = {
                        'metadataCoverage': tag['tag_set'],
                        'metadataId': tag['tag_sets_id'],
                        'tagged': 1,
                    }
    metadata_items = list(items_by_id.values())
    for i in metadata_items:
        i['notTagged'] = len(all_media) - i['tagged']
    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(metadata_items, props, filename)
def remove_sources_from_collection(collection_id):
    """Remove the given sources from a collection, re-asserting the remainder.

    Form param 'sources[]' is a comma-separated list of media ids to remove.
    Returns the tagMedia results as JSON (empty dict if nothing to do).
    """
    source_ids_to_remove = [int(s) for s in request.form['sources[]'].split(',')]
    user_mc = user_admin_mediacloud_client()
    # get the sources in the collection first, then remove and add as needed
    # fix: media_with_tag takes the user key as its first argument (as at every
    # other call site); it was being called with only the collection id
    existing_source_ids = [int(m['media_id'])
                           for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remain = list(set(existing_source_ids) - set(source_ids_to_remove))
    media_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE)
                       for sid in source_ids_to_remove]
    media_to_remain = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD)
                       for sid in source_ids_to_remain]
    # do I need to run similar or TAG_ACTION_REMOVE?
    current_media = media_to_remove + media_to_remain
    results = {}
    if len(current_media) > 0:
        results = user_mc.tagMedia(current_media)
    # membership changed, so drop the cached source list
    apicache.invalidate_collection_source_representation_cache(
        user_mediacloud_key(), collection_id)
    return jsonify(results)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    """Stream a CSV of a collection's sources filtered by feed-health category.

    Known categories: review, remove, unscrapeable, working; anything else
    passes all sources through unfiltered.
    """
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    list_type = str(source_type).lower()
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _media_list_edit_job.map(media_in_collection)
    # one predicate per review category
    category_tests = {
        'review': lambda m: (m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                             and m['num_stories_last_year'] > 0),
        'remove': lambda m: (m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                             and m['num_stories_last_year'] == 0
                             and m['latest_scrape_job.state'] == 'failed'),
        'unscrapeable': lambda m: m['active_feed_count'] == 0 and m['num_stories_90'] > 0,
        'working': lambda m: m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0,
    }
    matches = category_tests.get(list_type)
    if matches is None:
        filtered_media = media_info_in_collection
    else:
        filtered_media = [m for m in media_info_in_collection if matches(m)]
    file_prefix = "Collection {} ({}) - sources feed {}".format(
        collection_id, collection['tag'], source_type)
    return csv.download_media_csv(filtered_media, file_prefix, SOURCE_FEED_LIST_CSV_PROPS)
def _collection_source_story_split_historical_counts(collection_id):
    """Fetch per-source story-split counts for a collection's sources in parallel.

    :param collection_id: tags_id of the collection whose sources are queried
    :return: list of per-source count results
    """
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster; the context manager guarantees the
    # pool's worker processes are terminated even if a worker raises (the old
    # code only called terminate() on the success path)
    with Pool(processes=HISTORICAL_COUNT_POOL_SIZE) as pool:
        results = pool.map(_source_story_split_count_worker, jobs)  # blocks until all done
    return results
def api_collection_sources(collection_id):
    """Return all sources in the collection, flagged with the user's favorites."""
    int(collection_id)  # validation only: non-numeric ids raise ValueError here
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    response = {
        'tags_id': collection_id,
        'sources': media_in_collection,
    }
    return jsonify(response)
def cached_media_tags(tag_sets_id):
    """Return the tags in a tag set, each augmented with its media list and a solr query."""
    partisanship_tags = cached_tags_in_tag_set(tag_sets_id)
    for tag in partisanship_tags:
        media = media_with_tag(user_mediacloud_key(), tag['tags_id'], True)  # cache this list
        # ids as strings so they can be joined into a query clause below
        id_strs = [str(m['media_id']) for m in media]
        tag.update({
            'media': media,
            'media_ids': id_strs,
            'media_query': "media_id:({})".format(" ".join(id_strs)),
        })
    return partisanship_tags
def api_collections_by_ids():
    """Return basic source info for every collection id in the 'coll[]' query arg."""
    sources_list = []
    for tags_id in request.args['coll[]'].split(','):
        all_media = media_with_tag(user_mediacloud_key(), tags_id)
        info = []
        for m in all_media:
            info.append({
                'media_id': m['media_id'],
                'name': m['name'],
                'url': m['url'],
                'public_notes': m['public_notes'],
            })
        add_user_favorite_flag_to_sources(info)
        sources_list += info
    return jsonify({'results': sources_list})
def api_collection_details(collection_id):
    """Return full details for a collection, including all its member sources."""
    user_mc = user_admin_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    sources = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(sources)
    info['media'] = sources
    return jsonify({'results': info})
def api_collection_sources_csv(collection_id):
    """Stream a source-list template CSV for a collection with metadata fields formatted."""
    # NOTE(review): client is unused below — presumably created for its auth-check
    # side effect; retained to preserve behavior
    user_mc = user_admin_mediacloud_client()
    # info = user_mc.tag(int(collection_id))
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    for src in all_media:
        metadata_tags = (t for t in src['media_source_tags']
                         if is_metadata_tag_set(t['tag_sets_id']))
        for tag in metadata_tags:
            format_metadata_fields(src, tag['tag_sets_id'], tag['tag'])
    file_prefix = "Collection_Sourcelist_Template_for_" + collection_id + "_"
    return csv.download_media_csv(all_media, file_prefix, COLLECTIONS_TEMPLATE_PROPS_EDIT)
def api_collection_sources(collection_id):
    """Return a collection's sources; add feed details for media editors on request."""
    want_details = ('details' in request.args) and (request.args['details'] == 'true')
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    if want_details and user_has_auth_role(ROLE_MEDIA_EDIT):
        # only editors get the (slower) per-source feed info
        media_in_collection = fetch_collection_source_feed_info(media_in_collection)
    return jsonify({'tags_id': collection_id, 'sources': media_in_collection})
def api_collection_sources_csv(collection_id):
    """Stream a CSV of a collection's sources with metadata fields formatted for export."""
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)  # not cached because props can change often
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    for src in all_media:
        metadata_tags = (t for t in src['media_source_tags']
                         if is_metadata_tag_set(t['tag_sets_id']))
        for tag in metadata_tags:
            format_metadata_fields(src, tag)
    file_prefix = "Collection {} ({}) - sources ".format(collection_id, collection['tag'])
    return csv.download_media_csv(all_media, file_prefix, COLLECTIONS_TEMPLATE_PROPS_EDIT)
def api_collection_details(collection_id):
    """Return a collection's details (optionally with sources) and record the view."""
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    # sources are only fetched when explicitly requested
    if ('getSources' in request.args) and (request.args['getSources'] == 'true'):
        info['sources'] = media_with_tag(user_mediacloud_key(), collection_id)
    analytics_db.increment_count(analytics_db.TYPE_COLLECTION, collection_id,
                                 analytics_db.ACTION_SOURCE_MGR_VIEW)
    return jsonify({'results': info})
def api_collection_details(collection_id):
    """Return a collection's details, optionally including its member sources."""
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    # sources are only fetched when explicitly requested
    if ('getSources' in request.args) and (request.args['getSources'] == 'true'):
        info['sources'] = media_with_tag(user_mediacloud_key(), collection_id)
    return jsonify({'results': info})
def remove_sources_from_collection(collection_id):
    """Remove the given sources from a collection, re-asserting the remainder.

    Form param 'sources[]' is a comma-separated list of media ids to remove.
    Returns the tagMedia results as JSON (empty dict if nothing to do).
    """
    source_ids_to_remove = [int(s) for s in request.form['sources[]'].split(',')]
    user_mc = user_admin_mediacloud_client()
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id'])
                          for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remain = list(set(existing_source_ids) - set(source_ids_to_remove))
    media_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE)
                       for sid in source_ids_to_remove]
    media_to_remain = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD)
                       for sid in source_ids_to_remain]
    # do I need to run similar or TAG_ACTION_REMOVE?
    current_media = media_to_remove + media_to_remain
    # fix: initialize results so the jsonify below doesn't raise NameError when
    # there is nothing to tag
    results = {}
    if len(current_media) > 0:
        results = user_mc.tagMedia(current_media)
    # membership changed, so drop the cached source list
    apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(results)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    """Stream a CSV of a collection's sources filtered by feed-health category.

    Known categories: review, remove, unscrapeable, working; anything else
    passes all sources through unfiltered.
    """
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    # renamed from `type`, which shadowed the builtin
    list_type = str(source_type).lower()
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _fetch_collection_source_feed_info(media_in_collection)
    if list_type == 'review':
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                          and m['num_stories_last_year'] > 0]
    elif list_type == 'remove':
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_90'] == 0
                          and m['num_stories_last_year'] == 0
                          and m['latest_scrape_job.state'] == 'failed']
    elif list_type == 'unscrapeable':
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] == 0 and m['num_stories_90'] > 0]
    elif list_type == 'working':
        filtered_media = [m for m in media_info_in_collection
                          if m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0]
    else:
        filtered_media = media_info_in_collection
    file_prefix = "Collection {} ({}) - sources feed {}".format(
        collection_id, collection['tag'], source_type)
    properties_to_include = SOURCE_FEED_LIST_CSV_PROPS
    return csv.download_media_csv(filtered_media, file_prefix, properties_to_include)
def _collection_source_sentence_historical_counts(collection_id, start_date_str, end_date_str):
    """Fetch per-source sentence counts over a date range, in parallel.

    :param collection_id: tags_id of the collection whose sources are queried
    :param start_date_str: inclusive start date, "YYYY-MM-DD"
    :param end_date_str: inclusive end date, "YYYY-MM-DD"
    :return: list of per-source count results
    """
    user_mc = user_admin_mediacloud_client()
    start_date = datetime.datetime.strptime(start_date_str, "%Y-%m-%d").date()
    end_date = datetime.datetime.strptime(end_date_str, "%Y-%m-%d").date()
    q = " AND ({})".format(user_mc.publish_date_query(start_date, end_date))
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{
        'media': m,
        'q': q,
        'start_date_str': start_date_str,
        'end_date_str': end_date_str
    } for m in media_list]
    # fetch in parallel to make things faster; the context manager guarantees the
    # pool's worker processes are terminated even if a worker raises (the old
    # code only called terminate() on the success path)
    with Pool(processes=HISTORICAL_COUNT_POOL_SIZE) as pool:
        results = pool.map(_source_sentence_counts_worker, jobs)  # blocks until all done
    return results
def api_metadata_download(collection_id):
    """Stream a CSV of metadata coverage (tagged vs. not) for a collection's sources."""
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    metadata_counts = {}  # from metadata label to coverage info
    for media_source in all_media:
        for metadata_label, info in media_source['metadata'].items():
            # lazily create the row for each label the first time it appears
            entry = metadata_counts.setdefault(
                metadata_label, {'metadataCoverage': metadata_label, 'tagged': 0})
            if info is not None:
                entry['tagged'] += 1
    rows = list(metadata_counts.values())
    total = len(all_media)
    for item_info in rows:
        item_info.update({'notTagged': total - item_info['tagged']})
    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(rows, props, filename,
                               ['metadata category', 'sources with info', 'sources missing info'])
def api_collection_sources(collection_id):
    """Return a collection's sources; for media editors add scrape/feed details on request."""
    add_in_details = ('details' in request.args) and (request.args['details'] == 'true')
    results = {'tags_id': collection_id}
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    if add_in_details and user_has_auth_role(ROLE_MEDIA_EDIT):
        # for editing users, add in last scrape and active feed count (if requested);
        # the context manager guarantees the pool is terminated even if a worker
        # raises (the old code only called terminate() on the success path)
        jobs = [m['media_id'] for m in media_in_collection]
        with Pool(processes=FEED_SCRAPE_JOB_POOL_SIZE) as pool:
            job_results = pool.map(_media_list_edit_worker, jobs)  # blocks until all done
        job_by_media_id = {j['media_id']: j for j in job_results}
        for m in media_in_collection:
            job = job_by_media_id[m['media_id']]
            m['latest_scrape_job'] = job['latest_scrape_job']
            m['active_feed_count'] = job['active_feed_count']
    results['sources'] = media_in_collection
    return jsonify(results)
def _collection_source_story_split_historical_counts(collection_id):
    """Fetch per-source story-split counts for every source in a collection.

    :param collection_id: tags_id of the collection whose sources are queried
    :return: list of per-source count results (fully materialized, not a generator)
    """
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster
    # return [_source_story_split_count_job(j) for j in jobs]
    # fix: execute the generator returned by .map so what is returned is real
    # data, not a lazy iterator (the sibling variant of this function does the
    # same for exactly this reason)
    return list(_source_story_split_count_job.map(jobs))