def _collection_source_story_split_historical_counts(collection_id):
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster
    #return [_source_story_split_count_job(j) for j in jobs]
    # make sure to execute the generator so what is returned is real data
    return [d for d in _source_story_split_count_job.map(jobs)]
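
If `.map` returns a lazy generator (an assumption about the job framework behind `_source_story_split_count_job`), returning it directly would defer the fetches until the caller iterates; the list comprehension above is what actually triggers them. The same point with a plain generator:

def lazy_squares():
    for i in range(3):
        yield i * i

gen = lazy_squares()         # nothing has run yet
data = [x for x in gen]      # iterating forces evaluation: [0, 1, 4]
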
Example #2
def collection_update(collection_id):
    user_mc = user_admin_mediacloud_client()
    label = '{}'.format(request.form['name'])
    description = request.form['description']
    static = request.form.get('static')
    show_on_stories = request.form.get('showOnStories')
    show_on_media = request.form.get('showOnMedia')

    formatted_name = format_name_from_label(label)

    source_ids = []
    if len(request.form['sources[]']) > 0:
        source_ids = [int(sid) for sid in request.form['sources[]'].split(',')]
    # first update the collection
    updated_collection = user_mc.updateTag(collection_id, formatted_name, label, description,
                                           is_static=(static == 'true'),
                                           show_on_stories=(show_on_stories == 'true'),
                                           show_on_media=(show_on_media == 'true'))
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id']) for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remove = list(set(existing_source_ids) - set(source_ids))
    source_ids_to_add = [sid for sid in source_ids if sid not in existing_source_ids]
    # logger.debug(existing_source_ids)
    # logger.debug(source_ids_to_add)
    # logger.debug(source_ids_to_remove)
    # then go through and tag all the sources specified with the new collection id
    tags_to_add = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD) for sid in source_ids_to_add]
    tags_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE) for sid in source_ids_to_remove]
    tags = tags_to_add + tags_to_remove
    if len(tags) > 0:
        user_mc.tagMedia(tags)
    return jsonify(updated_collection['tag'])
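
The add/remove diff above is plain set arithmetic; a minimal sketch with hypothetical ids:

existing_source_ids = [101, 102, 103]
source_ids = [102, 103, 104]  # ids submitted with the form
to_remove = list(set(existing_source_ids) - set(source_ids))            # [101]
to_add = [sid for sid in source_ids if sid not in existing_source_ids]  # [104]
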
def api_collection_sources_csv(collection_id):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)    # not cached because props can change often
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    file_prefix = "Collection {} ({}) - sources ".format(collection_id, collection['tag'])
    properties_to_include = SOURCE_LIST_CSV_EDIT_PROPS
    return csv.download_media_csv(all_media, file_prefix, properties_to_include)
def collection_update(collection_id):
    user_mc = user_admin_mediacloud_client()
    label = '{}'.format(request.form['name'])
    description = request.form['description']
    static = request.form.get('static')
    show_on_stories = request.form.get('showOnStories')
    show_on_media = request.form.get('showOnMedia')

    formatted_name = format_name_from_label(label)

    source_ids = []
    if len(request.form['sources[]']) > 0:
        source_ids = [int(sid) for sid in request.form['sources[]'].split(',')]
    # first update the collection
    updated_collection = user_mc.updateTag(collection_id, formatted_name, label, description,
                                           is_static=(static == 'true'),
                                           show_on_stories=(show_on_stories == 'true'),
                                           show_on_media=(show_on_media == 'true'))
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id']) for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remove = list(set(existing_source_ids) - set(source_ids))
    source_ids_to_add = [sid for sid in source_ids if sid not in existing_source_ids]
    # logger.debug(existing_source_ids)
    # logger.debug(source_ids_to_add)
    # logger.debug(source_ids_to_remove)
    # then go through and tag all the sources specified with the new collection id
    tags_to_add = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD) for sid in source_ids_to_add]
    tags_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE) for sid in source_ids_to_remove]
    tags = tags_to_add + tags_to_remove
    if len(tags) > 0:
        user_mc.tagMedia(tags)
        apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(updated_collection['tag'])
def api_collection_sources(collection_id):
    int(collection_id)  # validation: raises ValueError if the id is not numeric
    results = {'tags_id': collection_id}
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    results['sources'] = media_in_collection
    return jsonify(results)
Example #6
def _cached_media_with_sentence_counts(user_mc_key, tag_sets_id):
    sample_size = 2000  # kind of arbitrary
    # list all sources first
    sources_by_id = {
        int(c['media_id']): c
        for c in media_with_tag(user_mc_key, tag_sets_id)  # use the key passed in, not a fresh request-bound one
    }
    sentences = mc.sentenceList('*',
                                'tags_id_media:' + str(tag_sets_id),
                                rows=sample_size,
                                sort=mc.SORT_RANDOM)
    # sum the number of sentences per media source
    sentence_counts = {media_id: 0 for media_id in sources_by_id.keys()}  # keys are already ints
    if 'docs' in sentences['response']:
        for sentence in sentences['response']['docs']:
            media_id = sentence['media_id']
            if (media_id is not None) and (int(media_id) in sentence_counts):  # safety check
                sentence_counts[int(media_id)] += 1
    # add in sentence count info to media info
    for media_id, count in sentence_counts.items():
        sources_by_id[media_id]['sentence_count'] = count
        sources_by_id[media_id]['sentence_pct'] = count / sample_size
    return sources_by_id.values()
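
Since the sentences are drawn with sort=mc.SORT_RANDOM, sentence_pct is a share-of-sample estimate; with illustrative numbers:

sample_size = 2000
hits = 40                 # sampled sentences attributed to one source
pct = hits / sample_size  # 0.02, i.e. the source is ~2% of the sample
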
Example #7
def api_metadata_download(collection_id):
    all_media = media_with_tag(user_mediacloud_key(), collection_id)

    metadata_items = []
    for media_source in all_media:
        for tag in media_source['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                found = False
                for dictItem in metadata_items:
                    if dictItem['metadataId'] == tag['tag_sets_id']:
                        temp = dictItem['tagged']
                        dictItem.update({'tagged': temp + 1})
                        found = True
                if not found:
                    metadata_items.append({
                        'metadataCoverage': tag['tag_set'],
                        'metadataId': tag['tag_sets_id'],
                        'tagged': 1
                    })

    for i in metadata_items:
        temp = len(all_media) - i['tagged']
        i.update({'notTagged': temp})

    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(metadata_items, props, filename)
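
The inner scan over metadata_items is quadratic in the number of metadata sets; a dict keyed on tag_sets_id yields the same tallies in one pass (a sketch reusing the names above, not a drop-in replacement):

from collections import defaultdict

tagged_counts = defaultdict(int)  # tag_sets_id -> number of sources tagged
for media_source in all_media:
    for tag in media_source['media_source_tags']:
        if is_metadata_tag_set(tag['tag_sets_id']):
            tagged_counts[tag['tag_sets_id']] += 1
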
Example #8
def remove_sources_from_collection(collection_id):
    source_ids_to_remove = request.form['sources[]'].split(',')
    source_ids_to_remove = [int(s) for s in source_ids_to_remove]
    user_mc = user_admin_mediacloud_client()
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [
        int(m['media_id'])
        for m in media_with_tag(user_mediacloud_key(), collection_id)
    ]
    source_ids_to_remain = list(
        set(existing_source_ids) - set(source_ids_to_remove))

    media_to_remove = [
        MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE)
        for sid in source_ids_to_remove
    ]
    media_to_remain = [
        MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD)
        for sid in source_ids_to_remain
    ]  # do I need to run similar or TAG_ACTION_REMOVE?
    current_media = media_to_remove + media_to_remain

    results = {}
    if len(current_media) > 0:
        results = user_mc.tagMedia(current_media)

    apicache.invalidate_collection_source_representation_cache(
        user_mediacloud_key(), collection_id)
    return jsonify(results)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    list_type = str(source_type).lower()
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _media_list_edit_job.map(media_in_collection)
    if list_type == 'review':
        filtered_media = [
            m for m in media_info_in_collection if m['active_feed_count'] > 0
            and m['num_stories_90'] == 0 and m['num_stories_last_year'] > 0
        ]
    elif list_type == 'remove':
        filtered_media = [
            m for m in media_info_in_collection if m['active_feed_count'] > 0
            and m['num_stories_90'] == 0 and m['num_stories_last_year'] == 0
            and m['latest_scrape_job.state'] == 'failed'
        ]
    elif list_type == 'unscrapeable':
        filtered_media = [
            m for m in media_info_in_collection
            if m['active_feed_count'] == 0 and m['num_stories_90'] > 0
        ]
    elif list_type == 'working':
        filtered_media = [
            m for m in media_info_in_collection
            if m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0
        ]
    else:
        filtered_media = media_info_in_collection
    file_prefix = "Collection {} ({}) - sources feed {}".format(
        collection_id, collection['tag'], source_type)
    properties_to_include = SOURCE_FEED_LIST_CSV_PROPS
    return csv.download_media_csv(filtered_media, file_prefix,
                                  properties_to_include)
def _collection_source_story_split_historical_counts(collection_id):
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster
    pool = Pool(processes=HISTORICAL_COUNT_POOL_SIZE)
    results = pool.map(_source_story_split_count_worker, jobs)  # blocks until they are all done
    pool.terminate()  # extra safe garbage collection
    return results
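
The Pool idiom here (and in later examples) is the standard library's multiprocessing; a self-contained sketch:

from multiprocessing import Pool

def _square(n):  # stand-in for a fetch worker
    return n * n

if __name__ == '__main__':
    pool = Pool(processes=4)
    results = pool.map(_square, [1, 2, 3])  # blocks until all jobs finish
    pool.terminate()
    print(results)  # [1, 4, 9]
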
def cached_media_tags(tag_sets_id):
    partisanship_tags = cached_tags_in_tag_set(tag_sets_id)
    for tag in partisanship_tags:
        media = media_with_tag(user_mediacloud_key(), tag['tags_id'], True)  # cache this list
        media_ids = [str(m['media_id']) for m in media] # as strs so we can concat into a query str later with .join call
        tag['media'] = media
        tag['media_ids'] = media_ids
        tag['media_query'] = "media_id:({})".format(" ".join(media_ids))
    return partisanship_tags
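
For instance, with media ids 1, 2, and 3 the generated clause is:

media_ids = ['1', '2', '3']
query = "media_id:({})".format(" ".join(media_ids))  # 'media_id:(1 2 3)'
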
Example #15
def api_collections_by_ids():
    collection_ids = request.args['coll[]'].split(',')
    sources_list = []
    for tags_id in collection_ids:
        all_media = media_with_tag(user_mediacloud_key(), tags_id)
        info = [{'media_id': m['media_id'], 'name': m['name'], 'url': m['url'], 'public_notes': m['public_notes']} for m
                in all_media]
        add_user_favorite_flag_to_sources(info)
        sources_list += info
    return jsonify({'results': sources_list})
Example #17
def api_collection_details(collection_id):
    user_mc = user_admin_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(all_media)
    info['media'] = all_media

    return jsonify({'results': info})
Example #18
def api_collection_sources_csv(collection_id):
    user_mc = user_admin_mediacloud_client()
    # info = user_mc.tag(int(collection_id))
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    for src in all_media:
        for tag in src['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                format_metadata_fields(src, tag['tag_sets_id'], tag['tag'])
    file_prefix = "Collection_Sourcelist_Template_for_" + collection_id + "_"
    what_type_download = COLLECTIONS_TEMPLATE_PROPS_EDIT
    return csv.download_media_csv(all_media, file_prefix, what_type_download)
Example #19
def api_collection_sources(collection_id):
    add_in_details = False
    if ('details' in request.args) and (request.args['details'] == 'true'):
        add_in_details = True
    results = {'tags_id': collection_id}
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    if add_in_details and user_has_auth_role(ROLE_MEDIA_EDIT):
        media_in_collection = fetch_collection_source_feed_info(
            media_in_collection)

    results['sources'] = media_in_collection
    return jsonify(results)
Example #20
def api_collection_sources_csv(collection_id):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(
        collection_id)  # not cached because props can change often
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    for src in all_media:
        for tag in src['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                format_metadata_fields(src, tag)
    file_prefix = "Collection {} ({}) - sources ".format(
        collection_id, collection['tag'])
    properties_to_include = COLLECTIONS_TEMPLATE_PROPS_EDIT
    return csv.download_media_csv(all_media, file_prefix,
                                  properties_to_include)
Example #21
def api_collection_details(collection_id):
    add_in_sources = False
    if ('getSources' in request.args) and (request.args['getSources'] == 'true'):
        add_in_sources = True

    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    if add_in_sources:
        media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
        info['sources'] = media_in_collection
    analytics_db.increment_count(analytics_db.TYPE_COLLECTION, collection_id, analytics_db.ACTION_SOURCE_MGR_VIEW)
    return jsonify({'results': info})
def api_collection_details(collection_id):
    add_in_sources = False
    if ('getSources' in request.args) and (request.args['getSources']
                                           == 'true'):
        add_in_sources = True

    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    if add_in_sources:
        media_in_collection = media_with_tag(user_mediacloud_key(),
                                             collection_id)
        info['sources'] = media_in_collection
    return jsonify({'results': info})
def remove_sources_from_collection(collection_id):
    source_ids_to_remove = request.form['sources[]'].split(',')
    source_ids_to_remove = [int(s) for s in source_ids_to_remove]
    user_mc = user_admin_mediacloud_client()
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id']) for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remain = list(set(existing_source_ids) - set(source_ids_to_remove))

    media_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE) for sid in source_ids_to_remove]
    media_to_remain = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD) for sid in
                       source_ids_to_remain]  # do I need to run similar or TAG_ACTION_REMOVE?
    current_media = media_to_remove + media_to_remain

    results = {}  # ensure results is defined even when there is nothing to tag
    if len(current_media) > 0:
        results = user_mc.tagMedia(current_media)

    apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(results)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    list_type = str(source_type).lower()  # don't shadow the builtin `type`
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _fetch_collection_source_feed_info(media_in_collection)
    if list_type == 'review':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] > 0 and m['num_stories_90'] == 0 and m['num_stories_last_year'] > 0]
    elif list_type == 'remove':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] > 0 and m['num_stories_90'] == 0 and m['num_stories_last_year'] == 0 and m['latest_scrape_job.state'] == 'failed']
    elif list_type == 'unscrapeable':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] == 0 and m['num_stories_90'] > 0]
    elif list_type == 'working':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0]
    else:
        filtered_media = media_info_in_collection
    file_prefix = "Collection {} ({}) - sources feed {}".format(collection_id, collection['tag'], source_type)
    properties_to_include = SOURCE_FEED_LIST_CSV_PROPS
    return csv.download_media_csv(filtered_media, file_prefix, properties_to_include)
Example #26
def _collection_source_sentence_historical_counts(collection_id,
                                                  start_date_str,
                                                  end_date_str):
    user_mc = user_admin_mediacloud_client()
    start_date = datetime.datetime.strptime(start_date_str, "%Y-%m-%d").date()
    end_date = datetime.datetime.strptime(end_date_str, "%Y-%m-%d").date()
    q = " AND ({})".format(user_mc.publish_date_query(start_date, end_date))
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{
        'media': m,
        'q': q,
        'start_date_str': start_date_str,
        'end_date_str': end_date_str
    } for m in media_list]
    # fetch in parallel to make things faster
    pool = Pool(processes=HISTORICAL_COUNT_POOL_SIZE)
    results = pool.map(_source_sentence_counts_worker,
                       jobs)  # blocks until they are all done
    pool.terminate()  # extra safe garbage collection
    return results
Example #27
def api_metadata_download(collection_id):
    all_media = media_with_tag(user_mediacloud_key(), collection_id)

    metadata_counts = {}  # from metadata label to info
    for media_source in all_media:
        for metadata_label, info in media_source['metadata'].items():
            if metadata_label not in metadata_counts:  # lazily populate counts
                metadata_counts[metadata_label] = {
                    'metadataCoverage': metadata_label,
                    'tagged': 0
                }
            if info is not None:
                metadata_counts[metadata_label]['tagged'] += 1

    for item_info in list(metadata_counts.values()):
        temp = len(all_media) - item_info['tagged']
        item_info.update({'notTagged': temp})

    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(list(metadata_counts.values()), props, filename,
                               ['metadata category', 'sources with info', 'sources missing info'])
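
The coverage arithmetic per row, in isolation (hypothetical numbers):

total = 150                  # len(all_media)
tagged = 120                 # sources that have this metadata set filled in
not_tagged = total - tagged  # 30 -> the 'sources missing info' column
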
def api_collection_sources(collection_id):
    add_in_details = False
    if ('details' in request.args) and (request.args['details'] == 'true'):
        add_in_details = True
    results = {'tags_id': collection_id}
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    if add_in_details and user_has_auth_role(ROLE_MEDIA_EDIT):
        # for editing users, add in last scrape and active feed count (if requested)
        pool = Pool(processes=FEED_SCRAPE_JOB_POOL_SIZE)
        jobs = [m['media_id'] for m in media_in_collection]
        job_results = pool.map(_media_list_edit_worker,
                               jobs)  # blocks until they are all done
        job_by_media_id = {j['media_id']: j for j in job_results}
        for m in media_in_collection:
            m['latest_scrape_job'] = job_by_media_id[
                m['media_id']]['latest_scrape_job']
            m['active_feed_count'] = job_by_media_id[
                m['media_id']]['active_feed_count']
        pool.terminate()
    results['sources'] = media_in_collection
    return jsonify(results)
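
The post-pool join above is a plain dict lookup keyed on media_id; a minimal sketch with hypothetical worker results:

job_results = [{'media_id': 1, 'active_feed_count': 3},
               {'media_id': 2, 'active_feed_count': 0}]
job_by_media_id = {j['media_id']: j for j in job_results}
media_in_collection = [{'media_id': 1}, {'media_id': 2}]
for m in media_in_collection:
    m['active_feed_count'] = job_by_media_id[m['media_id']]['active_feed_count']
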
def _collection_source_story_split_historical_counts(collection_id):
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster
    #return [_source_story_split_count_job(j) for j in jobs]
    return _source_story_split_count_job.map(jobs)