def api_explorer_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
        filename = filename  # don't have this info + current_query['q']
        SAMPLE_SEARCHES = load_sample_searches()
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'date': q['startDate'],
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['date', 'query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
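Every example on this page ends with a call to csv.stream_response(rows, props, filename), sometimes with a fourth column_names argument for friendlier headers. The helper itself is never shown here, so the sketch below is a minimal, assumed reconstruction based only on how it is called in these examples (a list of dicts in, props as the column order, a Flask CSV attachment out); it is not the actual Media Cloud implementation.

# Minimal sketch of the assumed csv.stream_response helper; signature inferred from the
# call sites on this page: stream_response(rows, props, filename, column_names=None).
import csv as csv_lib  # stdlib csv; in the examples, "csv" refers to a local util module
import io

from flask import Response


def stream_response(rows, props, filename, column_names=None):
    """Stream a list of dicts as a downloadable CSV, one column per entry in props."""
    def generate():
        def as_csv_line(values):
            buf = io.StringIO()
            csv_lib.writer(buf).writerow(values)
            return buf.getvalue()
        # header row: friendly column_names if provided, otherwise the dict keys themselves
        yield as_csv_line(column_names if column_names is not None else props)
        for row in rows:
            yield as_csv_line([row.get(p, '') for p in props])

    # some callers pass a name already ending in ".csv" and some do not; the real helper
    # presumably normalizes this, here the name is used as-is
    headers = {'Content-Disposition': 'attachment; filename="{}"'.format(filename)}
    return Response(generate(), mimetype='text/csv', headers=headers)

Under that assumption, the story_count_results built above would stream as a five-column CSV with one row per query.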
Example 2
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: u"{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
Example 3
def api_collection_source_representation_csv(collection_id):
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    source_representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id)
    props = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(source_representation, props, filename)
Example 4
def api_download_sources_template():
    filename = "Collection_Template_for_sources.csv"

    what_type_download = SOURCES_TEMPLATE_PROPS_EDIT

    return csv.stream_response(what_type_download, what_type_download,
                               filename)
Example 5
def stream_story_count_csv(fn, search_id_or_query_list):
    '''
    Helper method to stream a list of story counts back to the client as a csv. Accepts either a
    sample search id or a JSON-encoded list of query dicts.
    '''
    # if we have a search id, we load the samples from our sample searches file
    filename = ''
    story_count_results = []
    SAMPLE_SEARCHES = load_sample_searches()
    try:
        search_id = int(search_id_or_query_list)
        if search_id >= 0:
            sample_queries = SAMPLE_SEARCHES[search_id]['queries']

            for query in sample_queries:
                solr_query = prep_simple_solr_query(query)
                storyList = cached_story_count(solr_query)
                query_and_story_count = {'query' : query['label'], 'count' : storyList['count']}
                story_count_results.append(query_and_story_count)

    except Exception as e:
        custom_queries = json.loads(search_id_or_query_list)

        for query in custom_queries:
            solr_query = parse_query_with_keywords(query)
            filename = fn + query['q']

            storyList = cached_story_count(solr_query)
            query_and_story_count = {'query' : query['label'], 'count' : storyList['count']}
            story_count_results.append(query_and_story_count)
    
    props = ['query','count']
    return csv.stream_response(story_count_results, props, filename)
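stream_story_count_csv is a helper rather than a route: it accepts either an integer sample-search id or a JSON-encoded list of query dicts and builds the per-query counts itself. A hypothetical caller is sketched below; the route path, form field name, and app object are assumptions for illustration, not part of the original code.

# Hypothetical caller for stream_story_count_csv; only the helper's two arguments
# (a filename prefix and a search id or JSON query list) come from the code above.
from flask import Flask, request

app = Flask(__name__)


@app.route('/api/explorer/story-count.csv', methods=['POST'])
def story_count_download():
    # the client posts either a sample search id (e.g. "0") or a JSON list of queries
    search_id_or_query_list = request.form['queries']
    return stream_story_count_csv('story-count-', search_id_or_query_list)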
Example 6
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
Example 7
def api_metadata_download(collection_id):
    all_media = media_with_tag(user_mediacloud_key(), collection_id)

    metadata_items = []
    for media_source in all_media:
        for tag in media_source['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                found = False
                for dictItem in metadata_items:
                    if dictItem['metadataId'] == tag['tag_sets_id']:
                        temp = dictItem['tagged']
                        dictItem.update({'tagged': temp + 1})
                        found = True
                if not found:
                    metadata_items.append({
                        'metadataCoverage': tag['tag_set'],
                        'metadataId': tag['tag_sets_id'],
                        'tagged': 1
                    })

    for i in metadata_items:
        temp = len(all_media) - i['tagged']
        i.update({'notTagged': temp})

    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(metadata_items, props, filename)
Example 8
def api_download_sources_template():
    filename = "media cloud collection upload template.csv"

    what_type_download = SOURCE_LIST_CSV_EDIT_PROPS

    return csv.stream_response(what_type_download, what_type_download,
                               filename)
Example 9
def entities_csv(topics_id, type_entity):
    tag_type = CLIFF_PEOPLE if type_entity == 'people' else CLIFF_ORGS
    top_tag_counts = topic_tag_counts(user_mediacloud_key(), topics_id,
                                      tag_type)
    data = process_tags_for_coverage(topics_id, top_tag_counts)
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS,
                               'topic-{}-entities-{}'.format(topics_id, type_entity))
Example 10
def story_words_csv(topics_id, stories_id):
    query = add_to_user_query('stories_id:'+stories_id)
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size, q=query,
                                     num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS)
    return csv.stream_response(word_counts, WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-story-{}-sampled-ngrams-{}-word'.format(topics_id, stories_id, ngram_size))
Example 11
def explorer_story_count_csv():
    filename = u'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(
            solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
Example 12
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
Example 13
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            start_date, end_date = parse_query_dates(q)
            story_counts = pushshift.reddit_submission_normalized_and_split_story_count(query=q['q'],
                                                                                        start_date=start_date,
                                                                                        end_date=end_date,
                                                                                        subreddits=pushshift.NEWS_SUBREDDITS)
        else:
            solr_q, solr_fq = parse_query_with_keywords(q)
            solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                         tags_ids=q['collections'])
            story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
Example 14
def story_tags_csv(stories_id):
    # in the download include all entity types
    admin_mc = user_admin_mediacloud_client()
    if stories_id in [None, 'NaN']:
        return jsonify({'error': 'bad value'})
    story = admin_mc.story(stories_id, text=True)  # Note - this call doesn't pull cliff places
    props = ['tags_id', 'tag', 'tag_sets_id', 'tag_set']
    return csv.stream_response(story['story_tags'], props, 'story-' + str(stories_id) + '-all-tags-and-tag-sets')
Example 15
def collection_source_sentence_counts_csv(collection_id):
    user_mc = user_admin_mediacloud_client()
    info = user_mc.tag(collection_id)
    results = _cached_media_with_sentence_counts(user_mediacloud_key(),
                                                 collection_id)
    props = ['media_id', 'name', 'url', 'sentence_count', 'sentence_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(results, props, filename)
Example 16
def stream_story_samples_csv(filename, stories):
    '''
    Helper method to stream a list of stories back to the client as a csv. Any args you pass in will
    simply be passed on to a call to topicStoryList.
    '''
    props = ['stories_id', 'publish_date', 'title', 'url', 'media_name', 'media_id', 'language']
    return csv.stream_response(stories, props, filename)
Example 17
def story_entities_csv(topics_id, stories_id):
    # in the download include all entity types
    entities = cached_entities(user_mediacloud_key(), stories_id)
    if entities is None:
        # none means not processed by corenlp, but for download just make it empty
        entities = []
    props = ['type', 'name', 'words']
    return csv.stream_response(entities, props, 'story-'+str(stories_id)+'-entities')
Example 18
def topic_provider_words_csv(topics_id):
    optional_args = _parse_words_optional_arguments()
    results = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id,
                                          **optional_args)
    file_name = 'topic-{}-sampled-ngrams-{}-word'.format(
        topics_id, optional_args['ngram_size'])
    return csv.stream_response(results, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               file_name)
Example 19
def _stream_topic_split_story_counts_csv(results, filename):
    clean_results = [{
        'date': trim_solr_date(item['date']),
        'stories': item['count']
    } for item in results['counts']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'stories']
    return csv.stream_response(sorted_results, props, filename)
Example 20
def topic_words_csv(topics_id):
    query = apicache.add_to_user_query(None)
    sample_size = request.args['sample_size'] if 'sample_size' in request.args else WORD_COUNT_SAMPLE_SIZE
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size=ngram_size, q=query,
                                              num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS, sample_size=sample_size)
    return csv.stream_response(word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-sampled-ngrams-{}-word'.format(topics_id, ngram_size))
Example 21
def stream_split_stories_csv(user_mc_key, filename, item_id, which):
    response = {
        'story_splits': apicache.last_year_split_story_count(user_mc_key, [which + ":" + str(item_id)])['counts']
    }
    clean_results = [{'date': trimSolrDate(item['date']), 'numStories': item['count']} for item in response['story_splits']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numStories']
    return csv.stream_response(clean_results, props, filename)
Example 22
def media_words_csv(topics_id, media_id):
    query = apicache.add_to_user_query('media_id:'+media_id)
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size=ngram_size, q=query,
                                              num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS,
                                              sample_size=WORD_COUNT_DOWNLOAD_SAMPLE_SIZE)
    return csv.stream_response(word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-media-{}-sampled-ngrams-{}-word'.format(topics_id, media_id, ngram_size))
Example 23
def api_collection_source_representation_csv(collection_id):
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    source_representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id,
                                                                      sample_size=500,
                                                                      fq="publish_date:[NOW-90DAY TO NOW]")
    props = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(source_representation, props, filename)
Example 24
def stream_wordcount_csv(filename, q, fq, ngram_size=1, sample_size=WORD_COUNT_SAMPLE_SIZE):
    # use bigger values for CSV download
    num_words = WORD_COUNT_DOWNLOAD_NUM_WORDS
    word_counts = query_wordcount(q, fq, ngram_size, num_words, sample_size)
    for w in word_counts:
        w['sample_size'] = sample_size
        w['ratio'] = float(w['count'])/float(sample_size)
    props = ['term', 'stem', 'count', 'sample_size', 'ratio', 'google_w2v_x', 'google_w2v_y']
    return csv.stream_response(word_counts, props, filename)
Example 25
def explorer_geo_csv():
    filename = 'sampled-geographic-coverage'
    data = request.form
    query_object = json.loads(data['q'])
    solr_q, solr_fq = parse_query_with_keywords(query_object)
    filename = file_name_for_download(query_object['label'], filename)
    data = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    props = ['tags_id', 'label', 'count', 'pct']
    return csv.stream_response(data['results'], props, filename)
Example 26
def story_subreddit_shares_csv(stories_id):
    story = mc.story(stories_id)
    submissions_by_sub = pushshift.reddit_url_submissions_by_subreddit(
        story['url'])
    props = ['name', 'value']
    column_names = ['subreddit', 'submissions']
    return csv.stream_response(submissions_by_sub,
                               props,
                               'story-' + str(stories_id) + '-subreddit',
                               column_names=column_names)
Example 27
def stream_sentence_count_csv(user_mc_key, filename, topics_id, **kwargs):
    results = topic_sentence_counts(user_mc_key, topics_id, **kwargs)
    clean_results = [{
        'date': date,
        'numFound': count
    } for date, count in results['split'].items()
                     if date not in ['gap', 'start', 'end']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numFound']
    return csv.stream_response(sorted_results, props, filename)
Example 28
def stream_topic_split_story_counts_csv(user_mc_key, filename, topics_id,
                                        **kwargs):
    results = topic_split_story_counts(user_mc_key, topics_id, **kwargs)
    clean_results = [{
        'date': item['date'],
        'stories': item['count']
    } for item in results['counts']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'stories']
    return csv.stream_response(sorted_results, props, filename)
Example 29
def explorer_entities_csv(tag_sets_id):
    tag_set = base_apicache.tag_set(tag_sets_id)
    filename = 'sampled-{}'.format(tag_set['label'])
    data = request.form
    query_object = json.loads(data['q'])
    solr_q, solr_fq = parse_query_with_keywords(query_object)
    filename = file_name_for_download(query_object['label'], filename)
    top_tag_counts = apicache.top_tags_with_coverage(
        solr_q, solr_fq, tag_sets_id, TAG_COUNT_DOWNLOAD_LENGTH)['results']
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS,
                               filename)
Example 30
def stream_split_stories_csv(user_mc_key, filename, q):
    response = {
        'story_splits': apicache.split_story_count(user_mc_key, q)['counts']
    }
    clean_results = [{
        'date': trim_solr_date(item['date']),
        'numStories': item['count']
    } for item in response['story_splits']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numStories']
    return csv.stream_response(clean_results, props, filename)
Example 31
def story_tags_csv(stories_id):
    # in the download include all entity types
    admin_mc = user_admin_mediacloud_client()
    if stories_id in [None, 'NaN']:
        return jsonify({'error': 'bad value'})
    story = admin_mc.story(
        stories_id, text=True)  # Note - this call doesn't pull cliff places
    props = ['tags_id', 'tag', 'tag_sets_id', 'tag_set']
    return csv.stream_response(
        story['story_tags'], props,
        'story-' + str(stories_id) + '-all-tags-and-tag-sets')
Example 32
def stream_sentence_count_csv(user_mc_key, filename, item_id, which):
    response = {}
    response['sentencecounts'] = cached_recent_sentence_counts(
        user_mc_key, [which + ":" + str(item_id)])
    clean_results = [{
        'date': date,
        'numFound': count
    } for date, count in response['sentencecounts'].items()
                     if date not in ['gap', 'start', 'end']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numFound']
    return csv.stream_response(clean_results, props, filename)
Example 33
def explorer_entities_csv(tag_sets_id):
    tag_set = apicache.tag_set(tag_sets_id)
    filename = 'sampled-{}'.format(tag_set['label'])
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    top_tag_counts = apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id, TAG_COUNT_DOWNLOAD_LENGTH)['results']
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS, filename)
Example 34
def topic_word_associated_words_csv(topics_id, word):
    query = apicache.add_to_user_query(word)
    ngram_size = request.args[
        'ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(),
                                              topics_id,
                                              ngram_size=ngram_size,
                                              q=query)
    return csv.stream_response(
        word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
        'topic-{}-{}-sampled-ngrams-{}-word'.format(topics_id, word,
                                                    ngram_size))
Example 35
def stream_split_stories_csv(user_mc_key, filename, item_id, which):
    response = {
        'story_splits':
        apicache.last_year_split_story_count(
            user_mc_key, [which + ":" + str(item_id)])['counts']
    }
    clean_results = [{
        'date': trimSolrDate(item['date']),
        'numStories': item['count']
    } for item in response['story_splits']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numStories']
    return csv.stream_response(clean_results, props, filename)
Example 36
def collection_source_story_split_historical_counts_csv(collection_id):
    results = _collection_source_story_split_historical_counts(collection_id)
    date_cols = None
    # TODO verify this
    for source in results:
        if date_cols is None:
            date_cols = sorted([s['date'] for s in source['splits_over_time']])
        for day in source['splits_over_time']:
            source[day['date']] = day['count']
        del source['splits_over_time']
    props = ['media_id', 'media_name', 'media_url', 'total_stories', 'splits_over_time'] + date_cols
    filename = "{} - source content count".format(collection_id)
    return csv.stream_response(results, props, filename)
Example 37
def explorer_geo_csv():
    filename = u'sampled-geographic-coverage'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    data = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    data['results'] = _filter_for_countries(data['results'])
    props = ['label', 'count', 'pct', 'alpha3', 'iso-a2', 'geonamesId', 'tags_id', 'tag']
    return csv.stream_response(data['results'], props, filename)
Example 38
def stream_wordcount_csv(filename, q, fq, ngram_size=1):
    # use bigger values for CSV download
    num_words = WORD_COUNT_DOWNLOAD_LENGTH
    sample_size = WORD_COUNT_SAMPLE_SIZE
    word_counts = query_wordcount(q, fq, ngram_size, num_words, sample_size)
    for w in word_counts:
        w['sample_size'] = sample_size
        w['ratio'] = float(w['count']) / float(sample_size)
    props = [
        'term', 'stem', 'count', 'sample_size', 'ratio', 'google_w2v_x',
        'google_w2v_y'
    ]
    return csv.stream_response(word_counts, props, filename)
Example 39
def collection_source_story_split_historical_counts_csv(collection_id):
    results = _collection_source_story_split_historical_counts(collection_id)
    date_cols = None
    # TODO verify this
    for source in results:
        if date_cols is None:
            date_cols = sorted([s['date'] for s in source['splits_over_time']])
        for day in source['splits_over_time']:
            source[day['date']] = day['count']
        del source['splits_over_time']
    props = [
        'media_id', 'media_name', 'media_url', 'total_stories',
        'splits_over_time'
    ] + date_cols
    filename = "{} - source content count".format(collection_id)
    return csv.stream_response(results, props, filename)
Example 40
def stream_geo_csv(fn, search_id_or_query, index):
    filename = ''

    # TODO: there is duplicate code here...
    SAMPLE_SEARCHES = load_sample_searches()
    try:
        search_id = int(search_id_or_query)
        if search_id >= 0:
            current_search = SAMPLE_SEARCHES[search_id]['queries']
            solr_query = parse_query_with_args_and_sample_search(
                search_id, current_search)

            if int(index) < len(current_search):
                start_date = current_search[int(index)]['startDate']
                end_date = current_search[int(index)]['endDate']
                filename = fn + current_search[int(index)]['q']
    except Exception as e:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        solr_query = parse_query_with_keywords(current_query)
        filename = fn + current_query['q']

    res = cached_geotags(solr_query)
    res = [
        r for r in res
        if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3.keys()
    ]
    for r in res:
        geonamesId = int(r['tag'].split('_')[1])
        if geonamesId not in COUNTRY_GEONAMES_ID_TO_APLHA3.keys(
        ):  # only include countries
            continue
        r['geonamesId'] = geonamesId
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonamesId]
        r['count'] = (
            float(r['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        )  # WTF: why is the API returning this as a string and not a number?
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
                r['value'] = r['count']

    props = ['label', 'count']
    return csv.stream_response(res, props, filename)
Example 41
def api_metadata_download(collection_id):
    all_media = media_with_tag(user_mediacloud_key(), collection_id)

    metadata_counts = {}  # from tag_sets_id to info
    for media_source in all_media:
        for metadata_label, info in media_source['metadata'].items():
            if metadata_label not in metadata_counts:  # lazily populate counts
                metadata_counts[metadata_label] = {
                    'metadataCoverage': metadata_label,
                    'tagged': 0
                }
            if info is not None:
                metadata_counts[metadata_label]['tagged'] += 1

    for item_info in list(metadata_counts.values()):
        temp = len(all_media) - item_info['tagged']
        item_info.update({'notTagged': temp})

    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(list(metadata_counts.values()), props, filename,
                               ['metadata category', 'sources with info', 'sources missing info'])
Example 42
def api_download_sources_template():
    filename = "media cloud collection upload template.csv"

    what_type_download = SOURCE_LIST_CSV_EDIT_PROPS

    return csv.stream_response(what_type_download, what_type_download, filename)
Example 43
def topic_geo_tag_counts_csv(topics_id):
    tags = _geo_tag_counts(user_mediacloud_key(), topics_id)
    data = process_tags_for_coverage(topics_id, tags)
    return stream_response(tags, ['tags_id', 'tag', 'label', 'count', 'pct'], "topic-{}-geo-tag-counts".format(topics_id))
Example 44
def stream_geo_csv(user_mc_key, filename, item_id, which):
    info = {}
    info = cached_geotag_count(user_mc_key, which+":"+str(item_id))
    props = ['label', 'count']
    return csv.stream_response(info, props, filename)
Example 45
def topic_nyt_tag_counts_csv(topics_id):
    tags = _nyt_tag_counts(user_mediacloud_key(), topics_id)
    return stream_response(tags['entities'], ['tags_id', 'tag', 'label', 'count', 'pct'], "topic-{}-nyt-label-counts".format(topics_id))
Example 46
def stream_feed_csv(filename, media_id):
    response = cached_feed(media_id)
    props = ['name', 'type', 'url']
    return csv.stream_response(response, props, filename)
Example 47
def story_entities_csv(stories_id):
    # in the download include all entity types
    entities = entities_from_mc_or_cliff(stories_id)
    props = ['type', 'name', 'frequency']
    return csv.stream_response(entities, props, 'story-'+str(stories_id)+'-entities')
Example 48
def story_nyt_themes_csv(stories_id):
    results = nyt_themes_from_mc_or_labeller(stories_id)
    themes = results['descriptors600']
    props = ['label', 'score']
    return csv.stream_response(themes, props, 'story-'+str(stories_id)+'-nyt-themes')
Example 49
def entities_csv(topics_id, type_entity):
    tag_type = CLIFF_PEOPLE if type_entity == 'people' else CLIFF_ORGS
    top_tag_counts = topic_tag_counts(user_mediacloud_key(), topics_id, tag_type)
    data = process_tags_for_coverage(topics_id, top_tag_counts)
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS,
                               'topic-{}-entities-{}'.format(topics_id, type_entity))
Example 50
def stream_topic_split_story_counts_csv(user_mc_key, filename, topics_id, **kwargs):
    results = apicache.topic_split_story_counts(user_mc_key, topics_id, **kwargs)
    clean_results = [{'date': trimSolrDate(item['date']), 'stories': item['count']} for item in results['counts']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'stories']
    return csv.stream_response(sorted_results, props, filename)