コード例 #1
0
def topic_focal_set_sentences_compare(topics_id, focal_sets_id):
    """Return one focal set with sentence counts attached to each of its foci.

    The snapshot/timespan/focus filters are read from the request arguments.
    The selected timespan is first resolved to an entry of the snapshot's
    timespan list fetched without a focus id (the "base" timespan). Then, for
    every focus in the requested focal set, the timespan matching that base
    timespan is located and the result of ``topic_sentence_counts`` is stored
    under ``focus['sentence_counts']``.

    :param topics_id: id of the topic being queried
    :param focal_sets_id: id of the focal set whose foci are compared
    :return: ``jsonify`` of the focal set, or a ``json_error_response`` when
        the base timespan, the focal set, or a per-focus timespan is missing
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    all_focal_sets = topic_focal_sets(user_mediacloud_key(), topics_id,
                                      snapshots_id)
    # need the timespan info, to find the appropriate timespan with each focus
    base_snapshot_timespans = cached_topic_timespan_list(
        user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    # Bind the name before branching so that a failed search in either branch
    # reaches the "couldn't find" response below instead of raising
    # UnboundLocalError.
    base_timespan = None
    if foci_id is not None:
        # a focus is selected, so map its timespan back onto the base list
        timespan = topic_timespan(topics_id, snapshots_id, foci_id,
                                  timespans_id)
        for t in base_snapshot_timespans:
            if timespans_match(timespan, t):
                base_timespan = t
    else:
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
                logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # iterate through to find the focal set of interest
    focal_set = None
    for fs in all_focal_sets:
        if int(fs['focal_sets_id']) == int(focal_sets_id):
            focal_set = fs
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the sentence counts for each focus
    for focus in focal_set['foci']:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(
            user_mediacloud_key(),
            topics_id,
            snapshots_id=snapshots_id,
            foci_id=focus['foci_id'])
        timespan = None
        for t in snapshot_timespans:
            if timespans_match(t, base_timespan):
                timespan = t
                logger.info('matching in focus %s, timespan = %s',
                            focus['foci_id'], t['timespans_id'])
        if timespan is None:
            # focus is a dict (it is subscripted everywhere else), so the name
            # has to be read by key rather than by attribute
            return json_error_response(
                "Couldn't find a matching timespan in the {} focus".format(
                    focus['name']))
        data = topic_sentence_counts(user_mediacloud_key(),
                                     topics_id,
                                     snapshots_id=snapshots_id,
                                     timespans_id=timespan['timespans_id'],
                                     foci_id=focus['foci_id'])
        focus['sentence_counts'] = data
    return jsonify(focal_set)
コード例 #2
0
def story_counts_by_snapshot(topics_id):
    """Return total, spidered and seeded story counts for each snapshot of a topic.

    For every snapshot returned by ``topicSnapshotList``:

    * ``total`` is the ``story_count`` of the first entry in the snapshot's
      timespan list (requested with ``foci_id=None``),
    * ``spidered`` is the ``count`` of a story query on that same timespan
      restricted to the ``TAG_SPIDERED_STORY`` tag,
    * ``seeded`` is ``total - spidered``.

    Either number falls back to 0 when the lookup raises ``MCException`` or
    when the snapshot has no timespans at all.

    :param topics_id: id of the topic
    :return: ``jsonify`` of a dict keyed by snapshots_id
    """
    user_mc = user_mediacloud_client(user_mediacloud_key())
    snapshots = user_mc.topicSnapshotList(topics_id)
    counts = {}
    for s in snapshots:
        # get the count of stories in the overall timespan for this snapshot
        timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=s['snapshots_id'], foci_id=None)
        try:
            total = timespans[0]['story_count']
        except (mediacloud.error.MCException, IndexError):
            # IndexError: the snapshot has no timespans (ie. it failed to generate correctly)
            total = 0
        # search by tag to find out how many stories were spidered
        try:
            spidered = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                                  snapshots_id=s['snapshots_id'], foci_id=None,
                                                  timespans_id=timespans[0]['timespans_id'],
                                                  q="* AND tags_id_stories:{}".format(TAG_SPIDERED_STORY))['count']
        except (mediacloud.error.MCException, IndexError):
            # this handler owns the spidered count; it previously reset
            # `total` by copy-paste mistake
            spidered = 0
        seeded = total - spidered
        counts[s['snapshots_id']] = {'total': total, 'spidered': spidered, 'seeded': seeded}
    return jsonify(counts)
コード例 #3
0
ファイル: words.py プロジェクト: Fa67/MediaCloud-Web-Tools
def topic_w2v_timespan_embeddings(topics_id):
    """Return word2vec embeddings for every timespan of the filtered topic.

    The top 50 words of the topic (no timespan restriction) supply the overall
    word list and their ``google_w2v_x`` / ``google_w2v_y`` coordinates. One
    job description is then assembled per timespan and the whole batch is
    handed to ``_get_all_timespan_embeddings``.

    :param topics_id: id of the topic
    :return: ``jsonify`` of ``{'list': <result of _get_all_timespan_embeddings>}``
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)

    # Overall word counts: deliberately queried with timespans_id=None
    top_word_counts = apicache.topic_word_counts(
        user_mediacloud_key(),
        topics_id,
        num_words=50,
        snapshots_id=snapshots_id,
        timespans_id=None,
        foci_id=foci_id,
        q=q,
    )
    overall_words = [entry['term'] for entry in top_word_counts]
    overall_embeddings = {}
    for entry in top_word_counts:
        overall_embeddings[entry['term']] = (entry['google_w2v_x'],
                                             entry['google_w2v_y'])

    # Every timespan of the snapshot/focus becomes one job
    topic_timespans = apicache.cached_topic_timespan_list(
        user_mediacloud_key(), topics_id, snapshots_id, foci_id)

    jobs = []
    for span in topic_timespans:
        job = {
            'api_key': user_mediacloud_key(),
            'topics_id': topics_id,
            'snapshots_id': snapshots_id,
            'foci_id': foci_id,
            'overall_words': overall_words,
            'overall_embeddings': overall_embeddings,
            'q': q,
            'timespan': span,
        }
        jobs.append(job)

    return jsonify({'list': _get_all_timespan_embeddings(jobs)})
コード例 #4
0
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """Return the top country tags of a topic with their share of all stories.

    Tag counts are fetched for the timespan whose ``period`` is ``"overall"``,
    filtered down to tags whose numeric id (the part after the first ``_`` in
    ``tag``) is a key of ``COUNTRY_GEONAMES_ID_TO_APLHA3``, and truncated to
    ``num_countries`` entries.

    :param topics_id: id of the topic
    :param num_countries: maximum number of countries to return
    :return: list of dicts with ``label``, ``geo_tag``, ``tags_id``, ``count``
        and ``pct`` (tag count divided by the topic's total story count)
    :raises StopIteration: if the topic has no ``"overall"`` timespan
    """
    tag_country_counts = []

    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']

    # get the top countries by the story tag counts with overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, GEO_TAG_SET, GEO_SAMPLE_SIZE, timespan_query)

    # make sure the geo tag is in the geo_tags whitelist (is a country);
    # test membership on the mapping itself instead of rebuilding and
    # linearly scanning a list of its keys for every tag
    country_tag_counts = [r for r in top_geo_tags if
                          int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3]
    country_tag_counts = country_tag_counts[:num_countries]

    # for each country, set up the requisite info for UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # story_tag_count / total story per topic count
        })
    return tag_country_counts
コード例 #5
0
def topic_w2v_timespan_embeddings(topics_id):
    """Return word2vec embeddings for every timespan of the filtered topic.

    The snapshot, focus and query filters come from the ``snapshotId``,
    ``focusId`` and ``q`` request arguments. The top 50 words of the topic
    supply the overall word list and their ``google_w2v_x`` / ``google_w2v_y``
    coordinates; ``grab_timespan_embeddings`` is then mapped over the
    timespans in a process pool.

    :param topics_id: id of the topic
    :return: ``jsonify`` of ``{'list': <one result per timespan>}``
    """
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }

    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(),
                                            topics_id,
                                            num_words=50,
                                            **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {
        x['term']: (x['google_w2v_x'], x['google_w2v_y'])
        for x in overall_word_counts
    }

    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                           args['snapshots_id'],
                                           args['foci_id'])

    # Retrieve embeddings for each timespan
    p = Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES)
    try:
        func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id,
                       args, overall_words, overall_embeddings)
        ts_embeddings = p.map(func, timespans)
    finally:
        # always shut the workers down; otherwise every call leaves a set of
        # pool processes behind
        p.close()
        p.join()

    return jsonify({'list': ts_embeddings})
コード例 #6
0
def get_top_themes_by_story_tag_counts(topics_id, num_themes):
    """Return the top theme tags of a topic with their share of all stories.

    Tag counts for the ``NYT_LABELS_TAG_SET_ID`` tag set are fetched for the
    timespan whose ``period`` is ``"overall"`` and truncated to ``num_themes``
    entries.

    :param topics_id: id of the topic
    :param num_themes: maximum number of themes to return
    :return: list of dicts with ``label``, ``geo_tag``, ``tags_id``, ``count``
        and ``pct`` (tag count divided by the topic's total story count)
    :raises StopIteration: if the topic has no ``"overall"`` timespan
    """
    # fetch the key once and reuse it for every API call below
    user_mc_key = user_mediacloud_key()
    nyt_counts = []

    # get overall timespan
    timespans = cached_topic_timespan_list(user_mc_key, topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])

    # get the top themes by the story counts with overall timespan
    top_nyt_tags = _cached_topic_tag_counts(user_mc_key, topics_id, NYT_LABELS_TAG_SET_ID,
                                            TAG_COUNT_SAMPLE_SIZE, timespan_query)
    # get the total stories for a topic
    total_stories = topic_story_count(user_mc_key, topics_id)['count']

    top_nyt_tags = top_nyt_tags[:num_themes]
    # for each theme, set up the requisite info for UI
    for tag in top_nyt_tags:
        nyt_counts.append({
            'label': tag['label'],
            # key is named 'geo_tag' although these are theme tags; left
            # unchanged because it is part of the returned structure
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # story_tag_count / total story per topic count
        })

    return nyt_counts
コード例 #7
0
ファイル: words.py プロジェクト: c4fcm/MediaMeter-TopicMapper
def _find_overall_timespan(topics_id, snapshots_id):
    """Return the first timespan of the snapshot whose period is 'overall'.

    :param topics_id: id of the topic
    :param snapshots_id: id of the snapshot whose timespans are searched
    :raises RuntimeError: if no timespan in the snapshot has that period
    """
    snapshot_timespans = apicache.cached_topic_timespan_list(
        user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    # lazily stop at the first hit, falling back to None when there is none
    overall = next((ts for ts in snapshot_timespans if ts['period'] == 'overall'), None)
    if overall is None:
        raise RuntimeError('Missing overall timespan in snapshot {} (topic {})!'.format(snapshots_id, topics_id))
    return overall
コード例 #8
0
ファイル: words.py プロジェクト: c4fcm/MediaMeter-TopicMapper
def topic_compare_subtopic_top_words(topics_id):
    """Stream a CSV comparing the top words of every focus in a focal set.

    The focal set is chosen by the ``focal_sets_id`` request argument and the
    number of rows by the optional ``word_count`` argument (default 20). Each
    column is one focus; each cell is ``"term (count)"`` for the word at that
    rank, or empty when the focus has fewer words than requested.

    :param topics_id: id of the topic
    :return: a ``csv.stream_response``, or a ``json_error_response`` when
        looking up the focal set raises ``ValueError``
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    # request argument values are strings; range() below needs an int
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx, timespan in enumerate(timespans):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespan['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(word_count):
        row = {}
        for f in focal_set['foci']:
            words = f['top_words']
            if idx < len(words):
                row[f['name']] = "{} ({})".format(words[idx]['term'], words[idx]['count'])
            else:
                # this focus ran out of words; leave the cell blank rather
                # than failing the whole download with an IndexError
                row[f['name']] = ''
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
コード例 #9
0
def topic_compare_subtopic_top_words(topics_id):
    """Stream a CSV comparing the top words of every focus in a focal set.

    The focal set is chosen by the ``focal_sets_id`` request argument and the
    number of rows by the optional ``word_count`` argument (default 20). Each
    column is one focus; each cell is ``"term (count)"`` for the word at that
    rank, or empty when the focus has fewer words than requested.

    :param topics_id: id of the topic
    :return: a ``csv.stream_response`` with one column per focus
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    # request argument values are strings; range() below needs an int
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx, timespan in enumerate(timespans):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespan['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(word_count):
        row = {}
        for f in focal_set['foci']:
            words = f['top_words']
            if idx < len(words):
                row[f['name']] = u"{} ({})".format(words[idx]['term'], words[idx]['count'])
            else:
                # this focus ran out of words; leave the cell blank rather
                # than failing the whole download with an IndexError
                row[f['name']] = u''
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
コード例 #10
0
def _find_overall_timespan(topics_id, snapshots_id):
    """Look up the 'overall' timespan of a snapshot.

    Walks the snapshot's timespan list in order and hands back the first
    entry whose ``period`` equals ``'overall'``.

    :param topics_id: id of the topic
    :param snapshots_id: id of the snapshot to search
    :raises RuntimeError: when the list contains no such entry
    """
    candidates = apicache.cached_topic_timespan_list(user_mediacloud_key(),
                                                     topics_id,
                                                     snapshots_id=snapshots_id)
    for candidate in candidates:
        if candidate['period'] != 'overall':
            continue
        return candidate
    message = 'Missing overall timespan in snapshot {} (topic {})!'.format(
        snapshots_id, topics_id)
    raise RuntimeError(message)
コード例 #11
0
def base_snapshot_timespan(topics_id):
    """Return the base-snapshot timespan that matches the selected timespan.

    The filters are read from the request arguments. The timespan list is
    requested with ``foci_id=None`` and the first entry for which
    ``apicache.is_timespans_match`` accepts the selected timespan is returned.

    :param topics_id: id of the topic
    :raises ValueError: if no entry of the base list matches
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    candidates = apicache.cached_topic_timespan_list(
        user_mediacloud_key(), topics_id, snapshots_id=snapshots_id, foci_id=None)
    # the timespan the user currently has selected
    selected = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
    match = next(
        (candidate for candidate in candidates
         if apicache.is_timespans_match(selected, candidate)),
        None)
    if match is None:
        raise ValueError("Can't find a timespan in the base snapshot matching the one specified")
    return match
コード例 #12
0
def topic_timespan_list(topics_id, snapshots_id):
    """Return the timespans of a snapshot, annotated with their focal set.

    The focus id comes from the request arguments; the snapshot id comes from
    the ``snapshots_id`` parameter (the one in the request is ignored). Each
    timespan whose ``foci_id`` equals that of a focus in one of the snapshot's
    focal sets gets ``focal_set`` and ``focus`` entries, so the client can
    decide what to show based on the kind of focal set a timespan belongs to.

    :param topics_id: id of the topic
    :param snapshots_id: id of the snapshot
    :return: ``jsonify`` of ``{'list': <timespans>}``
    """
    _unused_snapshots_id, _timespans_id, foci_id, _q = filters_from_args(request.args)
    timespan_entries = apicache.cached_topic_timespan_list(topics_id, snapshots_id, foci_id)
    all_focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    for entry in timespan_entries:
        for focal_set in all_focal_sets:
            # first focus of this focal set that carries the timespan's foci_id
            matched_focus = next(
                (focus for focus in focal_set['foci']
                 if focus['foci_id'] == entry['foci_id']),
                None)
            if matched_focus is not None:
                entry['focal_set'] = focal_set
                entry['focus'] = matched_focus
    return jsonify({'list': timespan_entries})
コード例 #13
0
ファイル: topic.py プロジェクト: c4fcm/MediaMeter-TopicMapper
def get_topic_info_per_snapshot_timespan(topic_id):
    """Return a snapshot of the topic together with an 'overall' timespan.

    Uses the admin client for logged-in users and the shared ``mc`` client
    otherwise. Every snapshot that is searchable and in the ``"completed"``
    state replaces the previously remembered one, and any ``"overall"``
    timespan found in it replaces the previously remembered timespan.

    :param topic_id: id of the topic
    :return: dict with ``snapshot`` and ``timespan`` keys; each is an empty
        dict when nothing qualified
    """
    local_mc = user_admin_mediacloud_client() if is_user_logged_in() else mc
    snapshot_list = local_mc.topicSnapshotList(topic_id)
    chosen_snapshot = {}
    chosen_timespan = {}
    for snapshot in snapshot_list:
        if snapshot['searchable'] != 1 or snapshot['state'] != "completed":
            continue
        chosen_snapshot = snapshot
        snapshot_timespans = cached_topic_timespan_list(
            user_mediacloud_key(), topic_id, chosen_snapshot['snapshots_id'])
        overall_spans = [ts for ts in snapshot_timespans if ts['period'] == "overall"]
        if overall_spans:
            # the last matching timespan wins, as with sequential reassignment
            chosen_timespan = overall_spans[-1]

    return {'snapshot': chosen_snapshot, 'timespan': chosen_timespan}
コード例 #14
0
ファイル: topic.py プロジェクト: c4fcm/MediaMeter-TopicMapper
def topic_w2v_timespan_embeddings(topics_id):
    """Return word2vec embeddings for every timespan of the filtered topic.

    The snapshot, focus and query filters come from the ``snapshotId``,
    ``focusId`` and ``q`` request arguments. The top 50 words of the topic
    supply the overall word list and their ``google_w2v_x`` / ``google_w2v_y``
    coordinates; ``grab_timespan_embeddings`` is then mapped over the
    timespans in a process pool.

    :param topics_id: id of the topic
    :return: ``jsonify`` of ``{'list': <one result per timespan>}``
    """
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }

    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(), topics_id, num_words=50, **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y']) for x in overall_word_counts}

    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id, args['snapshots_id'], args['foci_id'])

    # Retrieve embeddings for each timespan
    p = Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES)
    try:
        func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id, args, overall_words,
                       overall_embeddings)
        ts_embeddings = p.map(func, timespans)
    finally:
        # always shut the workers down; otherwise every call leaves a set of
        # pool processes behind
        p.close()
        p.join()

    return jsonify({'list': ts_embeddings})
コード例 #15
0
def get_topic_info_per_snapshot_timespan(topic_id):
    """Return a snapshot of the topic together with an 'overall' timespan.

    Scans the topic's snapshots through the admin client. Every snapshot that
    is searchable and in the ``"completed"`` state replaces the previously
    remembered one, and any ``"overall"`` timespan found in it replaces the
    previously remembered timespan.

    :param topic_id: id of the topic
    :return: dict with ``snapshot`` and ``timespan`` keys; each is an empty
        dict when nothing qualified
    """
    admin_client = user_admin_mediacloud_client()
    result = {'snapshot': {}, 'timespan': {}}
    for candidate in admin_client.topicSnapshotList(topic_id):
        is_usable = candidate['searchable'] == 1 and candidate['state'] == "completed"
        if not is_usable:
            continue
        result['snapshot'] = candidate
        candidate_timespans = cached_topic_timespan_list(
            user_mediacloud_key(), topic_id, candidate['snapshots_id'])
        for span in candidate_timespans:
            if span['period'] == "overall":
                result['timespan'] = span
    return result
コード例 #16
0
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """Return the top country tags of a topic with their share of all stories.

    Tag counts are fetched for the timespan whose ``period`` is ``"overall"``,
    kept only when the numeric id after the first ``_`` in ``tag`` is a key of
    ``COUNTRY_GEONAMES_ID_TO_APLHA3``, and cut to ``num_countries`` entries.

    :param topics_id: id of the topic
    :param num_countries: maximum number of countries to return
    :return: list of dicts with ``label``, ``geo_tag``, ``tags_id``, ``count``
        and ``pct`` (tag count divided by the topic's total story count)
    """
    # total stories in the topic: the denominator for every percentage
    story_total = topic_story_count(user_mediacloud_key(), topics_id)['count']

    # tag counts are restricted to the overall timespan
    all_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_matches = [span for span in all_timespans if span['period'] == "overall"]
    overall = next(iter(overall_matches))
    query = "timespans_id:{}".format(overall['timespans_id'])
    geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id,
                                        GEO_TAG_SET, GEO_SAMPLE_SIZE, query)

    # keep only geo tags on the country whitelist, then cap the list
    countries = []
    for geo_tag in geo_tags:
        geonames_id = int(geo_tag['tag'].split('_')[1])
        if geonames_id in COUNTRY_GEONAMES_ID_TO_APLHA3.keys():
            countries.append(geo_tag)
    countries = countries[:num_countries]

    # shape each country for the UI
    return [
        {
            'label': country['label'],
            'geo_tag': country['tag'],
            'tags_id': country['tags_id'],
            'count': country['count'],
            # story_tag_count / total story per topic count
            'pct': float(country['count']) / float(story_total),
        }
        for country in countries
    ]
コード例 #17
0
def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    """Return one focal set with split story counts attached to each focus.

    The snapshot/timespan/focus filters are read from the request arguments.
    The selected timespan is first resolved to an entry of the snapshot's
    timespan list fetched without a focus id (the "base" timespan). The
    timespans that ``apicache.matching_timespans_in_foci`` returns for the
    focal set's foci are then queried one by one, and each result is stored
    under ``split_story_counts`` on the focus at the same index.

    :param topics_id: id of the topic being queried
    :param focal_sets_id: id of the focal set whose foci are compared
    :return: ``jsonify`` of the focal set, or a ``json_error_response`` when
        the base timespan or the focal set cannot be found
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan with each focus
    base_snapshot_timespans = apicache.cached_topic_timespan_list(
        user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    # Bind the name before branching so that a failed search in either branch
    # reaches the "couldn't find" response below instead of raising
    # UnboundLocalError.
    base_timespan = None
    if foci_id is not None:
        # a focus is selected, so map its timespan back onto the base list
        timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id,
                                           timespans_id)
        for t in base_snapshot_timespans:
            if apicache.is_timespans_match(timespan, t):
                base_timespan = t
    else:
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
                logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # look up the focal set of interest
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id,
                                         snapshots_id, focal_sets_id)
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the story split counts for each focus
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan,
                                                    focal_set['foci'])
    for idx, focus_timespan in enumerate(timespans):
        data = apicache.topic_split_story_counts(
            user_mediacloud_key(),
            topics_id,
            snapshots_id=snapshots_id,
            timespans_id=focus_timespan['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)
コード例 #18
0
def topic_timespan_list(topics_id, snapshots_id):
    """Return the timespan list of a snapshot as JSON.

    The optional ``focusId`` request argument is forwarded as the focus id.

    :param topics_id: id of the topic
    :param snapshots_id: id of the snapshot
    :return: ``jsonify`` of ``{'list': <timespans>}``
    """
    selected_focus = request.args.get('focusId')
    api_key = user_mediacloud_key()
    payload = {
        'list': apicache.cached_topic_timespan_list(api_key, topics_id,
                                                    snapshots_id, selected_focus),
    }
    return jsonify(payload)
コード例 #19
0
ファイル: topic.py プロジェクト: c4fcm/MediaMeter-TopicMapper
def topic_timespan_list(topics_id, snapshots_id):
    """Return the timespan list of a snapshot as JSON.

    The optional ``focusId`` request argument is forwarded as the focus id.

    :param topics_id: id of the topic
    :param snapshots_id: id of the snapshot
    :return: ``jsonify`` of ``{'list': <timespans>}``
    """
    focus_filter = request.args.get('focusId')
    found = cached_topic_timespan_list(
        user_mediacloud_key(),
        topics_id,
        snapshots_id,
        focus_filter,
    )
    response_body = {'list': found}
    return jsonify(response_body)