def cached_geotag_count(user_mc_key, query):
    """Fetch geo tag counts for `query` and keep only the country-level ones.

    The counts come from `sentenceFieldCount` on the client returned by
    `user_admin_mediacloud_client()`, restricted to `tag_utl.GEO_TAG_SET` and
    sampled with `tag_utl.GEO_SAMPLE_SIZE`. A record is kept when the integer
    in the second '_'-separated field of its `tag` is a key of
    COUNTRY_GEONAMES_ID_TO_APLHA3; kept records are updated in place.

    :param user_mc_key: never read in the body. NOTE(review): presumably it
        exists so a caching layer can key on it -- confirm against the caller.
    :param query: passed straight through to `sentenceFieldCount`.
    :return: list of the kept records with `geonamesId`, `alpha3` and a
        `count` rescaled to count / sample size. `iso-a2` and `value` are
        present only for records whose alpha3 code appears in HIGHCHARTS_KEYS.
    """
    client = user_admin_mediacloud_client()
    raw_counts = client.sentenceFieldCount(
        '*',
        query,
        tag_sets_id=tag_utl.GEO_TAG_SET,
        sample_size=tag_utl.GEO_SAMPLE_SIZE,
    )
    # parse each tag exactly once, pairing the geonames id with its record
    parsed = [(int(entry['tag'].split('_')[1]), entry) for entry in raw_counts]
    # only include countries
    countries = [(gid, entry) for gid, entry in parsed
                 if gid in COUNTRY_GEONAMES_ID_TO_APLHA3]
    for gid, entry in countries:
        entry['geonamesId'] = gid  # TODO: move this to JS?
        entry['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[gid]
        # the count is converted with float() before scaling because, per the
        # note left by the original author, the API returns it as a string
        entry['count'] = float(entry['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        for shape in HIGHCHARTS_KEYS:
            shape_props = shape['properties']
            if shape_props['iso-a3'] == entry['alpha3']:
                entry['iso-a2'] = shape_props['iso-a2']
                entry['value'] = entry['count']
    return [entry for _, entry in countries]
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """Return the first `num_countries` country geo tags of a topic.

    Tag counts are requested for the topic's "overall" timespan and filtered
    to tags whose geonames id (the integer in the second '_'-separated field
    of `tag`) is a key of COUNTRY_GEONAMES_ID_TO_APLHA3. The order in which
    `_cached_topic_tag_counts` returns the tags is kept as-is.

    :param topics_id: id of the topic to inspect.
    :param num_countries: maximum number of entries to return (applied as a
        list slice after filtering).
    :return: list of dicts with `label`, `geo_tag`, `tags_id`, `count` and
        `pct` (tag count divided by the topic's total story count).
    :raises StopIteration: if the topic has no timespan whose `period` is
        "overall".
    """
    # fetch the key once and reuse it for all three API helpers
    user_mc_key = user_mediacloud_key()

    # get the total stories for a topic
    total_stories = topic_story_count(user_mc_key, topics_id)['count']

    # get the top countries by the story tag counts with the overall timespan
    timespans = cached_topic_timespan_list(user_mc_key, topics_id)
    overall_timespan = next(t for t in timespans if t['period'] == "overall")
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mc_key, topics_id, GEO_TAG_SET, GEO_SAMPLE_SIZE, timespan_query)

    # make sure the geo tag is in the geo_tags whitelist (is a country);
    # testing membership on the mapping itself is a hash lookup, so no list
    # of keys has to be rebuilt and scanned for every tag
    country_tag_counts = [
        tag for tag in top_geo_tags
        if int(tag['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3
    ][:num_countries]

    # for each country, set up the requisite info for UI
    return [
        {
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            # story_tag_count / total story per topic count
            'pct': float(tag['count']) / float(total_stories),
        }
        for tag in country_tag_counts
    ]
def cached_geotag_count(user_mc_key, query):
    """Return per-country story tag counts for `query`.

    Story tag counts are requested from the client returned by
    `user_admin_mediacloud_client()` with the QUERY_LAST_MONTH and
    QUERY_ENGLISH_LANGUAGE filters, limited to `tag_utl.GEO_TAG_SET`. Only
    tags whose geonames id (the integer in the second '_'-separated field of
    `tag`) is a key of COUNTRY_GEONAMES_ID_TO_APLHA3 are kept; the kept
    records are updated in place.

    :param user_mc_key: never read in the body. NOTE(review): presumably it
        exists so a caching layer can key on it -- confirm against the caller.
    :param query: passed to both the tag count and the story count request.
    :return: list of kept records with `geonamesId`, `alpha3`, `pct` (count
        divided by the story count of the last-month query) and `value`
        (count as a float). `iso-a2` is added only when a HIGHCHARTS_KEYS
        entry has the same alpha3 code.
    """
    client = user_admin_mediacloud_client()
    tag_counts = client.storyTagCount(
        query,
        [QUERY_LAST_MONTH, QUERY_ENGLISH_LANGUAGE],
        tag_sets_id=tag_utl.GEO_TAG_SET,
    )
    total_stories = apicache.cached_timeperiod_story_count(
        client, query, QUERY_LAST_MONTH)['count']

    # only include countries; remember the parsed id so it is not parsed again
    countries = []
    for record in tag_counts:
        geonames_id = int(record['tag'].split('_')[1])
        if geonames_id in COUNTRY_GEONAMES_ID_TO_APLHA3:
            countries.append((geonames_id, record))

    for geonames_id, record in countries:
        record['geonamesId'] = geonames_id
        record['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        story_count = float(record['count'])
        record['pct'] = story_count / float(total_stories)
        record['value'] = story_count
        for shape in HIGHCHARTS_KEYS:
            shape_props = shape['properties']
            if shape_props['iso-a3'] == record['alpha3']:
                record['iso-a2'] = shape_props['iso-a2']
    return [record for _, record in countries]
def stream_geo_csv(fn, search_id_or_query, index):
    """Stream the country-level geotag counts of one query as CSV.

    `search_id_or_query` is either a value `int()` accepts, which selects an
    entry of `load_sample_searches()`, or a JSON-encoded list of keyword
    queries, of which only the first one is used.

    :param fn: filename prefix; the selected query's `q` is appended to it.
    :param search_id_or_query: sample search id or JSON-encoded query list.
    :param index: position of the query inside the sample search. It is only
        consulted for sample searches; when it is past the end of the
        search's queries the filename stays empty.
    :return: the result of `csv.stream_response` for the `label` and `count`
        columns.
    :raises ValueError: if `search_id_or_query` is a negative integer, which
        names neither a sample search nor a keyword query.
    """
    filename = ''

    # Only the integer conversion decides which kind of input we were given.
    # Keeping the try this narrow lets real failures in the sample-search
    # path surface as themselves instead of being retried as JSON.
    try:
        search_id = int(search_id_or_query)
    except (TypeError, ValueError):
        search_id = None

    if search_id is None:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        current_query = json.loads(search_id_or_query)[0]
        solr_query = parse_query_with_keywords(current_query)
        filename = fn + current_query['q']
    elif search_id >= 0:
        current_search = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(
            search_id, current_search)
        query_index = int(index)
        if query_index < len(current_search):
            filename = fn + current_search[query_index]['q']
    else:
        # previously this case fell through with no query defined at all
        raise ValueError(
            "negative sample search id: {!r}".format(search_id_or_query))

    rows = []
    for record in cached_geotags(solr_query):
        geonames_id = int(record['tag'].split('_')[1])
        if geonames_id not in COUNTRY_GEONAMES_ID_TO_APLHA3:  # only include countries
            continue
        # Work on a copy so the rescaled count never lands on the source record.
        # NOTE(review): assumes cached_geotags() may hand out the same objects
        # on a later call, in which case in-place rescaling would compound.
        row = dict(record)
        row['geonamesId'] = geonames_id
        row['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        # float(): per the original author's note the API returns the count as a string
        row['count'] = float(row['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == row['alpha3']:
                row['iso-a2'] = hq['properties']['iso-a2']
                row['value'] = row['count']
        rows.append(row)

    props = ['label', 'count']
    return csv.stream_response(rows, props, filename)
def cached_geotag_count(user_mc_key, query):
    """Return per-country story tag counts for `query`.

    Requests story tag counts (QUERY_LAST_MONTH and QUERY_ENGLISH_LANGUAGE
    filters, `tag_utl.GEO_TAG_SET` only) from the client returned by
    `user_admin_mediacloud_client()`, keeps the tags whose geonames id is a
    key of COUNTRY_GEONAMES_ID_TO_APLHA3 and decorates those records in place.

    :param user_mc_key: never read in the body. NOTE(review): presumably it
        exists so a caching layer can key on it -- confirm against the caller.
    :param query: passed to both the tag count and the story count request.
    :return: list of kept records with `geonamesId`, `alpha3`, `pct` (count
        divided by the last-month story count) and `value` (count as a
        float); `iso-a2` is added only when HIGHCHARTS_KEYS has an entry
        with the same alpha3 code.
    """
    user_mc = user_admin_mediacloud_client()
    res = user_mc.storyTagCount(query, [QUERY_LAST_MONTH, QUERY_ENGLISH_LANGUAGE], tag_sets_id=tag_utl.GEO_TAG_SET)
    full_count = apicache.timeperiod_story_count(user_mc, query, QUERY_LAST_MONTH)['count']

    country_counts = []
    for r in res:
        # the geonames id is the integer in the second '_'-separated field of the tag
        geonames_id = int(r['tag'].split('_')[1])
        # membership on the mapping itself is a hash lookup; no key list is
        # rebuilt and scanned per record
        if geonames_id not in COUNTRY_GEONAMES_ID_TO_APLHA3:  # only include countries
            continue
        count = float(r['count'])
        r['geonamesId'] = geonames_id
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        r['pct'] = count / float(full_count)
        r['value'] = count
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
        country_counts.append(r)
    return country_counts
def geotag_count():
    """Return country-level geotag counts for the current request as JSON.

    When the request carries a `search_id` other than -1, the query is built
    from that entry of `load_sample_searches()`; otherwise it is built from
    the request's keyword arguments. The records returned by
    `cached_geotags()` are filtered to tags whose geonames id is a key of
    COUNTRY_GEONAMES_ID_TO_APLHA3 and decorated for mapping.

    :return: `jsonify` response holding a list of records with `geonamesId`,
        `alpha3` and `count` (raw count divided by
        `tag_utl.GEO_SAMPLE_SIZE`); `iso-a2` and `value` are present only
        for records whose alpha3 code appears in HIGHCHARTS_KEYS.
    """
    search_id = int(
        request.args['search_id']) if 'search_id' in request.args else None

    if search_id not in [None, -1]:
        current_search = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(
            request.args, current_search)
    else:
        solr_query = parse_query_with_keywords(request.args)

    # TODO coverage here
    # total_stories = mc.storyCount(solr_query)
    # geotagged_stories = mc.storyCount("({}) AND (tags_id_stories:{})".format(solr_query, CLIFF_CLAVIN_2_3_0_TAG_ID))
    # coverage_pct = float(geotagged_stories) / float(total_stories)

    res = []
    for record in cached_geotags(solr_query):
        # the geonames id is the integer in the second '_'-separated field of the tag
        geonames_id = int(record['tag'].split('_')[1])
        if geonames_id not in COUNTRY_GEONAMES_ID_TO_APLHA3:  # only include countries
            continue
        # Work on a copy so the rescaled count never lands on the source record.
        # NOTE(review): assumes cached_geotags() may hand out the same objects
        # on a later call, in which case in-place rescaling would compound.
        row = dict(record)
        row['geonamesId'] = geonames_id  # TODO: move this to JS?
        row['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        # float(): per the original author's note the API returns the count as a string
        row['count'] = float(row['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == row['alpha3']:
                row['iso-a2'] = hq['properties']['iso-a2']
                row['value'] = row['count']
        res.append(row)

    # results = {'coverage': coverage_pct, 'list': res }
    return jsonify(res)
def _geo_tag_counts(user_mc_key, topics_id):
    """Return the country-level geo tag counts of a topic, ready for mapping.

    Tag counts are fetched with `topic_tag_counts` for `tag_util.GEO_TAG_SET`
    and `tag_util.GEO_SAMPLE_SIZE`. A record is kept when the integer in the
    second '_'-separated field of its `tag` is a key of
    COUNTRY_GEONAMES_ID_TO_APLHA3; kept records are updated in place.

    :param user_mc_key: key handed on to `topic_tag_counts`.
    :param topics_id: id of the topic handed on to `topic_tag_counts`.
    :return: list of kept records with `geonamesId` and `alpha3`; `iso-a2`
        and `value` (a copy of `count`) are added only when a HIGHCHARTS_KEYS
        entry has the same alpha3 code.
    """
    tag_counts = topic_tag_counts(user_mc_key, topics_id, tag_util.GEO_TAG_SET,
                                  tag_util.GEO_SAMPLE_SIZE)
    # filter for countries, add in highcharts metadata
    country_tag_counts = []
    for r in tag_counts:
        geonames_id = int(r['tag'].split('_')[1])
        # membership on the mapping itself is a hash lookup; no key list is
        # rebuilt and scanned per record
        if geonames_id not in COUNTRY_GEONAMES_ID_TO_APLHA3:  # only include countries
            continue
        r['geonamesId'] = geonames_id  # TODO: move this to JS?
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
                r['value'] = r['count']
        country_tag_counts.append(r)
    return country_tag_counts
def _geo_tag_counts(user_mc_key, topics_id):
    """Collect a topic's geo tag counts and keep the country entries.

    Counts are requested through `topic_tag_counts` using
    `tag_util.GEO_TAG_SET` and `tag_util.GEO_SAMPLE_SIZE`. Entries whose
    geonames id (the integer in the second '_'-separated field of `tag`) is
    a key of COUNTRY_GEONAMES_ID_TO_APLHA3 are kept and updated in place.

    :param user_mc_key: key forwarded to `topic_tag_counts`.
    :param topics_id: topic id forwarded to `topic_tag_counts`.
    :return: list of kept entries carrying `geonamesId` and `alpha3`, plus
        `iso-a2` and `value` when HIGHCHARTS_KEYS has a matching alpha3 code.
    """
    all_counts = topic_tag_counts(user_mc_key, topics_id, tag_util.GEO_TAG_SET,
                                  tag_util.GEO_SAMPLE_SIZE)

    # filter for countries, remembering each parsed id for the second pass
    countries = []
    for entry in all_counts:
        geonames_id = int(entry['tag'].split('_')[1])
        if geonames_id in COUNTRY_GEONAMES_ID_TO_APLHA3:
            countries.append((geonames_id, entry))

    # add in highcharts metadata
    for geonames_id, entry in countries:
        entry['geonamesId'] = geonames_id  # TODO: move this to JS?
        entry['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        # read unconditionally, so an entry lacking `count` fails here even
        # when no map shape matches it
        tag_count = entry['count']
        for shape in HIGHCHARTS_KEYS:
            shape_props = shape['properties']
            if shape_props['iso-a3'] == entry['alpha3']:
                entry['iso-a2'] = shape_props['iso-a2']
                entry['value'] = tag_count
    return [entry for _, entry in countries]
def _filter_for_countries(top_geo_tags):
    """Keep only the country tags of `top_geo_tags` and add mapping fields.

    The tag set mixes country and state tags, so each tag's geonames id (the
    integer in the second '_'-separated field, e.g. "geonames_6252001") is
    checked against the keys of COUNTRY_GEONAMES_ID_TO_APLHA3.

    :param top_geo_tags: iterable of tag dicts with at least a `tag` entry;
        the dicts that are kept are updated in place.
    :return: new list holding the kept dicts, each with `geonamesId` and
        `alpha3`; `iso-a2` and `value` (a copy of `count`) are added only
        when a HIGHCHARTS_KEYS entry has the same alpha3 code.
    """
    # 1: parse the geonames id out of every tag and verify it on the whitelist
    countries = []
    for tag in top_geo_tags:
        geonames_id = int(tag['tag'].split('_')[1])
        if geonames_id in COUNTRY_GEONAMES_ID_TO_APLHA3:
            countries.append((geonames_id, tag))

    # 2: now add in helpful data for mapping it
    for geonames_id, tag in countries:
        tag['geonamesId'] = geonames_id    # TODO: move this to JS?
        tag['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        # TODO: front-end (Highcharts) details are coupled into the back-end here
        for shape in HIGHCHARTS_KEYS:
            shape_props = shape['properties']
            if shape_props['iso-a3'] == tag['alpha3']:
                tag['iso-a2'] = shape_props['iso-a2']
                tag['value'] = tag['count']
    return [tag for _, tag in countries]
Exemple #10
0
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    """List the leading country geo tags of a topic with their story share.

    The geo tag counts of the topic's "overall" timespan are filtered to the
    tags whose geonames id (the integer in the second '_'-separated field of
    `tag`) is a key of COUNTRY_GEONAMES_ID_TO_APLHA3, then cut to the first
    `num_countries` entries in the order they were returned.

    :param topics_id: id of the topic to inspect.
    :param num_countries: upper bound on the number of entries, applied as a
        list slice after filtering.
    :return: list of dicts with `label`, `geo_tag`, `tags_id`, `count` and
        `pct` (tag count divided by the topic's total story count).
    :raises StopIteration: if no timespan of the topic has `period` equal to
        "overall".
    """
    def _is_country(geo_tag):
        # make sure the geo tag is in the geo_tags whitelist (is a country)
        return int(geo_tag['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3

    # get the total stories for a topic
    story_total = topic_story_count(user_mediacloud_key(), topics_id)['count']

    # get the top countries by the story tag counts with the overall timespan
    all_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_matches = [span for span in all_timespans
                       if span['period'] == "overall"]
    overall = next(iter(overall_matches))
    query = "timespans_id:{}".format(overall['timespans_id'])
    geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id,
                                        GEO_TAG_SET, GEO_SAMPLE_SIZE, query)

    countries = list(filter(_is_country, geo_tags))[:num_countries]

    # for each country, set up the requisite info for UI
    return [
        {
            'label': country['label'],
            'geo_tag': country['tag'],
            'tags_id': country['tags_id'],
            'count': country['count'],
            # story_tag_count / total story per topic count
            'pct': float(country['count']) / float(story_total),
        }
        for country in countries
    ]