Example #1
def stream_story_count_csv(fn, search_id_or_query_list):
    '''
    Helper method to stream a list of story counts back to the client as a CSV. Any args you pass in
    will simply be passed through to the underlying story count queries.
    '''
    # if we have a search id, we load the samples from our sample searches file
    filename = ''
    story_count_results = []
    SAMPLE_SEARCHES = load_sample_searches()
    try:
        search_id = int(search_id_or_query_list)
        if search_id >= 0:

            sample_queries = SAMPLE_SEARCHES[search_id]['queries']

            for query in sample_queries:
                solr_query = prep_simple_solr_query(query)
                story_count_result = cached_story_count(solr_query)
                query_and_story_count = {'query': query['label'], 'count': story_count_result['count']}
                story_count_results.append(query_and_story_count)

    except Exception:
        # not an integer id, so treat the argument as a JSON-encoded list of custom queries
        custom_queries = json.loads(search_id_or_query_list)

        for query in custom_queries:
            solr_query = parse_query_with_keywords(query)
            filename = fn + query['q']

            story_count_result = cached_story_count(solr_query)
            query_and_story_count = {'query': query['label'], 'count': story_count_result['count']}
            story_count_results.append(query_and_story_count)
    
    props = ['query', 'count']
    return csv.stream_response(story_count_results, props, filename)
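The pattern this helper shares with the others below is trying to read its argument as a sample-search id and falling back to JSON on failure. A minimal standalone sketch of just that dispatch step, with hypothetical inputs (parse_search_arg is not part of the original module):

import json

def parse_search_arg(search_id_or_query_list):
    # try to read the argument as a sample-search id first
    try:
        return 'sample', int(search_id_or_query_list)
    except (ValueError, TypeError):
        # not an integer, so assume a JSON-encoded list of custom queries
        return 'custom', json.loads(search_id_or_query_list)

print(parse_search_arg('2'))                 # ('sample', 2)
print(parse_search_arg('[{"q": "media"}]'))  # ('custom', [{'q': 'media'}])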
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
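One thing to watch in the ratio computed above: normalized_total can be zero when the open query matches nothing, which would raise ZeroDivisionError. A guarded variant is sketched below as an assumption; the code shown here does not do this:

def safe_ratio(matching, total):
    # avoid ZeroDivisionError when the open query matched no stories
    return float(matching) / float(total) if total else 0.0

print(safe_ratio(25, 100))  # 0.25
print(safe_ratio(25, 0))    # 0.0 instead of a crash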
Example #3
def api_explorer_demo_compare_words():
    search_id = int(
        request.args['search_id']) if 'search_id' in request.args else None

    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        compared_sample_queries = sample_searches[search_id]['queries']
        results = []
        for cq in compared_sample_queries:
            solr_q, solr_fq = parse_query_with_keywords(cq)
            word_count_result = query_wordcount(solr_q, solr_fq)
            results.append(word_count_result)
    else:
        compared_queries = request.args['compared_queries[]'].split(',')
        results = []
        for cq in compared_queries:
            dictq = {x[0]: x[1] for x in [x.split("=") for x in cq[1:].split("&")]}
            solr_q, solr_fq = parse_query_with_keywords(dictq)
            word_count_result = query_wordcount(solr_q, solr_fq)
            results.append(word_count_result)

    return jsonify({"results": results})
def api_explorer_demo_sentences_count():
    two_weeks_before_now = datetime.datetime.now() - datetime.timedelta(
        days=14)
    start_date = two_weeks_before_now.strftime("%Y-%m-%d")
    end_date = datetime.datetime.now().strftime("%Y-%m-%d")

    search_id = int(
        request.args['search_id']) if 'search_id' in request.args else None
    index = int(request.args['index']) if 'index' in request.args else None

    if isinstance(search_id, int) and search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(
            request.args, current_search)

        if index is not None and index < len(current_search):
            start_date = current_search[index]['startDate']
            end_date = current_search[index]['endDate']
    else:
        solr_query = parse_query_with_keywords(request.args)
    # use only the cached sentence-count helper; the direct mc.sentenceCount call this made
    # alongside it fetched the same data and its result was never used
    results = cached_by_query_sentence_counts(solr_query, start_date, end_date)

    return jsonify(results)
Example #5
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(
            solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def api_explorer_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
        # don't have the per-query q info here to append to the filename
        SAMPLE_SEARCHES = load_sample_searches()
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'date': q['startDate'],
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['date', 'query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
Example #7
def stream_geo_csv(fn, search_id_or_query, index):
    filename = ''

    # TODO: there is duplicate code here...
    SAMPLE_SEARCHES = load_sample_searches()
    try:
        search_id = int(search_id_or_query)
        if search_id >= 0:
            current_search = SAMPLE_SEARCHES[search_id]['queries']
            solr_query = parse_query_with_args_and_sample_search(
                search_id, current_search)

            if int(index) < len(current_search):
                start_date = current_search[int(index)]['startDate']
                end_date = current_search[int(index)]['endDate']
                filename = fn + current_search[int(index)]['q']
    except Exception:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        solr_query = parse_query_with_keywords(current_query)
        filename = fn + current_query['q']

    res = cached_geotags(solr_query)
    res = [r for r in res
           if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3]
    for r in res:
        # res was already filtered down to countries above, so no extra check is needed
        geonamesId = int(r['tag'].split('_')[1])
        r['geonamesId'] = geonamesId
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonamesId]
        # the API returns count as a string, so coerce it before normalizing by the geo sample size
        r['count'] = float(r['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
                r['value'] = r['count']

    props = ['label', 'count']
    return csv.stream_response(res, props, filename)
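The geo results above assume tags shaped like 'geonames_6252001', where the numeric part is a GeoNames id. A minimal sketch of the id extraction and country filter, using a stand-in two-entry mapping (the real COUNTRY_GEONAMES_ID_TO_APLHA3 table covers every country):

GEONAMES_ID_TO_ALPHA3 = {6252001: 'USA', 2921044: 'DEU'}  # stand-in subset

tags = [{'tag': 'geonames_6252001', 'count': '40'},
        {'tag': 'geonames_5128581', 'count': '7'}]  # a city-level id, filtered out below
countries = [t for t in tags
             if int(t['tag'].split('_')[1]) in GEONAMES_ID_TO_ALPHA3]
for t in countries:
    t['alpha3'] = GEONAMES_ID_TO_ALPHA3[int(t['tag'].split('_')[1])]
print(countries)  # only the country-level row survives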
Example #8
def demo_top_tags_with_coverage(tag_sets_id):
    # parses the query for you
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, current_search)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    return apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id)
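The `int(request.args['search_id']) if 'search_id' in request.args else None` idiom recurs in almost every view here. Werkzeug's MultiDict.get can do the same coercion-with-default in one call; a sketch of that alternative (the original views don't use this form):

from werkzeug.datastructures import MultiDict

args = MultiDict([('search_id', '3')])  # stands in for flask.request.args
search_id = args.get('search_id', default=None, type=int)
print(search_id)  # 3; a missing or non-integer value yields None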
Example #9
def demo_top_tags_with_coverage(tag_sets_id):
    # parses the query for you
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    query_index = int(request.args['index']) if 'index' in request.args else None
    if query_index is None and search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    return apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id)
Example #10
def api_explorer_demo_geotag_count():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, current_search)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    data = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    data['results'] = _filter_for_countries(data['results'])
    return jsonify(data)
Example #11
def api_explorer_demo_story_sample():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    
    if search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, current_search)
    else:
        solr_query = parse_query_with_keywords(request.args)
 
    story_count_result = cached_story_samples(solr_query)
    return jsonify(story_count_result)  
Example #12
def api_explorer_demo_story_count():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, current_search)
    else:
        solr_query = parse_query_with_keywords(request.args)

    story_count_result = cached_story_count(solr_query)
    # maybe check admin role before we run this?
    return jsonify(story_count_result)  # give them back new data, so they can update the client
def stream_sentence_count_csv(fn, search_id_or_query, index):

    two_weeks_before_now = datetime.datetime.now() - datetime.timedelta(
        days=14)
    start_date = two_weeks_before_now.strftime("%Y-%m-%d")
    end_date = datetime.datetime.now().strftime("%Y-%m-%d")

    filename = fn  # make sure filename is always defined, even if no suffix is added below
    SAMPLE_SEARCHES = load_sample_searches()  # TODO: some duplicate code here
    try:
        search_id = int(search_id_or_query)
        if search_id >= 0:
            current_search = SAMPLE_SEARCHES[search_id]['queries']
            solr_query = parse_query_with_args_and_sample_search(
                search_id, current_search)

            if int(index) < len(current_search):
                start_date = current_search[int(index)]['startDate']
                end_date = current_search[int(index)]['endDate']
                filename = fn + current_search[int(index)]['q']

    except Exception:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        solr_query = parse_query_with_keywords(current_query)  # TODO don't mod the start and end date unless permissions
        filename = fn + current_query['q']

    results = cached_by_query_sentence_counts(solr_query, start_date, end_date)  # get dates out of query?
    clean_results = [{'date': date, 'sentences': count}
                     for date, count in results['split'].items()
                     if date not in ['gap', 'start', 'end']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'sentences']
    return csv.stream_response(clean_results, props, filename)
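The 'split' payload consumed above is assumed to mix real date keys with 'start', 'end', and 'gap' metadata keys. A self-contained sketch of the same filter-and-sort step on a hypothetical payload:

from operator import itemgetter

split = {'start': '2023-01-01', 'end': '2023-01-15', 'gap': '+1DAY',
         '2023-01-02': 5, '2023-01-01': 3}  # hypothetical API shape
clean = sorted(({'date': d, 'sentences': c} for d, c in split.items()
                if d not in ['gap', 'start', 'end']),
               key=itemgetter('date'))
print(clean)  # [{'date': '2023-01-01', 'sentences': 3}, {'date': '2023-01-02', 'sentences': 5}]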
Example #14
def explorer_wordcount_csv():
    
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    
    if search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, current_search)
    else:
        solr_query = parse_query_with_keywords(request.args)
        # TODO what about other params: date etc for demo..

    return stream_wordcount_csv(mc, 'wordcounts-Explorer', solr_query)
def api_explorer_demo_story_sample():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, current_search)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)

    story_sample_result = apicache.random_story_list(solr_q, solr_fq, SAMPLE_STORY_COUNT)
    for story in story_sample_result:
        story["media"] = apicache.media(story["media_id"])
    return jsonify(story_sample_result)
def api_explorer_demo_story_sample():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args['index'])
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)

    story_sample_result = apicache.random_story_list(solr_q, solr_fq, SAMPLE_STORY_COUNT)
    for story in story_sample_result:
        story["media"] = server.views.apicache.media(story["media_id"])
    return jsonify({"results": story_sample_result})
Example #17
def get_word_count():
    search_id = int(
        request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(
            request.args, current_search)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    word_data = query_wordcount(solr_q, solr_fq)
    # return combined data
    return jsonify({"list": word_data})
Example #18
def explorer_stories_csv(search_id_or_query, index):
    filename = ''
    SAMPLE_SEARCHES = load_sample_searches()
    try:
        search_id = int(search_id_or_query)
        if search_id >= 0:
            SAMPLE_SEARCHES = load_sample_searches()
            current_search = SAMPLE_SEARCHES[search_id]['queries']
            solr_query = parse_query_with_args_and_sample_search(search_id, current_search)

            if int(index) < len(current_search):
                start_date = current_search[int(index)]['startDate']
                end_date = current_search[int(index)]['endDate']
                filename = 'explorer-stories-' + current_search[int(index)]['q']
    except Exception:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        solr_query = parse_query_with_keywords(current_query)
        filename = 'explorer-stories-' + current_query['q']

    story_count_result = cached_story_samples(solr_query)
    
    return stream_story_samples_csv(filename, story_count_result)
def _get_word_count():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    sample_size = int(request.args['sample_size']) if 'sample_size' in request.args else WORD_COUNT_SAMPLE_SIZE
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(
            request.args, current_search)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    word_data = query_wordcount(solr_q, solr_fq, sample_size=sample_size)
    # return combined data
    return jsonify({"list": word_data, "sample_size": str(sample_size)})
def api_explorer_demo_story_split_count():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None

    if isinstance(search_id, int) and search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args['index'])
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    # why is this call fundamentally different than the cache call???
    solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                 media_ids=[],
                                                 tags_ids=DEFAULT_COLLECTION_IDS)
    results = apicache.normalized_and_story_split_count(solr_q, solr_fq, solr_open_query)

    return jsonify({'results': results})
Example #21
def geotag_count():
    two_weeks_before_now = datetime.datetime.now() - datetime.timedelta(
        days=14)
    start_date = two_weeks_before_now.strftime("%Y-%m-%d")
    end_date = datetime.datetime.now().strftime("%Y-%m-%d")

    search_id = int(
        request.args['search_id']) if 'search_id' in request.args else None
    index = int(request.args['index']) if 'index' in request.args else None

    if search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(
            request.args, current_search)
    else:
        solr_query = parse_query_with_keywords(request.args)

    # TODO coverage here
    # total_stories = mc.storyCount(solr_query)
    # geotagged_stories = mc.storyCount("({}) AND (tags_id_stories:{})".format(solr_query, CLIFF_CLAVIN_2_3_0_TAG_ID))
    # coverage_pct = float(geotagged_stories) / float(total_stories)

    res = cached_geotags(solr_query)
    res = [r for r in res
           if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3]
    for r in res:
        # res was already filtered down to countries above, so no extra check is needed
        geonamesId = int(r['tag'].split('_')[1])
        r['geonamesId'] = geonamesId  # TODO: move this to JS?
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonamesId]
        # the API returns count as a string, so coerce it before normalizing by the geo sample size
        r['count'] = float(r['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
                r['value'] = r['count']

    # results = {'coverage': coverage_pct, 'list': res }
    return jsonify(res)
Example #22
def api_explorer_demo_story_split_count():
    search_id = int(
        request.args['search_id']) if 'search_id' in request.args else None

    if isinstance(search_id, int) and search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(
            request.args, current_search)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    # why is this call fundamentally different than the cache call???
    solr_open_query = concatenate_query_for_solr(
        solr_seed_query='*', media_ids=[], tags_ids=DEFAULT_COLLECTION_IDS)
    results = apicache.normalized_and_story_split_count(
        solr_q, solr_fq, solr_open_query)

    return jsonify({'results': results})
def api_explorer_story_split_count():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    index = int(request.args['index']) if 'index' in request.args else None

    # get specific stories by keyword
    if isinstance(search_id, int) and search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args['index'])
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)

    # get all the stories (no keyword)
    solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                 media_ids=request.args['sources'],
                                                 tags_ids=request.args['collections'])
    results = apicache.normalized_and_story_split_count(solr_q, solr_fq, solr_open_query)

    return jsonify({'results': results})
Example #24
def api_explorer_story_split_count():
    search_id = int(
        request.args['search_id']) if 'search_id' in request.args else None
    index = int(request.args['index']) if 'index' in request.args else None

    if isinstance(search_id, int) and search_id not in [None, -1]:
        SAMPLE_SEARCHES = load_sample_searches()
        current_search = SAMPLE_SEARCHES[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(
            request.args, current_search)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    solr_open_query = concatenate_query_for_solr(
        solr_seed_query='*',
        media_ids=request.args['sources'],
        tags_ids=request.args['collections'])
    results = apicache.normalized_and_story_split_count(
        solr_q, solr_fq, solr_open_query)

    return jsonify({'results': results})
Example #25
def api_explorer_demo_compare_words():
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        compared_sample_queries = sample_searches[search_id]['queries']
        results = []
        for cq in compared_sample_queries:
            solr_q, solr_fq = parse_query_with_keywords(cq)
            word_count_result = query_wordcount(solr_q, solr_fq)
            results.append(word_count_result)
    else:
        compared_queries = request.args['compared_queries[]'].split(',')
        results = []
        for cq in compared_queries:
            dictq = {x[0]:x[1] for x in [x.split("=") for x in cq[1:].split("&")]}
            solr_q, solr_fq = parse_query_with_keywords(dictq)
            word_count_result = query_wordcount(solr_q, solr_fq)
            results.append(word_count_result)

    return jsonify({"results": results})
import logging
from flask import jsonify, request
import flask_login
import json

from server import app
import server.util.csv as csv
import server.util.pushshift as pushshift
from server.util.request import api_error_handler
from server.views.explorer import parse_as_sample,\
    parse_query_with_keywords, load_sample_searches, file_name_for_download, concatenate_query_for_solr,\
    DEFAULT_COLLECTION_IDS, only_queries_reddit, parse_query_dates
import server.views.explorer.apicache as apicache

SAMPLE_SEARCHES = load_sample_searches()
logger = logging.getLogger(__name__)


@app.route('/api/explorer/stories/count.csv', methods=['POST'])
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)