# NOTE: module-level imports are omitted from this listing. Judging from usage,
# the handlers below assume names roughly like these are in scope (the exact
# source modules are assumptions): json; flask's request; flask_login and a
# logger; the apicache / base_apicache caching helpers; a csv util module with
# stream_response(); tags; pushshift; RedditPushshiftProvider and
# NEWS_SUBREDDITS; query helpers such as load_sample_searches, SAMPLE_SEARCHES,
# parse_as_sample, parse_query_with_keywords, parse_query_dates,
# concatenate_query_for_solr, file_name_for_download, and only_queries_reddit;
# plus constants WORD_COUNT_SAMPLE_SIZE, TAG_COUNT_DOWNLOAD_LENGTH, and
# ENTITY_DOWNLOAD_COLUMNS.


def api_explorer_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
        filename = filename  # don't have this info + current_query['q']
        SAMPLE_SEARCHES = load_sample_searches()
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
        label = " ".join([q['label'] for q in queries])
        filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'date': q['startDate'],
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['date', 'query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)

def explorer_story_count_csv():
    filename = u'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)

def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)

def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            start_date, end_date = parse_query_dates(q)
            story_counts = pushshift.reddit_submission_normalized_and_split_story_count(
                query=q['q'], start_date=start_date, end_date=end_date,
                subreddits=pushshift.NEWS_SUBREDDITS)
        else:
            solr_q, solr_fq = parse_query_with_keywords(q)
            solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                         tags_ids=q['collections'])
            story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)

def explorer_geo_csv():
    filename = 'sampled-geographic-coverage'
    data = request.form
    query_object = json.loads(data['q'])
    solr_q, solr_fq = parse_query_with_keywords(query_object)
    filename = file_name_for_download(query_object['label'], filename)
    data = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    props = ['tags_id', 'label', 'count', 'pct']
    return csv.stream_response(data['results'], props, filename)

def explorer_entities_csv(tag_sets_id):
    tag_set = base_apicache.tag_set(tag_sets_id)
    filename = 'sampled-{}'.format(tag_set['label'])
    data = request.form
    query_object = json.loads(data['q'])
    solr_q, solr_fq = parse_query_with_keywords(query_object)
    filename = file_name_for_download(query_object['label'], filename)
    top_tag_counts = apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id,
                                                     TAG_COUNT_DOWNLOAD_LENGTH)['results']
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS, filename)

def explorer_wordcount_csv():
    data = request.form
    ngram_size = data['ngramSize'] if 'ngramSize' in data else 1  # default to words if ngram not specified
    sample_size = data['sample_size'] if 'sample_size' in data else WORD_COUNT_SAMPLE_SIZE
    filename = 'sampled-{}-ngrams-{}'.format(sample_size, ngram_size)
    query_object = json.loads(data['q'])
    solr_q, solr_fq = parse_query_with_keywords(query_object)
    filename = file_name_for_download(query_object['label'], filename)
    return stream_wordcount_csv(filename, solr_q, solr_fq, ngram_size, sample_size)

def explorer_wordcount_csv():
    data = request.form
    ngram_size = data['ngramSize'] if 'ngramSize' in data else 1  # default to words if ngram not specified
    filename = u'sampled-ngrams-{}'.format(ngram_size)
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    return stream_wordcount_csv(filename, solr_q, solr_fq, ngram_size)

def explorer_entities_csv(tag_sets_id):
    tag_set = apicache.tag_set(tag_sets_id)
    filename = 'sampled-{}'.format(tag_set['label'])
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    top_tag_counts = apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id,
                                                     TAG_COUNT_DOWNLOAD_LENGTH)['results']
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS, filename)

def explorer_wordcount_csv():
    data = request.form
    ngram_size = data['ngramSize'] if 'ngramSize' in data else 1  # default to words if ngram not specified
    sample_size = data['sample_size'] if 'sample_size' in data else WORD_COUNT_SAMPLE_SIZE
    filename = 'sampled-{}-ngrams-{}'.format(sample_size, ngram_size)
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    return stream_wordcount_csv(filename, solr_q, solr_fq, ngram_size, sample_size)

def explorer_geo_csv():
    filename = u'sampled-geographic-coverage'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    data = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    data['results'] = _filter_for_countries(data['results'])
    props = ['label', 'count', 'pct', 'alpha3', 'iso-a2', 'geonamesId', 'tags_id', 'tag']
    return csv.stream_response(data['results'], props, filename)

def explorer_stories_csv():
    filename = 'sampled-stories'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['uid'])
        filename = filename  # don't have this info + current_query['q']
        # for demo users we only download 100 random stories (i.e. not all matching stories)
        return _stream_story_list_csv(filename, solr_q, solr_fq, 100, MediaCloud.SORT_RANDOM, 1)
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
        # now page through all the stories and download them
        return _stream_story_list_csv(filename, solr_q, solr_fq)

def explorer_stories_csv():
    filename = u'sampled-stories'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
        filename = filename  # don't have this info + current_query['q']
        # for demo users we only download 100 random stories (i.e. not all matching stories)
        return _stream_story_list_csv(filename, solr_q, solr_fq, 100, MediaCloud.SORT_RANDOM, 1)
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
        # now page through all the stories and download them
        return _stream_story_list_csv(filename, solr_q, solr_fq)

def api_explorer_combined_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        start_date, end_date = parse_query_dates(q)
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            provider = RedditPushshiftProvider()
            story_counts = provider.normalized_count_over_time(query=q['q'],
                                                               start_date=start_date, end_date=end_date,
                                                               subreddits=NEWS_SUBREDDITS)
        else:
            solr_q, solr_fq = parse_query_with_keywords(q)
            solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                         tags_ids=q['collections'], custom_ids=q['searches'])
            story_counts = apicache.normalized_and_story_split_count(solr_q, solr_open_query,
                                                                     start_date, end_date)
        story_count_results.append({
            'label': q['label'],
            'by_date': story_counts['counts'],
        })
    # now combine them by date
    data = []
    dates = [d['date'] for d in story_count_results[0]['by_date']]
    for idx in range(len(dates)):
        row = {'date': dates[idx]}
        for q in story_count_results:
            row[q['label'] + '-count'] = q['by_date'][idx]['count']
            row[q['label'] + '-total_count'] = q['by_date'][idx]['total_count']
            row[q['label'] + '-ratio'] = q['by_date'][idx]['ratio']
        data.append(row)
    props = ['date'] + [q['label'] + '-count' for q in queries] \
        + [q['label'] + '-total_count' for q in queries] \
        + [q['label'] + '-ratio' for q in queries]
    return csv.stream_response(data, props, filename)

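# Illustrative note (not in the original module): the merge step above pivots
# the per-query date series into one wide row per date. Assuming two queries
# labeled "a" and "b", a merged row would look like:
#   {'date': '2020-01-01',
#    'a-count': 3, 'a-total_count': 90, 'a-ratio': 0.0333,
#    'b-count': 7, 'b-total_count': 80, 'b-ratio': 0.0875}
# Rows are aligned by list index rather than by date lookup, so this assumes
# every query returns counts for the same sequence of dates.
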
def api_explorer_story_split_count_csv():
    filename = u'stories-over-time'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
        filename = filename  # don't have this info + current_query['q']
        # TODO solr_open_query
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=query_object['sources'],
                                                     tags_ids=query_object['collections'])
    results = apicache.normalized_and_story_split_count(solr_q, solr_fq, solr_open_query)
    props = ['date', 'count', 'total_count', 'ratio']
    return csv.stream_response(results['counts'], props, filename)

def explorer_stories_csv():
    logger.info(flask_login.current_user.name)
    filename = 'all-story-urls'
    data = request.form
    q = json.loads(data['q'])
    filename = file_name_for_download(q['label'], filename)
    # now compute total attention for all results
    if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
        start_date, end_date = parse_query_dates(q)
        provider = RedditPushshiftProvider()
        stories = provider.sample(query=q['q'], limit=2000,
                                  start_date=start_date, end_date=end_date,
                                  subreddits=NEWS_SUBREDDITS)
        props = ['stories_id', 'subreddit', 'publish_date', 'score', 'last_updated', 'title', 'url',
                 'full_link', 'author']
        return csv.stream_response(stories, props, filename)
    else:
        solr_q, solr_fq = parse_query_with_keywords(q)
        # now page through all the stories and download them
        return _stream_story_list_csv(filename, solr_q, solr_fq)

def api_explorer_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    q = json.loads(data['q'])
    filename = file_name_for_download(q['label'], filename)
    # now compute total attention for all results
    start_date, end_date = parse_query_dates(q)
    if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
        provider = RedditPushshiftProvider()
        story_counts = provider.normalized_count_over_time(query=q['q'],
                                                           start_date=start_date, end_date=end_date,
                                                           subreddits=NEWS_SUBREDDITS)
    else:
        solr_q, _solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'], custom_ids=q['searches'])
        story_counts = apicache.normalized_and_story_split_count(solr_q, solr_open_query,
                                                                 start_date, end_date)
    props = ['date', 'count', 'total_count', 'ratio']
    return csv.stream_response(story_counts['counts'], props, filename)

def api_explorer_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    if 'searchId' in data:
        filename = filename  # don't have this info + current_query['q']
        q = SAMPLE_SEARCHES[data['index']]
    else:
        q = json.loads(data['q'])
        filename = file_name_for_download(q['label'], filename)
    # now compute total attention for all results
    start_date, end_date = parse_query_dates(q)
    if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
        story_counts = pushshift.reddit_submission_normalized_and_split_story_count(
            query=q['q'], start_date=start_date, end_date=end_date,
            subreddits=pushshift.NEWS_SUBREDDITS)
    else:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_split_count(solr_q, solr_open_query,
                                                                 start_date, end_date)
    props = ['date', 'count', 'total_count', 'ratio']
    return csv.stream_response(story_counts['counts'], props, filename)

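# A minimal sketch of the csv.stream_response() helper that the handlers above
# rely on. This is an assumption inferred from the call sites here, not the
# actual util implementation: it streams a list of dicts as a CSV attachment,
# keeping only the columns named in props.
import csv as csv_lib  # stdlib csv; aliased because `csv` names the util module above
import io

from flask import Response


def stream_response(data, props, filename):
    def generate():
        # build the CSV incrementally so large result sets are never held in memory at once
        buffer = io.StringIO()
        writer = csv_lib.DictWriter(buffer, fieldnames=props, extrasaction='ignore')
        writer.writeheader()
        yield buffer.getvalue()
        buffer.seek(0)
        buffer.truncate(0)
        for row in data:
            writer.writerow(row)
            yield buffer.getvalue()
            buffer.seek(0)
            buffer.truncate(0)

    return Response(generate(), mimetype='text/csv',
                    headers={'Content-Disposition': 'attachment; filename="{}.csv"'.format(filename)})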