def explorer_story_count_csv():
    """Stream a CSV with total matching vs. normalized story counts, one row per query.

    Reads a JSON-encoded list of queries from the POSTed form; reddit-only queries
    (no collections) are answered by Pushshift, everything else by Solr.
    """
    filename = 'total-story-count'
    data = request.form
    queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            start_date, end_date = parse_query_dates(q)
            provider = RedditPushshiftProvider()
            story_counts = provider.normalized_count_over_time(query=q['q'],
                                                               start_date=start_date,
                                                               end_date=end_date,
                                                               subreddits=NEWS_SUBREDDITS)
        else:
            solr_q, solr_fq = parse_query_with_keywords(q)
            # open query (no keywords) over the same sources/collections, used as denominator
            solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                         media_ids=q['sources'],
                                                         tags_ids=q['collections'])
            story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        # guard against ZeroDivisionError when the open query matches no stories at all
        normalized_total = story_counts['normalized_total']
        ratio = float(story_counts['total']) / float(normalized_total) if normalized_total else 0
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': normalized_total,
            'ratio': ratio,
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def parse_query_with_keywords(args):
    """Turn user-supplied query arguments into a Solr (q, fq) pair.

    Returns ('', None) if the arguments cannot be parsed for any reason.
    """
    solr_q = ''
    solr_fq = None
    # should I break this out into just a parse_query_with_keywords routine where we add in
    # the start/end date without relying that the try statement will fail?
    try:
        # if user arguments are present and allowed by the client endpoint, use them,
        # otherwise use defaults
        current_query = "*" if args['q'] == '' else args['q']
        start_date, end_date = parse_query_dates(args)
        solr_q = concatenate_query_for_solr(solr_seed_query=current_query,
                                            media_ids=_parse_media_ids(args),
                                            tags_ids=_parse_collection_ids(args),
                                            custom_ids=args.get('searches', []))
        fmt = "%Y-%m-%d"
        solr_fq = dates_as_filter_query(start_date.strftime(fmt), end_date.strftime(fmt))
    except Exception as e:
        # otherwise, default
        logger.warning(
            "user custom query failed, there's a problem with the arguments " + str(e))
    return solr_q, solr_fq
def _as_query_and_filter_query(cls, query: str, start_date: dt.datetime, end_date: dt.datetime,
                               **kwargs) -> "tuple[str, str]":
    """
    Take all the query params and return q and fq suitable for a media cloud solr-syntax query
    :param query:
    :param start_date:
    :param end_date:
    :param kwargs: sources and collections
    :return: a (q, fq) pair of Solr query strings
    """
    # note: annotation is a string so it stays valid on Python < 3.9 while still
    # expressing the tuple shape (the old `(str, str)` form is not a legal type hint)
    media_ids = kwargs.get('sources', [])
    tags_ids = kwargs.get('collections', [])
    q = concatenate_query_for_solr(query, media_ids, tags_ids)
    fq = MediaCloud.dates_as_query_clause(start_date, end_date)
    return q, fq
def _topic_seed_story_count(topic):
    """Count the stories a topic's seed query matches; None if the query is invalid."""
    try:
        seed_q = concatenate_query_for_solr(
            solr_seed_query=topic['solr_seed_query'],
            media_ids=[m['media_id'] for m in topic['media'] if 'media_id' in m],
            tags_ids=[t['tags_id'] for t in topic['media_tags'] if 'tags_id' in t])
        date_fq = concatenate_solr_dates(start_date=topic['start_date'],
                                         end_date=topic['end_date'])
        return shared_apicache.story_count(q=seed_q, fq=date_fq)['count']
    except mediacloud.error.MCException:
        # the query syntax is wrong (perhaps pre-story-level search)
        return None
def api_explorer_story_split_count():
    """Return normalized story counts over time for the query posted in the form."""
    form = request.form
    start_date, end_date = parse_query_dates(form)
    if only_queries_reddit(form):
        # reddit-only queries are answered by Pushshift rather than Solr
        results = RedditPushshiftProvider().normalized_count_over_time(
            query=form['q'],
            start_date=start_date,
            end_date=end_date,
            subreddits=NEWS_SUBREDDITS)
    else:
        # get specific stories by keyword
        solr_q, _solr_fq = parse_query_with_keywords(form)
        # get all the stories (no keyword) so we can support normalization
        solr_open_query = concatenate_query_for_solr(
            solr_seed_query='*',
            media_ids=form['sources'],
            tags_ids=form['collections'],
            custom_ids=form['searches'])
        results = apicache.normalized_and_story_split_count(solr_q, solr_open_query,
                                                            start_date, end_date)
    return jsonify({'results': results})
def api_explorer_combined_story_split_count_csv():
    """Stream a CSV combining per-date story counts for several queries side by side.

    Each input query contributes three columns (count, total_count, ratio) keyed
    by its label; rows are the dates of the first query's results (assumes every
    query covers the same set of dates).
    """
    filename = 'stories-over-time'
    form = request.form  # keep the form separate from the output rows (no shadowing)
    queries = json.loads(form['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        start_date, end_date = parse_query_dates(q)
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            provider = RedditPushshiftProvider()
            story_counts = provider.normalized_count_over_time(
                query=q['q'],
                start_date=start_date,
                end_date=end_date,
                subreddits=NEWS_SUBREDDITS)
        else:
            solr_q, _solr_fq = parse_query_with_keywords(q)
            solr_open_query = concatenate_query_for_solr(
                solr_seed_query='*',
                media_ids=q['sources'],
                tags_ids=q['collections'],
                custom_ids=q['searches'])
            story_counts = apicache.normalized_and_story_split_count(
                solr_q, solr_open_query, start_date, end_date)
        story_count_results.append({
            'label': q['label'],
            'by_date': story_counts['counts'],
        })
    # now combine them by date
    rows = []
    for idx, date_info in enumerate(story_count_results[0]['by_date']):
        row = {'date': date_info['date']}
        for result in story_count_results:
            counts = result['by_date'][idx]
            row[result['label'] + '-count'] = counts['count']
            row[result['label'] + '-total_count'] = counts['total_count']
            row[result['label'] + '-ratio'] = counts['ratio']
        rows.append(row)
    props = ['date'] \
        + [q['label'] + '-count' for q in queries] \
        + [q['label'] + '-total_count' for q in queries] \
        + [q['label'] + '-ratio' for q in queries]
    return csv.stream_response(rows, props, filename)
def api_explorer_story_split_count_csv():
    """Stream a CSV of per-date story counts for a single query."""
    query = json.loads(request.form['q'])
    filename = file_name_for_download(query['label'], 'stories-over-time')
    # now compute total attention for all results
    start_date, end_date = parse_query_dates(query)
    reddit_only = (len(query['collections']) == 0) and only_queries_reddit(query['sources'])
    if reddit_only:
        story_counts = RedditPushshiftProvider().normalized_count_over_time(
            query=query['q'],
            start_date=start_date,
            end_date=end_date,
            subreddits=NEWS_SUBREDDITS)
    else:
        solr_q, _solr_fq = parse_query_with_keywords(query)
        solr_open_query = concatenate_query_for_solr(
            solr_seed_query='*',
            media_ids=query['sources'],
            tags_ids=query['collections'],
            custom_ids=query['searches'])
        story_counts = apicache.normalized_and_story_split_count(
            solr_q, solr_open_query, start_date, end_date)
    props = ['date', 'count', 'total_count', 'ratio']
    return csv.stream_response(story_counts['counts'], props, filename)