def topic_words(topics_id):
    """Return sampled top words for a topic.

    Response JSON: 'list' (top words, trimmed for the UI), 'totals' (overall-
    timespan words for comparison, only when withTotals=true), 'sample_size'.
    Public topics are served with the tool API key; otherwise the user must be
    logged in.
    """
    # idiom fix: MultiDict.get() with a default instead of an 'in' check + ternary
    sample_size = request.args.get('sample_size', WORD_COUNT_SAMPLE_SIZE)
    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY, topics_id, sample_size=sample_size,
                                             snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if is_user_logged_in() and (request.args.get('withTotals') == "true"):
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size,
                                            timespans_id=overall_timespan['timespans_id'],
                                            foci_id=None, q=None)
    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size),
    }
    return jsonify(response)
def topic_words(topics_id):
    """Return top words for a topic as JSON ('list'), plus whole-topic 'totals'
    when the caller asks for them with withTotals=true.

    Public topics are served via the tool API key; otherwise the user must be
    logged in.
    """
    if access_public_topic(topics_id):
        word_counts = topic_word_counts(TOOL_API_KEY, topics_id,
                                        snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        word_counts = topic_word_counts(user_mediacloud_key(), topics_id)[:200]
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    # important so that these get reset on the client when they aren't requested
    totals = []
    logger.info(request.args)
    wants_totals = (is_user_logged_in()
                    and ('withTotals' in request.args)
                    and (request.args['withTotals'] == "true"))
    if wants_totals:
        # handle requests to return these results
        # and also data to compare it to for the whole topic focus
        totals = topic_word_counts(user_mediacloud_key(), topics_id, timespans_id=None, q=None)
    return jsonify({'list': word_counts, 'totals': totals})
def topic_compare_subtopic_top_words(topics_id):
    """Stream a CSV comparing the top words across every focus (subtopic) in a
    focal set, aligned to the currently selected timespan.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    # BUG FIX: query args are strings; without int() the range() below raises
    # TypeError whenever word_count is actually supplied
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                     snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
            break  # found the match; no need to scan the rest
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id,
                                         selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    # attach the top words for each focus' matching timespan
    for idx in range(len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))
def topic_w2v_timespan_embeddings(topics_id):
    """Return word2vec embedding data for each timespan in a topic, anchored
    to the embeddings of the topic's overall top words.
    """
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }
    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(), topics_id, num_words=50, **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y'])
                          for x in overall_word_counts}
    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                           args['snapshots_id'], args['foci_id'])
    # Retrieve embeddings for each timespan
    func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id,
                   args, overall_words, overall_embeddings)
    # FIX: use the pool as a context manager so worker processes are always
    # terminated (the original never called close()/join(), leaking processes)
    with Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES) as p:
        ts_embeddings = p.map(func, timespans)
    return jsonify({'list': ts_embeddings})
def topic_compare_subtopic_top_words(topics_id):
    """Stream a CSV comparing the top words across every focus (subtopic) in a
    focal set, aligned to the currently selected timespan.

    Returns a JSON error response if the focal set id is invalid.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    # BUG FIX: query args are strings; without int() the range() below raises
    # TypeError whenever word_count is actually supplied
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                     snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
            break  # found the match; no need to scan the rest
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id,
                                             selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    # attach the top words for each focus' matching timespan
    for idx in range(len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))
def topic_w2v_timespan_embeddings(topics_id):
    """Return word2vec embedding data for each timespan in a topic, anchored
    to the embeddings of the topic's overall top words.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # Retrieve embeddings for overall topic
    overall_word_counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, num_words=50,
                                                     snapshots_id=snapshots_id, timespans_id=None,
                                                     foci_id=foci_id, q=q)
    overall_words = [wc['term'] for wc in overall_word_counts]
    overall_embeddings = {}
    for wc in overall_word_counts:
        overall_embeddings[wc['term']] = (wc['google_w2v_x'], wc['google_w2v_y'])
    # Retrieve top words for each timespan
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                    snapshots_id, foci_id)
    # Build one job description per timespan, then fetch embeddings for each
    jobs = []
    for timespan in timespans:
        jobs.append({
            'api_key': user_mediacloud_key(),
            'topics_id': topics_id,
            'snapshots_id': snapshots_id,
            'foci_id': foci_id,
            'overall_words': overall_words,
            'overall_embeddings': overall_embeddings,
            'q': q,
            'timespan': timespan,
        })
    embeddings_by_timespan = _get_all_timespan_embeddings(jobs)
    return jsonify({'list': embeddings_by_timespan})
def topic_words(topics_id):
    """Return sampled top words for a topic.

    Response JSON: 'list' (top words, trimmed for the UI), 'totals' (overall-
    timespan words for comparison, only when withTotals=true), 'sample_size'.
    Public topics are served with the tool API key; otherwise the user must be
    logged in.
    """
    # idiom fix: MultiDict.get() with a default instead of an 'in' check + ternary
    sample_size = request.args.get('sample_size', WORD_COUNT_SAMPLE_SIZE)
    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY, topics_id, sample_size=sample_size,
                                             snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if is_user_logged_in() and (request.args.get('withTotals') == "true"):
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size,
                                            timespans_id=overall_timespan['timespans_id'],
                                            foci_id=None, q=None)
    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size),
    }
    return jsonify(response)
def topic_w2v_timespan_embeddings(topics_id):
    """Return word2vec embedding data for each timespan in a topic, anchored
    to the embeddings of the topic's overall top words.
    """
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }
    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(), topics_id, num_words=50, **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y'])
                          for x in overall_word_counts}
    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                           args['snapshots_id'], args['foci_id'])
    # Retrieve embeddings for each timespan
    func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id,
                   args, overall_words, overall_embeddings)
    # FIX: use the pool as a context manager so worker processes are always
    # terminated (the original never called close()/join(), leaking processes)
    with Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES) as p:
        ts_embeddings = p.map(func, timespans)
    return jsonify({'list': ts_embeddings})
def media_words(topics_id, media_id):
    """Return the top 100 sampled words for one medium within a topic as JSON."""
    # narrow the user's current query down to just this medium
    media_query = apicache.add_to_user_query('media_id:' + media_id)
    counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=media_query)
    return jsonify(counts[:100])
def story_words(topics_id, stories_id):
    """Return the top 100 sampled words for a single story within a topic as JSON."""
    story_query = 'stories_id:' + stories_id
    counts = topic_word_counts(user_mediacloud_key(), topics_id, q=story_query)
    return jsonify(counts[:100])
def topic_words_csv(topics_id):
    """Stream the topic's sampled word counts as a CSV download."""
    word_counts = topic_word_counts(user_mediacloud_key(), topics_id)
    columns = ['term', 'stem', 'count']
    return csv.stream_response(word_counts, columns, 'sampled-words')
def media_words_csv(topics_id, media_id):
    """Stream one medium's sampled word counts within a topic as a CSV download."""
    # narrow the user's current query down to just this medium
    media_query = add_to_user_query('media_id:' + media_id)
    word_counts = topic_word_counts(user_mediacloud_key(), topics_id, q=media_query)
    columns = ['term', 'stem', 'count']
    filename = 'media-' + str(media_id) + '-words'
    return csv.stream_response(word_counts, columns, filename)
def topic_word(topics_id, word):
    """Return the single top word-count entry matching `word` in a topic as JSON."""
    matches = topic_word_counts(user_mediacloud_key(), topics_id, q=word)
    top_match = matches[:1]
    logger.info(top_match)
    return jsonify(top_match)
def topic_word_associated_words_csv(topics_id, word):
    """Stream the sampled words associated with `word` in a topic as a CSV download."""
    word_counts = topic_word_counts(user_mediacloud_key(), topics_id, q=word)
    columns = ['term', 'stem', 'count']
    filename = 'word-' + word + '-sampled-words'
    return csv.stream_response(word_counts, columns, filename)
def topic_word_associated_words(topics_id, word):
    """Return the top 100 words associated with `word` in a topic as JSON."""
    word_counts = topic_word_counts(user_mediacloud_key(), topics_id, q=word)
    return jsonify(word_counts[:100])
def topic_word_associated_words(topics_id, word):
    """Return the top 100 words associated with `word`, respecting the user's filters, as JSON."""
    # fold the word into the user's current query
    combined_query = apicache.add_to_user_query(word)
    word_counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=combined_query)
    return jsonify(word_counts[:100])
def media_words(topics_id, media_id):
    """Return the top 100 sampled words for one medium within a topic as JSON."""
    # narrow the user's current query down to just this medium
    media_query = apicache.add_to_user_query('media_id:' + media_id)
    counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=media_query)
    return jsonify(counts[:100])
def story_words_csv(topics_id, stories_id):
    """Stream one story's sampled word counts within a topic as a CSV download."""
    story_query = 'stories_id:' + stories_id
    word_counts = topic_word_counts(user_mediacloud_key(), topics_id, q=story_query)
    columns = ['term', 'stem', 'count']
    filename = 'story-' + str(stories_id) + '-words'
    return csv.stream_response(word_counts, columns, filename)
def topic_word(topics_id, word):
    """Return the single top word-count entry matching `word` in a topic as JSON."""
    matches = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=word)
    return jsonify(matches[:1])
def topic_word_associated_words(topics_id, word):
    """Return the top 100 words associated with `word`, respecting the user's filters, as JSON."""
    # fold the word into the user's current query
    combined_query = apicache.add_to_user_query(word)
    word_counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=combined_query)
    return jsonify(word_counts[:100])
def story_words(topics_id, stories_id):
    """Return the top 100 sampled words for a single story within a topic as JSON."""
    story_query = 'stories_id:' + stories_id
    counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=story_query)
    return jsonify(counts[:100])
def topic_word(topics_id, word):
    """Return the single top word-count entry matching `word` in a topic as JSON."""
    matches = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=word)
    return jsonify(matches[:1])