# Build the keyword query for this person, record the combined queries, and
# (optionally) gather this person's story counts.

# Pick the keyword query: a hand-tuned override wins, then first + last name
# as two quoted terms, then the full name as a single quoted phrase.
name_key = person['full_name']
if name_key in custom_query_keywords:
    log.info(" adjustment: %s -> %s" % (name_key, custom_query_keywords[name_key]))
    query = custom_query_keywords[name_key]
elif 'first_name' in person and 'last_name' in person:
    query = '"{0}" AND "{1}"'.format(person['first_name'], person['last_name'])
else:
    query = '"{0}"'.format(person['full_name'])

# Filter variants that have been tried; (b) is the one in use.
# a) limit query to correct date range only
#    query_filter = build_mpv_daterange(row)
# b) also limit query to us media sources (msm, regional, partisan sets)
date_range_query = build_mpv_daterange(person['date_of_death'])
query_filter = "( " + date_range_query + " AND " + media_filter_query + " )"
# c) also limit query to non-spidered us media sources (msm, regional, partisan sets)
#    query_filter = build_mpv_daterange(row) + " AND (tags_id_media:(8875027 2453107 129 8878292 8878293 8878294)) " + " AND NOT (tags_id_stories:8875452) "

# Keep the per-person queries, with and without the keyword part.
queries.append("(" + query + " AND " + date_range_query + ")")
no_keyword_queries.append("(" + date_range_query + ")")

if WRITE_STORY_COUNT_CSVS:
    data = {}
    data['full_name'] = name_key
    data['date_of_death'] = person['date_of_death']
    data['total_stories'] = count_stories('*', query_filter)
    data['stories_about_person'] = count_stories(query, query_filter)
    # Share of all stories matching the filter that mention this person.
    # Guard the division: a filter that matches no stories at all would
    # otherwise raise ZeroDivisionError.
    total_story_count = float(data['total_stories'])
    if total_story_count:
        normalized_story_count = float(data['stories_about_person']) / total_story_count
    else:
        log.info(" no stories matched the filter for %s" % name_key)
        normalized_story_count = 0.0
# Debugging aid, currently disabled: process a single person only.
# if person['full_name']!="Akai Gurley":
#     continue

# Assemble the in-controversy query for stories about this person: the topic
# clause first, followed by the clause that identifies the person.
topic_clause = "{~ topic:" + CONTROVERSY_ID + "}"
name_key = person["full_name"]
if name_key in custom_query_keywords:
    # a custom keyword query replaces the name-based clause
    custom_keywords = custom_query_keywords[name_key]
    log.info(" adjustment: %s -> %s" % (name_key, custom_keywords))
    person_clause = " AND " + custom_keywords
elif all(field in person.keys() for field in ("first_name", "last_name")):
    person_clause = ' AND "{0}" AND "{1}"'.format(person["first_name"], person["last_name"])
else:
    person_clause = ' AND "{0}"'.format(name_key)
query = topic_clause + person_clause
query_filter = build_mpv_daterange(person["date_of_death"])

# Fetch the stories, timing the call and adding it to the running total.
query_start = time.time()
stories = fetch_all_stories(query, query_filter)
query_duration = float(time.time() - query_start)
time_spent_querying += query_duration

# Start the queue timer and reset the de-duplication bookkeeping.
queue_start = time.time()
urls_already_done = []  # unique urls collected so far, used for de-duping
duplicate_stories = 0
story_total = len(stories)
log.info(" found %d stories" % story_total)
for story in stories:
    # seed the base url with the raw url so results from MC can be de-duplicated
    story["base_url"] = story["url"]
# continue

# Assemble the in-controversy query for stories about this person: the topic
# clause first, followed by the clause that identifies the person.
topic_clause = "{~ topic:" + CONTROVERSY_ID + "}"
name_key = person['full_name']
if name_key in custom_query_keywords:
    # a custom keyword query replaces the name-based clause
    custom_keywords = custom_query_keywords[name_key]
    log.info(" adjustment: %s -> %s" % (name_key, custom_keywords))
    person_clause = " AND " + custom_keywords
elif all(field in person.keys() for field in ('first_name', 'last_name')):
    person_clause = ' AND "{0}" AND "{1}"'.format(person['first_name'], person['last_name'])
else:
    person_clause = ' AND "{0}"'.format(name_key)
query = topic_clause + person_clause
query_filter = build_mpv_daterange(person['date_of_death'])

# Fetch the stories, timing the call and adding it to the running total.
query_start = time.time()
stories = fetch_all_stories(query, query_filter)
query_duration = float(time.time() - query_start)
time_spent_querying += query_duration

# Start the queue timer and reset the de-duplication bookkeeping.
queue_start = time.time()
urls_already_done = []  # unique urls collected so far, used for de-duping
duplicate_stories = 0
story_total = len(stories)
log.info(" found %d stories" % story_total)
for story in stories:
    # seed the base url with the raw url so results from MC can be de-duplicated
    story['base_url'] = story['url']
# For every person: build the keyword query, record the combined queries, and
# (optionally) gather that person's story counts.
for person in data:
    log.info(" Working on %s" % person['full_name'])

    # Pick the keyword query: a hand-tuned override wins, then first + last
    # name as two quoted terms, then the full name as a single quoted phrase.
    name_key = person['full_name']
    if name_key in custom_query_keywords:
        log.info(" adjustment: %s -> %s" % (name_key, custom_query_keywords[name_key]))
        query = custom_query_keywords[name_key]
    elif 'first_name' in person and 'last_name' in person:
        query = '"{0}" AND "{1}"'.format(person['first_name'], person['last_name'])
    else:
        query = '"{0}"'.format(person['full_name'])

    # Filter variants that have been tried; (b) is the one in use.
    # a) limit query to correct date range only
    #    query_filter = build_mpv_daterange(row)
    # b) also limit query to us media sources (msm, regional, partisan sets)
    date_range_query = build_mpv_daterange(person['date_of_death'])
    query_filter = "( " + date_range_query + " AND " + media_filter_query + " )"
    # c) also limit query to non-spidered us media sources (msm, regional, partisan sets)
    #    query_filter = build_mpv_daterange(row) + " AND (tags_id_media:(8875027 2453107 129 8878292 8878293 8878294)) " + " AND NOT (tags_id_stories:8875452) "

    # Keep the per-person queries, with and without the keyword part.
    queries.append("(" + query + " AND " + date_range_query + ")")
    no_keyword_queries.append("(" + date_range_query + ")")

    if WRITE_STORY_COUNT_CSVS:
        # NOTE(review): this rebinds `data`, the name of the collection being
        # iterated. The running loop is unaffected (its iterator was created
        # from the original object), but anything that reads `data` after this
        # point sees the per-person dict. Left as-is because code outside this
        # excerpt may rely on the name -- confirm and rename if possible.
        data = {}
        data['full_name'] = name_key
        data['date_of_death'] = person['date_of_death']
        data['total_stories'] = count_stories('*', query_filter)
        data['stories_about_person'] = count_stories(query, query_filter)
        # Share of all stories matching the filter that mention this person.
        # Guard the division: a filter that matches no stories at all would
        # otherwise raise ZeroDivisionError.
        total_story_count = float(data['total_stories'])
        if total_story_count:
            normalized_story_count = float(data['stories_about_person']) / total_story_count
        else:
            log.info(" no stories matched the filter for %s" % name_key)
            normalized_story_count = 0.0
        data['normalized_stories_about_person'] = "{0:.15f}".format(normalized_story_count)