def parse_with_genderize(self, story):
    """Look up genders for a story's multi-part-name people via Genderize
    and merge the results onto those people, saving to the DB when enabled.

    Expects each entry of story['people'] to carry 'multi_part_name' and
    'first_name' keys (as produced by parse_with_cliff). Mutates the
    matching person dicts in place (people_with_gender aliases them).
    """
    people_with_first_names = [
        p for p in story['people'] if p['multi_part_name']
    ]
    first_names = [p['first_name'] for p in people_with_first_names]
    if first_names:
        results = fetch_genderize_results(first_names, story['place_iso_code'])
    else:
        # don't send it to genderize if there aren't any people
        results = []
    # merge data — genderize returns one result per submitted name, in order,
    # so walk the two lists in lockstep instead of indexing by position
    people_with_gender = people_with_first_names
    for person, result in zip(people_with_gender, results):
        # drop the echoed name; the person dict already has the name fields
        del result['name']
        person.update(result)
    # make a local connection to DB, because this is in its own thread
    collection = get_db_client()
    if SAVE_TO_DB:
        # write all the data to the DB
        collection.update_one(
            {'stories_id': story['stories_id']},
            {'$set': {
                'people_with_gender': people_with_gender
            }})
        logger.info('{} - Saved results to DB '.format(story['stories_id']))
    else:
        logger.info('{} - NOT SAVED'.format(story['stories_id']))
def parse_with_cliff(self, story):
    """Run CLIFF entity extraction on a story's text, split each detected
    person's name into parts, and save raw results + people to the DB
    (when SAVE_TO_DB is set).
    """
    cliff_results = {}
    if 'text' not in story:
        # fix: original logged the literal '{} - no text' (format never applied)
        logger.error('{} - no text'.format(story['stories_id']))
        return
    elif len(story['text']) == 0:
        # fix: same missing .format() on the warning message
        logger.warning('{} - no chars in text'.format(story['stories_id']))
        # OK to save the empty list of quotes here because we don't have any text in story
    else:
        cliff_results = fetch_cliff_results(story['text'])
    # make a local connection to DB, because this is in its own thread
    collection = get_db_client()
    # parse out people; guard for the empty-text case where cliff_results == {}
    # (original raised KeyError on cliff_results['results'] there, despite the
    # comment above saying the empty save is intended)
    raw_people = cliff_results.get('results', {}).get('people', [])
    people = copy.deepcopy(raw_people)
    for p in people:
        name_parts = p['name'].split()
        only_one_name = len(name_parts) == 1
        p['name_parts'] = name_parts
        # single-name people get first_name=None and their full name as last_name
        p['first_name'] = name_parts[0] if not only_one_name else None
        p['last_name'] = " ".join(
            name_parts[1:]) if not only_one_name else p['name']
        p['only_one_name'] = only_one_name
        p['multi_part_name'] = len(name_parts) > 1
    if SAVE_TO_DB:
        # write all the quotes to the DB
        collection.update_one(
            {'stories_id': story['stories_id']},
            {'$set': {
                'raw_cliff_results': cliff_results,
                'people': people
            }})
        logger.info('{} - Saved results to DB '.format(story['stories_id']))
    else:
        logger.info('{} - NOT SAVED'.format(story['stories_id']))
# Script: report how many stories have manually-coded single-name gender data
# applied, then (currently unreachable, see NOTE below) load the manual
# single-name gender lookup from a CSV.
import logging
import os
import csv
import copy  # NOTE(review): unused in the visible code — confirm before removing
import sys
from worker import get_db_client

# CSV of manually coded genders for people with only a single name
SINGLE_NAME_GENDERS_CSV = os.path.join('data', 'manually coded single names.csv')

logging.info(
    "Adding single name gender data (from {})".format(SINGLE_NAME_GENDERS_CSV))

# make a DB connection and report progress counts
db = get_db_client()
processed = db.count_documents({'people_one_name_genders': {'$exists': True}})
logging.info("  {} stories already processed".format(processed))
to_do = db.count_documents({'people_one_name_genders': {'$exists': False}})
logging.info("  {} more to check".format(to_do))

# NOTE(review): this exit makes everything below unreachable — presumably a
# deliberate stop while the counts above were being checked; confirm whether
# the lookup-loading code below should be re-enabled.
sys.exit()

# load custom gender lookup
accepted_genders = ['male', 'female']
# most ironic variable name ever?
custom_gender_data = csv.DictReader(
    open(SINGLE_NAME_GENDERS_CSV, 'r', encoding='utf-8-sig'))
# name -> 'male'/'female'; rows with any other gender value are skipped
gender_lookup = {
    r['name']: r['gender'].lower()
    for r in custom_gender_data if r['gender'].lower() in accepted_genders
}
logging.info("  {} total manually coded genders (male or female)".format(
    len(gender_lookup)))
# Script preamble: set up clients for writing reports. The theme-fetching
# logic is currently disabled (kept inside the bare triple-quoted string
# below, which is an inert expression statement at runtime).
import logging
import csv
from worker import get_mc_client, get_db_client, get_genderize_client, places, themes_tag_ids
from worker.cache import cache

logging.info("Writing reports")

# NOTE(review): collection and db are both get_db_client() results — looks
# like a duplicate; confirm whether both names are used later in the file.
collection = get_db_client()
genderize = get_genderize_client()
db = get_db_client()

# fetch the top themes across all the stories for each source
'''
@cache.cache_on_arguments()
def top_theme_tags(q: str, fq: str):
    mc = get_mc_client()
    return mc.storyTagCount(q, fq, tag_sets_id=TAG_SET_NYT_THEMES)


top_themes = {}
mc = get_mc_client()
TAG_SET_NYT_THEMES = 1963  # the tag set the top 600 labels from our NYT-corpus-trained model
place2themes = {}
for p in places:
    place_top_themes = top_theme_tags(p['query'], p['date_query'])[:20]
    place_top_theme_ids = [int(t['tags_id']) for t in place_top_themes]
    p['top_themes'] = place_top_themes
    place2themes[p['name']] = place_top_theme_ids
'''

# running total of stories processed by the report loop (presumably
# incremented further down the file — not visible here)
total_stories = 0