Example #1
0
def parse_with_genderize(self, story):
    """Attach genderize results to every multi-part-named person in a story.

    People in ``story['people']`` whose ``multi_part_name`` flag is truthy have
    their ``first_name`` sent to ``fetch_genderize_results`` along with the
    story's ``place_iso_code``. Each returned result (minus its ``name`` key)
    is merged into the matching person dict in place, and the merged list is
    stored under ``people_with_gender`` on the story's DB document when
    ``SAVE_TO_DB`` is truthy.

    Args:
        self: unused by this function body.
        story: mapping that provides ``people``, ``place_iso_code`` and
            ``stories_id``.

    Returns:
        None.
    """
    named_people = [
        person for person in story['people'] if person['multi_part_name']
    ]
    names_to_check = [person['first_name'] for person in named_people]

    # don't send it to genderize if there aren't any people
    results = []
    if len(names_to_check) > 0:
        results = fetch_genderize_results(names_to_check,
                                          story['place_iso_code'])

    # merge data: results are matched to people by position
    for position, person in enumerate(named_people):
        guess = results[position]
        del guess['name']
        person.update(guess)
    people_with_gender = named_people

    # make a local connection to DB, because this is in its own thread
    collection = get_db_client()
    if not SAVE_TO_DB:
        logger.info('{} - NOT SAVED'.format(story['stories_id']))
        return

    # write all the data to the DB
    collection.update_one(
        {'stories_id': story['stories_id']},
        {'$set': {'people_with_gender': people_with_gender}},
    )
    logger.info('{} - Saved results to DB '.format(story['stories_id']))
Example #2
0
def parse_with_cliff(self, story):
    """Run CLIFF on a story's text and store the people it finds.

    The raw CLIFF response and a per-person breakdown of each name
    (``name_parts``, ``first_name``, ``last_name``, ``only_one_name``,
    ``multi_part_name``) are written to the story's DB document when
    ``SAVE_TO_DB`` is truthy.

    A story without a ``text`` key is logged as an error and skipped. A story
    with empty text is logged as a warning and saved with an empty raw result
    and an empty people list.

    Args:
        self: unused by this function body.
        story: mapping that provides ``stories_id`` and, normally, ``text``.

    Returns:
        None.
    """
    cliff_results = {}
    if 'text' not in story:
        # .get() so that reporting the problem cannot itself raise
        logger.error('{} - no text'.format(story.get('stories_id')))
        return
    elif len(story['text']) == 0:
        logger.warning('{} - no chars in text'.format(story.get('stories_id')))
        # OK to save the empty list of people here because we don't have any text in story
    else:
        cliff_results = fetch_cliff_results(story['text'])
    # make a local connection to DB, because this is in its own thread
    collection = get_db_client()
    # parse out people; only the "no text" case is defaulted to an empty list,
    # so a malformed CLIFF response still fails loudly
    if cliff_results:
        people = copy.deepcopy(cliff_results['results']['people'])
    else:
        people = []
    for p in people:
        name_parts = p['name'].split()
        multi_part_name = len(name_parts) > 1
        only_one_name = len(name_parts) == 1
        p['name_parts'] = name_parts
        # an empty/whitespace-only name has no parts: treat it like a name
        # with no separate first name instead of indexing into an empty list
        p['first_name'] = name_parts[0] if multi_part_name else None
        p['last_name'] = " ".join(
            name_parts[1:]) if multi_part_name else p['name']
        p['only_one_name'] = only_one_name
        p['multi_part_name'] = multi_part_name
    if SAVE_TO_DB:  # write all the results to the DB
        collection.update_one(
            {'stories_id': story['stories_id']},
            {'$set': {
                'raw_cliff_results': cliff_results,
                'people': people
            }})
        logger.info('{} - Saved results to DB '.format(story['stories_id']))
    else:
        logger.info('{} - NOT SAVED'.format(story['stories_id']))
import logging
import os
import csv
import copy
import sys

from worker import get_db_client

# Path (relative to the working directory) of the hand-coded name -> gender CSV.
SINGLE_NAME_GENDERS_CSV = os.path.join('data',
                                       'manually coded single names.csv')

logging.info(
    "Adding single name gender data (from {})".format(SINGLE_NAME_GENDERS_CSV))

db = get_db_client()

# Report how many documents already carry 'people_one_name_genders' and how
# many do not.
processed = db.count_documents({'people_one_name_genders': {'$exists': True}})
logging.info("  {} stories already processed".format(processed))
to_do = db.count_documents({'people_one_name_genders': {'$exists': False}})
logging.info("  {} more to check".format(to_do))
# NOTE(review): this unconditional exit stops the script right after the counts
# are reported, so nothing below ever runs. Kept as-is; confirm whether it is a
# deliberate "report only" switch or leftover debugging.
sys.exit()
# load custom gender lookup
accepted_genders = ['male', 'female']  # most ironic variable name ever?
# Use a context manager so the file handle is closed once the rows are read;
# newline='' is what the csv module expects for files it parses.
with open(SINGLE_NAME_GENDERS_CSV, 'r', encoding='utf-8-sig',
          newline='') as csv_file:
    custom_gender_data = csv.DictReader(csv_file)
    # DictReader is lazy, so the lookup must be built while the file is open.
    gender_lookup = {
        r['name']: r['gender'].lower()
        for r in custom_gender_data if r['gender'].lower() in accepted_genders
    }
logging.info("  {} total manually coded genders (male or female)".format(
    len(gender_lookup)))
Example #4
0
import logging
import csv

from worker import get_mc_client, get_db_client, get_genderize_client, places, themes_tag_ids
from worker.cache import cache

# Script preamble: announce the run and obtain the clients used below.
logging.info("Writing reports")

# NOTE(review): `collection` and `db` are both bound to the result of
# get_db_client(); whether they refer to the same object depends on that
# helper, which is not visible here.
collection = get_db_client()

genderize = get_genderize_client()
db = get_db_client()

# fetch the top themes across all the stories for each source
# NOTE: the block below is wrapped in a bare triple-quoted string, so it is
# evaluated as an unused string literal and none of it executes.
'''
@cache.cache_on_arguments()
def top_theme_tags(q: str, fq: str):
    mc = get_mc_client()
    return mc.storyTagCount(q, fq, tag_sets_id=TAG_SET_NYT_THEMES)
top_themes = {}
mc = get_mc_client()
TAG_SET_NYT_THEMES = 1963  # the tag set the top 600 labels from our NYT-corpus-trained model
place2themes = {}
for p in places:
    place_top_themes = top_theme_tags(p['query'], p['date_query'])[:20]
    place_top_theme_ids = [int(t['tags_id']) for t in place_top_themes]
    p['top_themes'] = place_top_themes
    place2themes[p['name']] = place_top_theme_ids
'''

# Counter initialised to zero; where it is updated is not visible in this chunk.
total_stories = 0