def update_contexts(incremental=True):
    """Compute location-grouped contexts for stories and save them.

    Builds name lookups from the ``persons`` and ``companies`` collections,
    then, for every matching story, gathers the contexts returned by
    ``find_person_contexts`` and ``find_company_contexts`` for each of the
    story's entities. The contexts are grouped by their ``'location'`` value
    and stored under ``story['contexts']`` as a list of
    ``{'location': ..., 'entities': [...]}`` dicts before the story is saved.

    Args:
        incremental: When true, only stories that do not yet have a
            ``'contexts'`` field are processed; otherwise every story that
            has an ``'entities'`` field is reprocessed.
    """
    def location_of(context):
        # Shared by the sort and the groupby so both use the same key.
        return context['location']

    # Person names map to themselves.
    localangle_person_names = {
        person['name']: person['name'] for person in _db.persons.find()
    }
    # Company lookup is keyed by the cleaned name and maps back to the stored
    # name; being a defaultdict(str), a missing key yields ''.
    localangle_company_names = collections.defaultdict(
        str,
        [
            (clean_company_name(company['name']), company['name'])
            for company in _db.companies.find()
        ],
    )

    story_criteria = {'entities': {'$exists': True}}
    if incremental:
        story_criteria['contexts'] = {'$exists': False}

    for story in _db.stories.find(story_criteria):
        contexts = []
        for entity in story['entities']:
            contexts += find_person_contexts(entity, localangle_person_names)
            contexts += find_company_contexts(entity, localangle_company_names)

        # Collapse by location. groupby only merges adjacent items, so the
        # contexts are sorted by the same key first.
        # NOTE(review): this requires location values to be orderable; if
        # they are dicts, that only works on Python 2 -- confirm before
        # porting.
        contexts.sort(key=location_of)
        story['contexts'] = [
            {
                'location': location,
                # Materialised as a real list (not a lazy map object) so the
                # stored value is the same on Python 2 and Python 3.
                'entities': [context['entity'] for context in location_contexts],
            }
            for location, location_contexts
            in itertools.groupby(contexts, key=location_of)
        ]
        _db.stories.save(story)
def transform_headlines_blurbs(incremental=True):
    """Rewrite story headlines, blurbs and entity instances with context markup.

    For every matching story and each of its contexts, the context's
    ``'headline'`` and ``'blurb'`` are reset to ``None`` and then filled by
    running ``search_and_replace_text`` over the story's
    ``'titleNoFormatting'`` and ``'content'`` for each ``Person`` or
    ``Company`` entity in the context. The last entity that produces a truthy
    result wins. Each context entity also receives an ``'instances'`` list
    copied from the story entity whose ``'text'`` equals the entity's name,
    with the same replacement applied to every instance. The story is saved
    after all of its contexts are processed.

    Args:
        incremental: When true, only stories without a ``contexts.headline``
            field are processed; otherwise every story that has entities and
            a non-empty ``contexts`` list is reprocessed.
    """
    # NOTE(review): this client is never used below. It is kept because the
    # constructor's side effects are not visible here -- confirm and remove.
    alchemy = AlchemyAPI()
    PERSON_PATTERN = '<span class=\"context\">%s native %s</span>'
    COMPANY_PATTERN = '<span class=\"context\">%s-based %s</span>'

    story_criteria = {
        'entities': {'$exists': True},
        'contexts': {'$exists': True, '$ne': []},
    }
    if incremental:
        story_criteria['contexts.headline'] = {'$exists': False}

    for story in _db.stories.find(story_criteria):
        for context in story['contexts']:
            context['headline'] = None
            context['blurb'] = None

            # Fall back to the state when the city is empty.
            location = context['location']
            display_location = location['city'] or location['state']

            # Fill in the location now; the trailing '%s' is left in place
            # and passed on to search_and_replace_text as part of the
            # replacement argument.
            person_markup = PERSON_PATTERN % (display_location, '%s')
            company_markup = COMPANY_PATTERN % (display_location, '%s')

            for entity in context['entities']:
                entity_type = entity['type']
                name = entity['name']

                # Transform headlines
                if entity_type == 'Person':
                    markup = person_markup
                    # Headlines may mention a person by last name only.
                    headline_terms = [name, name.split()[-1]]
                elif entity_type == 'Company':
                    markup = company_markup
                    headline_terms = [name, clean_company_name(name, robust=True)]
                else:
                    markup = None
                    headline_terms = None

                # Reset per entity so that an entity of any other type can
                # neither raise NameError nor reuse the previous entity's
                # results.
                new_headline = None
                new_blurb = None
                if markup is not None:
                    new_headline = search_and_replace_text(
                        story['titleNoFormatting'], headline_terms, markup)
                    new_blurb = search_and_replace_text(
                        story['content'], name, markup)

                if new_headline:
                    logging.debug(new_headline)
                    context['headline'] = new_headline
                if new_blurb:
                    context['blurb'] = new_blurb

                # Transform "blurbs"
                matching_instances = (
                    e['instances'] for e in story['entities'] if e['text'] == name
                )
                # next(gen, default) works on Python 2.6+ and 3, and avoids an
                # unhandled StopIteration when no story entity matches.
                source_instances = next(matching_instances, None)
                if source_instances is None:
                    logging.warning('No story entity matches context entity %r', name)
                    source_instances = []

                instances = list(source_instances)
                if markup is not None:
                    # NOTE(review): whatever search_and_replace_text returns
                    # is stored as-is. If it returns a falsy value when
                    # nothing matches (the truthiness checks above suggest it
                    # might), that value replaces the instance text --
                    # confirm this is intended.
                    instances = [
                        search_and_replace_text(instance, name, markup)
                        for instance in instances
                    ]
                entity['instances'] = instances

        _db.stories.save(story)