class BuildingMatchEvaluator: def __init__(self, start_date, end_date, db, buildings_collection_name='buildings'): self.start_date = start_date self.end_date = end_date self.db = db self.buildings_collection = db[buildings_collection_name] # The building objects at the start of this run. These will not yet # have the new alias information given by user self.current_buildings = list(self.buildings_collection.find()) self.scraper = EventScraper(self.current_buildings, building_callback=self.cl_user_update_aliases) def run_evaluator(self): events = self.scraper.get_events_for_dates(self.start_date, self.end_date) print('Evaluator done!') def cl_user_update_aliases(self, full_location_str, closest_match_str, closest_match_score): formal_building_names = [b['name'] for b in self.current_buildings] print('Scraped location name: %s' % full_location_str) print('Matched to: %s, with score: %i' % (closest_match_str, closest_match_score)) building_was_correct = input("Was this correct? If so, type 'y' ") if building_was_correct != 'y': correct_building = input('Please type the correct building: ') # Be annoying: keep asking until they enter a correct name while correct_building not in formal_building_names: print('Must be one of:') print(formal_building_names) correct_building = input('Please type the correct building: ') new_alias = input("New alias (or '' to not add an alias): ") # (nj) This is bad -- if we change how we do string matching, this # won't be accurate. Should really pass an evaluation function to the # scraper. alias_score = fuzzywuzzy.fuzz.partial_ratio(full_location_str, new_alias) if new_alias == '': print('No alias added. Good.') else: # building_document = self.buildings_collection.find_one({'name' : correct_building}) updated_aliases = building_document['aliases'] + [new_alias] self.buildings_collection.update( {'name' : correct_building}, {'$set' : {'aliases' : updated_aliases}} ) print("Added alias '%s' for '%s' with score %i" % (new_alias, correct_building, alias_score)) print('-'*30)
def update_db_for_dates(start_date, end_date, db, collection_name='events', buildings_collection='buildings'): ''' Insert all events scraped from the website that take place between <start_date> and <end_date> Both parameters should be datetime.date objects ''' buildings = list(db[buildings_collection].find()) scraper = EventScraper(buildings) event_dicts = scraper.get_events_for_dates(start_date, end_date) collection = db[collection_name] inserted_ids = [collection.update(e, e, upsert = True) for e in event_dicts]
def update_db_for_dates(start_date, end_date, db, collection_name='events', buildings_collection='buildings'): ''' Insert all events scraped from the website that take place between <start_date> and <end_date> Both parameters should be datetime.date objects ''' buildings = list(db[buildings_collection].find()) scraper = EventScraper(buildings) event_dicts = scraper.get_events_for_dates(start_date, end_date) collection = db[collection_name] inserted_ids = [collection.update(e, e, upsert=True) for e in event_dicts]
class BuildingMatchEvaluator: def __init__(self, start_date, end_date, db, buildings_collection_name='buildings'): self.start_date = start_date self.end_date = end_date self.db = db self.buildings_collection = db[buildings_collection_name] # The building objects at the start of this run. These will not yet # have the new alias information given by user self.current_buildings = list(self.buildings_collection.find()) self.scraper = EventScraper( self.current_buildings, building_callback=self.cl_user_update_aliases) def run_evaluator(self): events = self.scraper.get_events_for_dates(self.start_date, self.end_date) print('Evaluator done!') def cl_user_update_aliases(self, full_location_str, closest_match_str, closest_match_score): formal_building_names = [b['name'] for b in self.current_buildings] print('Scraped location name: %s' % full_location_str) print('Matched to: %s, with score: %i' % (closest_match_str, closest_match_score)) building_was_correct = input("Was this correct? If so, type 'y' ") if building_was_correct != 'y': correct_building = input('Please type the correct building: ') # Be annoying: keep asking until they enter a correct name while correct_building not in formal_building_names: print('Must be one of:') print(formal_building_names) correct_building = input('Please type the correct building: ') new_alias = input("New alias (or '' to not add an alias): ") # (nj) This is bad -- if we change how we do string matching, this # won't be accurate. Should really pass an evaluation function to the # scraper. alias_score = fuzzywuzzy.fuzz.partial_ratio( full_location_str, new_alias) if new_alias == '': print('No alias added. Good.') else: # building_document = self.buildings_collection.find_one( {'name': correct_building}) updated_aliases = building_document['aliases'] + [new_alias] self.buildings_collection.update( {'name': correct_building}, {'$set': { 'aliases': updated_aliases }}) print("Added alias '%s' for '%s' with score %i" % (new_alias, correct_building, alias_score)) print('-' * 30)