def identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t):
    """Identify leftover locations by fuzzy-matching them against all_cities.

    For every country group, a name is built for each AllCities row of that
    country with geoalchemy_util.concatenate_location. Each location of the
    group is then compared against those names with
    geoalchemy_util.get_closest_match_leven. A location whose best match can
    be resolved back to an AllCities row is appended to
    identified_grouped_locations; all others are skipped.

    Args:
        unidentified_grouped_locations_enum: iterable of
            (index, (country, locations)) pairs. Each location is a dict
            that provides the keys "cleaned_location" and "raw_location".
        identified_grouped_locations: list extended in place with dicts
            holding "raw_location", "matching_location" and "grouping_id"
            (the matched row's "<latitude>|<longitude>").
        minimum_match_value: passed through unchanged to
            geoalchemy_util.get_closest_match_leven.
        t: unused; kept so that existing callers keep working.

    Returns:
        None. Results are reported through identified_grouped_locations.
    """
    # For each group of locations with the same country
    for _, (country, grouped_locations_list) in unidentified_grouped_locations_enum:
        # Get all cities that exist anywhere in that country
        city_rows = geo_data_session.query(
            AllCities.city, AllCities.region).filter_by(country=country)
        # Construct a name for each city that matches the normal cleaned
        # location format
        all_cities_in_country = [
            geoalchemy_util.concatenate_location(
                row.city,
                row.region if geoalchemy_util.region_is_a_state(row.region) else '',
                country)
            for row in city_rows
        ]
        # For each location found in this country, find its closest match
        # among the list of all cities from that country
        for grouped_location in grouped_locations_list:
            cleaned_location = grouped_location["cleaned_location"]
            closest_match = geoalchemy_util.get_closest_match_leven(
                cleaned_location, all_cities_in_country, minimum_match_value)
            # If no match was found or only the trivial match
            if closest_match == '' or closest_match == country:
                continue
            # Split the match back into its parts. The parts are kept in
            # their own names so the group's `country` (used by the trivial
            # match test above) is never overwritten.
            parts = closest_match.split(",")
            city = parts[0].strip()
            if len(parts) == 3:
                filters = dict(city=city,
                               region=parts[1].strip(),
                               country=parts[2].strip())
            else:
                filters = dict(city=city, country=parts[1].strip())
            matching_location = geo_data_session.query(
                AllCities).filter_by(**filters).first()
            if not matching_location:
                print('Warning: all_cities match attempt failed for %s location not found'
                      % (cleaned_location.encode('utf8'),))
                # Without a row there is no latitude/longitude to group by;
                # skip instead of dereferencing None below.
                continue
            # If we have a good match, add it to the list of matched locations
            grouping_id = u"{0}|{1}".format(matching_location.latitude,
                                            matching_location.longitude)
            raw_location = grouped_location["raw_location"]
            identified_grouped_locations.append({
                "raw_location": raw_location,
                "matching_location": matching_location,
                "grouping_id": grouping_id
            })
            print('all_cities found additional location for %s' % (raw_location,))
def identify_missing_locations(
    unidentified_grouped_locations_enum, identified_grouped_locations, minimum_match_value, t
):
    """Identify leftover locations by fuzzy-matching them against all_cities.

    For every country group, a name is built for each AllCities row of that
    country with geoalchemy_util.concatenate_location. Each location of the
    group is then compared against those names with
    geoalchemy_util.get_closest_match_leven. A location whose best match can
    be resolved back to an AllCities row is appended to
    identified_grouped_locations; all others are skipped.

    Args:
        unidentified_grouped_locations_enum: iterable of
            (index, (country, locations)) pairs. Each location is a dict
            that provides the keys "cleaned_location" and "raw_location".
        identified_grouped_locations: list extended in place with dicts
            holding "raw_location", "matching_location" and "grouping_id"
            (the matched row's "<latitude>|<longitude>").
        minimum_match_value: passed through unchanged to
            geoalchemy_util.get_closest_match_leven.
        t: unused; kept so that existing callers keep working.

    Returns:
        None. Results are reported through identified_grouped_locations.
    """
    # For each group of locations with the same country
    for _, (country, grouped_locations_list) in unidentified_grouped_locations_enum:
        # Get all cities that exist anywhere in that country
        city_rows = geo_data_session.query(AllCities.city, AllCities.region).filter_by(country=country)
        # Construct a name for each city that matches the normal cleaned location format
        all_cities_in_country = [
            geoalchemy_util.concatenate_location(
                row.city, row.region if geoalchemy_util.region_is_a_state(row.region) else "", country
            )
            for row in city_rows
        ]
        # For each location found in this country, find its closest match
        # among the list of all cities from that country
        for grouped_location in grouped_locations_list:
            cleaned_location = grouped_location["cleaned_location"]
            closest_match = geoalchemy_util.get_closest_match_leven(
                cleaned_location, all_cities_in_country, minimum_match_value
            )
            # If no match was found or only the trivial match
            if closest_match == "" or closest_match == country:
                continue
            # Split the match back into its parts. The parts are kept in
            # their own names so the group's `country` (used by the trivial
            # match test above) is never overwritten.
            parts = closest_match.split(",")
            city = parts[0].strip()
            if len(parts) == 3:
                filters = dict(city=city, region=parts[1].strip(), country=parts[2].strip())
            else:
                filters = dict(city=city, country=parts[1].strip())
            matching_location = geo_data_session.query(AllCities).filter_by(**filters).first()
            if not matching_location:
                print(
                    "Warning: all_cities match attempt failed for %s location not found"
                    % (cleaned_location.encode("utf8"),)
                )
                # Without a row there is no latitude/longitude to group by;
                # skip instead of dereferencing None below.
                continue
            # If we have a good match, add it to the list of matched locations
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
            raw_location = grouped_location["raw_location"]
            identified_grouped_locations.append(
                {"raw_location": raw_location, "matching_location": matching_location, "grouping_id": grouping_id}
            )
            print("all_cities found additional location for %s" % (raw_location,))
def main(limit=None, offset=0):
    """Group the raw parsed locations by their geocoded match and match them.

    Every alchemy.RawLocation row (restricted by limit/offset) is cleaned and
    looked up in the raw_google data by its lower-cased input address.
    Locations are then sorted and grouped by a "<latitude>|<longitude>" id
    (or "nolocationfound") and handed to match_grouped_locations, after
    which the alchemy session is committed.

    Args:
        limit: maximum number of raw locations to read (None for no limit).
        offset: number of raw locations to skip.

    Returns:
        False when there are no raw locations to process, otherwise None.
    """
    t = datetime.datetime.now()
    print("geocoding started %s" % (t,))
    # Construct a list of all addresses which Google was capable of identifying
    # Making this now allows it to be referenced quickly later
    construct_valid_input_address_list(force_lowercase=True)
    # Get all of the raw locations in alchemy.db that were parsed from XML
    raw_parsed_locations = alchemy_session.query(
        alchemy.RawLocation).limit(limit).offset(offset)
    # Count once; every count() call is a separate database query
    raw_parsed_locations_count = raw_parsed_locations.count()
    # If there are no locations, there is no point in continuing
    if raw_parsed_locations_count == 0:
        return False
    print('Constructed list of all parsed locations containing %s items'
          % (raw_parsed_locations_count,))
    # grouped_locations will contain a list of dicts.
    # Each dict will contain three values:
    #   raw_location = Location object containing the original location
    #                  found in the XML
    #   matching_location = RawGoogle object containing the disambiguated
    #                       location
    #   grouping_id = ID constructed from the latitude and longitude of the
    #                 matching_location
    grouped_locations = []
    for instance in raw_parsed_locations:
        # Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(
            instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(
            parsed_raw_location)
        # If the cleaned location has a match in the raw_google database,
        # we use that to classify it
        matching_location = None
        if input_address_exists(cleaned_location, force_lowercase=True):
            # Find the location from the raw_google database that matches
            # this input
            matching_location = geo_data_session.query(RawGoogle).filter(
                sqlalchemy.func.lower(RawGoogle.input_address) ==
                sqlalchemy.func.lower(cleaned_location)).first()
        if matching_location is not None:
            grouping_id = u"{0}|{1}".format(matching_location.latitude,
                                            matching_location.longitude)
        else:
            # If there is no match in the raw_google database (including the
            # case where the lookup itself returned no row), we leave the
            # location alone.
            # TODO: analyze the location's edit distance to make minor
            # adjustments to it such that it can be matched. Particularly
            # good if we can combine the all_cities database with the list
            # of valid input_address values in the raw_google database.
            print(cleaned_location)
            matching_location = RawGoogle(cleaned_location, '', '', '', '', '', -1)
            grouping_id = u"nolocationfound"
        grouped_locations.append({
            "raw_location": instance,
            "matching_location": matching_location,
            "grouping_id": grouping_id
        })
    print("grouped_locations created %s" % (datetime.datetime.now() - t,))
    t = datetime.datetime.now()
    # We now have a list of all locations in the file, along with their
    # matching locations and the id used to group them
    # Sort the list by the grouping_id (groupby needs sorted input)
    keyfunc = lambda x: x['grouping_id']
    grouped_locations.sort(key=keyfunc)
    # Create an iterator that will access everything in the list with the
    # same grouping_id
    grouped_locations_enum = enumerate(
        itertools.groupby(grouped_locations, keyfunc))
    print("grouped_locations sorted %s" % (datetime.datetime.now() - t,))
    t = datetime.datetime.now()
    # Match the locations
    match_grouped_locations(grouped_locations_enum, t)
    alchemy_session.commit()
    print("Matches made! %s" % (datetime.datetime.now() - t,))
    unique_group_count = alchemy_session.query(
        expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all()
    print("%s groups formed from %s locations" % (unique_group_count,
                                                  raw_parsed_locations_count))
def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'):
    """Group raw parsed locations by geocoded match and match each group.

    Raw locations of the given doctype are cleaned and looked up in the
    raw_google data. Locations without a match are grouped by country and
    passed to identify_missing_locations for a second attempt against
    all_cities. All identified locations are then grouped by their
    "<latitude>|<longitude>" id and handed to match_grouped_locations.

    Args:
        limit: maximum number of raw locations to read (None for no limit).
        offset: number of raw locations to skip.
        minimum_match_value: passed through to identify_missing_locations.
        doctype: 'grant' or 'application'; selects the session and the
            RawLocation/Location schema classes.

    Returns:
        False when there are no raw locations to process, otherwise None.

    Raises:
        ValueError: if doctype is neither 'grant' nor 'application'.
    """
    # Resolve the schema classes up front so an unsupported doctype fails
    # with a clear error before any session is opened.
    if doctype == 'grant':
        raw_location_model = alchemy.schema.RawLocation
        location_model = alchemy.schema.Location
    elif doctype == 'application':
        raw_location_model = alchemy.schema.App_RawLocation
        location_model = alchemy.schema.App_Location
    else:
        raise ValueError("unsupported doctype: %r" % (doctype,))

    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    # try/finally so the session is closed on the early return and on errors
    try:
        t = datetime.datetime.now()
        print("geocoding started %s %s" % (doctype, t))
        # Construct a list of all addresses which Google was capable of
        # identifying. Making this now allows it to be referenced quickly later
        valid_input_addresses = construct_valid_input_addresses()
        # Get all of the raw locations in alchemy.db that were parsed from XML
        raw_parsed_locations = alchemy_session.query(
            raw_location_model).limit(limit).offset(offset)
        raw_parsed_locations_count = raw_parsed_locations.count()
        # If there are no locations, there is no point in continuing
        if raw_parsed_locations_count == 0:
            return False
        print('Constructed list of all parsed locations containing %s items'
              % (raw_parsed_locations_count,))
        # identified_grouped_locations will contain a list of dicts.
        # Each dict will contain three values:
        #   raw_location = Location object containing the original location
        #                  found in the XML
        #   matching_location = RawGoogle object containing the
        #                       disambiguated location
        #   grouping_id = ID constructed from the latitude and longitude of
        #                 the matching_location
        # unidentified_grouped_locations holds dicts with raw_location,
        # cleaned_location and country for everything that was not matched.
        identified_grouped_locations = []
        unidentified_grouped_locations = []
        for instance in raw_parsed_locations:
            # Convert the location into a string that matches the Google format
            parsed_raw_location = geoalchemy_util.concatenate_location(
                instance.city, instance.state, instance.country)
            cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
            # If the cleaned location has a match in the raw_google database,
            # we use that to classify it
            matching_location = None
            if input_address_exists(valid_input_addresses, cleaned_location):
                matching_location = geo_data_session.query(RawGoogle).filter(
                    RawGoogle.input_address == cleaned_location).first()
                if not matching_location:
                    print('Cleaned location not matched %s' % (cleaned_location,))
            if matching_location:
                grouping_id = u"{0}|{1}".format(matching_location.latitude,
                                                matching_location.longitude)
                identified_grouped_locations.append({
                    "raw_location": instance,
                    "matching_location": matching_location,
                    "grouping_id": grouping_id})
            else:
                # No match in the raw_google database: remember the location
                # together with its country so it can be compared against
                # all_cities afterwards.
                country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
                unidentified_grouped_locations.append({
                    "raw_location": instance,
                    "cleaned_location": cleaned_location,
                    "country": country})
            # Progress report every 10000 processed locations
            processed = (len(identified_grouped_locations) +
                         len(unidentified_grouped_locations))
            if processed % 10000 == 0:
                print("Processed %s %s" % (processed, datetime.datetime.now()))
        print("locations grouped %s" % (datetime.datetime.now() - t,))
        print('count of identified locations: %s'
              % (len(identified_grouped_locations),))
        t = datetime.datetime.now()
    finally:
        alchemy_session.close()

    # We now have two lists of locations. First, consider the unmatched
    # locations. Sort the list by the country (groupby needs sorted input)
    country_key = lambda x: x["country"]
    unidentified_grouped_locations.sort(key=country_key)
    # Create an iterator that will access everything in the list with the
    # same country
    unidentified_grouped_locations_enum = enumerate(
        itertools.groupby(unidentified_grouped_locations, country_key))
    # Identify the correct location for each entry by comparing to all_cities
    identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t)
    print('new count of identified locations: %s'
          % (len(identified_grouped_locations),))
    # We now have a list of all locations in the file, along with their
    # matching locations and the id used to group them
    # Perform a quickfix to correct state names
    geoalchemy_util.fix_state_abbreviations(identified_grouped_locations)
    # Sort the list by the grouping_id
    grouping_key = lambda x: x['grouping_id']
    identified_grouped_locations.sort(key=grouping_key)
    # Create an iterator that will access everything in the list with the
    # same grouping_id
    identified_grouped_locations_enum = enumerate(
        itertools.groupby(identified_grouped_locations, grouping_key))
    print("identified_grouped_locations sorted %s" % (datetime.datetime.now() - t,))
    t = datetime.datetime.now()

    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    try:
        # Match the locations
        match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session)
        print("Matches made! %s" % (datetime.datetime.now() - t,))
        unique_group_count = alchemy_session.query(
            expression.func.count(sqlalchemy.distinct(location_model.id))).all()
        print("%s groups formed from %s locations" % (unique_group_count,
                                                      raw_parsed_locations_count))
    finally:
        alchemy_session.close()
def main(limit=None, offset=0):
    """Group the raw parsed locations by their geocoded match and match them.

    Every alchemy.RawLocation row (restricted by limit/offset) is cleaned and
    looked up in the raw_google data by its lower-cased input address.
    Locations are then sorted and grouped by a "<latitude>|<longitude>" id
    (or "nolocationfound") and handed to match_grouped_locations, after
    which the alchemy session is committed.

    Args:
        limit: maximum number of raw locations to read (None for no limit).
        offset: number of raw locations to skip.

    Returns:
        False when there are no raw locations to process, otherwise None.
    """
    t = datetime.datetime.now()
    print("geocoding started %s" % (t,))
    # Construct a list of all addresses which Google was capable of identifying
    # Making this now allows it to be referenced quickly later
    construct_valid_input_address_list(force_lowercase=True)
    # Get all of the raw locations in alchemy.db that were parsed from XML
    raw_parsed_locations = alchemy_session.query(alchemy.RawLocation).limit(limit).offset(offset)
    # Count once; every count() call is a separate database query
    raw_parsed_locations_count = raw_parsed_locations.count()
    # If there are no locations, there is no point in continuing
    if raw_parsed_locations_count == 0:
        return False
    print('Constructed list of all parsed locations containing %s items'
          % (raw_parsed_locations_count,))
    # grouped_locations will contain a list of dicts.
    # Each dict will contain three values:
    #   raw_location = Location object containing the original location
    #                  found in the XML
    #   matching_location = RawGoogle object containing the disambiguated
    #                       location
    #   grouping_id = ID constructed from the latitude and longitude of the
    #                 matching_location
    grouped_locations = []
    for instance in raw_parsed_locations:
        # Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(
            instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        # If the cleaned location has a match in the raw_google database,
        # we use that to classify it
        matching_location = None
        if input_address_exists(cleaned_location, force_lowercase=True):
            # Find the location from the raw_google database that matches this input
            matching_location = geo_data_session.query(RawGoogle).filter(
                sqlalchemy.func.lower(RawGoogle.input_address) ==
                sqlalchemy.func.lower(cleaned_location)).first()
        if matching_location is not None:
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
        else:
            # If there is no match in the raw_google database (including the
            # case where the lookup itself returned no row), we leave the
            # location alone.
            # TODO: analyze the location's edit distance to make minor
            # adjustments to it such that it can be matched. Particularly
            # good if we can combine the all_cities database with the list
            # of valid input_address values in the raw_google database.
            print(cleaned_location)
            matching_location = RawGoogle(cleaned_location, '', '', '', '', '', -1)
            grouping_id = u"nolocationfound"
        grouped_locations.append({"raw_location": instance,
                                  "matching_location": matching_location,
                                  "grouping_id": grouping_id})
    print("grouped_locations created %s" % (datetime.datetime.now() - t,))
    t = datetime.datetime.now()
    # We now have a list of all locations in the file, along with their
    # matching locations and the id used to group them
    # Sort the list by the grouping_id (groupby needs sorted input)
    keyfunc = lambda x: x['grouping_id']
    grouped_locations.sort(key=keyfunc)
    # Create an iterator that will access everything in the list with the same
    # grouping_id
    grouped_locations_enum = enumerate(itertools.groupby(grouped_locations, keyfunc))
    print("grouped_locations sorted %s" % (datetime.datetime.now() - t,))
    t = datetime.datetime.now()
    # Match the locations
    match_grouped_locations(grouped_locations_enum, t)
    alchemy_session.commit()
    print("Matches made! %s" % (datetime.datetime.now() - t,))
    unique_group_count = alchemy_session.query(
        expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all()
    print("%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations_count))