def clean_raw_locations_from_file(inputfilename, outputfilename):
    """Clean every raw location line of a file and write the results out.

    Each line of inputfilename is decoded from UTF-8, passed through
    geoalchemy_util.clean_raw_location, re-encoded as UTF-8, and written
    to outputfilename (created or truncated).

    Args:
        inputfilename: path to the UTF-8 text file of raw locations,
            one per line.
        outputfilename: path of the file to write cleaned lines to.
    """
    # Context managers close both handles even if cleaning raises;
    # the original leaked both file objects.
    with open(inputfilename, 'r') as inputfile:
        with open(outputfilename, 'w+') as outputfile:
            for line in inputfile:
                line = line.decode('utf8')
                line = geoalchemy_util.clean_raw_location(line)
                outputfile.write(line.encode('utf8'))
def clean_raw_locations_from_file(inputfilename, outputfilename):
    """Clean raw location lines from one file into another.

    NOTE(review): this is a byte-for-byte duplicate (modulo quote style)
    of the clean_raw_locations_from_file defined earlier in this file;
    only this later definition is effective at import time.

    Args:
        inputfilename: path to the UTF-8 text file of raw locations.
        outputfilename: path of the output file (created or truncated).
    """
    # Use `with` so the handles are closed on every exit path;
    # the original never closed either file.
    with open(inputfilename, "r") as inputfile:
        with open(outputfilename, "w+") as outputfile:
            for line in inputfile:
                cleaned = geoalchemy_util.clean_raw_location(line.decode("utf8"))
                outputfile.write(cleaned.encode("utf8"))
def analyze_input_addresses(inputfilename): construct_valid_input_address_list() print datetime.datetime.now() inputfile = open(inputfilename, 'r') line_count = 0 good_count = 0 exists_in_all_cities_count = 0 #not_found_file = open('not_found.txt', 'w+') for line in inputfile: line = line.decode('utf8') input_address = geoalchemy_util.clean_raw_location(line) if input_address_exists(input_address): good_count += 1 #else: #not_found_file.write('{0}\n'.format(input_address.encode('utf8'))) line_count += 1 print 'All lines compared!' print '% good:', good_count * 1.0 / line_count print '% in all_cities:', exists_in_all_cities_count * 1.0 / line_count print datetime.datetime.now()
def analyze_input_addresses(inputfilename): valid_input_addresses = construct_valid_input_addresses() print datetime.datetime.now() inputfile = open(inputfilename, 'r') line_count=0 good_count=0 exists_in_all_cities_count=0 #not_found_file = open('not_found.txt', 'w+') for line in inputfile: line = line.decode('utf8') input_address = geoalchemy_util.clean_raw_location(line) if input_address_exists(valid_input_addresses, input_address): good_count+=1 #else: #not_found_file.write('{0}\n'.format(input_address.encode('utf8'))) line_count+=1 print 'All lines compared!' print '% good:', good_count*1.0/line_count print '% in all_cities:', exists_in_all_cities_count*1.0/line_count print datetime.datetime.now()
def main(limit=None, offset=0): t = datetime.datetime.now() print "geocoding started", t #Construct a list of all addresses which Google was capable of identifying #Making this now allows it to be referenced quickly later construct_valid_input_address_list(force_lowercase=True) #Get all of the raw locations in alchemy.db that were parsed from XML raw_parsed_locations = alchemy_session.query( alchemy.RawLocation).limit(limit).offset(offset) #If there are no locations, there is no point in continuing if raw_parsed_locations.count() == 0: return False print 'Constructed list of all parsed locations containing', raw_parsed_locations.count( ), 'items' """ grouped_loations will contain a list of dicts. Each dict will contain three values: raw_location = Location object containing the original location found in the XML matching_location = RawGoogle object containing the disambiguated location grouping_id = ID constructed from the city, region, and country of the matching_location """ grouped_locations = [] for instance in raw_parsed_locations: #Convert the location into a string that matches the Google format parsed_raw_location = geoalchemy_util.concatenate_location( instance.city, instance.state, instance.country) cleaned_location = geoalchemy_util.clean_raw_location( parsed_raw_location) #If the cleaned location has a match in the raw_google database, #we use that to classify it if input_address_exists(cleaned_location, force_lowercase=True): #Find the location from the raw_google database that matches this input matching_location = geo_data_session.query(RawGoogle).filter( sqlalchemy.func.lower(RawGoogle.input_address) == sqlalchemy.func.lower(cleaned_location)).first() grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude) else: """ If there is no match in the raw_google database, we leave the location alone TODO: analyze the location's edit distance to make minor adjustments to it such that it can be matched. 
Particularly good if we can combine the all_cities database with the list of valid input_address values in the raw_google database. """ print cleaned_location matching_location = RawGoogle(cleaned_location, '', '', '', '', '', -1) grouping_id = u"nolocationfound" grouped_locations.append({ "raw_location": instance, "matching_location": matching_location, "grouping_id": grouping_id }) print "grouped_locations created", datetime.datetime.now() - t t = datetime.datetime.now() #We now have a list of all locations in the file, along with their #matching locations and the id used to group them #Sort the list by the grouping_id keyfunc = lambda x: x['grouping_id'] grouped_locations.sort(key=keyfunc) #Create an iterator that will access everything in the list with the same #grouping_id grouped_locations_enum = enumerate( itertools.groupby(grouped_locations, keyfunc)) print "grouped_locations sorted", datetime.datetime.now() - t t = datetime.datetime.now() #Match the locations match_grouped_locations(grouped_locations_enum, t) alchemy_session.commit() print "Matches made!", datetime.datetime.now() - t unique_group_count = alchemy_session.query( expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all() print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations.count())
def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'): alchemy_session = alchemy.fetch_session(dbtype=doctype) t = datetime.datetime.now() print "geocoding started", doctype, t #Construct a list of all addresses which Google was capable of identifying #Making this now allows it to be referenced quickly later valid_input_addresses = construct_valid_input_addresses() #Get all of the raw locations in alchemy.db that were parsed from XML if doctype == 'grant': raw_parsed_locations = alchemy_session.query(alchemy.schema.RawLocation).limit(limit).offset(offset) elif doctype == 'application': raw_parsed_locations = alchemy_session.query(alchemy.schema.App_RawLocation).limit(limit).offset(offset) raw_parsed_locations_count = raw_parsed_locations.count() #If there are no locations, there is no point in continuing if raw_parsed_locations_count == 0: return False print 'Constructed list of all parsed locations containing', raw_parsed_locations_count, 'items' """ grouped_loations will contain a list of dicts. 
Each dict will contain three values: raw_location = Location object containing the original location found in the XML matching_location = RawGoogle object containing the disambiguated location grouping_id = ID constructed from the city, region, and country of the matching_location """ identified_grouped_locations = [] unidentified_grouped_locations = [] for instance in raw_parsed_locations: #Convert the location into a string that matches the Google format parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country) cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location) #If the cleaned location has a match in the raw_google database, #we use that to classify it if input_address_exists(valid_input_addresses, cleaned_location): matching_location = geo_data_session.query(RawGoogle).filter( RawGoogle.input_address==cleaned_location).first() if matching_location: grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude) identified_grouped_locations.append({"raw_location": instance, "matching_location": matching_location, "grouping_id": grouping_id}) else: print 'Cleaned location not matched', cleaned_location country = geoalchemy_util.get_country_from_cleaned(cleaned_location) unidentified_grouped_locations.append({"raw_location": instance, "cleaned_location": cleaned_location, "country": country}) else: """ If there is no match in the raw_google database, we leave the location alone TODO: analyze the location's edit distance to make minor adjustments to it such that it can be matched. Particularly good if we can combine the all_cities database with the list of valid input_address values in the raw_google database. 
""" #Sort the locations by their country country = geoalchemy_util.get_country_from_cleaned(cleaned_location) unidentified_grouped_locations.append({"raw_location": instance, "cleaned_location": cleaned_location, "country": country}) if ((len(identified_grouped_locations)+len(unidentified_grouped_locations))%10000 == 0): print "Processed", len(identified_grouped_locations)+len(unidentified_grouped_locations), datetime.datetime.now() print "locations grouped", datetime.datetime.now() - t print 'count of identified locations:', len(identified_grouped_locations) t = datetime.datetime.now() alchemy_session.close() #We now have two lists of locations. First, consider the unmatched locations. keyfunc = lambda x:x["country"] #Sort the list by the country unidentified_grouped_locations.sort(key=keyfunc) #Create an iterator that will access everything in the list with the same #country unidentified_grouped_locations_enum = enumerate(itertools.groupby(unidentified_grouped_locations, keyfunc)) #Identify the correct location for each entry by comparing to all_cities identify_missing_locations(unidentified_grouped_locations_enum, identified_grouped_locations, minimum_match_value, t) print 'new count of identified locations:', len(identified_grouped_locations) #We now have a list of all locations in the file, along with their #matching locations and the id used to group them #Perform a quickfix to correct state names geoalchemy_util.fix_state_abbreviations(identified_grouped_locations) #Sort the list by the grouping_id keyfunc = lambda x: x['grouping_id'] identified_grouped_locations.sort(key=keyfunc) #Create an iterator that will access everything in the list with the same #grouping_id identified_grouped_locations_enum = enumerate(itertools.groupby(identified_grouped_locations, keyfunc)) print "identified_grouped_locations sorted", datetime.datetime.now() - t t = datetime.datetime.now() alchemy_session = alchemy.fetch_session(dbtype=doctype) #Match the locations 
match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session) print "Matches made!", datetime.datetime.now() - t if doctype == 'grant': unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.Location.id))).all() elif doctype == 'application': unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.App_Location.id))).all() print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations_count) alchemy_session.close()
def main(limit=None, offset=0): t = datetime.datetime.now() print "geocoding started", t #Construct a list of all addresses which Google was capable of identifying #Making this now allows it to be referenced quickly later construct_valid_input_address_list(force_lowercase=True) #Get all of the raw locations in alchemy.db that were parsed from XML raw_parsed_locations = alchemy_session.query(alchemy.RawLocation).limit(limit).offset(offset) #If there are no locations, there is no point in continuing if raw_parsed_locations.count() == 0: return False print 'Constructed list of all parsed locations containing', raw_parsed_locations.count(), 'items' """ grouped_loations will contain a list of dicts. Each dict will contain three values: raw_location = Location object containing the original location found in the XML matching_location = RawGoogle object containing the disambiguated location grouping_id = ID constructed from the city, region, and country of the matching_location """ grouped_locations = [] for instance in raw_parsed_locations: #Convert the location into a string that matches the Google format parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country) cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location) #If the cleaned location has a match in the raw_google database, #we use that to classify it if input_address_exists(cleaned_location, force_lowercase=True): #Find the location from the raw_google database that matches this input matching_location = geo_data_session.query(RawGoogle).filter( sqlalchemy.func.lower(RawGoogle.input_address)== sqlalchemy.func.lower(cleaned_location)).first() grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude) else: """ If there is no match in the raw_google database, we leave the location alone TODO: analyze the location's edit distance to make minor adjustments to it such that it can be matched. 
Particularly good if we can combine the all_cities database with the list of valid input_address values in the raw_google database. """ print cleaned_location matching_location = RawGoogle(cleaned_location, '', '', '', '', '', -1) grouping_id = u"nolocationfound" grouped_locations.append({"raw_location": instance, "matching_location": matching_location, "grouping_id": grouping_id}) print "grouped_locations created", datetime.datetime.now() - t t = datetime.datetime.now() #We now have a list of all locations in the file, along with their #matching locations and the id used to group them #Sort the list by the grouping_id keyfunc = lambda x: x['grouping_id'] grouped_locations.sort(key=keyfunc) #Create an iterator that will access everything in the list with the same #grouping_id grouped_locations_enum = enumerate(itertools.groupby(grouped_locations, keyfunc)) print "grouped_locations sorted", datetime.datetime.now() - t t = datetime.datetime.now() #Match the locations match_grouped_locations(grouped_locations_enum, t) alchemy_session.commit() print "Matches made!", datetime.datetime.now() - t unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all() print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations.count())