import json

import matplotlib.pyplot as plt

import geonames
from geonames import load_data


def print_diffs(importance_filepath_1, importance_filepath_2):
    """
    Print the top differences between two files of estimated importances.
    """
    with open(importance_filepath_1) as f:
        importance_1 = json.load(f)
    with open(importance_filepath_2) as f:
        importance_2 = json.load(f)
    _, locations_by_id = geonames.load_data()
    # Pair each id with its importance in both files, defaulting to 0.
    # when the id only appears in one of them.
    diffs = [
        (id_, importance_1.get(id_, 0.), importance_2.get(id_, 0.))
        for id_ in set(importance_1.keys() + importance_2.keys())
    ]
    # Largest absolute differences first.
    diffs.sort(key=lambda info: abs(info[1] - info[2]), reverse=True)
    for id_, i1, i2 in diffs:
        if abs(i1 - i2) < .1:
            break
        if id_ not in locations_by_id:
            print id_
            continue
        location = locations_by_id[id_]
        print location['resolution'], location['name'], location['country'], i1, i2

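# Example usage (a sketch; the file names are hypothetical, assuming each
# file is a JSON object mapping geonames id -> estimated importance, to
# match the json.load calls above):
#
#     print_diffs('importances_baseline.json', 'importances_retrained.json')
#
# Differences smaller than .1 are cut off inside the loop, so only the most
# significant changes get printed.
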
def compare(filepath1, filepath2):
    """
    Print the differences between two files of alternate names.
    """
    with open(filepath1) as file1:
        alt_names1 = json.load(file1)
    with open(filepath2) as file2:
        alt_names2 = json.load(file2)
    _, locations_by_id = load_data()
    # Report the entries each file has that the other lacks, in both
    # directions.
    print_extra(alt_names1, alt_names2, locations_by_id)
    print_extra(alt_names2, alt_names1, locations_by_id)

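# The files compared above have the shape written by the alt-name search
# further down: a JSON object mapping geonames id -> list of alternate
# names. A hypothetical example (ids, names, and paths invented):
#
#     {"5128581": ["NYC", "Big Apple"], "2988507": ["Lutetia"]}
#
#     compare('alt_names_old.json', 'alt_names_new.json')
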
def run(osm_data_filepath, out_filepath, show_plot):
    """
    Train importance models from OSM data and write the resulting
    predictions to out_filepath as JSON.
    """
    geo_locations_by_name, geo_locations_by_id = geonames.load_data()
    # Estimate importances from the OSM data, then build training and
    # prediction sets from them.
    location_importances = find_osm_importances(
        osm_data_filepath, geo_locations_by_name, geo_locations_by_id
    )
    training_sets, predict_sets = get_data_sets(
        geo_locations_by_id, location_importances
    )
    models = train_models(
        training_sets, geo_locations_by_id, location_importances, show_plot
    )
    predictions = make_predictions(
        models, training_sets, predict_sets, geo_locations_by_id,
        location_importances
    )
    with open(out_filepath, 'w') as out:
        json.dump(predictions, out)
    if show_plot:
        plt.show()

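# Example invocation (a sketch; the paths are hypothetical). With show_plot
# set, train_models presumably renders diagnostic plots and plt.show()
# blocks at the end:
#
#     run('osm_data.xml', 'predicted_importances.json', show_plot=False)
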
def test_geonames_data_format():
    locations_by_name, locations_by_id = load_data()
    _test_data_format(locations_by_id)


def test_geonames_data_numbers():
    locations_by_name, locations_by_id = load_data()
    _test_data_numbers(locations_by_name, locations_by_id)

def run(out_filename, log_filename):
    """
    Find the alternate names of locations in the geonames data set and
    write them to an output file.
    """
    locations_by_name, locations_by_id = load_data()
    alt_names_found = {}
    counts = dict((kind, 0) for kind in
                  ('ambiguous name', 'no wiki page found',
                   'no alt names found', 'ambiguous alt name'))
    hits = 0
    for i, (name, locations_with_name) in enumerate(
            locations_by_name.iteritems()):
        if i % 1000 == 0:
            print 'Search number', i
            print counts, hits
        if not locations_with_name:
            continue
        # Pick the location this name most likely refers to, or skip the
        # name if that choice is too ambiguous.
        location = None
        if len(locations_with_name) == 1:
            candidate = locations_with_name.values()[0]
            if candidate['population'] > MIN_POPULATION_THRESHOLD:
                location = candidate
        else:
            locations_by_importance = sorted(
                locations_with_name.values(),
                key=lambda loc: get_adjusted_importance(loc),
                reverse=True)
            top_importance = get_adjusted_importance(
                locations_by_importance[0])
            next_importance = get_adjusted_importance(
                locations_by_importance[1])
            # The most important location must clearly beat the runner-up
            # to count as unambiguous.
            if (top_importance > MIN_AMBIG_IMPORTANCE_THRESHOLD and
                    top_importance - next_importance > .12):
                candidate = locations_by_importance[0]
                if candidate['population'] > MIN_POPULATION_THRESHOLD:
                    location = candidate
        if not location:
            counts['ambiguous name'] += 1
            continue
        if location['name'] != name:
            # only search for location's real name
            continue
        result = search(name)
        if result is None:
            counts['no wiki page found'] += 1
            continue
        if not result:
            counts['no alt names found'] += 1
            continue
        importance = get_adjusted_importance(location)
        good_alt_names = []
        for alt_name in result:
            skip_name = alt_name.title() in BLACKLIST
            locations_with_alt_name = locations_by_name.get(alt_name, {})
            for alt_location in locations_with_alt_name.itervalues():
                if alt_location['id'] == location['id']:
                    continue
                alt_importance = get_adjusted_importance(alt_location)
                # Skip the alternate name if a comparably important
                # location could plausibly claim it instead.
                if (alt_location['name'] == alt_name and
                        alt_importance + .1 > importance):
                    skip_name = True
                    break
                if (alt_location['name'] != alt_name and
                        alt_importance > importance + .1):
                    skip_name = True
                    break
            if not skip_name:
                good_alt_names.append(alt_name)
        if not good_alt_names:
            counts['ambiguous alt name'] += 1
            continue
        hits += 1
        alt_names_found[location['id']] = good_alt_names
    with open(out_filename, 'w') as out:
        json.dump(alt_names_found, out)
    # Also write a human-readable log, sorted by population.
    with open(log_filename, 'w') as out:
        out.write('\t'.join(('Resolution', 'Name', 'Country', 'Population',
                             'Alt names')) + '\n')
        results = sorted(
            [(locations_by_id[id_], alt_names)
             for id_, alt_names in alt_names_found.iteritems()],
            key=lambda pair: pair[0]['population'],
            reverse=True,
        )
        for location, alt_names in results:
            out.write('\t'.join(
                (location['resolution'], location['name'],
                 location['country'], str(location['population']),
                 ','.join(alt_names))) + '\n')

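# A minimal sketch of the ambiguity margin used above, with hypothetical
# scores (and assuming the top score already clears
# MIN_AMBIG_IMPORTANCE_THRESHOLD): the best candidate must beat the
# runner-up by more than .12 adjusted-importance points.
def _is_unambiguous(top_importance, next_importance):
    return top_importance - next_importance > .12

assert _is_unambiguous(.80, .60)      # margin .20: top location accepted
assert not _is_unambiguous(.80, .72)  # margin .08: 'ambiguous name', skipped
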
def run(output_filepath):
    locations_by_name, locations_by_id = load_data()
    with open(output_filepath, 'w') as output:
        json.dump(locations_by_name, output)