Code Example #1
import json

import geonames


def print_diffs(importance_filepath_1, importance_filepath_2):
    """
    Print the top differences between two files of estimated importances.
    """
    with open(importance_filepath_1) as f:
        importance_1 = json.load(f)
    with open(importance_filepath_2) as f:
        importance_2 = json.load(f)
    _, locations_by_id = geonames.load_data()

    # Pair each location id with its importance in each file, defaulting
    # to 0.0 when the id only appears in one of them.
    diffs = [
        (id_, importance_1.get(id_, 0.), importance_2.get(id_, 0.))
        for id_ in set(importance_1) | set(importance_2)
    ]
    # Largest absolute differences first.
    diffs.sort(key=lambda info: abs(info[1] - info[2]), reverse=True)
    for id_, i1, i2 in diffs:
        # The list is sorted, so every remaining diff is below the threshold.
        if abs(i1 - i2) < .1:
            break

        if id_ not in locations_by_id:
            print(id_)
            continue
        location = locations_by_id[id_]
        print(location['resolution'], location['name'], location['country'], i1, i2)
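
For reference, a minimal sketch of how print_diffs might be exercised, assuming the geonames module from this repo is importable and that the importance files are JSON objects mapping geonames id strings to floats. The file names and ids below are made up for the example:

import json

with open('importances_v1.json', 'w') as f:
    json.dump({'5128581': 0.95, '2643743': 0.90}, f)
with open('importances_v2.json', 'w') as f:
    json.dump({'5128581': 0.70, '2643743': 0.91}, f)

# Only the first id differs by 0.1 or more, so only it would be printed.
print_diffs('importances_v1.json', 'importances_v2.json')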
Code Example #2
import json


def compare(filepath1, filepath2):
    """
    Print the differences between two files of alternate names.
    """
    with open(filepath1) as file1:
        alt_names1 = json.load(file1)
    with open(filepath2) as file2:
        alt_names2 = json.load(file2)

    _, locations_by_id = load_data()

    # Report entries present in one file but not the other, in both directions.
    print_extra(alt_names1, alt_names2, locations_by_id)
    print_extra(alt_names2, alt_names1, locations_by_id)
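
print_extra itself is not shown in the source. A plausible sketch under the schema used elsewhere in these examples (a JSON object mapping location id to a list of alternate names, as written by Code Example #6); its real signature and behavior are assumptions:

# Hypothetical implementation of print_extra: report every alternate name
# present in the first mapping but missing from the second.
def print_extra(alt_names_a, alt_names_b, locations_by_id):
    for id_, names in alt_names_a.items():
        extra = set(names) - set(alt_names_b.get(id_, []))
        if not extra:
            continue
        location = locations_by_id.get(id_, {})
        print(location.get('name', id_), sorted(extra))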
Code Example #3
import json

import matplotlib.pyplot as plt

import geonames


def run(osm_data_filepath, out_filepath, show_plot):
    # Estimate location importances from OSM data, train models on them,
    # and write the resulting predictions to out_filepath.
    geo_locations_by_name, geo_locations_by_id = geonames.load_data()
    location_importances = find_osm_importances(
        osm_data_filepath, geo_locations_by_name, geo_locations_by_id
    )
    training_sets, predict_sets = get_data_sets(geo_locations_by_id, location_importances)
    models = train_models(training_sets, geo_locations_by_id, location_importances, show_plot)
    predictions = make_predictions(
        models, training_sets, predict_sets, geo_locations_by_id, location_importances
    )

    with open(out_filepath, 'w') as out:
        json.dump(predictions, out)

    if show_plot:
        plt.show()
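
A possible command-line entry point for this run(); the flag names are assumptions, not part of the source:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('osm_data_filepath')
    parser.add_argument('out_filepath')
    parser.add_argument('--show-plot', action='store_true')
    args = parser.parse_args()
    run(args.osm_data_filepath, args.out_filepath, args.show_plot)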
Code Example #4
def test_geonames_data_format():
    # Only the id-keyed mapping is needed for the format checks.
    _, locations_by_id = load_data()
    _test_data_format(locations_by_id)
Code Example #5
def test_geonames_data_numbers():
    locations_by_name, locations_by_id = load_data()
    _test_data_numbers(locations_by_name, locations_by_id)
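
_test_data_format and _test_data_numbers are not shown in the source. Judging by the fields the other examples read (id, name, country, resolution, population), a minimal format check might look like the following; the actual assertions may differ:

# Hypothetical sketch of _test_data_format, inferred from the fields used
# in the other examples here.
def _test_data_format(locations_by_id):
    for id_, location in locations_by_id.items():
        assert location['id'] == id_
        for key in ('name', 'country', 'resolution', 'population'):
            assert key in location, key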
Code Example #6
import json


def run(out_filename, log_filename):
    """
    Find the alternate names of locations in the geonames data set and
    write them to an output file.
    """
    locations_by_name, locations_by_id = load_data()

    alt_names_found = {}
    counts = {kind: 0
              for kind in ('ambiguous name', 'no wiki page found',
                           'no alt names found', 'ambiguous alt name')}
    hits = 0

    for i, (name, locations_with_name) in enumerate(locations_by_name.items()):
        if i % 1000 == 0:
            print('Search number', i)
            print(counts, hits)

        if not locations_with_name:
            continue

        # Resolve the name to a single location, or skip it as ambiguous.
        location = None
        if len(locations_with_name) == 1:
            candidate = next(iter(locations_with_name.values()))
            if candidate['population'] > MIN_POPULATION_THRESHOLD:
                location = candidate
        else:
            # With several candidates, accept the most important one only if
            # it clearly outranks the runner-up.
            locations_by_importance = sorted(
                locations_with_name.values(),
                key=get_adjusted_importance,
                reverse=True)
            top_importance = get_adjusted_importance(locations_by_importance[0])
            next_importance = get_adjusted_importance(locations_by_importance[1])
            if (top_importance > MIN_AMBIG_IMPORTANCE_THRESHOLD
                    and top_importance - next_importance > .12):
                candidate = locations_by_importance[0]
                if candidate['population'] > MIN_POPULATION_THRESHOLD:
                    location = candidate

        if not location:
            counts['ambiguous name'] += 1
            continue

        if location['name'] != name:
            # Only search for the location's real name.
            continue

        result = search(name)
        if result is None:
            counts['no wiki page found'] += 1
            continue

        if not result:
            counts['no alt names found'] += 1
            continue

        importance = get_adjusted_importance(location)
        good_alt_names = []

        for alt_name in result:
            skip_name = alt_name.title() in BLACKLIST
            locations_with_alt_name = locations_by_name.get(alt_name, {})
            for alt_location in locations_with_alt_name.values():
                if alt_location['id'] == location['id']:
                    continue
                # Drop the alternate name if another location could plausibly
                # claim it: either it is that location's real name and the
                # location is nearly as important, or it is more important
                # outright.
                alt_importance = get_adjusted_importance(alt_location)
                if (alt_location['name'] == alt_name
                        and alt_importance + .1 > importance):
                    skip_name = True
                    break
                if (alt_location['name'] != alt_name
                        and alt_importance > importance + .1):
                    skip_name = True
                    break

            if not skip_name:
                good_alt_names.append(alt_name)

        if not good_alt_names:
            counts['ambiguous alt name'] += 1
            continue

        hits += 1
        alt_names_found[location['id']] = good_alt_names

    with open(out_filename, 'w') as out:
        json.dump(alt_names_found, out)

    with open(log_filename, 'w') as out:
        out.write('\t'.join(('Resolution', 'Name', 'Country', 'Population',
                             'Alt names')) + '\n')

        # Log the hits in descending population order for easy review.
        results = sorted(
            ((locations_by_id[id_], alt_names)
             for id_, alt_names in alt_names_found.items()),
            key=lambda pair: pair[0]['population'],
            reverse=True,
        )
        for location, alt_names in results:
            out.write('\t'.join(
                (location['resolution'], location['name'], location['country'],
                 str(location['population']), ','.join(alt_names))) + '\n')
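
The disambiguation rule in the multi-candidate branch is easiest to see with toy numbers. A self-contained illustration, with get_adjusted_importance stubbed out and the threshold value assumed for the example:

# Toy illustration of the rule: accept the top candidate only if it is
# important enough and clearly outranks the runner-up.
MIN_AMBIG_IMPORTANCE_THRESHOLD = 0.5    # assumed value, not from the source

def get_adjusted_importance(loc):
    return loc['importance']            # stub standing in for the real helper

candidates = [{'importance': 0.8}, {'importance': 0.3}]
ranked = sorted(candidates, key=get_adjusted_importance, reverse=True)
top = get_adjusted_importance(ranked[0])
runner_up = get_adjusted_importance(ranked[1])
# 0.8 > 0.5 and 0.8 - 0.3 > 0.12, so the top candidate wins the name.
print(top > MIN_AMBIG_IMPORTANCE_THRESHOLD and top - runner_up > .12)  # True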
Code Example #7
import json


def run(output_filepath):
    # Dump the name-keyed geonames mapping to a JSON file.
    locations_by_name, _ = load_data()
    with open(output_filepath, 'w') as output:
        json.dump(locations_by_name, output)