Ejemplo n.º 1
0
def identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t):
    #For each group of locations with the same country
    for i, item in unidentified_grouped_locations_enum:
        country, grouped_locations_list = item
        #Get a list of all cities that exist anywhere in that country
        all_cities_in_country = geo_data_session.query(
            AllCities.city, AllCities.region).filter_by(country=country)
        #Construct a name for each location that matches the normal cleaned location format
        all_cities_in_country = [
            geoalchemy_util.concatenate_location(
                x.city, x.region if geoalchemy_util.region_is_a_state(x.region)
                else '', country) for x in all_cities_in_country
        ]
        #For each location found in this country, find its closest match
        #among the list of all cities from that country
        for grouped_location in grouped_locations_list:
            cleaned_location = grouped_location["cleaned_location"]
            closest_match = geoalchemy_util.get_closest_match_leven(
                cleaned_location, all_cities_in_country, minimum_match_value)
            #If no match was found or only the trivial match
            if closest_match == '' or closest_match == country:
                continue
            #If we have a good match, add it to the list of matched locations
            closest_match_split = re.split(",", closest_match)
            city = closest_match_split[0].strip()
            if len(closest_match_split) == 3:
                region = closest_match_split[1].strip()
                country = closest_match_split[2].strip()
                matching_location = geo_data_session.query(
                    AllCities).filter_by(city=city,
                                         region=region,
                                         country=country).first()
            else:
                country = closest_match_split[1].strip()
                matching_location = geo_data_session.query(
                    AllCities).filter_by(city=city, country=country).first()
            if not matching_location:
                print 'Warning: all_cities match attempt failed for', cleaned_location.encode(
                    'utf8'), 'location not found'
            grouping_id = u"{0}|{1}".format(matching_location.latitude,
                                            matching_location.longitude)
            raw_location = grouped_location["raw_location"]
            identified_grouped_locations.append({
                "raw_location": raw_location,
                "matching_location": matching_location,
                "grouping_id": grouping_id
            })
            print 'all_cities found additional location for', raw_location
Ejemplo n.º 2
0
def identify_missing_locations(
    unidentified_grouped_locations_enum, identified_grouped_locations, minimum_match_value, t
):
    # For each group of locations with the same country
    for i, item in unidentified_grouped_locations_enum:
        country, grouped_locations_list = item
        # Get a list of all cities that exist anywhere in that country
        all_cities_in_country = geo_data_session.query(AllCities.city, AllCities.region).filter_by(country=country)
        # Construct a name for each location that matches the normal cleaned location format
        all_cities_in_country = [
            geoalchemy_util.concatenate_location(
                x.city, x.region if geoalchemy_util.region_is_a_state(x.region) else "", country
            )
            for x in all_cities_in_country
        ]
        # For each location found in this country, find its closest match
        # among the list of all cities from that country
        for grouped_location in grouped_locations_list:
            cleaned_location = grouped_location["cleaned_location"]
            closest_match = geoalchemy_util.get_closest_match_leven(
                cleaned_location, all_cities_in_country, minimum_match_value
            )
            # If no match was found or only the trivial match
            if closest_match == "" or closest_match == country:
                continue
            # If we have a good match, add it to the list of matched locations
            closest_match_split = re.split(",", closest_match)
            city = closest_match_split[0].strip()
            if len(closest_match_split) == 3:
                region = closest_match_split[1].strip()
                country = closest_match_split[2].strip()
                matching_location = (
                    geo_data_session.query(AllCities).filter_by(city=city, region=region, country=country).first()
                )
            else:
                country = closest_match_split[1].strip()
                matching_location = geo_data_session.query(AllCities).filter_by(city=city, country=country).first()
            if not matching_location:
                print "Warning: all_cities match attempt failed for", cleaned_location.encode(
                    "utf8"
                ), "location not found"
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
            raw_location = grouped_location["raw_location"]
            identified_grouped_locations.append(
                {"raw_location": raw_location, "matching_location": matching_location, "grouping_id": grouping_id}
            )
            print "all_cities found additional location for", raw_location
Ejemplo n.º 3
0
def find_difficult_locations_from_file(inputfilename, outputfilename):
    inputfile = open(inputfilename, 'r')
    outputfile = open(outputfilename, 'w+')
    t = datetime.datetime.now()
    all_japan_cities_query = geo_data_session.query(AllCities.city).filter(AllCities.country=='JP').group_by(AllCities.city).all()
    all_japan_cities = []
    for row in all_japan_cities_query:
        all_japan_cities.append(row.city)
    print 'list of all_japan_cities created', datetime.datetime.now()-t
    for line in inputfile:
        line = line.decode('utf8')
        line = geoalchemy_util.remove_eol_pattern.sub('', line)
        if line.endswith(', JP') or line.endswith(', JA'):
            city = line.split(',')[0].strip()
            most_similar_city = geoalchemy_util.get_closest_match_leven(city, all_japan_cities, 0.8)
            if most_similar_city!='':
                outputfile.write('{0}|{1}\n'.format(city.encode('utf8'), most_similar_city.encode('utf8')))
    print datetime.datetime.now()-t
Ejemplo n.º 4
0
def find_difficult_locations_from_file(inputfilename, outputfilename):
    inputfile = open(inputfilename, "r")
    outputfile = open(outputfilename, "w+")
    t = datetime.datetime.now()
    all_japan_cities_query = (
        geo_data_session.query(AllCities.city).filter(AllCities.country == "JP").group_by(AllCities.city).all()
    )
    all_japan_cities = []
    for row in all_japan_cities_query:
        all_japan_cities.append(row.city)
    print "list of all_japan_cities created", datetime.datetime.now() - t
    for line in inputfile:
        line = line.decode("utf8")
        line = geoalchemy_util.remove_eol_pattern.sub("", line)
        if line.endswith(", JP") or line.endswith(", JA"):
            city = line.split(",")[0].strip()
            most_similar_city = geoalchemy_util.get_closest_match_leven(city, all_japan_cities, 0.8)
            if most_similar_city != "":
                outputfile.write("{0}|{1}\n".format(city.encode("utf8"), most_similar_city.encode("utf8")))
    print datetime.datetime.now() - t