Example 1
def identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t):
    #For each group of locations with the same country
    for i, item in unidentified_grouped_locations_enum:
        country, grouped_locations_list = item
        #Get a list of all cities that exist anywhere in that country
        all_cities_in_country = geo_data_session.query(
            AllCities.city, AllCities.region).filter_by(country=country)
        #Construct a name for each location that matches the normal cleaned location format
        all_cities_in_country = [
            geoalchemy_util.concatenate_location(
                x.city, x.region if geoalchemy_util.region_is_a_state(x.region)
                else '', country) for x in all_cities_in_country
        ]
        #For each location found in this country, find its closest match
        #among the list of all cities from that country
        for grouped_location in grouped_locations_list:
            cleaned_location = grouped_location["cleaned_location"]
            closest_match = geoalchemy_util.get_closest_match_leven(
                cleaned_location, all_cities_in_country, minimum_match_value)
            #If no match was found or only the trivial match
            if closest_match == '' or closest_match == country:
                continue
            #If we have a good match, add it to the list of matched locations
            closest_match_split = re.split(",", closest_match)
            city = closest_match_split[0].strip()
            if len(closest_match_split) == 3:
                region = closest_match_split[1].strip()
                country = closest_match_split[2].strip()
                matching_location = geo_data_session.query(
                    AllCities).filter_by(city=city,
                                         region=region,
                                         country=country).first()
            else:
                country = closest_match_split[1].strip()
                matching_location = geo_data_session.query(
                    AllCities).filter_by(city=city, country=country).first()
            if not matching_location:
                print 'Warning: all_cities match attempt failed for', cleaned_location.encode('utf8'), 'location not found'
                #Skip this entry so a missing match is not dereferenced below
                continue
            grouping_id = u"{0}|{1}".format(matching_location.latitude,
                                            matching_location.longitude)
            raw_location = grouped_location["raw_location"]
            identified_grouped_locations.append({
                "raw_location": raw_location,
                "matching_location": matching_location,
                "grouping_id": grouping_id
            })
            print 'all_cities found additional location for', raw_location
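The enumerated, country-grouped argument that identify_missing_locations consumes is built in Example 4 below by sorting on country and wrapping itertools.groupby in enumerate. A minimal standalone sketch of that calling convention, assuming each entry is a dict with "raw_location", "cleaned_location", and "country" keys (the place names here are made up):

import itertools

#Hypothetical input rows; in the real pipeline "raw_location" holds the
#RawLocation ORM instance produced in main (see Example 4)
unidentified_grouped_locations = [
    {"raw_location": None, "cleaned_location": u"Toronto, CA", "country": u"CA"},
    {"raw_location": None, "cleaned_location": u"Munich, DE", "country": u"DE"},
]
keyfunc = lambda x: x["country"]
#groupby only merges adjacent entries, so sort on the same key first
unidentified_grouped_locations.sort(key=keyfunc)
unidentified_grouped_locations_enum = enumerate(
    itertools.groupby(unidentified_grouped_locations, keyfunc))
identified_grouped_locations = []
#identify_missing_locations(unidentified_grouped_locations_enum,
#                           identified_grouped_locations, 0.8, None)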
Example 2
def identify_missing_locations(
    unidentified_grouped_locations_enum, identified_grouped_locations, minimum_match_value, t
):
    # For each group of locations with the same country
    for i, item in unidentified_grouped_locations_enum:
        country, grouped_locations_list = item
        # Get a list of all cities that exist anywhere in that country
        all_cities_in_country = geo_data_session.query(AllCities.city, AllCities.region).filter_by(country=country)
        # Construct a name for each location that matches the normal cleaned location format
        all_cities_in_country = [
            geoalchemy_util.concatenate_location(
                x.city, x.region if geoalchemy_util.region_is_a_state(x.region) else "", country
            )
            for x in all_cities_in_country
        ]
        # For each location found in this country, find its closest match
        # among the list of all cities from that country
        for grouped_location in grouped_locations_list:
            cleaned_location = grouped_location["cleaned_location"]
            closest_match = geoalchemy_util.get_closest_match_leven(
                cleaned_location, all_cities_in_country, minimum_match_value
            )
            # If no match was found or only the trivial match
            if closest_match == "" or closest_match == country:
                continue
            # If we have a good match, add it to the list of matched locations
            closest_match_split = re.split(",", closest_match)
            city = closest_match_split[0].strip()
            if len(closest_match_split) == 3:
                region = closest_match_split[1].strip()
                country = closest_match_split[2].strip()
                matching_location = (
                    geo_data_session.query(AllCities).filter_by(city=city, region=region, country=country).first()
                )
            else:
                country = closest_match_split[1].strip()
                matching_location = geo_data_session.query(AllCities).filter_by(city=city, country=country).first()
            if not matching_location:
                print "Warning: all_cities match attempt failed for", cleaned_location.encode("utf8"), "location not found"
                # Skip this entry so a missing match is not dereferenced below
                continue
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
            raw_location = grouped_location["raw_location"]
            identified_grouped_locations.append(
                {"raw_location": raw_location, "matching_location": matching_location, "grouping_id": grouping_id}
            )
            print "all_cities found additional location for", raw_location
Example 3
def main(limit=None, offset=0):
    t = datetime.datetime.now()
    print "geocoding started", t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    construct_valid_input_address_list(force_lowercase=True)
    #Get all of the raw locations in alchemy.db that were parsed from XML
    raw_parsed_locations = alchemy_session.query(
        alchemy.RawLocation).limit(limit).offset(offset)
    #If there are no locations, there is no point in continuing
    if raw_parsed_locations.count() == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations.count(), 'items'
    """
    grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = RawLocation object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(
            instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(
            parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(cleaned_location, force_lowercase=True):
            #Find the location from the raw_google database that matches this input
            matching_location = geo_data_session.query(RawGoogle).filter(
                sqlalchemy.func.lower(RawGoogle.input_address) ==
                sqlalchemy.func.lower(cleaned_location)).first()
            grouping_id = u"{0}|{1}".format(matching_location.latitude,
                                            matching_location.longitude)
        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            print cleaned_location
            matching_location = RawGoogle(cleaned_location, '', '', '', '', '',
                                          -1)
            grouping_id = u"nolocationfound"
        grouped_locations.append({
            "raw_location": instance,
            "matching_location": matching_location,
            "grouping_id": grouping_id
        })
    print "grouped_locations created", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    grouped_locations_enum = enumerate(
        itertools.groupby(grouped_locations, keyfunc))
    print "grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #Match the locations
    match_grouped_locations(grouped_locations_enum, t)

    alchemy_session.commit()

    print "Matches made!", datetime.datetime.now() - t
    unique_group_count = alchemy_session.query(
        expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all()
    print "%s groups formed from %s locations" % (unique_group_count,
                                                  raw_parsed_locations.count())
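match_grouped_locations is not shown in these examples; the structure it receives is an enumerate over itertools.groupby pairs, so each item is (index, (grouping_id, group_iterator)). A hypothetical consumer illustrating how that shape unpacks:

def consume_grouped_locations(grouped_locations_enum):
    #Hypothetical consumer: count how many raw locations share each grouping_id.
    #Each group is a single-pass iterator over dicts sharing that grouping_id.
    group_sizes = {}
    for i, (grouping_id, group) in grouped_locations_enum:
        group_sizes[grouping_id] = sum(1 for _ in group)
    return group_sizes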
Example 4
def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'):
    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    t = datetime.datetime.now()
    print "geocoding started", doctype, t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    valid_input_addresses = construct_valid_input_addresses()
    #Get all of the raw locations in alchemy.db that were parsed from XML
    if doctype == 'grant':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.RawLocation).limit(limit).offset(offset)
    elif doctype == 'application':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.App_RawLocation).limit(limit).offset(offset)
    raw_parsed_locations_count = raw_parsed_locations.count()

    #If there are no locations, there is no point in continuing
    if raw_parsed_locations_count == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations_count, 'items'
    """
    identified_grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = RawLocation object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    identified_grouped_locations = []
    unidentified_grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(valid_input_addresses, cleaned_location):
            matching_location = geo_data_session.query(RawGoogle).filter(
                                     RawGoogle.input_address==cleaned_location).first()
            if matching_location:
                grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
                identified_grouped_locations.append({"raw_location": instance,
                                      "matching_location": matching_location,
                                      "grouping_id": grouping_id})
            else:
                print 'Cleaned location not matched', cleaned_location
                country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
                unidentified_grouped_locations.append({"raw_location": instance,
                                                       "cleaned_location": cleaned_location,
                                                       "country": country})

        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            #Sort the locations by their country
            country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
            unidentified_grouped_locations.append({"raw_location": instance,
                                                   "cleaned_location": cleaned_location,
                                                   "country": country})
        if ((len(identified_grouped_locations)+len(unidentified_grouped_locations))%10000 == 0):
            print "Processed", len(identified_grouped_locations)+len(unidentified_grouped_locations), datetime.datetime.now()
    print "locations grouped", datetime.datetime.now() - t
    print 'count of identified locations:', len(identified_grouped_locations)
    t = datetime.datetime.now()
    alchemy_session.close()


    #We now have two lists of locations. First, consider the unmatched locations.
    keyfunc = lambda x:x["country"]
    #Sort the list by the country
    unidentified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #country
    unidentified_grouped_locations_enum = enumerate(itertools.groupby(unidentified_grouped_locations, keyfunc))
    #Identify the correct location for each entry by comparing to all_cities
    identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t)
    print 'new count of identified locations:', len(identified_grouped_locations)

    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Perform a quickfix to correct state names
    geoalchemy_util.fix_state_abbreviations(identified_grouped_locations)

    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    identified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    identified_grouped_locations_enum = enumerate(itertools.groupby(identified_grouped_locations, keyfunc))
    print "identified_grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()

    alchemy_session = alchemy.fetch_session(dbtype=doctype)

    #Match the locations
    match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session)

    print "Matches made!", datetime.datetime.now() - t
    if doctype == 'grant':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.Location.id))).all()
    elif doctype == 'application':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.App_Location.id))).all()

    print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations_count)
    alchemy_session.close()
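concatenate_location and get_country_from_cleaned are also external helpers. From the way identify_missing_locations splits matches on commas (two parts when the region is blank, three when it is a state), the cleaned format appears to be "city, region, country" with empty pieces dropped. Hypothetical stubs consistent with that reading, not the project's actual implementations:

def concatenate_location_stub(city, region, country):
    #Join the non-empty pieces with ", ", mirroring the comma split
    #performed in identify_missing_locations above
    return u", ".join(part for part in (city, region, country) if part)

def get_country_from_cleaned_stub(cleaned_location):
    #Assume the country is the last comma-separated component
    return cleaned_location.split(",")[-1].strip()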
Example 5
def main(limit=None, offset=0):
    t = datetime.datetime.now()
    print "geocoding started", t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    construct_valid_input_address_list(force_lowercase=True)
    #Get all of the raw locations in alchemy.db that were parsed from XML
    raw_parsed_locations = alchemy_session.query(alchemy.RawLocation).limit(limit).offset(offset)
    #If there are no locations, there is no point in continuing
    if raw_parsed_locations.count() == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations.count(), 'items'
    """
    grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = RawLocation object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(cleaned_location, force_lowercase=True):
            #Find the location from the raw_google database that matches this input
            matching_location = geo_data_session.query(RawGoogle).filter(
                                    sqlalchemy.func.lower(RawGoogle.input_address)==
                                    sqlalchemy.func.lower(cleaned_location)).first()
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            print cleaned_location
            matching_location = RawGoogle(cleaned_location, '', '', '', '', '', -1)
            grouping_id = u"nolocationfound"
        grouped_locations.append({"raw_location": instance,
                                  "matching_location": matching_location,
                                  "grouping_id": grouping_id})
    print "grouped_locations created", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    grouped_locations_enum = enumerate(itertools.groupby(grouped_locations, keyfunc))
    print "grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #Match the locations
    match_grouped_locations(grouped_locations_enum, t)
    
    alchemy_session.commit()

    print "Matches made!", datetime.datetime.now() - t
    unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all()
    print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations.count())