Example #1
import csv
import difflib

import jellyfish
from postal.expand import expand_address


def to_file_matching(civici):
    
    csvfile = open("civici_comuni.csv", "w")
    filewriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(["ID PROVINCIA", "ID OSM", "VIA COMUNE", "VIA NORMALIZZATA COMUNE", "VIA OSM",
                         "VIA NORMALIZZATA OSM", "X", "Y" "INDICE difflib", "INDICE jellyfish(Jaro-Winkler)", "ULIMA PAROLA"])
    
    for civico in civici:
        
        via_prov = civico["via_prov"].lower().replace(".", "")
        via_osm = civico["via_osm"].lower().replace(".", "")
        
        s = difflib.SequenceMatcher(None, via_prov, via_osm)
        sm_ratio = s.quick_ratio()
        
        jw_ratio = jellyfish.jaro_winkler(via_osm, via_prov)
        
        last_via_prov = via_prov.split(" ")[-1]
        last_via_osm = via_osm.split(" ")[-1]
        
        # Checking which are the numbers with the same way and writing to file
        if (sm_ratio >= 0.80 or jw_ratio > 0.91):
            filewriter.writerow([civico["ID_PROV"], civico["ID_OSM"],
                                 civico["via_prov"].encode('utf-8'), 
                                 expand_address(civico["via_prov"], languages=["it"]),
                                 sm_ratio, jw_ratio, 1 if last_via_prov.encode('utf-8') == last_via_osm.encode('utf-8') else 0])
        else:
            if (last_via_prov == last_via_osm and sm_ratio >= 0.6):
                filewriter.writerow([civico["ID_PROV"], civico["ID_OSM"],
                                 civico["via_prov"].encode('utf-8'), 
                                 expand_address(civico["via_prov"], languages=["it"]),
                                 sm_ratio, jw_ratio, 1])
            else:
                print " X ", civico["numero_provincia"], via_prov, via_osm, sm_ratio, jellyfish.jaro_winkler(via_osm, via_prov)
Example #2
    def component_expansions(cls, address):
        street = address.get(AddressComponents.STREET)
        house_number = address.get(AddressComponents.HOUSE_NUMBER)

        if not (street and house_number):
            return ()

        street_expansions = expand_address(street,
                                           address_components=ADDRESS_STREET)
        house_number_expansions = expand_address(
            house_number, address_components=ADDRESS_HOUSE_NUMBER)

        return street_expansions, house_number_expansions
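ADDRESS_STREET and ADDRESS_HOUSE_NUMBER here are bit-flag constants from postal.expand that limit which of libpostal's component dictionaries are applied, so the street and the house number are each normalized in isolation. A minimal standalone sketch of the same idea (the input strings are illustrative; the exact expansions depend on the installed libpostal data):

from postal.expand import expand_address, ADDRESS_STREET, ADDRESS_HOUSE_NUMBER

# Expand only the street component, e.g. 'Main St' -> ['main street', ...]
street_variants = expand_address('Main St', address_components=ADDRESS_STREET)

# Expand only the house number component
number_variants = expand_address('100-106', address_components=ADDRESS_HOUSE_NUMBER)

print(street_variants, number_variants)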
Example #3
    def near_dupe_hashes(cls,
                         address,
                         geohash_precision=DEFAULT_GEOHASH_PRECISION,
                         use_latlon=True,
                         use_city=False,
                         use_postal_code=False):
        address_expansions = cls.component_expansions(address)

        lat = address.get(Coordinates.LATITUDE)
        lon = address.get(Coordinates.LONGITUDE)
        postcode = safe_decode(address.get(AddressComponents.POSTAL_CODE,
                                           u'')).strip()
        city = safe_decode(address.get(AddressComponents.CITY, u'')).strip()

        if not any(address_expansions):
            return

        if lat and lon and use_latlon and not (
            (isclose(lat, 0.0) and isclose(lon, 0.0)) or lat >= 90.0
                or lat <= -90.0):
            geo = geohash.encode(lat, lon)[:geohash_precision]
            geohash_neighbors = [geo] + geohash.neighbors(geo)

            base_key = cls.GEOHASH_KEY_PREFIX

            for keys in itertools.product(geohash_neighbors, *address_expansions):
                yield u'{}|{}'.format(base_key, u'|'.join(keys))

        if postcode and use_postal_code:
            postcode_expansions = expand_address(
                postcode, address_components=ADDRESS_POSTAL_CODE)

            base_key = cls.POSTCODE_KEY_PREFIX

            for keys in itertools.product(postcode_expansions, *address_expansions):
                yield u'{}|{}'.format(base_key, u'|'.join(keys))

        if city and use_city:
            city_expansions = expand_address(
                city, address_components=ADDRESS_TOPONYM)

            base_key = cls.CITY_KEY_PREFIX

            for keys in itertools.product(city_expansions, *address_expansions):
                yield u'{}|{}'.format(base_key, u'|'.join(keys))
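Two nearby points can fall on opposite sides of a geohash cell boundary, which is why this method keys each address against its own cell plus geohash.neighbors(geo), the eight surrounding cells: near-duplicates that straddle a boundary still end up sharing at least one hash key. The cross product with the address expansions then yields one candidate key per (cell, expansion) combination.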
Example #4
def cleanup_addr(file_name, output_file):
    modified_add = []
    with open(file_name, encoding="utf-8-sig") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=";")
        for row in reader:
            addr = row["adresse"]
            lbl = _get_addock_label(addr, row["insee"])

            final_addr = addr
            if lbl:
                # row["addok_adresse"] = lbl
                final_addr = lbl

            ex = expand_address(final_addr)[0]
            cased_ex = reaccentue.reaccentue(ex)
            # row["libpostal"] = ex
            # row["libpostal_normalized"] = normalize_string(final_addr)
            # row["final_addr"] = cased_ex
            row["adresse"] = cased_ex

            modified_add.append(row)

    # newline="" prevents csv.DictWriter from emitting blank lines on Windows
    with open(output_file, "w", encoding="utf-8-sig", newline="") as output:
        w = csv.DictWriter(output, fieldnames=modified_add[0].keys())
        w.writeheader()
        w.writerows(modified_add)
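The reaccentue step is worth a note: libpostal's expansions come back lowercased and stripped of diacritics, so for these French addresses the snippet passes the expansion through reaccentue (hence the cased_ex name) to restore accents and casing before writing the cleaned column back out.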
Example #5
    def contained_in_expansions(self, address, output, **kw):
        """Test whether an expansion contains a particular output."""
        expansions = expand_address(address, **kw)
        self.assertTrue(expansions)

        expansions = set(expansions)
        self.assertTrue(output in expansions)
Example #6
def expand_address_value(address_field):
    expanded_address = expand_address(str(address_field), languages=['en'])

    if len(expanded_address) == 1:
        return expanded_address[0]
    else:
        return address_field
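Because expand_address usually returns several candidate normalizations (abbreviations like 'St' can expand to both 'street' and 'saint'), this helper only trusts the result when it is unambiguous, i.e. exactly one expansion; anything ambiguous falls through and the original field value is returned unchanged.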
Example #7
    def contained_in_expansions(self, address, output, **kw):
        """Test whether an expansion contains a particular output."""
        expansions = expand_address(address, **kw)
        self.assertTrue(expansions)

        expansions = set(expansions)
        self.assertTrue(output in expansions)
Example #8
def address_expander(address: str) -> list:
    """
    Return a list of normalized addresses for the input address.
    """
    expanded_addresses = expand_address(address)
    logging.debug(f"{address} has {len(expanded_addresses)} "
                  f"expanded addresses: {expanded_addresses}")
    return expanded_addresses
Example #9
    def has_exact_expansions(self, address, expected_expansions, **kw):
        """Test whether an address expands exactly"""
        expansions = expand_address(address, **kw)
        self.assertTrue(expansions)

        expansions = set(expansions)
        expected_expansions = set(expected_expansions)
        self.assertTrue(expansions == expected_expansions)
Example #10
def _standardise_addresses(
    addresses: pd.DataFrame,
    on_column: str,
    to_column: str,
) -> pd.DataFrame:

    addresses[to_column] = addresses[on_column].apply(
        lambda cell: expand_address(cell)[0])

    return addresses
Example #11
def expand_row(row, other_fields):
    """
    Expand a row into one output row per normalized address.
    """
    expanded_addresses = expand_address(row["address"])
    expanded_rows = []
    for expanded_address in expanded_addresses:
        expanded_row = {"normalized_address": expanded_address}
        for other_field in other_fields:
            expanded_row[other_field] = row[other_field]
        expanded_rows.append(expanded_row)
    return expanded_rows
Example #12
    def component_equals(cls, c1, c2, component, no_whitespace=True):
        if not c1 or not c2:
            return False

        c1 = safe_decode(c1)
        c2 = safe_decode(c2)
        if no_whitespace and whitespace_regex.sub(
                u'', c1.lower()) == whitespace_regex.sub(u'', c2.lower()):
            return True

        expansions1 = expand_address(c1, address_components=component)
        expansions2 = expand_address(c2, address_components=component)

        if not no_whitespace:
            set_expansions1 = set(expansions1)
            set_expansions2 = set(expansions2)
        else:
            set_expansions1 = set(
                [whitespace_regex.sub(u'', e1) for e1 in expansions1])
            set_expansions2 = set(
                [whitespace_regex.sub(u'', e2) for e2 in expansions2])

        return len(set_expansions1 & set_expansions2) > 0
Example #13
    def name_word_hashes(cls, name):
        name_expanded_words = set()

        for n in expand_address(name, address_components=ADDRESS_NAME):
            tokens = NameDeduper.tokenize(n)
            for t in tokens:
                dm = set([
                    e for e in double_metaphone(safe_encode(t))
                    if e is not None
                ])
                if dm:
                    name_expanded_words |= dm
                else:
                    name_expanded_words.add(t)

        return name_expanded_words
Example #14
import csv
import difflib

import jellyfish
from postal.expand import expand_address


def to_file_all(civici):
    
    # Creating a CSV file to store all housenumbers
    csvfile = open("civici_comuni.csv", "w")
    filewriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(["ID PROVINCIA", "ID OSM", "VIA COMUNE", "VIA NORMALIZZATA COMUNE", "VIA OSM",
                         "VIA NORMALIZZATA OSM", "LAT", "LON", "INDICE difflib", "INDICE jellyfish(Jaro-Winkler)", "ULIMA PAROLA"])
    
    for civico in civici:
        
        via_prov = civico["via_prov"].lower().replace(".", "")
        via_osm = civico["via_osm"].lower().replace(".", "")
        
        s = difflib.SequenceMatcher(None, via_prov, via_osm)
        sm_ratio = s.quick_ratio()
        
        jw_ratio = jellyfish.jaro_winkler(via_osm, via_prov)
        
        last_via_prov = via_prov.split(" ")[-1]
        last_via_osm = via_osm.split(" ")[-1]
        
        # Checking which are the housenumbers that have the same street and writing to file
        if (sm_ratio >= 0.8 or jw_ratio > 0.91):
            filewriter.writerow([civico["ID_PROV"], civico["ID_OSM"],
                             civico["via_prov"].encode('utf-8'), 
                             expand_address(civico["via_prov"], languages=["it"]),
                             civico["via_osm"].encode('utf-8'), 
                             expand_address(civico["via_osm"], languages=["it"]),
                             civico["LAT"], civico["LON"],
                             sm_ratio, jw_ratio, 1 if last_via_prov.encode('utf-8') == last_via_osm.encode('utf-8') else 0])
    
        elif (sm_ratio >= 0.6 and last_via_prov.encode('utf-8') == last_via_osm.encode('utf-8')):
            filewriter.writerow([civico["ID_PROV"], civico["ID_OSM"],
                             civico["via_prov"].encode('utf-8'), 
                             expand_address(civico["via_prov"], languages=["it"]),
                             civico["via_osm"].encode('utf-8'), 
                             expand_address(civico["via_osm"], languages=["it"]),
                             civico["LAT"], civico["LON"],
                             sm_ratio, jw_ratio, 1])
        else:
            filewriter.writerow([civico["ID_PROV"], civico["ID_OSM"],
                             civico["via_prov"].encode('utf-8'), 
                             expand_address(civico["via_prov"], languages=["it"]),
                             civico["via_osm"].encode('utf-8'), 
                             expand_address(civico["via_osm"], languages=["it"]),
                             civico["LAT"], civico["LON"],
                             sm_ratio, jw_ratio, 0])
Example #15
        mailing_address = row_dict[contributor_address1_field]  # .upper().replace('  ', ', ')
        if row_dict[contributor_address2_field] != '':
            mailing_address += ', {}'.format(row_dict[contributor_address2_field])

        mailing_address += ', {}, {} {}'.format(row_dict[contributor_city_field], row_dict[contributor_state_field], row_dict[zip_code_field]).strip(' ,')

        if mailing_address == '':
            print('ERROR: Empty Address on line {}'.format(line_num))
            bad_addy = True
            address_dict = {}

        else:
            try:
                expanded_address = expand_address(mailing_address)
                parsed_address = parse_address(expanded_address[0])
            except Exception as e:
                print('ERROR: bad address "{}" on line {}'.format(mailing_address, line_num))
                parsed_address = parse_address(mailing_address)

            # Create address dictionary
            address_dict = {d[1]: d[0] for d in parsed_address}
            #print('address_dict:', address_dict)
            #break

Example #16
    def have_expansion_in_common(self, str1, str2, **kw):
        """Test whether strings have at least one shared expansion."""
        expansions1 = expand_address(str1, **kw)
        expansions2 = expand_address(str2, **kw)

        self.assertTrue(set(expansions1) & set(expansions2))
Example #17
def get_address_set(input):
    # use set to ensure the values are unique
    address = expand_address(input)
    uniq = set(address)
    return list(uniq)
Example #18
    city_id = db.insertCityIntoDB('Paignton', region_id)
    city_id = db.insertCityIntoDB('Brixham', region_id)
    city_id = db.insertCityIntoDB('Kingswear', region_id)
    city_id = db.insertCityIntoDB('Dartmouth', region_id)

    city_id = db.insertCityIntoDB('Dartmouth', region_id) # should be ignored as is a repeat

    region = g_address.region2
    preferred = db.getPreferredRegionFromDB(region, country_id)
    if preferred is not None:
        region = preferred
    region_id = db.getRegionIdFromDB(region, country_id)
    city = g_address.city
    city_id = db.getCityIdFromDB(city, region_id)
    street = g_address.street
    s_address = expand_address(street)[0] # pypostal expands rd to road etc
    s_address = StringUtil.titlecase(s_address)
    def_lat = g_address.lat
    def_lon = g_address.lon

    street_id = db.insertStreetIntoDB(s_address, city_id, postcode, def_lat, def_lon)

    lat,lon = db.getDefaultLatLonFromDB(postcode)
    print(lat)
    print(lon)

    db.close()



Example #19
    def get(self):
        query = request.args.get('address')
        return expand_address(query)
Example #20
print("Hello world")

from postal.parser import parse_address
a = parse_address(
    'The Book Club 100-106 Leonard St, Shoreditch, London, Greater London, EC2A 4RH, United Kingdom'
)
print(a)

from postal.expand import expand_address
b = expand_address('Quatre vingt douze Ave des Champs-Élysées')
print(b)
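For orientation: parse_address returns a list of (value, label) pairs such as ('london', 'city'), while expand_address returns a list of normalized strings; the French line is the classic libpostal demonstration that spelled-out numerals are normalized, with 'Quatre vingt douze' appearing as '92' in one of the expansions. A common follow-up (used in Example #15 above) is to fold the parse result into a dict; the labels shown are illustrative:

components = {label: value for value, label in a}
print(components.get('city'))      # e.g. 'london'
print(components.get('postcode'))  # e.g. 'ec2a 4rh'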
Example #21
    def have_expansion_in_common(self, str1, str2, **kw):
        """Test whether strings have at least one shared expansion."""
        expansions1 = expand_address(str1, **kw)
        expansions2 = expand_address(str2, **kw)

        self.assertTrue(set(expansions1) & set(expansions2))
Example #22
def address_parser(address):
    # Note: languages expects a sequence of codes; ('es') is just the string
    # 'es', so pass a list (or a one-element tuple ('es',)) instead.
    return convert_json(
        parse_address(expand_address(address, languages=['es'])[0]))
Example #23
    def merge(self, output_path):
        """
        Merge the two CSV files, matching rows on their normalized addresses.
        """

        logging.info(
            f"Starting merge {self.csv_file1_path} + {self.csv_file2_path} "
            f"to {output_path}")

        start_time = time.time()

        # Expand the CSV file 1, i.e. duplicate rows by adding a new column
        # called "normalized_address" each new row will have all
        # the same values in the first columns,
        # but a different one in "normalized_address"
        expanded_csv_file1_path = "expanded_file1.csv"
        CsvAddressExpander.expand_csv(self.csv_file1_path,
                                      expanded_csv_file1_path,
                                      ["id_store", "variable1"])
        file1_df = pd.read_csv(expanded_csv_file1_path, sep=";")
        # Set an index to increase performance of filters
        file1_df.set_index("normalized_address", drop=True, inplace=True)

        # Prepare the output CSV writer
        output_handler = CsvHandler(output_path)
        csv_output_file_writer = output_handler.write_csv(
            ['id', 'var1', 'var2', 'ratio'])
        csv_output_file_row_count = 0
        # Read each row of the CSV file 2 and expand their addresses,
        # loop through their normalized addresses and as soon as
        # there is a coincidence, write the result on the output CSV file
        csv_file2_reader = CsvHandler(self.csv_file2_path).read_csv()
        for row2 in csv_file2_reader:
            normalized_addresses = expand_address(row2["address"])
            for normalized_address in normalized_addresses:
                # Exact match using the index (better performance)
                file1_normalized_address_selection =\
                    file1_df[file1_df.index == normalized_address]
                file1_normalized_address_selection_match_found =\
                    file1_normalized_address_selection.shape[0] > 0
                if file1_normalized_address_selection_match_found:
                    row1 = file1_normalized_address_selection.iloc[0]
                    output_row = {
                        "id": row1["id_store"],
                        "var1": row1["variable1"],
                        "var2": row2["variable2"],
                        "ratio": (float(row1["variable1"]) / float(row2["variable2"])
                                  if float(row2["variable2"]) != 0.0 else None),
                    }
                    csv_output_file_writer.writerow(output_row)
                    csv_output_file_row_count += 1
                    break
            else:
                # Exact match has not delivered any results
                # TODO: implement fuzzy matching
                logging.error(f"Error. {row2['address']}' couldn't be found "
                              f"on expanded address set.")
        os.remove(expanded_csv_file1_path)

        elapsed_time = time.time() - start_time
        csv1_address_count = file1_df[["id_store"]].drop_duplicates().shape[0]
        merged_address_percentage =\
            100.0 * csv_output_file_row_count / csv1_address_count
        logging.info(f"Merge of addresses {csv_output_file_row_count}"
                     f"of {csv1_address_count} "
                     f"({merged_address_percentage} %) "
                     f"completed in {output_path} in {elapsed_time} seconds")
Example #24
def do_expand(address):
    return expand_address(address)[0]
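A caveat on this one-liner: indexing [0] assumes expand_address always returns at least one expansion, and the first element is merely one of possibly several candidates. A slightly more defensive sketch (the fallback to the raw input is an addition here, not part of the original):

from postal.expand import expand_address

def do_expand_safe(address):
    # Hypothetical variant: return the input unchanged if no expansion comes back
    expansions = expand_address(address)
    return expansions[0] if expansions else address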