def to_file_matching(civici):
    csvfile = open("civici_comuni.csv", "w")
    filewriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(["ID PROVINCIA", "ID OSM", "VIA COMUNE",
                         "VIA NORMALIZZATA COMUNE", "VIA OSM",
                         "VIA NORMALIZZATA OSM", "X", "Y",
                         "INDICE difflib", "INDICE jellyfish(Jaro-Winkler)",
                         "ULTIMA PAROLA"])
    for civico in civici:
        via_prov = civico["via_prov"].lower().replace(".", "")
        via_osm = civico["via_osm"].lower().replace(".", "")
        s = difflib.SequenceMatcher(None, via_prov, via_osm)
        sm_ratio = s.quick_ratio()
        jw_ratio = jellyfish.jaro_winkler(via_osm, via_prov)
        last_via_prov = via_prov.split(" ")[-1]
        last_via_osm = via_osm.split(" ")[-1]
        # Check which house numbers share the same street and write them to file
        if sm_ratio >= 0.80 or jw_ratio > 0.91:
            filewriter.writerow([civico["ID_PROV"], civico["ID_OSM"],
                                 civico["via_prov"].encode('utf-8'),
                                 expand_address(civico["via_prov"], languages=["it"]),
                                 sm_ratio, jw_ratio,
                                 1 if last_via_prov == last_via_osm else 0])
        elif last_via_prov == last_via_osm and sm_ratio >= 0.6:
            filewriter.writerow([civico["ID_PROV"], civico["ID_OSM"],
                                 civico["via_prov"].encode('utf-8'),
                                 expand_address(civico["via_prov"], languages=["it"]),
                                 sm_ratio, jw_ratio, 1])
        else:
            print(" X ", civico["numero_provincia"], via_prov, via_osm,
                  sm_ratio, jw_ratio)
def component_expansions(cls, address):
    street = address.get(AddressComponents.STREET)
    house_number = address.get(AddressComponents.HOUSE_NUMBER)

    if not (street and house_number):
        return ()

    street_expansions = expand_address(street, address_components=ADDRESS_STREET)
    house_number_expansions = expand_address(
        house_number, address_components=ADDRESS_HOUSE_NUMBER)

    return street_expansions, house_number_expansions
def near_dupe_hashes(cls, address, geohash_precision=DEFAULT_GEOHASH_PRECISION,
                     use_latlon=True, use_city=False, use_postal_code=False):
    address_expansions = cls.component_expansions(address)

    lat = address.get(Coordinates.LATITUDE)
    lon = address.get(Coordinates.LONGITUDE)

    postcode = safe_decode(address.get(AddressComponents.POSTAL_CODE, u'')).strip()
    city = safe_decode(address.get(AddressComponents.CITY, u'')).strip()

    if not any(address_expansions):
        return

    if lat and lon and use_latlon and not (
            (isclose(lat, 0.0) and isclose(lon, 0.0)) or lat >= 90.0 or lat <= -90.0):
        geo = geohash.encode(lat, lon)[:geohash_precision]
        geohash_neighbors = [geo] + geohash.neighbors(geo)

        base_key = cls.GEOHASH_KEY_PREFIX
        for keys in six.itertools.product(geohash_neighbors, *address_expansions):
            yield u'{}|{}'.format(base_key, u'|'.join(keys))

    if postcode and use_postal_code:
        postcode_expansions = expand_address(
            postcode, address_components=ADDRESS_POSTAL_CODE)

        base_key = cls.POSTCODE_KEY_PREFIX
        for keys in six.itertools.product(postcode_expansions, *address_expansions):
            yield u'{}|{}'.format(base_key, u'|'.join(keys))

    if city and use_city:
        city_expansions = expand_address(
            city, address_components=ADDRESS_TOPONYM)

        base_key = cls.CITY_KEY_PREFIX
        for keys in six.itertools.product(city_expansions, *address_expansions):
            yield u'{}|{}'.format(base_key, u'|'.join(keys))
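# A minimal standalone sketch of the blocking-key idea used above: combine a
# geohash of the coordinates (plus its neighboring cells) with the libpostal
# expansions of street and house number to get near-duplicate candidate keys.
# It assumes the pypostal and python-geohash packages used above; the sample
# address and the 6-character geohash precision are illustrative choices, not
# values taken from the original code.
from itertools import product

import geohash
from postal.expand import expand_address, ADDRESS_STREET, ADDRESS_HOUSE_NUMBER


def blocking_keys(street, house_number, lat, lon, precision=6):
    street_expansions = expand_address(street, address_components=ADDRESS_STREET)
    house_number_expansions = expand_address(
        house_number, address_components=ADDRESS_HOUSE_NUMBER)
    geo = geohash.encode(lat, lon)[:precision]
    # Include the neighboring cells so two nearly identical points that fall on
    # a cell boundary still share at least one key.
    for cell in [geo] + geohash.neighbors(geo):
        for street_exp, number_exp in product(street_expansions, house_number_expansions):
            yield u'|'.join([cell, street_exp, number_exp])


print(list(blocking_keys(u'Via G. Garibaldi', u'42', 45.4642, 9.19)))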
def cleanup_addr(file_name, output_file):
    modified_add = []
    with open(file_name, encoding="utf-8-sig") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=";")
        for row in reader:
            addr = row["adresse"]
            lbl = _get_addock_label(addr, row["insee"])
            final_addr = addr
            if lbl:
                # row["addok_adresse"] = lbl
                final_addr = lbl
            ex = expand_address(final_addr)[0]
            cased_ex = reaccentue.reaccentue(ex)
            # row["libpostal"] = ex
            # row["libpostal_normalized"] = normalize_string(final_addr)
            # row["final_addr"] = cased_ex
            row["adresse"] = cased_ex
            modified_add.append(row)

    with open(output_file, "w", encoding="utf-8-sig") as output:
        w = csv.DictWriter(output, fieldnames=modified_add[0].keys())
        w.writeheader()
        w.writerows(modified_add)
def contained_in_expansions(self, address, output, **kw):
    """Test whether an expansion contains a particular output."""
    expansions = expand_address(address, **kw)
    self.assertTrue(expansions)
    self.assertIn(output, set(expansions))
def expand_address_value(address_field):
    expanded_address = expand_address(str(address_field), languages=['en'])
    if len(expanded_address) == 1:
        return expanded_address[0]
    else:
        return address_field
def address_expander(address: str) -> list:
    """Return a list of normalized addresses for the input address."""
    expanded_addresses = expand_address(address)
    logging.debug(f"{address} has {len(expanded_addresses)} "
                  f"expanded addresses: {expanded_addresses}")
    return expanded_addresses
def has_exact_expansions(self, address, expected_expansions, **kw):
    """Test whether an address expands to exactly the expected set."""
    expansions = expand_address(address, **kw)
    self.assertTrue(expansions)
    self.assertEqual(set(expansions), set(expected_expansions))
def _standardise_addresses(
    addresses: pd.DataFrame,
    on_column: str,
    to_column: str,
) -> pd.DataFrame:
    addresses[to_column] = addresses[on_column].apply(
        lambda cell: expand_address(cell)[0],
    )
    return addresses
def expand_row(row, other_fields):
    """Expand a row into one dict per normalized address."""
    expanded_addresses = expand_address(row["address"])
    expanded_rows = []
    for expanded_address in expanded_addresses:
        expanded_row = {"normalized_address": expanded_address}
        for other_field in other_fields:
            expanded_row[other_field] = row[other_field]
        expanded_rows.append(expanded_row)
    return expanded_rows
def component_equals(cls, c1, c2, component, no_whitespace=True):
    if not c1 or not c2:
        return False

    c1 = safe_decode(c1)
    c2 = safe_decode(c2)

    if no_whitespace and whitespace_regex.sub(
            u'', c1.lower()) == whitespace_regex.sub(u'', c2.lower()):
        return True

    expansions1 = expand_address(c1, address_components=component)
    expansions2 = expand_address(c2, address_components=component)

    if not no_whitespace:
        set_expansions1 = set(expansions1)
        set_expansions2 = set(expansions2)
    else:
        set_expansions1 = set(
            [whitespace_regex.sub(u'', e1) for e1 in expansions1])
        set_expansions2 = set(
            [whitespace_regex.sub(u'', e2) for e2 in expansions2])

    return len(set_expansions1 & set_expansions2) > 0
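# The helper above treats two address components as equal when their libpostal
# expansion sets share at least one string. A minimal standalone sketch of that
# idea, assuming pypostal is installed; the two street spellings below are
# invented for illustration.
from postal.expand import expand_address, ADDRESS_STREET


def streets_match(street1, street2):
    expansions1 = set(expand_address(street1, address_components=ADDRESS_STREET))
    expansions2 = set(expand_address(street2, address_components=ADDRESS_STREET))
    # Any shared expansion means both spellings normalize to the same street.
    return bool(expansions1 & expansions2)


print(streets_match('Main St SE', 'Main Street Southeast'))  # typically True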
def name_word_hashes(cls, name):
    name_expanded_words = set()

    for n in expand_address(name, address_components=ADDRESS_NAME):
        tokens = NameDeduper.tokenize(n)
        for t in tokens:
            dm = set([e for e in double_metaphone(safe_encode(t)) if e is not None])
            if dm:
                name_expanded_words |= dm
            else:
                name_expanded_words.add(t)

    return name_expanded_words
def to_file_all(civici):
    # Create a CSV file to store all house numbers
    csvfile = open("civici_comuni.csv", "w")
    filewriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(["ID PROVINCIA", "ID OSM", "VIA COMUNE",
                         "VIA NORMALIZZATA COMUNE", "VIA OSM",
                         "VIA NORMALIZZATA OSM", "LAT", "LON",
                         "INDICE difflib", "INDICE jellyfish(Jaro-Winkler)",
                         "ULTIMA PAROLA"])
    for civico in civici:
        via_prov = civico["via_prov"].lower().replace(".", "")
        via_osm = civico["via_osm"].lower().replace(".", "")
        s = difflib.SequenceMatcher(None, via_prov, via_osm)
        sm_ratio = s.quick_ratio()
        jw_ratio = jellyfish.jaro_winkler(via_osm, via_prov)
        last_via_prov = via_prov.split(" ")[-1]
        last_via_osm = via_osm.split(" ")[-1]

        row = [civico["ID_PROV"], civico["ID_OSM"],
               civico["via_prov"].encode('utf-8'),
               expand_address(civico["via_prov"], languages=["it"]),
               civico["via_osm"].encode('utf-8'),
               expand_address(civico["via_osm"], languages=["it"]),
               civico["LAT"], civico["LON"], sm_ratio, jw_ratio]

        # Flag the house numbers whose street names match and write every row to file
        if sm_ratio >= 0.8 or jw_ratio > 0.91:
            filewriter.writerow(row + [1 if last_via_prov == last_via_osm else 0])
        elif sm_ratio >= 0.6 and last_via_prov == last_via_osm:
            filewriter.writerow(row + [1])
        else:
            filewriter.writerow(row + [0])
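# The two matching functions above gate street matches on a difflib
# quick_ratio of about 0.8 and a Jaro-Winkler score of about 0.91. A minimal
# sketch of that check on a single pair of strings, assuming the jellyfish
# package used above; the sample street names and the reuse of the same
# thresholds are illustrative only.
import difflib

import jellyfish


def streets_similar(via_a, via_b, sm_threshold=0.8, jw_threshold=0.91):
    via_a = via_a.lower().replace(".", "")
    via_b = via_b.lower().replace(".", "")
    sm_ratio = difflib.SequenceMatcher(None, via_a, via_b).quick_ratio()
    jw_ratio = jellyfish.jaro_winkler(via_a, via_b)
    # Either measure clearing its threshold counts as a match.
    return sm_ratio >= sm_threshold or jw_ratio > jw_threshold


print(streets_similar("Via G. Garibaldi", "Via Giuseppe Garibaldi"))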
mailing_address = row_dict[contributor_address1_field]  # .upper().replace(' ',', ')
if row_dict[contributor_address2_field] != '':
    mailing_address += ', {}'.format(row_dict[contributor_address2_field])
mailing_address += ', {}, {} {}'.format(row_dict[contributor_city_field],
                                        row_dict[contributor_state_field],
                                        row_dict[zip_code_field]).strip(' ,')

if mailing_address == '':
    print('ERROR: Empty Address on line {}'.format(line_num))
    bad_addy = True
    address_dict = {}
else:
    try:
        expanded_address = expand_address(mailing_address)
        parsed_address = parse_address(expanded_address[0])
    except Exception as e:
        print('ERROR: bad address "{}" on line {}'.format(mailing_address, line_num))
        parsed_address = parse_address(mailing_address)

    # Create address dictionary
    address_dict = {d[1]: d[0] for d in parsed_address}
    # print('address_dict:', address_dict)
    # break
def have_expansion_in_common(self, str1, str2, **kw):
    """Test whether strings have at least one shared expansion."""
    expansions1 = expand_address(str1, **kw)
    expansions2 = expand_address(str2, **kw)
    self.assertTrue(set(expansions1) & set(expansions2))
def get_address_set(input):
    # Use a set to ensure the values are unique
    address = expand_address(input)
    uniq = set(address)
    return list(uniq)
city_id = db.insertCityIntoDB('Paignton', region_id)
city_id = db.insertCityIntoDB('Brixham', region_id)
city_id = db.insertCityIntoDB('Kingswear', region_id)
city_id = db.insertCityIntoDB('Dartmouth', region_id)
city_id = db.insertCityIntoDB('Dartmouth', region_id)  # should be ignored as it is a repeat

region = g_address.region2
preferred = db.getPreferredRegionFromDB(region, country_id)
if preferred is not None:
    region = preferred
region_id = db.getRegionIdFromDB(region, country_id)

city = g_address.city
city_id = db.getCityIdFromDB(city, region_id)

street = g_address.street
s_address = expand_address(street)[0]  # pypostal expands rd to road etc
s_address = StringUtil.titlecase(s_address)

def_lat = g_address.lat
def_lon = g_address.lon
street_id = db.insertStreetIntoDB(s_address, city_id, postcode, def_lat, def_lon)

lat, lon = db.getDefaultLatLonFromDB(postcode)
print(lat)
print(lon)

db.close()
def get(self):
    query = request.args.get('address')
    return expand_address(query)
print("Hello world") from postal.parser import parse_address a = parse_address( 'The Book Club 100-106 Leonard St, Shoreditch, London, Greater London, EC2A 4RH, United Kingdom' ) print(a) from postal.expand import expand_address b = expand_address('Quatre vingt douze Ave des Champs-Élysées') print(b)
def address_parser(address):
    return convert_json(
        parse_address(expand_address(address, languages=['es'])[0]))
def merge(self, output_path):
    """Merge the two CSV files on their libpostal-normalized addresses."""
    logging.info(
        f"Starting merge {self.csv_file1_path} + {self.csv_file2_path} "
        f"to {output_path}")
    start_time = time.time()

    # Expand CSV file 1, i.e. duplicate rows by adding a new column called
    # "normalized_address"; each new row keeps the same values in the first
    # columns but gets a different value in "normalized_address"
    expanded_csv_file1_path = "expanded_file1.csv"
    CsvAddressExpander.expand_csv(self.csv_file1_path,
                                  expanded_csv_file1_path,
                                  ["id_store", "variable1"])
    file1_df = pd.read_csv(expanded_csv_file1_path, sep=";")
    # Set an index to increase performance of filters
    file1_df.set_index("normalized_address", drop=True, inplace=True)

    # Prepare the output CSV writer
    output_handler = CsvHandler(output_path)
    csv_output_file_writer = output_handler.write_csv(
        ['id', 'var1', 'var2', 'ratio'])
    csv_output_file_row_count = 0

    # Read each row of CSV file 2, expand its address, loop through the
    # normalized addresses and, as soon as there is a match, write the
    # result to the output CSV file
    csv_file2_reader = CsvHandler(self.csv_file2_path).read_csv()
    for row2 in csv_file2_reader:
        normalized_addresses = expand_address(row2["address"])
        for normalized_address in normalized_addresses:
            # Exact match using the index (better performance)
            file1_normalized_address_selection =\
                file1_df[file1_df.index == normalized_address]
            file1_normalized_address_selection_match_found =\
                file1_normalized_address_selection.shape[0] > 0
            if file1_normalized_address_selection_match_found:
                row1 = file1_normalized_address_selection.iloc[0]
                output_row = {
                    "id": row1["id_store"],
                    "var1": row1["variable1"],
                    "var2": row2["variable2"],
                    "ratio": (float(row1["variable1"]) / float(row2["variable2"])
                              if float(row2["variable2"]) != 0.0 else None)
                }
                csv_output_file_writer.writerow(output_row)
                csv_output_file_row_count += 1
                break
        else:
            # Exact match has not delivered any results
            # TODO: implement fuzzy matching
            logging.error(f"Error: '{row2['address']}' couldn't be found "
                          f"in the expanded address set.")

    os.remove(expanded_csv_file1_path)

    elapsed_time = time.time() - start_time
    csv1_address_count = file1_df[["id_store"]].drop_duplicates().shape[0]
    merged_address_percentage =\
        100.0 * csv_output_file_row_count / csv1_address_count
    logging.info(f"Merge of addresses: {csv_output_file_row_count} "
                 f"of {csv1_address_count} "
                 f"({merged_address_percentage} %) "
                 f"written to {output_path} in {elapsed_time} seconds")
def do_expand(address):
    return expand_address(address)[0]