def geocode_siae(self, siae):
    """Enrich `siae` with coordinates (and possibly a cleaner address) from the BAN API.

    Returns the same `siae` object, mutated in place; writes a warning to
    stderr when geocoding fails or is unreliable.
    """
    assert siae.address_on_one_line
    data = get_geocoding_data(siae.address_on_one_line, post_code=siae.post_code)
    if not data:
        self.stderr.write(
            f"No geocoding data found for siae.external_id={siae.external_id}"
        )
        return siae
    siae.geocoding_score = data["score"]
    # A score at or above API_BAN_RELIABLE_MIN_SCORE means the coords are
    # reliable: prefer the BAN-returned address (properly accented/cased)
    # over the all-caps source data. Below the threshold, keep the old
    # address (which is probably wrong or incomplete).
    if siae.geocoding_score >= API_BAN_RELIABLE_MIN_SCORE:
        siae.address_line_1 = data["address_line_1"]
    else:
        self.stderr.write(
            f"Geocoding not reliable for siae.external_id={siae.external_id}"
        )
    # City is always good due to `postcode` passed in query.
    # ST MAURICE DE REMENS => Saint-Maurice-de-Rémens
    siae.city = data["city"]
    siae.coords = data["coords"]
    return siae
def set_coords(self, address, post_code=None):
    """Geocode `address` and store the resulting coords and score on self.

    Leaves the object untouched (and logs an error) when the geocoding API
    returns nothing.
    """
    data = get_geocoding_data(address, post_code=post_code)
    if not data:
        logger.error("No geocoding data could be found for `%s - %s`", address, post_code)
        return
    self.coords = data["coords"]
    self.geocoding_score = data["score"]
def set_coords_and_address(self, address, post_code=None):
    """Geocode `address` and overwrite self's coords, score and postal address.

    `address_line_2` is cleared because the geocoded result replaces the
    full address. Leaves the object untouched (and logs an error) when the
    geocoding API returns nothing.
    """
    data = get_geocoding_data(address, post_code=post_code)
    if not data:
        logger.error("No geocoding data could be found for `%s - %s`", address, post_code)
        return
    self.coords = data["coords"]
    self.geocoding_score = data["score"]
    self.address_line_1 = data["address_line_1"]
    self.address_line_2 = ""
    self.post_code = data["post_code"]
    self.city = data["city"]
def clean_siret(self):
    """Validate the user-entered SIRET and populate `self.org_data` from external APIs.

    `max_length` is skipped so that we can allow an arbitrary number of
    spaces in the user-entered value.

    Raises `forms.ValidationError` when the SIRET lookup fails or when the
    establishment is closed in the Sirene database.
    """
    siret = self.cleaned_data["siret"].replace(" ", "")
    validate_siret(siret)

    # Fetch name and address from API entreprise.
    etablissement, error = etablissement_get_or_error(siret)
    if error:
        raise forms.ValidationError(error)
    if etablissement.is_closed:
        raise forms.ValidationError("La base Sirene indique que l'établissement est fermé.")

    # Perform another API call to fetch geocoding data.
    # `address_line_2` is omitted on purpose because it tends to return
    # no results with the BAN API.
    parts = (
        etablissement.address_line_1,
        etablissement.post_code,
        etablissement.city,
        etablissement.department,
    )
    one_line_address = ", ".join(part for part in parts if part)
    geocoding_data = get_geocoding_data(one_line_address, post_code=etablissement.post_code) or {}

    self.org_data = {
        "siret": siret,
        "is_head_office": etablissement.is_head_office,
        "name": etablissement.name,
        "address_line_1": etablissement.address_line_1,
        "address_line_2": etablissement.address_line_2,
        "post_code": etablissement.post_code,
        "city": etablissement.city,
        "department": etablissement.department,
        "longitude": geocoding_data.get("longitude"),
        "latitude": geocoding_data.get("latitude"),
        "geocoding_score": geocoding_data.get("score"),
    }
    return siret
def geocode_siae(siae):
    """Return `siae` enriched in place with BAN geocoding data when available.

    No-op when the SIAE has no geocoding address or when the API returns
    nothing.
    """
    if siae.geocoding_address is None:
        return siae
    data = get_geocoding_data(siae.geocoding_address, post_code=siae.post_code)
    if not data:
        return siae
    siae.geocoding_score = data["score"]
    # A score at or above API_BAN_RELIABLE_MIN_SCORE means the coords are
    # reliable: prefer the BAN-returned address (properly accented/cased)
    # over the all-caps source data. Below the threshold, keep the old
    # address (which is probably wrong or incomplete).
    if siae.geocoding_score >= AddressMixin.API_BAN_RELIABLE_MIN_SCORE:
        siae.address_line_1 = data["address_line_1"]
    # City is always good due to `postcode` passed in query.
    # ST MAURICE DE REMENS => Saint-Maurice-de-Rémens
    siae.city = data["city"]
    siae.coords = data["coords"]
    return siae
def handle(self, dry_run=False, **options):
    """Import SIAEs from the ASP CSV export into the database.

    Parses CSV_FILE row by row, validates and normalizes each field,
    skips duplicated SIRETs (first come, first served), then — unless
    `dry_run` — creates a `Siae` enriched with BAN geocoding data.
    """
    self.set_logger(options.get("verbosity"))
    with open(CSV_FILE) as csvfile:
        # Count lines in CSV (needed to compute the progress percentage).
        reader = csv.reader(csvfile, delimiter=";")
        row_count = sum(1 for row in reader)
        last_progress = 0
        # Reset the iterator to iterate through the reader again.
        csvfile.seek(0)
        for i, row in enumerate(reader):
            if i == 0:
                # Skip CSV header.
                continue
            progress = int((100 * i) / row_count)
            if progress > last_progress + 5:
                self.stdout.write(f"Creating SIAEs… {progress}%")
                last_progress = progress
            self.logger.debug("-" * 80)
            siret = row[7]
            self.logger.debug(siret)
            assert len(siret) == 14
            naf = row[5]
            self.logger.debug(naf)
            assert len(naf) == 5
            kind = row[0]
            self.logger.debug(kind)
            assert kind in KINDS
            # Max length of `name` is 50 chars in the source file, some are truncated.
            # Also `name` is in upper case.
            name = row[8].strip()
            name = " ".join(name.split())  # Replace multiple spaces by a single space.
            self.logger.debug(name)
            email = row[14].strip()
            self.logger.debug(email)
            assert " " not in email
            street_num = row[9].strip().replace(" ", "")
            street_name = row[10].strip().lower()
            street_name = " ".join(street_name.split())  # Replace multiple spaces by a single space.
            address_line_1 = f"{street_num} {street_name}"
            address_line_1 = " ".join(address_line_1.split())  # Replace multiple spaces by a single space.
            address_line_2 = ""
            # A " - " separator means the source squeezed two address lines into one.
            if " - " in address_line_1:
                addresses = address_line_1.split(" - ")
                address_line_1 = addresses[0]
                address_line_2 = addresses[1]
            self.logger.debug(address_line_1)
            self.logger.debug(address_line_2)
            # Both post code columns are identical, we can use one or the other.
            post_code = row[3].strip()
            post_code2 = row[11].strip()
            self.logger.debug(post_code)
            assert post_code == post_code2
            # Both city columns are identical, we can use one or the other.
            city = row[4].strip()
            city_name = row[12].strip()
            self.logger.debug(city)
            assert city_name == city
            department = row[1]
            if department[0] == "0":
                # Drop the leading zero of the source department code.
                department = department[1:]
            if department in ["59L", "59V"]:
                # Fold the Nord sub-codes into the plain "59" department.
                department = "59"
            if department not in ["2A", "2B"] and not post_code.startswith(department):
                # Fix wrong departments using the post code.
                department = post_code[:len(department)]
            self.logger.debug(department)
            assert department in DEPARTMENTS
            siae_info = f"{siret} {name} - {address_line_1} - {post_code} {city}."
            phone = row[13].strip().replace(" ", "")
            if phone and len(phone) != 10:
                # French phone numbers have exactly 10 digits; drop anything else.
                self.stderr.write(f"Wrong phone `{phone}`. {siae_info}.")
                phone = ""
            self.logger.debug(phone)
            if siret in SEEN_SIRET:
                # First come, first served.
                self.stderr.write(f"Siret already seen. Skipping {siae_info}.")
                continue
            SEEN_SIRET.add(siret)
            if not dry_run:
                siae = Siae()
                siae.siret = siret
                siae.naf = naf
                siae.kind = kind
                siae.source = Siae.SOURCE_ASP
                siae.name = name
                siae.phone = phone
                siae.email = email
                siae.address_line_1 = address_line_1
                siae.address_line_2 = address_line_2
                siae.post_code = post_code
                siae.city = city
                siae.department = department
                if siae.address_on_one_line:
                    geocoding_data = get_geocoding_data(siae.address_on_one_line, post_code=siae.post_code)
                    if not geocoding_data:
                        # Save the SIAE anyway, just without coords.
                        self.stderr.write(f"No geocoding data found for {siae_info}")
                        siae.save()
                        continue
                    siae.geocoding_score = geocoding_data["score"]
                    # If the score is greater than API_BAN_RELIABLE_MIN_SCORE, coords are reliable:
                    # use data returned by the BAN API because it's better written using accents etc.
                    # while the source data is in all caps etc.
                    # Otherwise keep the old address (which is probably wrong or incomplete).
                    if siae.geocoding_score >= API_BAN_RELIABLE_MIN_SCORE:
                        siae.address_line_1 = geocoding_data["address_line_1"]
                    else:
                        self.stderr.write(f"Geocoding not reliable for {siae_info}")
                    # City is always good due to `postcode` passed in query.
                    # ST MAURICE DE REMENS => Saint-Maurice-de-Rémens
                    siae.city = geocoding_data["city"]
                    self.logger.debug("-" * 40)
                    self.logger.debug(siae.address_line_1)
                    self.logger.debug(siae.city)
                    siae.coords = geocoding_data["coords"]
                siae.save()
    self.stdout.write("-" * 80)
    self.stdout.write("Done.")
def format_address(obj):
    """
    Formats the address contained in obj into a valid address "structure" for ASP ER exports.

    Heavily relies on geo.api.gouv.fr API to do parts of the job for us:
    - extracting lane number and extension
    - giving a geocoding score / threshold in order to improve an existing DB address
    - validation of a real-world address

    Employee records ("Fiches salarié") contain 2 addresses of this kind.

    See validation of ASP address for expected/valid fields.

    Output fields:
    - number (opt.): number in the lane
    - std_extension (opt.): one of the ASP ref lane extensions (see LaneExtension)
    - non_std_extension (opt.): if another extension is detected
    - lane: name of the lane
    - lane_type: one of the ASP ref lane types (see LaneType)
    - additional_address: further details on the address (if available)
    - city: name of city
    - post_code: postal code
    - insee_code: INSEE code of the city (Itou)

    INSEE code can be checked against ASP ref for further validation.

    Returns a (result, error) tuple:
    - OK => (result_dict, None),
    - KO => (None, error_message)
    """
    if not obj:
        return None, ERROR_HEXA_CONVERSION

    # Do we have enough data to make an extraction?
    if not obj.post_code or not obj.address_line_1:
        return None, ERROR_INCOMPLETE_ADDRESS_DATA

    # First we use the geo API to get a 'lane' and a number.
    address = get_geocoding_data(obj.address_line_1, post_code=obj.post_code)

    if not address:
        return None, ERROR_GEOCODING_API

    # Default values; `additional_address` is kept only when it matches the
    # ASP-accepted character set (ADDITIONAL_ADDRESS_RE).
    additional_address = unidecode(obj.address_line_2)
    result = {
        "number": "",
        "non_std_extension": "",
        "additional_address": additional_address
        if re.match(ADDITIONAL_ADDRESS_RE, additional_address)
        else "",
    }

    # Street extension processing (bis, ter ...).
    # The extension is part of the resulting street number geo API field.
    number_plus_ext = address.get("number")
    if number_plus_ext:
        # API change: the extension can now be "stuck" to the lane number.
        # This was not the case before (there was a space in between).
        # REGEX to the rescue to fix ASP error 3323.
        [[number, extension]] = re.findall(LANE_NUMBER_RE, number_plus_ext)

        if number:
            result["number"] = number

        if extension:
            # Keep only the first character of the extension (e.g. "B" for "bis").
            extension = extension[0]
            ext = LaneExtension.with_similar_name_or_value(extension)
            if ext:
                result["std_extension"] = ext.name or ""
            else:
                result["non_std_extension"] = extension.upper()

    lane = None
    if not address.get("lane") and not address.get("address"):
        return None, ERROR_UNKNOWN_ADDRESS_LANE

    lane = address.get("lane") or address.get("address")
    lane = unidecode(lane)
    result["lane"] = lane

    # Lane type processing (Avenue, Rue, Boulevard ...).
    lane_type, *rest = lane.split(maxsplit=1)

    lt = (
        # The API field is similar to a known lane type,
        # example: got "Av" for name "AV" (Avenue)
        LaneType.with_similar_name(lane_type)
        # The API field is similar to an existing value,
        # example: got "allee" for "Allée"
        or LaneType.with_similar_value(lane_type)
        # Maybe the geo API misspelled the lane type (happens sometimes),
        # so we use an aliases table as a last chance to get the type,
        # example: got "R" or "r" instead of "Rue"
        or find_lane_type_aliases(lane)
    )

    if lt:
        result["lane_type"] = lt.name
        # If the split was successful, then we can strip the lane type
        # from the lane name for a better result.
        result["lane"] = rest[0] if rest else lane_type
    else:
        return None, f"Impossible de trouver le type de voie : {lane_type} pour l'adresse : {address}"

    # INSEE code: must double check with ASP ref file.
    result["insee_code"] = address.get("insee_code")
    result["post_code"] = address.get("post_code")
    result["city"] = address.get("city")

    return result, None
def handle(self, dry_run=False, **options):
    """Import GEIQ structures from CSV_FILE into the database.

    Each row is validated and normalized; duplicated SIRETs are skipped
    (first come, first served). Unless `dry_run`, a `Siae` is created and
    enriched with BAN geocoding data, trying increasingly loose queries
    until a reliable result is found.
    """
    self.set_logger(options.get("verbosity"))
    with open(CSV_FILE) as csvfile:
        reader = csv.reader(csvfile, delimiter=";")
        for i, row in enumerate(reader):
            if i == 0:
                # Skip CSV header.
                continue
            self.logger.debug("-" * 80)
            name = row[0].strip()
            name = " ".join(name.split())  # Replace multiple spaces by a single space.
            self.logger.debug(name)
            address_line_1 = row[1].strip()
            address_line_1 = " ".join(address_line_1.split())
            self.logger.debug(address_line_1)
            address_line_2 = row[2].strip()
            address_line_2 = " ".join(address_line_2.split())
            self.logger.debug(address_line_2)
            city = row[3].strip()
            city = " ".join(city.split())
            self.logger.debug(city)
            post_code = row[4].strip()
            post_code = " ".join(post_code.split())
            self.logger.debug(post_code)
            email = row[5].strip()
            self.logger.debug(email)
            assert " " not in email
            phone = row[6].strip().replace(" ", "")
            assert len(phone) == 10
            self.logger.debug(phone)
            naf = row[7]
            self.logger.debug(naf)
            if naf:
                assert len(naf) == 5
            siret = row[8].strip()
            self.logger.debug(siret)
            assert len(siret) == 14
            if siret in SEEN_SIRET:
                # First come, first served.
                self.stderr.write("Siret already seen. Skipping.")
                continue
            SEEN_SIRET.add(siret)
            # Derive the department code from the post code.
            if post_code.startswith("20"):
                # Corsica: post codes do not map directly to the "2A"/"2B"
                # department codes.
                if post_code.startswith("200") or post_code.startswith("201"):
                    department = "2A"
                else:
                    # FIX: post codes 202xx-206xx are all in Haute-Corse.
                    # The previous `elif post_code.startswith("202")` left
                    # `department` unassigned (NameError) or stale from the
                    # previous row for 203xx-206xx post codes.
                    department = "2B"
            elif post_code.startswith("97") or post_code.startswith("98"):
                # Overseas departments use 3-digit codes.
                department = post_code[:3]
            else:
                department = post_code[:2]
            self.logger.debug(department)
            assert department in DEPARTMENTS
            siae_info = f"{siret} {name} - {address_line_1} - {post_code} {city}."
            self.logger.debug(siae_info)
            if not dry_run:
                siae = Siae()
                siae.siret = siret
                siae.naf = naf
                siae.kind = Siae.KIND_GEIQ
                siae.source = Siae.SOURCE_GEIQ
                siae.name = name
                siae.phone = phone
                siae.email = email
                siae.address_line_1 = address_line_1
                siae.address_line_2 = address_line_2
                siae.post_code = post_code
                siae.city = city
                siae.department = department
                if siae.address_on_one_line:
                    # Try increasingly loose geocoding queries until one
                    # returns a reliable score; otherwise keep the last
                    # (possibly unreliable or empty) result.
                    attempts = (
                        (siae.address_on_one_line, siae.post_code),
                        (siae.address_on_one_line, f"{siae.post_code[:2]}000"),
                        (siae.address_on_one_line, None),
                        (siae.address_line_1, None),
                        (siae.address_line_2, None),
                    )
                    geocoding_data = None
                    for query_address, query_post_code in attempts:
                        geocoding_data = get_geocoding_data(query_address, post_code=query_post_code)
                        if geocoding_data and geocoding_data["score"] >= API_BAN_RELIABLE_MIN_SCORE:
                            break
                    if not geocoding_data:
                        # Save the SIAE anyway, just without coords.
                        self.stderr.write(f"No geocoding data found for {siae_info}")
                        siae.save()
                        continue
                    siae.geocoding_score = geocoding_data["score"]
                    # If the score is greater than API_BAN_RELIABLE_MIN_SCORE, coords are reliable:
                    # use data returned by the BAN API because it's better written using accents etc.
                    # while the source data is in all caps etc.
                    # Otherwise keep the old address (which is probably wrong or incomplete).
                    if siae.geocoding_score >= API_BAN_RELIABLE_MIN_SCORE:
                        siae.address_line_1 = geocoding_data["address_line_1"]
                        siae.city = geocoding_data["city"]
                    else:
                        self.stderr.write(
                            f"Geocoding not reliable for {siae_info}\n{siae.address_on_one_line}"
                        )
                    self.logger.debug("-" * 40)
                    self.logger.debug(siae.address_line_1)
                    self.logger.debug(siae.city)
                    siae.coords = geocoding_data["coords"]
                siae.save()
    self.stdout.write("-" * 80)
    self.stdout.write("Done.")
def handle(self, dry_run=False, **options):
    """Import authorized prescriber organizations from CSV_FILE.

    Parses CSV_FILE row by row, disambiguates generic organization names
    with the city or department, then — unless `dry_run` — creates an
    authorized `PrescriberOrganization` with geocoded coordinates when
    available.
    """
    self.set_logger(options.get("verbosity"))
    with open(CSV_FILE) as csvfile:
        # Count lines in CSV (needed to compute the progress percentage).
        reader = csv.reader(csvfile, delimiter=",")
        row_count = sum(1 for row in reader)
        last_progress = 0
        # Reset the iterator to iterate through the reader again.
        csvfile.seek(0)
        for i, row in enumerate(reader):
            if i == 0:
                # Skip CSV header.
                continue
            progress = int((100 * i) / row_count)
            if progress > last_progress + 5:
                self.stdout.write(f"Creating prescriber organizations… {progress}%")
                last_progress = progress
            self.logger.debug("-" * 80)
            city = row[7].strip()
            self.logger.debug(city)
            department = row[0].strip()
            assert department in DEPARTMENTS
            self.logger.debug(department)
            name = row[1].strip()
            # Disambiguate generic names with the city or the department.
            if name == "MISSION LOCALE":
                name = f"{name} - {city}"
            elif name in [
                "CAP EMPLOI",
                "DIRECTION TERRITORIALE DE LA PROTECTION JUDICIAIRE DE LA JEUNESSE",
                "POLE EMPLOI",
                "SERVICE PENITENTIAIRE D'INSERTION ET DE PROBATION",
            ]:
                name = f"{name} - {department}"
            self.logger.debug(name)
            phone = row[10].strip()
            if phone:
                assert len(phone) == 10
            self.logger.debug(phone)
            email = row[9].strip()
            self.logger.debug(email)
            website = row[11].strip()
            self.logger.debug(website)
            post_code = row[6].strip()
            self.logger.debug(post_code)
            address_line_1 = row[2].strip()
            self.logger.debug(address_line_1)
            # Merge the misc address columns into one line.
            complement = row[3].strip()
            complement_2 = row[4].strip()
            bp_cs = row[5].strip()
            cedex = row[8].strip()
            address_line_2 = [complement, complement_2, bp_cs, cedex]
            address_line_2 = " - ".join(item for item in address_line_2 if item)
            self.logger.debug(address_line_2)
            if not dry_run:
                prescriber_organization = PrescriberOrganization()
                prescriber_organization.is_authorized = True
                prescriber_organization.name = name
                prescriber_organization.phone = phone
                prescriber_organization.email = email
                prescriber_organization.website = website
                prescriber_organization.address_line_1 = address_line_1
                prescriber_organization.address_line_2 = address_line_2
                prescriber_organization.post_code = post_code
                prescriber_organization.city = city
                prescriber_organization.department = department
                geocoding_data = get_geocoding_data(
                    "{}, {} {}".format(
                        prescriber_organization.address_line_1,
                        prescriber_organization.post_code,
                        prescriber_organization.city,
                    )
                )
                # FIX: the previous code read geocoding_data["coords"]
                # unconditionally and crashed with a TypeError when the
                # geocoding API returned no result. Save without coords
                # instead, matching the behavior of the sibling import
                # commands.
                if geocoding_data:
                    prescriber_organization.coords = geocoding_data["coords"]
                else:
                    self.stderr.write(
                        f"No geocoding data found for {name} - {post_code} {city}"
                    )
                prescriber_organization.save()
    self.stdout.write("-" * 80)
    self.stdout.write("Done.")