def run_main():
    import_util.clean_temporary_tables()

    task = ScoreComputingJob()
    results = task.run()

    # collect the departements that were processed and those that failed
    no_results = []
    departements = []
    for departement, result in results:
        departements.append(departement)
        if not result:
            no_results.append(departement)

    if len(no_results) > settings.MAXIMUM_COMPUTE_SCORE_JOB_FAILURES:
        successes = set(departements) - set(no_results)
        logger.warning(
            "compute_scores by departement : %i failures (%s) vs %i successes (%s), aborting...",
            len(no_results),
            ",".join(no_results),
            len(successes),
            ",".join(successes),
        )
        sys.exit(-1)

    import_util.reduce_scores_for_backoffice(departements)
    import_util.reduce_scores_for_main_db(departements)

    if COMPUTE_SCORES_DEBUG_MODE:
        logger.warning("debug mode enabled, failing on purpose for debugging of temporary tables")
        sys.exit(-1)

    import_util.clean_temporary_tables()
    logger.info("compute_scores task: FINISHED")
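# Illustrative sketch (not part of the original module): run_main() above assumes that
# ScoreComputingJob.run() returns an iterable of (departement, result) pairs, where a
# falsy result flags a failed departement. A minimal fake showing that expected shape:
def _fake_score_computing_results():
    # two successful departements and one failure, in the shape run_main() iterates over
    return [("57", True), ("75", True), ("2A", False)]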
def delete_deletable_offices(self):
    con, cur = import_util.create_cursor()
    if self.deletable_sirets:
        for sirets in chunks(list(self.deletable_sirets), 500):
            stringified_siret_list = ",".join(sirets)
            logger.info("deleting a chunk of %i offices...", len(sirets))
            query = """DELETE FROM %s where siret IN (%s)""" % (
                settings.RAW_OFFICE_TABLE, stringified_siret_list)
            try:
                cur.execute(query)
                con.commit()
            except:
                logger.warning("error while deleting chunk of sirets : %s", sirets)
                raise
    cur.close()
    con.close()
    logger.info("%i no longer existing offices deleted.", len(self.deletable_sirets))
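# Illustrative sketch (assumption, not the project's actual helper): delete_deletable_offices()
# above relies on a chunks() utility; it is assumed to slice a list into fixed-size batches
# so that no single DELETE statement carries more than 500 sirets:
def _chunks_sketch(items, size):
    """Yield successive `size`-sized slices of `items`."""
    for start in range(0, len(items), size):
        yield items[start:start + size]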
def get_csv_from_api(self, csv_path):
    # curl -X POST -F data=@path/to/file.csv -F columns=voie columns=ville -F
    # citycode=ma_colonne_code_insee https://api-adresse.data.gouv.fr/search/csv/
    logger.info("find coordinates on CSV {}".format(csv_path))
    BASE = "http://api-adresse.data.gouv.fr/search/csv/"
    files = {'data': open(csv_path, 'rb')}
    values = {'columns': 'full_address', 'city_code': 'city_code'}

    # FIXME : Ugly way to wait for the API to be OK with our requests
    retry_counter = 5
    job_done = False
    while not job_done and retry_counter > 0:
        response = session.post(BASE, files=files, data=values)
        response.close()
        logger.info('STATUS RESPONSE : {} for csv {}'.format(
            response.status_code, csv_path))
        if response.status_code == 200:
            job_done = True
        else:
            retry_counter -= 1
            time.sleep(5)

    if job_done:
        GEOCODING_STATS['API status 200 for CSV'] = GEOCODING_STATS.get(
            'API status 200 for CSV', 0) + 1
        try:
            logger.info("API addr gouv response on CSV {} OK".format(csv_path))
            decoded_content = response.content.decode('utf-8')
            df_geocodes = pd.read_csv(io.StringIO(decoded_content), dtype={'siret': str})
            csv_api_back_path = csv_path + '-api'
            df_geocodes.to_csv(csv_api_back_path, index=False)
            csv_api_back.append(csv_api_back_path)
            logger.info("Wrote CSV sent back by API : {}".format(csv_api_back_path))
        except ValueError:
            logger.warning('ValueError in json-ing features result %s', response.text)
    else:
        logger.info("The csv {} was not saved correctly".format(csv_path))

    logger.info("GEOCODING_STATS = {} for CSV {}".format(GEOCODING_STATS, csv_path))
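# Illustrative sketch (assumption): the CSV posted by get_csv_from_api() is expected to
# contain at least 'siret', 'full_address' and 'city_code' columns, as implied by the
# `values` payload above. One hypothetical way to build such a file with pandas:
def _write_geocoding_csv_sketch(path, rows):
    # rows: iterable of (siret, full_address, city_code) tuples
    df = pd.DataFrame(list(rows), columns=['siret', 'full_address', 'city_code'])
    df.to_csv(path, index=False)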
def create_geocoding_jobs(self):
    query = """
        select siret, numerorue, libellerue, codepostal, codecommune, coordinates_x, coordinates_y
        from %s
    """ % (settings.SCORE_REDUCING_TARGET_TABLE)
    if DEBUG_MODE:
        #query += "WHERE coordinates_x = 0 and coordinates_y = 0"
        query += "ORDER BY RAND() LIMIT 100000"
    con, cur = import_util.create_cursor()
    cur.execute(query)
    rows = cur.fetchall()
    geocoding_jobs = []
    count = 0
    for row in rows:
        siret, street_number, street_name, zipcode, codecommune, coordinates_x, coordinates_y = row
        try:
            city = CITY_NAMES[codecommune]
        except KeyError:
            logger.warning("wrong codecommune: %s", codecommune)
            continue
        try:
            full_address = self.get_full_adress(street_number, street_name, zipcode, city)
            initial_coordinates = [coordinates_x, coordinates_y]
            geocoding_jobs.append([siret, full_address, initial_coordinates, codecommune])
        except IncorrectAdressDataException:
            logger.warning("incorrect address for %s %s %s %s",
                           street_number, street_name, zipcode, city)
        count += 1
        GEOCODING_STATS['jobs'] = GEOCODING_STATS.get('jobs', 0) + 1
        if not count % 10000:
            logger.info("loading geocoding jobs from db... loaded %s rows", count)
    logger.info("%i geocoding jobs created...", len(geocoding_jobs))
    cur.close()
    con.close()
    return geocoding_jobs
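# Illustrative sketch (assumption, not the project's implementation): get_full_adress(),
# called above, is assumed to join the address parts into a single string and to raise
# IncorrectAdressDataException when mandatory parts are missing, along these lines:
def _get_full_adress_sketch(street_number, street_name, zipcode, city):
    if not street_name or not city:
        raise IncorrectAdressDataException
    parts = [street_number, street_name, zipcode, city]
    return " ".join(str(part).strip() for part in parts if part)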
def find_coordinates_for_address(self):
    """
    Finding coordinates for an address based on the BAN (base d'adresses nationale),
    an online governmental service.
    """
    coordinates = None
    # FIXME refer to settings.API_ADRESS_BASE_URL and make sure we don't
    # make real requests in unit tests
    BASE = "http://api-adresse.data.gouv.fr/search/?q="
    geocoding_request = "%s%s" % (BASE, self.full_address)
    geolocation = Geolocation.get(self.full_address)

    if geolocation:
        # coordinates were already queried and cached before
        coordinates = [geolocation.x, geolocation.y]
        GEOCODING_STATS['cache_hits'] = GEOCODING_STATS.get('cache_hits', 0) + 1
    else:
        # coordinates need to be queried and cached
        response = session.get(geocoding_request)
        response.close()
        GEOCODING_STATS['cache_misses'] = GEOCODING_STATS.get('cache_misses', 0) + 1
        if response.status_code == 200:
            try:
                results = response.json()['features']
                if len(results) >= 1:
                    coordinates = results[0]['geometry']['coordinates']
                    # let's cache the result for later computations
                    geolocation = Geolocation(
                        full_address=self.full_address,
                        x=coordinates[0],
                        y=coordinates[1])
                    db_session.add(geolocation)
                    # as this method is run in parallel jobs,
                    # let's commit often so that each job sees the others' changes
                    # and rollback in case of rare simultaneous changes on the same geolocation
                    try:
                        db_session.commit()
                        # usually flush() is called as part of commit()
                        # however it is not the case in our project
                        # because autoflush=False
                        db_session.flush()
                        GEOCODING_STATS['flushes'] = GEOCODING_STATS.get('flushes', 0) + 1
                    except IntegrityError:
                        # happens when a job tries to insert an already existing full_address
                        # rollback needed, otherwise db_session is left
                        # in a state unusable by the other parallel jobs
                        db_session.rollback()
                        GEOCODING_STATS['rollbacks'] = GEOCODING_STATS.get('rollbacks', 0) + 1
            except ValueError:
                logger.warning('ValueError in json-ing features result %s', response.text)

    if coordinates:
        if coordinates == self.initial_coordinates:
            GEOCODING_STATS['unchanged_coordinates'] = GEOCODING_STATS.get(
                'unchanged_coordinates', 0) + 1
        else:
            GEOCODING_STATS['updatable_coordinates'] = GEOCODING_STATS.get(
                'updatable_coordinates', 0) + 1
            self.updates.append([self.siret, coordinates])
    else:
        GEOCODING_STATS['coordinates_not_found'] = GEOCODING_STATS.get(
            'coordinates_not_found', 0) + 1
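# Illustrative note (no change to the code above): the repeated
# "GEOCODING_STATS.get(key, 0) + 1" pattern is equivalent to incrementing a
# collections.Counter, an idiomatic alternative if the module-level dict were replaced:
#
#     from collections import Counter
#     GEOCODING_STATS = Counter()
#     GEOCODING_STATS['cache_hits'] += 1  # missing keys default to 0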