Example #1
def run_main():
    import_util.clean_temporary_tables()
    task = ScoreComputingJob()
    results = task.run()
    no_results = []
    departements = []
    for departement, result in results:
        departements.append(departement)
        if not result:
            no_results.append(departement)
    # abort if too many departements failed
    if len(no_results) > settings.MAXIMUM_COMPUTE_SCORE_JOB_FAILURES:
        successes = set(departements) - set(no_results)
        logger.warning(
            "compute_scores by departement : %i failures (%s) vs %i successes (%s), aborting...",
            len(no_results),
            ",".join(no_results),
            len(successes),
            ",".join(successes),
        )
        sys.exit(-1)

    import_util.reduce_scores_for_backoffice(departements)
    import_util.reduce_scores_for_main_db(departements)
    if COMPUTE_SCORES_DEBUG_MODE:
        logger.warning(
            "debug mode enabled, failing on purpose for debugging of temporary tables"
        )
        sys.exit(-1)
    import_util.clean_temporary_tables()
    logger.info("compute_scores task: FINISHED")
Example #2
    def delete_deletable_offices(self):
        con, cur = import_util.create_cursor()
        if self.deletable_sirets:
            # delete in chunks of 500 sirets to keep each statement small
            for sirets in chunks(list(self.deletable_sirets), 500):
                logger.info("deleting a chunk of %i offices...", len(sirets))
                # pass the sirets as query parameters rather than
                # interpolating them into the SQL string
                placeholders = ",".join(["%s"] * len(sirets))
                query = """DELETE FROM %s WHERE siret IN (%s)""" % (settings.RAW_OFFICE_TABLE, placeholders)
                try:
                    cur.execute(query, sirets)
                    con.commit()
                except Exception:
                    logger.warning("error while deleting chunk of sirets : %s", sirets)
                    raise
        cur.close()
        con.close()
        logger.info("%i no longer existing offices deleted.", len(self.deletable_sirets))
Example #3
    def get_csv_from_api(self, csv_path):
        # curl -X POST -F data=@path/to/file.csv -F columns=voie columns=ville -F
        # citycode=ma_colonne_code_insee https://api-adresse.data.gouv.fr/search/csv/

        logger.info("find coordinates on CSV {}".format(csv_path))

        BASE = "http://api-adresse.data.gouv.fr/search/csv/"

        values = {'columns': 'full_address', 'city_code': 'city_code'}

        # FIXME : Ugly way to wait for the API to be OK with our requests
        retry_counter = 5
        job_done = False

        while not job_done and retry_counter > 0:
            # reopen the file on every attempt: after a failed POST the file
            # handle has already been consumed, and reusing it would send
            # an empty payload on retry
            with open(csv_path, 'rb') as csv_file:
                response = session.post(BASE, files={'data': csv_file}, data=values)
            response.close()
            logger.info('STATUS RESPONSE : {} for csv {}'.format(
                response.status_code, csv_path))
            if response.status_code == 200:
                job_done = True
            else:
                retry_counter -= 1
                time.sleep(5)

        if job_done:
            GEOCODING_STATS['API status 200 for CSV'] = GEOCODING_STATS.get(
                'API status 200 for CSV', 0) + 1
            try:
                logger.info(
                    "API addr gouv response on CSV {} OK".format(csv_path))
                decoded_content = response.content.decode('utf-8')
                df_geocodes = pd.read_csv(io.StringIO(decoded_content),
                                          dtype={'siret': str})
                csv_api_back_path = csv_path + '-api'
                df_geocodes.to_csv(csv_api_back_path, index=False)
                csv_api_back.append(csv_api_back_path)
                logger.info("Wrote CSV sent back by API : {}".format(
                    csv_api_back_path))
            except ValueError:
                logger.warning('ValueError while parsing the CSV sent back by the API %s',
                               response.text)
        else:
            logger.warning(
                "API never returned 200 for the csv {}, giving up".format(csv_path))
        logger.info("GEOCODING_STATS = {} for CSV {}".format(
            GEOCODING_STATS, csv_path))
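Example #3 assumes some module-level state that is not shown: a shared HTTP session, the GEOCODING_STATS counters and the csv_api_back list of result files. A minimal sketch of that setup, assuming plain requests (the real module may configure retries or timeouts on the session):

import requests

session = requests.Session()  # reused across all geocoding API calls
GEOCODING_STATS = {}          # counters, e.g. 'API status 200 for CSV'
csv_api_back = []             # paths of the CSVs sent back by the API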
Example #4
    def create_geocoding_jobs(self):
        query = """
            select
                siret,
                numerorue,
                libellerue,
                codepostal,
                codecommune,
                coordinates_x,
                coordinates_y
            from %s
        """ % (settings.SCORE_REDUCING_TARGET_TABLE)
        if DEBUG_MODE:
            # in debug mode, only geocode a random sample of offices
            # query += "WHERE coordinates_x = 0 and coordinates_y = 0"
            query += "ORDER BY RAND() LIMIT 100000"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        rows = cur.fetchall()
        geocoding_jobs = []
        count = 0
        for row in rows:
            siret, street_number, street_name, zipcode, codecommune, coordinates_x, coordinates_y = row
            # resolve the city name from its INSEE commune code
            try:
                city = CITY_NAMES[codecommune]
            except KeyError:
                logger.warning("wrong codecommune: %s", codecommune)
                continue
            try:
                full_address = self.get_full_adress(street_number, street_name,
                                                    zipcode, city)
                initial_coordinates = [coordinates_x, coordinates_y]
                geocoding_jobs.append(
                    [siret, full_address, initial_coordinates, codecommune])
            except IncorrectAdressDataException:
                logger.warning("incorrect address for %s %s %s %s",
                               street_number, street_name, zipcode, city)
            count += 1
            GEOCODING_STATS['jobs'] = GEOCODING_STATS.get('jobs', 0) + 1
            if not count % 10000:
                logger.info("loading geocoding jobs from db... loaded %s rows",
                            count)
        logger.info("%i geocoding jobs created...", len(geocoding_jobs))
        cur.close()
        con.close()
        return geocoding_jobs
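get_full_adress and IncorrectAdressDataException belong to the same module but are not shown. A plausible minimal sketch, assuming the helper concatenates the address parts and rejects rows with missing mandatory fields (an assumption, not the source implementation):

class IncorrectAdressDataException(Exception):
    """Raised when an address is too incomplete to be geocoded."""

class GeocodeJob:  # hypothetical host class, for the sketch only
    def get_full_adress(self, street_number, street_name, zipcode, city):
        # require at least a street name, a zipcode and a city
        if not street_name or not zipcode or not city:
            raise IncorrectAdressDataException()
        return ("%s %s %s %s" % (street_number, street_name, zipcode, city)).strip()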
Example #5
    def find_coordinates_for_address(self):
        """
        finding coordinates for an address based on the BAN (base d'adresses nationale),
        an online governmental service.
        """
        coordinates = None
        # FIXME refer to settings.API_ADRESS_BASE_URL and make sure we don't
        # make real requests in unit tests
        BASE = "http://api-adresse.data.gouv.fr/search/?q="
        geocoding_request = "%s%s" % (BASE, self.full_address)
        geolocation = Geolocation.get(self.full_address)

        if geolocation:
            # coordinates were already queried and cached before
            coordinates = [geolocation.x, geolocation.y]
            GEOCODING_STATS['cache_hits'] = GEOCODING_STATS.get(
                'cache_hits', 0) + 1
        else:
            # coordinates need to be queried and cached
            # let requests URL-encode the address rather than concatenating it
            response = session.get(BASE, params={'q': self.full_address})
            response.close()
            GEOCODING_STATS['cache_misses'] = GEOCODING_STATS.get(
                'cache_misses', 0) + 1
            if response.status_code == 200:
                try:
                    results = response.json()['features']
                    if len(results) >= 1:
                        coordinates = results[0]['geometry']['coordinates']
                        # let's cache the result for later computations
                        geolocation = Geolocation(
                            full_address=self.full_address,
                            x=coordinates[0],
                            y=coordinates[1])
                        db_session.add(geolocation)

                        # as this method is run in parallel jobs,
                        # let's commit often so that each job sees the others' changes,
                        # and roll back in case of rare simultaneous changes to the same geolocation
                        try:
                            db_session.commit()
                            # note: commit() already flushes pending changes;
                            # this explicit flush() is kept as a defensive no-op,
                            # as this project configures its session with autoflush=False
                            db_session.flush()
                            GEOCODING_STATS['flushes'] = GEOCODING_STATS.get(
                                'flushes', 0) + 1
                        except IntegrityError:
                            # happens when a job tries to insert an already existing full_address
                            # rollback needed otherwise db_session is left
                            # in a state unusable by the other parallel jobs
                            db_session.rollback()
                            GEOCODING_STATS['rollbacks'] = GEOCODING_STATS.get(
                                'rollbacks', 0) + 1
                except ValueError:
                    logger.warning('ValueError in json-ing features result %s',
                                   response.text)

        if coordinates:
            if coordinates == self.initial_coordinates:
                GEOCODING_STATS['unchanged_coordinates'] = GEOCODING_STATS.get(
                    'unchanged_coordinates', 0) + 1
            else:
                GEOCODING_STATS['updatable_coordinates'] = GEOCODING_STATS.get(
                    'updatable_coordinates', 0) + 1
                self.updates.append([self.siret, coordinates])
        else:
            GEOCODING_STATS['coordinates_not_found'] = GEOCODING_STATS.get(
                'coordinates_not_found', 0) + 1
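The cache in Example #5 relies on a Geolocation model with a get lookup, and on an IntegrityError being raised when two parallel jobs insert the same address. A minimal SQLAlchemy sketch consistent with that usage (everything beyond the full_address/x/y columns and the get classmethod is an assumption):

from sqlalchemy import Column, Float, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Geolocation(Base):
    __tablename__ = 'geolocations'  # hypothetical table name
    # a primary key on full_address would explain the IntegrityError
    # raised when two jobs insert the same address simultaneously
    full_address = Column(String(255), primary_key=True)
    x = Column(Float)  # longitude, coordinates[0] in the BAN response
    y = Column(Float)  # latitude, coordinates[1] in the BAN response

    @classmethod
    def get(cls, full_address):
        # db_session: the project's scoped session (assumed available)
        return db_session.query(cls).filter_by(full_address=full_address).first()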