Example #1
 def create_geocoding_jobs(self):
     query = """select siret, numerorue, libellerue, codepostal, codecommune, coordinates_x, coordinates_y from %s""" % (
         settings.EXPORT_ETABLISSEMENT_TABLE)
     con, cur = import_util.create_cursor()
     cur.execute(query)
     rows = cur.fetchall()
     geocoding_jobs = []
     count = 0
     for row in rows:
         siret, street_number, street_name, zipcode, codecommune, coordinates_x, coordinates_y = row
         try:
             city = CITY_NAMES[codecommune]
         except KeyError:
             logger.warning("wrong codecommune: %s", codecommune)
             continue
         try:
             full_address = self.get_full_adress(street_number, street_name,
                                                 zipcode, city)
             initial_coordinates = [coordinates_x, coordinates_y]
             geocoding_jobs.append(
                 [siret, full_address, initial_coordinates])
         except IncorrectAdressDataException:
             logger.warning("incorrect address for %s %s %s %s",
                            street_number, street_name, zipcode, city)
         count += 1
         GEOCODING_STATS['jobs'] = GEOCODING_STATS.get('jobs', 0) + 1
         if not count % 10000:
             logger.info("loading geocoding jobs from db... loaded %s rows",
                         count)
     logger.info("%i geocoding jobs created...", len(geocoding_jobs))
     return geocoding_jobs
 def get_sirets_from_database(self):
     query = "select siret from %s where siret != ''" % settings.OFFICE_TABLE
     logger.info("get etablissements from database")
     con, cur = import_util.create_cursor()
     cur.execute(query)
     rows = cur.fetchall()
     return [row[0] for row in rows]
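
All of these examples lean on the same helper: import_util.create_cursor() hands back a (connection, cursor) pair that the caller commits on and closes. A minimal sketch of what such a helper could look like, assuming a MySQLdb-style driver and hypothetical setting names (DB_HOST etc. are illustrative, not the project's actual configuration):

import MySQLdb  # assumption: a MySQLdb/mysqlclient-style driver

def create_cursor():
    # open one fresh connection per call and return it with a cursor;
    # callers are responsible for commit() and close()
    con = MySQLdb.connect(host=settings.DB_HOST,      # hypothetical
                          user=settings.DB_USER,      # setting names
                          passwd=settings.DB_PASSWORD,
                          db=settings.DB_NAME,
                          use_unicode=True,
                          charset="utf8mb4")
    return con, con.cursor()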
Example #3
    def create_creatable_offices(self):
        """
        create new offices (that are not yet in our etablissement table)
        """
        con, cur = import_util.create_cursor()
        query = """INSERT into %s(siret, raisonsociale, enseigne, codenaf, numerorue,
            libellerue, codecommune, codepostal, email, tel, departement, trancheeffectif,
            website, flag_poe_afpr, flag_pmsmp)
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % settings.RAW_OFFICE_TABLE

        count = 0
        logger.info("create new offices in table %s",
                    settings.RAW_OFFICE_TABLE)
        statements = []
        MAX_COUNT_EXECUTE = 500
        for siret in self.creatable_sirets:
            statement = self.csv_offices[siret]["create_fields"]
            statements.append(statement)
            count += 1
            if not count % MAX_COUNT_EXECUTE:
                cur.executemany(query, statements)
                con.commit()
                statements = []
                if not count % 10000:
                    logger.info("created %s offices", count)
        if statements:
            cur.executemany(query, statements)
            con.commit()
        cur.close()
        con.close()
        logger.info("%i new offices created.", count)
    def update_updatable_offices(self):
        con, cur = import_util.create_cursor()
        query = """UPDATE %s SET
            raisonsociale=%%s,
            enseigne=%%s,
            codenaf=%%s,
            numerorue=%%s,
            libellerue=%%s,
            codecommune=%%s,
            codepostal=%%s,
            email=%%s,
            tel=%%s,
            departement=%%s,
            trancheeffectif=%%s,
            website1=%%s,
            website2=%%s
        where siret=%%s""" % settings.OFFICE_TABLE

        count = 0
        logger.info("update updatable etablissements in table %s",
                    settings.OFFICE_TABLE)
        statements = []
        MAX_COUNT_EXECUTE = 500
        for siret in self.updatable_sirets:
            statement = self.csv_offices[siret]["update_fields"]
            statements.append(statement)
            count += 1
            if not count % MAX_COUNT_EXECUTE:
                cur.executemany(query, statements)
                con.commit()
                statements = []
        if statements:
            cur.executemany(query, statements)
            con.commit()
        logger.info("%i etablissements updated.", count)
    def update_coordinates(self, updates):
        con, cur = import_util.create_cursor()
        count = 0
        statements = []
        update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % \
            settings.SCORE_REDUCING_TARGET_TABLE

        logger.info("Nb of offices to update : {}".format(len(updates)))

        for siret, coordinates in updates:
            count += 1
            statements.append([coordinates[0], coordinates[1], siret])
            if len(statements) == 1000:
                logger.info("geocoding with ban... %i of %i done", count,
                            len(updates))
                cur.executemany(update_query, statements)
                con.commit()
                statements = []

        if len(statements) >= 1:
            logger.info("geocoding with ban... %i of %i done", count,
                        len(updates))
            cur.executemany(update_query, statements)
            con.commit()

        cur.close()
        con.close()
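
The three methods above share one batching idiom: buffer rows, flush them with executemany() every MAX_COUNT_EXECUTE rows, then flush whatever is left after the loop. Reduced to a self-contained sketch (the function name and default batch size are illustrative):

def batched_write(con, cur, query, rows, batch_size=500):
    # buffer rows and flush them in fixed-size batches
    statements = []
    for row in rows:
        statements.append(row)
        if len(statements) == batch_size:
            cur.executemany(query, statements)
            con.commit()
            statements = []
    # the last batch is usually incomplete and must be flushed separately
    if statements:
        cur.executemany(query, statements)
        con.commit()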
Example #6
def run_sql_script(sql_script):
    con, cur = import_util.create_cursor()

    # naive split on ';' -- fine for simple scripts, but would break on
    # semicolons inside string literals
    for query in sql_script.split(';'):
        query = query.strip()
        if query:
            cur.execute(query)
            con.commit()
    cur.close()
    con.close()
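
Because the script is split naively on ';', this helper only suits scripts whose statements contain no embedded semicolons. A hypothetical call (the table name is invented for illustration):

run_sql_script('''
    DROP TABLE IF EXISTS etablissements_tmp;
    CREATE TABLE etablissements_tmp (siret VARCHAR(14) PRIMARY KEY);
''')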
Example #7
 def get_sirets_from_database(self):
     query = "select siret from %s" % settings.RAW_OFFICE_TABLE
     logger.info("get offices from database")
     con, cur = import_util.create_cursor()
     cur.execute(query)
     rows = cur.fetchall()
     cur.close()
     con.close()
     return [row[0] for row in rows if siret_util.is_siret(row[0])]
def check_departements(departements):
    # one cursor for the whole loop instead of one per departement;
    # a SELECT needs no commit
    con, cur = import_util.create_cursor()
    for dep in departements:
        cur.execute("select count(1) from %s where departement='%s'" %
                    (settings.OFFICE_TABLE, dep))
        count = cur.fetchone()[0]
        if count < 1000:
            logger.error("only %s results for departement %s", count, dep)
    cur.close()
    con.close()
Example #9
 def validate_coordinates(self):
     con, cur = import_util.create_cursor()
     query = """
     select
     sum(coordinates_x > 0 and coordinates_y > 0)/count(*)
     from %s
     """ % settings.EXPORT_ETABLISSEMENT_TABLE
     cur.execute(query)
     geocoding_ratio = cur.fetchall()[0][0]
     logger.info("geocoding_ratio = %s" % geocoding_ratio)
     if geocoding_ratio < 0.75:
         raise AbnormallyLowGeocodingRatioException
 def after_check(self):
     con, cur = import_util.create_cursor()
     for departement in settings.DEPARTEMENTS:
         query = "select count(1) from %s where departement='%s'" % (
             settings.OFFICE_TABLE, departement)
         cur.execute(query)
         result = cur.fetchone()
         count = result[0]
         logger.info("number of companies in departement %s : %i",
                     departement, count)
         if count <= 1000:
             raise Exception("too few companies")
Example #11
 def test_insert_data_from_file(self):
     file = get_available_files_list(
         path_folder=os.path.join(os.path.dirname(__file__), "data"))[0]
     insert_into_sql_table_old_prediction_file(file)
     insert_data(file, months_time=4)
     con, cur = import_util.create_cursor()
     cur.execute("select count(*) from etablissements_new;")
     number_new_offices = cur.fetchone()[0]
     self.assertEqual(number_new_offices, 2)
     self.assertEqual(
         PerfImporterCycleInfos.query.filter(
             PerfImporterCycleInfos.file_name == file).count(), 1)
Example #12
def populate_flag(flag):
    logger.info("populating %s ... " % flag)
    con, cur = import_util.create_cursor()
    query = """
        UPDATE
        %s e
        INNER JOIN %s f
        ON e.siret = f.siret
        SET e.%s = True;
    """ % (settings.EXPORT_ETABLISSEMENT_TABLE, flag, flag)
    cur.execute(query)
    con.commit()
    logger.info("completed populating %s ... " % flag)
def populate_flag(flag):
    logger.info("populating %s ... ", flag)
    con, cur = import_util.create_cursor()
    query = """
        UPDATE
        %s e
        INNER JOIN %s f
        ON e.siret = f.siret
        SET e.%s = True;
    """ % (settings.SCORE_REDUCING_TARGET_TABLE, flag, flag)
    cur.execute(query)
    con.commit()
    logger.info("completed populating %s ... ", flag)
    cur.close()
    con.close()
 def delete_deletable_offices(self):
     con, cur = import_util.create_cursor()
     if self.deletable_sirets:
         stringified_siret_list = ",".join(self.deletable_sirets)
         logger.info("going to delete %i offices...",
                     len(self.deletable_sirets))
         query = """DELETE FROM %s where siret IN (%s)""" % (
             settings.OFFICE_TABLE, stringified_siret_list)
         try:
             cur.execute(query)
             con.commit()
         except Exception:
             logger.warning("deletable_sirets=%s", self.deletable_sirets)
             raise
         logger.info("%i old offices deleted.", len(self.deletable_sirets))
Example #15
 def update_coordinates(self, coordinates_updates):
     con, cur = import_util.create_cursor()
     count = 0
     statements = []
     update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % settings.EXPORT_ETABLISSEMENT_TABLE
     for siret, coordinates in coordinates_updates:
         statements.append([coordinates[0], coordinates[1], siret])
         count += 1
         if not count % 1000:
             logger.info(
                 "geocoding with ban... %i done (example: coordinates_x=%s, coordinates_y=%s)",
                 count, statements[0][0], statements[0][1])
             cur.executemany(update_query, statements)
             con.commit()
             statements = []
     # flush the statements left over by the last incomplete batch
     if statements:
         cur.executemany(update_query, statements)
         con.commit()
     cur.close()
     con.close()
Example #16
 def delete_deletable_offices(self):
     con, cur = import_util.create_cursor()
     if self.deletable_sirets:
         for sirets in chunks(list(self.deletable_sirets), 500):
             stringified_siret_list = ",".join(sirets)
             logger.info("deleting a chunk of %i offices...", len(sirets))
             query = """DELETE FROM %s where siret IN (%s)""" % (settings.RAW_OFFICE_TABLE, stringified_siret_list)
             try:
                 cur.execute(query)
                 con.commit()
             except:
                 logger.warning("error while deleting chunk of sirets : %s", sirets)
                 raise
     cur.close()
     con.close()
     logger.info("%i no longer existing offices deleted.", len(self.deletable_sirets))
 def validate_coordinates(self):
     con, cur = import_util.create_cursor()
     query = """
     select
     sum(
         (coordinates_x > 0 or coordinates_x < 0)
         and
         (coordinates_y > 0 or coordinates_y < 0)
     )/count(*)
     from %s
     """ % settings.SCORE_REDUCING_TARGET_TABLE
     cur.execute(query)
     geocoding_ratio = cur.fetchall()[0][0]
     logger.info("geocoding_ratio = %s", geocoding_ratio)
     # close before the ratio check so the connection is not leaked on raise
     cur.close()
     con.close()
     if geocoding_ratio < settings.MINIMUM_GEOCODING_RATIO:
         raise AbnormallyLowGeocodingRatioException
def insert_into_sql_table_old_prediction_file(file):
    file_name = os.path.basename(file)
    logger.info(
        f"\n Start : Insert data into etablissements_new from file {file_name}"
    )
    con, cur = import_util.create_cursor()
    sql_file = gzip.open(file, 'rt', encoding='utf8')
    sql_as_string = sql_file.read()

    # Can't load the whole table at once, the file is too large (~ 400 MB),
    # so we have to split the sql file into multiple transactions
    drop_statement = "DROP TABLE IF EXISTS `etablissements_new`;"
    cur.execute(drop_statement)

    start_create_text = "CREATE TABLE "
    end_create_text = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;"
    start_create_statement_index = sql_as_string.find(start_create_text) + len(
        start_create_text)
    end_create_statement_index = sql_as_string.find(
        end_create_text, start_create_statement_index)
    create_statement = start_create_text + sql_as_string[
        start_create_statement_index:
        end_create_statement_index] + end_create_text
    cur.execute(create_statement)

    cur.execute("LOCK TABLES `etablissements_new` WRITE;")

    insert_statements = sql_as_string.split(
        "INSERT INTO `etablissements_new` VALUES")[1:]
    for statement in insert_statements:
        if "/*!40000 ALTER TABLE `etablissements_new` ENABLE KEYS */;" in statement:
            clean_insert_statement = "INSERT INTO `etablissements_new` VALUES" + \
                                     statement.split("/*!40000 ALTER TABLE `etablissements_new` ENABLE KEYS */;")[0]
        else:
            clean_insert_statement = "INSERT INTO `etablissements_new` VALUES" + statement
        cur.execute(clean_insert_statement)

    cur.execute("UNLOCK TABLES;")

    con.commit()
    cur.close()
    con.close()
    logger.info(
        f"\n End : Insert data into etablissements_new from file {file_name}")
Example #19
    def update_updatable_offices(self):
        # FIXME parallelize and/or batch for better performance
        con, cur = import_util.create_cursor()
        query = """UPDATE %s SET
            raisonsociale=%%s,
            enseigne=%%s,
            codenaf=%%s,
            numerorue=%%s,
            libellerue=%%s,
            codecommune=%%s,
            codepostal=%%s,
            email=%%s,
            tel=%%s,
            departement=%%s,
            trancheeffectif=%%s,
            website=%%s,
            flag_poe_afpr=%%s,
            flag_pmsmp=%%s
        where siret=%%s""" % settings.RAW_OFFICE_TABLE

        count = 0
        logger.info("update updatable offices in table %s",
                    settings.RAW_OFFICE_TABLE)
        statements = []
        MAX_COUNT_EXECUTE = 500
        for siret in self.updatable_sirets:
            statement = self.csv_offices[siret]["update_fields"]
            statements.append(statement)
            count += 1
            if not count % MAX_COUNT_EXECUTE:
                cur.executemany(query, statements)
                con.commit()
                statements = []
                if not count % 100000:
                    logger.info("updated %s offices", count)
        if statements:
            cur.executemany(query, statements)
            con.commit()
        cur.close()
        con.close()
        logger.info("%i offices updated.", count)
Example #20
def clear_useless_data(importer_cycle_infos_id):
    con, cur = import_util.create_cursor()
    for ici_id in importer_cycle_infos_id:
        cur.execute(
            "DELETE FROM perf_prediction_and_effective_hirings WHERE importer_cycle_infos_id = %s",
            [ici_id])
    # without an explicit commit the deletes are lost when autocommit is off
    con.commit()
    cur.close()
    con.close()
Example #21
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        date_pattern = ".*_(\d\d\d\d\d\d\d\d)"
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[-1]
            self.most_recent_data_date = datetime.strptime(date_part, "%Y%m%d")
            logger.debug("identified most_recent_data_date=%s" %
                         self.most_recent_data_date)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be dpae_XYZ_20xxxxxx.tar.gz"
            )

        count = 0
        statements = []
        something_new = False
        query = "INSERT into %s(siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)" % settings.DPAE_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        initial_most_recent_data_date = DpaeStatistics.get_most_recent_data_date()

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            initial_most_recent_data_date, self.most_recent_data_date)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()
            if "siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label = parse_dpae_line(
                        line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except DepartementException:
                    self.zipcode_errors += 1
                    continue
                except TooFewFieldsException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                if hiring_date > initial_most_recent_data_date and hiring_date <= self.most_recent_data_date:
                    statement = (siret, hiring_date, zipcode, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors >= 100:
            raise Exception('too many zipcode errors')
        if self.invalid_row_errors >= 100:
            raise Exception('too many invalid_row errors')
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.most_recent_data_date)
        statistics.save()
        con.commit()
        logger.info("finished importing dpae...")
        return something_new
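
The nested year/month/day dictionary built in the loop above is just a per-day hiring counter; collections.Counter keyed on the date expresses the same bookkeeping in a few lines (a sketch, not the importer's actual code):

from collections import Counter

def count_hirings_per_day(hiring_dates):
    # one bucket per calendar day, equivalent to the nested year/month/day dict
    distribution = Counter()
    for hiring_date in hiring_dates:
        distribution[hiring_date.date()] += 1
    return distribution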
import logging
import random
import urllib.request, urllib.parse, urllib.error

from locust import HttpLocust, TaskSet, task
from operator import itemgetter
from slugify import slugify

from labonneboite.common import geocoding
from labonneboite.conf import settings
from labonneboite.importer import util as import_util
from labonneboite.web.api import util


logger = logging.getLogger(__name__)
logger.info("loading locustfile")

con, cur = import_util.create_cursor()

# For each locust, number of seconds between its tasks. Default value: 1.
SECONDS_BETWEEN_TASKS = 1


def generate_siret_choices():
    cur.execute("select siret from %s limit 100000" % (settings.OFFICE_TABLE))
    con.commit()
    rows = cur.fetchall()
    return [row[0] for row in rows]


def generate_city_choices():
    cities_by_population = sorted(geocoding.get_cities(), key=itemgetter('population'), reverse=True)
    city_choices = []
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date in the file name
        # e.g. 'lbb_xdpdpae_delta_201611102200.bz2'
        # will match 20161110
        date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'  # keep only the date part, e.g. 20190910 = 10 September 2019
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y%m%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be \
                like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
                                        .filter(Hiring.contract_type.in_((Hiring.CONTRACT_TYPE_CDI,
                                                                          Hiring.CONTRACT_TYPE_CDD,
                                                                          Hiring.CONTRACT_TYPE_CTT))).first()[0]
        if last_historical_data_date_in_db is None:
            last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE
        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we keep all CDI, keep only CDD longer
                    # than 31 days, and ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (siret, hiring_date, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label, duree_pec,
                                 date_insertion)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        logger.info("verifying good number of dpae imported.")
        query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
        cur.execute(query, [
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file
        ])
        res = cur.fetchone()
        if res[0] != imported_dpae:
            raise DoublonException(
                f"Number of DPAE in DB ({res[0]}) does not match the DPAE file ({imported_dpae})."
            )
        logger.info("verifying number of DPAE: OK.")
        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type)
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, inserting via DpaeStatistics does not work
            # on the bonaparte server, so we insert directly via an SQL query.
            # This job had been broken for more than a year; this was the only way to fix it:
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime(
                '%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")

        logger.info("finished importing dpae...")
        return something_new
    def get_offices_from_file(self):
        logger.info("extracting %s...", self.input_filename)
        departements = settings.DEPARTEMENTS
        count = 0
        no_zipcode_count = 0
        departement_errors = 0
        unprocessable_departement_errors = 0
        format_errors = 0
        con, cur = import_util.create_cursor()
        departement_counter_dic = {}
        etablissements = {}

        with import_util.get_reader(self.input_filename) as myfile:
            header_line = myfile.readline().strip()
            if "siret" not in header_line:
                logger.debug(header_line)
                raise "wrong header line"
            for line in myfile:
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)

                try:
                    fields = import_util.get_fields_from_csv_line(line)
                    if len(fields) != 16:
                        logger.exception("wrong number of fields in line %s" %
                                         line)
                        raise ValueError

                    siret, raisonsociale, enseigne, codenaf, numerorue, \
                        libellerue, codecommune, codepostal, email, tel, \
                        trancheeffectif_etablissement, effectif_etablissement, \
                        trancheeffectif_entreprise, date_creation_entreprise, \
                        website1, website2 = fields
                except ValueError:
                    logger.exception("exception in line %s" % line)
                    format_errors += 1
                    continue

                website1 = encoding_util.strip_french_accents(website1)
                website2 = encoding_util.strip_french_accents(website2)
                email = encoding_util.strip_french_accents(email)

                if codecommune.strip():
                    try:
                        departement = extract_departement_from_zipcode(
                            codepostal, siret)
                        process_this_departement = departement in departements
                        if process_this_departement:

                            if len(codepostal) == 4:
                                codepostal = "0%s" % codepostal
                            etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                                codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                                website1, website2
                            etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                                codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                                website1, website2, siret
                            if codepostal.startswith(departement):
                                departement_counter_dic.setdefault(
                                    departement, 0)
                                departement_counter_dic[departement] += 1
                                etablissements[siret] = {
                                    "create_fields": etab_create_fields,
                                    "update_fields": etab_update_fields,
                                }
                            else:
                                logger.info(
                                    "zipcode and departement don't match: code commune: %s, code postal: %s, departement: %s",
                                    codecommune, codepostal, departement)
                        else:
                            unprocessable_departement_errors += 1
                    except DepartementException:
                        logger.exception("departement exception")
                        departement_errors += 1
                else:
                    no_zipcode_count += 1

        logger.info("%i etablissements total" % count)
        logger.info("%i etablissements with incorrect departement" %
                    departement_errors)
        logger.info("%i etablissements with unprocessable departement" %
                    unprocessable_departement_errors)
        logger.info("%i etablissements with no zipcodes", no_zipcode_count)
        logger.info("%i etablissements not read because of format error",
                    format_errors)
        logger.info("%i number of departements from file" %
                    len(departement_counter_dic))
        departement_count = sorted(departement_counter_dic.items())
        logger.info("per departement read %s", departement_count)
        logger.info("finished reading etablissements...")

        if departement_errors > 500:
            raise Exception("too many departement_errors")
        if unprocessable_departement_errors > 2000:
            raise Exception("too many unprocessable_departement_errors")
        if no_zipcode_count > 40000:
            raise Exception("too many no_zipcode_count")
        if format_errors > 5:
            raise Exception("too many format_errors")
        if len(departement_counter_dic) not in [
                96, 15
        ]:  # 96 in production, 15 in test
            logger.exception("incorrect total number of departements : %s",
                             len(departement_counter_dic))
            raise Exception("incorrect total number of departements")
        if len(departement_counter_dic) == 96:
            for departement, count in departement_count:
                if count < 10000:
                    logger.exception(
                        "only %s etablissements in departement %s",
                        count, departement)
                    raise Exception("not enough etablissements in at least one departement")

        return etablissements
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # the file name starts with the extraction date,
        # e.g. '20200803ExtractApp' yields 20200803
        date_string = self.input_filename.split('/')[-1][0:8]
        try:
            self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
        except ValueError:
            raise Exception("couldn't find a date pattern in filename. filename should be \
                like 20200803ExtractApp.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_alternance_contracts = 0
        imported_alternance_contracts_distribution = {}
        not_imported_alternance_contracts = 0

        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date))\
                                                            .filter(Hiring.contract_type == self.contract_type).first()[0]

        logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                    last_historical_data_date_in_db, self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()   # FIXME detect column positions from header
            
            if b"SIRET" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")

            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 10000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, departement = parse_alternance_line(line)
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue
                except InvalidSiretException:
                    error_message = traceback.format_exc()
                    logger.info("invalid siret met at row: %i", count)
                    logger.info(error_message)
                    self.invalid_siret_errors += 1
                    continue
                except InvalidZipCodeException:
                    logger.info("invalid zip code met at row: %i", count)
                    self.invalid_zipcode_errors += 1
                    continue
                
                # The date filter is deliberately skipped here:
                #   the data source has a lot of late contract inputs,
                #   so we have to insert ALL the contracts whatever their date.

                #  alternance_contract_should_be_imported = (
                #      hiring_date > last_historical_data_date_in_db
                #      and hiring_date <= self.last_historical_data_date_in_file
                #)

                if hiring_date <= self.last_historical_data_date_in_file:
                    statement = (
                        siret,
                        hiring_date,
                        self.contract_type,
                        departement,
                        None,  # contract_duration
                        None,  # iiann
                        None,  # tranche_age
                        None,  # handicap_label
                        None,  # duree_pec
                        date_insertion
                    )
                    statements.append(statement)
                    imported_alternance_contracts += 1

                    if hiring_date.year not in imported_alternance_contracts_distribution:
                        imported_alternance_contracts_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                    if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
            raise

        logger.info(f"Types de contrats à importer : {self.contract_name}")
        logger.info(f"processed {count} lba_contracts...")
        logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
        logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
        logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
        logger.info(f"invalid_row errors: {self.invalid_row_errors}")
        logger.info(f"invalid siret errors: {self.invalid_siret_errors}")
#        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
#            raise IOError('too many zipcode errors')
#        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
#            raise IOError('too many invalid_row errors')

        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type
            )
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, inserting via DpaeStatistics does not work
            # on the bonaparte server, so we insert directly via an SQL query.
            # This job had been broken for more than a year; this was the only way to fix it:
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")


        logger.info("finished importing dpae...")
        return something_new
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date
        # e.g. 'LBB_XDPDPAE_2018-09-12_2017-08-01.bz2'
        # will match 2018-09-12
        date_pattern = r'.*_(\d\d\d\d-\d\d-\d\d)_'
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y-%m-%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be \
                like LBB_XDPDPAE_YYYY-MM-DD_YYYY-MM-DD.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = DpaeStatistics.get_last_historical_data_date()

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we keep all CDI, keep only CDD longer
                    # than 31 days, and ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (
                        siret,
                        hiring_date,
                        contract_type,
                        departement,
                        contract_duration,
                        iiann,
                        tranche_age,
                        handicap_label,
                        duree_pec,
                    )
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        con.commit()
        cur.close()
        con.close()
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
        )
        statistics.save()
        logger.info("finished importing dpae...")
        return something_new