def create_geocoding_jobs(self):
    """Build the list of geocoding jobs from the export etablissement table.

    Returns a list of ``[siret, full_address, initial_coordinates]`` entries,
    skipping rows whose codecommune is unknown or whose address data raises
    IncorrectAdressDataException.
    """
    query = """select siret, numerorue, libellerue, codepostal, codecommune, coordinates_x, coordinates_y from %s""" % (
        settings.EXPORT_ETABLISSEMENT_TABLE)
    con, cur = import_util.create_cursor()
    cur.execute(query)
    rows = cur.fetchall()
    geocoding_jobs = []
    count = 0
    for row in rows:
        siret, street_number, street_name, zipcode, codecommune, coordinates_x, coordinates_y = row
        try:
            # codecommune -> city name lookup; unknown codes are skipped entirely
            city = CITY_NAMES[codecommune]
        except KeyError:
            logger.warning("wrong codecommune: %s", codecommune)
            continue
        try:
            full_address = self.get_full_adress(street_number, street_name, zipcode, city)
            initial_coordinates = [coordinates_x, coordinates_y]
            geocoding_jobs.append([siret, full_address, initial_coordinates])
        except IncorrectAdressDataException:
            logger.warning("incorrect address for %s %s %s %s",
                           street_number, street_name, zipcode, city)
        # counters track rows processed (both correct and incorrect addresses);
        # NOTE(review): reconstructed indentation — presumably loop-level, since
        # the "loaded %s rows" log reads as a progress counter. Confirm.
        count += 1
        GEOCODING_STATS['jobs'] = GEOCODING_STATS.get('jobs', 0) + 1
        if not count % 10000:
            logger.info("loading geocoding jobs from db... loaded %s rows", count)
    logger.info("%i geocoding jobs created...", len(geocoding_jobs))
    return geocoding_jobs
def get_sirets_from_database(self):
    """Return every non-empty siret currently present in the office table."""
    logger.info("get etablissements from database")
    _, cursor = import_util.create_cursor()
    cursor.execute("select siret from %s where siret != ''" % settings.OFFICE_TABLE)
    return [record[0] for record in cursor.fetchall()]
def create_creatable_offices(self):
    """
    Create new offices (that are not yet in our etablissement table).

    Rows are inserted in batches of MAX_COUNT_EXECUTE via executemany;
    the final partial batch is flushed at the end, then the connection
    is closed.
    """
    con, cur = import_util.create_cursor()
    query = """INSERT into %s(siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, codecommune, codepostal, email, tel, departement, trancheeffectif, website, flag_poe_afpr, flag_pmsmp) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % settings.RAW_OFFICE_TABLE
    # Fix: count starts at 0 (was 1). The previous off-by-one made the final
    # "%i new offices created." log overstate the count by one, and shifted
    # every batching threshold by one row.
    count = 0
    logger.info("create new offices in table %s", settings.RAW_OFFICE_TABLE)
    statements = []
    MAX_COUNT_EXECUTE = 500  # batch size for executemany
    for siret in self.creatable_sirets:
        statements.append(self.csv_offices[siret]["create_fields"])
        count += 1
        if not count % MAX_COUNT_EXECUTE:
            # flush a full batch
            cur.executemany(query, statements)
            con.commit()
            statements = []
        if not count % 10000:
            logger.info("created %s offices", count)
    if statements:
        # flush the final partial batch
        cur.executemany(query, statements)
        con.commit()
    cur.close()
    con.close()
    logger.info("%i new offices created.", count)
def update_updatable_offices(self):
    """Apply pending field updates to the office table, 500 rows per batch.

    Fixes over the previous version: count starts at 0 (the final log
    overstated updates by one), lazy logger args instead of eager %-format,
    and the cursor/connection are closed when done.
    """
    con, cur = import_util.create_cursor()
    query = """UPDATE %s SET raisonsociale=%%s, enseigne=%%s, codenaf=%%s, numerorue=%%s, libellerue=%%s, codecommune=%%s, codepostal=%%s, email=%%s, tel=%%s, departement=%%s, trancheeffectif=%%s, website1=%%s, website2=%%s where siret=%%s""" % settings.OFFICE_TABLE
    # Fix: start at 0 (was 1) so "%i etablissements updated." is accurate.
    count = 0
    # lazy %-args instead of eager formatting, consistent with the rest of the file
    logger.info("update updatable etablissements in table %s", settings.OFFICE_TABLE)
    statements = []
    MAX_COUNT_EXECUTE = 500  # batch size for executemany
    for siret in self.updatable_sirets:
        statements.append(self.csv_offices[siret]["update_fields"])
        count += 1
        if not count % MAX_COUNT_EXECUTE:
            cur.executemany(query, statements)
            con.commit()
            statements = []
    if statements:
        # flush the final partial batch
        cur.executemany(query, statements)
        con.commit()
    # close resources (previously leaked; the sibling raw-office updater closes them)
    cur.close()
    con.close()
    logger.info("%i etablissements updated.", count)
def update_coordinates(self, updates):
    """Write new (x, y) coordinates for each siret, in batches of 1000."""
    connection, cursor = import_util.create_cursor()
    update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % \
        settings.SCORE_REDUCING_TARGET_TABLE
    total = len(updates)
    logger.info("Nb of offices to update : {}".format(total))
    batch = []
    done = 0
    for siret, coords in updates:
        done += 1
        batch.append([coords[0], coords[1], siret])
        if len(batch) == 1000:
            # full batch: write it out and start a fresh one
            logger.info("geocoding with ban... %i of %i done", done, total)
            cursor.executemany(update_query, batch)
            connection.commit()
            batch = []
    if batch:
        # final partial batch
        logger.info("geocoding with ban... %i of %i done", done, total)
        cursor.executemany(update_query, batch)
        connection.commit()
    cursor.close()
    connection.close()
def run_sql_script(sql_script):
    """Execute each ';'-separated statement of the script, committing after each one."""
    connection, cursor = import_util.create_cursor()
    for raw_statement in sql_script.split(';'):
        statement = raw_statement.strip()
        if not statement:
            continue
        cursor.execute(statement)
        connection.commit()
def get_sirets_from_database(self):
    """Return all syntactically valid sirets found in the raw office table."""
    logger.info("get offices from database")
    connection, cursor = import_util.create_cursor()
    cursor.execute("select siret from %s" % settings.RAW_OFFICE_TABLE)
    records = cursor.fetchall()
    cursor.close()
    connection.close()
    return [record[0] for record in records if siret_util.is_siret(record[0])]
def check_departements(departements):
    """Sanity-check that each departement has a minimal number of offices.

    Logs an error for every departement holding fewer than 1000 rows.

    Fixes over the previous version: the connection is created once instead
    of once per departement (each one was leaked), it is closed at the end,
    and the pointless commit after a SELECT is gone.
    """
    con, cur = import_util.create_cursor()
    for dep in departements:
        cur.execute("select count(1) from %s where departement='%s'" % (settings.OFFICE_TABLE, dep))
        count = cur.fetchone()[0]
        if count < 1000:
            logger.error("only %s results for departement %s", count, dep)
    cur.close()
    con.close()
def validate_coordinates(self):
    """Raise AbnormallyLowGeocodingRatioException when fewer than 75% of the
    export-table offices have positive coordinates."""
    con, cur = import_util.create_cursor()
    query = """ select sum(coordinates_x > 0 and coordinates_y > 0)/count(*) from %s """ % settings.EXPORT_ETABLISSEMENT_TABLE
    cur.execute(query)
    geocoding_ratio = cur.fetchall()[0][0]
    # lazy %-args instead of eager formatting, consistent with the twin validator
    logger.info("geocoding_ratio = %s", geocoding_ratio)
    # close resources before (possibly) raising — previously leaked
    cur.close()
    con.close()
    if geocoding_ratio < 0.75:
        raise AbnormallyLowGeocodingRatioException
def after_check(self):
    """Post-import check: every departement must hold more than 1000 companies.

    Raises ValueError as soon as a departement is at or under the threshold.
    """
    con, cur = import_util.create_cursor()
    for departement in settings.DEPARTEMENTS:
        query = "select count(1) from %s where departement='%s'" % (
            settings.OFFICE_TABLE, departement)
        cur.execute(query)
        result = cur.fetchone()
        count = result[0]
        logger.info("number of companies in departement %s : %i", departement, count)
        if count <= 1000:
            # Fix: `raise "too few companies"` raised a plain string, which is
            # a TypeError in Python 3 — raise a real exception instead.
            raise ValueError("too few companies")
def test_insert_data_from_file(self):
    """End-to-end check: loading the first sample file yields exactly 2 rows
    in etablissements_new and registers exactly one importer cycle for it."""
    # 'data_file' instead of 'file': don't shadow the builtin
    data_file = get_available_files_list(
        path_folder=os.path.join(os.path.dirname(__file__), "data"))[0]
    insert_into_sql_table_old_prediction_file(data_file)
    insert_data(data_file, months_time=4)
    con, cur = import_util.create_cursor()
    cur.execute("select count(*) from etablissements_new;")
    number_new_offices = cur.fetchone()[0]
    # assertEqual gives a useful failure message, unlike assertTrue(x == 2)
    self.assertEqual(number_new_offices, 2)
    self.assertEqual(
        PerfImporterCycleInfos.query.filter(
            PerfImporterCycleInfos.file_name == data_file).count(), 1)
def populate_flag(flag):
    """Set boolean column ``flag`` to True on every export-table office whose
    siret appears in the table named after the flag.

    Fixes over the previous version: lazy logger args instead of eager
    %-formatting, and the cursor/connection are closed — consistent with the
    twin populate_flag targeting the score reducing table.
    """
    logger.info("populating %s ... ", flag)
    con, cur = import_util.create_cursor()
    query = """ UPDATE %s e INNER JOIN %s f ON e.siret = f.siret SET e.%s = True; """ % (settings.EXPORT_ETABLISSEMENT_TABLE, flag, flag)
    cur.execute(query)
    con.commit()
    logger.info("completed populating %s ... ", flag)
    cur.close()
    con.close()
def populate_flag(flag):
    """Set boolean column ``flag`` to True for every office of the score
    reducing target table whose siret is present in the table named after
    the flag."""
    logger.info("populating %s ... ", flag)
    connection, cursor = import_util.create_cursor()
    join_update = """ UPDATE %s e INNER JOIN %s f ON e.siret = f.siret SET e.%s = True; """ % (
        settings.SCORE_REDUCING_TARGET_TABLE, flag, flag)
    cursor.execute(join_update)
    connection.commit()
    logger.info("completed populating %s ... ", flag)
    cursor.close()
    connection.close()
def delete_deletable_offices(self):
    """Delete from the office table every siret listed in self.deletable_sirets.

    Fixes over the previous version: the bare ``except:`` is narrowed to
    ``except Exception``, the warning uses lazy logger args, and the
    cursor/connection are closed (previously leaked — the chunked raw-office
    variant closes them).
    """
    con, cur = import_util.create_cursor()
    if self.deletable_sirets:
        stringified_siret_list = ",".join(self.deletable_sirets)
        logger.info("going to delete %i offices...", len(self.deletable_sirets))
        query = """DELETE FROM %s where siret IN (%s)""" % (
            settings.OFFICE_TABLE, stringified_siret_list)
        try:
            cur.execute(query)
            con.commit()
        except Exception:
            # log the offending siret list, then propagate
            logger.warning("deletable_sirets=%s", self.deletable_sirets)
            raise
    cur.close()
    con.close()
    logger.info("%i old offices deleted.", len(self.deletable_sirets))
def update_coordinates(self, coordinates_updates):
    """Write new (x, y) coordinates for each siret into the export table,
    in batches of 1000.

    Fixes over the previous version: the final partial batch is now flushed
    (up to 999 updates were silently dropped because leftover statements
    were never executed after the loop), and the cursor/connection are
    closed — matching the sibling update_coordinates on the score reducing
    table.
    """
    con, cur = import_util.create_cursor()
    count = 0
    statements = []
    update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % settings.EXPORT_ETABLISSEMENT_TABLE
    for siret, coordinates in coordinates_updates:
        statements.append([coordinates[0], coordinates[1], siret])
        count += 1
        if len(statements) == 1000:
            logger.info(
                "geocoding with ban... %i done (example: coordinates_x=%s, coordinates_y=%s",
                count, statements[0][0], statements[0][1])
            cur.executemany(update_query, statements)
            con.commit()
            statements = []
    if statements:
        # flush the final partial batch (previously lost)
        cur.executemany(update_query, statements)
        con.commit()
    cur.close()
    con.close()
def delete_deletable_offices(self):
    """Remove, in chunks of 500, every raw office whose siret no longer exists."""
    connection, cursor = import_util.create_cursor()
    if self.deletable_sirets:
        for siret_chunk in chunks(list(self.deletable_sirets), 500):
            logger.info("deleting a chunk of %i offices...", len(siret_chunk))
            delete_query = """DELETE FROM %s where siret IN (%s)""" % (
                settings.RAW_OFFICE_TABLE, ",".join(siret_chunk))
            try:
                cursor.execute(delete_query)
                connection.commit()
            except:
                # log which chunk failed, then propagate
                logger.warning("error while deleting chunk of sirets : %s", siret_chunk)
                raise
    cursor.close()
    connection.close()
    logger.info("%i no longer existing offices deleted.", len(self.deletable_sirets))
def validate_coordinates(self):
    """Raise AbnormallyLowGeocodingRatioException when the share of offices
    with non-zero coordinates falls below MINIMUM_GEOCODING_RATIO."""
    connection, cursor = import_util.create_cursor()
    ratio_query = """ select sum( (coordinates_x > 0 or coordinates_x < 0) and (coordinates_y > 0 or coordinates_y < 0) )/count(*) from %s """ % settings.SCORE_REDUCING_TARGET_TABLE
    cursor.execute(ratio_query)
    geocoding_ratio = cursor.fetchall()[0][0]
    logger.info("geocoding_ratio = %s", geocoding_ratio)
    if geocoding_ratio < settings.MINIMUM_GEOCODING_RATIO:
        raise AbnormallyLowGeocodingRatioException
    cursor.close()
    connection.close()
def insert_into_sql_table_old_prediction_file(file):
    """Load a gzipped SQL dump into a fresh `etablissements_new` table.

    The dump is too large (~400mb) to execute in one shot, so the CREATE
    TABLE statement and each INSERT are located by plain string search and
    executed one at a time.
    """
    file_name = os.path.basename(file)
    logger.info(
        f"\n Start : Insert data into etablissements_new from file {file_name}"
    )
    con, cur = import_util.create_cursor()
    sql_file = gzip.open(file, 'rt', encoding='utf8')
    sql_as_string = sql_file.read()
    # Cant load the whole table at once, the file is to large ( ~ 400mb)
    # So we have to split the sql file in multiple transactions
    drop_statement = "DROP TABLE IF EXISTS `etablissements_new`;"
    cur.execute(drop_statement)
    # Extract the CREATE TABLE statement between its opening keyword and the
    # engine/charset suffix that terminates it in the dump.
    start_create_text = "CREATE TABLE "
    end_create_text = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;"
    start_create_statement_index = sql_as_string.find(start_create_text) + len(
        start_create_text)
    end_create_statement_index = sql_as_string.find(
        end_create_text, start_create_statement_index)
    create_statement = start_create_text + sql_as_string[
        start_create_statement_index:
        end_create_statement_index] + end_create_text
    cur.execute(create_statement)
    cur.execute("LOCK TABLES `etablissements_new` WRITE;")
    # After splitting, each chunk starts right after the INSERT prefix, so the
    # prefix is re-added before execution.
    insert_statements = sql_as_string.split(
        "INSERT INTO `etablissements_new` VALUES")[1:]
    for statement in insert_statements:
        if "/*!40000 ALTER TABLE `etablissements_new` ENABLE KEYS */;" in statement:
            # last chunk: strip the trailing ENABLE KEYS directive
            clean_insert_statement = "INSERT INTO `etablissements_new` VALUES" + \
                statement.split("/*!40000 ALTER TABLE `etablissements_new` ENABLE KEYS */;")[0]
        else:
            clean_insert_statement = "INSERT INTO `etablissements_new` VALUES" + statement
        cur.execute(clean_insert_statement)
    cur.execute("UNLOCK TABLES;")
    con.commit()
    # NOTE(review): leftover debug comment in original ("foo test resolution du TO")
    cur.close()
    con.close()
    logger.info(
        f"\n End : Insert data into etablissements_new from file {file_name}")
def update_updatable_offices(self):
    """Update every updatable raw office with its fresh CSV fields, grouping
    statements 500 at a time for executemany."""
    # FIXME parallelize and/or batch for better performance
    connection, cursor = import_util.create_cursor()
    update_query = """UPDATE %s SET raisonsociale=%%s, enseigne=%%s, codenaf=%%s, numerorue=%%s, libellerue=%%s, codecommune=%%s, codepostal=%%s, email=%%s, tel=%%s, departement=%%s, trancheeffectif=%%s, website=%%s, flag_poe_afpr=%%s, flag_pmsmp=%%s where siret=%%s""" % settings.RAW_OFFICE_TABLE
    logger.info("update updatable offices in table %s", settings.RAW_OFFICE_TABLE)
    BATCH_SIZE = 500
    batch = []
    count = 0
    for siret in self.updatable_sirets:
        batch.append(self.csv_offices[siret]["update_fields"])
        count += 1
        if not count % BATCH_SIZE:
            # full batch: write it out and start a fresh one
            cursor.executemany(update_query, batch)
            connection.commit()
            batch = []
        if not count % 100000:
            logger.info("updated %s offices", count)
    if batch:
        # final partial batch
        cursor.executemany(update_query, batch)
        connection.commit()
    cursor.close()
    connection.close()
    logger.info("%i offices updated.", count)
def clear_useless_data(importer_cycle_infos_id):
    """Delete perf_prediction_and_effective_hirings rows belonging to each
    given importer cycle id.

    Fixes over the previous version: the DELETEs are committed (previously
    never committed — the rest of this file commits explicitly, so autocommit
    is presumably off) and the cursor/connection are closed.
    """
    con, cur = import_util.create_cursor()
    for ici_id in importer_cycle_infos_id:
        cur.execute(
            "DELETE FROM perf_prediction_and_effective_hirings WHERE importer_cycle_infos_id = %s",
            [ici_id])
    con.commit()
    cur.close()
    con.close()
def run_task(self):
    """Import all new DPAE lines from the input file into the dpae table.

    Only lines whose hiring_date lies strictly after the most recent data
    date already recorded (and up to the date encoded in the filename) are
    inserted. Returns True when at least one batch was committed.
    """
    logger.info("extracting %s ", self.input_filename)
    # the filename carries the data date as YYYYMMDD, e.g. dpae_XYZ_20180912.tar.gz
    date_pattern = ".*_(\d\d\d\d\d\d\d\d)"
    date_match = re.match(date_pattern, self.input_filename)
    if date_match:
        date_part = date_match.groups()[-1]
        self.most_recent_data_date = datetime.strptime(date_part, "%Y%m%d")
        logger.debug("identified most_recent_data_date=%s" % self.most_recent_data_date)
    else:
        raise Exception(
            "couldn't find a date pattern in filename. filename should be dpae_XYZ_20xxxxxx.tar.gz"
        )
    count = 0
    statements = []
    something_new = False
    query = "INSERT into %s(siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)" % settings.DPAE_TABLE
    imported_dpae = 0
    imported_dpae_distribution = {}  # per-day histogram: year -> month -> day -> count
    not_imported_dpae = 0
    initial_most_recent_data_date = DpaeStatistics.get_most_recent_data_date(
    )
    logger.info(
        "will now extract all dpae with hiring_date between %s and %s"
        % (initial_most_recent_data_date, self.most_recent_data_date))
    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()
        if "siret" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")
        for line in myfile:
            count += 1
            if not count % 100000:
                logger.debug("reading line %i", count)
                # flush the accumulated INSERT batch every 100000 lines
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error(
                        "error in executing statement into dpae table: %s",
                        sys.exc_info()[1])
                    statements = []
                    raise
            try:
                siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label = parse_dpae_line(
                    line)
            except ValueError:
                self.zipcode_errors += 1
                continue
            except DepartementException:
                self.zipcode_errors += 1
                continue
            except TooFewFieldsException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue
            # only keep lines inside the (db date, file date] window
            if hiring_date > initial_most_recent_data_date and hiring_date <= self.most_recent_data_date:
                statement = (siret, hiring_date, zipcode, contract_type,
                             departement, contract_duration, iiann,
                             tranche_age, handicap_label)
                statements.append(statement)
                imported_dpae += 1
                if hiring_date.year not in imported_dpae_distribution:
                    imported_dpae_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_dpae_distribution[
                        hiring_date.year]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month] = {}
                if hiring_date.day not in imported_dpae_distribution[
                        hiring_date.year][hiring_date.month]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] = 0
                imported_dpae_distribution[hiring_date.year][
                    hiring_date.month][hiring_date.day] += 1
            else:
                not_imported_dpae += 1
    # run remaining statements
    try:
        cur.executemany(query, statements)
        something_new = True
    except:
        logger.error("error in executing statement into dpae table: %s",
                     sys.exc_info()[1])
        raise
    logger.info("processed %i dpae...", count)
    logger.info("imported dpae: %i", imported_dpae)
    logger.info("not imported dpae: %i", not_imported_dpae)
    logger.info("zipcode errors: %i", self.zipcode_errors)
    logger.info("invalid_row errors: %i", self.invalid_row_errors)
    if self.zipcode_errors >= 100:
        raise Exception('too many zipcode errors')
    if self.invalid_row_errors >= 100:
        raise Exception('too many invalid_row errors')
    # record this run so the next import starts from the new date
    statistics = DpaeStatistics(
        last_import=datetime.now(),
        most_recent_data_date=self.most_recent_data_date)
    statistics.save()
    con.commit()
    logger.info("finished importing dpae...")
    return something_new
import random import urllib.request, urllib.parse, urllib.error from locust import HttpLocust, TaskSet, task from slugify import slugify from labonneboite.common import geocoding from labonneboite.conf import settings from labonneboite.importer import util as import_util from labonneboite.web.api import util logger = logging.getLogger(__name__) logger.info("loading locustfile") con, cur = import_util.create_cursor() # For each locust, number of seconds between its tasks. Default value: 1. SECONDS_BETWEEN_TASKS = 1 def generate_siret_choices(): cur.execute("select siret from %s limit 100000" % (settings.OFFICE_TABLE)) con.commit() rows = cur.fetchall() return [row[0] for row in rows] def generate_city_choices(): cities_by_population = sorted(geocoding.get_cities(), key=itemgetter('population'), reverse=True) city_choices = []
def run_task(self):
    """Import new DPAE hirings from a delta file into the hirings table.

    Keeps only CDI contracts and CDD contracts longer than 31 days whose
    hiring_date lies after the latest DPAE date already in DB. After the
    import, a count query cross-checks the number of inserted rows, and the
    run is recorded in dpae_statistics. Returns True when at least one batch
    was committed.
    """
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date
    # e.g. 'lbb_xdpdpae_delta_201611102200.bz2'
    # will match 2018-09-12
    date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'
    # We keep only the date in the file name, ex: 20190910 = 10th september 2019
    date_match = re.match(date_pattern, self.input_filename)
    if date_match:
        date_part = date_match.groups()[0]
        self.last_historical_data_date_in_file = datetime.strptime(
            date_part, "%Y%m%d")
        logger.debug("identified last_historical_data_date_in_file=%s",
                     self.last_historical_data_date_in_file)
    else:
        raise Exception(
            "couldn't find a date pattern in filename. filename should be \
like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")
    count = 0
    statements = []
    something_new = False
    query = """ INSERT into %s( siret, hiring_date, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label, duree_pec, date_insertion ) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s) """ % settings.HIRING_TABLE
    imported_dpae = 0
    imported_dpae_distribution = {}  # per-day histogram: year -> month -> day -> count
    not_imported_dpae = 0
    # latest DPAE (CDI/CDD/CTT) hiring date already present in DB; only lines
    # strictly newer than this will be imported
    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
        .filter(Hiring.contract_type.in_((Hiring.CONTRACT_TYPE_CDI, Hiring.CONTRACT_TYPE_CDD, Hiring.CONTRACT_TYPE_CTT))).first()[0]
    if last_historical_data_date_in_db is None:
        last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE
    logger.info(
        "will now extract all dpae with hiring_date between %s and %s",
        last_historical_data_date_in_db,
        self.last_historical_data_date_in_file)
    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip(
        )  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                logger.debug("reading line %i", count)
                # flush the accumulated INSERT batch every 100000 lines
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error(
                        "error in executing statement into dpae table: %s",
                        sys.exc_info()[1])
                    statements = []
                    raise
            try:
                siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
            except ValueError:
                self.zipcode_errors += 1
                continue
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue
            dpae_should_be_imported = (
                hiring_date > last_historical_data_date_in_db
                and hiring_date <= self.last_historical_data_date_in_file
                # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                # and we ignore CTT.
                and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                     (contract_type == Hiring.CONTRACT_TYPE_CDD
                      and contract_duration is not None
                      and contract_duration > 31)))
            if dpae_should_be_imported:
                statement = (siret, hiring_date, contract_type, departement,
                             contract_duration, iiann, tranche_age,
                             handicap_label, duree_pec, date_insertion)
                statements.append(statement)
                imported_dpae += 1
                if hiring_date.year not in imported_dpae_distribution:
                    imported_dpae_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_dpae_distribution[
                        hiring_date.year]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month] = {}
                if hiring_date.day not in imported_dpae_distribution[
                        hiring_date.year][hiring_date.month]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] = 0
                imported_dpae_distribution[hiring_date.year][
                    hiring_date.month][hiring_date.day] += 1
            else:
                not_imported_dpae += 1
    # run remaining statements
    try:
        cur.executemany(query, statements)
        something_new = True
    except:
        logger.error("error in executing statement into dpae table: %s",
                     sys.exc_info()[1])
        raise
    logger.info("processed %i dpae...", count)
    logger.info("imported dpae: %i", imported_dpae)
    logger.info("not imported dpae: %i", not_imported_dpae)
    logger.info("zipcode errors: %i", self.zipcode_errors)
    logger.info("invalid_row errors: %i", self.invalid_row_errors)
    if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
        raise IOError('too many zipcode errors')
    if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
        raise IOError('too many invalid_row errors')
    logger.info("verifying good number of dpae imported.")
    # cross-check: rows now present in the imported date range must match the
    # number of lines imported from the file, otherwise duplicates slipped in
    query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
    cur.execute(query, [
        last_historical_data_date_in_db,
        self.last_historical_data_date_in_file
    ])
    res = cur.fetchone()
    if res[0] != imported_dpae:
        raise DoublonException(
            f"Too many DPAE ({res[0]}) in DB compared to DPAE file ({imported_dpae})."
        )
    logger.info("verifying number of DPAE: OK.")
    con.commit()
    cur.close()
    con.close()
    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type)
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB : OK")
    except OperationalError:
        # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
        # So we insert it directly via an SQL query
        # This job has been broken for more than a year, only way to fix it :
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime(
            '%Y-%m-%d %H:%M:%S')
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB : OK")
    logger.info("finished importing dpae...")
    return something_new
def get_offices_from_file(self):
    """Parse the input CSV of etablissements into a dict keyed by siret.

    Each value holds "create_fields" and "update_fields" tuples ready for SQL
    insertion/update. Performs extensive sanity checks (error counts,
    departement coverage) and aborts when the file looks wrong.
    """
    logger.info("extracting %s...", self.input_filename)
    departements = settings.DEPARTEMENTS
    count = 0
    no_zipcode_count = 0
    departement_errors = 0
    unprocessable_departement_errors = 0
    format_errors = 0
    con, cur = import_util.create_cursor()
    departement_counter_dic = {}
    etablissements = {}
    with import_util.get_reader(self.input_filename) as myfile:
        header_line = myfile.readline().strip()
        if "siret" not in header_line:
            logger.debug(header_line)
            # NOTE(review): raising a plain string is a TypeError in Python 3;
            # this (and the raises below) should be real exception types.
            raise "wrong header line"
        for line in myfile:
            count += 1
            if not count % 100000:
                logger.debug("reading line %i", count)
            try:
                fields = import_util.get_fields_from_csv_line(line)
                if len(fields) != 16:
                    logger.exception("wrong number of fields in line %s" % line)
                    raise ValueError
                siret, raisonsociale, enseigne, codenaf, numerorue, \
                    libellerue, codecommune, codepostal, email, tel, \
                    trancheeffectif_etablissement, effectif_etablissement, \
                    trancheeffectif_entreprise, date_creation_entreprise, \
                    website1, website2 = fields
            except ValueError:
                logger.exception("exception in line %s" % line)
                format_errors += 1
                continue
            # normalize accents for text fields stored in DB
            website1 = encoding_util.strip_french_accents(website1)
            website2 = encoding_util.strip_french_accents(website2)
            email = encoding_util.strip_french_accents(email)
            if codecommune.strip():
                try:
                    departement = extract_departement_from_zipcode(
                        codepostal, siret)
                    process_this_departement = departement in departements
                    if process_this_departement:
                        # restore the leading zero lost by numeric parsing
                        if len(codepostal) == 4:
                            codepostal = "0%s" % codepostal
                        etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website1, website2
                        etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website1, website2, siret
                        # only keep the row when zipcode and departement agree
                        if codepostal.startswith(departement):
                            departement_counter_dic.setdefault(
                                departement, 0)
                            departement_counter_dic[departement] += 1
                            etablissements[siret] = {
                                "create_fields": etab_create_fields,
                                "update_fields": etab_update_fields,
                            }
                        else:
                            logger.info(
                                "zipcode and departement dont match code commune: %s, code postal: %s, departement: %s",
                                codecommune, codepostal, departement)
                    else:
                        unprocessable_departement_errors += 1
                except DepartementException:
                    logger.exception("departement exception")
                    departement_errors += 1
            else:
                no_zipcode_count += 1
    logger.info("%i etablissements total" % count)
    logger.info("%i etablissements with incorrect departement" % departement_errors)
    logger.info("%i etablissements with unprocessable departement" % unprocessable_departement_errors)
    logger.info("%i etablissements with no zipcodes", no_zipcode_count)
    logger.info("%i etablissements not read because of format error", format_errors)
    logger.info("%i number of departements from file" % len(departement_counter_dic))
    departement_count = sorted(departement_counter_dic.items())
    logger.info("per departement read %s", departement_count)
    logger.info("finished reading etablissements...")
    # NOTE(review): all the `raise "..."` statements below raise TypeError in
    # Python 3 instead of carrying the intended message.
    if departement_errors > 500:
        raise "too many departement_errors"
    if unprocessable_departement_errors > 2000:
        raise "too many unprocessable_departement_errors"
    if no_zipcode_count > 40000:
        raise "too many no_zipcode_count"
    if format_errors > 5:
        raise "too many format_errors"
    if len(departement_counter_dic) not in [
            96, 15
    ]:  # 96 in production, 15 in test
        logger.exception("incorrect total number of departements : %s" %
                         len(departement_counter_dic))
        raise "incorrect total number of departements"
    if len(departement_counter_dic) == 96:
        for departement, count in departement_count:
            if count < 10000:
                logger.exception(
                    "only %s etablissements in departement %s" %
                    (count, departement))
                raise "not enough etablissements in at least one departement"
    return etablissements
def run_task(self):
    """Import alternance (apprenticeship) contracts into the hirings table.

    Unlike the DPAE import, ALL contracts up to the file date are inserted
    (late inputs are frequent in this data source), regardless of what is
    already in DB. Returns True when something new was committed.
    """
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date
    # e.g. '20200803ExtractApp'
    # will match 20200803
    date_string = self.input_filename.split('/')[-1][0:8]
    try:
        self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
    except ValueError:
        raise Exception("couldn't find a date pattern in filename. filename should be \
like 20200803ExtractApp.csv")
    count = 0
    statements = []
    something_new = False
    query = """ INSERT into %s( siret, hiring_date, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label, duree_pec, date_insertion ) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s) """ % settings.HIRING_TABLE
    imported_alternance_contracts = 0
    imported_alternance_contracts_distribution = {}  # per-day histogram: year -> month -> day -> count
    not_imported_alternance_contracts = 0
    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date))\
        .filter(Hiring.contract_type == self.contract_type).first()[0]
    logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                last_historical_data_date_in_db, self.last_historical_data_date_in_file)
    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"SIRET" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 10000:
                logger.debug("reading line %i", count)
                # flush the accumulated INSERT batch every 10000 lines
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
                    statements = []
                    raise
            try:
                siret, hiring_date, departement = parse_alternance_line(line)
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue
            except InvalidSiretException:
                error_message = traceback.format_exc()
                logger.info("invalid siret met at row: %i", count)
                logger.info(error_message)
                self.invalid_siret_errors += 1
                continue
            except InvalidZipCodeException:
                logger.info("invalid zip code met at row: %i", count)
                self.invalid_zipcode_errors += 1
                continue
            # This part of code is useless :
            # The data used has a lot of late contracts inputs
            # So we have to insert ALL the contracts from different dates
            # alternance_contract_should_be_imported = (
            #     hiring_date > last_historical_data_date_in_db
            #     and hiring_date <= self.last_historical_data_date_in_file
            # )
            if hiring_date <= self.last_historical_data_date_in_file:
                statement = (
                    siret,
                    hiring_date,
                    self.contract_type,
                    departement,
                    None,  # contract_duration
                    None,  # iiann
                    None,  # tranche_age
                    None,  # handicap_label
                    None,  # duree_pec
                    date_insertion
                )
                statements.append(statement)
                imported_alternance_contracts += 1
                if hiring_date.year not in imported_alternance_contracts_distribution:
                    imported_alternance_contracts_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1
    # run remaining statements
    try:
        cur.executemany(query, statements)
        something_new = True
    except:
        logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
        raise
    logger.info(f"Types de contrats à importer : {self.contract_name}")
    logger.info(f"processed {count} lba_contracts...")
    logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
    logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
    logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
    logger.info(f"invalid_row errors: {self.invalid_row_errors}")
    logger.info(f"invalid siret errors: {self.invalid_siret_errors}")
    # if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
    #     raise IOError('too many zipcode errors')
    # if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
    #     raise IOError('too many invalid_row errors')
    con.commit()
    cur.close()
    con.close()
    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type
        )
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB : OK")
    except OperationalError:
        # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
        # So we insert it directly via an SQL query
        # This job has been broken for more than a year, only way to fix it :
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB : OK")
    logger.info("finished importing dpae...")
    return something_new
def run_task(self):
    """Import DPAE hirings from a full historical file into the hirings table.

    Keeps CDI and long-enough CDD (> 31 days) whose hiring_date is newer than
    the last historical data date already imported. Returns True when at
    least one batch was committed.
    """
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date
    # e.g. 'LBB_XDPDPAE_2018-09-12_2017-08-01.bz2'
    # will match 2018-09-12
    date_pattern = r'.*_(\d\d\d\d-\d\d-\d\d)_'
    date_match = re.match(date_pattern, self.input_filename)
    if date_match:
        date_part = date_match.groups()[0]
        self.last_historical_data_date_in_file = datetime.strptime(
            date_part, "%Y-%m-%d")
        logger.debug("identified last_historical_data_date_in_file=%s",
                     self.last_historical_data_date_in_file)
    else:
        raise Exception(
            "couldn't find a date pattern in filename. filename should be \
like LBB_XDPDPAE_YYYY-MM-DD_YYYY-MM-DD.csv")
    count = 0
    statements = []
    something_new = False
    query = """ INSERT into %s( siret, hiring_date, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label, duree_pec ) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s) """ % settings.HIRING_TABLE
    imported_dpae = 0
    imported_dpae_distribution = {}  # per-day histogram: year -> month -> day -> count
    not_imported_dpae = 0
    last_historical_data_date_in_db = DpaeStatistics.get_last_historical_data_date(
    )
    logger.info(
        "will now extract all dpae with hiring_date between %s and %s",
        last_historical_data_date_in_db,
        self.last_historical_data_date_in_file)
    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip(
        )  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                logger.debug("reading line %i", count)
                # flush the accumulated INSERT batch every 100000 lines
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error(
                        "error in executing statement into dpae table: %s",
                        sys.exc_info()[1])
                    statements = []
                    raise
            try:
                siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
            except ValueError:
                self.zipcode_errors += 1
                continue
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue
            dpae_should_be_imported = (
                hiring_date > last_historical_data_date_in_db
                and hiring_date <= self.last_historical_data_date_in_file
                # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                # and we ignore CTT.
                and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                     (contract_type == Hiring.CONTRACT_TYPE_CDD
                      and contract_duration is not None
                      and contract_duration > 31)))
            if dpae_should_be_imported:
                statement = (
                    siret,
                    hiring_date,
                    contract_type,
                    departement,
                    contract_duration,
                    iiann,
                    tranche_age,
                    handicap_label,
                    duree_pec,
                )
                statements.append(statement)
                imported_dpae += 1
                if hiring_date.year not in imported_dpae_distribution:
                    imported_dpae_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_dpae_distribution[
                        hiring_date.year]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month] = {}
                if hiring_date.day not in imported_dpae_distribution[
                        hiring_date.year][hiring_date.month]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] = 0
                imported_dpae_distribution[hiring_date.year][
                    hiring_date.month][hiring_date.day] += 1
            else:
                not_imported_dpae += 1
    # run remaining statements
    try:
        cur.executemany(query, statements)
        something_new = True
    except:
        logger.error("error in executing statement into dpae table: %s",
                     sys.exc_info()[1])
        raise
    logger.info("processed %i dpae...", count)
    logger.info("imported dpae: %i", imported_dpae)
    logger.info("not imported dpae: %i", not_imported_dpae)
    logger.info("zipcode errors: %i", self.zipcode_errors)
    logger.info("invalid_row errors: %i", self.invalid_row_errors)
    if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
        raise IOError('too many zipcode errors')
    if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
        raise IOError('too many invalid_row errors')
    con.commit()
    cur.close()
    con.close()
    # record this run so the next import starts from the new date
    statistics = DpaeStatistics(
        last_import=datetime.now(),
        most_recent_data_date=self.last_historical_data_date_in_file,
    )
    statistics.save()
    logger.info("finished importing dpae...")
    return something_new