def run_main():
    import_util.clean_temporary_tables()

    task = ScoreComputingJob()
    results = task.run()

    no_results = []
    departements = []
    for departement, result in results:
        departements.append(departement)
        if not result:
            no_results.append(departement)

    if len(no_results) > settings.MAXIMUM_COMPUTE_SCORE_JOB_FAILURES:
        results = set(departements) - set(no_results)
        logger.warning(
            "compute_scores by departement : %i failures (%s) vs %i successes (%s), aborting...",
            len(no_results),
            ",".join(no_results),
            len(results),
            ",".join(results),
        )
        sys.exit(-1)

    import_util.reduce_scores_for_backoffice(departements)
    import_util.reduce_scores_for_main_db(departements)

    if COMPUTE_SCORES_DEBUG_MODE:
        logger.warning("debug mode enabled, failing on purpose for debugging of temporary tables")
        sys.exit(-1)

    import_util.clean_temporary_tables()
    logger.info("compute_scores task: FINISHED")
def get_json_logs_activity(self, need_all_files=False):
    '''Build the list of activity log file names that need to be parsed.'''
    # A cron task copies the json activity logs to /srv/lbb/data

    # list of all the json activity files
    json_logs_files_names = [
        i for i in os.listdir(self.json_logs_folder_path) if i.startswith('activity')
    ]

    # list of all the json activity files that still need to be parsed (i.e. not stored in database yet)
    if need_all_files is False:
        json_logs_files_names_to_parse = [
            file_name for file_name in json_logs_files_names
            if self.needs_parse_json_activity_log(file_name)
        ]
    else:
        json_logs_files_names_to_parse = json_logs_files_names

    if not json_logs_files_names_to_parse:  # empty list
        logger.info("Did not find/need any data to parse")
        raise NoDataException

    logger.info(f'.json files to parse : {json_logs_files_names_to_parse}')
    self.json_logs_files_names_to_parse = json_logs_files_names_to_parse
def run_geocoding_jobs(self, geocoding_jobs, disable_multithreading=False):
    adresses_not_geolocated[:] = []
    coordinates_updates[:] = []
    logger.info("Number of geocoding jobs : {}".format(len(geocoding_jobs)))

    if disable_multithreading:
        for siret, full_address, initial_coordinates, city_code in geocoding_jobs:
            self.find_coordinates_for_address(siret, full_address, initial_coordinates, city_code)
    else:
        pool = Pool(processes=pool_size)
        for siret, full_address, initial_coordinates, city_code in geocoding_jobs:
            pool.apply_async(
                self.find_coordinates_for_address,
                (siret, full_address, initial_coordinates, city_code),
            )
        pool.close()
        pool.join()

    logger.info(
        "run geocoding jobs : collected {} coordinates on {} jobs, need to geocode {}".format(
            GEOCODING_STATS.get('cache_hits', 0),
            len(geocoding_jobs),
            len(adresses_not_geolocated),
        )
    )
    return adresses_not_geolocated
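# Note: the Pool class and the module-level state used above are not part of this extract.
# Because the workers append to shared lists (adresses_not_geolocated, coordinates_updates)
# and to GEOCODING_STATS, a thread-based pool is assumed here. The sketch below is an
# assumption about that module-level setup, not the project's actual code.
from multiprocessing.dummy import Pool  # thread pool exposing the multiprocessing.Pool API

pool_size = 8                 # assumed number of worker threads
adresses_not_geolocated = []  # filled by find_coordinates_for_address on geocoding cache misses
coordinates_updates = []      # filled with (siret, coordinates) tuples on geocoding successes
GEOCODING_STATS = {}          # plain dict of counters, e.g. GEOCODING_STATS['cache_hits']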
def insert_logs_activity_recherche(self, activity_df):
    logs_activity_df = activity_df[activity_df['nom'] == 'recherche']
    logs_activity_df['ville'] = logs_activity_df.apply(
        lambda row: get_propriete(row, 'localisation', 'ville'), axis=1)
    logs_activity_df['code_postal'] = logs_activity_df.apply(
        lambda row: get_propriete(row, 'localisation', 'codepostal'), axis=1)
    logs_activity_df['emploi'] = logs_activity_df.apply(
        lambda row: get_propriete(row, 'emploi'), axis=1)
    logs_activity_df['emploi'] = logs_activity_df.apply(
        lambda row: clean_emploi(row), axis=1)

    # TODO : Find a way to concatenate logs, because there are too many of them
    logs_activity_df = logs_activity_df[logs_activity_df.source == 'site']

    cols_of_interest = [
        "dateheure",
        "idutilisateur_peconnect",
        "ville",
        "code_postal",
        "emploi",
    ]
    logs_activity_df = logs_activity_df[cols_of_interest]

    nb_lines = logs_activity_df.shape[0]
    logger.info(f'Number of lines to insert into logs_activity_recherche : {nb_lines}')

    return logs_activity_df
def run():
    errors = sanity.check_scores()
    if errors:
        msg = "departements with errors: %s" % ",".join(errors)
        logger.error(msg)
        raise ValueError(msg)
    logger.info("validate_scores task: FINISHED")
def get_total_hirings_per_office():
    engine = import_util.create_sqlalchemy_engine()
    query = """
        select
            siret,
            raisonsociale,
            enseigne,
            email,
            tel,
            website,
            codenaf,
            codepostal,
            codecommune,
            trancheeffectif,
            greatest(0, floor(score_regr)) as total_hirings
        from
            etablissements_backoffice
    """
    # In local dev we keep all offices (even with a score of 0) so the script has data to work with;
    # in other environments only offices with a score > 0 are kept.
    if get_current_env() != ENV_DEVELOPMENT:
        query += " where greatest(0, floor(score_regr)) > 0;"
    df_total_hirings = pd.read_sql_query(query, engine)
    engine.close()
    logger.info("Data selected from etablissements_backoffice")
    print(df_total_hirings)
    return df_total_hirings
def get_data_second_sheet(self):
    # SECOND SHEET : https://docs.google.com/spreadsheets/d/1gbvFvFEEugCmPhsAdoRZEdjfEl579uUnmf5MIryaVB8/edit#gid=0
    self.df_evol_nb_dpae_hiring_and_activity_date = self.get_df_evol_nb_dpae_hiring_and_activity_date()
    logger.info("Data for second sheet ready")
def run_main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    dpae_filenames = import_util.detect_runnable_file("dpae", bulk=True)
    for filename in dpae_filenames:
        logger.info("PROCESSING %s" % filename)
        task = DpaeExtractJob(filename)
        task.run()
def get_sirets_from_database(self):
    query = "select siret from %s" % settings.RAW_OFFICE_TABLE
    logger.info("get offices from database")
    con, cur = import_util.create_cursor()
    cur.execute(query)
    rows = cur.fetchall()
    cur.close()
    con.close()
    return [row[0] for row in rows if siret_util.is_siret(row[0])]
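# siret_util.is_siret is not shown in this extract; at minimum it is assumed to check that the
# value looks like a SIRET number, i.e. exactly 14 digits. A minimal sketch under that assumption:
def is_siret(siret):
    """Return True if `siret` looks like a SIRET number: a string of exactly 14 digits."""
    return len(siret) == 14 and siret.isdigit()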
def get_most_recent_csv_file(self):
    csv_paths = os.listdir(self.csv_folder_path)
    csv_paths = [i for i in csv_paths if i.startswith('act_dpae-')]
    csv_paths.sort()
    most_recent_csv_file = csv_paths[-1]
    logger.info(f"the act-dpae file which will be used is : {most_recent_csv_file}")
    return most_recent_csv_file
def insert_id_peconnect(self, activity_df):
    activity_df = activity_df.dropna(axis=0, subset=['idutilisateur_peconnect'])
    activity_idpec = activity_df.drop_duplicates(
        subset=['idutilisateur_peconnect', 'date'], keep='first')
    activity_idpec = activity_idpec[['dateheure', 'idutilisateur_peconnect']]

    nb_lines = activity_idpec.shape[0]
    logger.info(f'Number of lines to insert into idpec : {nb_lines}')

    return activity_idpec
def get_most_recent_dpae_file(self):
    dpae_paths = os.listdir(self.dpae_folder_path)
    # IMPORTANT : the DPAE file needs to be copied from /mnt/datalakepe/ to /srv/lbb/data.
    # This should already be the case, because the importer also needs this file.
    dpae_paths = [i for i in dpae_paths if i.startswith('lbb_xdpdpae_delta')]
    dpae_paths.sort()
    most_recent_dpae_file = dpae_paths[-1]
    if get_current_env() == ENV_DEVELOPMENT:
        most_recent_dpae_file = 'lbb_xdpdpae_delta_201511102200.csv'
    logger.info(f"the DPAE file which will be used is : {most_recent_dpae_file}")
    return most_recent_dpae_file
def join_old_data_with_new_ones(self):
    # If the table already exists, fetch all the old data so we can drop rows
    # that are duplicates for a given idpeconnect/siret couple
    if self.existing_sql_table:
        df_dpae_act_existing = self.get_old_activities_logs_saved()

        # In case something goes wrong in the script, keep a backup of the old data as a .csv file,
        # because the whole table is rewritten after each execution
        df_dpae_act_existing.to_csv(
            f"{self.csv_folder_path}backup_sql_{TABLE_NAME}.csv",
            encoding='utf-8',
            sep='|')

        nb_rows_old = df_dpae_act_existing.shape[0]
        logger.info(f"There were already {nb_rows_old} act/dpae rows")

        nb_rows = self.df_dpae_act.shape[0]
        logger.info(f"There are {nb_rows} new rows to concat with the old ones")

        # Concatenate all the old rows with the new ones
        self.df_dpae_act = pd.concat([self.df_dpae_act, df_dpae_act_existing])
        nb_rows = self.df_dpae_act.shape[0]
        logger.info(f"Concatenation of both has {nb_rows} rows")

        # Remove the duplicates
        self.df_dpae_act = self.df_dpae_act.sort_values('date_activite')
        self.df_dpae_act = self.df_dpae_act.drop_duplicates(
            subset=['idutilisateur_peconnect', 'siret'], keep='first')
        nb_rows = self.df_dpae_act.shape[0]
        logger.info(f"Concatenation of both minus duplicates has {nb_rows} rows")
def run_main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    lba_app_filenames = import_util.detect_runnable_file("lba-app", bulk=True)
    for filename in lba_app_filenames:
        logger.info("PROCESSING %s" % filename)
        task_app = ApprentissageExtractJob(filename, contract_type='APPRENTISSAGE')
        task_app.run()

    lba_pro_filenames = import_util.detect_runnable_file("lba-pro", bulk=True)
    for filename in lba_pro_filenames:
        logger.info("PROCESSING %s" % filename)
        task_pro = ApprentissageExtractJob(filename, contract_type='CONTRAT_PRO')
        task_pro.run()
def populate_flag(flag):
    logger.info("populating %s ...", flag)
    con, cur = import_util.create_cursor()
    query = """
        UPDATE %s e
        INNER JOIN %s f
        ON e.siret = f.siret
        SET e.%s = True;
    """ % (settings.SCORE_REDUCING_TARGET_TABLE, flag, flag)
    cur.execute(query)
    con.commit()
    logger.info("completed populating %s ...", flag)
    cur.close()
    con.close()
def delete_deletable_offices(self):
    con, cur = import_util.create_cursor()
    if self.deletable_sirets:
        for sirets in chunks(list(self.deletable_sirets), 500):
            stringified_siret_list = ",".join(sirets)
            logger.info("deleting a chunk of %i offices...", len(sirets))
            query = """DELETE FROM %s where siret IN (%s)""" % (
                settings.RAW_OFFICE_TABLE, stringified_siret_list)
            try:
                cur.execute(query)
                con.commit()
            except Exception:
                logger.warning("error while deleting chunk of sirets : %s", sirets)
                raise
    cur.close()
    con.close()
    logger.info("%i no longer existing offices deleted.", len(self.deletable_sirets))
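# The chunks helper used above is not part of this extract; a minimal sketch of what it is
# assumed to do (yield successive fixed-size slices of a list) could look like this:
def chunks(items, size):
    """Yield successive slices of at most `size` elements from `items`."""
    for i in range(0, len(items), size):
        yield items[i:i + size]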
def prepare_flag_handicap():
    logger.info("preparing flag_handicap...")
    sql_script = """
        drop table if exists flag_handicap;
        create table flag_handicap as (
            select distinct(siret)
            from %s
            where handicap_label = 'RQTH-MDT'
                and hiring_date >= DATE_SUB(NOW(), INTERVAL 1 YEAR)
        );
    """ % settings.HIRING_TABLE
    import_util.run_sql_script(sql_script)
    logger.info("completed preparing flag_handicap.")
def prepare_flags_junior_and_senior():
    logger.info("preparing flags_junior_and_senior...")
    # In MySQL a comparison like (tranche_age = '00-25') evaluates to 0 or 1, so the sums below
    # compute the share (in %) of hirings made in each age bracket per office.
    sql_script = """
        drop table if exists flag_tmp1;
        create table flag_tmp1 as (
            select siret, tranche_age, count(*) as contrats
            from %s
            where hiring_date >= DATE_SUB(NOW(), INTERVAL 1 YEAR)
            group by siret, tranche_age
        );

        drop table if exists flag_tmp2;
        create table flag_tmp2 as (
            select
                siret,
                100 * sum(contrats * (tranche_age = '00-25')) / sum(contrats) as ratio_junior,
                100 * sum(contrats * (tranche_age = '51-99')) / sum(contrats) as ratio_senior
            from flag_tmp1
            group by siret
        );

        drop table if exists flag_junior;
        create table flag_junior as (
            select siret from flag_tmp2 where ratio_junior >= 80
        );

        drop table if exists flag_senior;
        create table flag_senior as (
            select siret from flag_tmp2 where ratio_senior >= 16
        );

        drop table if exists flag_tmp1;
        drop table if exists flag_tmp2;
    """ % settings.HIRING_TABLE
    import_util.run_sql_script(sql_script)
    logger.info("completed preparing flags_junior_and_senior.")
def get_data_first_sheet(self):
    # FIRST SHEET : https://docs.google.com/spreadsheets/d/1kx-mxCaXIkys3hU4El4K7a6JBwzrdF75X4U8igqLB4I/edit?folder=1QFm0t2weoUjTsl-FPYUj94__zq_mZq0h#gid=0

    # 1st column : number of unique PE Connect IDs (IDPE) that accessed LBB
    self.df_evol_idpe_connect = self.get_df_evol_idpe_connect()

    # 2nd column : number of unique IDPE that unfolded a company card
    self.df_evol_idpe_connect_sign_afficher_details = self.get_df_evol_idpe_connect_sign(
        did_specific_activity='afficher-details')

    # 3rd column : number of unique IDPE that viewed a company page
    self.df_evol_idpe_connect_sign_details = self.get_df_evol_idpe_connect_sign(
        did_specific_activity='details')

    # 4th column : no data yet, but : number of unique IDPE that reached the first step of "Je postule"

    # 5th column : number of unique IDPE that unfolded a company card, viewed a company page,
    # bookmarked a company, or reached the first step of "Je postule" (4th column)
    self.df_evol_idpe_connect_sign = self.get_df_evol_idpe_connect_sign()

    # 6th column : number of hirings per month originating from the activity of a logged-in LBB user
    # (start/end date = hiring date)
    self.df_evol_dpae = self.get_df_evol_dpae()

    # 7th column is an empty column

    # 8th column : number of JP applications
    self.df_nb_candidatures_jp = pd.read_csv(
        f'{self.csv_jp_folder_path}/dump_nb_candidatures_jp.csv', delimiter=';')

    # 9th column : number of unique emails that applied via Je postule
    self.df_nb_distinct_email_jp = pd.read_csv(
        f'{self.csv_jp_folder_path}/dump_nb_distinct_email_jp.csv', delimiter=';')

    # 10th column : number of candidates that received an answer via JP
    self.df_nb_candidates_with_answer_jp = pd.read_csv(
        f'{self.csv_jp_folder_path}/dump_nb_candidates_with_answer_jp.csv', delimiter=';')

    # 11th column : average response time of recruiters via JP (in days)
    self.df_medium_delay_answer_jp = pd.read_csv(
        f'{self.csv_jp_folder_path}/dump_medium_delay_answer_jp.csv', delimiter=';')

    logger.info("Data for first sheet ready")
def create_geocoding_jobs(self):
    query = """
        select siret, numerorue, libellerue, codepostal, codecommune, coordinates_x, coordinates_y
        from %s
    """ % settings.SCORE_REDUCING_TARGET_TABLE
    if DEBUG_MODE:
        # query += "WHERE coordinates_x = 0 and coordinates_y = 0"
        query += "ORDER BY RAND() LIMIT 100000"
    con, cur = import_util.create_cursor()
    cur.execute(query)
    rows = cur.fetchall()

    geocoding_jobs = []
    count = 0
    for row in rows:
        siret, street_number, street_name, zipcode, codecommune, coordinates_x, coordinates_y = row
        try:
            city = CITY_NAMES[codecommune]
        except KeyError:
            logger.warning("wrong codecommune: %s", codecommune)
            continue
        try:
            full_address = self.get_full_adress(street_number, street_name, zipcode, city)
            initial_coordinates = [coordinates_x, coordinates_y]
            geocoding_jobs.append([siret, full_address, initial_coordinates, codecommune])
        except IncorrectAdressDataException:
            logger.warning("incorrect address for %s %s %s %s",
                           street_number, street_name, zipcode, city)
        count += 1
        GEOCODING_STATS['jobs'] = GEOCODING_STATS.get('jobs', 0) + 1
        if not count % 10000:
            logger.info("loading geocoding jobs from db... loaded %s rows", count)

    logger.info("%i geocoding jobs created...", len(geocoding_jobs))
    cur.close()
    con.close()
    return geocoding_jobs
def validate_coordinates(self):
    con, cur = import_util.create_cursor()
    # Ratio of offices with non-zero coordinates: the boolean expression evaluates to 0/1 in MySQL,
    # so the sum counts geocoded offices.
    query = """
        select sum(
            (coordinates_x > 0 or coordinates_x < 0)
            and (coordinates_y > 0 or coordinates_y < 0)
        ) / count(*)
        from %s
    """ % settings.SCORE_REDUCING_TARGET_TABLE
    cur.execute(query)
    geocoding_ratio = cur.fetchall()[0][0]
    logger.info("geocoding_ratio = %s", geocoding_ratio)
    if geocoding_ratio < settings.MINIMUM_GEOCODING_RATIO:
        raise AbnormallyLowGeocodingRatioException
    cur.close()
    con.close()
def dump():
    timestamp = datetime.now().strftime('%Y_%m_%d_%H%M')
    logger.info("backing up table %s ...", settings.SCORE_REDUCING_TARGET_TABLE)
    etab_result = import_util.back_up(
        settings.BACKUP_OUTPUT_FOLDER,
        settings.SCORE_REDUCING_TARGET_TABLE,
        "export_etablissement",
        timestamp,
        new_table_name="etablissements_new",
    )
    tar_filename = os.path.join(settings.BACKUP_FOLDER, "%s.tar.bz2" % timestamp)
    with tarfile.open(tar_filename, "w:bz2") as tar:
        logger.info("creating tar file %s...", tar_filename)
        tar.add(etab_result, arcname=os.path.basename(etab_result))
    return tar_filename
def check_complete_test(dpae_filename):
    logger.info("check complete test...")
    lines = get_n_lines(dpae_filename, n=20)
    success = 0
    errors = 0
    for line in lines:
        try:
            parse_dpae_line(line)
            success += 1
        except (ValueError, IndexError):
            errors += 1
    logger.info("%i lines parsed with success", success)
    logger.info("%i lines parsed with error", errors)
    error_rate = errors / (1.0 * (success + errors))
    logger.info("error rate: %f", error_rate)
    if error_rate >= settings.DPAE_ERROR_RATE_MAX:
        raise ValueError("error_rate too high")
    logger.info("complete test OK!")
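# The get_n_lines helper used above is not shown in this extract; a minimal sketch, assuming
# the DPAE file is a plain-text file read line by line, could be:
from itertools import islice

def get_n_lines(filename, n=20):
    """Return the first `n` lines of the file, stripped of trailing newlines."""
    with open(filename, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in islice(f, n)]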
def insert_logs_activity(self, activity_df):
    '''
    details = view a company page
    afficher-details = unfold a company card
    '''
    clics_of_interest = ['details', 'afficher-details', 'ajout-favori']
    logs_activity_df = activity_df[activity_df['nom'].isin(clics_of_interest)]

    logs_activity_df['siret'] = logs_activity_df.apply(
        lambda row: siret(row), axis=1)
    logs_activity_df['utm_medium'] = logs_activity_df.apply(
        lambda row: get_propriete(row, 'utm_medium'), axis=1)
    logs_activity_df['utm_source'] = logs_activity_df.apply(
        lambda row: get_propriete(row, 'utm_source'), axis=1)
    logs_activity_df['utm_campaign'] = logs_activity_df.apply(
        lambda row: get_propriete(row, 'utm_campaign'), axis=1)

    # Keep only the activity logs with an IDPeconnect OR with the utm_medium values we are interested in.
    # If we kept everything, there would be way too many rows in the database.
    utm_medium_to_keep = ['mailing']
    logs_activity_df = logs_activity_df[
        (logs_activity_df.idutilisateur_peconnect.notnull())
        | (logs_activity_df.utm_medium.isin(utm_medium_to_keep))]

    cols_of_interest = [
        "dateheure",
        "nom",
        "idutilisateur_peconnect",
        "siret",
        "utm_medium",
        "utm_source",
        "utm_campaign",
    ]
    logs_activity_df = logs_activity_df[cols_of_interest]

    nb_lines = logs_activity_df.shape[0]
    logger.info(f'Number of lines to insert into logs_activity : {nb_lines}')

    return logs_activity_df
def get_activity_logs(self):
    engine = import_util.create_sqlalchemy_engine()
    query = "select * from logs_activity"
    if DEBUG:
        query += " ORDER BY RAND() LIMIT 10000"
    df_activity = pd.read_sql_query(query, engine)
    engine.close()

    # TODO : Define how long we should keep using old logs
    # https://valodata.slack.com/archives/C0QR8RYL8/p1562319224015200

    # TODO : Uncomment these lines after the first initialization of the project
    # one_year_date = datetime.date.today() - datetime.timedelta(365)
    # df_activity['dateheure'] = pd.to_datetime(df_activity['dateheure'])
    # df_activity = df_activity[df_activity.dateheure > one_year_date]

    logger.info('Activity logs are loaded')
    return df_activity.astype(str)
def get_last_recorded_hiring_date(self):
    # Check whether there already are data in the final table
    table_name_act_dpae = LogsActivityDPAEClean.__tablename__
    query = f"SELECT COUNT(*) FROM {table_name_act_dpae}"
    engine = import_util.create_sqlalchemy_engine()

    if engine.execute(query).fetchone()[0] > 0:
        # Data already in the table : take the most recent hiring date recorded
        query = f"select date_embauche from {table_name_act_dpae} order by date_embauche DESC LIMIT 1"
        row = engine.execute(query).fetchone()
        date_last_recorded_hiring = row[0].split()[0]
    else:
        # No data yet : set the date to the day the very first activity log was created on LBB
        date_last_recorded_hiring = "2018-08-31"

    engine.close()
    logger.info(f"the most recent date found is {date_last_recorded_hiring}")
    return date_last_recorded_hiring
def after_check(self):
    query = db_session.query(RawOffice.departement.distinct().label("departement"))
    departements = [row.departement for row in query.all()]

    if len(departements) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
        msg = "wrong number of departements : %s instead of expected %s" % (
            len(departements),
            settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES,
        )
        raise Exception(msg)

    # FIXME parallelize for better performance
    for departement in departements:
        count = RawOffice.query.filter_by(departement=departement).count()
        logger.info("number of companies in departement %s : %i", departement, count)
        if count < settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
            msg = "too few companies in departement : %s instead of expected %s" % (
                count,
                settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT,
            )
            raise Exception(msg)
def update_coordinates(self, updates):
    con, cur = import_util.create_cursor()
    count = 0
    statements = []
    update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % (
        settings.SCORE_REDUCING_TARGET_TABLE)

    logger.info("Nb of offices to update : {}".format(len(updates)))

    for siret, coordinates in updates:
        count += 1
        statements.append([coordinates[0], coordinates[1], siret])
        if len(statements) == 1000:
            logger.info("geocoding with ban... %i of %i done", count, len(updates))
            cur.executemany(update_query, statements)
            con.commit()
            statements = []

    # flush the remaining statements (less than 1000)
    if len(statements) >= 1:
        logger.info("geocoding with ban... %i of %i done", count, len(updates))
        cur.executemany(update_query, statements)
        con.commit()

    cur.close()
    con.close()
def create_creatable_offices(self):
    """Create new offices (that are not yet in our etablissement table)."""
    con, cur = import_util.create_cursor()
    query = """INSERT into %s(
            siret, raisonsociale, enseigne, codenaf, numerorue, libellerue,
            codecommune, codepostal, email, tel, departement, trancheeffectif,
            website, flag_poe_afpr, flag_pmsmp)
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % settings.RAW_OFFICE_TABLE

    count = 0
    logger.info("create new offices in table %s", settings.RAW_OFFICE_TABLE)
    statements = []
    MAX_COUNT_EXECUTE = 500
    for siret in self.creatable_sirets:
        statement = self.csv_offices[siret]["create_fields"]
        statements.append(statement)
        count += 1
        if not count % MAX_COUNT_EXECUTE:
            cur.executemany(query, statements)
            con.commit()
            statements = []
        if not count % 10000:
            logger.info("created %s offices", count)
    if statements:
        cur.executemany(query, statements)
        con.commit()
    cur.close()
    con.close()
    logger.info("%i new offices created.", count)
def run_geocoding_jobs(self, geocoding_jobs):
    ban_jobs = []
    coordinates_updates = []
    count = 0
    for siret, full_address, initial_coordinates in geocoding_jobs:
        unit = GeocodeUnit(siret, full_address, coordinates_updates, initial_coordinates)
        job_id = pool.spawn(unit.find_coordinates_for_address)
        ban_jobs.append(job_id)
        count += 1
        if not count % 1000:
            logger.info(
                "running geocoding jobs : started %s of %s jobs, collected %s coordinates so far",
                count,
                len(geocoding_jobs),
                len(coordinates_updates),
            )
            gevent.joinall(ban_jobs)
            ban_jobs = []
    # processing remaining jobs
    gevent.joinall(ban_jobs)
    return coordinates_updates
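# The module-level pool used by pool.spawn(...) above is not part of this extract. Since the
# jobs are waited on with gevent.joinall, it is assumed to be a gevent greenlet pool, roughly
# as sketched below (the pool size is an arbitrary assumption):
import gevent
from gevent.pool import Pool

pool = Pool(10)  # caps the number of geocoding greenlets running concurrently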