def test_converting_hirings_into_scores_back_and_forth(self):
    """Round-trip check: score -> hirings -> score is the identity on [0, 100].

    `skip_bucketing=True` is passed so that get_score_from_hirings returns the
    exact value instead of a bucketed one, which would break strict equality.
    """
    for expected_score in range(101):  # every integer score from 0 to 100
        hirings = scoring_util.get_hirings_from_score(expected_score)
        recovered_score = scoring_util.get_score_from_hirings(
            hirings, skip_bucketing=True)
        self.assertEqual(expected_score, recovered_score)
def test_query_returns_scores_adjusted_to_rome_code_context(self):
    """End-to-end check that company stars returned by the API are adjusted
    to the rome_code of the query, and that the adjustment is consistent
    with the rome/naf hiring mapping."""
    rome_code = u'D1405'
    params = self.add_security_params({
        'commune_id': self.positions['caen']['commune_id'],
        'distance': 20,
        'page': 1,
        'page_size': 2,
        'rome_codes': rome_code,
        'user': u'labonneboite',
    })
    rv = self.app.get('/api/v1/company/?%s' % urlencode(params))
    self.assertEqual(rv.status_code, 200)
    data = json.loads(rv.data)

    # Exactly one company matches this query.
    self.assertEqual(data['companies_count'], 1)
    self.assertEqual(len(data['companies']), 1)
    office_json = data['companies'][0]
    siret = office_json['siret']
    office = Office.query.get(siret)

    # ############### WARNING about matching scores vs hirings ################
    # scoring_util.get_hirings_from_score and scoring_util.get_score_from_hirings
    # rely on special coefficients (SCORE_50_HIRINGS, SCORE_60_HIRINGS, ...)
    # whose values in the github repository are *fake*, for dev/test only.
    # The real values are confidential, stored outside of the github repo,
    # and only used in staging and production, so that the hirings behind a
    # production score *CANNOT* be guessed.
    # #########################################################################

    # General score/stars/hirings values (all rome_codes included).
    self.assertEqual(office.score, 71)
    self.assertEqual(office.stars, 3.55)
    self.assertEqual(scoring_util.get_hirings_from_score(office.score), 77.5)

    # Values adjusted for the current rome_code.
    stars_for_rome_code = office_json['stars']
    self.assertEqual(stars_for_rome_code,
                     office.get_stars_for_rome_code(rome_code))

    # Stars go from 0 to 5, scores from 0 to 100, hence the x20 factor.
    score_for_rome = stars_for_rome_code * 20.0
    self.assertEqual(score_for_rome, 3.0)
    self.assertEqual(scoring_util.get_hirings_from_score(score_for_rome), 0.6)

    # Adjusting for this rome decreased hirings from 77.5 (all rome_codes)
    # to 0.6 (this rome_code only). 0.6 is roughly 1% of 77.5, meaning that
    # on average companies of this naf_code hire ~1% in this rome_code and
    # ~99% in the other rome_codes associated to this naf_code.
    # Check that this ~1% ratio shows up in the rome-naf mapping data.
    naf_code = office.naf
    rome_codes = mapping_util.MANUAL_NAF_ROME_MAPPING[naf_code].keys()
    total_naf_hirings = sum(
        mapping_util.MANUAL_NAF_ROME_MAPPING[naf_code][rome]
        for rome in rome_codes)
    self.assertEqual(total_naf_hirings, 7844)
    current_rome_hirings = mapping_util.MANUAL_NAF_ROME_MAPPING[naf_code][rome_code]
    self.assertEqual(current_rome_hirings, 52)
def compute_effective_and_predicted_hirings():
    """Compare predicted hirings with effective hirings for each importer cycle.

    For every PerfImporterCycleInfos not yet computed (all of them in
    development/test, otherwise only those whose prediction window has ended),
    this function:
      - loads effective hirings from the `hirings` table over the cycle's
        prediction window (lbb: CDD/CDI contracts, lba: APR/CP contracts),
      - converts the stored prediction scores back into predicted hirings,
      - bulk-updates each PerfPredictionAndEffectiveHirings row,
      - flags offices as "bonne boite" / "bonne alternance" using the per
        rome/naf thresholds and persists per-division counters,
      - finally marks the cycle as computed.
    """
    # Updated rows are flushed to the DB in batches of this size.
    # (A stale comment used to say "10 000"; the actual size is 100 000.)
    batch_size = 100000

    logger.info("\n Start : Computing effective hirings")
    importer_cycles_infos = PerfImporterCycleInfos.query.filter(
        PerfImporterCycleInfos.computed.is_(False)).all()

    importer_cycles_infos_to_compute = []
    for ici in importer_cycles_infos:
        # In dev/test environments, compute every cycle regardless of dates.
        if os.environ["LBB_ENV"] in ["development", "test"]:
            importer_cycles_infos_to_compute.append(ici)
            continue
        if ici.prediction_end_date < datetime.now():
            importer_cycles_infos_to_compute.append(ici)

    logger.info(
        f"Importer cycles infos which have not been computed yet : {[i.file_name for i in importer_cycles_infos_to_compute]}"
    )

    for ici in importer_cycles_infos_to_compute:
        perf_division_per_rome_dict = load_perf_division_per_rome_dict()
        naf_not_founds = set()
        nb_companies_with_naf_not_found = 0
        logger.info(
            f"Start computing for importer cycle infos : {ici._id} - {ici.file_name}"
        )
        engine = import_util.create_sqlalchemy_engine()
        ppaeh = PerfPredictionAndEffectiveHirings.query.filter(
            PerfPredictionAndEffectiveHirings.importer_cycle_infos_id == ici._id)

        # Pull the prediction rows into plain columns for pandas, and keep the
        # ORM objects by id so we can set flags on them later.
        columns_companies = [
            "_id", "siret", "naf", "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score"
        ]
        dict_df_companies = {col: [] for col in columns_companies}
        dict_ppaeh = {}
        for perf in ppaeh:
            dict_ppaeh[perf._id] = perf
            for col in columns_companies:
                dict_df_companies[col].append(getattr(perf, col))
        del ppaeh

        df_companies_list = pd.DataFrame(data=dict_df_companies)
        logger.info(f"Nb offices to compute : {len(df_companies_list)}")

        # NOTE(review): dates and contract-type constants are interpolated into
        # raw SQL. They come from our own DB / code constants (not user input),
        # so injection risk is low, but parameterized queries would be safer.
        query_hirings_lbb = f"SELECT siret, count(*) as lbb_nb_effective_hirings \
            FROM hirings \
            WHERE hiring_date >= '{ici.prediction_start_date}' \
            and hiring_date <= '{ici.prediction_end_date}' \
            and (contract_type={Hiring.CONTRACT_TYPE_CDD} or contract_type={Hiring.CONTRACT_TYPE_CDI}) \
            GROUP BY siret;"
        df_hirings_lbb = pd.read_sql_query(query_hirings_lbb, engine)
        logger.info(
            f"Nb offices found in hirings for lbb : {len(df_hirings_lbb)}")

        query_hirings_lba = f"SELECT siret, count(*) as lba_nb_effective_hirings \
            FROM hirings \
            WHERE hiring_date >= '{ici.prediction_start_date}' \
            and hiring_date <= '{ici.prediction_end_date}' \
            and (contract_type={Hiring.CONTRACT_TYPE_APR} or contract_type={Hiring.CONTRACT_TYPE_CP}) \
            GROUP BY siret;"
        df_hirings_lba = pd.read_sql_query(query_hirings_lba, engine)
        logger.info(
            f"Nb offices found in hirings for lba: {len(df_hirings_lba)}")

        engine.close()

        # Left merges: offices with no hiring in the window get NaN counts,
        # replaced by 0 below.
        df_merge_hirings_tmp = pd.merge(df_companies_list,
                                        df_hirings_lbb,
                                        how='left',
                                        on="siret")
        df_merged = pd.merge(df_merge_hirings_tmp,
                             df_hirings_lba,
                             how='left',
                             on="siret")

        # Convert the stored prediction scores back into predicted hirings.
        df_merged["lbb_nb_predicted_hirings"] = df_merged[
            "lbb_nb_predicted_hirings_score"].apply(
                scoring_util.get_hirings_from_score)
        df_merged["lba_nb_predicted_hirings"] = df_merged[
            "lba_nb_predicted_hirings_score"].apply(
                scoring_util.get_hirings_from_score)

        df_merged = df_merged.fillna(0)
        cols_we_want_to_keep = [
            "_id",
            "siret",
            "naf",
            "lbb_nb_effective_hirings",
            "lba_nb_effective_hirings",
            "lbb_nb_predicted_hirings",
            "lba_nb_predicted_hirings",
            "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score",
        ]
        df_merged = df_merged[cols_we_want_to_keep]
        values_to_update = df_merged.values.tolist()

        count = 0
        updated_ppaeh = []
        for row in values_to_update:
            row_id = row[0]
            # row[1] is the siret — not needed in this loop.
            naf = row[2]
            params = dict(
                zip([
                    "lbb_nb_effective_hirings", "lba_nb_effective_hirings",
                    "lbb_nb_predicted_hirings", "lba_nb_predicted_hirings"
                ], row[3:7]))
            lbb_nb_predicted_hirings_score = row[7]
            lba_nb_predicted_hirings_score = row[8]
            pred_effective_hirings = dict_ppaeh[row_id]
            updated_values = {"_id": row_id}
            for key, val in params.items():
                updated_values[key] = val
            is_a_bonne_boite = False
            is_a_bonne_alternance = False

            # An office is a "bonne boite"/"bonne alternance" if its adjusted
            # score passes the threshold for at least one rome of its naf.
            naf_present_in_mapping_rome_naf = naf in perf_division_per_rome_dict
            if naf_present_in_mapping_rome_naf:
                for rome_code, values in perf_division_per_rome_dict[
                        naf].items():
                    score_lbb = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lbb_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lbb >= values["threshold_lbb"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lbb"] += 1
                        is_a_bonne_boite = True
                    score_lba = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lba_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lba >= values["threshold_lba"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lba"] += 1
                        is_a_bonne_alternance = True
            else:
                naf_not_founds.add(naf)
                nb_companies_with_naf_not_found += 1

            pred_effective_hirings.is_a_bonne_boite = is_a_bonne_boite
            pred_effective_hirings.is_a_bonne_alternance = is_a_bonne_alternance
            updated_values["is_a_bonne_boite"] = is_a_bonne_boite
            updated_values["is_a_bonne_alternance"] = is_a_bonne_alternance
            updated_ppaeh.append(updated_values)
            count += 1

            # Flush a full batch of updates to the DB.
            if len(updated_ppaeh) % batch_size == 0:
                logger.info(f"{count} companies have been treated")
                db_session.bulk_update_mappings(
                    PerfPredictionAndEffectiveHirings, updated_ppaeh)
                db_session.commit()
                updated_ppaeh = []

        # Flush the remaining (partial) batch.
        db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings,
                                        updated_ppaeh)
        db_session.commit()
        updated_ppaeh = []

        logger.info(
            f"Number of naf not found in the mapping rome naf for this importer cycle : {len(naf_not_founds)}"
        )
        logger.info(
            f"List of naf not found in the mapping rome naf for this importer cycle : {naf_not_founds}"
        )
        logger.info(
            f"Number of companies with naf not found in the mapping rome naf for this importer cycle : {nb_companies_with_naf_not_found}"
        )
        logger.info(f"Number of total companies : {count}")

        # Persist the per naf/rome thresholds and "bonne boite" counters.
        for naf_code, romes_list in perf_division_per_rome_dict.items():
            for rome_code, values in romes_list.items():
                division_per_rome = PerfDivisionPerRome(
                    importer_cycle_infos_id=ici._id,
                    naf=naf_code,
                    rome=rome_code,
                    threshold_lbb=values["threshold_lbb"],
                    threshold_lba=values["threshold_lba"],
                    nb_bonne_boites_lbb=values["nb_bonne_boites_lbb"],
                    nb_bonne_boites_lba=values["nb_bonne_boites_lba"],
                )
                db_session.add(division_per_rome)
                # NOTE(review): committing once after the loop would be cheaper
                # than per-row; kept as-is to preserve the original behavior.
                db_session.commit()

        ici.computed = True
        db_session.add(ici)
        db_session.commit()
def get_true_hirings(row):
    """Return the hirings count derived from the row's 'score' column."""
    score = row['score']
    return scoring_util.get_hirings_from_score(score)