コード例 #1
0
 def test_converting_hirings_into_scores_back_and_forth(self):
     """Score -> hirings -> score must round-trip exactly for every
     integer score in [0, 100] (bucketing disabled on the way back)."""
     for expected_score in range(0, 101):
         hirings = scoring_util.get_hirings_from_score(expected_score)
         round_tripped = scoring_util.get_score_from_hirings(
             hirings, skip_bucketing=True)
         self.assertEqual(expected_score, round_tripped)
コード例 #2
0
    def test_query_returns_scores_adjusted_to_rome_code_context(self):
        """The API must return stars adjusted to the requested rome_code,
        not the company's all-rome aggregate score."""
        rome_code = u'D1405'
        params = self.add_security_params({
            'commune_id': self.positions['caen']['commune_id'],
            'distance': 20,
            'page': 1,
            'page_size': 2,
            'rome_codes': rome_code,
            'user': u'labonneboite',
        })
        response = self.app.get('/api/v1/company/?%s' % urlencode(params))
        self.assertEqual(response.status_code, 200)
        payload = json.loads(response.data)
        self.assertEqual(payload['companies_count'], 1)
        self.assertEqual(len(payload['companies']), 1)

        company_json = payload['companies'][0]
        office = Office.query.get(company_json['siret'])

        # -------- WARNING about matching scores vs hirings --------
        # scoring_util.get_hirings_from_score and
        # scoring_util.get_score_from_hirings rely on special
        # coefficients (SCORE_50_HIRINGS, SCORE_60_HIRINGS, ...) whose
        # values in the github repository are *fake* and used for dev
        # and test only.
        #
        # The real values are confidential, stored outside of the
        # github repo, and only used in staging and production.
        #
        # This is designed so that you *CANNOT* guess the hirings based
        # on the score you see in production.
        # ----------------------------------------------------------

        # General score/stars/hirings values (all rome_codes included).
        self.assertEqual(office.score, 71)
        self.assertEqual(office.stars, 3.55)
        self.assertEqual(scoring_util.get_hirings_from_score(office.score), 77.5)

        # Values adjusted for the current rome_code.
        adjusted_stars = company_json['stars']
        self.assertEqual(adjusted_stars, office.get_stars_for_rome_code(rome_code))
        # Stars run from 0 to 5, scores from 0 to 100, hence the x20.
        adjusted_score = adjusted_stars * 20.0
        self.assertEqual(adjusted_score, 3.0)
        self.assertEqual(scoring_util.get_hirings_from_score(adjusted_score), 0.6)

        # Adjusting for this rome decreased hirings from 77.5 (all
        # rome_codes included) to 0.6 (this rome_code only).
        #
        # 0.6 is approx 1% of 77.5, meaning that on average companies
        # of this naf_code hire 1% in this rome_code and 99% in all
        # other rome_codes associated to this naf_code.
        #
        # Check that the same ~1% ratio shows up in the rome-naf
        # mapping data.
        naf = office.naf
        romes_of_naf = mapping_util.MANUAL_NAF_ROME_MAPPING[naf].keys()
        hirings_for_naf = sum(
            mapping_util.MANUAL_NAF_ROME_MAPPING[naf][rome] for rome in romes_of_naf)
        self.assertEqual(hirings_for_naf, 7844)
        hirings_for_rome = mapping_util.MANUAL_NAF_ROME_MAPPING[naf][rome_code]
        self.assertEqual(hirings_for_rome, 52)
コード例 #3
0
def compute_effective_and_predicted_hirings():
    """Compute effective and predicted hirings for pending importer cycles.

    For each PerfImporterCycleInfos not yet computed (every one of them in
    development/test, otherwise only those whose prediction window has ended):

    - load the cycle's companies and their predicted-hirings scores,
    - count effective hirings over the prediction window (CDD/CDI contracts
      for lbb, APR/CP contracts for lba),
    - derive predicted hirings from the stored scores,
    - flag each company as "bonne boite"/"bonne alternance" against the
      per-rome thresholds, persist the updated rows in batches, then persist
      the per-(naf, rome) division counters,
    - finally mark the cycle as computed.
    """
    logger.info("\n Start : Computing effective hirings")

    importer_cycles_infos = PerfImporterCycleInfos.query.filter(
        PerfImporterCycleInfos.computed == False).all()  # noqa: E712 (SQLAlchemy expression)
    importer_cycles_infos_to_compute = []
    for ici in importer_cycles_infos:
        # In dev/test every pending cycle is computed, regardless of dates;
        # elsewhere only cycles whose prediction window is over.
        if os.environ["LBB_ENV"] in ["development", "test"]:
            importer_cycles_infos_to_compute.append(ici)
            continue
        if ici.prediction_end_date < datetime.now():
            importer_cycles_infos_to_compute.append(ici)

    logger.info(
        f"Importer cycles infos which have not been computed yet : {[i.file_name for i in importer_cycles_infos_to_compute]}"
    )

    for ici in importer_cycles_infos_to_compute:
        # Fresh per-cycle thresholds/counters: {naf: {rome: {...}}}.
        perf_division_per_rome_dict = load_perf_division_per_rome_dict()

        naf_not_founds = set()
        nb_companies_with_naf_not_found = 0

        logger.info(
            f"Start computing for importer cycle infos : {ici._id} - {ici.file_name}"
        )

        engine = import_util.create_sqlalchemy_engine()
        ppaeh = PerfPredictionAndEffectiveHirings.query.filter(
            PerfPredictionAndEffectiveHirings.importer_cycle_infos_id ==
            ici._id)
        columns_companies = [
            "_id", "siret", "naf", "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score"
        ]
        # Column-oriented accumulation so a DataFrame can be built in one go.
        dict_df_companies = {col: [] for col in columns_companies}
        dict_ppaeh = {}
        for perf in ppaeh:
            dict_ppaeh[perf._id] = perf
            for col in columns_companies:
                dict_df_companies[col].append(getattr(perf, col))
        del ppaeh
        df_companies_list = pd.DataFrame(data=dict_df_companies)

        logger.info(f"Nb offices to compute : {len(df_companies_list)}")

        # NOTE(review): values are interpolated straight into the SQL below.
        # They come from our own model (dates and integer contract-type
        # constants), not user input, but parameterized queries would be
        # safer against accidental injection.
        query_hirings_lbb = f"SELECT siret, count(*) as lbb_nb_effective_hirings \
                FROM hirings\
                WHERE hiring_date >= '{ici.prediction_start_date}'\
                and hiring_date <= '{ici.prediction_end_date}'\
                and (contract_type={Hiring.CONTRACT_TYPE_CDD} or contract_type={Hiring.CONTRACT_TYPE_CDI})\
                GROUP BY siret;"

        df_hirings_lbb = pd.read_sql_query(query_hirings_lbb, engine)
        logger.info(
            f"Nb offices found in hirings for lbb : {len(df_hirings_lbb)}")

        query_hirings_lba = f"SELECT siret, count(*) as lba_nb_effective_hirings \
                FROM hirings\
                WHERE hiring_date >= '{ici.prediction_start_date}'\
                and hiring_date <= '{ici.prediction_end_date}'\
                and (contract_type={Hiring.CONTRACT_TYPE_APR} or contract_type={Hiring.CONTRACT_TYPE_CP})\
                GROUP BY siret;"

        df_hirings_lba = pd.read_sql_query(query_hirings_lba, engine)
        logger.info(
            f"Nb offices found in hirings for lba: {len(df_hirings_lba)}")

        engine.close()

        # Left joins keep every company, even those without any hiring
        # in the window (their effective hirings become NaN, then 0).
        df_merge_hirings_tmp = pd.merge(df_companies_list,
                                        df_hirings_lbb,
                                        how='left',
                                        on="siret")
        df_merged = pd.merge(df_merge_hirings_tmp,
                             df_hirings_lba,
                             how='left',
                             on="siret")

        # Compute the predicted hirings from the score.
        df_merged["lbb_nb_predicted_hirings"] = df_merged[
            "lbb_nb_predicted_hirings_score"].apply(
                scoring_util.get_hirings_from_score)
        df_merged["lba_nb_predicted_hirings"] = df_merged[
            "lba_nb_predicted_hirings_score"].apply(
                scoring_util.get_hirings_from_score)

        df_merged = df_merged.fillna(0)

        cols_we_want_to_keep = [
            "_id",
            "siret",
            "naf",
            "lbb_nb_effective_hirings",
            "lba_nb_effective_hirings",
            "lbb_nb_predicted_hirings",
            "lba_nb_predicted_hirings",
            "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score",
        ]

        df_merged = df_merged[cols_we_want_to_keep]

        values_to_update = df_merged.values.tolist()
        count = 0

        updated_ppaeh = []
        for row in values_to_update:
            # Row layout follows cols_we_want_to_keep; row[1] (siret) is
            # not needed here.
            row_id = row[0]
            naf = row[2]
            params = dict(
                zip([
                    "lbb_nb_effective_hirings", "lba_nb_effective_hirings",
                    "lbb_nb_predicted_hirings", "lba_nb_predicted_hirings"
                ], row[3:7]))
            lbb_nb_predicted_hirings_score = row[7]
            lba_nb_predicted_hirings_score = row[8]
            pred_effective_hirings = dict_ppaeh[row_id]
            updated_values = {"_id": row_id}
            updated_values.update(params)
            is_a_bonne_boite = False
            is_a_bonne_alternance = False

            naf_present_in_mapping_rome_naf = naf in perf_division_per_rome_dict

            if naf_present_in_mapping_rome_naf:
                # Compare the rome-adjusted scores against each rome's
                # threshold, counting "bonne boite"/"bonne alternance" hits.
                for rome_code, values in perf_division_per_rome_dict[
                        naf].items():
                    score_lbb = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lbb_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lbb >= values["threshold_lbb"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lbb"] += 1
                        is_a_bonne_boite = True

                    score_lba = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lba_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lba >= values["threshold_lba"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lba"] += 1
                        is_a_bonne_alternance = True
            else:
                naf_not_founds.add(naf)
                nb_companies_with_naf_not_found += 1
            pred_effective_hirings.is_a_bonne_boite = is_a_bonne_boite
            pred_effective_hirings.is_a_bonne_alternance = is_a_bonne_alternance
            updated_values["is_a_bonne_boite"] = is_a_bonne_boite
            updated_values["is_a_bonne_alternance"] = is_a_bonne_alternance

            updated_ppaeh.append(updated_values)
            count += 1
            # Flush every 100 000 updated rows to bound memory usage.
            if len(updated_ppaeh) == 100000:
                logger.info(f"{count} companies have been treated")
                db_session.bulk_update_mappings(
                    PerfPredictionAndEffectiveHirings, updated_ppaeh)
                db_session.commit()
                updated_ppaeh = []

        # Commit for the remaining rows.
        db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings,
                                        updated_ppaeh)
        db_session.commit()
        updated_ppaeh = []

        logger.info(
            f"Number of naf not found in the mapping rome naf for this importer cycle : {len(naf_not_founds)}"
        )
        logger.info(
            f"List of naf not found in the mapping rome naf for this importer cycle : {naf_not_founds}"
        )
        logger.info(
            f"Number of companies with naf not found in the mapping rome naf for this importer cycle : {nb_companies_with_naf_not_found}"
        )
        logger.info(f"Number of total companies : {count}")

        # Persist the per-(naf, rome) thresholds and counters for this cycle.
        for naf_code, romes_list in perf_division_per_rome_dict.items():
            for rome_code, values in romes_list.items():
                division_per_rome = PerfDivisionPerRome(
                    importer_cycle_infos_id=ici._id,
                    naf=naf_code,
                    rome=rome_code,
                    threshold_lbb=values["threshold_lbb"],
                    threshold_lba=values["threshold_lba"],
                    nb_bonne_boites_lbb=values["nb_bonne_boites_lbb"],
                    nb_bonne_boites_lba=values["nb_bonne_boites_lba"],
                )
                db_session.add(division_per_rome)

        db_session.commit()

        ici.computed = True
        db_session.add(ici)
        db_session.commit()
コード例 #4
0
def get_true_hirings(row):
    """Return the effective (true) hirings derived from the row's 'score'."""
    score = row['score']
    return scoring_util.get_hirings_from_score(score)