Code example #1
def get_total_hirings_per_office():
    engine = import_util.create_sqlalchemy_engine()

    query = "select\
                siret,\
                raisonsociale,\
                enseigne,\
                email,\
                tel,\
                website,\
                codenaf,\
                codepostal,\
                codecommune,\
                trancheeffectif,\
                greatest(0, floor(score_regr)) as total_hirings\
             from \
                etablissements_backoffice"

    # In local dev we keep every office, even those with a zero score;
    # in all other environments we keep only offices with score > 0.
    if get_current_env() != ENV_DEVELOPMENT:
        query += " where greatest(0, floor(score_regr)) > 0;"

    df_total_hirings = pd.read_sql_query(query, engine)

    engine.close()
    logger.info("Datas selected from etablissements_backoffice")
    print(df_total_hirings)
    return df_total_hirings
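
A minimal usage sketch (hypothetical caller; assumes the function above is importable from its module):

# Illustrative only: sum the total_hirings column returned above.
df_offices = get_total_hirings_per_office()
print(df_offices["total_hirings"].sum())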
Code example #2
def get_nb_entreprise_par_cycle_et_naf_ou_dep_isLBX(colonne, nom, is_lbb):
    filter_lbx = "is_a_bonne_boite" if is_lbb else "is_a_bonne_alternance"
    prefix_columns = "lbb" if is_lbb else "lba"
    engine = import_util.create_sqlalchemy_engine()
    if colonne == "global":
        query = f'SELECT importer_cycle_infos_id as cycle,sum({prefix_columns}_nb_effective_hirings) as nbTotalLBXHirings, count(*) as nbTotalLBX \
                FROM perf_prediction_and_effective_hirings ppaeh \
        INNER JOIN perf_importer_cycle_infos pici on pici.id = ppaeh.importer_cycle_infos_id \
        WHERE pici.computed is true \
        AND pici.on_google_sheets is false \
                AND {filter_lbx} is true \
                GROUP BY importer_cycle_infos_id;'

    else:  # colonne == naf or departement
        query = f'SELECT importer_cycle_infos_id as cycle , {colonne} as {nom} ,sum({prefix_columns}_nb_effective_hirings) as nbTotalLBXHirings, count(*) as nbTotalLBX \
                FROM perf_prediction_and_effective_hirings ppaeh \
        INNER JOIN perf_importer_cycle_infos pici on pici.id = ppaeh.importer_cycle_infos_id \
        WHERE pici.computed is true \
        AND pici.on_google_sheets is false \
                AND {filter_lbx} is true \
                GROUP BY importer_cycle_infos_id , {colonne};'

    df_nb_entreprise_isLBX = pd.read_sql_query(query, engine)
    engine.close()
    return df_nb_entreprise_isLBX
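
Illustrative calls (the column/alias arguments are hypothetical, chosen to match the columns seen in the other examples):

# "global" aggregates per cycle; any other value also groups by that column.
df_global = get_nb_entreprise_par_cycle_et_naf_ou_dep_isLBX("global", None, is_lbb=True)
df_by_naf = get_nb_entreprise_par_cycle_et_naf_ou_dep_isLBX("codenaf", "naf", is_lbb=False)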
Code example #3
    def needs_parse_json_activity_log(self, json_file_name):
        '''Take one JSON file name and check whether it needs to be parsed
        and saved in the database.
        '''

        # json_file_name format: activity-lbb-2019.09.13.json
        date = json_file_name.replace('activity-lbb-',
                                      '').replace('.json',
                                                  '').replace('.', '-')

        date_in_db_query = 'select dateheure\
                            from logs_activity\
                            where date(dateheure) = "{}"\
                            ORDER BY dateheure desc\
                            LIMIT 1'.format(date)

        engine = import_util.create_sqlalchemy_engine()
        row = engine.execute(date_in_db_query).fetchone()
        engine.close()

        file_needs_to_be_parsed = False
        if row is None:
            file_needs_to_be_parsed = True
        else:
            # check the last hour of the activity logs already in the database
            hour_recorded_activity = row[0].hour
            # if the most recent hour recorded for that day is 3am or earlier,
            # the file still needs to be parsed
            if hour_recorded_activity <= 3:
                file_needs_to_be_parsed = True

        return file_needs_to_be_parsed
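
A quick sanity check of the filename-to-date parsing used above (illustrative only):

json_file_name = 'activity-lbb-2019.09.13.json'
date = json_file_name.replace('activity-lbb-', '').replace('.json', '').replace('.', '-')
assert date == '2019-09-13'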
Code example #4
    def get_logs_activities_by_sirets(self, sirets):
        if len(sirets) > 1:
            sub_query = "in %s" % str(tuple(sirets))
        else:
            # quote the single value: str(tuple(...)) already quotes the
            # sirets in the multi-siret branch above
            sub_query = "= '%s'" % sirets[0]
        engine = import_util.create_sqlalchemy_engine()
        query = "select * from logs_activity where siret %s order by dateheure asc;" % sub_query
        df_activity = pd.read_sql_query(query, engine)
        engine.close()
        return df_activity.astype(str)
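
A parameterized sketch of the same query (an assumption, not the original method; requires SQLAlchemy >= 1.2 for expanding bind parameters). An expanding bind handles the single- and multi-siret cases uniformly and avoids interpolating values into the SQL string:

from sqlalchemy.sql import text, bindparam

def get_logs_activities_by_sirets_parameterized(engine, sirets):
    # Hypothetical standalone variant taking an already-created engine.
    query = text(
        "select * from logs_activity where siret in :sirets order by dateheure asc"
    ).bindparams(bindparam("sirets", expanding=True))
    return pd.read_sql_query(query, engine, params={"sirets": list(sirets)})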
Code example #5
    def insert_in_database(self, dataframe, table_name):
        engine = import_util.create_sqlalchemy_engine()

        dataframe.to_sql(con=engine,
                         name=table_name,
                         if_exists='append',
                         index=False,
                         chunksize=10000)

        engine.close()
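
Example call (illustrative; the instance and table names are hypothetical):

df = pd.DataFrame({"siret": ["12345678901234"], "total_hirings": [42]})
importer.insert_in_database(df, "my_table")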
Code example #6
def get_cycle_infos():
    engine = import_util.create_sqlalchemy_engine()
    query = 'SELECT id as cycle, execution_date as dateExecution \
            FROM perf_importer_cycle_infos \
            WHERE on_google_sheets = 0 \
            and computed is true;'

    df_predict = pd.read_sql_query(query, engine)
    engine.close()
    return df_predict
Code example #7
    def get_df_evol_idpe_connect(self):
        # Get the evolution of the number of IDPEC (idutilisateur_peconnect) logging into LBB
        engine = import_util.create_sqlalchemy_engine()
        query = 'SELECT count(DISTINCT idutilisateur_peconnect) as count_distinct_idpe, concat(MONTH(dateheure),"-",YEAR(dateheure)) as date_month\
                FROM logs_idpe_connect\
                GROUP BY MONTH(dateheure), YEAR(dateheure)\
                ORDER BY YEAR(dateheure), MONTH(dateheure);'

        df_evol_idpe_connect = pd.read_sql_query(query, engine)
        engine.close()
        return df_evol_idpe_connect
Code example #8
    def get_df_evol_dpae(self):
        # Get the evolution of the number of DPAE linked to LBB activity
        engine = import_util.create_sqlalchemy_engine()
        query = 'SELECT count(idutilisateur_peconnect) as count_dpae_lbb, concat(MONTH(date_embauche),"-",YEAR(date_embauche)) as date_month \
                FROM logs_activity_dpae_clean\
                GROUP BY MONTH(date_embauche), YEAR(date_embauche)\
                ORDER BY YEAR(date_embauche), MONTH(date_embauche);'

        df_evol_dpae = pd.read_sql_query(query, engine)
        engine.close()
        return df_evol_dpae
Code example #9
    def save_activity_logs(self):

        engine = import_util.create_sqlalchemy_engine()
        self.df_dpae_act.reset_index(drop=True, inplace=True)
        self.df_dpae_act.to_sql(con=engine,
                                name=TABLE_NAME,
                                if_exists='replace',
                                index=True,
                                index_label='id',
                                chunksize=10000)
        engine.close()
Code example #10
    def check_existing_data_in_table(self):
        engine = import_util.create_sqlalchemy_engine()

        query = f"SELECT COUNT(*) FROM {TABLE_NAME}"

        existing_sql_table = False
        if engine.execute(query).fetchone()[0] > 0:  # table exists and has rows
            existing_sql_table = True

        engine.close()

        return existing_sql_table
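
COUNT(*) raises if TABLE_NAME has not been created yet; a hedged variant that treats a missing table like an empty one could look like this (standalone sketch, not the original method):

from sqlalchemy.exc import ProgrammingError

def check_existing_data_in_table_safe(engine, table_name):
    # Returns False both for an empty table and for a missing one.
    try:
        return engine.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] > 0
    except ProgrammingError:
        return False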
Code example #11
def insert_into_etablissements_predicted_and_effective_hirings(
        importer_cycle_infos_id, file_name):
    logger.info(
        f"\n Start : Insert data into perf_prediction_and_effective_hirings from file {file_name}"
    )

    query = 'SELECT siret, \
                    raisonsociale, \
                    enseigne, \
                    codenaf, \
                    codepostal, \
                    codecommune, \
                    departement, \
                    score, \
                    score_alternance \
               FROM etablissements_new;'

    engine = import_util.create_sqlalchemy_engine()
    df = pd.read_sql_query(query, engine)
    engine.close()

    df['importer_cycle_infos_id'] = importer_cycle_infos_id
    df = df.rename(
        columns={
            "score": "lbb_nb_predicted_hirings_score",
            "score_alternance": "lba_nb_predicted_hirings_score"
        })

    df.reset_index(drop=True, inplace=True)
    engine = import_util.create_sqlalchemy_engine()
    df.to_sql(con=engine,
              name="perf_prediction_and_effective_hirings",
              if_exists='append',
              index=False,
              chunksize=1000)
    engine.close()

    logger.info("Insertion into perf_prediction_and_effective_hirings OK")
    logger.info(f"Insertion of {len(df.index)} rows ")
Code example #12
    def get_last_recorded_hiring_date(self):

        # Check whether there is already data in the final table
        table_name_act_dpae = LogsActivityDPAEClean.__tablename__
        query = f"SELECT COUNT(*) FROM {table_name_act_dpae}"
        engine = import_util.create_sqlalchemy_engine()

        # If there is data in the table
        if engine.execute(query).fetchone()[0] > 0:
            query = f"select date_embauche from {table_name_act_dpae} order by date_embauche DESC LIMIT 1"
            row = engine.execute(query).fetchone()
            # date_embauche comes back as a "YYYY-MM-DD HH:MM:SS" string; keep the date part
            date_last_recorded_hiring = row[0].split()[0]
        # Else no data
        else:
            # We set the date to the first activity log that has ever been created on LBB
            date_last_recorded_hiring = "2018-08-31"

        engine.close()
        logger.info(f"the most recent date found is {date_last_recorded_hiring}")

        return date_last_recorded_hiring
Code example #13
    def get_activity_logs(self):
        engine = import_util.create_sqlalchemy_engine()

        query = "select * from logs_activity"
        if DEBUG:
            query += " ORDER BY RAND() LIMIT 10000"
        df_activity = pd.read_sql_query(query, engine)

        engine.close()

        # TODO : Define how long we should use logs
        # https://valodata.slack.com/archives/C0QR8RYL8/p1562319224015200

        # TODO : Uncomment these lines after the first initialization of the project
        # one_year_date = datetime.date.today() - datetime.timedelta(365)
        # df_activity['dateheure'] = pd.to_datetime(df_activity['dateheure'])
        # df_activity = df_activity[df_activity.dateheure > one_year_date]

        logger.info('Activities logs are loaded')

        return df_activity.astype(str)
Code example #14
def get_predict_par_cycle_et_naf_ou_dep(cycle,
                                        colonne,
                                        value=None,
                                        is_lbb=True):
    prefix_columns = "lbb" if is_lbb else "lba"
    engine = import_util.create_sqlalchemy_engine()
    if colonne == "global":
        query = f'SELECT {prefix_columns}_nb_predicted_hirings as predict, {prefix_columns}_nb_effective_hirings as effective, codenaf, departement \
                FROM perf_prediction_and_effective_hirings \
                WHERE importer_cycle_infos_id = {cycle}\
                ORDER BY {prefix_columns}_nb_predicted_hirings desc '

    else:
        query = f'SELECT {prefix_columns}_nb_predicted_hirings as predict, {prefix_columns}_nb_effective_hirings as effective, codenaf, departement \
                FROM perf_prediction_and_effective_hirings \
                WHERE importer_cycle_infos_id = {cycle} and {colonne} = "{value}"  \
                ORDER BY {prefix_columns}_nb_predicted_hirings desc'

    df_predict = pd.read_sql_query(query, engine)
    engine.close()
    return df_predict
Code example #15
def get_sum_predict_par_cycle_et_naf_ou_dep(cycle,
                                            colonne,
                                            nom=None,
                                            value=None,
                                            is_lbb=True):
    prefix_columns = "lbb" if is_lbb else "lba"
    engine = import_util.create_sqlalchemy_engine()
    if colonne == "global":
        query = f'SELECT importer_cycle_infos_id as cycle, sum({prefix_columns}_nb_predicted_hirings) as sommeTotal \
                FROM perf_prediction_and_effective_hirings ppaeh \
                WHERE importer_cycle_infos_id = {cycle} \
                GROUP BY importer_cycle_infos_id;'

    else:  # colonne == naf or departement
        query = f'SELECT importer_cycle_infos_id as cycle , {colonne} as {nom},sum({prefix_columns}_nb_predicted_hirings) as sommeTotal \
                FROM perf_prediction_and_effective_hirings ppaeh \
                WHERE importer_cycle_infos_id = {cycle} and {colonne} = "{value}"  \
                GROUP BY importer_cycle_infos_id , {colonne};'

    df_sum_predict = pd.read_sql_query(query, engine)
    engine.close()
    return df_sum_predict
Code example #16
    def get_old_activities_logs_saved(self):
        engine = import_util.create_sqlalchemy_engine()

        query = f"select idutilisateur_peconnect,\
                         siret, date_activite,\
                         date_embauche,\
                         type_contrat,\
                         duree_activite_cdd_mois,\
                         duree_activite_cdd_jours,\
                         diff_activite_embauche_jrs,\
                         dc_lblprioritede,\
                         tranche_age,\
                         dc_prive_public,\
                         duree_prise_en_charge,\
                         dn_tailleetablissement,\
                         code_postal from {TABLE_NAME}"

        df_dpae_act_existing = pd.read_sql_query(query, engine)
        df_dpae_act_existing['siret'] = df_dpae_act_existing.siret.astype(str)
        engine.close()

        return df_dpae_act_existing
Code example #17
    def get_df_evol_idpe_connect_sign(self, did_specific_activity=None):
        # Get the evolution of the number of significant activities about a company
        engine = import_util.create_sqlalchemy_engine()

        if did_specific_activity is not None:
            specific_column_name = did_specific_activity.replace('-', '_')
            column_name = f'count_distinct_activity_{specific_column_name}'
        else:
            column_name = 'count_distinct_activity'

        query = f'SELECT count(DISTINCT idutilisateur_peconnect) as {column_name},\
                        concat(MONTH(dateheure),"-",YEAR(dateheure)) as date_month\
                 FROM logs_activity '

        if did_specific_activity is not None:
            query += f'WHERE nom="{did_specific_activity}"'

        query += ' GROUP BY MONTH(dateheure), YEAR(dateheure)\
                   ORDER BY YEAR(dateheure), MONTH(dateheure);'

        df_evol_idpe_connect_sign = pd.read_sql_query(query, engine)
        engine.close()
        return df_evol_idpe_connect_sign
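
Illustrative calls (the instance and the activity name are hypothetical; any dash in the activity name becomes an underscore in the count column alias):

df_all = stats.get_df_evol_idpe_connect_sign()
df_details = stats.get_df_evol_idpe_connect_sign(did_specific_activity='afficher-details')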
Code example #18
def compute_effective_and_predicted_hirings():
    logger.info(f"\n Start : Computing effective hirings")

    importer_cycles_infos = PerfImporterCycleInfos.query.filter(
        PerfImporterCycleInfos.computed == False).all()
    importer_cycles_infos_to_compute = []
    for ici in importer_cycles_infos:
        if os.environ["LBB_ENV"] in ["development", "test"]:
            importer_cycles_infos_to_compute.append(ici)
            continue
        if ici.prediction_end_date < datetime.now():
            importer_cycles_infos_to_compute.append(ici)

    logger.info(
        f"Importer cycles infos which have not been computed yet : {[i.file_name for i in importer_cycles_infos_to_compute]}"
    )

    for ici in importer_cycles_infos_to_compute:
        perf_division_per_rome_dict = load_perf_division_per_rome_dict()

        naf_not_founds = set()
        nb_companies_with_naf_not_found = 0

        logger.info(
            f"Start computing for importer cycle infos : {ici._id} - {ici.file_name}"
        )

        engine = import_util.create_sqlalchemy_engine()
        ppaeh = PerfPredictionAndEffectiveHirings.query.filter(
            PerfPredictionAndEffectiveHirings.importer_cycle_infos_id ==
            ici._id)
        columns_companies = [
            "_id", "siret", "naf", "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score"
        ]
        dict_df_companies = {}
        dict_ppaeh = {}
        for col in columns_companies:
            dict_df_companies[col] = []
        for perf in ppaeh:
            dict_ppaeh[perf._id] = perf
            for col in columns_companies:
                dict_df_companies[col].append(getattr(perf, col))
        del ppaeh
        df_companies_list = pd.DataFrame(data=dict_df_companies)

        logger.info(f"Nb offices to compute : {len(df_companies_list)}")

        query_hirings_lbb = f"SELECT siret, count(*) as lbb_nb_effective_hirings \
                FROM hirings\
                WHERE hiring_date >= '{ici.prediction_start_date}'\
                and hiring_date <= '{ici.prediction_end_date}'\
                and (contract_type={Hiring.CONTRACT_TYPE_CDD} or contract_type={Hiring.CONTRACT_TYPE_CDI})\
                GROUP BY siret;"

        df_hirings_lbb = pd.read_sql_query(query_hirings_lbb, engine)
        logger.info(
            f"Nb offices found in hirings for lbb : {len(df_hirings_lbb)}")

        query_hirings_lba = f"SELECT siret, count(*) as lba_nb_effective_hirings \
                FROM hirings\
                WHERE hiring_date >= '{ici.prediction_start_date}'\
                and hiring_date <= '{ici.prediction_end_date}'\
                and (contract_type={Hiring.CONTRACT_TYPE_APR} or contract_type={Hiring.CONTRACT_TYPE_CP})\
                GROUP BY siret;"

        df_hirings_lba = pd.read_sql_query(query_hirings_lba, engine)
        logger.info(
            f"Nb offices found in hirings for lba: {len(df_hirings_lba)}")

        engine.close()

        df_merge_hirings_tmp = pd.merge(df_companies_list,
                                        df_hirings_lbb,
                                        how='left',
                                        on="siret")
        df_merged = pd.merge(df_merge_hirings_tmp,
                             df_hirings_lba,
                             how='left',
                             on="siret")

        # Compute the predicted hirings from the score
        df_merged["lbb_nb_predicted_hirings"] = df_merged[
            "lbb_nb_predicted_hirings_score"].apply(
                lambda x: scoring_util.get_hirings_from_score(x))
        df_merged["lba_nb_predicted_hirings"] = df_merged[
            "lba_nb_predicted_hirings_score"].apply(
                lambda x: scoring_util.get_hirings_from_score(x))

        df_merged = df_merged.fillna(0)

        cols_we_want_to_keep = [
            "_id",
            "siret",
            "naf",
            "lbb_nb_effective_hirings",
            "lba_nb_effective_hirings",
            "lbb_nb_predicted_hirings",
            "lba_nb_predicted_hirings",
            "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score",
        ]

        df_merged = df_merged[cols_we_want_to_keep]

        values_to_update = df_merged.values.tolist()
        count = 0

        updated_ppaeh = []
        for row in values_to_update:
            row_id = row[0]
            siret = row[1]
            naf = row[2]
            params = dict(
                zip([
                    "lbb_nb_effective_hirings", "lba_nb_effective_hirings",
                    "lbb_nb_predicted_hirings", "lba_nb_predicted_hirings"
                ], row[3:7]))
            lbb_nb_predicted_hirings_score = row[7]
            lba_nb_predicted_hirings_score = row[8]
            # fetch the ORM object for this row so its flags can be updated below
            pred_effective_hirings = dict_ppaeh[row_id]
            updated_values = {"_id": row_id}
            for key, val in params.items():
                updated_values[key] = val
            is_a_bonne_boite = False
            is_a_bonne_alternance = False

            naf_present_in_mapping_rome_naf = naf in perf_division_per_rome_dict

            if naf_present_in_mapping_rome_naf:
                for rome_code, values in perf_division_per_rome_dict[
                        naf].items():
                    score_lbb = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lbb_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lbb >= values["threshold_lbb"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lbb"] += 1
                        is_a_bonne_boite = True

                    score_lba = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lba_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lba >= values["threshold_lba"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lba"] += 1
                        is_a_bonne_alternance = True
            else:
                naf_not_founds.add(naf)
                nb_companies_with_naf_not_found += 1
            pred_effective_hirings.is_a_bonne_boite = is_a_bonne_boite
            pred_effective_hirings.is_a_bonne_alternance = is_a_bonne_alternance
            updated_values["is_a_bonne_boite"] = is_a_bonne_boite
            updated_values["is_a_bonne_alternance"] = is_a_bonne_alternance

            updated_ppaeh.append(updated_values)
            count += 1
            # Flush the updates to the database in batches of 100 000 rows
            if len(updated_ppaeh) % 100000 == 0:
                logger.info(f"{count} companies have been treated")
                db_session.bulk_update_mappings(
                    PerfPredictionAndEffectiveHirings, updated_ppaeh)
                db_session.commit()
                updated_ppaeh = []

        # Commit for the remaining rows
        db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings,
                                        updated_ppaeh)
        db_session.commit()
        updated_ppaeh = []

        logger.info(
            f"Number of naf not found in the mapping rome naf for this importer cycle : {len(naf_not_founds)}"
        )
        logger.info(
            f"List of naf not found in the mapping rome naf for this importer cycle : {naf_not_founds}"
        )
        logger.info(
            f"Number of companies with naf not found in the mapping rome naf for this importer cycle : {nb_companies_with_naf_not_found}"
        )
        logger.info(f"Number of total companies : {count}")

        for naf_code, romes_list in perf_division_per_rome_dict.items():
            for rome_code, values in romes_list.items():
                division_per_rome = PerfDivisionPerRome(
                    importer_cycle_infos_id=ici._id,
                    naf=naf_code,
                    rome=rome_code,
                    threshold_lbb=values["threshold_lbb"],
                    threshold_lba=values["threshold_lba"],
                    nb_bonne_boites_lbb=values["nb_bonne_boites_lbb"],
                    nb_bonne_boites_lba=values["nb_bonne_boites_lba"],
                )
                db_session.add(division_per_rome)

        db_session.commit()

        ici.computed = True
        db_session.add(ici)
        db_session.commit()
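
The commit batching used above, reduced to its skeleton (illustrative; `updated_values_iterable` is a hypothetical stand-in for the per-row update dicts built in the loop):

BATCH_SIZE = 100000  # matches the modulus used in the loop above
batch = []
for mapping in updated_values_iterable:
    batch.append(mapping)
    if len(batch) % BATCH_SIZE == 0:
        db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings, batch)
        db_session.commit()
        batch = []
if batch:  # flush the remainder
    db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings, batch)
    db_session.commit()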