Example No. 1
def run_main():
    import_util.clean_temporary_tables()
    task = ScoreComputingJob()
    results = task.run()
    no_results = []
    departements = []
    for departement, result in results:
        departements.append(departement)
        if not result:
            no_results.append(departement)
    if len(no_results) > settings.MAXIMUM_COMPUTE_SCORE_JOB_FAILURES:
        results = set(departements) - set(no_results)
        logger.warning(
            "compute_scores by departement : %i failures (%s) vs %i successes (%s), aborting...",
            len(no_results),
            ",".join(no_results),
            len(results),
            ",".join(results),
        )
        sys.exit(-1)

    import_util.reduce_scores_for_backoffice(departements)
    import_util.reduce_scores_for_main_db(departements)
    if COMPUTE_SCORES_DEBUG_MODE:
        logger.warning(
            "debug mode enabled, failing on purpose for debugging of temporary tables"
        )
        sys.exit(-1)
    import_util.clean_temporary_tables()
    logger.info("compute_scores task: FINISHED")
Example No. 2
    def get_json_logs_activity(self, need_all_files=False):
        '''Build the list of activity-log file names that still need to be parsed
        and store it on the instance as json_logs_files_names_to_parse.
        '''
        # Now we have a cron task which will copy json activity logs to /srv/lbb/data

        # list of all the json activities files
        json_logs_files_names = [
            i for i in os.listdir(self.json_logs_folder_path)
            if i.startswith('activity')
        ]

        # list of all the json activities that need to be parsed (which aren't stored in database)
        if need_all_files is False:
            json_logs_files_names_to_parse = [
                file_name for file_name in json_logs_files_names
                if self.needs_parse_json_activity_log(file_name)
            ]
        else:
            json_logs_files_names_to_parse = json_logs_files_names

        if not json_logs_files_names_to_parse:  #if empty list
            logger.info("Did not find/need any data to parse")
            raise NoDataException

        logger.info(f'.json files to parse : {json_logs_files_names_to_parse}')

        self.json_logs_files_names_to_parse = json_logs_files_names_to_parse
Example No. 3
    def run_geocoding_jobs(self, geocoding_jobs, disable_multithreading=False):
        # Reset the module-level result lists (presumably filled by find_coordinates_for_address).
        adresses_not_geolocated[:] = []
        coordinates_updates[:] = []

        logger.info("Nombre de geocoding jobs : {}".format(
            len(geocoding_jobs)))

        if disable_multithreading:
            for siret, full_address, initial_coordinates, city_code in geocoding_jobs:
                self.find_coordinates_for_address(siret, full_address,
                                                  initial_coordinates,
                                                  city_code)
        else:
            pool = Pool(processes=pool_size)
            for siret, full_address, initial_coordinates, city_code in geocoding_jobs:
                pool.apply_async(self.find_coordinates_for_address, (
                    siret,
                    full_address,
                    initial_coordinates,
                    city_code,
                ))
            pool.close()
            pool.join()

        logger.info(
            "run geocoding jobs : collected {} coordinates on {} jobs, need to geocode {}"
            .format(GEOCODING_STATS.get('cache_hits', 0), len(geocoding_jobs),
                    len(adresses_not_geolocated)))
        return adresses_not_geolocated
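If Pool here is multiprocessing.Pool, worker processes cannot mutate adresses_not_geolocated in the parent process, so results are normally collected from the AsyncResult handles returned by apply_async. A hedged sketch of that pattern (the original code may instead rely on a thread or gevent pool that shares the module-level lists):

from multiprocessing import Pool

def collect_async_results(func, jobs, pool_size=8):
    # Submit one task per job and gather the return values from the AsyncResult handles.
    with Pool(processes=pool_size) as pool:
        async_results = [pool.apply_async(func, args) for args in jobs]
        return [result.get() for result in async_results]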
Example No. 4
    def insert_logs_activity_recherche(self, activity_df):

        # Work on a copy so the column assignments below do not trigger SettingWithCopyWarning.
        logs_activity_df = activity_df[activity_df['nom'] == 'recherche'].copy()

        logs_activity_df['ville'] = logs_activity_df.apply(
            lambda row: get_propriete(row, 'localisation', 'ville'), axis=1)
        logs_activity_df['code_postal'] = logs_activity_df.apply(
            lambda row: get_propriete(row, 'localisation', 'codepostal'),
            axis=1)
        logs_activity_df['emploi'] = logs_activity_df.apply(
            lambda row: get_propriete(row, 'emploi'), axis=1)
        logs_activity_df['emploi'] = logs_activity_df.apply(
            lambda row: clean_emploi(row), axis=1)

        # TODO : Find a way to concatenate logs, because too many
        logs_activity_df = logs_activity_df[(
            logs_activity_df.source == 'site')]

        cols_of_interest = [
            "dateheure",
            "idutilisateur_peconnect",
            "ville",
            "code_postal",
            "emploi",
        ]

        logs_activity_df = logs_activity_df[cols_of_interest]

        nb_lines = logs_activity_df.shape[0]
        logger.info(
            f'Number of lines to insert into logs_activity_recherche : {nb_lines}'
        )

        return logs_activity_df
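get_propriete is not defined in these snippets; a plausible sketch (hypothetical, assuming each activity row carries a JSON proprietes payload with optionally nested keys) would be:

import json

def get_propriete(row, key, subkey=None):
    # Hypothetical helper: read a property, possibly nested, from the JSON payload of an activity row.
    proprietes = json.loads(row['proprietes'])
    value = proprietes.get(key)
    if subkey is not None and isinstance(value, dict):
        value = value.get(subkey)
    return value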
Example No. 5
def run():
    errors = sanity.check_scores()
    if errors:
        msg = "departements with errors: %s" % ",".join(errors)
        logger.error(msg)
        raise ValueError(msg)
    logger.info("validate_scores task: FINISHED")
Example No. 6
def get_total_hirings_per_office():
    engine = import_util.create_sqlalchemy_engine()

    query = "select\
                siret,\
                raisonsociale,\
                enseigne,\
                email,\
                tel,\
                website,\
                codenaf,\
                codepostal,\
                codecommune,\
                trancheeffectif,\
                greatest(0, floor(score_regr)) as total_hirings\
             from \
                etablissements_backoffice"

    # To make it work in local dev, we have to take all the offices that have score > 0
    if get_current_env() != ENV_DEVELOPMENT:
        query += " where greatest(0, floor(score_regr)) > 0;"

    df_total_hirings = pd.read_sql_query(query, engine)

    engine.close()
    logger.info("Datas selected from etablissements_backoffice")
    print(df_total_hirings)
    return df_total_hirings
Example No. 7
    def get_data_second_sheet(self):

        # SECOND SHEET : https://docs.google.com/spreadsheets/d/1gbvFvFEEugCmPhsAdoRZEdjfEl579uUnmf5MIryaVB8/edit#gid=0
        self.df_evol_nb_dpae_hiring_and_activity_date = self.get_df_evol_nb_dpae_hiring_and_activity_date(
        )

        logger.info("Data for second sheet ready")
Example No. 8
def run_main():
    import logging
    logging.basicConfig(level=logging.DEBUG)
    dpae_filenames = import_util.detect_runnable_file("dpae", bulk=True)
    for filename in dpae_filenames:
        logger.info("PROCESSING %s" % filename)
        task = DpaeExtractJob(filename)
        task.run()
Example No. 9
    def get_sirets_from_database(self):
        query = "select siret from %s" % settings.RAW_OFFICE_TABLE
        logger.info("get offices from database")
        con, cur = import_util.create_cursor()
        cur.execute(query)
        rows = cur.fetchall()
        cur.close()
        con.close()
        return [row[0] for row in rows if siret_util.is_siret(row[0])]
Example No. 10
    def get_most_recent_csv_file(self):
        csv_paths = os.listdir(self.csv_folder_path)
        csv_paths = [i for i in csv_paths if i.startswith('act_dpae-')]
        csv_paths.sort()
        most_recent_csv_file = csv_paths[-1]

        logger.info(
            f"the act-dpae file which will be used is : {most_recent_csv_file}"
        )

        return most_recent_csv_file
Example No. 11
    def insert_id_peconnect(self, activity_df):

        activity_df = activity_df.dropna(axis=0,
                                         subset=['idutilisateur_peconnect'])
        activity_idpec = activity_df.drop_duplicates(
            subset=['idutilisateur_peconnect', 'date'], keep='first')
        activity_idpec = activity_idpec[[
            'dateheure', 'idutilisateur_peconnect'
        ]]

        nb_lines = activity_idpec.shape[0]
        logger.info(f'Number of lines to insert into idpec : {nb_lines}')

        return activity_idpec
Example No. 12
    def get_most_recent_dpae_file(self):
        dpae_paths = os.listdir(self.dpae_folder_path)
        # IMPORTANT: the DPAE file needs to be copied from /mnt/datalakepe/ to /srv/lbb/data.
        # This should be fine, since the importer also needs this file there.
        dpae_paths = [i for i in dpae_paths if i.startswith('lbb_xdpdpae_delta')]

        dpae_paths.sort()
        most_recent_dpae_file = dpae_paths[-1]
        if get_current_env() == ENV_DEVELOPMENT:
            most_recent_dpae_file = 'lbb_xdpdpae_delta_201511102200.csv'

        logger.info(f"the DPAE file which will be used is : {most_recent_dpae_file}")

        return most_recent_dpae_file
Example No. 13
    def join_old_data_with_new_ones(self):

        # If the table already exists, load all the old data so we can drop rows that duplicate an idpeconnect/siret pair
        if self.existing_sql_table:

            df_dpae_act_existing = self.get_old_activities_logs_saved()

            # In case a problem appears in the script, we back up the old data as a .csv file:
            # since the whole table is rewritten on each execution, duplicates have to be removed first
            df_dpae_act_existing.to_csv(
                f"{self.csv_folder_path}backup_sql_{TABLE_NAME}.csv",
                encoding='utf-8',
                sep='|')

            nb_rows_old = df_dpae_act_existing.shape[0]
            logger.info(f"There were already act/dpae : {nb_rows_old} rows")

            nb_rows = self.df_dpae_act.shape[0]
            logger.info(
                f"There are {nb_rows} new rows to concat with the old one")

            # We concatenate all the old rows with the new ones
            self.df_dpae_act = pd.concat(
                [self.df_dpae_act, df_dpae_act_existing])
            nb_rows = self.df_dpae_act.shape[0]
            logger.info(f"Concatenation of both has {nb_rows} rows")

            # Remove the duplicates
            self.df_dpae_act = self.df_dpae_act.sort_values('date_activite')
            self.df_dpae_act = self.df_dpae_act.drop_duplicates(
                subset=['idutilisateur_peconnect', 'siret'], keep='first')
            nb_rows = self.df_dpae_act.shape[0]
            logger.info(
                f"Concatenation of both minus duplicates has {nb_rows} rows")
Example No. 14
def run_main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    lba_app_filenames = import_util.detect_runnable_file("lba-app", bulk=True)
    for filename in lba_app_filenames:
        logger.info("PROCESSING %s" % filename)
        task_app = ApprentissageExtractJob(filename, contract_type='APPRENTISSAGE')
        task_app.run()

    lba_pro_filenames = import_util.detect_runnable_file("lba-pro", bulk=True)
    for filename in lba_pro_filenames:
        logger.info("PROCESSING %s" % filename)
        task_pro = ApprentissageExtractJob(filename, contract_type='CONTRAT_PRO')
        task_pro.run()
Example No. 15
def populate_flag(flag):
    logger.info("populating %s ... ", flag)
    con, cur = import_util.create_cursor()
    query = """
        UPDATE
        %s e
        INNER JOIN %s f
        ON e.siret = f.siret
        SET e.%s = True;
    """ % (settings.SCORE_REDUCING_TARGET_TABLE, flag, flag)
    cur.execute(query)
    con.commit()
    logger.info("completed populating %s ... ", flag)
    cur.close()
    con.close()
Example No. 16
    def delete_deletable_offices(self):
        con, cur = import_util.create_cursor()
        if self.deletable_sirets:
            for sirets in chunks(list(self.deletable_sirets), 500):
                stringified_siret_list = ",".join(sirets)
                logger.info("deleting a chunk of %i offices...", len(sirets))
                query = """DELETE FROM %s where siret IN (%s)""" % (settings.RAW_OFFICE_TABLE, stringified_siret_list)
                try:
                    cur.execute(query)
                    con.commit()
                except Exception:
                    logger.warning("error while deleting chunk of sirets : %s", sirets)
                    raise
        cur.close()
        con.close()
        logger.info("%i no longer existing offices deleted.", len(self.deletable_sirets))
Example No. 17
def prepare_flag_handicap():
    logger.info("preparing flag_handicap...")

    sql_script = """
        drop table if exists flag_handicap;
        create table flag_handicap as
        (
        select distinct(siret) from %s
        where
            handicap_label = 'RQTH-MDT'
            and hiring_date >= DATE_SUB(NOW(),INTERVAL 1 YEAR)
        );
    """ % settings.HIRING_TABLE

    import_util.run_sql_script(sql_script)
    logger.info("completed preparing flag_handicap.")
Example No. 18
def prepare_flags_junior_and_senior():
    logger.info("preparing flags_junior_and_senior...")

    sql_script = """
        drop table if exists flag_tmp1;
        create table flag_tmp1 as
        (
        select siret, tranche_age, count(*) as contrats
        from %s
        where hiring_date >= DATE_SUB(NOW(),INTERVAL 1 YEAR)
        group by siret, tranche_age
        );

        drop table if exists flag_tmp2;
        create table flag_tmp2 as
        (
        select
            siret,
            100*sum(contrats*(tranche_age='00-25'))/sum(contrats) as ratio_junior,
            100*sum(contrats*(tranche_age='51-99'))/sum(contrats) as ratio_senior
        from flag_tmp1
        group by siret
        );

        drop table if exists flag_junior;
        create table flag_junior as
        (
        select siret
        from flag_tmp2
        where ratio_junior >= 80
        );

        drop table if exists flag_senior;
        create table flag_senior as
        (
        select siret
        from flag_tmp2
        where ratio_senior >= 16
        );

        drop table if exists flag_tmp1;
        drop table if exists flag_tmp2;
    """ % settings.HIRING_TABLE

    import_util.run_sql_script(sql_script)
    logger.info("completed preparing flags_junior_and_senior.")
Example No. 19
    def get_data_first_sheet(self):

        # FIRST SHEET : https://docs.google.com/spreadsheets/d/1kx-mxCaXIkys3hU4El4K7a6JBwzrdF75X4U8igqLB4I/edit?folder=1QFm0t2weoUjTsl-FPYUj94__zq_mZq0h#gid=0
        # 1st column: number of unique IDPE users who visited LBB
        self.df_evol_idpe_connect = self.get_df_evol_idpe_connect()

        # 2nd column: number of unique IDPE users who expanded a company card
        self.df_evol_idpe_connect_sign_afficher_details = self.get_df_evol_idpe_connect_sign(
            did_specific_activity='afficher-details')

        # 3rd column: number of unique IDPE users who viewed a company page
        self.df_evol_idpe_connect_sign_details = self.get_df_evol_idpe_connect_sign(
            did_specific_activity='details')

        # 4th column: no data yet, but will be: number of unique IDPE users who reached the first step of "Je postule"

        # 5th column: number of unique IDPE users who expanded a company card, viewed a company page or bookmarked a company
        #             + 4th column (reached the first step of JP)
        self.df_evol_idpe_connect_sign = self.get_df_evol_idpe_connect_sign()

        # 6th column: number of hirings per month originating from a logged-in LBB user activity (start/end date = hiring date)
        self.df_evol_dpae = self.get_df_evol_dpae()

        # 7th column is an empty column

        # 8th column: number of JP applications
        self.df_nb_candidatures_jp = pd.read_csv(
            f'{self.csv_jp_folder_path}/dump_nb_candidatures_jp.csv',
            delimiter=';')

        # 9th column: number of unique emails that applied via Je postule
        self.df_nb_distinct_email_jp = pd.read_csv(
            f'{self.csv_jp_folder_path}/dump_nb_distinct_email_jp.csv',
            delimiter=';')

        # 10th column: number of applicants who received an answer via JP
        self.df_nb_candidates_with_answer_jp = pd.read_csv(
            f'{self.csv_jp_folder_path}/dump_nb_candidates_with_answer_jp.csv',
            delimiter=';')

        # 11th column: average response time of recruiters via JP (in days)
        self.df_medium_delay_answer_jp = pd.read_csv(
            f'{self.csv_jp_folder_path}/dump_medium_delay_answer_jp.csv',
            delimiter=';')

        logger.info("Data for first sheet ready")
Example No. 20
    def create_geocoding_jobs(self):
        query = """
            select
                siret,
                numerorue,
                libellerue,
                codepostal,
                codecommune,
                coordinates_x,
                coordinates_y
            from %s
        """ % (settings.SCORE_REDUCING_TARGET_TABLE)
        if DEBUG_MODE:
            # query += "WHERE coordinates_x = 0 and coordinates_y = 0"
            query += "ORDER BY RAND() LIMIT 100000"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        rows = cur.fetchall()
        geocoding_jobs = []
        count = 0
        for row in rows:
            siret, street_number, street_name, zipcode, codecommune, coordinates_x, coordinates_y = row
            try:
                city = CITY_NAMES[codecommune]
            except KeyError:
                logger.warning("wrong codecommune: %s", codecommune)
                continue
            try:
                full_address = self.get_full_adress(street_number, street_name,
                                                    zipcode, city)
                initial_coordinates = [coordinates_x, coordinates_y]
                geocoding_jobs.append(
                    [siret, full_address, initial_coordinates, codecommune])
            except IncorrectAdressDataException:
                logger.warning("incorrect address for %s %s %s %s",
                               street_number, street_name, zipcode, city)
            count += 1
            GEOCODING_STATS['jobs'] = GEOCODING_STATS.get('jobs', 0) + 1
            if not count % 10000:
                logger.info("loading geocoding jobs from db... loaded %s rows",
                            count)
        logger.info("%i geocoding jobs created...", len(geocoding_jobs))
        cur.close()
        con.close()
        return geocoding_jobs
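get_full_adress and IncorrectAdressDataException are not shown in these snippets; a hedged sketch of what they might look like (written here as a free function, rejecting rows that have neither a street name nor a city):

class IncorrectAdressDataException(Exception):
    pass

def get_full_adress(street_number, street_name, zipcode, city):
    # Hypothetical reconstruction: build a "12 rue Exemple, 57000 Metz" style address.
    if not street_name and not city:
        raise IncorrectAdressDataException
    street = " ".join(part.strip() for part in (street_number, street_name) if part and part.strip())
    return ", ".join(filter(None, [street, ("%s %s" % (zipcode, city)).strip()]))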
Example No. 21
    def validate_coordinates(self):
        con, cur = import_util.create_cursor()
        query = """
        select
        sum(
            (coordinates_x > 0 or coordinates_x < 0)
            and
            (coordinates_y > 0 or coordinates_y < 0)
        )/count(*)
        from %s
        """ % settings.SCORE_REDUCING_TARGET_TABLE
        cur.execute(query)
        geocoding_ratio = cur.fetchall()[0][0]
        logger.info("geocoding_ratio = %s", geocoding_ratio)
        if geocoding_ratio < settings.MINIMUM_GEOCODING_RATIO:
            raise AbnormallyLowGeocodingRatioException
        cur.close()
        con.close()
Example No. 22
def dump():
    timestamp = datetime.now().strftime('%Y_%m_%d_%H%M')

    logger.info("backing up table %s ...",
                settings.SCORE_REDUCING_TARGET_TABLE)
    etab_result = import_util.back_up(settings.BACKUP_OUTPUT_FOLDER,
                                      settings.SCORE_REDUCING_TARGET_TABLE,
                                      "export_etablissement",
                                      timestamp,
                                      new_table_name="etablissements_new")

    tar_filename = os.path.join(settings.BACKUP_FOLDER,
                                "%s.tar.bz2" % timestamp)
    with tarfile.open(tar_filename, "w:bz2") as tar:
        logger.info("creating tar file %s...", tar_filename)
        tar.add(etab_result, arcname=os.path.basename(etab_result))
        # no explicit tar.close() needed: the context manager closes the archive
    return tar_filename
Example No. 23
def check_complete_test(dpae_filename):
    logger.info("check complete test...")
    lines = get_n_lines(dpae_filename, n=20)
    success = 0
    errors = 0
    for line in lines:
        try:
            parse_dpae_line(line)
            success += 1
        except (ValueError, IndexError):
            errors += 1
    logger.info("%i lines parsed with success", success)
    logger.info("%i lines parsed with error", errors)
    error_rate = errors / (1.0 * (success + errors))
    logger.info("error rate: %i", error_rate)
    if error_rate >= settings.DPAE_ERROR_RATE_MAX:
        raise "error_rate too high"
    logger.info("complete test OK!")
Example No. 24
    def insert_logs_activity(self, activity_df):
        '''
        details = view a company page
        afficher-details = expand a company card
        '''

        clics_of_interest = ['details', 'afficher-details', 'ajout-favori']

        # Work on a copy so the column assignments below do not trigger SettingWithCopyWarning.
        logs_activity_df = activity_df[activity_df['nom'].isin(
            clics_of_interest)].copy()

        logs_activity_df['siret'] = logs_activity_df.apply(
            lambda row: siret(row), axis=1)
        logs_activity_df['utm_medium'] = logs_activity_df.apply(
            lambda row: get_propriete(row, 'utm_medium'), axis=1)
        logs_activity_df['utm_source'] = logs_activity_df.apply(
            lambda row: get_propriete(row, 'utm_source'), axis=1)
        logs_activity_df['utm_campaign'] = logs_activity_df.apply(
            lambda row: get_propriete(row, 'utm_campaign'), axis=1)

        # Keep only the activity logs that have an IDPeconnect OR one of the utm_medium values we care about;
        # keeping everything would put far too many rows in the database
        utm_medium_to_keep = ['mailing']
        logs_activity_df = logs_activity_df[
            (logs_activity_df.idutilisateur_peconnect.notnull()) |
            (logs_activity_df.utm_medium.isin(utm_medium_to_keep))]

        cols_of_interest = [
            "dateheure",
            "nom",
            "idutilisateur_peconnect",
            "siret",
            "utm_medium",
            "utm_source",
            "utm_campaign",
        ]

        logs_activity_df = logs_activity_df[cols_of_interest]

        nb_lines = logs_activity_df.shape[0]
        logger.info(
            f'Number of lines to insert into logs_activity : {nb_lines}')

        return logs_activity_df
Example No. 25
    def get_activity_logs(self):
        engine = import_util.create_sqlalchemy_engine()

        query = "select * from logs_activity"
        if DEBUG:
            query += " ORDER BY RAND() LIMIT 10000"
        df_activity = pd.read_sql_query(query, engine)

        engine.close()

        # TODO : Define how long we should use logs
        # https://valodata.slack.com/archives/C0QR8RYL8/p1562319224015200

        # TODO : Uncomment these lines after the first initialization of the project
        # one_year_date = datetime.date.today() - datetime.timedelta(365)
        # df_activity['dateheure'] = pd.to_datetime(df_activity['dateheure'])
        # df_activity = df_activity[df_activity.dateheure > one_year_date]

        logger.info('Activities logs are loaded')

        return df_activity.astype(str)
Example No. 26
    def get_last_recorded_hiring_date(self):

        # We want to check whether there is already data in the final table
        table_name_act_dpae = LogsActivityDPAEClean.__tablename__
        query = f"SELECT COUNT(*) FROM {table_name_act_dpae}"
        engine = import_util.create_sqlalchemy_engine()

        # If data in table
        if engine.execute(query).fetchone()[0] > 0:
            query = f"select date_embauche from {table_name_act_dpae} order by date_embauche DESC LIMIT 1"
            row = engine.execute(query).fetchone()
            date_last_recorded_hiring = row[0].split()[0]
        # Else no data
        else:
            # We set the date to the first activity log that has ever been created on LBB
            date_last_recorded_hiring = "2018-08-31"

        engine.close()
        logger.info(f"the most recent date found is {date_last_recorded_hiring}")

        return date_last_recorded_hiring
Example No. 27
    def after_check(self):
        query = db_session.query(RawOffice.departement.distinct().label("departement"))
        departements = [row.departement for row in query.all()]

        if len(departements) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
            msg = "wrong number of departements : %s instead of expected %s" % (
                len(departements),
                settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
            )
            raise Exception(msg)

        # FIXME parallelize for better performance
        for departement in departements:
            count = RawOffice.query.filter_by(departement=departement).count()
            logger.info("number of companies in departement %s : %i", departement, count)
            if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
                msg = "too few companies in departement : %s instead of expected %s" % (
                    count,
                    settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT
                )
                raise Exception(msg)
Example No. 28
    def update_coordinates(self, updates):
        con, cur = import_util.create_cursor()
        count = 0
        statements = []
        update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % \
            settings.SCORE_REDUCING_TARGET_TABLE

        logger.info("Nb of offices to update : {}".format(len(updates)))

        for siret, coordinates in updates:
            count += 1
            statements.append([coordinates[0], coordinates[1], siret])
            if len(statements) == 1000:
                logger.info("geocoding with ban... %i of %i done", count,
                            len(updates))
                cur.executemany(update_query, statements)
                con.commit()
                statements = []

        if len(statements) >= 1:
            logger.info("geocoding with ban... %i of %i done", count,
                        len(updates))
            cur.executemany(update_query, statements)
            con.commit()

        cur.close()
        con.close()
Example No. 29
    def create_creatable_offices(self):
        """
        create new offices (that are not yet in our etablissement table)
        """
        con, cur = import_util.create_cursor()
        query = """INSERT into %s(siret, raisonsociale, enseigne, codenaf, numerorue,
            libellerue, codecommune, codepostal, email, tel, departement, trancheeffectif,
            website, flag_poe_afpr, flag_pmsmp)
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % settings.RAW_OFFICE_TABLE

        count = 0
        logger.info("create new offices in table %s",
                    settings.RAW_OFFICE_TABLE)
        statements = []
        MAX_COUNT_EXECUTE = 500
        for siret in self.creatable_sirets:
            statement = self.csv_offices[siret]["create_fields"]
            statements.append(statement)
            count += 1
            if not count % MAX_COUNT_EXECUTE:
                cur.executemany(query, statements)
                con.commit()
                statements = []
                if not count % 10000:
                    logger.info("created %s offices", count)
        if statements:
            cur.executemany(query, statements)
            con.commit()
        cur.close()
        con.close()
        logger.info("%i new offices created.", count)
Example No. 30
    def run_geocoding_jobs(self, geocoding_jobs):
        ban_jobs = []
        coordinates_updates = []
        count = 0
        for siret, full_address, initial_coordinates in geocoding_jobs:
            unit = GeocodeUnit(siret, full_address, coordinates_updates,
                               initial_coordinates)
            job_id = pool.spawn(unit.find_coordinates_for_address)
            ban_jobs.append(job_id)
            count += 1
            if not count % 1000:
                logger.info(
                    "running geocoding jobs : started %s of %s jobs, collected %s coordinates so far",
                    count,
                    len(geocoding_jobs),
                    len(coordinates_updates),
                )
                gevent.joinall(ban_jobs)
                ban_jobs = []
        # processing remaining jobs
        gevent.joinall(ban_jobs)
        return coordinates_updates
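This gevent variant relies on module-level pool and gevent objects that are not shown; a plausible setup (the pool size here is a guess) would be:

import gevent
from gevent.pool import Pool

# Hypothetical module-level greenlet pool shared by run_geocoding_jobs.
pool = Pool(100)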