def siret_validator(form, field):
    """WTForms custom validator: the field must contain a valid SIRET number.

    See http://wtforms.readthedocs.io/en/latest/validators.html#custom-validators

    Raises:
        ValidationError: when the field value is not a well-formed SIRET.
    """
    value = field.data
    if is_siret(value):
        return
    raise ValidationError(
        "Le numéro SIRET doit être composé de 14 chiffres")
def get_sirets_from_database(self):
    """Fetch all sirets stored in the raw office table.

    Returns:
        list[str]: the sirets from `settings.RAW_OFFICE_TABLE`, keeping
        only values that pass `siret_util.is_siret` (i.e. well-formed,
        14-digit sirets).
    """
    query = "select siret from %s" % settings.RAW_OFFICE_TABLE
    logger.info("get offices from database")
    con, cur = import_util.create_cursor()
    try:
        cur.execute(query)
        rows = cur.fetchall()
    finally:
        # FIX: always release the cursor and connection, even when the
        # query raises — the original leaked both on error.
        cur.close()
        con.close()
    return [row[0] for row in rows if siret_util.is_siret(row[0])]
def validate_form(self, form):
    """Extra server-side validation for the OfficeAdminUpdate admin form.

    Performs, in order:
      - URL normalization (prepend ``http://`` when missing),
      - base Flask-Admin form validation,
      - 'sirets' field presence and per-siret format checks,
      - when exactly one siret is given, existence check against Office,
      - conflict check against other OfficeAdminUpdate records,
      - ROME codes to boost/remove (LBB and alternance) validation,
      - NAF codes to add validation,
      - certified recruiter / recruiter uid consistency.

    Returns:
        bool: True when the form is valid; otherwise flashes an error
        message (French, user-facing) and returns False.
    """
    # Add http:// if missing.
    form['new_website'].data = format_url(form['new_website'].data)
    form['social_network'].data = format_url(form['social_network'].data)
    form['website_alternance'].data = format_url(
        form['website_alternance'].data)

    is_valid = super(OfficeAdminUpdateModelView, self).validate_form(form)

    # All sirets must be well formed.
    sirets = models.OfficeAdminUpdate.as_list(form.data['sirets'])
    only_one_siret = len(sirets) == 1

    if is_valid and not sirets:
        message = "Le champs 'Sirets' est obligatoire. Veuillez le renseigner."
        flash(message, 'error')
        return False

    if is_valid:
        for siret in sirets:
            if not is_siret(siret):
                message = "Ce siret suivant n'est pas composé de 14 chiffres : {}".format(
                    siret)
                flash(message, 'error')
                return False

    # If only one siret, we validate its existence on LBB.
    # If more than one siret: no existence validation.
    if is_valid and only_one_siret:
        siret = sirets[0]
        office = models.Office.query.filter_by(siret=siret).first()
        if not office:
            message = "Le siret suivant n'est pas présent sur LBB: {}".format(
                siret)
            flash(message, 'error')
            return False

    if is_valid:
        # Each siret must not already appear in another OfficeAdminUpdate row.
        for siret in sirets:
            if 'id' in request.args:
                # Avoid conflict with itself when updating an existing record.
                office_update_conflict = models.OfficeAdminUpdate.query.filter(
                    models.OfficeAdminUpdate.sirets.like(
                        "%{}%".format(siret)),
                    models.OfficeAdminUpdate.id != request.args['id'])
            else:
                office_update_conflict = models.OfficeAdminUpdate.query.filter(
                    models.OfficeAdminUpdate.sirets.like(
                        "%{}%".format(siret)))
            if office_update_conflict.count() > 0:
                # FIX: the original message started with `""""` (triple quote
                # plus a stray `"`), so the flashed text began with a literal
                # quote and surrounding whitespace.
                message = "Le siret {} est déjà présent dans la fiche n°{}".format(
                    siret, office_update_conflict[0].id)
                flash(message, 'error')
                return False

    # Get company (None when several sirets match no single office lookup).
    first_office = models.Office.query.filter_by(
        siret=sirets[0]).first() if sirets else None

    # Codes ROMES to boost or to add.
    if is_valid and form.data.get('romes_to_boost'):
        try:
            self.validate_romes_to_boost(form, 'romes_to_boost', 'boost')
        except (RomeToBoostException, InvalidRome) as e:
            flash(e.args[0], 'error')
            return False

    # Codes ROMES to boost or to add (for alternance).
    if is_valid and form.data.get('romes_alternance_to_boost'):
        try:
            # FIX: validate the alternance field itself; the original
            # re-validated 'romes_to_boost' here (copy/paste bug), so
            # 'romes_alternance_to_boost' was never actually checked.
            self.validate_romes_to_boost(form,
                                         'romes_alternance_to_boost', 'boost_alternance')
        except (RomeToBoostException, InvalidRome) as e:
            flash(e.args[0], 'error')
            return False

    # Codes ROMES to remove for LBB.
    if is_valid and form.data.get('romes_to_remove'):
        try:
            office_naf = first_office.naf if only_one_siret else None
            self.validate_romes_to_remove(form, 'romes_to_remove', office_naf)
        except (RomeToRemoveException, InvalidRome) as e:
            flash(e.args[0], 'error')
            return False

    # Codes ROMES to remove (for alternance).
    if is_valid and form.data.get('romes_alternance_to_remove'):
        try:
            office_naf = first_office.naf if only_one_siret else None
            self.validate_romes_to_remove(form, 'romes_alternance_to_remove', office_naf)
        except (RomeToRemoveException, InvalidRome) as e:
            flash(e.args[0], 'error')
            return False

    # Codes NAF to add.
    if is_valid and form.data.get('nafs_to_add'):
        nafs_to_add = form.data.get('nafs_to_add')
        for naf in models.OfficeAdminUpdate.as_list(nafs_to_add):
            if naf not in settings.NAF_CODES:
                msg = "`%s` n'est pas un code NAF valide." % naf
                flash(msg, 'error')
                return False
            # FIX: test only_one_siret first — the original evaluated
            # `naf == first_office.naf` before the guard, raising
            # AttributeError when first_office is None (several sirets,
            # no office found for the first one). With a single siret the
            # existence check above guarantees first_office is not None.
            if only_one_siret and naf == first_office.naf:
                msg = "Le NAF `%s` est déjà associé à cette entreprise." % naf
                flash(msg, 'error')
                return False

    # Identifiant recruteur: a certified recruiter needs an uid.
    if is_valid and form.data.get(
            'certified_recruiter') and not form.data.get('recruiter_uid'):
        msg = "La case 'Recruteur certifié' est cochée mais aucun identifiant n'est indiqué."
        flash(msg, 'error')
        return False

    return is_valid
def get_offices_from_file(self):
    """Parse the input CSV extract and yield batches of office records.

    Generator: yields a dict mapping siret -> {"create_fields": tuple,
    "update_fields": tuple} every 100 000 input lines, then yields the
    final (possibly partial) batch after sanity checks.

    Raises:
        ValueError: wrong header line, or (outside the test env) when the
            error counters / departement coverage exceed the thresholds
            hard-coded below.
    """
    # FIXME elegantly parallelize this stuff
    # see
    # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
    # https://docs.python.org/2/library/itertools.html#itertools.islice
    logger.info("extracting %s...", self.input_filename)
    departements = dpt.DEPARTEMENTS
    count = 0                               # total lines read
    no_zipcode_count = 0                    # lines with an empty codecommune
    unprocessable_departement_errors = 0    # zipcode maps to an unknown departement
    format_errors = 0                       # malformed lines (field count, bad siret)
    # KPI expected after the add of the RGPD email column.
    # NOTE(review): these two counters are logged below but never
    # incremented anywhere in this function — the RGPD KPI reported is
    # always 0. TODO confirm whether the incrementing code was lost.
    # Number of offices who did not have email before, and now have one.
    emails_here_before_rgpd = 0
    # Number of offices who had an existing email, which has been replaced
    # by the new rgpd mail clean.
    emails_not_here_before_rgpd = 0
    departement_counter_dic = {}            # departement -> offices kept
    offices = {}                            # current batch, keyed by siret
    with import_util.get_reader(self.input_filename) as myfile:
        # The reader yields bytes; the header is compared as bytes.
        header_line = myfile.readline().strip()
        # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise ValueError("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            try:
                fields = import_util.get_fields_from_csv_line(line)
                if len(fields) != 22:
                    logger.exception("wrong number of fields in line %s", line)
                    raise ValueError
                # Fixed 22-column layout; unused columns are discarded as _.
                siret, raisonsociale, enseigne, codenaf, numerorue, \
                    libellerue, codecommune, codepostal, email, \
                    tel, trancheeffectif_etablissement, effectif_reel, _, _, \
                    website1, website2, better_tel, \
                    website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields
                if not siret_util.is_siret(siret):
                    logger.exception("wrong siret : %s", siret)
                    raise ValueError
                # Known-bad sirets are rejected outright (separate exception
                # type, so they are NOT counted as format errors below).
                if siret in WRONG_SIRETS:
                    logger.exception("wrong siret : %s, should not be here - need other extract from datalake", siret)
                    raise WrongSiretException
            except ValueError:
                logger.exception("exception in line %s", line)
                format_errors += 1
                continue
            # We cant rely on the field trancheeffectif_etablissement which
            # is in the etablissement file. We have to rely on the field
            # effectif_reel: we take the number of employees and use a
            # dataframe to determine which category it belongs to.
            # If there is no effectif_reel in the dataset OR it is 0, we
            # keep the old field tranche_effectif.
            if effectif_reel != '':
                if int(effectif_reel) > 0:
                    # NOTE(review): DF_EFFECTIF_TO_LABEL is presumably a
                    # pandas DataFrame with start_effectif/end_effectif/code
                    # columns mapping headcount ranges to labels — confirm.
                    trancheeffectif_etablissement = DF_EFFECTIF_TO_LABEL[
                        (DF_EFFECTIF_TO_LABEL.start_effectif <= int(effectif_reel))
                        & (DF_EFFECTIF_TO_LABEL.end_effectif >= int(effectif_reel))
                    ]['code'].values[0]
            website = merge_and_normalize_websites([website1, website2, website3])
            # Prefer the "better" phone number column when it has content.
            if has_text_content(better_tel):
                tel = better_tel
            # Contract flags: "O" (oui) -> 1, anything else -> 0.
            flag_pmsmp = 0
            if contrat_pmsmp == "O":
                flag_pmsmp = 1
            flag_poe_afpr = 0
            if contrat_poe == "O" or contrat_afpr == "O":
                flag_poe_afpr = 1
            if codecommune.strip():
                departement = import_util.get_departement_from_zipcode(codepostal)
                process_this_departement = departement in departements
                if process_this_departement:
                    # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                    # (drop pole-emploi emails for offices that are not
                    # actual Pole Emploi agencies).
                    if "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                        email = ""
                    # Restore the leading zero lost by numeric zipcodes.
                    if len(codepostal) == 4:
                        codepostal = "0%s" % codepostal
                    etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp
                    # Same fields, reordered with siret last (SQL UPDATE ... WHERE siret = ?).
                    etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp, siret
                    # Keep the office only when zipcode and departement agree.
                    if codepostal.startswith(departement):
                        departement_counter_dic.setdefault(departement, 0)
                        departement_counter_dic[departement] += 1
                        offices[siret] = {
                            "create_fields": etab_create_fields,
                            "update_fields": etab_update_fields,
                        }
                    else:
                        logger.info(
                            "zipcode %s and departement %s don't match commune_id %s",
                            codepostal, departement, codecommune,
                        )
                else:
                    unprocessable_departement_errors += 1
            else:
                no_zipcode_count += 1
            # Yield a batch every 100 000 lines to bound memory usage.
            if not count % 100000:
                logger.debug("processed %s lines", count)
                yield offices
                offices = {}
    logger.info("%i offices total", count)
    logger.info("%i offices without email before and have now thanks to RGPD mails", emails_not_here_before_rgpd)
    logger.info("%i offices with emails before and have been replaced by RGPD mails", emails_here_before_rgpd)
    logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
    logger.info("%i offices with no zipcodes", no_zipcode_count)
    logger.info("%i offices not read because of format error", format_errors)
    logger.info("%i distinct departements from file", len(departement_counter_dic))
    departement_count = sorted(departement_counter_dic.items())
    logger.info("per departement read %s", departement_count)
    logger.info("finished reading offices...")
    # Sanity thresholds: abort the import rather than load a truncated or
    # corrupted extract. Skipped entirely in the test environment.
    if get_current_env() != ENV_TEST:
        if unprocessable_departement_errors > 2500:
            raise ValueError("too many unprocessable_departement_errors")
        if no_zipcode_count > 75000:
            raise ValueError("too many no_zipcode_count")
        if format_errors > 5:
            raise ValueError("too many format_errors")
        if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
            msg = "incorrect total number of departements : %s instead of expected %s" % (
                len(departement_counter_dic),
                settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
            )
            raise ValueError(msg)
        for departement, count in departement_count:
            if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
                logger.exception("only %s offices in departement %s", count, departement)
                raise ValueError("not enough offices in at least one departement")
    # Final (possibly partial) batch.
    yield offices
def get_offices_from_file(self):
    """Parse the input CSV extract and return all office records at once.

    Unlike the generator variant elsewhere in this file, this version
    accumulates everything in memory and returns a single dict mapping
    siret -> {"create_fields": tuple, "update_fields": tuple}.

    Raises:
        ValueError: wrong header line, or when the error counters /
            departement coverage exceed the thresholds hard-coded below
            (always enforced here, no test-env bypass).
    """
    # FIXME elegantly parallelize this stuff
    # see
    # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
    # https://docs.python.org/2/library/itertools.html#itertools.islice
    logger.info("extracting %s...", self.input_filename)
    departements = dpt.DEPARTEMENTS
    count = 0                               # total lines read
    no_zipcode_count = 0                    # lines with an empty codecommune
    unprocessable_departement_errors = 0    # zipcode maps to an unknown departement
    format_errors = 0                       # malformed lines (field count, bad siret)
    departement_counter_dic = {}            # departement -> offices kept
    offices = {}                            # all accepted offices, keyed by siret
    with import_util.get_reader(self.input_filename) as myfile:
        # The reader yields bytes; the header is compared as bytes.
        header_line = myfile.readline().strip()
        # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise ValueError("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                logger.debug("processed %s lines", count)
            try:
                fields = import_util.get_fields_from_csv_line(line)
                if len(fields) != 22:
                    logger.exception("wrong number of fields in line %s", line)
                    raise ValueError
                # Fixed 22-column layout; unused columns are discarded as _.
                # NOTE(review): the column positions here differ from the
                # other get_offices_from_file variant in this file (no
                # effectif_reel column is read) — confirm which extract
                # format this one targets.
                siret, raisonsociale, enseigne, codenaf, numerorue, \
                    libellerue, codecommune, codepostal, email, tel, \
                    trancheeffectif_etablissement, _, _, _, \
                    website1, website2, better_tel, \
                    website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields
                if not siret_util.is_siret(siret):
                    logger.exception("wrong siret : %s", siret)
                    raise ValueError
            except ValueError:
                logger.exception("exception in line %s", line)
                format_errors += 1
                continue
            website = merge_and_normalize_websites([website1, website2, website3])
            # Prefer the "better" phone number column when it has content.
            if has_text_content(better_tel):
                tel = better_tel
            # Contract flags: "O" (oui) -> 1, anything else -> 0.
            flag_pmsmp = 0
            if contrat_pmsmp == "O" :
                flag_pmsmp = 1
            flag_poe_afpr = 0
            if contrat_poe == "O" or contrat_afpr == "O" :
                flag_poe_afpr = 1
            # Normalize accented characters in the email address.
            email = encoding_util.strip_french_accents(email)
            if codecommune.strip():
                departement = import_util.get_departement_from_zipcode(codepostal)
                process_this_departement = departement in departements
                if process_this_departement:
                    # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                    # (drop pole-emploi emails for offices that are not
                    # actual Pole Emploi agencies).
                    if "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                        email = ""
                    # Restore the leading zero lost by numeric zipcodes.
                    if len(codepostal) == 4:
                        codepostal = "0%s" % codepostal
                    etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp
                    # Same fields, reordered with siret last (SQL UPDATE ... WHERE siret = ?).
                    etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp, siret
                    # Keep the office only when zipcode and departement agree.
                    if codepostal.startswith(departement):
                        departement_counter_dic.setdefault(departement, 0)
                        departement_counter_dic[departement] += 1
                        offices[siret] = {
                            "create_fields": etab_create_fields,
                            "update_fields": etab_update_fields,
                        }
                    else:
                        logger.info(
                            "zipcode %s and departement %s don't match commune_id %s",
                            codepostal, departement, codecommune,
                        )
                else:
                    unprocessable_departement_errors += 1
            else:
                no_zipcode_count += 1
    logger.info("%i offices total", count)
    logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
    logger.info("%i offices with no zipcodes", no_zipcode_count)
    logger.info("%i offices not read because of format error", format_errors)
    logger.info("%i distinct departements from file", len(departement_counter_dic))
    departement_count = sorted(departement_counter_dic.items())
    logger.info("per departement read %s", departement_count)
    logger.info("finished reading offices...")
    # Sanity thresholds: abort the import rather than load a truncated or
    # corrupted extract.
    if unprocessable_departement_errors > 2500:
        raise ValueError("too many unprocessable_departement_errors")
    if no_zipcode_count > 75000:
        raise ValueError("too many no_zipcode_count")
    if format_errors > 5:
        raise ValueError("too many format_errors")
    if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
        msg = "incorrect total number of departements : %s instead of expected %s" % (
            len(departement_counter_dic),
            settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
        )
        raise ValueError(msg)
    for departement, count in departement_count:
        if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
            logger.exception("only %s offices in departement %s", count, departement)
            raise ValueError("not enough offices in at least one departement")
    return offices