Ejemplo n.º 1
0
def get_geiq_df():
    """
    Return the GEIQ export as a cleaned dataframe.

    Steps performed, in order:
    - locate the export file and read it with pandas;
    - rename the export columns (see `renamed_columns` below);
    - replace NaN with None and run `clean_string` on every text column;
    - title-case the name and derive `department` from the post code;
    - drop the rows that have no siret or no auth_email;
    - validate every remaining siret and check the overall row count.
    """
    export_path = get_filename(
        filename_prefix="Liste_Geiq",
        filename_extension=".xls",
        description="Export GEIQ",
    )

    # siret and zip are read as strings rather than numbers.
    df = pd.read_excel(export_path, converters={"siret": str, "zip": str})

    # Export column name -> name used in the returned dataframe.
    renamed_columns = {
        "name": "name",
        "street": "address_line_1",
        "street2": "address_line_2",
        "zip": "post_code",
        "city": "city",
        "siret": "siret",
        "email": "auth_email",
    }
    df = remap_columns(df, column_mapping=renamed_columns)

    # Replace NaN elements with None.
    df = df.replace({np.nan: None})

    # Clean string fields.
    for column in (
        "name",
        "address_line_1",
        "address_line_2",
        "post_code",
        "city",
        "siret",
        "auth_email",
    ):
        df[column] = df[column].apply(clean_string)

    # "GEIQ PROVENCE" becomes "Geiq Provence".
    df["name"] = df["name"].apply(str.title)

    df["department"] = df["post_code"].apply(department_from_postcode)

    # Keep only the rows that have a siret...
    df = df[df.siret.notnull()]

    # ...and an auth_email.
    df = df[df.auth_email.notnull()]

    for siret in df["siret"]:
        validate_siret(siret)

    assert df.siret.is_unique
    assert len(df) >= 150  # Export usually has 180+ geiq structures.

    return df
Ejemplo n.º 2
0
    def clean_siret(self):
        """
        Validate the submitted siret and gather the matching organization data.

        Spaces are removed from the submitted value before validation. The
        establishment is then fetched, rejected if it is closed, and geocoded.
        The collected values are stored in `self.org_data`.

        Returns the siret without spaces.
        Raises `forms.ValidationError` when the lookup reports an error or when
        the establishment is closed.
        """
        # `max_length` is skipped so that we can allow an arbitrary number of spaces in the user-entered value.
        raw_siret = self.cleaned_data["siret"]
        siret = raw_siret.replace(" ", "")
        validate_siret(siret)

        # Fetch name and address from API entreprise.
        establishment, api_error = etablissement_get_or_error(siret)
        if api_error:
            raise forms.ValidationError(api_error)

        if establishment.is_closed:
            closed_message = "La base Sirene indique que l'établissement est fermé."
            raise forms.ValidationError(closed_message)

        # Perform another API call to fetch geocoding data.
        # `address_line_2` is omitted on purpose because it tends to return no results with the BAN API.
        address_parts = (
            establishment.address_line_1,
            establishment.post_code,
            establishment.city,
            establishment.department,
        )
        # Empty parts are left out of the one-line address.
        one_line_address = ", ".join(filter(None, address_parts))
        geocoding_data = get_geocoding_data(
            one_line_address, post_code=establishment.post_code)
        if not geocoding_data:
            geocoding_data = {}

        org_data = {"siret": siret}
        # Fields copied verbatim from the establishment.
        for field_name in (
            "is_head_office",
            "name",
            "address_line_1",
            "address_line_2",
            "post_code",
            "city",
            "department",
        ):
            org_data[field_name] = getattr(establishment, field_name)
        # Geocoding values default to None when the lookup returned nothing.
        org_data["longitude"] = geocoding_data.get("longitude")
        org_data["latitude"] = geocoding_data.get("latitude")
        org_data["geocoding_score"] = geocoding_data.get("score")
        self.org_data = org_data

        return siret
Ejemplo n.º 3
0
def get_vue_structure_df():
    """
    Return the "Vue Structure" export as a dataframe.

    The "Vue Structure" export has the following fields:
    - asp_id
    - siret (current)
    - siret (initial aka siret_signature)
    - auth_email
    - name
    - address
    - phone
    but does *not* have those fields:
    - kind (found in the "Vue AF" export)
    - website (nowhere to be found)

    Rows without an auth_email are dropped; every remaining row is checked
    (siret, siret_signature, naf, email shape, matching first 9 siret characters).
    """
    # Columns that are read as strings.
    text_columns = (
        "structure_siret_actualise",
        "structure_siret_signature",
        "structure_adresse_mail_corresp_technique",
        "structure_adresse_gestion_cp",
        "structure_adresse_gestion_telephone",
    )
    df = get_fluxiae_df(
        vue_name="fluxIAE_Structure",
        converters={column: str for column in text_columns},
        description="Vue Structure",
        skip_first_row=True,
        # We need the phone number.
        anonymize_sensitive_data=False,
    )

    # Export column name -> name used in the returned dataframe.
    renamed_columns = {
        "structure_siret_actualise": "siret",
        "structure_siret_signature": "siret_signature",
        "structure_id_siae": "asp_id",
        "structure_adresse_mail_corresp_technique": "auth_email",
        "structure_code_naf": "naf",
        "structure_denomination": "name",
        # ASP recommends using *_gestion_* fields rather than *_admin_* ones.
        "structure_adresse_gestion_numero": "street_num",
        "structure_adresse_gestion_cplt_num_voie": "street_num_extra",
        "structure_adresse_gestion_type_voie": "street_type",
        "structure_adresse_gestion_nom_voie": "street_name",
        "structure_adresse_gestion_cp": "post_code",
        "structure_adresse_gestion_commune": "city",
        "structure_adresse_gestion_telephone": "phone",
        # The extra* fields have very low quality data,
        # their content does not reflect the field name at all.
        "structure_adresse_gestion_numero_apt": "extra1",
        "structure_adresse_gestion_entree": "extra2",
        "structure_adresse_gestion_cplt_adresse": "extra3",
    }
    df = remap_columns(df, column_mapping=renamed_columns)

    # Replace NaN elements with None.
    df = df.replace({np.nan: None})

    # Drop rows whose auth_email is missing or empty.
    has_email = df.auth_email.notnull() & (df.auth_email != "")
    df = df[has_email]

    for _, structure in df.iterrows():
        siret = structure.siret
        validate_siret(siret)
        siret_signature = structure.siret_signature
        validate_siret(siret_signature)
        validate_naf(structure.naf)
        email = structure.auth_email
        assert " " not in email
        assert "@" in email
        # Current and initial siret must share their first 9 characters.
        assert siret[:9] == siret_signature[:9]

    return df
Ejemplo n.º 4
0
def _build_ea_eatt_address_line_1(row):
    """Join the non-empty address parts of one EA/EATT row with single spaces."""
    parts = []
    if row.address_part1:
        # NOTE(review): address_part1 is assumed to already be text; a
        # non-string value makes the join below raise TypeError.
        parts.append(row.address_part1)
    if row.address_part2:
        # The second part is rendered as an integer (e.g. 12.0 becomes "12").
        parts.append(str(int(row.address_part2)))
    if row.address_part3:
        parts.append(f"{row.address_part3}")
    if row.address_part4:
        parts.append(f"{row.address_part4}")
    return " ".join(parts)


def get_ea_eatt_df():
    """
    Return the EA/EATT export as a cleaned dataframe.

    Steps performed, in order:
    - locate the export file and read it with pandas (siret, post code and
      phone are read as strings);
    - rename the export columns (see `column_mapping` below);
    - replace NaN with None and convert `kind` with `convert_kind`;
    - drop the rows without siret, then keep the first row of each siret;
    - validate every siret and build `address_line_1` from the four
      `address_part*` columns;
    - run `clean_string` on the text columns, title-case the name and derive
      `department` from the post code;
    - drop the rows without auth_email.

    Raises AssertionError when more than 20 emails are missing, when sirets
    are not unique or when fewer than 600 rows remain.
    """
    filename = get_filename(filename_prefix="Liste_Contact_EA",
                            filename_extension=".xlsx",
                            description="Export EA/EATT")

    siret_field_name = "Siret_Signataire"
    post_code_field_name = "CODE_POST_Signataire"
    phone_field_name = "TEL_CONT_Signataire"

    df = pd.read_excel(filename,
                       converters={
                           siret_field_name: str,
                           post_code_field_name: str,
                           phone_field_name: str
                       })

    column_mapping = {
        "Denomination_Sociale_Signataire": "name",
        "LIB_TYPE_EA": "kind",
        "NUM_ENTREE_Signataire": "address_part1",
        "NUM_VOIE_Signataire": "address_part2",
        "CODE_VOIE_Signataire": "address_part3",
        "LIB_VOIE_Signataire": "address_part4",
        post_code_field_name: "post_code",
        "LIB_COM_Signataire": "city",
        siret_field_name: "siret",
        "CRL_CONT_Signataire": "auth_email",
        phone_field_name: "phone",
    }
    df = remap_columns(df, column_mapping=column_mapping)

    # Replace NaN elements with None.
    df = df.replace({np.nan: None})

    df["kind"] = df.kind.apply(convert_kind)

    # Drop rows without siret, then keep the first row of each siret.
    # `.copy()` yields an independent dataframe, so the column assignments
    # below do not operate on a slice of the previous one.
    df = df[~df.siret.isnull()].drop_duplicates(
        subset=["siret"],
        keep="first",
    ).copy()

    # `iterrows()` yields a copy of each row, so writing to `row` would not
    # change `df`: collect the values and assign the whole column afterwards.
    address_lines = []
    for _, row in df.iterrows():
        validate_siret(row.siret)
        address_lines.append(_build_ea_eatt_address_line_1(row))
    df["address_line_1"] = address_lines

    # Clean string fields.
    df["name"] = df.name.apply(clean_string)
    df["kind"] = df.kind.apply(clean_string)
    df["address_line_1"] = df.address_line_1.apply(clean_string)
    df["post_code"] = df.post_code.apply(clean_string)
    df["city"] = df.city.apply(clean_string)
    df["siret"] = df.siret.apply(clean_string)
    df["auth_email"] = df.auth_email.apply(clean_string)
    df["phone"] = df.phone.apply(clean_string)

    # "EA LOU JAS" becomes "Ea Lou Jas".
    df["name"] = df.name.apply(str.title)

    df["department"] = df.post_code.apply(department_from_postcode)

    missing_emails_count = len(df[df.auth_email.isnull()])
    assert missing_emails_count <= 20, f"Too many missing emails: {missing_emails_count}"

    # Drop rows without auth_email.
    df = df[~df.auth_email.isnull()]

    assert df.siret.is_unique
    assert len(
        df
    ) >= 600, f"Export usually has 700+ EA/EATT structures (only {len(df)})."

    return df
Ejemplo n.º 5
0
 def test_validate_siret(self):
     """`validate_siret` rejects malformed values and accepts a 14-digit one."""
     rejected_values = (
         "1200001530001",  # 13 digits
         "120000153000111",  # 15 digits
         "1200001530001a",  # 14 characters, the last one is a letter
         "azertyqwerty",  # letters only
     )
     for rejected_value in rejected_values:
         with self.assertRaises(ValidationError):
             validate_siret(rejected_value)
     # Must not raise.
     validate_siret("12000015300011")