def join_cadastre():
    """Join the basol_cadastre table with the cadastre table."""
    # Input datasets
    basol_cadastre = Dataset("etl", "basol_cadastre")
    cadastre = Dataset("etl", "cadastre")
    # Output dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    # Output schema = input schema + a geography column
    basol_cadastre_joined.write_dtype([
        *basol_cadastre.read_dtype(),
        Column("geog", Geometry(srid=4326))])

    BasolCadastre = basol_cadastre.reflect()
    Cadastre = cadastre.reflect()

    session = basol_cadastre.get_session()
    join_cond = (BasolCadastre.commune == Cadastre.commune) & \
                (BasolCadastre.section == Cadastre.section) & \
                (BasolCadastre.numero == Cadastre.numero)
    # Stream results to keep memory usage bounded
    records = session.query(BasolCadastre, Cadastre.geog) \
        .join(Cadastre, join_cond) \
        .yield_per(500)

    with basol_cadastre_joined.get_writer() as writer:
        for (basol, geog) in records:
            output_row = {**row2dict(basol), "geog": geog}
            # Drop the input primary key; the output generates its own
            output_row.pop("id")
            writer.write_row_dict(output_row)
    session.close()
def merge_cadastre():
    """Merge the different parcelles of a site into a MultiPolygon."""
    # Input dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")
    # Output dataset
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    basol_cadastre_merged.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("geog", Geometry(srid=4326))])

    BasolCadastreJoined = basol_cadastre_joined.reflect()
    session = basol_cadastre_joined.get_session()

    # Aggregate all parcel geometries of a site into one MultiPolygon
    records = session.query(
            BasolCadastreJoined.numerobasol,
            func.st_multi(func.st_union(BasolCadastreJoined.geog))) \
        .group_by(BasolCadastreJoined.numerobasol) \
        .all()

    with basol_cadastre_merged.get_writer() as writer:
        for (numerobasol, geog) in records:
            writer.write_row_dict({"numerobasol": numerobasol, "geog": geog})
    session.close()
def add_parcels():
    """Left-join basol_intersected with basol_cadastre_merged and use the
    merged parcel geometry when one exists."""
    # Input datasets
    basol_intersected = Dataset("etl", "basol_intersected")
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")
    # Output dataset
    basol_with_parcels = Dataset("etl", "basol_with_parcels")

    BasolIntersected = basol_intersected.reflect()
    BasolCadastreMerged = basol_cadastre_merged.reflect()

    basol_with_parcels.write_dtype(basol_intersected.read_dtype())

    session = basol_intersected.get_session()
    join_cond = (BasolIntersected.numerobasol ==
                 BasolCadastreMerged.numerobasol)
    records = session.query(BasolIntersected, BasolCadastreMerged.geog) \
        .join(BasolCadastreMerged, join_cond, isouter=True) \
        .all()

    with basol_with_parcels.get_writer() as writer:
        for (record, parcel_geog) in records:
            if parcel_geog is not None:
                # A merged parcel geometry exists => parcel-level precision
                record.geog_precision = precisions.PARCEL
                record.geog_source = "cadastre"
                record.geog = parcel_geog
            writer.write_row_dict(row2dict(record))
    session.close()
def merge_geog():
    """Choose the best precision between the initial Lambert II coordinates
    and the geocoded coordinates when geog was not set from cadastre data."""
    # Input dataset
    basol_geocoded = Dataset("etl", "basol_normalized")
    # Output dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")

    basol_geog_merged.write_dtype([
        *basol_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)])

    BasolGeocoded = basol_geocoded.reflect()
    session = basol_geocoded.get_session()

    # Point from the initial Lambert II coordinates, reprojected to WGS84
    point_lambert2 = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(
                BasolGeocoded.coordxlambertii,
                BasolGeocoded.coordylambertii),
            LAMBERT2),
        WGS84)
    # Point from the geocoded longitude / latitude (already WGS84)
    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(
            BasolGeocoded.geocoded_longitude,
            BasolGeocoded.geocoded_latitude),
        WGS84)

    records = session.query(BasolGeocoded, point_lambert2,
                            point_geocoded).all()

    with basol_geog_merged.get_writer() as writer:
        for (record, p_lambert2, p_geocoded) in records:
            output_row = {
                **row2dict(record),
                "geog": None,
                "geog_precision": None,
                "geog_source": None}
            if record.l2e_precision == precisions.HOUSENUMBER:
                # The initial coordinates are house-number accurate
                output_row["geog"] = p_lambert2
                output_row["geog_precision"] = record.l2e_precision
                output_row["geog_source"] = "lambert2"
            elif (record.geocoded_result_type == precisions.HOUSENUMBER) and \
                    (record.geocoded_result_score >= 0.6):
                # Fall back on the geocoder when it is confident enough
                output_row["geog"] = p_geocoded
                output_row["geog_precision"] = record.geocoded_result_type
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)
    session.close()
def check():
    """Check that basol has the same number of records as the filtered
    data, and that it is not empty."""
    def _count(schema, table):
        # Count the rows of a table, closing the session afterwards
        dataset = Dataset(schema, table)
        model = dataset.reflect()
        session = dataset.get_session()
        n = session.query(model).count()
        session.close()
        return n

    basol_count = _count("etl", "basol")
    basol_filtered_count = _count("etl", "basol_filtered")

    assert basol_count > 0
    assert basol_count == basol_filtered_count
def add_geog():
    """
    Join the parsed basias cadastre records with the cadastre table in
    order to discard invalid parcels.

    We use the cadastre table from the `kelrisks` schema and not the one
    from the `etl` schema because there is not enough storage on the
    server to hold 4 cadastre tables (preprod `etl`, preprod `kelrisks`,
    prod `etl`, prod `kelrisks`). The cadastre tables of the `etl` schema
    are therefore dropped once copied to the `kelrisks` schema, so
    `etl.cadastre` may not exist when this DAG runs.
    """
    # Input datasets
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")
    cadastre = Dataset("kelrisks", "cadastre")
    # Output dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")

    BasiasCadastreParsed = basias_cadastre_parsed.reflect()
    Cadastre = cadastre.reflect()

    basias_cadastre_with_geog.write_dtype([
        *basias_cadastre_parsed.read_dtype(),
        Column("geog", Geometry(srid=4326))])

    session = basias_cadastre_parsed.get_session()
    join_cond = (BasiasCadastreParsed.commune == Cadastre.commune) & \
                (BasiasCadastreParsed.section == Cadastre.section) & \
                (BasiasCadastreParsed.numero == Cadastre.numero)
    # Stream results to keep memory usage bounded
    records = session.query(BasiasCadastreParsed, Cadastre.geog) \
        .join(Cadastre, join_cond) \
        .yield_per(500)

    with basias_cadastre_with_geog.get_writer() as writer:
        for (record, geog) in records:
            output_row = {**row2dict(record), "geog": None}
            # Drop the input primary key; the output generates its own
            output_row.pop("id")
            if geog is not None:
                output_row["geog"] = geog
            writer.write_row_dict(output_row)
    session.close()
def check():
    """
    Sanity checks on the generated table.

    For now we only verify that the number of records is greater than 0.
    """
    basias = Dataset("etl", "basias")
    Basias = basias.reflect()
    session = basias.get_session()
    record_count = session.query(Basias).count()
    session.close()
    assert record_count > 0
def check():
    """
    Sanity checks on the generated table.

    For now we only verify that the number of records is greater than 0.
    """
    sis = Dataset("etl", "sis_with_precision")
    SIS = sis.reflect()
    session = sis.get_session()
    record_count = session.query(SIS).count()
    session.close()
    assert record_count > 0
def join_sites_localisation():
    """Left-join the basias sites table with the localisation table."""
    # Datasets to join
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")
    # Output dataset
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")

    # Output schema = merged schemas of both inputs
    output_dtype = merge_dtype([
        *basias_sites_prepared.read_dtype(),
        *basias_localisation_with_cadastre.read_dtype()])
    basias_sites_localisation_joined.write_dtype(output_dtype)

    BasiasSitesPrepared = basias_sites_prepared.reflect()
    BasiasLocalisation = basias_localisation_with_cadastre.reflect()
    session = basias_sites_prepared.get_session()

    records = session \
        .query(BasiasSitesPrepared, BasiasLocalisation) \
        .join(BasiasLocalisation,
              BasiasSitesPrepared.indice_departemental ==
              BasiasLocalisation.indice_departemental,
              isouter=True) \
        .all()

    with basias_sites_localisation_joined.get_writer() as writer:
        for (site, localisation) in records:
            # Start from an all-None row so that fields missing from an
            # unmatched localisation are still present in the output
            output_row = {c.name: None for c in output_dtype}
            output_row.update(row2dict(site))
            if localisation:
                output_row.update(row2dict(localisation))
            del output_row["id"]
            writer.write_row_dict(output_row)
    session.close()
def add_commune():
    """
    Add the commune outline as the geog value for records that have no
    geography yet (MUNICIPALITY precision).

    The SqlAlchemy query is equivalent to

    SELECT *
    FROM etl.basias_sites_localisation_joined A
    LEFT JOIN etl.commune B
    ON A.numero_insee == B.insee
    """
    # Input datasets
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")
    communes = Dataset("etl", "commune")
    # Output dataset
    basias_sites_with_commune = Dataset("etl", "basias_sites_with_commune")

    basias_sites_with_commune.write_dtype(
        basias_sites_localisation_joined.read_dtype())

    BasiasSites = basias_sites_localisation_joined.reflect()
    Commune = communes.reflect()

    session = basias_sites_localisation_joined.get_session()
    records = session.query(BasiasSites, Commune.geog) \
        .join(Commune,
              BasiasSites.numero_insee == Commune.insee,
              isouter=True) \
        .all()

    with basias_sites_with_commune.get_writer() as writer:
        for (record, commune_geog) in records:
            if record.geog is None:
                # Fall back on the commune outline
                record.geog = commune_geog
                record.geog_precision = precisions.MUNICIPALITY
                record.geog_source = "numero_insee"
            writer.write_row_dict(row2dict(record))
    session.close()
def add_commune():
    """
    Replace geog with the commune outline when the geographic precision
    is MUNICIPALITY.

    The SqlAlchemy query is equivalent to

    SELECT *
    FROM etl.s3ic_with_parcelle A
    LEFT JOIN etl.commune B
    ON A.code_insee == B.insee
    """
    # input dataset
    s3ic_with_parcelle = Dataset("etl", "s3ic_with_parcelle")
    communes = Dataset("etl", "commune")
    # output dataset
    s3ic_with_commune = Dataset("etl", "s3ic_with_commune")

    dtype = s3ic_with_parcelle.read_dtype()
    s3ic_with_commune.write_dtype(dtype)

    S3icWithParcelle = s3ic_with_parcelle.reflect()
    Commune = communes.reflect()

    session = s3ic_with_parcelle.get_session()
    q = session.query(S3icWithParcelle, Commune.geog) \
        .join(Commune,
              S3icWithParcelle.code_insee == Commune.insee,
              isouter=True) \
        .all()

    with s3ic_with_commune.get_writer() as writer:
        for (row, commune) in q:
            if row.geog_precision == precisions.MUNICIPALITY:
                row.geog = commune
                # BUGFIX: was `row.precision = ...`, which set a stray
                # attribute that is not a column of the table and was
                # silently dropped by row2dict. The column used by every
                # other recipe (and written upstream) is geog_precision.
                row.geog_precision = precisions.MUNICIPALITY
                row.geog_source = "code_insee"
            writer.write_row_dict(row2dict(row))
    session.close()
def intersect():
    """
    Project each point onto the closest parcelle within a neighbourhood of
    ~10m (0.0001 degree) and use it as the new geography. When no parcelle
    is found in that neighbourhood the original point is kept.
    """
    # Input dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")
    cadastre = Dataset("etl", "cadastre")
    # Output dataset
    basol_intersected = Dataset("etl", "basol_intersected")

    dtype = basol_geog_merged.read_dtype()
    basol_intersected.write_dtype(dtype)

    Cadastre = cadastre.reflect()
    BasolGeogMerged = basol_geog_merged.reflect()
    session = basol_geog_merged.get_session()

    # Correlated subquery: geog of the nearest parcelle within 0.0001 degree
    stmt = session.query(Cadastre.geog) \
        .filter(func.st_dwithin(
            Cadastre.geog,
            BasolGeogMerged.geog,
            0.0001)) \
        .order_by(func.st_distance(
            Cadastre.geog,
            BasolGeogMerged.geog)) \
        .limit(1) \
        .label("nearest")

    # FIX: stream results with yield_per instead of materializing the
    # whole table with .all(), consistent with the equivalent s3ic
    # recipe (add_parcelle) and bounded in memory
    q = session.query(BasolGeogMerged, stmt).yield_per(500)

    with basol_intersected.get_writer() as writer:
        for (row, cadastre_geog) in q:
            if cadastre_geog is not None:
                row.geog = cadastre_geog
            writer.write_row_dict(row2dict(row))
    session.close()
def add_communes():
    """Set the commune outline as geog for all records for which no
    geography with a better precision has been found."""
    # Input datasets
    basol_with_parcels = Dataset("etl", "basol_with_parcels")
    communes = Dataset("etl", "commune")
    # Output dataset
    basol_with_communes = Dataset("etl", "basol_with_commune")

    basol_with_communes.write_dtype(basol_with_parcels.read_dtype())

    BasolWithParcels = basol_with_parcels.reflect()
    Commune = communes.reflect()

    session = basol_with_parcels.get_session()
    records = session.query(BasolWithParcels, Commune.geog) \
        .join(Commune,
              BasolWithParcels.code_insee == Commune.insee,
              isouter=True) \
        .all()

    with basol_with_communes.get_writer() as writer:
        for (record, commune_geog) in records:
            if record.geog is None:
                # Fall back on the commune outline
                record.geog = commune_geog
                record.geog_precision = precisions.MUNICIPALITY
                record.geog_source = "code_insee"
            writer.write_row_dict(row2dict(record))
    session.close()
def join_localisation_cadastre():
    """
    Left-join the basias localisation table with the merged cadastre
    table, preferring the cadastre geometry when one exists.
    """
    # Input datasets
    basias_localisation_intersected = Dataset(
        "etl", "basias_localisation_intersected")
    basias_cadastre_merged = Dataset("etl", "basias_cadastre_merged")
    # Output dataset
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")

    basias_localisation_with_cadastre.write_dtype(
        basias_localisation_intersected.read_dtype())

    BasiasLocalisationIntersected = basias_localisation_intersected.reflect()
    BasiasCadastreMerged = basias_cadastre_merged.reflect()

    session = basias_localisation_intersected.get_session()
    cond = BasiasCadastreMerged.indice_departemental \
        == BasiasLocalisationIntersected.indice_departemental
    q = session.query(
        BasiasLocalisationIntersected,
        BasiasCadastreMerged.geog) \
        .join(BasiasCadastreMerged, cond, isouter=True) \
        .all()

    with basias_localisation_with_cadastre.get_writer() as writer:
        for (row, cadastre_geog) in q:
            if cadastre_geog is not None:
                # replace geog with cadastre geog
                row.geog = cadastre_geog
                row.geog_precision = precisions.PARCEL
                row.geog_source = "cadastre"
            writer.write_row_dict(row2dict(row))
    # BUGFIX: the session was never closed, leaking a DB connection;
    # every other recipe in this file closes its session
    session.close()
def merge_cadastre_geog():
    """
    Aggregate the different parcelles of a basias site into a MultiPolygon.
    """
    # Input dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")
    # Output dataset
    basias_cadastre_merged = Dataset("etl", "basias_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("indice_departemental", String),
        Column("geog", Geometry(srid=4326))
    ]
    # FIX: write_dtype was accidentally called twice with the same dtype;
    # a single call is enough
    basias_cadastre_merged.write_dtype(dtype)

    BasiasCadastreWithGeog = basias_cadastre_with_geog.reflect()
    session = basias_cadastre_with_geog.get_session()

    # Aggregate all parcel geometries of a site into one MultiPolygon
    select = [
        BasiasCadastreWithGeog.indice_departemental,
        func.st_multi(func.st_union(BasiasCadastreWithGeog.geog))
    ]
    q = session.query(*select) \
        .group_by(BasiasCadastreWithGeog.indice_departemental) \
        .all()

    with basias_cadastre_merged.get_writer() as writer:
        for (indice_departemental, geog) in q:
            row = {"indice_departemental": indice_departemental,
                   "geog": geog}
            writer.write_row_dict(row)
    session.close()
def parse_cadastre():
    """
    Extract parcelles from the "numero_de_parcelle" field, which is not
    consistently formatted.

    Example values:
    ZA 128 et 146
    ? 131
    AH 575-574-43-224
    etc
    """
    # Input datasets
    basias_cadastre_filtered = Dataset("etl", "basias_cadastre_filtered")
    basias_localisation_filtered = Dataset("etl",
                                           "basias_localisation_filtered")
    # Output dataset
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")

    BasiasCadastreFiltered = basias_cadastre_filtered.reflect()
    BasiasLocalisationFiltered = basias_localisation_filtered.reflect()
    session = basias_cadastre_filtered.get_session()

    # Write the output schema
    basias_cadastre_parsed.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("indice_departemental", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String)])

    # Join with the localisation table to retrieve the insee code
    records = session \
        .query(BasiasCadastreFiltered,
               BasiasLocalisationFiltered.numero_insee) \
        .join(
            BasiasLocalisationFiltered,
            BasiasCadastreFiltered.indice_departemental ==
            BasiasLocalisationFiltered.indice_departemental,
            isouter=True) \
        .all()

    with basias_cadastre_parsed.get_writer() as writer:
        for (cadastre, numero_insee) in records:
            row = {**row2dict(cadastre), "numero_insee": numero_insee}
            # One output row per parcelle found in the free-text field
            parcelles = transformers.extract_basias_parcelles_from_row(row)
            for parcelle in parcelles:
                writer.write_row_dict({
                    "indice_departemental": row["indice_departemental"],
                    "commune": parcelle.commune,
                    "section": parcelle.section,
                    "numero": parcelle.numero})
    session.close()
def add_parcelle():
    """
    Project the points onto the closest parcelle, limited to a
    neighbourhood of about 10m = 0.0001 degree. If no parcelle is found
    in that neighbourhood the original point is kept.

    The SQL Alchemy query is equivalent to

    SELECT *,
        (SELECT geog
        FROM kelrisks.cadastre AS c
        WHERE st_dwithin(ic.geog, c.geog, 0.0001)
        ORDER BY st_distance(ic.geog, c.geog)
        LIMIT 1) nearest
    FROM etl.s3ic_geog_merged ic

    We use the cadastre table from the `kelrisks` schema and not the one
    from the `etl` schema because there is not enough storage on the
    server to hold 4 cadastre tables (preprod `etl`, preprod `kelrisks`,
    prod `etl`, prod `kelrisks`). The cadastre tables of the `etl` schema
    are therefore dropped once copied to the `kelrisks` schema, so
    `etl.cadastre` may not exist when this DAG runs.
    """
    # Input datasets
    s3ic_geog_merged = Dataset("etl", "s3ic_geog_merged")
    cadastre = Dataset("kelrisks", "cadastre")
    # Output dataset
    s3ic_with_parcelle = Dataset("etl", "s3ic_with_parcelle")

    s3ic_with_parcelle.write_dtype(s3ic_geog_merged.read_dtype())

    Cadastre = cadastre.reflect()
    S3icGeogMerged = s3ic_geog_merged.reflect()
    session = s3ic_geog_merged.get_session()

    # Correlated subquery: geog of the nearest parcelle within 0.0001 degree
    nearest = session.query(Cadastre.geog) \
        .filter(func.st_dwithin(
            Cadastre.geog,
            S3icGeogMerged.geog,
            0.0001)) \
        .order_by(func.st_distance(
            Cadastre.geog,
            S3icGeogMerged.geog)) \
        .limit(1) \
        .label("nearest")

    records = session.query(S3icGeogMerged, nearest).yield_per(500)

    with s3ic_with_parcelle.get_writer() as writer:
        for (record, cadastre_geog) in records:
            # Do not snap to a parcelle when the record only has
            # municipality-level precision
            if cadastre_geog is not None \
                    and record.geog_precision != precisions.MUNICIPALITY:
                record.geog = cadastre_geog
            writer.write_row_dict(row2dict(record))
    session.close()
def merge_geog():
    """
    Merge the geographic information and produce the normalized fields
    geog, geog_precision and geog_source.

    We favor the geocoding information when the initial precision is
    MUNICIPALITY and the geocoding score is greater than 0.6
    """
    # Input dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")
    # Output dataset
    s3ic_merged = Dataset("etl", "s3ic_geog_merged")

    dtype = s3ic_normalized.read_dtype()
    # geom => geog
    # lib_precis => precision
    dtype = [c for c in dtype if c.name not in ("geom", "lib_precis")]
    s3ic_merged.write_dtype([
        *dtype,
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)])

    S3icNormalized = s3ic_normalized.reflect()
    session = s3ic_normalized.get_session()

    # Geocoded point in WGS84
    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(S3icNormalized.geocoded_longitude,
                          S3icNormalized.geocoded_latitude), WGS84)

    records = session.query(S3icNormalized, point_geocoded).all()

    with s3ic_merged.get_writer() as writer:
        for (record, point) in records:
            output_row = {
                **row2dict(record),
                "geog": record.geog,
                "geog_precision": record.precision,
                "geog_source": "initial_data"}
            # Prefer the geocoded point only when the initial precision
            # is weak and the geocoder returned a confident house-number
            weak_precision = record.precision not in \
                (precisions.HOUSENUMBER, precisions.PARCEL)
            has_coords = record.geocoded_latitude and \
                record.geocoded_longitude
            good_score = record.geocoded_result_score and \
                record.geocoded_result_score > 0.6
            is_housenumber = \
                record.geocoded_result_type == precisions.HOUSENUMBER
            if weak_precision and has_coords and good_score \
                    and is_housenumber:
                output_row["geog"] = point
                output_row["geog_precision"] = record.geocoded_result_type
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)
    session.close()
def merge_geog():
    """
    Select the best geographic information among
    - (x_lambert2_etendue, y_lambert2_etendue)
    - (xl2_adresse, yl2_adresse)
    - the geocoded point
    and add the associated precision and the source of the information.
    """
    # Input dataset
    basias_localisation_geocoded = Dataset("etl",
                                           "basias_localisation_geocoded")
    # Output dataset
    basias_localisation_geog_merged = Dataset(
        "etl", "basias_localisation_geog_merged")

    basias_localisation_geog_merged.write_dtype([
        *basias_localisation_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)])

    BasiasLocalisationGeocoded = basias_localisation_geocoded.reflect()
    session = basias_localisation_geocoded.get_session()

    # Point from the Lambert II étendue coordinates, reprojected to WGS84
    point_lambert2_etendue = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(
                BasiasLocalisationGeocoded.x_lambert2_etendue,
                BasiasLocalisationGeocoded.y_lambert2_etendue),
            LAMBERT2), WGS84)
    # Point from the address coordinates, reprojected to WGS84
    point_adresse = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(
                BasiasLocalisationGeocoded.xl2_adresse,
                BasiasLocalisationGeocoded.yl2_adresse),
            LAMBERT2), WGS84)
    # Geocoded point (already WGS84)
    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(
            BasiasLocalisationGeocoded.geocoded_longitude,
            BasiasLocalisationGeocoded.geocoded_latitude),
        WGS84)

    records = session.query(BasiasLocalisationGeocoded,
                            point_lambert2_etendue,
                            point_adresse,
                            point_geocoded).all()

    with basias_localisation_geog_merged.get_writer() as writer:
        for (record, p_lambert2_etendue, p_adresse, p_geocoded) in records:
            output_row = {
                **row2dict(record),
                "geog": None,
                "geog_precision": None,
                "geog_source": None}
            if p_lambert2_etendue is not None:
                # assume this data is accurate
                output_row["geog"] = p_lambert2_etendue
                output_row["geog_precision"] = precisions.PARCEL
                output_row["geog_source"] = "lambert2_etendue"
            elif p_adresse is not None and \
                    record.precision_adresse == "numéro":
                output_row["geog"] = p_adresse
                output_row["geog_precision"] = precisions.HOUSENUMBER
                output_row["geog_source"] = "adresse"
            elif p_geocoded is not None \
                    and record.geocoded_result_score > 0.6 \
                    and record.geocoded_result_type == \
                    precisions.HOUSENUMBER:
                output_row["geog"] = p_geocoded
                output_row["geog_precision"] = precisions.HOUSENUMBER
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)
    session.close()