def start(args):
    """Entry point: publish-only mode, or full export of a file/directory.

    args: parsed argparse namespace (uses justPublish, path, plus the flags
    forwarded to Hamelin via vars(args)).

    Fixes: removed the unused ``pathsToProcess`` local and flattened the
    nested conditionals into guard clauses.
    """
    if args.justPublish:
        # publish-only mode needs no PostGIS connection
        Hamelin(vars(args), None)
        return
    connection = ConnectionPool()
    if not connection.isAvailable():
        print("error conectando a PostGIS", Logs.ERROR)
        return
    path = args.path
    if os.path.exists(path):
        if os.path.isdir(path):
            # walk sub-directories, exporting each one; the top-level directory
            # itself is skipped when it only contains dataset sub-directories
            skipFirst = True
            for root, dirnames, filenames in os.walk(path):
                if skipFirst and len(dirnames) > 0:
                    skipFirst = False
                    continue
                arguments = vars(args).copy()
                arguments["path"] = root
                Hamelin(arguments, connection).export()
        else:
            Hamelin(vars(args), connection).export()
        # release every pooled connection once processing is done
        connection.closeAll()
def extract( path, ckanID ):
    """Unzip the resource at *path* into a per-resource directory.

    Returns the list of accepted files found inside the extracted tree, or
    None when extraction fails.
    """
    target = "unzipped_" + ckanID
    # KMZ archives are not restricted to the "conjunto_de_datos" sub-directory
    if "kmz" in path:
        requiredDir = None
    else:
        requiredDir = "conjunto_de_datos"
    print("Extrayendo en " + target, Logs.INFO )
    if not utils.extractZipFile(path, target):
        return None
    return utils.listAllFiles(target, acceptedFiles, requiredDir)
def process(file, datasetName, outputDir, rawData=None, removeInvalidProperties=False):
    """Convert a KML file to GeoJSON and emit its SQL script.

    file: path to the KML file.
    datasetName: target table/script name.
    outputDir: directory where the .sql script is written.
    rawData: pre-read KML text; read from *file* when None.
    removeInvalidProperties: when True, strip attributes from <Document> tags
        before parsing (set automatically on the retry path below).

    On an XML parse error the function retries ONCE with
    removeInvalidProperties=True; a second failure is only logged.
    """
    rawData = rawData if rawData is not None else utils.getEncodedFileContent( file)[0]
    # restrict kml2geojson to the geometry types this pipeline supports
    kml2geojson.main.GEOTYPES = ["Polygon", "LineString", "Point"]
    try:
        data = rawData
        # remove all Document properties (attributes on <Document ...> tags
        # can break minidom parsing)
        if removeInvalidProperties:
            data = re.compile("<Documen[^>]*>").sub("<Document>", data)
        geojson = kml2geojson.main.build_feature_collection( md.parseString(data))
        for element in geojson["features"]:
            # extract data from nested HTML tables embedded in property values
            properties = {}
            for prop in element["properties"]:
                value = str(element["properties"][prop])
                if "<table" in value:
                    page = htmlParser.document_fromstring(value)
                    # each 2-cell <tr> is treated as a (name, value) pair
                    for row in page.xpath("body/table")[0].findall("tr"):
                        childs = row.findall("td")
                        if len(childs) == 2:
                            variableName = utils.normalizeText(childs[0].text)
                            # NOTE(review): variableName is normalized twice here;
                            # harmless only if normalizeText is idempotent — confirm
                            properties[utils.normalizeText( variableName)] = utils.getValidTextValue( childs[1].text)
                else:
                    # styleUrl is presentation-only metadata; drop it
                    if (prop != "styleUrl"):
                        properties[utils.normalizeText( prop)] = utils.getValidTextValue(value)
            element["properties"] = properties
        # NOTE(review): return value of writeSQLScript is assigned but never used
        sql = geojsonHandler.writeSQLScript(file, datasetName, outputDir, geojsonObject=geojson)
    except xml.parsers.expat.ExpatError as err:
        if removeInvalidProperties:
            # already retried once with cleanup — give up and log
            print("no se pudo parsear el archivo", Logs.ERROR, {"error": str(err)})
        else:
            # retry with <Document> attribute stripping enabled
            process(file, datasetName, outputDir, rawData=rawData, removeInvalidProperties=True)
def getConnection(self):
    """Check a connection out of the pool, recycling the pool once on failure.

    Returns a connection with autocommit enabled.
    """
    try:
        connection = self.pool.getconn()
    except PoolError:
        # pool exhausted or broken: drop every pooled connection, retry once
        print("error obteniendo una conexión a PostGIS, reintentando...", Logs.ERROR)
        self.pool.closeall()  # close all active connections in the pool
        connection = self.pool.getconn()
    connection.autocommit = True
    return connection
def printSQLScript(self, outputDir):
    """Dispatch self.file to the converter registered for its file type.

    Returns True when the converter ran to completion, False when the type
    has no converter or the conversion failed.
    """
    tableName = self.file["tableName"]
    fileType = self.file["type"]
    if fileType in processRouter:
        handler = processRouter[fileType]
        try:
            handler(self.file["path"], tableName, outputDir)
            return True
        except TypeError:
            # converter produced no file; fall through and report failure
            pass
        except Exception as e:
            print("Error procesando archivo", Logs.ERROR, {"error": str(e)})
    return False
def getEncodedFileContent(filename):
    """Read *filename* trying each encoding in ENCODERS_LIST, in order.

    Returns (content, encoder) for the first encoding that decodes the whole
    file, or None when every candidate fails.

    Fix: the file is now opened in a ``with`` block, so the handle is closed
    even when ``read()`` raises a decode error (the original leaked the open
    handle on every failed encoding attempt).
    """
    for encoder in ENCODERS_LIST:
        try:
            print("Leyendo archivo. encoding=" + str(encoder), Logs.INFO)
            with open(filename, encoding=encoder) as file:
                content = file.read()
            return content, encoder
        except Exception as e:
            print("Error leyendo archivo", Logs.ERROR, {"error": str(e)})
    return None
def extractZipFile(path, unzippedDir):
    """Extract the zip archive at *path* into *unzippedDir*.

    Returns True on success; False for an invalid archive or any other
    extraction error (both are logged).

    Fix: the ZipFile is now opened in a ``with`` block, so the archive handle
    is closed even when ``extractall`` raises (the original only closed it on
    the success path).
    """
    try:
        with zipfile.ZipFile(path, "r") as zip_ref:
            zip_ref.extractall(unzippedDir)
        return True
    except BadZipFile as badZipException:
        print("archivo invalido", Logs.ERROR, {"error": str(badZipException)})
    except Exception as err:
        print("no se pudo extraer el contenido", Logs.ERROR, {
            "type": type(err),
            "message": str(err)
        })
    return False
def listAllFiles(dir, acceptedFiles=None, acceptedDirectory=None):
    """Recursively collect file paths under *dir*.

    acceptedFiles: optional collection of file types to keep (None = all).
    acceptedDirectory: optional substring that must appear in the full path
        (None = no path restriction).
    Files whose type cannot be identified are logged and skipped.
    Returns the list of matching paths.
    """
    matches = []
    for root, _, filenames in os.walk(dir):
        for filename in filenames:
            try:
                _, fileType = getFileNameAndType(filename)
            except Exception:
                print("No se pudo identificar el tipo de archivo", Logs.ERROR,
                      {"path": filename})
                continue
            if acceptedFiles is not None and fileType not in acceptedFiles:
                continue
            fullPath = os.path.join(root, filename)
            if acceptedDirectory is None or acceptedDirectory in fullPath:
                matches.append(fullPath)
    return matches
def process(file, datasetName, outputDir, jsonObject=None): data = json.loads(utils.getEncodedFileContent(file) [0]) if jsonObject is None else jsonObject # check if is geojson if "features" in data: geojsonObject = data else: geojsonObject = {"type": "FeatureCollection", "features": []} latitudColumn = None longitudColumn = None for column in list(data[0].keys()): # geographic columns if re.match(LATITUDE_KEY, column, re.IGNORECASE) is not None: latitudColumn = column elif re.match(LONGITUDE_KEY, column, re.IGNORECASE) is not None: longitudColumn = column if latitudColumn is None or longitudColumn is None: print("No se encontro información geografica", Logs.ERROR) return for row in data: feature = { "type": "Feature", "geometry": { "type": "Point", "coordinates": [ row.pop(longitudColumn, "0"), row.pop(latitudColumn, "0") ] }, "properties": row } geojsonObject["features"].append(feature) geojson.writeSQLScript(file, datasetName, outputDir, geojsonObject=geojsonObject)
def readFeatureData(self):
    """Populate this object's metadata from DGM and CKAN.

    When self.featureName is set the metadata is filled from the environment
    (FEATURE_LABEL) with blank placeholders and no remote lookup is made;
    NOTE(review): this branch returns None (falsy) — confirm callers treat
    that as success, since the remote branch returns isValid()/False.

    Otherwise the resource is looked up in the DGM API by self.ckanID, then
    its package in the CKAN API; name, description, organization, category
    (the "theme" extra) and active tags are copied in. Returns False when the
    resource is unknown, self.isValid() on success, and None when a request
    fails (the error is logged).
    """
    if self.featureName:
        self.name = os.getenv("FEATURE_LABEL")
        self.ckanID = ""
        self.package_id = ""
        self.description = ""
        self.category = ""
    else:
        # get DGM information
        try:
            with urlopen(
                    DGM_RESOURCES_API + "?id={}".format(
                        self.ckanID
                    )
            ) as dgmResponse:
                dgmResponse = json.loads(dgmResponse.read().decode("utf-8"))
                if dgmResponse["pagination"]["total"] == 0:
                    # resource not present in DGM
                    return False
                dgmResponse = dgmResponse["results"][0]
                self.package_id = dgmResponse["package-id"]
                # get CKAN information
                with urlopen(
                        CKAN_PACKAGE_API + "?id={}".format(
                            self.package_id
                        )
                ) as ckanResponse:
                    ckanResponse = json.loads(ckanResponse.read().decode("utf-8"))
                    ckanResponse = ckanResponse["result"]
                    self.name = dgmResponse["name"] if "name" in dgmResponse else ""
                    self.description = dgmResponse["description"] if "description" in dgmResponse else ""
                    if "organization" in ckanResponse:
                        self.organization = ckanResponse["organization"]
                    # category comes from the CKAN "theme" extra
                    for property in ckanResponse["extras"]:
                        if property["key"] == "theme":
                            self.category=property["value"]
                    if "tags" in ckanResponse:
                        # keep only active tags' display names
                        for tag in ckanResponse["tags"]:
                            if tag["state"] == "active":
                                self.tags.append(tag["display_name"])
                        self.tagsRaw = ckanResponse["tags"]
            return self.isValid()
        except Exception as e:
            print("no se pudieron obtener los metadatos", Logs.ERROR, {"error": str(e)})
def __init__(self):
    """Create the PostGIS connection pool from POSTGRES_* settings.

    Pool size: 1 .. POSTGRES_MAX_CONNECTIONS (env var, default 10).
    NOTE(review): when pool creation fails, the error is only logged and
    ``self.pool`` is never assigned — later attribute access would raise;
    presumably isAvailable() guards this, verify.
    """
    try:
        min_connections = 1
        # env override for the pool's upper bound, default 10
        max_connections = int(
            os.getenv("POSTGRES_MAX_CONNECTIONS")) if os.getenv(
                "POSTGRES_MAX_CONNECTIONS") is not None else 10
        self.pool = SimpleConnectionPool(min_connections, max_connections,
                                         dbname=POSTGRES_DBNAME,
                                         user=POSTGRES_USER,
                                         host=POSTGRES_HOST,
                                         password=POSTGRES_PASSWORD,
                                         port=POSTGRES_PORT)
    except Exception as e:
        print(
            str(e), Logs.ERROR, {
                "postgresql": "{hostname}:{port}/{dbname}".format(hostname=POSTGRES_HOST,
                                                                  port=POSTGRES_PORT,
                                                                  dbname=POSTGRES_DBNAME)
            })
def process(file, datasetName, outputDir): sqlFilePath = "{outputDir}/{script}.sql".format(outputDir=outputDir, script=datasetName) # get shp projection srid = GEOMETRY_COLUMN_SRID projFileName = file[:-4] + ".prj" if os.path.exists(projFileName): srid = getShpProjection(projFileName) for encoder in ENCODERS_LIST: try: print("Leyendo archivo", {"encoding": encoder}) reader = shapefile.Reader(file, encoding=encoder) fields = reader.fields[1:] field_names = [field[0] for field in fields] buffer = [] for sr in reader.shapeRecords(): atr = dict(zip(field_names, sr.record)) geom = sr.shape.__geo_interface__ buffer.append( dict(type="Feature", geometry=geom, properties=atr)) geojsonObject = { "type": "FeatureCollection", "features": buffer } geojson.writeSQLScript(file, datasetName, outputDir, geojsonObject=geojsonObject, srid=srid) return except UnicodeDecodeError as unicodeError: print("no se pudo leer el archivo", Logs.ERROR, {"error": str(unicodeError)}) else: print("no se encontro el archivo .prj")
def runScript(self, scriptDir):
    """Execute every .sql script found under *scriptDir* against PostGIS.

    Scripts are split on utils.QUERY_DELIMITER and executed one query at a
    time; failing queries are logged and skipped. Counts INSERT INTO queries
    and captures the geometry type from the first AddGeometryColumn call.

    Returns (insertCounter, geometryType); geometryType is None when no
    AddGeometryColumn query was seen.
    """
    insertCounter = 0
    geometryType = None
    conn = self.getConnection()
    for root, dirnames, filenames in os.walk(scriptDir):
        for filename in filenames:
            if filename[-4:] == '.sql':
                scriptPath = root + "/" + filename
                print("Ejecutando " + scriptPath, Logs.INFO)
                with open(scriptPath, "r") as script:
                    # create db cursor (one per script file)
                    cursor = conn.cursor()
                    for query in script.read().split(
                            utils.QUERY_DELIMITER):
                        if query == "":
                            continue
                        try:
                            cursor.execute(query)
                            if query.startswith("INSERT INTO"):
                                insertCounter += 1
                            elif geometryType is None and query.startswith(
                                    "SELECT AddGeometryColumn"):
                                # geometry type is the quoted second-to-last
                                # argument of AddGeometryColumn(...)
                                geometryType = query.split(",")[-2][1:-1]
                        except Exception as e:
                            # keep going: a single bad query must not abort the script
                            print(str(e), Logs.ERROR, {"query": query})
                    cursor.close()
    print("ok", Logs.INFO, {"inserted_features": str(insertCounter)})
    self.closeConnection(conn)
    return insertCounter, geometryType
def publishLayer(file):
    """Register the PostGIS table as a GeoServer feature type via the REST API.

    file: config dict with "tableName" and "metadata" entries.
    Returns True when GeoServer accepted the feature type, False on any HTTP
    or connection error (both logged).
    """
    metadata = file["metadata"]
    tableName = file["tableName"]
    # fall back to the table name when the metadata has no usable title
    title = cleanString(metadata.name)
    if title == "":
        title = tableName
    layerXml = "".join([
        "<featureType>",
        "<name>{}</name>".format(tableName),
        "<title>{}</title>".format(title),
        "</featureType>",
    ])
    url = "{geoserver_rest}/workspaces/{workspace}/datastores/{datastore}/featuretypes".format(
        workspace=GEOSERVER_WORKSPACE,
        datastore=GEOSERVER_DATASTORE,
        geoserver_rest=GEOSERVER_REST)
    headers = {'Content-Type': 'text/xml'}
    auth = (GEOSERVER_USER, GEOSERVER_PASSWORD)
    try:
        response = requests.post(url, headers=headers, auth=auth, data=layerXml)
        # raise exception with HTTP error code
        response.raise_for_status()
        print("tabla publicada correctamente en geoserver", Logs.INFO,
              {"geoserver_response": response.text})
        return True
    except HTTPError as httpError:
        # GeoServer answered with a 4xx/5xx status
        print(
            "Error publicando en geoserver", Logs.ERROR, {
                "error": str(httpError),
                "type": type(httpError),
                "url": url,
                "documento": layerXml,
                "respuesta": "{code} | {response}".format(code=str(response.status_code),
                                                          response=response.text)
            })
    except Exception as err:
        # network-level failure: no response to report
        print(
            "Error conectando a geoserver", Logs.ERROR, {
                "error": str(err),
                "type": type(err),
                "url": url,
                "documento": layerXml
            })
    return False
"Admite recursos que no cuenten con un valor de categoria en los metadatos" ) parser.add_argument( "--allowDescriptionNone", action="store_true", help= "Admite recursos que no cuenten con un valor de descripción en los metadatos" ) parser.add_argument( "--cleanCollection", action="store_true", help="Vacia la colección de mongo antes de iniciar el procesamiento") parser.add_argument( "--featureName", help= "Especifica un nombre para actualizar el API y geoserver sin consultar metadatos en DGM", metavar="name") parser.add_argument( "--justPublish", action="store_true", help= "Publica en el API la ruta especificada como id de geoserver, para especificar un label utiliza la opción --featureName" ) args = parser.parse_args() start_time = time.time() start(args) print("--- Tiempo de ejecución: %s minutos ---" % ((time.time() - start_time) / 60))
def writeSQLScript(file, datasetName, outputDir, geojsonObject=None,
                   srid=GEOMETRY_COLUMN_SRID):
    """Write a complete SQL script (CREATE TABLE + geometry columns + INSERTs)
    for a GeoJSON feature collection.

    file: source file path, read only when *geojsonObject* is None.
    datasetName: table name and script file stem.
    outputDir: directory the "<datasetName>.sql" script is written to.
    geojsonObject: parsed GeoJSON FeatureCollection (read from *file* otherwise).
    srid: spatial reference id embedded in the geometry SQL; concatenated into
        the query text, so it is expected to be a string.

    Column names/types are derived from the FIRST feature's properties —
    assumes every feature shares that schema (TODO confirm upstream).
    """
    scriptFileName = "{outputDir}/{script}.sql".format(outputDir=outputDir,
                                                       script=datasetName)
    data = geojsonObject if geojsonObject is not None else json.loads(
        utils.getEncodedFileContent(file)[0]
    )
    print("Escribiendo script en " + scriptFileName, Logs.INFO)
    createTableSQL = ""
    geometrySQL = []
    insertSQL = []
    # table columns, derived from the first feature's properties
    columns = []
    validColumns = []
    columnsType = {}
    columnString = ""
    # numeric columns that should be treated as str (catalogs)
    textColumns = ["cve_ent", "cve_mun", "cve_loc", "cvegeo"]
    for column in data["features"][0]["properties"]:
        validColumnName = utils.normalizeText(column)
        columns.append(column)
        validColumns.append(validColumnName)
        columnsType[validColumnName] = "text" if validColumnName in textColumns else utils.getObjType(data["features"][0]["properties"][column])
        columnString += validColumnName + ","
    # build the CREATE TABLE column list (gid is the synthetic primary key)
    sqlColumns = "gid serial PRIMARY KEY"
    for header in validColumns:
        sqlColumns += ",{column} {column_type}".format(column=header,
                                                       column_type=columnsType[header])
    createTableSQL = sqlCreateTable.format(datasetName=datasetName,
                                           columns=sqlColumns)
    geometryColumns = []
    columnsCreated = {}
    counter = 0
    for element in data["features"]:
        geometryType = element["geometry"]["type"]
        properties = element["properties"]
        featuresToProcess = []
        # GeometryCollections are flattened into one pseudo-feature per geometry
        if geometryType == "GeometryCollection":
            for geometry in element["geometry"]["geometries"]:
                featuresToProcess.append({
                    "geometry": geometry
                })
        else:
            featuresToProcess.append(element)
        for feature in featuresToProcess:
            geometryType = feature["geometry"]["type"]
            # validate geometry coordinates: truncate every position to 2D (x, y)
            if geometryType.lower() == "point":
                feature["geometry"]["coordinates"] = feature["geometry"]["coordinates"][0:2]
            elif geometryType.lower() == "polygon":
                # merge all rings into a single outer ring of 2D positions
                coordinates = [[]]
                for batchCoordinates in feature["geometry"]["coordinates"]:
                    # coordinates depth: unwrap one extra nesting level if present
                    if isinstance(batchCoordinates[0][0], list):
                        batchCoordinates = batchCoordinates[0]
                    for index in range(0, len(batchCoordinates)):
                        coordinates[0].append(batchCoordinates[index][0:2])
                feature["geometry"]["coordinates"] = coordinates
            elif geometryType.lower() == "linestring":
                for index in range(0, len(feature["geometry"]["coordinates"])):
                    feature["geometry"]["coordinates"][index] = feature["geometry"]["coordinates"][index][0:2]
            # lazily emit one AddGeometryColumn per geometry type encountered
            if not geometryType in columnsCreated:
                query, columnName = createGeometryColumn(datasetName,
                                                         geometryType.upper(),
                                                         "_{}".format(geometryType.lower()),
                                                         srid=srid)
                geometryColumns.append(columnName)
                columnsCreated[geometryType] = True
                geometrySQL.append(query)
            # property values in declared column order; missing/None become ""
            values = []
            for index in range(0, len(columns)):
                try:
                    value = "" if properties[ columns[index] ] is None else utils.getValidTextValue(properties[ columns[index] ])
                except:
                    value = ""
                    pass
                values.append( utils.getValidSQLValue(value, columnsType[validColumns[index]]) )
            values.append("ST_SetSRID(ST_GeomFromGeoJSON('" + json.dumps(feature["geometry"]) + "')," + srid + ")")
            sql = sqlInsert.format(dataset=datasetName,
                                   columns=columnString,
                                   values=",".join(values),
                                   geometry_column=GEOMETRY_COLUMN_NAME,
                                   suffix="_"+geometryType.lower())
            insertSQL.append(sql)
    # WRITE SQL FILE
    with open( scriptFileName , "w" ) as sqlScript:
        # create table
        sqlScript.write( createTableSQL + utils.QUERY_DELIMITER )
        # create geometries
        for query in geometrySQL:
            sqlScript.write( query + utils.QUERY_DELIMITER )
        # insert features
        for query in insertSQL:
            sqlScript.write( query + utils.QUERY_DELIMITER )
def __init__(self, arguments, postgisConnection):
    """Set up a Hamelin run from parsed CLI arguments.

    arguments: dict from vars(args) (justPublish, path, featureName,
        cleanCollection, ...).
    postgisConnection: ConnectionPool, or None in publish-only mode.

    In justPublish mode the layer is registered in the API immediately.
    Otherwise self.fileList is filled with config objects for every accepted
    file under *path* (or the single file itself).
    """
    self.postgisConnection = postgisConnection
    self.arguments = arguments
    if arguments["justPublish"]:
        # publish-only: register the given path as a geoserver layer id
        newLayer = {
            "geoserver": arguments["path"],
            "name_resource": arguments["featureName"] if arguments["featureName"] else arguments["path"].replace("_"," ").capitalize()
        }
        if MongoApiCollection().addLayer( None, newLayer ):
            print("API actualizada", Logs.INFO)
        else:
            print("no se pudo actualizar el API", Logs.ERROR)
    else:
        self.fileList = []
        path = arguments["path"]
        if os.path.exists( path ):
            if os.path.isdir( path ):
                print("procesando directorio", Logs.INFO)
                # queue every accepted (plain or zipped) file in the directory tree
                for file in utils.listAllFiles( path, acceptedFiles + zippedFiles ):
                    self.fileList.append( self.buildConfigObject(file) )
            else:
                self.fileList.append( self.buildConfigObject(path) )
        else:
            print("archivo no encontrado", Logs.ERROR)
            # NOTE(review): self.counter is not initialized in this __init__ —
            # presumably a class attribute; confirm
            self.counter["error_archivo"] += 1
        # clean the API collection before processing when requested
        if arguments["cleanCollection"]:
            print("limpiando colección", Logs.INFO)
            deleted = MongoApiCollection().cleanCollection()
            print("documentos eliminados", Logs.INFO, {"documentos": deleted.raw_result})
def processFile(self, file):
    """Convert one file to SQL, load it into PostGIS and publish it.

    file: config dict (tableName, metadata, ...); file["geometryType"] is
    filled in as a side effect.

    Pipeline: generate SQL in a scratch directory -> run it against PostGIS
    -> publish the layer in GeoServer and the API (unless skipUpdates) ->
    remove the scratch directory.
    """
    # NOTE(review): with forceUpdates=True and skipUpdates=False this condition
    # is still True for existing layers, so forceUpdates would NOT force a
    # re-process here; `not (force or skip)` may have been intended — confirm.
    if geoserver.featureExists( file["tableName"] ) and (not self.arguments["forceUpdates"] or not self.arguments["skipUpdates"] ):
        print("la capa ya existe en geoserver", Logs.ERROR)
        self.counter["procesados_anteriormente"] += 1
        return
    caster = PostgisCaster( file )
    # scratch directory named after the table; on collision retry once with a
    # random 5-letter suffix
    workDirectory = file["tableName"]
    try:
        os.makedirs( workDirectory )
    except:
        workDirectory += ''.join(random.choice( string.ascii_lowercase ) for i in range(5))
        os.makedirs( workDirectory )
    if caster.printSQLScript( workDirectory ):
        insertCounter, file["geometryType"] = self.postgisConnection.runScript( workDirectory )
        if file["geometryType"] is not None:
            if self.arguments["skipUpdates"]:
                # script params requested skipping the publish step
                print("omitiendo publicación de capas", Logs.INFO)
            else:
                # publish in geoserver
                geoserver.publishLayer( file )
                if self.arguments["featureName"] or file["metadata"].isValid():
                    # update API
                    if MongoApiCollection().addLayer( file ):
                        print("API actualizada", Logs.INFO)
                    else:
                        print("no se pudo actualizar el API", Logs.ERROR)
                else:
                    print("omitiendo publicación en API", Logs.ERROR, {"error": "metadatos no validos", "metadatos": str(file["metadata"])})
        self.counter["procesados_correctamente"] += 1
    else:
        print("no se generaron los comandos sql", Logs.ERROR)
        self.counter["error_archivo"] += 1
    # cleanning: always remove the scratch directory
    shutil.rmtree( workDirectory )
def export(self):
    """Drain self.fileList, processing each queued file.

    Zipped resources are extracted and their contents pushed back onto the
    queue (so the loop also consumes files it discovers). Per-file outcomes
    are tallied in self.counter and printed at the end.
    """
    while len(self.fileList) > 0:
        file = self.fileList.pop()
        self.counter["total"] += 1
        print("--------------------------------------------------------------------")
        print("procesando " + file["path"], Logs.INFO )
        if file["type"] not in acceptedFiles and file["type"] not in zippedFiles:
            print("Archivo no soportado", Logs.ERROR)
            self.counter["archivo_no_soportado"] += 1
            continue
        # forceUpdates/skipUpdates bypass the "already published" check
        if self.arguments["forceUpdates"] or self.arguments["skipUpdates"] or not geoserver.featureExists( file["tableName"] ):
            # the layer is not in geoserver (or updates are forced)
            metadata = file["metadata"]
            # metadata lookup is skipped when updates are forced or a fixed
            # feature name was supplied
            if self.arguments["forceUpdates"] or self.arguments["skipUpdates"] or self.arguments["featureName"] or metadata.readFeatureData():
                if file["type"] in zippedFiles:
                    # is a zipped directory: extract and re-queue its contents
                    extractedFiles = zip.extract( file["path"], file["ckanID"] )
                    if extractedFiles is not None:
                        # multi-file archives get the archive's table name as prefix
                        namePreffix = "" if len(extractedFiles) == 1 else "{}__".format( file["tableName"] )
                        for extractedFile in extractedFiles:
                            configObject = self.buildConfigObject( extractedFile, metadata=metadata )
                            name = utils.normalizeText( configObject["name"] )
                            # drop the conventional "conjunto_de_datos_" stem
                            name = name[ len("conjunto_de_datos_"):] if "conjunto_de_datos_" in name else name
                            configObject["tableName"] = namePreffix + name
                            configObject["extracted"] = True
                            self.fileList.append( configObject )
                else:
                    # process file
                    self.processFile( file )
            else:
                print("no se encontraron metadatos en CKAN/DGM", Logs.ERROR, {"metadata": str(metadata)} )
                self.counter["metadatos_error"] += 1
        else:
            print("la capa ya existe en geoserver", Logs.ERROR)
            self.counter["procesados_anteriormente"] += 1
    # print counters
    print("--------------------------------------------------------------------")
    print("counters", Logs.INFO, self.counter)