def get_path_from_params(params, step_directory):
    """
    Return the "leaf" working directory for one piece of data.

    :param params: dict, the parameters taken from the configuration
    :param step_directory: string, absolute path of the root directory of the relevant task
    :return: string, full path of the matching directory (whether it exists or not!)
    """
    uri = params['uri']
    about = simplify(params['shortname'])
    # web resources are grouped by website name, local copies live under a dedicated folder
    if uri.startswith('htt'):
        return join(step_directory, get_website_name_from_url(uri), about)
    if uri.startswith('file:///'):
        return join(step_directory, COPIED_DATA_DIRNAME, about)
    log('can\'t find associated path from params because of unrecognized uri "{}"'.format(uri), 'warn')
    return ''
def createTable(self, tabla, atributos, describe):
    """
    Create a MySQL table whose columns mirror a MsSQL DESCRIBE result.
    Exits the process if the table cannot be created.

    :param tabla: string, name of the table to create
    :param atributos: column names (unused here; kept for interface compatibility)
    :param describe: list of dicts describing each source column
        (keys: 'name', 'type', 'max_lenght', 'is_nullable')
    """
    try:
        log(__name__).info('Creando tabla ' + tabla)
        columnas = []  # a fresh list: the original shadowed the 'atributos' parameter
        for atributo in describe:
            tipo_base = atributo['type'].split('(')[0]
            # length comes from the type itself ("varchar(50)") when present,
            # otherwise from the metadata ('max_lenght' is the upstream key spelling)
            if len(atributo['type'].split('(')) > 1:
                longitud = atributo['type'].split('(')[1].split(')')[0]
            else:
                longitud = atributo['max_lenght']
            # datetime columns get a fixed fractional-seconds precision
            if tipo_base == "datetime":
                longitud = 6
            nulo = 'NULL' if atributo['is_nullable'] else 'NOT NULL'
            columnas.append(atributo['name'].lower() + ' ' +
                            self.checkType(tipo_base) +
                            '(' + str(longitud) + ') ' + nulo)
        sql = 'CREATE TABLE ' + tabla + ' (' + ','.join(columnas) + ') COLLATE utf8_spanish2_ci'
        conn = self.conn()
        conn.execute(sql)
        conn.close()
    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        log(__name__).critical('Error al crear la tabla ' + tabla)
        sys.exit(1)
def give_shortname_info():
    """Log a set of useful information and paths about shortnames."""
    # warn that the shortname formatting rules were violated somewhere in the config file
    log("in file \"{}\", one or several mistakes are present in shortname's formating, see the details below".format(DYNAMIC_CONFIG_DATA_FILE_PATH), 'warn')
    # point the user to where the validity criteria can be tuned
    log("the default values that defines some criteria of a shortname's validity can be found in config file \"{}\", feel free to change them".format(DYNAMIC_CONFIG_DATA_FILE_PATH))
def drop(self, tabla):
    """
    Drop a MySQL table, logging (but not aborting) on failure.

    :param tabla: string, name of the table to drop
    """
    try:
        log(__name__).info('Borrando la tabla ' + tabla)
        sql = "DROP TABLE " + tabla
        conn = self.conn()
        conn.execute(sql)
        conn.close()
    except Exception:
        # narrowed from a bare except; a missing table is not fatal here
        log(__name__).warning('Error al borrar la tabla ' + tabla)
def main():
    """
    Run the initial import once: an "init" marker file next to the SQL
    scripts records that the first replication has already been done.
    """
    basePath = os.getenv("DIRSQL")
    initPath = basePath + "/init"
    try:
        # marker exists -> the initial import was already performed
        with open(initPath, "r"):
            pass
    except FileNotFoundError:
        # narrowed from a bare except; also avoids an unbound 'initFile'
        # in the original finally clause when the second open failed
        log(__name__).info('Creando fichero init!')
        # create the marker, then run the full import
        with open(initPath, "w"):
            pass
        importador(quart=False)
def search_for_files_to_process(a_res): """ Construit la liste des données à traiter. :param a_res: dict la configuration associée à la donnée atomique :return: une liste de liste de chemins complets de fichiers se rapportant à la même donnée atomique et les paramètres (complétés) associés """ # -------- déclarations préliminaires results = [] params = a_res['params'] shortname = simplify(params['shortname']) path_to_search_in = get_path_from_params(params, DOWNLOAD_ROOT) # traitement de la présence/abscence du data_name data_name = params['data_name'] if 'data_name' in params else '' validator_params = {'data_name': data_name} # -------- journalisation info_msg = '{}'.format(shortname) if data_name == '' else '{} / {}'.format(shortname, data_name) log('exploring data about "{}"'.format(info_msg)) # -------- recherche d'une donnée atomique data_files = search_with_criteria(path_to_search_in, is_data_file, validator_params=validator_params, search_depth=-1) data_files_names = [] for df in data_files: dirpath, name, ext = path_splitter(df) data_files_names.append(name) list_of_unique_names = list(set(data_files_names)) number_of_atomic_data = len(list_of_unique_names) we_caught_one_atomic_data = number_of_atomic_data == 1 # -------- une/l' unique donnée atomique a été trouvée if we_caught_one_atomic_data: if data_name == '': detected_data_name = list_of_unique_names[0] log('auto-detect found an atomic data called "{}"'.format(detected_data_name)) params['data_name'] = detected_data_name results = data_files # results = data_files[:] serait plus lent, car recopierai au lieu d'agir par référence else: if data_name == '': if number_of_atomic_data == 0: log('sorry, auto-detect did not found any data', 'warn') else: log('sorry, auto-detect will need you to define manually the data name because several atomic data have been found', 'warn') else: log('sorry, there is no matching data', 'warn') return results, params
def retrieve_all():
    """Retrieve every configured resource (web downloads and local copies)."""
    taskname = 'retrieving'
    log_task(taskname, 'start')
    some_action_performed = False
    no_exception = True
    with open(DYNAMIC_CONFIG_DATA_FILE_PATH, 'r') as f:
        j = json.load(f)
    try:
        for ressource in j['res']:
            params = ressource['params']
            uri = params['uri']
            # each uri scheme is handled accordingly
            # (leftover debug print() statements removed)
            if uri.startswith('htt'):
                if get_data_from_url(params):
                    some_action_performed = True
            elif uri.startswith('file:///'):
                if get_data_from_filepath(params):
                    some_action_performed = True
            else:
                log('unimplemented processing for uri "{}"'.format(uri), 'warn')
    except KeyError as e:
        no_exception = False
        printex(e, 'incorrect or missing node in "{}" (bad file structure)'.format(DYNAMIC_CONFIG_DATA_FILE_PATH))
    except Exception as e:
        no_exception = False
        printex(e)
    if no_exception:
        if some_action_performed:
            log('all data downloaded successfully')
        else:
            log('nothing were done, because all data were already retrieved')
    else:
        log('sorry, some data could not be downloaded', 'warn')
    if some_action_performed:
        # recursively delete empty directories possibly left in the download tree
        log('removed potentially useless created directories in "{}"'.format(WORK_ROOT))
        remove_empty_dirs_in_dir(DOWNLOAD_ROOT)
    log_task(taskname, 'end')
def conn(self):
    """
    Open a MySQL connection from environment variables and return a cursor.
    Exits the process if the connection cannot be established.

    :return: a pymysql cursor
    """
    try:
        # keyword arguments: pymysql >= 1.0 no longer accepts positional ones
        connection = pymysql.connect(host=os.getenv("DB_MYSQL_SERVER"),
                                     user=os.getenv("DB_MYSQL_USER"),
                                     password=os.getenv("DB_MYSQL_PASSWORD"),
                                     database=os.getenv("DB_MYSQL_DB"))
        log(__name__).info('Conectado a base de datos MySQL ' + os.getenv("DB_MYSQL_SERVER"))
        return connection.cursor()
    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        log(__name__).critical('Error al conectar con MySQL! ' + os.getenv("DB_MYSQL_SERVER"))
        sys.exit(1)
def importador(quart=False):
    """Replicate each configured MsSQL query result into MySQL."""
    log(__name__).info('Inicializando proceso de replicado...')
    callMsSQL = MsSQL()
    callMySQL = MySQL()
    consultas = cargarCSQL(quart)
    # each entry maps a table name to its MsSQL query and column list
    for tabla, datos in consultas.items():
        filas = callMsSQL.select(datos['sql_mssql'])
        descripcion = callMsSQL.describes(datos['sql_mssql'])
        # only replicate tables that actually returned rows
        if len(filas) > 0:
            callMySQL.insert(tabla, datos['atributos'], filas, descripcion)
def write(self, file):
    """
    Write the solution to the output file.

    :param file: string, name of the file to write
    """
    log('Ecriture de la solution dans : ' + file)
    libs = self.list
    # context manager: the original never closed the file handle
    with open(file, "w") as fichier:
        # first line: the number of backbones
        fichier.write(str(len(libs)) + "\n")
        for lib in libs:
            # position of the backbone, then its connected cells
            fichier.write(str(lib[0]) + " " + str(lib[1]) + "\n")
            fichier.write(" ".join(str(x) for x in lib[2]) + "\n")
    log("Fin de l'Ecriture")
def get_data_from_filepath(params):
    """
    Retrieve a local resource by copying it into the configured download directory.

    urlretrieve() could also copy local files, but an explicit copy keeps
    better control (processing stays independent from the URI scheme).

    :param params: dict, the parameters taken from the configuration
    :return: bool, True if something was actually copied
    """
    # path the resource must be copied into
    save_to = get_path_from_params(params, DOWNLOAD_ROOT)
    # for logging
    about = simplify(params['shortname'])
    # if a directory with that name already exists, skip
    done_something = False
    if exists(save_to):
        log('ignored "{}" because data has already been copied ("{}" exists)'.format(about, save_to))
    else:
        done_something = True
        # final destination directory
        create_dir_if_not_exists(save_to)
        # path of the source resource (leftover debug print() statements removed)
        file_or_dir = urlparse(params['uri'])
        source_path = file_or_dir.path
        log('copying data about "{}" from "{}" to "{}"'.format(about, source_path, save_to))
        # it is a single data file or an archive
        if isfile(source_path):
            dirpath, name, extension = path_splitter(source_path)
            filename = '{}{}{}'.format(simplify(params['data_name']), extsep, extension)
            save_as = join(save_to, filename)
            copyfile(source_path, save_as)
        # it is an uncompressed directory
        else:
            copytree(source_path, save_to)
    return done_something
def checkType(self, type):
    """Map a MsSQL column type to its MySQL equivalent, or abort the process."""
    types = {
        'int': 'int',
        'nchar': 'varchar',
        'nvarchar': 'varchar',
        'float': 'float',
        'smallint': 'smallint',
        'datetime': 'datetime'
    }
    converted = types.get(type)
    # unknown types are fatal: the table could not be created correctly
    if converted is None:
        log(__name__).critical(
            'No se ha podido cambiar el type de MsSQL a MySQL (' + type + ')')
        sys.exit(1)
    log(__name__).info('Cambiado type de MsSQL a MySQL (' + type + ')')
    return converted
def create_dir_if_not_exists(path):
    """
    Create a directory (and its parents) if it does not exist.

    :param path: string, the absolute path of the directory to create
    """
    if not exists(path):
        try:
            makedirs(path)
        except PermissionError as e:
            # keep only the top-level component of the path (e.g. "/top" from
            # "/top/a/b") to point the user at where the rights problem likely is
            path_top_level = sub(r"^({0}?[^{0}]*){0}.*$".format(sep), r"\1", path)
            printex(
                e, 'cannot create the directory "{}". check rights on "{}"'.
                format(path, path_top_level))
            exit(1)
        except Exception as e:
            printex(e, 'while attempting to create directory "{}".'.format(path))
            log('some directory could not be created. crashing.', 'error')
            exit(1)
        else:
            log('directory "{}" created'.format(path))
def insert(self, tabla, atributos, registros, describe):
    """
    Recreate the destination table and bulk-insert the replicated rows.
    Exits the process on a MySQL error.

    :param tabla: string, destination table name
    :param atributos: list of column names
    :param registros: list of row tuples to insert
    :param describe: column metadata forwarded to createTable()
    """
    conn = None
    try:
        log(__name__).info('Insertando elementos en la tabla ' + tabla)
        # full refresh: drop then recreate before inserting
        self.drop(tabla)
        self.createTable(tabla, atributos, describe)
        stringAtributos = ','.join(atributos)
        # one "%s" placeholder per column of the first row
        valString = '%s, ' * len(registros[0])
        sql = "INSERT INTO " + tabla + \
            ' (' + stringAtributos.lower() + ') VALUES (' + valString[:-2] + ') '
        conn = self.conn()
        conn.executemany(sql, registros)
    except pymysql.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        log(__name__).critical('Error al insertar elementos en la tabla ' + tabla)
        sys.exit(1)  # unreachable 'return False' after exit removed
    finally:
        # guard: conn stays None when an earlier step raised, so the original
        # NameError in this finally clause can no longer mask the real error
        if conn is not None:
            conn.close()
def conn(self, sql):
    """
    Connect to MsSQL, run one query and return all its rows.
    Exits the process on connection or query failure.

    :param sql: string, the query to execute
    :return: list of fetched rows
    """
    try:
        connection = pymssql.connect(os.getenv("DB_MSSQL_SERVER"),
                                     os.getenv("DB_MSSQL_USER"),
                                     os.getenv("DB_MSSQL_PASSWORD"),
                                     os.getenv("DB_MSSQL_DB"))
        log(__name__).info('Conectado a base de datos MsSQL ' + os.getenv("DB_MSSQL_SERVER"))
    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        log(__name__).critical('Error al conectar con MsSQL! ' + os.getenv("DB_MSSQL_SERVER"))
        sys.exit(1)
    try:
        cursor = connection.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception:
        log(__name__).critical('Error en consulta SQL de MsSQL: ' + sql)
        sys.exit(1)
    finally:
        # close even when the query fails (the original leaked the connection)
        connection.close()
def convert_then_import_all():
    """Convert/prepare every resource, then import them into the database."""
    taskname = 'converting and importing'
    log_task(taskname, 'start')
    # logging counters
    amount_of_atomic_data_processed = 0
    amount_of_data_files_processed = 0
    things_were_done = False
    no_problem = True
    # iterate over the resources of the configuration file
    with open(DYNAMIC_CONFIG_DATA_FILE_PATH, 'r') as f:
        j = json.load(f)
        # database connection, shared by every import in the loop
        conn = connect_to_database()
        try:
            for ressource in j['res']:
                # look for 'THE' atomic data
                to_process, params = search_for_files_to_process(ressource)
                # send the found data for processing
                amount_of_data_files = len(to_process)
                if amount_of_data_files > 0:
                    log('{} data file found'.format(amount_of_data_files))
                    # process the group
                    if process_group(to_process, ressource, conn):
                        things_were_done = True
                        amount_of_data_files_processed += amount_of_data_files
                        amount_of_atomic_data_processed += 1
                    else:
                        no_problem = False
                        log('encountered some problem(s) with this data', 'error')
        except KeyError as e:
            printex(
                e, 'incorrect or missing node in "{}" (bad file structure)'.
                format(DYNAMIC_CONFIG_DATA_FILE_PATH))
        except Exception as e:
            printex(e)
        finally:
            # drop schemas left empty by failed imports, then release the connection
            delete_empty_schemes(conn)
            conn.close()
    # final report
    if things_were_done:
        log('{} atomic data successfully processed ({} files)'.format(
            amount_of_atomic_data_processed, amount_of_data_files_processed))
    else:
        if no_problem:
            log('nothing were done, because all found data were already prepared'
                )
        else:
            log(
                'nothing were done, because all found & processed data encountered errors',
                'warn')
    log_task(taskname, 'end')
# emplacement des ressources accessibles au code # la visibilité de python s'arrêtera au répertoire src SRC_ROOT = normpath(join(dirname(__file__), '..')) RES_DIR = join(SRC_ROOT, 'res') IN_DIR = join(RES_DIR, 'in') OUT_DIR = join(RES_DIR, 'out') # si des fichiers sont fournis sur l'entrée standard, on les utilise queue = sys.argv[1:] # sinon on fournit les cartes par défaut if len(queue) == 0: queue = [join(IN_DIR, file) for file in listdir(IN_DIR)] log('{} file(s) to process in total'.format(len(queue))) for filepath in queue: folder, name, ext = path_splitter(filepath) output_file = join(OUT_DIR, '{}{}{}'.format(name, extsep, 'out')) log_task('{} level solving'.format(name), 'begin') problem = Parser.load(filepath) solution = problem.solve() solution.write(output_file) log_task('{} level solving'.format(name), 'end')
def process_group(files, ressource, conn):
    """
    Dispatch one resource to the matching import handler.

    :param files: the files to use for the conversion
    :param ressource: the import mode and its parameters
    :param conn: the database connection
    :return: bool, True if processing ran smoothly, False otherwise
    """
    # build the list of extensions of the files we have
    exts = []
    for f in files:
        dirpath, name, ext = path_splitter(f)
        exts.append(ext)
    okko, handled, time_to_break = False, False, False
    # match the available extensions against the groups that have a handler;
    # handlers can be told apart by type (e.g. shp, xls) then by provider
    for g in MANDATORY_EXT_GROUPS:
        for e in exts:  # the original enumerate() index was unused
            if e.lower() == g:
                # -------- formats handled by fiona
                if g in ['shp', 'mif']:
                    handled = True
                    # the data is recognized as "to import"
                    okko = to_database(ressource, conn)
                    time_to_break = True
                # -------- other geographic formats would be converted to shape
                # first (build the intermediate path from
                # get_path_from_params(ressource['params'], CONVERSION_ROOT)),
                # then sent to to_database() just like above
                # -------- non-geographic data
                elif g in ['xls', 'xlsx']:
                    handled = True
                    okko = xls_handler(ressource, conn)
                    time_to_break = True
            if time_to_break:
                break
        if time_to_break:
            break
    if not handled:
        log('sorry, the processing for the file format group {} is not implemented yet'.format(exts), 'warn')
    else:
        if not okko:
            log('sorry, processing failed for this atomic data', 'error')
        else:
            log('successfully processed this atomic data')
    return okko
def get_data_from_url(params):
    """
    Retrieve a remote web resource into the configured download directory.

    :param params: dict, the parameters taken from the configuration
    :return: bool, True if something was actually downloaded
    """
    # path the resource must be downloaded into
    save_to = get_path_from_params(params, DOWNLOAD_ROOT)
    # for logging
    about = simplify(params['shortname'])
    done_something = False
    # if a directory with that name already exists, skip
    if exists(save_to):
        log('ignored "{}" because data has already been downloaded (directory "{}" exists)'.format(about, save_to))
    else:
        uri = params['uri']
        site = get_website_name_from_url(uri)
        # create the destination
        create_dir_if_not_exists(save_to)
        save_as = join(save_to, about)
        # download the resource
        log('fetching data from the "{}" website about "{}"'.format(site.upper(), about))
        # this try/except lets the caller skip invalid URLs and keep going
        try:
            # @deprecated urlretrieve(uri, save_as) cannot handle SSL certificates
            with open(save_as, 'wb') as output_file:
                response = get(uri, verify=False, proxies=PROXIES)
                output_file.write(response.content)
        except URLError as e:
            printex(e, 'problem during "{}" ressource download'.format(uri))
        except Exception as e:
            printex(e)  # any unknown problem
        else:
            # no problem occurred (leftover debug print() removed)
            done_something = True
            # rename: append the detected extension to the downloaded archive name
            try:
                extension = get_archive_format(save_as)[0]
                file_with_extension = '{}{}{}'.format(save_as, extsep, extension)
                rename(save_as, file_with_extension)
            except PatoolError as e:
                printex(e, 'the ressource located at "{}" is probably not an archive, download may have failed somehow. just copying it'.format(uri))
                # remote single-file resources
                filename = '{}{}{}'.format(params['data_name'], extsep, params['extension'])
                filepath = join(save_to, filename)
                rename(save_as, filepath)
            except Exception as e:
                printex(e)
            log('successfully downloaded data about "{}"'.format(about))
    return done_something
def cargarCSQL(quart=False):
    """
    Load the .sql files from $DIRSQL and extract, for each one, the cleaned
    query plus its (possibly aliased) column names.

    :param quart: bool, when True only keep queries tagged "/*cada15m*/"
    :return: dict mapping file basename to {"sql_mssql": ..., "atributos": ...}
    """
    if quart:
        log(__name__).info('quart')
    else:
        log(__name__).info('no quart')
    log(__name__).info('Cargando archivos SQL...')
    basePath = os.getenv("DIRSQL")
    consultas = {}
    for (dirPath, dirNames, fileNames) in walk(basePath):
        for file in fileNames:
            # only .sql files; names starting with "_" are disabled
            if os.path.splitext(file)[1] != ".sql":
                continue
            if file[0] == "_":
                continue
            # flatten the file into one space-separated line
            contentFileStrip = ""
            contentFile = open(basePath + "/" + file, "r").readlines()
            for contentFileLine in contentFile:
                contentFileStrip = contentFileStrip + " " + contentFileLine.strip(
                    ' ').rstrip("\n")
            # the "/*cada15m*/" marker selects queries for the 15-minute cycle
            buscaQuart = contentFileStrip.find("/*cada15m*/")
            if quart:
                log(__name__).warning('Ciclo quart')
                if buscaQuart > 0:
                    pass
                else:
                    continue
            else:
                log(__name__).warning('Ciclo normal')
                pass
            log(__name__).info('Cargando archivo ' + file)
            # strip /*...*/ comments from the query
            contentFileStripClear = re.sub('(\/\*.*\*\/ .*?)', '', contentFileStrip)
            log(__name__).info('Consulta: ' + contentFileStripClear)
            # column list between SELECT and FROM
            atributosConsulta = re.search(
                '(?<=SELECT)(.*?)(?=FROM)',
                contentFileStripClear).group().split(', ')
            atributosConsultaLimpios = []
            for atributoConsulta in atributosConsulta:
                log(__name__).info(atributoConsulta)
                # keep the alias when "AS" is used, the raw name otherwise
                atributoConsulta = atributoConsulta.split(' AS ')
                if len(atributoConsulta) == 2:
                    atributosConsultaLimpios.append(
                        atributoConsulta[1].strip())
                else:
                    atributosConsultaLimpios.append(
                        atributoConsulta[0].strip())
            consultas[file.split(".")[0]] = {
                "sql_mssql": contentFileStripClear,
                "atributos": atributosConsultaLimpios
            }
            # NOTE(review): this break stops after the first matching .sql file
            # per directory — confirm this is intended and not a bug
            break
    return consultas
def my_json_res_file_checker(path):
    """
    Check the validity of the JSON file holding the download links and
    report problems accordingly.

    :param path: string, path of the data configuration file to check
    """
    with open(path, 'r') as f:
        j = json.load(f)
    # shortname validity criteria are explained only once, before the first warning
    info_already_given = False
    try:
        for ressource in j['res']:
            s = ressource['params']
            # -------- import mode check
            import_mode = ressource['import_mode']
            if import_mode not in IMPORT_MODES:
                info_already_given = False
                log('unrecognized import mode "{}"'.format(import_mode), 'warn')
            # -------- shortname formatting checks
            # one warning per incorrect shortname; a suggested fix may be given
            shortname_length = len(s['shortname'])
            # too short
            if shortname_length < SHORTNAME_MIN_LENGTH:
                if not info_already_given:
                    give_shortname_info()
                    info_already_given = True
                log(
                    'shortname "{}" is too short (length is {}, minimum is {})'
                    .format(s['shortname'], shortname_length,
                            SHORTNAME_MIN_LENGTH), 'warn')
            # too long
            if shortname_length > SHORTNAME_MAX_LENGTH:
                if not info_already_given:
                    give_shortname_info()
                    info_already_given = True
                log(
                    'shortname "{}" is too long (length is {}, maximum is {})'
                    .format(s['shortname'], shortname_length,
                            SHORTNAME_MAX_LENGTH), 'warn')
            # badly formatted
            if s['shortname'] != simplify(s['shortname']):
                if not info_already_given:
                    give_shortname_info()
                    info_already_given = True
                log(
                    'shortname "{}" is poorly formated. you should consider changing it manually to something like "{}" for example'
                    .format(s['shortname'], simplify(s['shortname'])), 'warn')
    except KeyError as e:
        printex(
            e, 'incorrect or missing node in "{}" (bad file structure)'.
            format(path))
    log('filecheck "{}" complete'.format(path))
# ________________________________ # ________________________________ chargement des configurations dynamiques # ________________________________ # -------- imports # effectués obligatoirement ici et non en haut de fichier # en effet ces fonctions utilisent des variables définies ci-dessus from packages.utils.path import create_dir_if_not_exists from shutil import copyfile # -------- configuration générale if exists(DYNAMIC_CONFIG_GENERAL_FILE_PATH): load_general_config(DYNAMIC_CONFIG_GENERAL_FILE_PATH) log('the dynamic configuration file values overriden the permanent configuration file ones' ) else: # création de l'arborescence create_dir_if_not_exists(CONFIG_ROOT) # on copie le permanent afin de créer le dynamique # (dans ce cas, pas besoin de charger le dynamique, puisqu'il est identique au permanent) log('generating dynamic general purpose configuration file "{}" from the permanent one "{}"' .format(DYNAMIC_CONFIG_GENERAL_FILE_PATH, PERMANENT_CONFIG_GENERAL_FILE_PATH)) copyfile(PERMANENT_CONFIG_GENERAL_FILE_PATH, DYNAMIC_CONFIG_GENERAL_FILE_PATH) # -------- configuration data
def extract_all():
    """
    Extract every available archive.

    Solutions explored:
    - "lzma" does not handle archives, only the (de)compression of a single ".7z" file
    - the C library "libarchive" (via conda) through the python wrapper
      "libarchive-c" (via pip)
    Final choice: "patoolib", because it handles many formats. It is actually a
    wrapper: after MIME type detection it calls the appropriate
    executables/libraries ("7zr" for example).
    """
    taskname = 'extracting'
    log_task(taskname, 'start')
    # full paths of the archives to extract
    log('searching for archives to extract in "{}". this may take a while'.format(DOWNLOAD_ROOT))
    archives_to_extract = search_with_criteria(DOWNLOAD_ROOT, is_archive, search_depth=2)
    archives_to_extract += search_with_criteria(join(DOWNLOAD_ROOT, COPIED_DATA_DIRNAME), is_archive, search_depth=2)
    total = len(archives_to_extract)
    log('{} archive(s) found in "{}"'.format(total, DOWNLOAD_ROOT))
    # extract archive by archive
    no_exception = True
    some_action_performed = False
    done = 0
    for archive_path in archives_to_extract:
        # overall progress logging
        done += 1
        # has the archive already been extracted?
        # if it is not alone in its directory, it is considered extracted
        archive_dir = dirname(archive_path)
        if len(listdir(archive_dir)) > 1:
            log('archive "{}" ignored because previously extracted ({}/{})'.format(archive_path, done, total))
        else:
            log('extracting archive "{}" ({}/{})'.format(archive_path, done, total))
            try:
                extract_archive(archive_path, verbosity=-1, outdir=archive_dir, interactive=False)
            except PatoolError as e:
                no_exception = False
                printex(e, 'the file extension probably does not match the real data type')
            except Exception as e:
                no_exception = False
                printex(e)
            finally:
                # NOTE(review): runs even when extraction failed — confirm that
                # "performed" is meant to include failed attempts
                some_action_performed = True
    if no_exception:
        if some_action_performed:
            log('all retrieved archives extracted successfully')
        else:
            log('nothing were done, because all archives were already extracted')
    else:
        log('sorry, some retrieved archives could not be extracted', 'warn')
    log_task(taskname, 'end')
def xls_handler(ressource, conn):
    """
    Parse a non-geographic spreadsheet and import it directly into the database.

    :param ressource: dict, the import description (mode and params)
    :param conn: the database connection
    :return: bool, True if the import ran smoothly, False otherwise
    """
    # declarations
    okko = True
    params = ressource['params']
    import_mode = ressource['import_mode']
    shortname, data_name = params['shortname'], params['data_name']
    schema, table = params['schema'], params['table']
    # has the data already been imported?
    already_done = False
    if exists(REMEMBER_IMPORT_FILE_PATH):
        already_done = check_for_line_in_file(remember_line_builder(params), REMEMBER_IMPORT_FILE_PATH)
    # skip the data if the import has already been done before
    if already_done:
        log('ignoring data "{} / {}" because previously imported into database'
            .format(shortname, data_name))
    else:
        # input file
        in_f = find_key_file_from_params(params, DOWNLOAD_ROOT)
        if in_f == '':
            okko = False
            log(
                'ignoring data about "{} / {}" because a crucial file is missing'
                .format(shortname, data_name), 'warn')
        else:
            log('importing data about "{} / {}" in mode "{}" in schema.table "{}.{}"'
                .format(shortname, data_name, import_mode, schema, table))
            try:
                cur = conn.cursor()
                # parse and import
                with open_workbook(in_f) as f:
                    for sn in f.sheet_names():
                        s = f.sheet_by_name(sn)
                        # -------- preliminary processing
                        # row 0 holds "name:type" headers; map each type to postgresql
                        # (range replaces the Python-2-only xrange)
                        attrs_and_their_types = [
                            (x[0], TO_POSTGRESQL_TYPE[x[1]]) for x in [
                                s.cell(0, ci).value.rsplit(':', 1)
                                for ci in range(s.ncols)
                            ]
                        ]
                        # query fields
                        scheme_dot_table = '"{}"."{}"'.format(schema, table)
                        fields_list = '({})'.format(', '.join([
                            '{} {}'.format(the_attr, the_type)
                            for the_attr, the_type in attrs_and_their_types
                        ]))
                        columns = [
                            the_attr for the_attr, the_type in attrs_and_their_types
                        ]
                        # query
                        query = 'CREATE TABLE IF NOT EXISTS {} {};'.format(
                            scheme_dot_table, fields_list)
                        # -------- table creation
                        if import_mode == 'controlee_nouvelle_table':
                            create_schema_if_not_exists(schema, conn)
                            execute_query(query, conn)
                        # -------- table filling
                        for ri in range(1, s.nrows):
                            values = [
                                s.cell(ri, ci).value for ci in range(s.ncols)
                            ]
                            fionalike_struct = OrderedDict()
                            for index, v in enumerate(values):
                                # turn "fake floats" back into ints when possible
                                try:
                                    v = int(v)
                                except ValueError:
                                    pass
                                fionalike_struct[columns[index]] = v
                            query = build_insert_query(fionalike_struct, params, georef=False)
                            cur.execute(query)
            except Exception as e:
                conn.rollback()
                okko = False
                printex(e, 'insert query failed into table "{}"'.format(table))
            else:
                conn.commit()
                remember_this_import(params)
    return okko
def apply_params_to_properties(params, properties, from_where='insert'):
    """
    Apply the parameter-driven modifications to the properties.

    These modifications target the attributes, so the structure is touched —
    the equivalent of an "ALTER".

    :param params: the parameters
    :param properties: the properties
    :param from_where: where this function was called from (to avoid logging
        an identical message for each of n tuples)
    :return: the modified properties, and True if all went well else False
    """
    okko = False
    pp = OrderedDict()
    try:
        # short-name declarations
        mode = ''
        if check_for_node_in_parent('mode', params):
            mode = params['mode']
        bb = {}
        some_b_is_waiting_for_each_one = []
        if check_for_node_in_parent('bindings', params):
            bb = params['bindings']
            # attributes that a binding will consume (their "from" side)
            some_b_is_waiting_for_each_one = [b['from'] for b in bb]
        # apply the modifications listed in bindings
        for prop_key in properties.keys():
            # in keep_only mode an attribute is dropped unless a binding keeps it
            need_to_copy_the_attribute = mode != 'keep_only'
            # MODIFIED & KEEP_ONLY
            for b in bb:
                b_from = b['from']
                b_to = b['to']
                if prop_key == b_from:
                    # -------- DROP (empty "to" target)
                    if b_to == '':
                        if mode == 'keep_only':
                            # DROP + KEEP_ONLY is non-sense: warn and ignore
                            # the DROP clause (copy is already False)
                            if from_where != 'insert':
                                log(
                                    'ignoring non-sense DROP of attribute "{}" because of KEEP_ONLY mode'
                                    .format(b_from), 'warn')
                        else:
                            # the attribute will not be kept, i.e. not copied
                            need_to_copy_the_attribute = False
                    # field rename
                    else:
                        pp[b_to] = properties[prop_key]
                        # it was just copied under its new name: no need to copy
                        # it again "as if it were not to be modified"
                        need_to_copy_the_attribute = False
            # unchanged field
            already_copied = prop_key in pp.keys()
            some_b_is_waiting_for_it = prop_key in some_b_is_waiting_for_each_one
            if need_to_copy_the_attribute and not already_copied and not some_b_is_waiting_for_it:
                pp[prop_key] = properties[prop_key]
            # no bindings at all: copy without further questions
            if not check_for_node_in_parent('bindings', params):
                pp[prop_key] = properties[prop_key]
        # add the fields common to every import mode
        pp[YEAR_NAME] = '{}:{}'.format(YEAR_TYPE, YEAR_LENGTH)
        pp[VERSION_NAME] = '{}:{}'.format(VERSION_TYPE, VERSION_LENGTH)
        pp[SRID_NAME] = '{}:{}'.format(SRID_TYPE, SRID_LENGTH)
        pp[GEOMETRY_NAME] = '{}:'.format(GEOMETRY_TYPE)
    except KeyError as e:
        printex(e, 'incorrect or missing node')
    except Exception as e:
        printex(e)
    else:
        okko = True
    return pp, okko
def to_database(ressource, conn):
    """
    Walk through the data and insert it into the database, building the
    queries dynamically.

    Works on the data as a stream only: python generators are used and the
    queries are built on the fly for each tuple (in the database sense) of
    the input data.

    :param ressource: dict, the import description (mode and params)
    :param conn: the database connection
    :return: bool, True if the import ran smoothly, False otherwise
    """
    # declarations
    okko = True
    cur = conn.cursor()
    import_mode = ressource['import_mode']
    params = ressource['params']
    mode = ''
    if check_for_node_in_parent('mode', params):
        mode = params['mode']
    data_name = params['data_name']
    shortname = simplify(params['shortname'])
    # find the key file for this processing
    input_file = find_key_file_from_params(params, DOWNLOAD_ROOT)
    if input_file == '':
        log(
            'ignoring data about "{} / {}" because a crucial file is missing'.
            format(shortname, data_name), 'warn')
        okko = False
    else:
        # has the data already been imported?
        already_done = False
        if exists(REMEMBER_IMPORT_FILE_PATH):
            already_done = check_for_line_in_file(
                remember_line_builder(params), REMEMBER_IMPORT_FILE_PATH)
        # skip the data if the import has already been done before
        if already_done:
            log('ignoring data "{} / {}" because previously imported into database 1'
                .format(shortname, data_name))
        else:
            mode_msg = 'automatic' if mode == '' else mode
            log('importing data about "{} / {}" in mode "{} / {}" in schema.table "{}.{}"'
                .format(shortname, data_name, import_mode, mode_msg,
                        params['schema'], params['table']))
            with fiona.drivers():
                # open the input file; add, keep and modify the desired
                # columns and write everything out — all as a stream
                with fiona.open(input_file, 'r') as in_data:
                    # start from the initial schema
                    in_schema = in_data.schema.copy()
                    # apply the field modifications
                    properties, prop_changes_ok = apply_params_to_properties(
                        params, in_schema['properties'], from_where='create')
                    okko = prop_changes_ok and okko
                    if okko:
                        try:
                            # -------- schema and table
                            # create the schema (database namespace) if missing
                            create_schema_if_not_exists(params['schema'], conn)
                            # possibly create the table
                            if import_mode == 'controlee_nouvelle_table':
                                # ensure the table exists by always trying to create it
                                create_table_if_not_exists(
                                    properties, params, conn)
                            # -------- I/O projections detection
                            srid_src_user, srid_dst_user = get_user_srids(
                                params)
                            srid_src_detected, srid_dst_detected = get_detected_srids(
                                in_data.crs_wkt, params, conn)
                            srid_src, srid_dst = None, None
                            # -------- react to the available projections (source)
                            if srid_src_user is not None:
                                # a user-enforced SRID wins over the detected one
                                if srid_src_user != srid_src_detected and srid_src_detected is not None:
                                    log(
                                        'detected source SRID ({}, ignored) is different from the enforced one that you gave ({}, picked)'
                                        .format(srid_src_detected,
                                                srid_src_user), 'warn')
                                srid_src = srid_src_user
                            else:
                                if srid_src_detected is not None:
                                    srid_src = srid_src_detected
                                else:
                                    okko = False
                                    log(
                                        'sorry, you will need to define manually the source SRID',
                                        'error')
                            # -------- react to the available projections (destination)
                            if okko:
                                if import_mode == 'controlee_nouvelle_table':
                                    # new table: default to the source SRID,
                                    # unless the user forced a destination one
                                    srid_dst = srid_src
                                    if srid_dst_user is not None:
                                        srid_dst = srid_dst_user
                                    params[SRID_NAME] = srid_dst
                                else:
                                    if srid_dst_user is not None:
                                        log(
                                            'ignoring the given destination SRID, which is useless regarding the import mode',
                                            'warn')
                                    if srid_dst_detected is not None:
                                        srid_dst = srid_dst_detected
                                        params[SRID_NAME] = srid_dst
                                    else:
                                        okko = False
                                        msg_beginning = 'weird, cannot find destination SRID.'
                                        if table_empty(params, conn):
                                            msg_ending = 'table exists but is empty. drop it manually, change the import mode, then retry'
                                        else:
                                            msg_ending = 'check in database that the SRID column is named "{}"'.format(
                                                SRID_NAME)
                                        log(
                                            '{} {}'.format(
                                                msg_beginning, msg_ending),
                                            'error')
                            # -------- structural modifications and import
                            if okko:
                                reprojection_needed = okko and (srid_src != srid_dst)
                                proj_src, proj_dst = None, None
                                if reprojection_needed:
                                    prefix = 'epsg:'
                                    proj_src, proj_dst = '{}{}'.format(
                                        prefix, srid_src), '{}{}'.format(
                                            prefix, srid_dst)
                                    log('data will be reprojected from SRID {} to {}'
                                        .format(srid_src, srid_dst))
                                # if new fields are requested, add them to the database table
                                log('adding new fields to table in database')
                                if check_for_node_in_parent(
                                        'new_fields', params):
                                    for nf in params['new_fields']:
                                        the_name = nf['name']
                                        the_type_name, the_type_length = nf[
                                            'type'].split(':')
                                        query = 'ALTER TABLE "{}"."{}" ADD COLUMN {} {}({});'.format(
                                            params['schema'], params['table'],
                                            the_name, the_type_name,
                                            the_type_length)
                                        execute_query(query, conn)
                                # process the data as a stream: one "INSERT" per feature
                                for feature in in_data:
                                    # copy the attribute values for each feature
                                    prop, okko = apply_params_to_properties(
                                        params, feature['properties'])
                                    prop[SRID_NAME] = srid_dst
                                    prop[YEAR_NAME] = params['year']
                                    prop[VERSION_NAME] = params['version']
                                    # -------- reprojection
                                    geom = feature['geometry']
                                    prop[GEOMETRY_NAME] = reproject(
                                        geom, proj_src, proj_dst
                                    ) if reprojection_needed else geom
                                    # build the query dynamically
                                    insert_query = build_insert_query(
                                        prop, params)
                                    cur.execute(insert_query)
                        except Exception as e:
                            conn.rollback()
                            okko = False
                            printex(
                                e, 'insert query failed into table "{}"'.format(
                                    params['table']))
                        else:
                            conn.commit()
                            remember_this_import(params)
    return okko
def load_general_config(filename):
    """
    Load the permanent general-purpose configuration file (JSON) and
    populate the module-level configuration globals from it.

    On any failure (missing key, unreadable file, ...), the error is logged
    and the process exits with status 1, since the rest of the program
    cannot run without this configuration.

    :param filename: string, path of the configuration file to load
    """
    with open(filename, 'r') as f:
        all_is_ok = False
        j = json.load(f)
        try:
            # root location to use
            global PLACE_TO_GO
            # working directory
            global WORK_ROOT
            # root for storing the configuration files
            global CONFIG_ROOT
            # root for storing the downloaded documents
            global DOWNLOAD_ROOT
            # root for storing the converted data
            global CONVERSION_ROOT
            # directory name for copied data
            global COPIED_DATA_DIRNAME
            # file remembering the imports that have already been performed
            global REMEMBER_IMPORT_FILE_NAME
            global REMEMBER_IMPORT_FILE_EXT
            global REMEMBER_IMPORT_FILENAME
            global REMEMBER_IMPORT_FILE_PATH
            # severity levels
            global SEVERITY_LEVELS
            # verbosity level
            # NOTE(review): the trailing lowercase "l" in LOG_LEVEl is a
            # historical typo, kept as-is because other code may reference
            # this exact global name.
            global LOG_LEVEl
            # length of the decoration placed left and right of the logs
            # delimiting the main tasks
            global VISUAL_SECTION_DECORATOR_LENGTH
            # visual separator character used for exceptions in the logs
            global VISUAL_SEPARATOR_CARACTER
            # minimum and maximum length of a shortname
            global SHORTNAME_MIN_LENGTH
            global SHORTNAME_MAX_LENGTH
            # extension groups
            global EXTENSION_GROUPS
            global KEY_EXTS
            global MANDATORY_EXT_GROUPS
            global OPTIONAL_EXT_GROUPS
            global INTERESTING_EXTS
            # characteristics of the added fields
            global YEAR_NAME
            global YEAR_TYPE
            global YEAR_LENGTH
            global VERSION_NAME
            global VERSION_TYPE
            global VERSION_LENGTH
            global SRID_NAME
            global SRID_TYPE
            global SRID_LENGTH
            global GEOMETRY_NAME
            global GEOMETRY_TYPE
            # mapping between the types as represented in the python
            # structures generated by fiona and the types within postgresql
            global TO_POSTGRESQL_TYPE
            # database connection parameters
            global DB_HOST
            global DB_PORT
            global DB_NAME
            global DB_USER_NAME
            global DB_USER_PASSWORD
            # path of the dynamic "general" configuration file
            global DYNAMIC_CONFIG_GENERAL_FILE_PATH
            # path of the dynamic "data" configuration file
            global DYNAMIC_CONFIG_DATA_FILE_PATH
            # defines the valid import modes
            global IMPORT_MODES
            # provides the proxies
            global PROXIES
            # value assignment
            PLACE_TO_GO = j['place_to_go']
            WORK_ROOT = join(PLACE_TO_GO, j['work_root_name'])
            CONFIG_ROOT = join(WORK_ROOT, j['config_root_name'])
            DOWNLOAD_ROOT = join(WORK_ROOT, j['download_root_name'])
            CONVERSION_ROOT = join(WORK_ROOT, j['conversion_root_name'])
            # plain assignment: this is a directory *name*, not a path to
            # build (the original wrapped it in a pointless 1-arg join())
            COPIED_DATA_DIRNAME = j['copied_data_dirname']
            REMEMBER_IMPORT_FILE_NAME = j['remember_import_file_name']
            REMEMBER_IMPORT_FILE_EXT = j['remember_import_file_ext']
            REMEMBER_IMPORT_FILENAME = '{}{}{}'.format(
                REMEMBER_IMPORT_FILE_NAME, extsep, REMEMBER_IMPORT_FILE_EXT)
            REMEMBER_IMPORT_FILE_PATH = join(CONFIG_ROOT,
                                             REMEMBER_IMPORT_FILENAME)
            SEVERITY_LEVELS = j['severity_levels']
            LOG_LEVEl = j['log_level']
            VISUAL_SECTION_DECORATOR_LENGTH = j[
                'visual_section_decorator_length']
            VISUAL_SEPARATOR_CARACTER = j['visual_separator_caracter']
            SHORTNAME_MIN_LENGTH = j['shortname_min_length']
            SHORTNAME_MAX_LENGTH = j['shortname_max_length']
            EXTENSION_GROUPS = j['extension_groups']
            # first extension of each group's mandatory list is the key one
            KEY_EXTS = [gg[0][0] for gg in EXTENSION_GROUPS]
            MANDATORY_EXT_GROUPS = [
                g for gg in EXTENSION_GROUPS for g in gg[0]
            ]
            OPTIONAL_EXT_GROUPS = [g for gg in EXTENSION_GROUPS for g in gg[1]]
            INTERESTING_EXTS = MANDATORY_EXT_GROUPS + OPTIONAL_EXT_GROUPS
            YEAR_NAME = j['year_name']
            YEAR_TYPE = j['year_type']
            YEAR_LENGTH = j['year_length']
            VERSION_NAME = j['version_name']
            VERSION_TYPE = j['version_type']
            VERSION_LENGTH = j['version_length']
            SRID_NAME = j['srid_name']
            SRID_TYPE = j['srid_type']
            SRID_LENGTH = j['srid_length']
            GEOMETRY_NAME = j['geometry_name']
            GEOMETRY_TYPE = j['geometry_type']
            TO_POSTGRESQL_TYPE = j['types_map']
            DB_HOST = j['database_host']
            DB_PORT = j['database_port']
            DB_NAME = j['database_name']
            DB_USER_NAME = j['database_user_name']
            DB_USER_PASSWORD = j['database_user_password']
            IMPORT_MODES = j['import_modes']
            PROXIES = j['proxies']
            DYNAMIC_CONFIG_GENERAL_FILE_PATH = join(CONFIG_ROOT,
                                                    CONFIG_GENERAL_FILENAME)
            DYNAMIC_CONFIG_DATA_FILE_PATH = join(CONFIG_ROOT,
                                                 CONFIG_DATA_FILENAME)
        except KeyError as e:
            # bug fix: the original referenced the undefined name
            # "config_file_to_load" here, which turned every bad-structure
            # report into a NameError; "filename" is the file being loaded
            printex(
                e, 'incorrect or missing node in "{}" (bad file structure)'.
                format(filename))
        except Exception as e:
            printex(e)
        else:
            all_is_ok = True
            log('general purpose configuration file "{}" correctly loaded'.
                format(filename))
    # any unknown problem: crash, since the configs are needed anyway for
    # the program to work correctly
    if not all_is_ok:
        log(
            'can\'t load correctly the permanent general configuration file (see the reason above). exiting',
            'error')
        exit(1)