Example #1
def get_path_from_params(params, step_directory):
    """
    Donne le chemin du dossier "feuille" dans lequel travailler pour une donnée.
    
    :param params: dict les paramètres tirés de la configuration
    :param step_directory: string le chemin absolu du dossier correspondant à la racine de la tâche appropriée
    :return: string le chemin complet du répertoire approprié (qu'il existe ou non !)
    """

    answer = ''
    uri = params['uri']
    about = simplify(params['shortname'])

    if uri.startswith('htt'):
        site = get_website_name_from_url(uri)
        answer = join(step_directory, site, about)

    elif uri.startswith('file:///'):
        answer = join(step_directory, COPIED_DATA_DIRNAME, about)

    else:
        log(
            'can\'t find associated path from params because of unrecognized uri "{}"'
            .format(uri), 'warn')

    return answer
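# A hedged usage sketch of the routing above. simplify() and
# get_website_name_from_url() are project helpers not shown here,
# so the stand-in bodies below are assumptions.
from os.path import join
from urllib.parse import urlparse

COPIED_DATA_DIRNAME = 'copied_data'  # hypothetical value

def simplify(s):
    # stand-in: lowercase and underscore, the real helper may do more
    return s.lower().replace(' ', '_')

def get_website_name_from_url(url):
    # stand-in: keep only the second-level domain label
    return urlparse(url).netloc.split('.')[-2]

params = {'uri': 'https://www.insee.fr/archive.zip', 'shortname': 'Population 2016'}
# a web URI is routed to <step_directory>/<site>/<shortname>
print(join('/tmp/step', get_website_name_from_url(params['uri']),
           simplify(params['shortname'])))
# -> /tmp/step/insee/population_2016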
Example #2
    def createTable(self, tabla, atributos, describe):
        try:
            log(__name__).info('Creating table ' + tabla)

            # note: the "atributos" parameter is rebuilt from "describe" below
            atributos = ''
            for atributo in describe:
                if len(atributo['type'].split('(')) > 1:
                    longitud = atributo['type'].split('(')[1].split(')')[0]
                else:
                    longitud = atributo['max_lenght']

                if atributo['type'].split('(')[0] == "datetime":
                    longitud = 6

                if atributo['is_nullable']:
                    nulo = 'NULL'
                else:
                    nulo = 'NOT NULL'

                cadenaAtributo = atributo['name'].lower() + ' ' +  \
                    self.checkType(atributo['type'].split('(')[0]) + \
                    '(' + str(longitud) + ') ' + \
                    nulo + ','

                atributos = atributos + cadenaAtributo

            sql = 'CREATE TABLE ' + tabla + ' (' + atributos[:-1] + ') COLLATE utf8_spanish2_ci'
            conn = self.conn()
            conn.execute(sql)
            conn.close()
        except Exception:
            log(__name__).critical('Error creating table ' + tabla)
            sys.exit(1)
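# To make the string assembly above concrete, a hedged sketch of how one
# "describe" row (keys taken from the loop above) becomes a column definition.
atributo = {'name': 'Codigo', 'type': 'nvarchar(50)',
            'max_lenght': None, 'is_nullable': True}  # hypothetical row

base_type = atributo['type'].split('(')[0]               # 'nvarchar'
longitud = atributo['type'].split('(')[1].split(')')[0]  # '50'
nulo = 'NULL' if atributo['is_nullable'] else 'NOT NULL'

# checkType() would map 'nvarchar' to 'varchar' (see Example #12)
print(atributo['name'].lower() + ' varchar(' + longitud + ') ' + nulo)
# -> codigo varchar(50) NULL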
Example #3
def give_shortname_info():
    """Donne un ensemble d'informations et de chemins utiles à propos des shortnames."""
    log(
        'in file "{}", one or several mistakes are present in shortname\'s formating, see the details below'
        .format(DYNAMIC_CONFIG_DATA_FILE_PATH), 'warn')
    log('the default values that defines some criteria of a shortname\'s validity can be found in config file "{}", feel free to change them'
        .format(DYNAMIC_CONFIG_DATA_FILE_PATH))
Example #4
    def drop(self, tabla):
        try:
            log(__name__).info('Dropping table ' + tabla)

            sql = "DROP TABLE " + tabla
            conn = self.conn()
            conn.execute(sql)
            conn.close()
        except Exception:
            log(__name__).warning('Error dropping table ' + tabla)
Example #5
def main():
    basePath = os.getenv("DIRSQL")
    try:
        # the "init" file acts as a first-run marker
        initFile = open(basePath + "/init", "r")
    except OSError:
        log(__name__).info('Creating init file!')
        initFile = open(basePath + "/init", "w")
        importador(quart=False)
    finally:
        initFile.close()
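# Opening the marker file just to test for its existence works, but an explicit
# check reads more directly. A minimal sketch of the same first-run logic;
# importador() is stubbed here, and the DIRSQL fallback is an assumption.
import os

def importador(quart=False):
    # stand-in for the real importador() from Example #9
    print('running full import, quart={}'.format(quart))

def main():
    basePath = os.getenv("DIRSQL", "/tmp")  # fallback only for this sketch
    marker = os.path.join(basePath, "init")
    if not os.path.exists(marker):
        # first run: create the marker, then trigger the initial import
        open(marker, "w").close()
        importador(quart=False)

main()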
Example #6
def search_for_files_to_process(a_res):
    """
    Construit la liste des données à traiter.

    :param a_res: dict la configuration associée à la donnée atomique
    :return: une liste de liste de chemins complets de fichiers se rapportant à la même donnée atomique et les paramètres (complétés) associés
    """

    # -------- preliminary declarations

    results = []
    params = a_res['params']
    shortname = simplify(params['shortname'])
    path_to_search_in = get_path_from_params(params, DOWNLOAD_ROOT)

    # handle the presence/absence of data_name
    data_name = params['data_name'] if 'data_name' in params else ''
    validator_params = {'data_name': data_name}

    # -------- logging

    info_msg = '{}'.format(shortname) if data_name == '' else '{} / {}'.format(shortname, data_name)
    log('exploring data about "{}"'.format(info_msg))

    # -------- searching for an atomic data

    data_files = search_with_criteria(path_to_search_in, is_data_file, validator_params=validator_params, search_depth=-1)
    data_files_names = []

    for df in data_files:
        dirpath, name, ext = path_splitter(df)
        data_files_names.append(name)

    list_of_unique_names = list(set(data_files_names))
    number_of_atomic_data = len(list_of_unique_names)
    we_caught_one_atomic_data = number_of_atomic_data == 1

    # -------- the one and only atomic data was found

    if we_caught_one_atomic_data:
        if data_name == '':
            detected_data_name = list_of_unique_names[0]
            log('auto-detect found an atomic data called "{}"'.format(detected_data_name))
            params['data_name'] = detected_data_name
        results = data_files  # results = data_files[:]        would be slower, since it would copy instead of passing by reference
    else:
        if data_name == '':
            if number_of_atomic_data == 0:
                log('sorry, auto-detect did not find any data', 'warn')
            else:
                log('sorry, auto-detect needs you to define the data name manually because several atomic data have been found', 'warn')
        else:
            log('sorry, there is no matching data', 'warn')

    return results, params
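# A self-contained sketch of the auto-detect core above: reduce the file
# paths to their stems and check for uniqueness. Paths are invented.
from os.path import basename, splitext

data_files = ['/d/communes.shp', '/d/communes.dbf', '/d/communes.prj']
stems = [splitext(basename(f))[0] for f in data_files]
unique_names = set(stems)
# exactly one stem means exactly one atomic data was found
print(len(unique_names) == 1, unique_names)  # True {'communes'}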
Example #7
def retrieve_all():
    """Rapatrie l'ensemble des ressources."""

    taskname = 'retrieving'
    log_task(taskname, 'start')

    some_action_performed = False
    no_exception = True

    with open(DYNAMIC_CONFIG_DATA_FILE_PATH, 'r') as f:

        j = json.load(f)
        try:
            for ressource in j['res']:
                params = ressource['params']
                uri = params['uri']

                # each uri is handled in the appropriate way
                if uri.startswith('htt'):
                    if get_data_from_url(params):
                        some_action_performed = True
                    
                elif uri.startswith('file:///'):
                    if get_data_from_filepath(params):
                        some_action_performed = True
                else:
                    log('unimplemented processing for uri "{}"'.format(uri), 'warn')

        except KeyError as e:
            no_exception = False
            printex(e, 'incorrect or missing node in "{}" (bad file structure)'.format(DYNAMIC_CONFIG_DATA_FILE_PATH))
        except Exception as e:
            no_exception = False
            printex(e)

        if no_exception:
            if some_action_performed:
                log('all data downloaded successfully')
            else:
                log('nothing was done, because all data were already retrieved')
        else:
            log('sorry, some data could not be downloaded', 'warn')

        if some_action_performed:
            # cleanup: recursive removal of empty directories potentially left in the data retrieval directory
            log('removed potentially useless created directories in "{}"'.format(DOWNLOAD_ROOT))
            remove_empty_dirs_in_dir(DOWNLOAD_ROOT)

        log_task(taskname, 'end')
Example #8
    def conn(self):
        try:
            connection = pymysql.connect(host=os.getenv("DB_MYSQL_SERVER"),
                                         user=os.getenv("DB_MYSQL_USER"),
                                         password=os.getenv("DB_MYSQL_PASSWORD"),
                                         database=os.getenv("DB_MYSQL_DB"))
            log(__name__).info('Connected to MySQL database ' +
                               os.getenv("DB_MYSQL_SERVER"))
            return connection.cursor()
        except Exception:
            log(__name__).critical('Error connecting to MySQL! ' +
                                   os.getenv("DB_MYSQL_SERVER"))
            sys.exit(1)
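# A caveat worth noting: conn() returns only the cursor, so callers can never
# commit or close the underlying connection. A hedged sketch of an alternative
# that keeps both reachable (assumes pymysql is installed and the same
# environment variables are set; names are illustrative).
import os
import pymysql

def open_mysql():
    # return the connection itself; callers take cursors from it,
    # then commit() and close() the connection when done
    return pymysql.connect(host=os.getenv("DB_MYSQL_SERVER"),
                           user=os.getenv("DB_MYSQL_USER"),
                           password=os.getenv("DB_MYSQL_PASSWORD"),
                           database=os.getenv("DB_MYSQL_DB"))

# usage sketch:
# connection = open_mysql()
# with connection.cursor() as cursor:
#     cursor.execute("SELECT 1")
# connection.commit()
# connection.close()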
Example #9
def importador(quart=False):
    log(__name__).info('Starting the replication process...')

    callMsSQL = MsSQL()
    callMySQL = MySQL()

    consultas = cargarCSQL(quart)

    for nombre, consulta in consultas.items():
        consultaMsSQL = callMsSQL.select(consulta['sql_mssql'])
        describeMsSQL = callMsSQL.describes(consulta['sql_mssql'])

        if len(consultaMsSQL) > 0:
            callMySQL.insert(nombre, consulta['atributos'],
                             consultaMsSQL, describeMsSQL)
Example #10
    def write(self, file):
        """
            Writes the output file
            :param: String the name of the file
        """
        log('Writing the solution to: ' + file)
        l = self.list
        with open(file, "w") as fichier:
            # write the Blackbone positions
            fichier.write(str(len(l)) + "\n")
            for lib in l:
                fichier.write(str(lib[0]) + " " + str(lib[1]) + "\n")
                fichier.write(" ".join(str(x) for x in lib[2]) + "\n")

        log("Finished writing")
Example #11
def get_data_from_filepath(params):
    """
    Récupère une ressource locale en la copiant dans le dossier de téléchargement spécifié.
    
    urlretrieve() permettrait également de copier des fichiers en local
    Cependant il a été préféré de garder une meilleure maîtrise (indépendance du traitement en fonction de l'URI).
    """

    # get the path to copy the resource into
    save_to = get_path_from_params(params, DOWNLOAD_ROOT)

    # for logging
    about = simplify(params['shortname'])

    # if a directory with this name already exists, skip
    done_something = False
    if exists(save_to):
        log('ignored "{}" because data has already been copied ("{}" exists)'.format(about, save_to))

    else:

        done_something = True

        # final destination directory
        create_dir_if_not_exists(save_to)

        # get the resource path
        file_or_dir = urlparse(params['uri'])
        source_path = file_or_dir.path

        log('copying data about "{}" from "{}" to "{}"'.format(about, source_path, save_to))
        print("C'EST MOIIIIII")

        # it is a single data file or an archive
        if isfile(source_path):
            dirpath, name, extension = path_splitter(source_path)
            filename = '{}{}{}'.format(simplify(params['data_name']), extsep, extension)
            print("FILENAME = "+filename)
            save_as = join(save_to, filename)
            copyfile(source_path, save_as)

        # it is an uncompressed directory
        else:
            copytree(source_path, save_to)

    return done_something
Example #12
    def checkType(self, type):
        types = {
            'int': 'int',
            'nchar': 'varchar',
            'nvarchar': 'varchar',
            'float': 'float',
            'smallint': 'smallint',
            'datetime': 'datetime'
        }

        if type in types:
            log(__name__).info('Converted type from MsSQL to MySQL (' + type + ')')
            return types[type]
        else:
            log(__name__).critical(
                'Could not convert the type from MsSQL to MySQL (' + type +
                ')')
            sys.exit(1)
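# A hedged note on the lookup above: the membership test plus indexing can be
# collapsed with dict.get(), keeping a hard failure for unknown types.
types = {'int': 'int', 'nchar': 'varchar', 'nvarchar': 'varchar',
         'float': 'float', 'smallint': 'smallint', 'datetime': 'datetime'}

def check_type(mssql_type):
    mysql_type = types.get(mssql_type)
    if mysql_type is None:
        raise SystemExit('unsupported MsSQL type: ' + mssql_type)
    return mysql_type

print(check_type('nvarchar'))  # varchar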
Example #13
def create_dir_if_not_exists(path):
    """
    Crée un répertoire si celui-ci est inexistant.
    
    :param path: string le chemin absolu du dossier à créer
    """
    if not exists(path):
        try:
            makedirs(path)
        except PermissionError as e:
            path_top_level = sub(r"^({0}?[^{0}]*){0}.*$".format(sep), r"\1",
                                 path)
            printex(
                e, 'cannot create the directory "{}". check rights on "{}"'.
                format(path, path_top_level))
            exit(1)
        except Exception as e:
            printex(e,
                    'while attempting to create directory "{}".'.format(path))
            log('some directory could not be created. crashing.', 'error')
            exit(1)
        else:
            log('directory "{}" created'.format(path))
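# For reference, a sketch of the race-free stdlib shortcut: os.makedirs with
# exist_ok=True merges the existence check and the creation, so the directory
# cannot appear between the two steps. The path below is hypothetical.
import os

def create_dir_if_not_exists(path):
    # an existing directory becomes a no-op instead of an error
    os.makedirs(path, exist_ok=True)

create_dir_if_not_exists('/tmp/demo/nested')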
Example #14
    def insert(self, tabla, atributos, registros, describe):
        conn = None
        try:
            log(__name__).info('Inserting elements into table ' + tabla)

            self.drop(tabla)
            self.createTable(tabla, atributos, describe)

            stringAtributos = ','.join(atributos)
            valString = '%s, ' * len(registros[0])

            sql = "INSERT INTO " + tabla + \
                ' (' + stringAtributos.lower() + ') VALUES ('+valString[:-2]+') '

            conn = self.conn()
            conn.executemany(sql, registros)
        except pymysql.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))

            log(__name__).critical('Error inserting elements into table ' +
                                   tabla)
            sys.exit(1)
        finally:
            # conn may still be None if drop()/createTable() failed first
            if conn is not None:
                conn.close()
Example #15
    def conn(self, sql):
        try:
            connection = pymssql.connect(os.getenv("DB_MSSQL_SERVER"),
                                         os.getenv("DB_MSSQL_USER"),
                                         os.getenv("DB_MSSQL_PASSWORD"),
                                         os.getenv("DB_MSSQL_DB"))

            log(__name__).info('Connected to MsSQL database ' +
                               os.getenv("DB_MSSQL_SERVER"))
        except Exception:
            log(__name__).critical('Error connecting to MsSQL! ' +
                                   os.getenv("DB_MSSQL_SERVER"))
            sys.exit(1)

        try:
            cursor = connection.cursor()
            cursor.execute(sql)
            r = cursor.fetchall()
            connection.close()
            return r
        except Exception:
            log(__name__).critical('Error in MsSQL query: ' + sql)
            sys.exit(1)
Example #16
def convert_then_import_all():
    """Convertit/prépare l'ensemble des ressources, puis les importe en base."""

    taskname = 'converting and importing'
    log_task(taskname, 'start')

    # logging variables
    amount_of_atomic_data_processed = 0
    amount_of_data_files_processed = 0
    things_were_done = False
    no_problem = True

    # iterate over the resources of the configuration file
    with open(DYNAMIC_CONFIG_DATA_FILE_PATH, 'r') as f:
        j = json.load(f)

        # database connection
        conn = connect_to_database()
        try:

            for ressource in j['res']:

                # look for 'THE' atomic data
                to_process, params = search_for_files_to_process(ressource)

                # send the found data for processing
                amount_of_data_files = len(to_process)
                if amount_of_data_files > 0:
                    log('{} data file(s) found'.format(amount_of_data_files))

                    # process the group
                    if process_group(to_process, ressource, conn):
                        things_were_done = True
                        amount_of_data_files_processed += amount_of_data_files
                        amount_of_atomic_data_processed += 1
                    else:
                        no_problem = False
                        log('encountered some problem(s) with this data',
                            'error')

        except KeyError as e:
            no_problem = False
            printex(
                e, 'incorrect or missing node in "{}" (bad file structure)'.
                format(DYNAMIC_CONFIG_DATA_FILE_PATH))
        except Exception as e:
            no_problem = False
            printex(e)
        finally:
            delete_empty_schemes(conn)
            conn.close()

    # log the outcome
    if things_were_done:
        log('{} atomic data successfully processed ({} files)'.format(
            amount_of_atomic_data_processed, amount_of_data_files_processed))
    else:
        if no_problem:
            log('nothing was done, because all found data were already prepared'
                )
        else:
            log(
                'nothing was done, because all found & processed data encountered errors',
                'warn')

    log_task(taskname, 'end')
Example #17
# location of the resources accessible to the code
# python's visibility will stop at the src directory
SRC_ROOT = normpath(join(dirname(__file__), '..'))
RES_DIR = join(SRC_ROOT, 'res')
IN_DIR = join(RES_DIR, 'in')
OUT_DIR = join(RES_DIR, 'out')

# if files are given on the command line, use them
queue = sys.argv[1:]

# otherwise supply the default maps
if len(queue) == 0:
    queue = [join(IN_DIR, file) for file in listdir(IN_DIR)]

log('{} file(s) to process in total'.format(len(queue)))

for filepath in queue:

    folder, name, ext = path_splitter(filepath)
    output_file = join(OUT_DIR, '{}{}{}'.format(name, extsep, 'out'))
    log_task('{} level solving'.format(name), 'begin')

    problem = Parser.load(filepath)

    solution = problem.solve()

    solution.write(output_file)

    log_task('{} level solving'.format(name), 'end')
Example #18
def process_group(files, ressource, conn):
    """
    Envoie à traiter une ressource donnée.
    
    :param files: les fichiers à utiliser pour la conversion
    :param ressource: le mode d'import et ses paramètres
    :param conn: la connexion à la base de données
    :return: bool True si le traitement s'est déroulée sans encombre, False sinon
    """

    # convenience definitions
    params = ressource['params']

    # build the list of extensions of the files at hand
    exts = []
    for f in files:
        dirpath, name, ext = path_splitter(f)
        exts.append(ext)

    # status flags
    okko, handled, time_to_break = False, False, False

    # look for the match between
    # the files at hand (their extensions)
    # and the groups associated with a processing

    for g in MANDATORY_EXT_GROUPS:
        for e in exts:

            # the various processings are detailed below
            # they can be told apart by type (e.g. shp, xls) then by organisation (e.g. insee, ign)

            if e.lower() == g:

                # -------- formats handled by fiona
                if g in ['shp', 'mif']:

                    handled = True

                    # this is where bonus information is gathered
                    # this information is to be merged into the parameters dictionary
                    # it can then receive specific treatment in the lower-level functions

                    # interesting_files = []
                    # for f in files:
                    #     dirpath, name, ext = path_splitter(f)
                    #     if ext.lower() in OPTIONAL_EXT_GROUPS:
                    #         interesting_files.append(f)

                    # the data is recognised as "to import"
                    okko = to_database(ressource, conn)
                    time_to_break = True

                # -------- other geographic formats, reduced to formats handled by fiona

                # elif g == 'replace_with_the_extension_to_handle':

                    # sample code to easily support other geographic formats not handled by fiona
                    # the function replace_with_the_extension_to_handle2shp() would have a docstring like the one shown below
                    # """
                    # Converts a file from a specific format unsupported by fiona into a shapefile and returns the resource to import.
                    #
                    # :param ressource: dict the import to perform
                    # :return: the modified resource (uri)     and     True if everything is OK, False otherwise
                    # :rtype: dict, bool
                    # """
                    # likewise, it can proceed as follows for the intermediate file
                    # converted_file_dir = get_path_from_params(params, CONVERSION_ROOT)
                    # create_dir_if_not_exists(converted_file_dir)
                    # converted_file_name = '{}{}{}'.format(simplify(params['shortname']), extsep, 'shp')
                    # converted_file_path = join(converted_file_dir, converted_file_name)

                    # finally, here, keep only these three lines
                    # ressource, okko = replace_with_the_extension_to_handle2shp(ressource)
                    # okko = to_database(ressource, conn)
                    # time_to_break = True

                # -------- non-geographic data

                elif g in ['xls', 'xlsx']:

                    handled = True
                    okko = xls_handler(ressource, conn)
                    time_to_break = True

            if time_to_break:
                break
        if time_to_break:
            break

    if not handled:
        log('sorry, the processing for the file format group {} is not implemented yet'.format(exts), 'warn')
    else:
        if not okko:
            log('sorry, processing failed for this atomic data', 'error')
        else:
            log('successfully processed this atomic data')

    return okko
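# A self-contained sketch of the matching idea above: find the first mandatory
# extension group present among the files, case-insensitively. The group list
# content is assumed.
MANDATORY_EXT_GROUPS = ['shp', 'mif', 'xls', 'xlsx']  # assumed content

exts = ['dbf', 'SHP', 'prj']
lowered = {e.lower() for e in exts}
match = next((g for g in MANDATORY_EXT_GROUPS if g in lowered), None)
print(match)  # shp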
Example #19
def get_data_from_url(params):
    """Récupère une ressource web distante."""

    # get the path to download the resource into
    save_to = get_path_from_params(params, DOWNLOAD_ROOT)

    # for logging
    about = simplify(params['shortname'])
    done_something = False

    # if a directory with this name already exists, skip
    if exists(save_to):
        log('ignored "{}" because data has already been downloaded (directory "{}" exists)'.format(about, save_to))

    else:

        # declarations
        uri = params['uri']
        site = get_website_name_from_url(uri)

        # create the destination
        create_dir_if_not_exists(save_to)
        save_as = join(save_to, about)

        # download the resource
        log('fetching data from the "{}" website about "{}"'.format(site.upper(), about))

        # this try/except makes it possible to skip invalid URLs and carry on
        try:
            # @deprecated does not allow handling SSL certificates
            # urlretrieve(uri, save_as)

            with open(save_as, 'wb') as output_file:
                response = get(uri, verify=False, proxies=PROXIES)
                output_file.write(response.content)

        # exception handling (URLError is a leftover from the urlretrieve() approach above)
        except URLError as e:
            printex(e, 'problem while downloading the "{}" resource'.format(uri))
        except Exception as e:
            printex(e)  # any unknown problem

        # no problem occurred
        else:

            done_something = True
            # rename: append the extension to the name of the downloaded archive
            try:
                extension = get_archive_format(save_as)[0]
                file_with_extension = '{}{}{}'.format(save_as, extsep, extension)
                rename(save_as, file_with_extension)
                print("PRINT DATA_NAME   :  "+params['data_name'])

            except PatoolError as e:
                printex(e, 'the resource located at "{}" is probably not an archive, download may have failed somehow. just copying it'.format(uri))

                # remote single-file resources
                filename = '{}{}{}'.format(params['data_name'], extsep, params['extension'])
                filepath = join(save_to, filename)
                rename(save_as, filepath)

            except Exception as e:
                printex(e)

            log('successfully downloaded data about "{}"'.format(about))

    return done_something
Example #20
def cargarCSQL(quart=False):
    if quart:
        log(__name__).info('quart')
    else:
        log(__name__).info('no quart')

    log(__name__).info('Loading SQL files...')

    basePath = os.getenv("DIRSQL")
    consultas = {}

    for (dirPath, dirNames, fileNames) in walk(basePath):
        for file in fileNames:
            if os.path.splitext(file)[1] != ".sql":
                continue

            if file[0] == "_":
                continue

            contentFileStrip = ""
            contentFile = open(basePath + "/" + file, "r").readlines()

            for contentFileLine in contentFile:
                contentFileStrip = contentFileStrip + " " + contentFileLine.strip(
                    ' ').rstrip("\n")

            buscaQuart = contentFileStrip.find("/*cada15m*/")

            if quart:
                log(__name__).warning('Quart cycle')
                if buscaQuart < 0:
                    # no "/*cada15m*/" marker found: skip this file in quart mode
                    continue
            else:
                log(__name__).warning('Normal cycle')

            log(__name__).info('Loading file ' + file)

            contentFileStripClear = re.sub('(\/\*.*\*\/ .*?)', '',
                                           contentFileStrip)

            log(__name__).info('Query: ' + contentFileStripClear)

            atributosConsulta = re.search(
                '(?<=SELECT)(.*?)(?=FROM)',
                contentFileStripClear).group().split(', ')
            atributosConsultaLimpios = []
            for atributoConsulta in atributosConsulta:
                log(__name__).info(atributoConsulta)

                atributoConsulta = atributoConsulta.split(' AS ')
                if len(atributoConsulta) == 2:
                    atributosConsultaLimpios.append(
                        atributoConsulta[1].strip())
                else:
                    atributosConsultaLimpios.append(
                        atributoConsulta[0].strip())

            consultas[file.split(".")[0]] = {
                "sql_mssql": contentFileStripClear,
                "atributos": atributosConsultaLimpios
            }
        # only the top level of basePath is scanned
        break
    return consultas
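# A self-contained sketch of the column-name extraction used above: grab the
# text between SELECT and FROM, split on commas, and keep the alias when an
# ' AS ' is present.
import re

sql = 'SELECT a.id AS ident, a.nombre FROM tabla a'
cols = re.search('(?<=SELECT)(.*?)(?=FROM)', sql).group().split(', ')
names = [c.split(' AS ')[-1].strip() for c in cols]
print(names)  # ['ident', 'a.nombre']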
Example #21
def my_json_res_file_checker(path):
    """
    Vérifie la validité du fichier JSON contenant les liens de téléchargement et affiche des informations en conséquence.

    :param path: string le chemin du fichier de configuration data à vérifier
    """

    with open(path, 'r') as f:
        j = json.load(f)

        shortname_info_shown = False  # has give_shortname_info() been printed yet?
        try:
            for ressource in j['res']:

                s = ressource['params']

                # -------- import mode check

                import_mode = ressource['import_mode']
                if import_mode not in IMPORT_MODES:
                    log('unrecognized import mode "{}"'.format(import_mode),
                        'warn')

                # -------- shortname formatting check

                # a warning is generated for each incorrect short name
                # a corrected name may be suggested
                # before the first warning, the validity criteria on a short name's required length are stated

                shortname_length = len(s['shortname'])

                # too short
                if shortname_length < SHORTNAME_MIN_LENGTH:
                    if not shortname_info_shown:
                        give_shortname_info()
                        shortname_info_shown = True
                    log(
                        'shortname "{}" is too short (length is {}, minimum is {})'
                        .format(s['shortname'], shortname_length,
                                SHORTNAME_MIN_LENGTH), 'warn')

                # too long
                if shortname_length > SHORTNAME_MAX_LENGTH:
                    if not shortname_info_shown:
                        give_shortname_info()
                        shortname_info_shown = True
                    log(
                        'shortname "{}" is too long (length is {}, maximum is {})'
                        .format(s['shortname'], shortname_length,
                                SHORTNAME_MAX_LENGTH), 'warn')

                # poorly formatted
                if s['shortname'] != simplify(s['shortname']):
                    if not shortname_info_shown:
                        give_shortname_info()
                        shortname_info_shown = True
                    log(
                        'shortname "{}" is poorly formatted. you should consider changing it manually to something like "{}" for example'
                        .format(s['shortname'],
                                simplify(s['shortname'])), 'warn')

        except KeyError as e:
            printex(
                e, 'incorrect or missing node in "{}" (bad file structure)'.
                format(path))

        log('filecheck "{}" complete'.format(path))
Example #22
# ________________________________
# ________________________________ loading of the dynamic configurations
# ________________________________

# -------- imports
# deliberately done here rather than at the top of the file,
# because these functions use variables defined above

from packages.utils.path import create_dir_if_not_exists
from shutil import copyfile

# -------- general configuration
if exists(DYNAMIC_CONFIG_GENERAL_FILE_PATH):
    load_general_config(DYNAMIC_CONFIG_GENERAL_FILE_PATH)
    log('the dynamic configuration file values override the permanent configuration file ones'
        )

else:
    # create the directory tree
    create_dir_if_not_exists(CONFIG_ROOT)

    # copy the permanent file in order to create the dynamic one
    # (in that case, no need to load the dynamic one, since it is identical to the permanent one)
    log('generating dynamic general purpose configuration file "{}" from the permanent one "{}"'
        .format(DYNAMIC_CONFIG_GENERAL_FILE_PATH,
                PERMANENT_CONFIG_GENERAL_FILE_PATH))
    copyfile(PERMANENT_CONFIG_GENERAL_FILE_PATH,
             DYNAMIC_CONFIG_GENERAL_FILE_PATH)

# -------- configuration data
Example #23
def extract_all():
    """
    Décompresse toutes les archives à disposition.

    Solutions explorées :
    - "lzma" ne gère pas les archives, mais seule la (dé)compression d'un unique fichier compressé en ".7z"
    - la bibliothèque C "libarchive" (téléchargée via conda) via un wrapper python "libarchive-c" (téléchargée via pip)

    Choix final porté sur "patoolib" car gère beaucoup de formats.
    En réalité c'est un wrapper.
    Après détection des types MIME, il fait appel aux exécutables/librairies appropriés ("7zr" par exemple).
    """

    taskname = 'extracting'
    log_task(taskname, 'start')

    # list of the full paths of the archives to extract
    log('searching for archives to extract in "{}". this may take a while'.format(DOWNLOAD_ROOT))
    archives_to_extract = search_with_criteria(DOWNLOAD_ROOT, is_archive, search_depth=2)
    archives_to_extract += search_with_criteria(join(DOWNLOAD_ROOT, COPIED_DATA_DIRNAME), is_archive, search_depth=2)
    total = len(archives_to_extract)
    log('{} archive(s) found in "{}"'.format(total, DOWNLOAD_ROOT))

    # extract archive by archive
    no_exception = True
    some_action_performed = False
    done = 0
    for archive_path in archives_to_extract:

        # track overall progress
        done += 1

        # check whether the archive has already been extracted
        archive_dir = dirname(archive_path)
        if len(listdir(archive_dir)) > 1:
            # if the archive is not alone in its directory, it is considered already extracted
            log('archive "{}" ignored because previously extracted ({}/{})'.format(archive_path, done, total))

        else:

            log('extracting archive "{}" ({}/{})'.format(archive_path, done, total))

            try:
                extract_archive(archive_path, verbosity=-1, outdir=archive_dir, interactive=False)
            except PatoolError as e:
                no_exception = False
                printex(e, 'the file extension probably does not match the real data type')
            except Exception as e:
                no_exception = False
                printex(e)
            finally:
                some_action_performed = True

    if no_exception:
        if some_action_performed:
            log('all retrieved archives extracted successfully')
        else:
            log('nothing was done, because all archives were already extracted')
    else:
        log('sorry, some retrieved archives could not be extracted', 'warn')

    log_task(taskname, 'end')
Example #24
def xls_handler(ressource, conn):
    """Parse et importe directement une donnée non géographique en base."""

    # declarations
    okko = True
    params = ressource['params']
    import_mode = ressource['import_mode']
    shortname, data_name = params['shortname'], params['data_name']
    schema, table = params['schema'], params['table']

    # has the data already been imported?
    already_done = False
    if exists(REMEMBER_IMPORT_FILE_PATH):
        already_done = check_for_line_in_file(remember_line_builder(params),
                                              REMEMBER_IMPORT_FILE_PATH)

    # skip the data if the import was already performed previously
    if already_done:
        log('ignoring data "{} / {}" because previously imported into database'
            .format(shortname, data_name))

    else:

        # input file
        in_f = find_key_file_from_params(params, DOWNLOAD_ROOT)
        if in_f == '':
            okko = False
            log(
                'ignoring data about "{} / {}" because a crucial file is missing'
                .format(shortname, data_name), 'warn')
        else:
            log('importing data about "{} / {}" in mode "{}" in schema.table "{}.{}"'
                .format(shortname, data_name, import_mode, schema, table))
            try:
                cur = conn.cursor()
                # parsing and import
                with open_workbook(in_f) as f:
                    for sn in f.sheet_names():
                        s = f.sheet_by_name(sn)

                        # -------- preliminary processing

                        # intermediate structure
                        attrs_and_their_types = [
                            (x[0], TO_POSTGRESQL_TYPE[x[1]]) for x in [
                                s.cell(0, ci).value.rsplit(':', 1)
                                for ci in range(s.ncols)
                            ]
                        ]

                        # fields for the queries
                        scheme_dot_table = '"{}"."{}"'.format(schema, table)
                        fields_list = '({})'.format(', '.join([
                            '{} {}'.format(the_attr, the_type)
                            for the_attr, the_type in attrs_and_their_types
                        ]))
                        columns = [
                            the_attr
                            for the_attr, the_type in attrs_and_their_types
                        ]

                        # query
                        query = 'CREATE TABLE IF NOT EXISTS {} {};'.format(
                            scheme_dot_table, fields_list)

                        # -------- table creation
                        if import_mode == 'controlee_nouvelle_table':
                            create_schema_if_not_exists(schema, conn)
                            execute_query(query, conn)

                        # -------- table filling
                        for ri in range(1, s.nrows):

                            values = [
                                s.cell(ri, ci).value for ci in range(s.ncols)
                            ]
                            ]
                            fionalike_struct = OrderedDict()

                            for index, v in enumerate(values):
                                # convert "fake floats" back to int
                                try:
                                    v = int(v)
                                except ValueError:
                                    pass
                                fionalike_struct[columns[index]] = v

                            query = build_insert_query(fionalike_struct,
                                                       params,
                                                       georef=False)
                            cur.execute(query)

            except Exception as e:
                conn.rollback()
                okko = False
                printex(e, 'insert query failed into table "{}"'.format(table))
            else:
                conn.commit()
                remember_this_import(params)

    return okko
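# The header convention assumed above is one "name:type" cell per column on the
# first row. A self-contained sketch of that parsing; the type map content is
# invented.
TO_POSTGRESQL_TYPE = {'str': 'varchar', 'int': 'integer'}  # assumed mapping

headers = ['code:str', 'population:int']  # hypothetical first-row cells
attrs = [(name, TO_POSTGRESQL_TYPE[t])
         for name, t in (h.rsplit(':', 1) for h in headers)]
print(attrs)  # [('code', 'varchar'), ('population', 'integer')]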
Example #25
def apply_params_to_properties(params, properties, from_where='insert'):
    """
    Effectue les modifications sur les propriétés en fonction des paramètres.

    Ces modifications portent sur les attributs.
    On touche donc à la structure, ce qui équivaudrait à un "ALTER".
    :param params: les paramètres
    :param properties: les propriétés
    :param from_where: d'où cette fonction a été appellée (pour évter de log un message identique pour n tuples)
    :return: les propriétés modifiées   et    True si tout s'est bien passé sinon False
    """
    okko = False
    pp = OrderedDict()

    try:

        # short-name declarations
        mode = ''
        if check_for_node_in_parent('mode', params):
            mode = params['mode']
        bb = {}
        some_b_is_waiting_for_each_one = []
        if check_for_node_in_parent('bindings', params):
            bb = params['bindings']
            some_b_is_waiting_for_each_one = [b['from'] for b in bb]

        # apply the modifications specified in the bindings
        for prop_key in properties.keys():
            need_to_copy_the_attribute = mode != 'keep_only'  # MODIFIED & KEEP_ONLY

            for b in bb:
                b_from = b['from']
                b_to = b['to']

                if prop_key == b_from:

                    # -------- DROP
                    if b_to == '':
                        if mode == 'keep_only':
                            # this is a DROP + KEEP_ONLY, which is not valid
                            # warn and ignore the DROP clause
                            # indeed the copy flag is already False
                            if from_where != 'insert':
                                log(
                                    'ignoring nonsensical DROP of attribute "{}" because of KEEP_ONLY mode'
                                    .format(b_from), 'warn')
                        else:
                            # the attribute will not be kept, i.e. not copied
                            need_to_copy_the_attribute = False

                    # field rename
                    else:
                        pp[b_to] = properties[prop_key]
                        # just copied under its new name, no need to copy it again "as if it were not to be modified"
                        need_to_copy_the_attribute = False

            # unchanged field: copy it as-is once no binding targets it
            already_copied = prop_key in pp.keys()
            some_b_is_waiting_for_it = prop_key in some_b_is_waiting_for_each_one
            if need_to_copy_the_attribute and not already_copied and not some_b_is_waiting_for_it:
                pp[prop_key] = properties[prop_key]

            # no bindings, so copy without further ado
            if not check_for_node_in_parent('bindings', params):
                pp[prop_key] = properties[prop_key]

        # add the fields common to all import modes
        pp[YEAR_NAME] = '{}:{}'.format(YEAR_TYPE, YEAR_LENGTH)
        pp[VERSION_NAME] = '{}:{}'.format(VERSION_TYPE, VERSION_LENGTH)
        pp[SRID_NAME] = '{}:{}'.format(SRID_TYPE, SRID_LENGTH)
        pp[GEOMETRY_NAME] = '{}:'.format(GEOMETRY_TYPE)

    except KeyError as e:
        printex(e, 'incorrect or missing node')
    except Exception as e:
        printex(e)
    else:
        okko = True

    return pp, okko
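# A self-contained sketch of the bindings semantics implemented above: rename
# when 'to' is set, drop when 'to' is empty. Names and values are invented.
from collections import OrderedDict

properties = OrderedDict([('NOM', 'str:80'), ('OBSOLETE', 'int:4'), ('SURF', 'float:10')])
bindings = [{'from': 'NOM', 'to': 'name'},   # rename NOM -> name
            {'from': 'OBSOLETE', 'to': ''}]  # empty 'to' means DROP

targeted = {b['from'] for b in bindings}
pp = OrderedDict()
for key, value in properties.items():
    for b in bindings:
        if key == b['from'] and b['to'] != '':
            pp[b['to']] = value
    if key not in targeted:
        pp[key] = value

print(pp)  # OrderedDict([('name', 'str:80'), ('SURF', 'float:10')])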
Example #26
def to_database(ressource, conn):
    """
    Parcoure la donnée et l'insère dans la base de données en créant dynamiquement les requêtes.

    Travaille les données en flux uniquement.
    Pour cela, on utilise les générateurs python et les requêtes sont construites à la volée pour chaque tuple (au sens base de données) de la donnée d'entrée.
    """

    # declarations
    okko = True
    cur = conn.cursor()
    import_mode = ressource['import_mode']
    params = ressource['params']
    mode = ''
    if check_for_node_in_parent('mode', params):
        mode = params['mode']
    data_name = params['data_name']
    shortname = simplify(params['shortname'])

    # find the key file for this processing
    input_file = find_key_file_from_params(params, DOWNLOAD_ROOT)
    if input_file == '':
        log(
            'ignoring data about "{} / {}" because a crucial file is missing'.
            format(shortname, data_name), 'warn')
        okko = False
    else:

        # has the data already been imported?
        already_done = False
        if exists(REMEMBER_IMPORT_FILE_PATH):
            already_done = check_for_line_in_file(
                remember_line_builder(params), REMEMBER_IMPORT_FILE_PATH)

        # skip the data if the import was already performed previously
        if already_done:
            log('ignoring data "{} / {}" because previously imported into database'
                .format(shortname, data_name))
        else:
            mode_msg = 'automatic' if mode == '' else mode
            log('importing data about "{} / {}" in mode "{} / {}" in schema.table "{}.{}"'
                .format(shortname, data_name, import_mode, mode_msg,
                        params['schema'], params['table']))

            with fiona.drivers():

                # open the input file,
                # add, keep and modify the desired columns,
                # and write everything to an output file
                # all the processing is done as a stream

                with fiona.open(input_file, 'r') as in_data:

                    # start from the initial schema
                    in_schema = in_data.schema.copy()

                    # apply the modifications to the fields
                    properties, prop_changes_ok = apply_params_to_properties(
                        params, in_schema['properties'], from_where='create')
                    okko = prop_changes_ok and okko

                    if okko:
                        try:

                            # -------- schema and table

                            # if the schema (in the database namespace sense) does not exist, create it
                            create_schema_if_not_exists(params['schema'], conn)

                            # create the table if needed
                            if import_mode == 'controlee_nouvelle_table':
                                # ensure the table exists by systematically attempting to create it
                                create_table_if_not_exists(
                                    properties, params, conn)

                            # -------- detection of the I/O projections

                            srid_src_user, srid_dst_user = get_user_srids(
                                params)
                            srid_src_detected, srid_dst_detected = get_detected_srids(
                                in_data.crs_wkt, params, conn)
                            srid_src, srid_dst = None, None

                            # -------- reactions to the available projections (source)

                            if srid_src_user is not None:
                                if srid_src_user != srid_src_detected and srid_src_detected is not None:
                                    log(
                                        'detected source SRID ({}, ignored) is different from the enforced one that you gave ({}, picked)'
                                        .format(srid_src_detected,
                                                srid_src_user), 'warn')
                                srid_src = srid_src_user
                            else:
                                if srid_src_detected is not None:
                                    srid_src = srid_src_detected
                                else:
                                    okko = False
                                    log(
                                        'sorry, you will need to define the source SRID manually',
                                        'error')

                            # -------- reactions to the available projections (destination)
                            if okko:
                                if import_mode == 'controlee_nouvelle_table':
                                    srid_dst = srid_src
                                    if srid_dst_user is not None:
                                        srid_dst = srid_dst_user
                                    params[SRID_NAME] = srid_dst
                                else:
                                    if srid_dst_user is not None:
                                        log(
                                            'ignoring the given destination SRID, which is useless regarding the import mode',
                                            'warn')
                                    if srid_dst_detected is not None:
                                        srid_dst = srid_dst_detected
                                        params[SRID_NAME] = srid_dst
                                    else:
                                        okko = False
                                        msg_beginning = 'weird, cannot find destination SRID.'
                                        if table_empty(params, conn):
                                            msg_ending = 'table exists but is empty. drop it manually, change the import mode, then retry'
                                        else:
                                            msg_ending = 'check in database that the SRID column is named "{}"'.format(
                                                SRID_NAME)
                                        log(
                                            '{} {}'.format(
                                                msg_beginning, msg_ending),
                                            'error')

                            # -------- structural modifications and import

                            if okko:

                                reprojection_needed = okko and (srid_src !=
                                                                srid_dst)
                                proj_src, proj_dst = None, None
                                if reprojection_needed:
                                    prefix = 'epsg:'
                                    proj_src, proj_dst = '{}{}'.format(
                                        prefix, srid_src), '{}{}'.format(
                                            prefix, srid_dst)
                                    log('data will be reprojected from SRID {} to {}'
                                        .format(srid_src, srid_dst))

                                # if there are new fields, add them to the table in the database
                                log('adding new fields to table in database')
                                if check_for_node_in_parent(
                                        'new_fields', params):
                                    for nf in params['new_fields']:
                                        the_name = nf['name']
                                        the_type_name, the_type_length = nf[
                                            'type'].split(':')
                                        query = 'ALTER TABLE "{}"."{}" ADD COLUMN {} {}({});'.format(
                                            params['schema'], params['table'],
                                            the_name, the_type_name,
                                            the_type_length)
                                        execute_query(query, conn)

                                # the data is processed as a stream, each feature gets its own "INSERT"
                                for feature in in_data:

                                    # copy the attribute values for each feature
                                    prop, okko = apply_params_to_properties(
                                        params, feature['properties'])
                                    prop[SRID_NAME] = srid_dst
                                    prop[YEAR_NAME] = params['year']
                                    prop[VERSION_NAME] = params['version']

                                    # -------- reprojection
                                    geom = feature['geometry']
                                    prop[GEOMETRY_NAME] = reproject(
                                        geom, proj_src, proj_dst
                                    ) if reprojection_needed else geom

                                    # build the query dynamically
                                    insert_query = build_insert_query(
                                        prop, params)
                                    cur.execute(insert_query)

                        except Exception as e:
                            conn.rollback()
                            okko = False
                            printex(
                                e,
                                'insert query failed into table "{}"'.format(
                                    params['table']))
                        else:
                            conn.commit()
                            remember_this_import(params)

    return okko
Example #27
def load_general_config(filename):
    """
    Charge un fichier de configuration.
    
    :param filename: string le chemin du fichier de configuration à charger 
    """

    with open(filename, 'r') as f:

        all_is_ok = False
        j = json.load(f)
        try:

            # root location to use
            global PLACE_TO_GO

            # working directory
            global WORK_ROOT

            # root of the configuration files storage
            global CONFIG_ROOT

            # root of the downloaded documents storage
            global DOWNLOAD_ROOT

            # root of the converted data storage
            global CONVERSION_ROOT

            # name of the directory for copied data
            global COPIED_DATA_DIRNAME

            # file remembering the imports already performed
            global REMEMBER_IMPORT_FILE_NAME
            global REMEMBER_IMPORT_FILE_EXT
            global REMEMBER_IMPORT_FILENAME
            global REMEMBER_IMPORT_FILE_PATH

            # severity levels
            global SEVERITY_LEVELS

            # verbosity level
            global LOG_LEVEl

            # length of the decoration placed left and right of the logs delimiting the main tasks
            global VISUAL_SECTION_DECORATOR_LENGTH

            # visual separator character used for exceptions in the logs
            global VISUAL_SEPARATOR_CARACTER

            # minimum and maximum length of a short name
            global SHORTNAME_MIN_LENGTH
            global SHORTNAME_MAX_LENGTH

            # extension groups
            global EXTENSION_GROUPS
            global KEY_EXTS
            global MANDATORY_EXT_GROUPS
            global OPTIONAL_EXT_GROUPS
            global INTERESTING_EXTS

            # characteristics of the added fields
            global YEAR_NAME
            global YEAR_TYPE
            global YEAR_LENGTH
            global VERSION_NAME
            global VERSION_TYPE
            global VERSION_LENGTH
            global SRID_NAME
            global SRID_TYPE
            global SRID_LENGTH
            global GEOMETRY_NAME
            global GEOMETRY_TYPE

            # mapping between the types as represented in the python structures generated by fiona and the types within postgresql
            global TO_POSTGRESQL_TYPE

            # database connection parameters
            global DB_HOST
            global DB_PORT
            global DB_NAME
            global DB_USER_NAME
            global DB_USER_PASSWORD

            # path of the dynamic "general" configuration file
            global DYNAMIC_CONFIG_GENERAL_FILE_PATH

            # path of the dynamic "data" configuration file
            global DYNAMIC_CONFIG_DATA_FILE_PATH

            # defines the valid import modes
            global IMPORT_MODES

            # sets the proxies
            global PROXIES

            # value assignment
            PLACE_TO_GO = j['place_to_go']
            WORK_ROOT = join(PLACE_TO_GO, j['work_root_name'])
            CONFIG_ROOT = join(WORK_ROOT, j['config_root_name'])
            DOWNLOAD_ROOT = join(WORK_ROOT, j['download_root_name'])
            CONVERSION_ROOT = join(WORK_ROOT, j['conversion_root_name'])
            COPIED_DATA_DIRNAME = join(j['copied_data_dirname'])
            REMEMBER_IMPORT_FILE_NAME = j['remember_import_file_name']
            REMEMBER_IMPORT_FILE_EXT = j['remember_import_file_ext']
            REMEMBER_IMPORT_FILENAME = '{}{}{}'.format(
                REMEMBER_IMPORT_FILE_NAME, extsep, REMEMBER_IMPORT_FILE_EXT)
            REMEMBER_IMPORT_FILE_PATH = join(CONFIG_ROOT,
                                             REMEMBER_IMPORT_FILENAME)
            SEVERITY_LEVELS = j['severity_levels']
            LOG_LEVEl = j['log_level']
            VISUAL_SECTION_DECORATOR_LENGTH = j[
                'visual_section_decorator_length']
            VISUAL_SEPARATOR_CARACTER = j['visual_separator_caracter']
            SHORTNAME_MIN_LENGTH = j['shortname_min_length']
            SHORTNAME_MAX_LENGTH = j['shortname_max_length']
            EXTENSION_GROUPS = j['extension_groups']
            KEY_EXTS = [gg[0][0] for gg in EXTENSION_GROUPS]
            MANDATORY_EXT_GROUPS = [
                g for gg in EXTENSION_GROUPS for g in gg[0]
            ]
            OPTIONAL_EXT_GROUPS = [g for gg in EXTENSION_GROUPS for g in gg[1]]
            INTERESTING_EXTS = MANDATORY_EXT_GROUPS + OPTIONAL_EXT_GROUPS
            YEAR_NAME = j['year_name']
            YEAR_TYPE = j['year_type']
            YEAR_LENGTH = j['year_length']
            VERSION_NAME = j['version_name']
            VERSION_TYPE = j['version_type']
            VERSION_LENGTH = j['version_length']
            SRID_NAME = j['srid_name']
            SRID_TYPE = j['srid_type']
            SRID_LENGTH = j['srid_length']
            GEOMETRY_NAME = j['geometry_name']
            GEOMETRY_TYPE = j['geometry_type']
            TO_POSTGRESQL_TYPE = j['types_map']
            DB_HOST = j['database_host']
            DB_PORT = j['database_port']
            DB_NAME = j['database_name']
            DB_USER_NAME = j['database_user_name']
            DB_USER_PASSWORD = j['database_user_password']
            IMPORT_MODES = j['import_modes']
            PROXIES = j['proxies']

            DYNAMIC_CONFIG_GENERAL_FILE_PATH = join(CONFIG_ROOT,
                                                    CONFIG_GENERAL_FILENAME)
            DYNAMIC_CONFIG_DATA_FILE_PATH = join(CONFIG_ROOT,
                                                 CONFIG_DATA_FILENAME)

        except KeyError as e:
            printex(
                e, 'incorrect or missing node in "{}" (bad file structure)'.
                format(filename))
        except Exception as e:
            printex(e)
        else:
            all_is_ok = True
            log('general purpose configuration file "{}" correctly loaded'.
                format(filename))

        # any unknown problem: crash, since the configs will be needed anyway to work correctly
        if not all_is_ok:
            log(
                'can\'t correctly load the permanent general configuration file (see the reason above). exiting',
                'error')
            exit(1)
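# A hedged design note: the long list of globals above can be avoided by
# keeping the settings in a single mapping returned to the caller. A minimal
# sketch; the file name in the usage lines is hypothetical.
import json

def load_config(filename):
    # return the parsed settings instead of mutating module globals
    with open(filename, 'r') as f:
        return json.load(f)

# usage sketch:
# CONFIG = load_config('config_general.json')
# work_root = join(CONFIG['place_to_go'], CONFIG['work_root_name'])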