Example 1
def unzip(directory, extension, raw_folder, preprocessing_folder):
    """
        Extracts zip- and 7z-compressed files found under raw_folder into the matching directory under preprocessing_folder
        
        Args:
            directory (str): the folder containing the files you want to unzip
            extension (str): a single extension string, either 'zip' or '7z'
            raw_folder (str): name of the folder containing raw data
            preprocessing_folder (str): name of the preprocessing folder
    """

    files = list_files(directory, [extension])
    for file_name in files:
        target_directory = os.path.dirname(
            file_name.replace(raw_folder, preprocessing_folder))
        if not os.path.exists(target_directory):
            os.makedirs(target_directory)
            logger.debug('Making directory: %s' % target_directory)
        if extension in ['zip']:
            subprocess.check_output(
                ['unzip', file_name, '-d', target_directory])
        elif extension in ['7z']:
            subprocess.check_output(
                ['7z', 'e', file_name,
                 '-o%s' % target_directory])
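
These excerpts share module-level context the page does not show: import os, import subprocess, a configured logger, the SCRIPTS_DIR and BASE_DIR constants, and a list_files(directory, extensions) helper. A minimal sketch of what list_files presumably does, assuming it recursively collects paths matching the given extensions (this implementation is a guess, not the original):

import os

def list_files(directory, extensions):
    # Hypothetical stand-in for the helper the examples rely on: walk
    # `directory` and collect paths whose names end with one of the
    # given extension strings. The real implementation may differ.
    matches = []
    for root, _dirs, names in os.walk(directory):
        for name in names:
            if any(name.endswith(ext) for ext in extensions):
                matches.append(os.path.join(root, name))
    return matches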
Example 2
def split_csv(directory, auto_split=1, threshold_row=25):
    """
        Iterates over csv files in a directory, splitting each into a header file and a body file (unless those files already exist).
        Uses 'header_split.sh'. If auto_split=1, 'header_split.sh' tries to detect the header row automatically, taking the first row with at most threshold_row empty cells.
        If auto_split=0, the split happens at the fixed row threshold_row.
        
        Args:
            directory (str): the folder containing the files you want to split
            auto_split (int): 1 for automatic splitting, 0 for fixed split.
            threshold_row (int): either the maximum number of empty cells tolerated in the header row (if auto_split=1) or the row at which to split the csv (if auto_split=0)
    """

    files = list_files(directory, ['csv'])
    for file_name in files:
        # Skip files that are themselves split outputs, and skip files
        # whose head/body outputs already exist.
        if file_name.endswith(('_body.csv', '_head.csv')):
            continue
        base = file_name[:-4]
        if os.path.exists(base + '_body.csv') or os.path.exists(
                base + '_head.csv'):
            continue
        subprocess.check_output([
            'bash', SCRIPTS_DIR + '/header_split.sh', base,
            str(auto_split),
            str(threshold_row)
        ])
        logger.debug('Split csv %s' % file_name)
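
A hedged usage sketch for split_csv; the directory path is illustrative only, and 'header_split.sh' must exist under SCRIPTS_DIR:

# Auto-detect the header row, tolerating up to 2 empty header cells.
split_csv('/data/preprocessing/folder_1', auto_split=1, threshold_row=2)
# Or split every csv at a fixed row 3.
split_csv('/data/preprocessing/folder_1', auto_split=0, threshold_row=3)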
Example 3
def extract(directory, extensions, raw_folder, preprocessing_folder):
    """
        Finds all files with the given extensions inside a directory and converts them into caret-separated value files.
        During conversion, files move from their initial raw_folder to the preprocessing_folder, creating the necessary folder structure; conversion relies on 'xls_csv.sh', 'sav_csv.sh' and 'csv_csv.sh'.
        Each .csv file is cleaned by removing newlines, stripping trailing and leading spaces, and collapsing consecutive spaces into one.
        Finally, 'csv_health.sh' checks each file to verify that all rows have the same number of fields.
        
        Args:
            directory (str): the folder containing the files you want to convert to csv
            extensions (list): iterable of extension strings
            raw_folder (str): name of the folder containing raw data
            preprocessing_folder (str): name of the preprocessing folder
    """
    for extension in extensions:
        files = list_files(directory, [extension])
        for input_file in files:
            output_file = input_file.replace(raw_folder, preprocessing_folder)
            # Swap only the extension, not an accidental match elsewhere
            # in the path.
            output_file = os.path.splitext(output_file)[0] + '.csv'

            try:
                if not os.path.exists(output_file):
                    logger.debug('Output file: %s' % output_file)
                    dirname = os.path.dirname(output_file)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                        logger.debug('Making directory: %s' % dirname)
                    if extension in ['xls', 'xlsx']:
                        subprocess.check_output([
                            'bash', SCRIPTS_DIR + '/xls_csv.sh', input_file,
                            output_file
                        ])
                        logger.debug('Converted xls to csv')
                    elif extension in ['sav']:
                        subprocess.check_output([
                            'bash', SCRIPTS_DIR + '/sav_csv.sh', input_file,
                            output_file
                        ])
                        logger.debug('Converted sav to csv')
                    elif extension in ['csv']:
                        subprocess.check_output([
                            'bash', SCRIPTS_DIR + '/csv_csv.sh', input_file,
                            output_file
                        ])
                        logger.debug('Converted csv to csv')
                    else:
                        logger.warning('Cannot do anything with %s extension' %
                                       extension)

                    healthy = subprocess.check_output(
                        ['bash', SCRIPTS_DIR + '/csv_health.sh', output_file])
                    if not healthy:
                        logger.warning('File is unhealthy: %s' % output_file)
            except Exception as e:
                logger.warning('File %s failed because %s' % (output_file, e))
                if os.path.exists(output_file):
                    os.remove(output_file)  # drop the partial output
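
A usage sketch for extract, with illustrative arguments; the path and extension list are assumptions, and raw_folder/preprocessing_folder must match the path components the function rewrites:

# Hypothetical invocation mirroring how Example 6 drives extract().
extract('/data/raw/folder_1', ['xls', 'xlsx', 'sav', 'csv'],
        'raw', 'preprocessing')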
Example 4
def check_csv_health(directory):
    """
        Iterates over a directory, finding csv files and using 'csv_health.sh' to check that each file has the same number of fields in every row
        
        Args:
            directory (str): the folder containing the files you want to check
    """

    files = list_files(directory, ['csv'])
    for input_file in files:
        print(
            input_file,
            subprocess.check_output(
                ['bash', SCRIPTS_DIR + '/csv_health.sh', input_file]))
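
'csv_health.sh' itself is not shown. Assuming it checks that every row has the same field count as the first row, a rough pure-Python equivalent would be:

import csv

def csv_is_healthy(path, delimiter=','):
    # Assumed semantics of csv_health.sh: all rows must have the same
    # number of fields as the first row.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter=delimiter)
        try:
            width = len(next(reader))
        except StopIteration:
            return True  # an empty file cannot be inconsistent
        return all(len(row) == width for row in reader)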
Example 5
def main(folder_number):
    """
    There are three steps in this script:
    1. Go into the folder <pre_dir/folder_number> and iterate over all the .csv files in it (delimited by either ',' or '^').
    2. For each file, upload the _body.csv to the `raw` schema, with column names a0, a1, a2, ... for folder 1; b0, b1, b2, ... for folder 2; and so on.
    3. For each file, read the _head.csv and map the new column names (a0, a1, b0, c0, ...) back to the original column names (anio, status, departamento, ...).
    """

    ## Declaring which directory to process.
    ## This is taken from the yaml config file 'config.yaml'.
    with open(BASE_DIR + '/mined/ETL/config.yaml') as config_file:
        yml_dict = yaml.safe_load(config_file)

    ## variables for retrieving files
    data_parent_dir = os.path.join(yml_dict['base_dir'], yml_dict['pre_dir'])
    data_dir = yml_dict['folder_dictionary'][folder_number]['folder']
    working_dir = os.path.join(data_parent_dir, data_dir)
    ext = yml_dict['extension']
    comma_separated = yml_dict['folder_dictionary']['comma_separated']
    schema = yml_dict['schema_name']
    delim = ',' if folder_number in comma_separated else '^'

    ## Creating connection to postgres
    logger.info('Connecting to postgres')
    connection, cursor = create_pgconn()
    cursor.execute("set role el_salvador_mined_education_write")

    # START! Retrieve files from specified directory in datos
    logger.info('Retrieving body csv files')
    csv_files = list_files(working_dir, [ext])

    if not csv_files:
        logger.error('No files retrieved. Exiting program')
        return

    logger.debug('Files retrieved: {}'.format(csv_files))

    for csv_f in csv_files:
        # Best-effort cleanup: drop characters that cannot be encoded
        # as latin-1 before treating the path as utf-8.
        file_path_bytes = csv_f.encode('latin-1', 'ignore')
        file_path_str = file_path_bytes.decode('utf-8')

        relative_path = file_path_str.replace(
            data_parent_dir, '')  # path relative to data_parent_dir
        split_path = relative_path.split('/')
        table_name = str(folder_number) + '_' + '_'.join(
            split_path[1:]).replace(ext, '')
        table_name_with_schema = schema + '"' + table_name + '"'

        logger.info('Removing column mapping data')
        cursor.execute(
            f"""select distinct(table_name) from raw.column_mapping where table_name ~ '{schema}"{folder_number}_*'"""
        )
        # fetchall() returns a list of 1-tuples, one per table_name
        fetch = cursor.fetchall()
        column_mapping_column_list = [
            string for tup in fetch for string in tup
        ]
        if table_name_with_schema in column_mapping_column_list:
            cursor.execute(
                f"""delete from raw.column_mapping where table_name = '{table_name_with_schema}'"""
            )

        # TODO: check if base_filename is > 63 characters due to Postgres tablename limit

        logger.info('Starting on file: {}'.format(csv_f.split('/')[-1]))
        logger.info('Table name will be: {}'.format(table_name_with_schema))

        ## Generate CREATE TABLE STATEMENT
        logger.info('Generating create table statement for table {}'.format(
            table_name_with_schema))
        csv_path_head = csv_f.replace(ext, '_head.csv')
        headers_list = list(read_csv(csv_path_head, sep=delim).columns)
        headers_list = [h.lower().replace("'", "") for h in headers_list]
        col_letter = chr(96 + folder_number)  # 1 -> 'a', 2 -> 'b', ...
        new_col_header = [
            col_letter + str(head) for head in range(len(headers_list))
        ]
        drop_table_query = f"""DROP TABLE IF EXISTS {table_name_with_schema};"""
        create_table_query = f"""CREATE TABLE {table_name_with_schema} ({' VARCHAR, '.join(new_col_header) + ' VARCHAR'});"""
        logger.debug(create_table_query)
        column_mapping_data = zip(headers_list, new_col_header,
                                  [table_name_with_schema] * len(headers_list))

        ## Creating table.
        logger.info('Dropping table {}'.format(table_name_with_schema))
        cursor.execute(drop_table_query)
        logger.debug('Drop table execution message: {}'.format(
            cursor.statusmessage))
        logger.info('Executing create table statement {}'.format(
            table_name_with_schema))
        cursor.execute(create_table_query)
        logger.debug('Create table execution message: {}'.format(
            cursor.statusmessage))

        ## Inserting data
        logger.info('Inserting into table')
        with open(csv_f, 'r') as csv_file:
            # delim was resolved above; Postgres defaults QUOTE to '"'
            # for CSV, so one COPY statement covers both delimiters.
            cursor.copy_expert(
                f"""COPY {table_name_with_schema} FROM STDIN WITH CSV DELIMITER '{delim}' QUOTE '"' NULL '' """,
                csv_file)

            cursor.execute(
                f"""select count(*) from {table_name_with_schema};""")
            logger.info('Count on table {}: {}'.format(
                table_name_with_schema, cursor.fetchone()))

        ## Inserting column mapping data
        logger.info('Inserting column mapping data into col_mapping table')
        #TODO: should the tablename and col names in table be hard coded?
        for data in column_mapping_data:
            insert_query = f"""INSERT INTO raw.column_mapping(original_col_name, mapped_col_name, table_name) VALUES ('{data[0]}', '{data[1]}', '{data[2]}' );"""
            cursor.execute(insert_query)
        connection.commit()
    connection.close()

    logger.info(f'{data_dir} folder is completed')
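
create_pgconn is imported from elsewhere in the repository. A plausible sketch using psycopg2; the connection parameters and environment variable names are assumptions:

import os
import psycopg2

def create_pgconn():
    # Hypothetical stand-in: open a Postgres connection and return the
    # (connection, cursor) pair that main() unpacks.
    connection = psycopg2.connect(
        host=os.environ.get('PGHOST', 'localhost'),
        dbname=os.environ.get('PGDATABASE', 'el_salvador_mined'),
        user=os.environ.get('PGUSER'),
        password=os.environ.get('PGPASSWORD'),
    )
    return connection, connection.cursor()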
Example 6
    folder_num = int(sys.argv[1])
    folder_dict = config['folder_dictionary'][folder_num]
    raw_path = config['base_dir'] + config['raw_dir']

    full_path = raw_path + folder_dict['folder']
    logger.debug(
        f"Extracting {folder_dict['file_types']} from {config['raw_dir']}")
    extract(full_path, folder_dict['file_types'], config['raw_dir'],
            config['pre_dir'])

    logger.debug(f"Splitting header and body")
    if folder_dict['header_split'] == 'auto':
        split_csv(full_path.replace(config['raw_dir'], config['pre_dir']))
    else:
        split_csv(full_path.replace(config['raw_dir'], config['pre_dir']), 0,
                  folder_dict['header_split'])
    # FOLDER 1
    if folder_num == 1:
        for file in list_files(
                full_path.replace(config['raw_dir'], config['pre_dir']),
                ['csv']):
            if '_-_' in file:
                os.remove(file)
    # FOLDER 8
    if folder_num == 8:
        for file in list_files(
                full_path.replace(config['raw_dir'], config['pre_dir']),
                ['csv']):
            if '~$' in file:
                os.remove(file)
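
This snippet reads everything from the ETL yaml config. A sketch of the structure it implies, written as the equivalent Python dict; the keys are inferred from the lookups above and in Example 5, while the concrete values are illustrative guesses:

config = {
    'base_dir': '/data/',
    'raw_dir': 'raw/',
    'pre_dir': 'preprocessing/',
    'extension': 'csv',          # value is a guess; see Example 5
    'schema_name': 'raw.',       # guess: Example 5 concatenates it directly
    'folder_dictionary': {
        'comma_separated': [1, 8],   # folders whose csvs use ',' not '^'
        1: {'folder': 'folder_1/', 'file_types': ['xls', 'csv'],
            'header_split': 'auto'},
        8: {'folder': 'folder_8/', 'file_types': ['xlsx'],
            'header_split': 3},
    },
}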