def unzip(directory, extension, raw_folder, preprocessing_folder):
    """
    Unzip zip and 7z compressed files, placing the extracted contents in
    the mirrored path under the preprocessing folder.

    Args:
        directory (str): the folder containing the files you want to unzip
        extension (str): a single extension string, either 'zip' or '7z'
            (the original docstring claimed an iterable, but the code
            treats this as one extension)
        raw_folder (str): name of the folder containing raw data
        preprocessing_folder (str): name of the preprocessing folder
    """
    files = list_files(directory, [extension])
    for file_name in files:
        # Mirror the raw folder structure under the preprocessing folder.
        target_directory = os.path.dirname(
            file_name.replace(raw_folder, preprocessing_folder))
        if not os.path.exists(target_directory):
            os.makedirs(target_directory)
            logger.debug('Making directory: %s' % target_directory)
        if extension == 'zip':
            subprocess.check_output(
                ['unzip', file_name, '-d', target_directory])
        elif extension == '7z':
            # NOTE(review): '7z e' flattens directory structure; '7z x'
            # would preserve it -- confirm flattening is intended.
            subprocess.check_output(
                ['7z', 'e', file_name, '-o%s' % target_directory])
def split_csv(directory, auto_split=1, threshold_row=25):
    """
    Split each csv file in a directory into a header file and a body file.

    Delegates the actual splitting to 'header_split.sh'. Files that are
    themselves '_head.csv'/'_body.csv' outputs, or whose split outputs
    already exist, are skipped.

    Args:
        directory (str): the folder containing the files you want to split
        auto_split (int): 1 for automatic header detection, 0 for a fixed
            split row
        threshold_row (int): either the number of empty cells tolerated in
            the header row (auto_split=1) or the row at which to split the
            csv (auto_split=0)
    """
    for path in list_files(directory, ['csv']):
        # Skip files that are themselves split outputs.
        if path.endswith('_body.csv') or path.endswith('_head.csv'):
            continue
        stem = path[:-4]
        # Skip files that have already been split.
        if os.path.exists(stem + '_body.csv') or os.path.exists(
                stem + '_head.csv'):
            continue
        subprocess.check_output([
            'bash', SCRIPTS_DIR + '/header_split.sh', stem,
            str(auto_split),
            str(threshold_row)
        ])
        logger.debug('Split csv %s' % path)
def extract(directory, extensions, raw_folder, preprocessing_folder):
    """
    Convert all files with the given extensions under a directory into
    carrot (^) separated csv files.

    Converted files are moved from their raw_folder into the
    preprocessing_folder, creating the folder structure as needed.
    Conversion is delegated to 'xls_csv.sh', 'sav_csv.sh' and 'csv_csv.sh'
    (which also strip newlines, trailing/leading spaces and collapse runs
    of spaces). Each output is then checked with 'csv_health.sh' to
    confirm all rows have the same number of fields.

    Args:
        directory (str): the folder containing the files you want to
            convert to csv
        extensions (lst): iterable of extension strings
        raw_folder (str): name of the folder containing raw data
        preprocessing_folder (str): name of the preprocessing folder
    """
    for extension in extensions:
        files = list_files(directory, [extension])
        for input_file in files:
            output_file = input_file.replace(raw_folder, preprocessing_folder)
            # Only rewrite the trailing extension. The previous
            # `replace(extension, 'csv')` replaced the FIRST occurrence
            # anywhere in the path, corrupting paths whose directory names
            # contain the extension string.
            if output_file.endswith(extension):
                output_file = output_file[:-len(extension)] + 'csv'
            try:
                if not os.path.exists(output_file):
                    logger.debug('Output file: %s' % output_file)
                    dirname = os.path.dirname(output_file)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                        logger.debug('Making directory: %s' % dirname)
                    if extension in ['xls', 'xlsx']:
                        subprocess.check_output([
                            'bash', SCRIPTS_DIR + '/xls_csv.sh', input_file,
                            output_file
                        ])
                        logger.debug('Converted xls to csv')
                    elif extension in ['sav']:
                        subprocess.check_output([
                            'bash', SCRIPTS_DIR + '/sav_csv.sh', input_file,
                            output_file
                        ])
                        logger.debug('Converted sav to csv')
                    elif extension in ['csv']:
                        subprocess.check_output([
                            'bash', SCRIPTS_DIR + '/csv_csv.sh', input_file,
                            output_file
                        ])
                        logger.debug('Converted csv to csv')
                    else:
                        logger.warn('Cannot do anything with %s extension' %
                                    extension)
                    healthy = subprocess.check_output(
                        ['bash', SCRIPTS_DIR + '/csv_health.sh', output_file])
                    if not healthy:
                        logger.warn('File is unhealthy: %s' % output_file)
            except Exception as e:
                # Best-effort pipeline: log the failure and remove the
                # partial output so a later re-run retries this file.
                logger.warn('File %s failed because %s' % (output_file, e))
                if os.path.exists(output_file):
                    # Direct removal instead of shelling out to `rm`.
                    os.remove(output_file)
def check_csv_health(directory):
    """
    Verify that every csv file in a directory has a consistent number of
    fields per row, using 'csv_health.sh', printing each file's result.

    Args:
        directory (str): the folder containing the files you want to check
    """
    health_script = SCRIPTS_DIR + '/csv_health.sh'
    for csv_path in list_files(directory, ['csv']):
        report = subprocess.check_output(['bash', health_script, csv_path])
        print(csv_path, report)
def main(folder_number):
    """
    Load one preprocessed folder of csv files into postgres.

    Three steps:
    1. Goes into the folder <pre_dir/folder_number> and iterates over all
       the .csv files in them (either , or ^ delimiter).
    2. For each file, uploads the _body.csv to the `raw` schema, with
       column names a0,a1,a2... for folder 1; b0,b1,b2... for folder 2; etc.
    3. For each file, reads the _head.csv and maps the generated column
       names (a0, b0, ...) to the original column names via the
       raw.column_mapping table.

    Args:
        folder_number (int): index of the folder in the yaml config's
            'folder_dictionary'
    """
    ## Configuration is taken from the project yaml config file.
    with open(BASE_DIR + '/mined/ETL/config.yaml') as config_file:
        # safe_load: config is plain data; plain yaml.load without a Loader
        # is deprecated (PyYAML >= 5.1) and an error in PyYAML >= 6.
        yml_dict = yaml.safe_load(config_file)

    ## Variables for retrieving files
    data_parent_dir = os.path.join(yml_dict['base_dir'], yml_dict['pre_dir'])
    data_dir = yml_dict['folder_dictionary'][folder_number]['folder']
    working_dir = os.path.join(data_parent_dir, data_dir)
    ext = yml_dict['extension']
    schema = yml_dict['schema_name']
    comma_separated = yml_dict['folder_dictionary']['comma_separated']
    delim = ',' if folder_number in comma_separated else '^'

    ## Creating connection to postgres
    logger.info('Connecting to postgres')
    connection, cursor = create_pgconn()
    cursor.execute("set role el_salvador_mined_education_write")

    # START! Retrieve files from specified directory in datos
    logger.info('Retrieving body csv files')
    csv_files = list_files(working_dir, [ext])
    if not csv_files:
        logger.debug('ERROR: no files retrieved. Exiting program')
        # The message promised an exit but the original fell through;
        # bail out so later code doesn't run on an empty file list.
        return
    logger.debug('Files retrieved: {}'.format(csv_files))

    for csv_f in csv_files:
        # Normalize possibly mixed-encoding paths (best effort).
        file_path_str = csv_f.encode('latin-1', 'ignore').decode('utf-8')
        relative_path = file_path_str.replace(
            data_parent_dir, '')  # Retrieves up to 'data_parent_dir'
        split_path = relative_path.split('/')
        joined_name = '_'.join(split_path[1:])
        # Strip only the trailing extension (plain .replace(ext, '') could
        # also clobber an occurrence of ext in the middle of the name).
        if joined_name.endswith(ext):
            joined_name = joined_name[:-len(ext)]
        table_name = str(folder_number) + '_' + joined_name
        table_name_with_schema = schema + '"' + table_name + '"'

        logger.info('Removing column mapping data')
        # NOTE(review): '_*' in a regex matches zero-or-more underscores,
        # not "anything"; presumably a prefix match was intended -- confirm.
        cursor.execute(
            """select distinct(table_name) from raw.column_mapping
               where table_name ~ %s""",
            (f'{schema}"{folder_number}_*', ))
        # fetchall() returns a list of 1-tuples; flatten it.
        fetch = cursor.fetchall()
        column_mapping_column_list = [
            string for tup in fetch for string in tup
        ]
        if table_name_with_schema in column_mapping_column_list:
            # Parameterized to avoid quoting/injection issues.
            cursor.execute(
                """delete from raw.column_mapping where table_name = %s""",
                (table_name_with_schema, ))

        # TODO: check if base_filename is > 63 characters due to Postgres
        # tablename limit
        logger.info('Starting on file: {}'.format(csv_f.split('/')[-1]))
        logger.info('Table name will be: {}'.format(table_name_with_schema))

        ## Generate CREATE TABLE statement
        logger.info('Generating create table statement for table {}'.format(
            table_name_with_schema))
        # Suffix-only replacement, as above.
        if csv_f.endswith(ext):
            csv_path_head = csv_f[:-len(ext)] + '_head.csv'
        else:
            csv_path_head = csv_f.replace(ext, '_head.csv')
        headers_list = list(read_csv(csv_path_head, sep=delim).columns)
        headers_list = [h.lower().replace("'", "") for h in headers_list]
        col_letter = chr(96 + folder_number)  # 1 -> 'a', 2 -> 'b', ...
        new_col_header = [
            col_letter + str(head) for head in range(len(headers_list))
        ]
        # Identifiers can't be bound as query parameters; they are built
        # from config + filenames, not user input.
        drop_table_query = f"""DROP TABLE IF EXISTS {table_name_with_schema};"""
        create_table_query = f"""CREATE TABLE {table_name_with_schema} ({' VARCHAR, '.join(new_col_header) + ' VARCHAR'});"""
        logger.debug(create_table_query)
        column_mapping_data = zip(headers_list, new_col_header,
                                  [table_name_with_schema] * len(headers_list))

        ## Creating table.
        logger.info('Dropping table {}'.format(table_name_with_schema))
        cursor.execute(drop_table_query)
        # Fixed: format string was '()' (missing braces), always printing '()'.
        logger.debug('Drop table execution message: {}'.format(
            cursor.statusmessage))
        logger.info('Executing create table statement {}'.format(
            table_name_with_schema))
        cursor.execute(create_table_query)
        # Fixed: was logging.debug (root logger) instead of module logger.
        logger.debug('Create table execution message: {}'.format(
            cursor.statusmessage))

        ## Inserting data
        logger.info('Inserting into table')
        with open(csv_f, 'r') as csv_file:
            if folder_number in comma_separated:
                cursor.copy_expert(
                    f"""COPY {table_name_with_schema} FROM STDIN WITH CSV DELIMITER ',' QUOTE '"' NULL '' """,
                    csv_file)
            else:
                cursor.copy_expert(
                    f"""COPY {table_name_with_schema} FROM STDIN WITH CSV DELIMITER '^' NULL '' """,
                    csv_file)
        cursor.execute(f"""select count(*) from {table_name_with_schema};""")
        # Fixed: was logging.info (root logger) instead of module logger.
        logger.info('Count on table {}: {}'.format(table_name_with_schema,
                                                   cursor.fetchone()))

        ## Inserting column mapping data
        logger.info('Inserting column mapping data into col_mapping table')
        # TODO: should the tablename and col names in table be hard coded?
        for original_col, mapped_col, tbl in column_mapping_data:
            # Parameterized values instead of f-string interpolation.
            cursor.execute(
                """INSERT INTO raw.column_mapping(original_col_name,
                       mapped_col_name, table_name)
                   VALUES (%s, %s, %s);""",
                (original_col, mapped_col, tbl))

    connection.commit()
    connection.close()
    logger.info(f'{data_dir} folder is completed')
# Script entry: extract and split one raw data folder, selected by the
# first command-line argument.
folder_num = int(sys.argv[1])
folder_dict = config['folder_dictionary'][folder_num]
raw_path = config['base_dir'] + config['raw_dir']
full_path = raw_path + folder_dict['folder']
logger.debug(
    f"Extracting {folder_dict['file_types']} from {config['pre_dir']}")
extract(full_path, folder_dict['file_types'], config['raw_dir'],
        config['pre_dir'])
logger.debug("Splitting header and body")  # was a pointless f-string
pre_path = full_path.replace(config['raw_dir'], config['pre_dir'])
if folder_dict['header_split'] == 'auto':
    split_csv(pre_path)
else:
    # Fixed split: header_split holds the row number to split at.
    split_csv(pre_path, 0, folder_dict['header_split'])

# FOLDER 1: remove files whose names contain '_-_'.
if '1' in sys.argv:
    for file in list_files(pre_path, ['csv']):
        if '_-_' in file:
            os.remove(file)

# FOLDER 8: remove Office temp/lock files (names containing '~$').
if '8' in sys.argv:
    # Fixed: extensions were passed as bare 'csv' instead of ['csv'],
    # inconsistent with every other list_files call site.
    for file in list_files(pre_path, ['csv']):
        # Fixed: "'~$' in file in file" was a chained-comparison typo;
        # the `file in file` term was always True.
        if '~$' in file:
            os.remove(file)