def _get_file_encoding(self, csv_file):
    csv_file_name = os.path.basename(csv_file)
    csv_file_full_path = os.path.realpath(csv_file)
    csv_file_dir = os.path.dirname(csv_file_full_path)
    encoding_file = os.path.join(csv_file_dir, self.csv_files_encoding_file)

    if os.path.isfile(encoding_file):
        enc_file = pd.read_table(encoding_file, index_col=0, header=None,
                                 delim_whitespace=True, squeeze=True)

        if not enc_file.index.is_unique:
            logger.error(
                f'{self.csv_files_encoding_file} has duplicated file names. Not using the file'
            )
            return self.csv_files_encoding

        if csv_file_name in enc_file.index:
            file_encoding = enc_file.at[csv_file_name]
            logger.info(
                f'Encoding found in {self.csv_files_encoding_file}: {file_encoding}'
            )
            return file_encoding
    else:
        logger.warning(
            f'No {self.csv_files_encoding_file} found, assuming {self.csv_files_encoding}'
        )

    return self.csv_files_encoding
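
# Illustrative only: based on how the method above parses it, the encodings
# sidecar file is expected to be a two-column, whitespace-separated table
# mapping each CSV file name to its encoding. The file names and encodings
# below are made up.
def _example_encodings_file_content():
    # first column: CSV file name (used as the index), second column: encoding
    return (
        'ukb00001.csv latin1\n'
        'ukb00002.csv utf-8\n'
    )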

def _load_bgen_samples(self):
    if self.bgen_sample_file is None or not os.path.isfile(self.bgen_sample_file):
        logger.warning('BGEN sample file not set or does not exist: {}'.format(
            self.bgen_sample_file))
        return

    logger.info('Loading BGEN sample file: {}'.format(self.bgen_sample_file))

    create_table(
        BGEN_SAMPLES_TABLE,
        columns=[
            'index bigint NOT NULL',
            'eid bigint NOT NULL',
        ],
        constraints=[
            'pk_{} PRIMARY KEY (index, eid)'.format(BGEN_SAMPLES_TABLE)
        ],
        db_engine=self._get_db_engine())

    samples_data = pd.read_table(self.bgen_sample_file, sep=' ', header=0,
                                 usecols=['ID_1', 'ID_2'], skiprows=[1])
    samples_data.set_index(np.arange(1, samples_data.shape[0] + 1), inplace=True)
    samples_data.drop('ID_2', axis=1, inplace=True)
    samples_data.rename(columns={'ID_1': 'eid'}, inplace=True)

    samples_data.to_sql(BGEN_SAMPLES_TABLE, self._get_db_engine(), if_exists='append')
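
# Illustrative only: the read_table call above assumes the Oxford/BGEN .sample
# layout, i.e. a header row containing ID_1 and ID_2, a second row with column
# type codes (skipped via skiprows=[1]), then one row per sample. The IDs and
# the extra 'missing' column below are made up for illustration.
def _example_bgen_sample_file_content():
    return (
        'ID_1 ID_2 missing\n'
        '0 0 0\n'
        '1000001 1000001 0\n'
        '1000002 1000002 0\n'
    )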

def _run_bgenix(self, arguments):
    random_bgen_file = get_temp_file_name('.bgen', tmpdir=get_tmpdir(self.tmpdir))

    with open(random_bgen_file, 'br+') as bgen_file:
        full_command = [self._get_bgenix_path()] + arguments
        logger.info(f'Running: {full_command}')

        run_status = subprocess.run(full_command, stdout=bgen_file,
                                    stderr=subprocess.PIPE)

        if run_status.returncode != 0:
            message = f'bgenix failed: {" ".join(run_status.args)}'
            output = run_status.stderr.decode()

            e = UkbRestProgramExecutionError(
                message,
                output,
            )

            logger.debug(output)
            raise e

    return random_bgen_file

def _vacuum(self, table_name=None):
    logger.info('Vacuuming')

    # vacuum cannot run inside a transaction block, hence AUTOCOMMIT
    vacuum_sql = 'vacuum analyze'
    if table_name is not None:
        vacuum_sql += ' ' + table_name

    with self._get_db_engine().connect().execution_options(
            isolation_level="AUTOCOMMIT") as conn:
        conn.execute(vacuum_sql + ';')

def _load_events(self):
    if self.db_type == 'sqlite':
        logger.warning('Events loading is not supported in SQLite')
        return

    logger.info('Loading events table')

    # create table
    db_engine = self._get_db_engine()

    create_table(
        'events',
        columns=[
            'eid bigint NOT NULL',
            'field_id integer NOT NULL',
            'instance integer NOT NULL',
            'event text NOT NULL',
        ],
        constraints=[
            'pk_events PRIMARY KEY (eid, field_id, instance, event)'
        ],
        db_engine=db_engine)

    # insert data of categorical multiple fields
    categorical_variables = pd.read_sql(
        """
        select column_name, field_id, inst, table_name
        from fields
        where type = 'Categorical (multiple)'
        """, self._get_db_engine())

    for (field_id, field_instance), field_data in categorical_variables.groupby(
            by=['field_id', 'inst']):
        sql_st = """
            insert into events (eid, field_id, instance, event)
            (
                select distinct *
                from (
                    select eid, {field_id}, {field_instance}, unnest(array[{field_columns}]) as event
                    from {tables}
                ) t
                where t.event is not null
            )
        """.format(
            field_id=field_id,
            field_instance=field_instance,
            field_columns=', '.join([cn for cn in set(field_data['column_name'])]),
            tables=self._create_joins(list(set(field_data['table_name'])),
                                      join_type='inner join'),
        )

        with db_engine.connect() as con:
            con.execute(sql_st)
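
# Illustrative sketch (not used by the loader): the SQL above pivots the
# array-like columns of a 'Categorical (multiple)' field into one (eid, event)
# row per non-null value. A rough pandas equivalent of that unnest step, using
# made-up column names, would be:
def _events_long_format_example(wide_df):
    # wide_df: columns 'eid' plus one column per array index of the field,
    # e.g. ['eid', 'c100_0_0', 'c100_0_1'] (names are hypothetical)
    long_df = wide_df.melt(id_vars='eid', value_name='event')
    return long_df.dropna(subset=['event']).drop(columns='variable')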

def _load_csv(self):
    logger.info('Loading CSV files into database')

    if self.db_type != 'sqlite':
        self._close_db_engine()

        # parallel csv loading is only supported in databases other than sqlite
        Parallel(n_jobs=self.loading_n_jobs)(
            delayed(self._load_single_csv)(table_name, file_path)
            for table_name, file_path in self.table_csvs)
    else:
        for table_name, file_path in self.table_csvs:
            self._load_single_csv(table_name, file_path)

def _create_temporary_csvs(self, csv_file, csv_file_idx):
    logger.info('Writing temporary CSV files')

    self._close_db_engine()

    self.table_csvs = Parallel(n_jobs=self.loading_n_jobs)(
        delayed(self._save_column_range)(csv_file, csv_file_idx,
                                         column_names_idx, column_names)
        for column_names_idx, column_names in self._loading_tmp['chunked_column_names'])

    self.table_list.update(table_name for table_name, file_path in self.table_csvs)

def _load_single_csv(self, table_name, file_path):
    logger.info('{} -> {}'.format(file_path, table_name))

    if self.db_type == 'sqlite':
        statement = ('.mode csv\n' +
                     '.separator ","\n' +
                     '.headers on\n' +
                     '.import {file_path} {table_name}\n').format(**locals())

        p = Popen(['sqlite3', self.db_file], stdout=PIPE, stdin=PIPE, stderr=PIPE)
        stdout_data, stderr_data = p.communicate(input=str.encode(statement))

        if p.returncode != 0:
            raise Exception(stdout_data + b'\n' + stderr_data)

        # For each column, set to NULL the rows that were imported as the string "nan"
        # FIXME: this code needs refactoring
        for col_name in self._loading_tmp['chunked_table_column_names'][table_name]:
            statement = (
                'update {table_name} set {col_name} = null where {col_name} == "nan";'
            ).format(**locals())

            p = Popen(['sqlite3', self.db_file], stdout=PIPE, stdin=PIPE, stderr=PIPE)
            stdout_data, stderr_data = p.communicate(input=str.encode(statement))

            if p.returncode != 0:
                raise Exception(stdout_data + b'\n' + stderr_data)

    elif self.db_type == 'postgresql':
        statement = (
            "\copy {table_name} from '{file_path}' (format csv, header, null ('nan'))"
        ).format(**locals())

        self._run_psql(statement)

    if self.delete_temp_csv:
        logger.debug(f'Removing CSV already loaded: {file_path}')
        os.remove(file_path)

def initialize(self):
    logger.info('Initializing')

    logger.info('Loading fields dtypes')
    self.init_field_dtypes()

    logger.info('Initialization finished!')

def load_codings(self, codings_dir):
    logger.info('Loading codings from {}'.format(codings_dir))

    db_engine = self._get_db_engine()

    create_table(
        'codings',
        columns=[
            'data_coding bigint NOT NULL',
            'coding text NOT NULL',
            'meaning text NOT NULL',
            'node_id bigint NULL',
            'parent_id bigint NULL',
            'selectable boolean NULL',
        ],
        constraints=[
            'pk_codings PRIMARY KEY (data_coding, coding, meaning)'
        ],
        db_engine=self._get_db_engine())

    for afile in glob(join(codings_dir, '*.tsv')):
        afile_base = basename(afile)
        logger.info('Processing coding file: {}'.format(afile_base))

        data = pd.read_table(afile, sep='\t+', na_filter=False, engine='python')

        data_coding = int(splitext(afile_base)[0].split('_')[1])
        data['data_coding'] = data_coding

        data.to_sql('codings', db_engine, if_exists='append', index=False)

    create_indexes(
        'codings',
        ['data_coding', 'coding', 'node_id', 'parent_id', 'selectable'],
        db_engine=db_engine)

    self._vacuum('codings')
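
# Illustrative only: inferred from the parsing above, each coding file is a TSV
# whose name carries the data-coding id after the first underscore (e.g. a file
# named 'coding_100.tsv' would yield data_coding 100) and whose columns match
# those inserted into the 'codings' table. The values below are made up.
def _example_coding_file_content():
    return (
        'coding\tmeaning\tnode_id\tparent_id\tselectable\n'
        '1\tYes\t0\t0\tTrue\n'
        '0\tNo\t0\t0\tTrue\n'
    )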

def _create_constraints(self):
    if self.db_type == 'sqlite':
        logger.warning('Indexes are not supported for SQLite')
        return

    logger.info('Creating table constraints (indexes, primary keys, etc)')

    # bgen's samples table
    if self.bgen_sample_file is not None and os.path.isfile(self.bgen_sample_file):
        create_indexes(BGEN_SAMPLES_TABLE, ('index', 'eid'),
                       db_engine=self._get_db_engine())

    # fields table
    create_indexes(
        'fields',
        ('field_id', 'inst', 'arr', 'table_name', 'type', 'coding'),
        db_engine=self._get_db_engine())

    # events table
    create_indexes('events',
                   ('eid', 'field_id', 'instance', 'event', ('field_id', 'event')),
                   db_engine=self._get_db_engine())

def _load_all_eids(self):
    logger.info('Loading all eids into table {}'.format(ALL_EIDS_TABLE))

    create_table(
        ALL_EIDS_TABLE,
        columns=[
            'eid bigint NOT NULL',
        ],
        constraints=['pk_{} PRIMARY KEY (eid)'.format(ALL_EIDS_TABLE)],
        db_engine=self._get_db_engine())

    select_eid_sql = ' UNION DISTINCT '.join(
        'select eid from {}'.format(table_name)
        for table_name in self.table_list)

    insert_eids_sql = """
        insert into {all_eids_table} (eid)
        (
            {sql_eids}
        )
    """.format(all_eids_table=ALL_EIDS_TABLE, sql_eids=select_eid_sql)

    with self._get_db_engine().connect() as con:
        con.execute(insert_eids_sql)

def load_data(self, vacuum=False):
    """
    Load all specified CSV files into the configured database.
    :return:
    """
    logger.info('Loading phenotype data into database')

    try:
        for csv_file_idx, csv_file in enumerate(self.ukb_csvs):
            logger.info('Working on {}'.format(csv_file))

            self._create_tables_schema(csv_file, csv_file_idx)
            self._create_temporary_csvs(csv_file, csv_file_idx)
            self._load_csv()

        self._load_all_eids()
        self._load_bgen_samples()
        self._load_events()
        self._create_constraints()

        if vacuum:
            self._vacuum()
    except OperationalError as e:
        raise UkbRestSQLExecutionError(
            'There was an error with the database: ' + str(e))
    except UnicodeDecodeError as e:
        logger.debug(str(e))
        raise UkbRestProgramExecutionError(
            'Unicode decoding error when reading CSV file. Activate debug to show more details.')

    # delete temporary variable
    del self._loading_tmp

    logger.info('Loading finished!')
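
# Minimal usage sketch for the public loading entry point. Only
# load_data(vacuum=...) is taken from the method above; how the instance is
# configured (CSV files, database connection, etc.) is defined elsewhere in
# this class and is not shown here. Note that several steps (events,
# constraints) are skipped when the backend is SQLite.
def _load_data_usage_example(p2sql):
    # p2sql: an already-configured Pheno2SQL instance
    p2sql.load_data(vacuum=True)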

def load_withdrawals(self, withdrawals_dir):
    db_engine = self._get_db_engine()

    with db_engine.connect() as conn:
        # create table (if not exists)
        logger.info('Creating withdrawals table')
        conn.execute(f"""
            CREATE TABLE IF NOT EXISTS {WITHDRAWALS_TABLE} (
                eid bigint primary key
            )
        """)

        for input_file in glob(join(withdrawals_dir, '*.csv')):
            logger.info(f'Reading input file {input_file}')

            data = pd.read_csv(input_file, header=None)
            data = data.rename(columns={0: 'eid'})

            n_data_before = data.shape[0]
            data = data.drop_duplicates()
            if n_data_before != data.shape[0]:
                logger.warning(
                    f'Duplicate IDs in file were removed ({n_data_before} vs {data.shape[0]})'
                )

            # remove duplicates already in DB
            current_eids = pd.read_sql(
                f'select eid from {WITHDRAWALS_TABLE}', conn)['eid']
            data = data.loc[~data['eid'].isin(current_eids)]

            logger.info(f'Writing to SQL table: {data.shape[0]} new sample IDs')
            data.to_sql(WITHDRAWALS_TABLE, db_engine, index=False, if_exists='append')

def load_samples_data(self, data_dir, identifier_columns={}, skip_columns={}, separators={}):
    db_engine = self._get_db_engine()

    for afile in glob(join(data_dir, '*.txt')):
        filename = basename(afile)
        logger.info('Loading samples data from file: {}'.format(filename))

        sep = separators[filename] if filename in separators else ' '
        data = pd.read_table(afile, sep=sep)

        if filename in skip_columns:
            logger.info('Dropping columns: {}'.format(','.join(skip_columns[filename])))
            data = data.drop(skip_columns[filename], axis=1)

        eid_columns = identifier_columns[filename] if filename in identifier_columns else 'eid'
        if not isinstance(eid_columns, (list, tuple)):
            eid_columns = [eid_columns]

        if any(id_col not in data.columns for id_col in eid_columns):
            logger.error("File '{0}' has no identifier column ({1})".format(
                filename, eid_columns))
            continue

        table_name = splitext(filename)[0]

        # rename columns
        columns_rename = {
            old_col: self._rename_column(old_col, eid_columns)
            for old_col in data.columns
        }

        if len(eid_columns) == 1:
            columns_rename[eid_columns[0]] = 'eid'
            eid_columns[0] = 'eid'

        data = data.rename(columns=columns_rename)

        data.to_sql(table_name, db_engine, if_exists='replace', index=False)

        # add primary key
        logger.info('Adding primary key')
        with db_engine.connect() as conn:
            conn.execute("""
                ALTER TABLE {table_name}
                ADD CONSTRAINT pk_{table_name} PRIMARY KEY ({id_cols});
            """.format(table_name=table_name, id_cols=','.join(eid_columns)))

        # insert new data columns into fields table
        logger.info("Adding columns to 'fields' table")

        columns_to_fields = [x for x in data.columns if x != 'eid']
        columns_dtypes_to_fields = [
            self._get_column_type(x)
            for ix, x in enumerate(data.dtypes)
            if data.columns[ix] != 'eid'
        ]

        fields_table_data = pd.DataFrame({
            'column_name': columns_to_fields,
            'field_id': columns_to_fields,
            'table_name': table_name,
            'type': columns_dtypes_to_fields,
        })

        fields_table_data.to_sql('fields', db_engine, index=False, if_exists='append')
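
# Hedged usage sketch: as the loop above shows, the three optional dicts are
# keyed by input file name. The directory, file and column names below are
# made up for illustration.
def _load_samples_data_usage_example(p2sql):
    # p2sql: an already-configured Pheno2SQL instance
    p2sql.load_samples_data(
        'samples_data_dir/',
        identifier_columns={'covariates.txt': ['sample_id']},
        skip_columns={'covariates.txt': ['notes']},
        separators={'covariates.txt': '\t'},
    )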

def _get_db_columns_dtypes(self, ukbcsv_file):
    """
    Returns the SQLAlchemy database type, the UK Biobank type, the description
    and the data-coding (when present) for each column in the CSV file.
    :param ukbcsv_file:
    :return:
    """
    logger.info('Getting columns types')

    filename = os.path.splitext(ukbcsv_file)[0] + '.html'

    logger.info('Reading data types from {}'.format(filename))
    with open(filename, 'r', encoding='latin1') as f:
        tmp = pd.read_html(f, match='UDI', header=0, index_col=1, flavor='html5lib')

    logger.debug('Filling NaN values')
    df_types = tmp[0].loc[:, 'Type']
    df_types = df_types.fillna(method='ffill')

    df_descriptions = tmp[0].loc[:, 'Description']
    df_descriptions = df_descriptions.fillna(method='ffill')
    del tmp

    db_column_types = {}
    column_types = {}
    column_descriptions = {}
    column_codings = {}

    # open just to get columns
    csv_df = pd.read_csv(ukbcsv_file, index_col=0, header=0, nrows=1)
    columns = csv_df.columns.tolist()
    del csv_df

    logger.debug('Reading columns')
    for col in columns:
        col_type = df_types[col]
        final_db_col_type = TEXT

        if col_type == 'Continuous':
            final_db_col_type = FLOAT
        elif col_type == 'Integer':
            final_db_col_type = INT
        elif col_type in ('Date', 'Time'):
            final_db_col_type = TIMESTAMP

        db_column_types[col] = final_db_col_type
        column_types[self._rename_columns(col)] = col_type
        column_descriptions[self._rename_columns(col)] = \
            df_descriptions[col].split('Uses data-coding ')[0]

        # search for column coding
        coding_matches = re.search(Pheno2SQL.RE_FIELD_CODING, df_descriptions[col])
        if coding_matches is not None:
            column_codings[self._rename_columns(col)] = int(coding_matches.group('coding'))

    return db_column_types, column_types, column_descriptions, column_codings

def load_sql(self, sql_file):
    self._run_psql(sql_file, is_file=True)
    logger.info(f'SQL file loaded successfully: {sql_file}')

def _create_tables_schema(self, csv_file, csv_file_idx):
    """
    Reads the data types of each data field in csv_file and creates the
    necessary database tables.
    :return:
    """
    logger.info('Creating database tables')

    tmp = pd.read_csv(csv_file, index_col=0, header=0, nrows=1, low_memory=False)
    old_columns = tmp.columns.tolist()
    del tmp
    new_columns = [self._rename_columns(x) for x in old_columns]

    # Remove columns that were previously loaded in other datasets
    if 'existing_col_names' not in self._loading_tmp:
        # dictionary with data-field as key and csv file as value
        columns_and_csv_files = {}
    else:
        columns_and_csv_files = self._loading_tmp['existing_col_names']

    old_columns_clean = []
    new_columns_clean = []

    for old_col_name, new_col_name in tuple(zip(old_columns, new_columns)):
        if new_col_name in columns_and_csv_files:
            corresponding_csv_file = columns_and_csv_files[new_col_name]
            logger.warning(
                f'Column {new_col_name} already loaded from {corresponding_csv_file}. Skipping.'
            )
            continue

        columns_and_csv_files[new_col_name] = csv_file

        old_columns_clean.append(old_col_name)
        new_columns_clean.append(new_col_name)

    self._loading_tmp['existing_col_names'] = columns_and_csv_files

    # keep only unique columns (not loaded in previous files)
    old_columns = old_columns_clean
    new_columns = new_columns_clean
    all_columns = tuple(zip(old_columns, new_columns))

    # FIXME: check if self.n_columns_per_table is greater than the real number of columns
    self._loading_tmp['chunked_column_names'] = tuple(
        enumerate(self._chunker(all_columns, self.n_columns_per_table)))
    self._loading_tmp['chunked_table_column_names'] = \
        {self._get_table_name(col_idx, csv_file_idx): [col[1] for col in col_names]
         for col_idx, col_names in self._loading_tmp['chunked_column_names']}

    # get columns dtypes (for PostgreSQL and standard ones)
    db_types_old_column_names, all_fields_dtypes, all_fields_description, all_fields_coding = \
        self._get_db_columns_dtypes(csv_file)
    db_dtypes = {
        self._rename_columns(k): v
        for k, v in db_types_old_column_names.items()
    }
    self._fields_dtypes.update(all_fields_dtypes)

    data_sample = pd.read_csv(csv_file, index_col=0, header=0, nrows=1, dtype=str)
    data_sample = data_sample.rename(columns=self._rename_columns)

    # create fields table
    if csv_file_idx == 0:
        create_table('fields',
                     columns=[
                         'column_name text NOT NULL',
                         'table_name text',
                         'field_id text NOT NULL',
                         'description text',
                         'coding bigint',
                         'inst bigint',
                         'arr bigint',
                         'type text NOT NULL',
                     ],
                     constraints=['pk_fields PRIMARY KEY (column_name)'],
                     db_engine=self._get_db_engine(),
                     drop_if_exists=True)

    current_stop = 0
    for column_names_idx, column_names in self._loading_tmp['chunked_column_names']:
        new_columns_names = [x[1] for x in column_names]

        fields_ids = []
        instances = []
        arrays = []
        fields_dtypes = []
        fields_descriptions = []
        fields_codings = []

        for col_name in new_columns_names:
            match = re.match(Pheno2SQL.RE_FIELD_INFO, col_name)

            fields_ids.append(match.group('field_id'))
            instances.append(int(match.group('instance')))
            arrays.append(int(match.group('array')))

            fields_dtypes.append(all_fields_dtypes[col_name])
            fields_descriptions.append(all_fields_description[col_name])

            if col_name in all_fields_coding:
                fields_codings.append(all_fields_coding[col_name])
            else:
                fields_codings.append(np.nan)

        # Create main table structure
        table_name = self._get_table_name(column_names_idx, csv_file_idx)
        logger.info('Table {} ({} columns)'.format(table_name, len(new_columns_names)))

        data_sample.loc[[], new_columns_names].to_sql(table_name,
                                                      self._get_db_engine(),
                                                      if_exists='replace',
                                                      dtype=db_dtypes)

        with self._get_db_engine().connect() as conn:
            conn.execute("""
                ALTER TABLE {table_name}
                ADD CONSTRAINT pk_{table_name} PRIMARY KEY (eid);
            """.format(table_name=table_name))

        with self._get_db_engine().connect() as conn:
            conn.execute('DROP INDEX ix_{table_name}_eid;'.format(table_name=table_name))

        # Create auxiliary table
        n_column_names = len(new_columns_names)
        current_start = current_stop
        current_stop = current_start + n_column_names

        aux_table = pd.DataFrame({
            'column_name': new_columns_names,
            'field_id': fields_ids,
            'inst': instances,
            'arr': arrays,
            'coding': fields_codings,
            'table_name': table_name,
            'type': fields_dtypes,
            'description': fields_descriptions
        })
        # aux_table = aux_table.set_index('column_name')
        aux_table.to_sql('fields', self._get_db_engine(), index=False, if_exists='append')
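
# Illustrative sketch of the column chunking used above: all_columns is split
# into groups of at most n_columns_per_table entries, and each group becomes
# one database table. The real _chunker is defined elsewhere in this class; a
# minimal equivalent could look like this:
def _chunker_example(seq, size):
    # yield successive slices of length `size` (the last one may be shorter)
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))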