def _get_file_encoding(self, csv_file):
    csv_file_name = os.path.basename(csv_file)
    csv_file_full_path = os.path.realpath(csv_file)
    csv_file_dir = os.path.dirname(csv_file_full_path)

    # look for an optional per-file encoding specification next to the CSV file
    encoding_file = os.path.join(csv_file_dir, self.csv_files_encoding_file)

    if os.path.isfile(encoding_file):
        enc_file = pd.read_table(encoding_file, index_col=0, header=None,
                                 delim_whitespace=True, squeeze=True)

        if not enc_file.index.is_unique:
            logger.error(
                f'{self.csv_files_encoding_file} contains duplicated file names. Ignoring this file'
            )
            return self.csv_files_encoding

        if csv_file_name in enc_file.index:
            file_encoding = enc_file.at[csv_file_name]
            logger.info(
                f'Encoding found in {self.csv_files_encoding_file}: {file_encoding}'
            )
            return file_encoding
    else:
        logger.warning(
            f'No {self.csv_files_encoding_file} found, assuming {self.csv_files_encoding}'
        )

    return self.csv_files_encoding
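# Illustrative sketch (not part of the original source) of the 'encodings.txt'
# layout implied by the read_table call above: one row per CSV file, whitespace
# separated, with the file name in the first column and its encoding in the
# second. The file names and encodings below are placeholders:
#
#   ukb00001.csv    latin1
#   ukb00002.csv    utf-8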
def _load_bgen_samples(self):
    if self.bgen_sample_file is None or not os.path.isfile(self.bgen_sample_file):
        logger.warning('BGEN sample file not set or does not exist: {}'.format(
            self.bgen_sample_file))
        return

    logger.info('Loading BGEN sample file: {}'.format(self.bgen_sample_file))

    create_table(
        BGEN_SAMPLES_TABLE,
        columns=[
            'index bigint NOT NULL',
            'eid bigint NOT NULL',
        ],
        constraints=[
            'pk_{} PRIMARY KEY (index, eid)'.format(BGEN_SAMPLES_TABLE)
        ],
        db_engine=self._get_db_engine())

    # keep only the sample IDs; the index is the 1-based position in the sample file
    samples_data = pd.read_table(self.bgen_sample_file, sep=' ', header=0,
                                 usecols=['ID_1', 'ID_2'], skiprows=[1])
    samples_data.set_index(np.arange(1, samples_data.shape[0] + 1), inplace=True)
    samples_data.drop('ID_2', axis=1, inplace=True)
    samples_data.rename(columns={'ID_1': 'eid'}, inplace=True)

    samples_data.to_sql(BGEN_SAMPLES_TABLE, self._get_db_engine(), if_exists='append')
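# For reference, a minimal Oxford-style .sample file as this loader expects it
# (space separated, header row followed by a "types" row that is skipped via
# skiprows=[1]); the sample IDs below are placeholders:
#
#   ID_1 ID_2 missing
#   0 0 0
#   1000010 1000010 0
#   1000020 1000020 0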
def process_users_file(self):
    # process user/pass file
    yaml = YAML()

    if self.users_file is not None and os.path.isfile(self.users_file):
        users = self._read_yaml_file(self.users_file)

        # hash all non-hashed passwords
        new_users = {}
        for user, passw in users.items():
            if not ENCODED_PASSWORD_PATTERN.match(passw):
                new_users[user] = generate_password_hash(passw, method=self.method)
            else:
                new_users[user] = passw

        with open(self.users_file, 'w') as f:
            yaml.dump(new_users, f)
    elif self.users_file is not None and not os.path.isfile(self.users_file):
        logger.warning(
            'Users file for authentication does not exist. No access will be '
            'allowed until the file is properly created.'
        )
    elif self.users_file is None:
        logger.warning(
            'No users file was specified, so HTTP Basic authentication is disabled.'
        )
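# Illustrative users file (a YAML mapping of user name to password); on the next
# run, plain-text values are replaced by their hash, while entries that already
# match ENCODED_PASSWORD_PATTERN are kept as-is. The user names, password, and
# hash prefix below are placeholders, not taken from the original source:
#
#   admin: my-plaintext-password
#   reader: pbkdf2:sha256:...   # already hashed, left untouched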
def _load_events(self):
    if self.db_type == 'sqlite':
        logger.warning('Events loading is not supported in SQLite')
        return

    logger.info('Loading events table')

    # create table
    db_engine = self._get_db_engine()

    create_table(
        'events',
        columns=[
            'eid bigint NOT NULL',
            'field_id integer NOT NULL',
            'instance integer NOT NULL',
            'event text NOT NULL',
        ],
        constraints=[
            'pk_events PRIMARY KEY (eid, field_id, instance, event)'
        ],
        db_engine=db_engine)

    # insert data of categorical multiple fields
    categorical_variables = pd.read_sql(
        """
        select column_name, field_id, inst, table_name
        from fields
        where type = 'Categorical (multiple)'
        """, self._get_db_engine())

    for (field_id, field_instance), field_data in categorical_variables.groupby(by=['field_id', 'inst']):
        sql_st = """
            insert into events (eid, field_id, instance, event)
            (
                select distinct *
                from (
                    select eid, {field_id}, {field_instance}, unnest(array[{field_columns}]) as event
                    from {tables}
                ) t
                where t.event is not null
            )
        """.format(
            field_id=field_id,
            field_instance=field_instance,
            field_columns=', '.join([cn for cn in set(field_data['column_name'])]),
            tables=self._create_joins(list(set(field_data['table_name'])), join_type='inner join'),
        )

        with db_engine.connect() as con:
            con.execute(sql_st)
def _create_constraints(self):
    if self.db_type == 'sqlite':
        logger.warning('Indexes are not supported for SQLite')
        return

    logger.info('Creating table constraints (indexes, primary keys, etc)')

    # bgen's samples table
    if self.bgen_sample_file is not None and os.path.isfile(self.bgen_sample_file):
        create_indexes(BGEN_SAMPLES_TABLE, ('index', 'eid'),
                       db_engine=self._get_db_engine())

    # fields table
    create_indexes(
        'fields',
        ('field_id', 'inst', 'arr', 'table_name', 'type', 'coding'),
        db_engine=self._get_db_engine())

    # events table
    create_indexes('events',
                   ('eid', 'field_id', 'instance', 'event', ('field_id', 'event')),
                   db_engine=self._get_db_engine())
def load_withdrawals(self, withdrawals_dir):
    db_engine = self._get_db_engine()

    # create table (if not exists)
    with db_engine.connect() as conn:
        logger.info('Creating withdrawals table')
        conn.execute(f"""
            CREATE TABLE IF NOT EXISTS {WITHDRAWALS_TABLE} (
                eid bigint primary key
            )
        """)

        for input_file in glob(join(withdrawals_dir, '*.csv')):
            logger.info(f'Reading input file {input_file}')

            data = pd.read_csv(input_file, header=None)
            data = data.rename(columns={0: 'eid'})

            n_data_before = data.shape[0]
            data = data.drop_duplicates()
            if n_data_before != data.shape[0]:
                logger.warning(
                    f'Duplicate IDs in file were removed ({n_data_before} vs {data.shape[0]})'
                )

            # remove duplicates already in DB
            current_eids = pd.read_sql(
                f'select eid from {WITHDRAWALS_TABLE}', conn)['eid']
            data = data.loc[~data['eid'].isin(current_eids)]

            logger.info(f'Writing to SQL table: {data.shape[0]} new sample IDs')
            data.to_sql(WITHDRAWALS_TABLE, db_engine, index=False, if_exists='append')
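# Minimal usage sketch (assuming a configured Pheno2SQL instance and a directory
# of withdrawal CSVs with one eid per line and no header); the paths and the
# database URI below are placeholders:
#
#   p2sql = Pheno2SQL('ukb00001.csv', 'postgresql://user:pass@localhost:5432/ukbrest')
#   p2sql.load_withdrawals('/path/to/withdrawals')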
def _setup_genotype_path():
    genotype_path = environ.get(GENOTYPE_PATH_ENV, None)

    # guard against an unset environment variable before checking the directory
    if genotype_path is None or not isdir(genotype_path):
        logger.warning(
            'The genotype directory does not exist. You have to mount it using '
            'the option "-v hostDir:{}" of "docker run"'.format(genotype_path))
        return

    bgen_files = [
        f for f in listdir(genotype_path) if f.lower().endswith('.bgen')
    ]
    if len(bgen_files) == 0:
        logger.warning('No .bgen files were found in the genotype directory')

    bgi_files = [
        f for f in listdir(genotype_path) if f.lower().endswith('.bgi')
    ]
    if len(bgi_files) == 0:
        logger.warning('No .bgi files were found in the genotype directory')
def __init__(self, ukb_csvs, db_uri,
             bgen_sample_file=None,
             table_prefix='ukb_pheno_',
             n_columns_per_table=sys.maxsize,
             loading_n_jobs=-1,
             tmpdir=tempfile.mkdtemp(prefix='ukbrest'),
             loading_chunksize=5000,
             sql_chunksize=None,
             delete_temp_csv=True):
    """
    :param ukb_csvs: files are loaded in the order they are specified
    :param db_uri:
    :param table_prefix:
    :param n_columns_per_table:
    :param loading_n_jobs:
    :param tmpdir:
    :param loading_chunksize: number of lines to read when loading CSV files to the SQL database.
    :param sql_chunksize: when an SQL query is submitted to get phenotypes, this parameter indicates the chunksize (number of rows).
    """
    super(Pheno2SQL, self).__init__(db_uri)

    if isinstance(ukb_csvs, (tuple, list)):
        self.ukb_csvs = ukb_csvs
    else:
        self.ukb_csvs = (ukb_csvs,)

    self.bgen_sample_file = bgen_sample_file

    parse_result = urlparse(self.db_uri)
    self.db_type = parse_result.scheme

    if self.db_type == 'sqlite':
        logger.warning('sqlite does not support parallel loading')
        self.db_file = self.db_uri.split(':///')[-1]
    elif self.db_type == 'postgresql':
        self.db_host = parse_result.hostname
        self.db_port = parse_result.port
        self.db_name = parse_result.path.split('/')[-1]
        self.db_user = parse_result.username
        self.db_pass = parse_result.password

    self.table_prefix = table_prefix
    self.n_columns_per_table = n_columns_per_table
    logger.debug("n_columns_per_table set to {}".format(self.n_columns_per_table))
    self.loading_n_jobs = loading_n_jobs
    self.tmpdir = tmpdir
    self.loading_chunksize = loading_chunksize

    self.sql_chunksize = sql_chunksize
    if self.sql_chunksize is None:
        logger.warning(
            '{} was not set, so no chunksize is used for SQL queries, which can lead to '
            'memory problems.'.format(SQL_CHUNKSIZE_ENV))

    self._fields_dtypes = {}

    # this is a temporary variable that holds information about loading
    self._loading_tmp = {}

    self.table_list = set()

    self.csv_files_encoding_file = 'encodings.txt'
    self.csv_files_encoding = 'utf-8'

    self.delete_temp_csv = delete_temp_csv
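# Illustrative construction (all values are placeholders, not taken from the
# original source); a single CSV path or a list/tuple of paths is accepted:
#
#   p2sql = Pheno2SQL(
#       ukb_csvs=['ukb00001.csv', 'ukb00002.csv'],
#       db_uri='postgresql://user:pass@localhost:5432/ukbrest',
#       bgen_sample_file='impv2.sample',
#       n_columns_per_table=1500,
#       sql_chunksize=10000,
#   )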
def _create_tables_schema(self, csv_file, csv_file_idx):
    """
    Reads the data types of each data field in csv_file and creates the necessary database tables.
    :return:
    """
    logger.info('Creating database tables')

    tmp = pd.read_csv(csv_file, index_col=0, header=0, nrows=1, low_memory=False)
    old_columns = tmp.columns.tolist()
    del tmp
    new_columns = [self._rename_columns(x) for x in old_columns]

    # Remove columns that were previously loaded in other datasets
    if 'existing_col_names' not in self._loading_tmp:
        # dictionary with data-field as key and csv file as value
        columns_and_csv_files = {}
    else:
        columns_and_csv_files = self._loading_tmp['existing_col_names']

    old_columns_clean = []
    new_columns_clean = []

    for old_col_name, new_col_name in tuple(zip(old_columns, new_columns)):
        if new_col_name in columns_and_csv_files:
            corresponding_csv_file = columns_and_csv_files[new_col_name]
            logger.warning(
                f'Column {new_col_name} already loaded from {corresponding_csv_file}. Skipping.')
            continue

        columns_and_csv_files[new_col_name] = csv_file

        old_columns_clean.append(old_col_name)
        new_columns_clean.append(new_col_name)

    self._loading_tmp['existing_col_names'] = columns_and_csv_files

    # keep only unique columns (not loaded in previous files)
    old_columns = old_columns_clean
    new_columns = new_columns_clean
    all_columns = tuple(zip(old_columns, new_columns))

    # FIXME: check if self.n_columns_per_table is greater than the real number of columns
    self._loading_tmp['chunked_column_names'] = tuple(
        enumerate(self._chunker(all_columns, self.n_columns_per_table)))
    self._loading_tmp['chunked_table_column_names'] = \
        {self._get_table_name(col_idx, csv_file_idx): [col[1] for col in col_names]
         for col_idx, col_names in self._loading_tmp['chunked_column_names']}

    # get columns dtypes (for PostgreSQL and standard ones)
    db_types_old_column_names, all_fields_dtypes, all_fields_description, all_fields_coding = \
        self._get_db_columns_dtypes(csv_file)
    db_dtypes = {
        self._rename_columns(k): v
        for k, v in db_types_old_column_names.items()
    }
    self._fields_dtypes.update(all_fields_dtypes)

    data_sample = pd.read_csv(csv_file, index_col=0, header=0, nrows=1, dtype=str)
    data_sample = data_sample.rename(columns=self._rename_columns)

    # create fields table
    if csv_file_idx == 0:
        create_table('fields',
                     columns=[
                         'column_name text NOT NULL',
                         'table_name text',
                         'field_id text NOT NULL',
                         'description text',
                         'coding bigint',
                         'inst bigint',
                         'arr bigint',
                         'type text NOT NULL',
                     ],
                     constraints=['pk_fields PRIMARY KEY (column_name)'],
                     db_engine=self._get_db_engine(),
                     drop_if_exists=True)

    current_stop = 0
    for column_names_idx, column_names in self._loading_tmp['chunked_column_names']:
        new_columns_names = [x[1] for x in column_names]

        fields_ids = []
        instances = []
        arrays = []
        fields_dtypes = []
        fields_descriptions = []
        fields_codings = []

        for col_name in new_columns_names:
            match = re.match(Pheno2SQL.RE_FIELD_INFO, col_name)

            fields_ids.append(match.group('field_id'))
            instances.append(int(match.group('instance')))
            arrays.append(int(match.group('array')))

            fields_dtypes.append(all_fields_dtypes[col_name])
            fields_descriptions.append(all_fields_description[col_name])

            if col_name in all_fields_coding:
                fields_codings.append(all_fields_coding[col_name])
            else:
                fields_codings.append(np.nan)

        # Create main table structure
        table_name = self._get_table_name(column_names_idx, csv_file_idx)
        logger.info('Table {} ({} columns)'.format(table_name, len(new_columns_names)))

        data_sample.loc[[], new_columns_names].to_sql(table_name,
                                                      self._get_db_engine(),
                                                      if_exists='replace',
                                                      dtype=db_dtypes)

        with self._get_db_engine().connect() as conn:
            conn.execute("""
                ALTER TABLE {table_name} ADD CONSTRAINT pk_{table_name} PRIMARY KEY (eid);
            """.format(table_name=table_name))

        with self._get_db_engine().connect() as conn:
            conn.execute('DROP INDEX ix_{table_name}_eid;'.format(table_name=table_name))

        # Create auxiliary table
        n_column_names = len(new_columns_names)
        current_start = current_stop
        current_stop = current_start + n_column_names

        aux_table = pd.DataFrame({
            'column_name': new_columns_names,
            'field_id': fields_ids,
            'inst': instances,
            'arr': arrays,
            'coding': fields_codings,
            'table_name': table_name,
            'type': fields_dtypes,
            'description': fields_descriptions
        })
        # aux_table = aux_table.set_index('column_name')
        aux_table.to_sql('fields', self._get_db_engine(), index=False, if_exists='append')