def _run_bgenix(self, arguments):
    random_bgen_file = get_temp_file_name('.bgen', tmpdir=get_tmpdir(self.tmpdir))

    with open(random_bgen_file, 'br+') as bgen_file:
        full_command = [self._get_bgenix_path()] + arguments
        logger.info(f'Running: {full_command}')

        run_status = subprocess.run(full_command, stdout=bgen_file, stderr=subprocess.PIPE)

        if run_status.returncode != 0:
            message = f'bgenix failed: {" ".join(run_status.args)}'
            output = run_status.stderr.decode()

            e = UkbRestProgramExecutionError(message, output)

            logger.debug(output)
            raise e

    return random_bgen_file
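# Usage sketch (argument values are illustrative only; '-g' and '-incl-range'
# are standard bgenix flags, but nothing below is taken from this codebase):
#
#   bgen_path = self._run_bgenix(['-g', '/data/chr1.bgen', '-incl-range', '1:1000-2000'])
#
# bgenix writes the filtered BGEN data to stdout, which is captured into the
# temporary file whose path is returned.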
def func_wrapper(*args, **kwargs):
    try:
        return func(*args, **kwargs)
    except UkbRestException:
        pass
    except Exception as e:
        tb = traceback.format_exc()
        logger.debug(tb)

        msg = f'\n{str(e)}'
        # if isinstance(e, JoblibException):
        #     msg = ''

        logger.error(f'Loading finished with an unknown error. Activate debug to see full stack trace.{msg}')
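# Note: func_wrapper closes over 'func', i.e. it is the inner function of an
# error-handling decorator. UkbRestException subclasses are swallowed here on
# the assumption that they were already logged where they were raised; any
# other exception is summarized and its traceback logged at debug level.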
def _load_single_csv(self, table_name, file_path):
    logger.info('{} -> {}'.format(file_path, table_name))

    if self.db_type == 'sqlite':
        statement = (
            '.mode csv\n'
            '.separator ","\n'
            '.headers on\n'
            '.import {file_path} {table_name}\n'
        ).format(**locals())

        p = Popen(['sqlite3', self.db_file], stdout=PIPE, stdin=PIPE, stderr=PIPE)
        stdout_data, stderr_data = p.communicate(input=str.encode(statement))

        if p.returncode != 0:
            raise Exception(stdout_data + b'\n' + stderr_data)

        # sqlite imports missing values as the literal string "nan"; set those
        # rows to NULL, column by column.
        # FIXME: this code needs refactoring
        for col_name in self._loading_tmp['chunked_table_column_names'][table_name]:
            statement = (
                'update {table_name} set {col_name} = null where {col_name} == "nan";'
            ).format(**locals())

            p = Popen(['sqlite3', self.db_file], stdout=PIPE, stdin=PIPE, stderr=PIPE)
            stdout_data, stderr_data = p.communicate(input=str.encode(statement))

            if p.returncode != 0:
                raise Exception(stdout_data + b'\n' + stderr_data)

    elif self.db_type == 'postgresql':
        statement = (
            "\\copy {table_name} from '{file_path}' (format csv, header, null ('nan'))"
        ).format(**locals())

        self._run_psql(statement)

    if self.delete_temp_csv:
        logger.debug(f'Removing CSV already loaded: {file_path}')
        os.remove(file_path)
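# For illustration, with a hypothetical table 'ukb_pheno_0_00' and temporary
# file '/tmp/ukb_pheno_0_00.csv', the postgresql branch runs via psql:
#
#   \copy ukb_pheno_0_00 from '/tmp/ukb_pheno_0_00.csv' (format csv, header, null ('nan'))
#
# a client-side COPY that treats the literal string 'nan' as NULL.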
def _save_column_range(self, csv_file, csv_file_idx, column_names_idx, column_names):
    table_name = self._get_table_name(column_names_idx, csv_file_idx)
    output_csv_filename = os.path.join(get_tmpdir(self.tmpdir), table_name + '.csv')
    full_column_names = ['eid'] + [x[0] for x in column_names]

    data_reader = pd.read_csv(
        csv_file,
        index_col=0,
        header=0,
        usecols=full_column_names,
        chunksize=self.loading_chunksize,
        dtype=str,
        encoding=self._get_file_encoding(csv_file),
    )

    new_columns = [x[1] for x in column_names]

    logger.debug('{}'.format(output_csv_filename))

    # sqlite temp CSVs are written without a header row (see the '.import'
    # statement in _load_single_csv)
    write_headers = True
    if self.db_type == 'sqlite':
        write_headers = False

    for chunk_idx, chunk in enumerate(data_reader):
        chunk = chunk.rename(columns=self._rename_columns)
        # chunk = self._replace_null_str(chunk)

        if chunk_idx == 0:
            chunk.loc[:, new_columns].to_csv(
                output_csv_filename,
                quoting=csv.QUOTE_NONNUMERIC,
                na_rep=np.nan,
                header=write_headers,
                mode='w',
            )
        else:
            chunk.loc[:, new_columns].to_csv(
                output_csv_filename,
                quoting=csv.QUOTE_NONNUMERIC,
                na_rep=np.nan,
                header=False,
                mode='a',
            )

    return table_name, output_csv_filename
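# column_names is expected to be an iterable of (csv_column, db_column) pairs:
# x[0] is the column name as it appears in the UK Biobank CSV and x[1] its
# renamed database counterpart, e.g. (hypothetically) ('21-0.0', 'c21_0_0').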
def _query_generic(self, sql_query, order_by_dict=None, results_transformator=None):
    final_sql_query = sql_query

    if order_by_dict is not None:
        outer_sql = """
            select {data_fields}
            from {order_by} s
            left outer join (
                {base_sql}
            ) u using (eid)
            order by s.index asc
        """.format(
            order_by=order_by_dict['table'],
            base_sql=sql_query,
            data_fields=order_by_dict['columns_select'],
        )

        final_sql_query = outer_sql

    logger.debug(final_sql_query)

    try:
        results_iterator = pd.read_sql(
            final_sql_query,
            self._get_db_engine(),
            index_col='eid',
            chunksize=self.sql_chunksize,
        )
    except ProgrammingError as e:
        raise UkbRestSQLExecutionError(str(e))

    if self.sql_chunksize is None:
        results_iterator = iter([results_iterator])

    for chunk in results_iterator:
        if results_transformator is not None:
            chunk = results_transformator(chunk)

        yield chunk
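# Usage sketch (the query text and table/column names are hypothetical):
#
#   for chunk in self._query_generic('select eid, c21_0_0 from ukb_pheno_0_00'):
#       ...  # each chunk is a pandas DataFrame indexed by 'eid'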
def load_data(self, vacuum=False):
    """
    Loads all the specified CSV files into the configured database.

    :param vacuum: if True, vacuum the database after loading.
    :return: None
    """
    logger.info('Loading phenotype data into database')

    try:
        for csv_file_idx, csv_file in enumerate(self.ukb_csvs):
            logger.info('Working on {}'.format(csv_file))

            self._create_tables_schema(csv_file, csv_file_idx)
            self._create_temporary_csvs(csv_file, csv_file_idx)
            self._load_csv()

        self._load_all_eids()
        self._load_bgen_samples()
        self._load_events()
        self._create_constraints()

        if vacuum:
            self._vacuum()
    except OperationalError as e:
        raise UkbRestSQLExecutionError('There was an error with the database: ' + str(e))
    except UnicodeDecodeError as e:
        logger.debug(str(e))
        raise UkbRestProgramExecutionError(
            'Unicode decoding error when reading CSV file. Activate debug to show more details.'
        )

    # delete temporary loading state
    del self._loading_tmp

    logger.info('Loading finished!')
def __init__(self, ukb_csvs, db_uri, bgen_sample_file=None, table_prefix='ukb_pheno_',
             n_columns_per_table=sys.maxsize, loading_n_jobs=-1,
             tmpdir=tempfile.mkdtemp(prefix='ukbrest'), loading_chunksize=5000,
             sql_chunksize=None, delete_temp_csv=True):
    """
    :param ukb_csvs: files are loaded in the order they are specified
    :param db_uri:
    :param bgen_sample_file:
    :param table_prefix:
    :param n_columns_per_table:
    :param loading_n_jobs:
    :param tmpdir:
    :param loading_chunksize: number of lines to read when loading CSV files into the SQL database.
    :param sql_chunksize: when an SQL query is submitted to get phenotypes, this parameter indicates the chunksize (number of rows).
    :param delete_temp_csv:
    """
    super(Pheno2SQL, self).__init__(db_uri)

    if isinstance(ukb_csvs, (tuple, list)):
        self.ukb_csvs = ukb_csvs
    else:
        self.ukb_csvs = (ukb_csvs,)

    self.bgen_sample_file = bgen_sample_file

    parse_result = urlparse(self.db_uri)
    self.db_type = parse_result.scheme

    if self.db_type == 'sqlite':
        logger.warning('sqlite does not support parallel loading')
        self.db_file = self.db_uri.split(':///')[-1]
    elif self.db_type == 'postgresql':
        self.db_host = parse_result.hostname
        self.db_port = parse_result.port
        self.db_name = parse_result.path.split('/')[-1]
        self.db_user = parse_result.username
        self.db_pass = parse_result.password

    self.table_prefix = table_prefix
    self.n_columns_per_table = n_columns_per_table
    logger.debug("n_columns_per_table set to {}".format(self.n_columns_per_table))
    self.loading_n_jobs = loading_n_jobs
    self.tmpdir = tmpdir
    self.loading_chunksize = loading_chunksize

    self.sql_chunksize = sql_chunksize
    if self.sql_chunksize is None:
        logger.warning(
            '{} was not set, so there is no chunksize for SQL queries, '
            'which can lead to memory problems.'.format(SQL_CHUNKSIZE_ENV)
        )

    self._fields_dtypes = {}

    # this is a temporary variable that holds information about loading
    self._loading_tmp = {}

    self.table_list = set()

    self.csv_files_encoding_file = 'encodings.txt'
    self.csv_files_encoding = 'utf-8'

    self.delete_temp_csv = delete_temp_csv
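# Minimal usage sketch (file names and database URI are hypothetical):
#
#   p2sql = Pheno2SQL(
#       ('ukb_data_00.csv', 'ukb_data_01.csv'),  # loaded in this order
#       'postgresql://user:pass@localhost:5432/ukb',
#       sql_chunksize=10000,
#   )
#   p2sql.load_data(vacuum=True)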
def _get_db_columns_dtypes(self, ukbcsv_file):
    """
    Returns, for each column, the SQLAlchemy type to use in the database, along
    with the original UK Biobank type, description and data-coding.

    :param ukbcsv_file:
    :return: tuple (db_column_types, column_types, column_descriptions, column_codings)
    """
    logger.info('Getting columns types')

    filename = os.path.splitext(ukbcsv_file)[0] + '.html'

    logger.info('Reading data types from {}'.format(filename))
    with open(filename, 'r', encoding='latin1') as f:
        tmp = pd.read_html(f, match='UDI', header=0, index_col=1, flavor='html5lib')

    logger.debug('Filling NaN values')
    df_types = tmp[0].loc[:, 'Type']
    df_types = df_types.fillna(method='ffill')

    df_descriptions = tmp[0].loc[:, 'Description']
    df_descriptions = df_descriptions.fillna(method='ffill')
    del tmp

    db_column_types = {}
    column_types = {}
    column_descriptions = {}
    column_codings = {}

    # open just to get the column names
    csv_df = pd.read_csv(ukbcsv_file, index_col=0, header=0, nrows=1)
    columns = csv_df.columns.tolist()
    del csv_df

    logger.debug('Reading columns')
    for col in columns:
        col_type = df_types[col]
        final_db_col_type = TEXT

        if col_type == 'Continuous':
            final_db_col_type = FLOAT
        elif col_type == 'Integer':
            final_db_col_type = INT
        elif col_type in ('Date', 'Time'):
            final_db_col_type = TIMESTAMP

        db_column_types[col] = final_db_col_type
        column_types[self._rename_columns(col)] = col_type
        column_descriptions[self._rename_columns(col)] = df_descriptions[col].split('Uses data-coding ')[0]

        # search for column coding
        coding_matches = re.search(Pheno2SQL.RE_FIELD_CODING, df_descriptions[col])
        if coding_matches is not None:
            column_codings[self._rename_columns(col)] = int(coding_matches.group('coding'))

    return db_column_types, column_types, column_descriptions, column_codings
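# For a hypothetical field '31-0.0' of type 'Integer' whose description reads
# 'Sex Uses data-coding 9', this would yield (renamed form 'c31_0_0' assumed
# from _rename_columns):
#
#   db_column_types['31-0.0'] == INT
#   column_types['c31_0_0'] == 'Integer'
#   column_descriptions['c31_0_0'] == 'Sex '
#   column_codings['c31_0_0'] == 9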