Example #1
    def _run_bgenix(self, arguments):
        random_bgen_file = get_temp_file_name('.bgen',
                                              tmpdir=get_tmpdir(self.tmpdir))

        with open(random_bgen_file, 'br+') as bgen_file:
            full_command = [self._get_bgenix_path()] + arguments

            logger.info(f'Running: {full_command}')

            # bgenix writes the requested data to stdout; redirect it into the
            # temporary .bgen file and capture stderr for error reporting
            run_status = subprocess.run(full_command,
                                        stdout=bgen_file,
                                        stderr=subprocess.PIPE)

            if run_status.returncode != 0:
                message = f'bgenix failed: {" ".join(run_status.args)}'
                output = run_status.stderr.decode()

                e = UkbRestProgramExecutionError(
                    message,
                    output,
                )

                logger.debug(output)

                raise e

        return random_bgen_file
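
The helper above redirects an external tool's stdout into a freshly created temporary file and raises on a non-zero exit code. A minimal self-contained sketch of the same pattern, using gzip instead of bgenix so it runs without the BGEN tools installed (function and file names are illustrative, not from the original code):

    import subprocess
    import tempfile

    def run_to_temp_file(command, suffix=''):
        # stream the command's stdout into a named temporary file and return its path
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as out_file:
            status = subprocess.run(command, stdout=out_file, stderr=subprocess.PIPE)
        if status.returncode != 0:
            raise RuntimeError(f'{" ".join(command)} failed: {status.stderr.decode()}')
        return out_file.name

    # e.g. compressed_path = run_to_temp_file(['gzip', '-c', 'phenotype.csv'], suffix='.gz')
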
Example #2
    def func_wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except UkbRestException:
            # ukbrest-specific exceptions are intentionally swallowed here
            pass
        except Exception as e:
            tb = traceback.format_exc()
            logger.debug(tb)

            msg = f'\n{str(e)}'
            # if isinstance(e, JoblibException):
            #     msg = ''

            logger.error(f'Loading finished with an unknown error. Activate debug to see full stack trace.{msg}')
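
func_wrapper above is the inner function of a decorator; the enclosing definition that supplies func is not part of the snippet. A hedged sketch of how such a decorator is typically assembled (the name log_exceptions is illustrative, not from the original code):

    import functools
    import logging
    import traceback

    logger = logging.getLogger('ukbrest')

    def log_exceptions(func):
        # wrap `func` so unexpected exceptions are logged instead of propagating
        @functools.wraps(func)
        def func_wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                logger.debug(traceback.format_exc())
                logger.error(f'Loading finished with an unknown error. '
                             f'Activate debug to see full stack trace.\n{e}')
        return func_wrapper
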
Example #3
    def _load_single_csv(self, table_name, file_path):
        logger.info('{} -> {}'.format(file_path, table_name))

        if self.db_type == 'sqlite':
            statement = ('.mode csv\n' + '.separator ","\n' + '.headers on\n' +
                         '.import {file_path} {table_name}\n').format(
                             **locals())

            p = Popen(['sqlite3', self.db_file],
                      stdout=PIPE,
                      stdin=PIPE,
                      stderr=PIPE)
            stdout_data, stderr_data = p.communicate(
                input=str.encode(statement))

            if p.returncode != 0:
                raise Exception(stdout_data + b'\n' + stderr_data)

            # For each column, replace 'nan' string values with NULL
            # FIXME: this code needs refactoring
            for col_name in self._loading_tmp['chunked_table_column_names'][
                    table_name]:
                statement = (
                    'update {table_name} set {col_name} = null where {col_name} == "nan";'
                ).format(**locals())

                p = Popen(['sqlite3', self.db_file],
                          stdout=PIPE,
                          stdin=PIPE,
                          stderr=PIPE)
                stdout_data, stderr_data = p.communicate(
                    input=str.encode(statement))

                if p.returncode != 0:
                    raise Exception(stdout_data + b'\n' + stderr_data)

        elif self.db_type == 'postgresql':
            statement = (
                "\copy {table_name} from '{file_path}' (format csv, header, null ('nan'))"
            ).format(**locals())

            self._run_psql(statement)

            if self.delete_temp_csv:
                logger.debug(f'Removing CSV already loaded: {file_path}')
                os.remove(file_path)
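
The SQLite branch drives the sqlite3 command-line shell through a pipe instead of a Python driver. A self-contained sketch of that import pattern (database, table and file names are made up):

    from subprocess import Popen, PIPE

    def sqlite_import_csv(db_file, table_name, csv_path):
        # feed dot-commands to the sqlite3 shell via stdin; .import bulk-loads the CSV
        script = '.mode csv\n' + f'.import {csv_path} {table_name}\n'
        p = Popen(['sqlite3', db_file], stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout_data, stderr_data = p.communicate(input=script.encode())
        if p.returncode != 0:
            raise Exception(stdout_data + b'\n' + stderr_data)

    # sqlite_import_csv('ukb.db', 'ukb_pheno_0_00', '/tmp/ukb_pheno_0_00.csv')
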
Example #4
    def _save_column_range(self, csv_file, csv_file_idx, column_names_idx,
                           column_names):
        table_name = self._get_table_name(column_names_idx, csv_file_idx)
        output_csv_filename = os.path.join(get_tmpdir(self.tmpdir),
                                           table_name + '.csv')
        full_column_names = ['eid'] + [x[0] for x in column_names]

        data_reader = pd.read_csv(csv_file,
                                  index_col=0,
                                  header=0,
                                  usecols=full_column_names,
                                  chunksize=self.loading_chunksize,
                                  dtype=str,
                                  encoding=self._get_file_encoding(csv_file))

        new_columns = [x[1] for x in column_names]

        logger.debug('{}'.format(output_csv_filename))

        # the sqlite3 '.import' command loads every row as data, so the header
        # row is omitted when the target database is SQLite
        write_headers = True
        if self.db_type == 'sqlite':
            write_headers = False

        for chunk_idx, chunk in enumerate(data_reader):
            chunk = chunk.rename(columns=self._rename_columns)
            # chunk = self._replace_null_str(chunk)

            if chunk_idx == 0:
                chunk.loc[:, new_columns].to_csv(output_csv_filename,
                                                 quoting=csv.QUOTE_NONNUMERIC,
                                                 na_rep=np.nan,
                                                 header=write_headers,
                                                 mode='w')
            else:
                chunk.loc[:, new_columns].to_csv(output_csv_filename,
                                                 quoting=csv.QUOTE_NONNUMERIC,
                                                 na_rep=np.nan,
                                                 header=False,
                                                 mode='a')

        return table_name, output_csv_filename
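
The loop above is a standard chunked read-rename-append pandas pattern. A compact standalone sketch of the same idea (column names and file paths are illustrative):

    import pandas as pd

    def rewrite_in_chunks(src_csv, dst_csv, columns, chunksize=5000):
        # stream the source CSV in chunks, keep only the requested columns,
        # write the header once and append every subsequent chunk
        reader = pd.read_csv(src_csv, index_col='eid', usecols=['eid'] + columns,
                             chunksize=chunksize, dtype=str)
        for chunk_idx, chunk in enumerate(reader):
            chunk.to_csv(dst_csv,
                         header=(chunk_idx == 0),
                         mode='w' if chunk_idx == 0 else 'a')

    # rewrite_in_chunks('ukb00001.csv', 'ukb_pheno_0_00.csv', ['21-0.0', '21-1.0'])
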
Example #5
    def _query_generic(self,
                       sql_query,
                       order_by_dict=None,
                       results_transformator=None):
        final_sql_query = sql_query

        if order_by_dict is not None:
            outer_sql = """
                select {data_fields}
                from {order_by} s left outer join (
                    {base_sql}
                ) u
                using (eid)
                order by s.index asc
            """.format(order_by=order_by_dict['table'],
                       base_sql=sql_query,
                       data_fields=order_by_dict['columns_select'])

            final_sql_query = outer_sql

        logger.debug(final_sql_query)

        try:
            results_iterator = pd.read_sql(final_sql_query,
                                           self._get_db_engine(),
                                           index_col='eid',
                                           chunksize=self.sql_chunksize)
        except ProgrammingError as e:
            raise UkbRestSQLExecutionError(str(e))

        # pd.read_sql returns a single DataFrame when chunksize is None; wrap it
        # so the caller always iterates over chunks
        if self.sql_chunksize is None:
            results_iterator = iter([results_iterator])

        for chunk in results_iterator:
            if results_transformator is not None:
                chunk = results_transformator(chunk)

            yield chunk
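
Because the method is a generator, callers always iterate over DataFrame chunks whether or not sql_chunksize was set; the iter([...]) wrapping makes the single-DataFrame case look the same as the chunked case. The same trick outside the class (connection URI and table name are illustrative):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('sqlite:///ukb.db')
    chunksize = None  # or an integer to stream results

    results = pd.read_sql('select eid, c21_0_0 from ukb_pheno_0_00', engine,
                          index_col='eid', chunksize=chunksize)
    if chunksize is None:
        results = iter([results])  # wrap the single DataFrame so the loop always works

    for chunk in results:
        print(chunk.shape)
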
Example #6
    def load_data(self, vacuum=False):
        """
        Load all CSV files specified into the database configured.
        :return:
        """
        logger.info('Loading phenotype data into database')

        try:
            for csv_file_idx, csv_file in enumerate(self.ukb_csvs):
                logger.info('Working on {}'.format(csv_file))

                self._create_tables_schema(csv_file, csv_file_idx)
                self._create_temporary_csvs(csv_file, csv_file_idx)
                self._load_csv()

            self._load_all_eids()
            self._load_bgen_samples()
            self._load_events()
            self._create_constraints()

            if vacuum:
                self._vacuum()

        except OperationalError as e:
            raise UkbRestSQLExecutionError(
                'There was an error with the database: ' + str(e))
        except UnicodeDecodeError as e:
            logger.debug(str(e))
            raise UkbRestProgramExecutionError(
                'Unicode decoding error when reading CSV file. Activate debug to show more details.'
            )

        # delete temporary variable
        del self._loading_tmp

        logger.info('Loading finished!')
Example #7
    def __init__(self,
                 ukb_csvs,
                 db_uri,
                 bgen_sample_file=None,
                 table_prefix='ukb_pheno_',
                 n_columns_per_table=sys.maxsize,
                 loading_n_jobs=-1,
                 tmpdir=tempfile.mkdtemp(prefix='ukbrest'),
                 loading_chunksize=5000,
                 sql_chunksize=None,
                 delete_temp_csv=True):
        """
        :param ukb_csvs: files are loaded in the order they are specified
        :param db_uri:
        :param table_prefix:
        :param n_columns_per_table:
        :param loading_n_jobs:
        :param tmpdir:
        :param loading_chunksize: number of lines to read when loading CSV files to the SQL database.
        :param sql_chunksize: when an SQL query is submitted to get phenotypes, this parameter indicates the
        chunksize (number of rows).
        """

        super(Pheno2SQL, self).__init__(db_uri)

        if isinstance(ukb_csvs, (tuple, list)):
            self.ukb_csvs = ukb_csvs
        else:
            self.ukb_csvs = (ukb_csvs, )

        self.bgen_sample_file = bgen_sample_file

        parse_result = urlparse(self.db_uri)
        self.db_type = parse_result.scheme

        if self.db_type == 'sqlite':
            logger.warning('sqlite does not support parallel loading')
            self.db_file = self.db_uri.split(':///')[-1]
        elif self.db_type == 'postgresql':
            self.db_host = parse_result.hostname
            self.db_port = parse_result.port
            self.db_name = parse_result.path.split('/')[-1]
            self.db_user = parse_result.username
            self.db_pass = parse_result.password

        self.table_prefix = table_prefix
        self.n_columns_per_table = n_columns_per_table
        logger.debug("n_columns_per_table set to {}".format(
            self.n_columns_per_table))
        self.loading_n_jobs = loading_n_jobs
        self.tmpdir = tmpdir
        self.loading_chunksize = loading_chunksize

        self.sql_chunksize = sql_chunksize
        if self.sql_chunksize is None:
            logger.warning(
                '{} was not set, so no chunksize is used for SQL queries, which '
                'can lead to memory problems.'.format(SQL_CHUNKSIZE_ENV))

        self._fields_dtypes = {}

        # this is a temporary variable that holds information about loading
        self._loading_tmp = {}

        self.table_list = set()

        self.csv_files_encoding_file = 'encodings.txt'
        self.csv_files_encoding = 'utf-8'

        self.delete_temp_csv = delete_temp_csv
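
A hedged sketch of instantiating the class with the parameters above and running the full load (file names and the connection URI are illustrative, not from the original code):

    # two UK Biobank CSV releases loaded into a local PostgreSQL database
    p2sql = Pheno2SQL(
        ('ukb00001.csv', 'ukb00002.csv'),
        'postgresql://user:password@localhost:5432/ukb',
        bgen_sample_file='impv2.sample',
        n_columns_per_table=1500,
        loading_chunksize=5000,
        sql_chunksize=10000,
    )
    p2sql.load_data(vacuum=True)
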
Example #8
    def _get_db_columns_dtypes(self, ukbcsv_file):
        """
        Returns a Pandas-compatible type list with SQLAlchemy types for each column.

        :param ukbcsv_file:
        :return:
        """

        logger.info('Getting columns types')

        filename = os.path.splitext(ukbcsv_file)[0] + '.html'

        logger.info('Reading data types from {}'.format(filename))
        with open(filename, 'r', encoding='latin1') as f:
            tmp = pd.read_html(f,
                               match='UDI',
                               header=0,
                               index_col=1,
                               flavor='html5lib')

        logger.debug('Filling NaN values')
        df_types = tmp[0].loc[:, 'Type']
        df_types = df_types.fillna(method='ffill')

        df_descriptions = tmp[0].loc[:, 'Description']
        df_descriptions = df_descriptions.fillna(method='ffill')
        del tmp

        db_column_types = {}
        column_types = {}
        column_descriptions = {}
        column_codings = {}

        # open just to get columns
        csv_df = pd.read_csv(ukbcsv_file, index_col=0, header=0, nrows=1)
        columns = csv_df.columns.tolist()
        del csv_df

        logger.debug('Reading columns')
        for col in columns:
            col_type = df_types[col]
            final_db_col_type = TEXT

            if col_type == 'Continuous':
                final_db_col_type = FLOAT

            elif col_type == 'Integer':
                final_db_col_type = INT

            elif col_type in ('Date', 'Time'):
                final_db_col_type = TIMESTAMP

            db_column_types[col] = final_db_col_type
            column_types[self._rename_columns(col)] = col_type
            column_descriptions[self._rename_columns(
                col)] = df_descriptions[col].split('Uses data-coding ')[0]

            # search for column coding
            coding_matches = re.search(Pheno2SQL.RE_FIELD_CODING,
                                       df_descriptions[col])
            if coding_matches is not None:
                column_codings[self._rename_columns(col)] = int(
                    coding_matches.group('coding'))

        return db_column_types, column_types, column_descriptions, column_codings
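
The if/elif chain that maps UK Biobank field types to SQLAlchemy column types can also be expressed as a lookup table; a sketch covering the same four type names handled above, with TEXT as the fallback:

    from sqlalchemy.types import FLOAT, INT, TEXT, TIMESTAMP

    # unknown or missing field types fall back to TEXT, as in the method above
    UKB_TO_SQL_TYPE = {
        'Continuous': FLOAT,
        'Integer': INT,
        'Date': TIMESTAMP,
        'Time': TIMESTAMP,
    }

    def sql_type_for(ukb_field_type):
        return UKB_TO_SQL_TYPE.get(ukb_field_type, TEXT)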