Example No. 1
    def _get_file_encoding(self, csv_file):
        csv_file_name = os.path.basename(csv_file)

        csv_file_full_path = os.path.realpath(csv_file)
        csv_file_dir = os.path.dirname(csv_file_full_path)

        encoding_file = os.path.join(csv_file_dir,
                                     self.csv_files_encoding_file)

        if os.path.isfile(encoding_file):
            enc_file = pd.read_table(encoding_file,
                                     index_col=0,
                                     header=None,
                                     delim_whitespace=True,
                                     squeeze=True)

            if not enc_file.index.is_unique:
                logger.error(
                    f'{self.csv_files_encoding_file} contains duplicated file names; not using it'
                )
                return self.csv_files_encoding

            if csv_file_name in enc_file.index:
                file_encoding = enc_file.at[csv_file_name]
                logger.info(
                    f'Encoding found in {self.csv_files_encoding_file}: {file_encoding}'
                )

                return file_encoding
        else:
            logger.warning(
                f'No {self.csv_files_encoding_file} found, assuming {self.csv_files_encoding}'
            )
            return self.csv_files_encoding
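A minimal standalone sketch of the same lookup, assuming a whitespace-delimited mapping of file name to encoding; the file and function names here are illustrative, not part of the project:

    import os
    import pandas as pd

    def lookup_encoding(csv_file, encodings_file='files_encodings.txt', default='utf-8'):
        # Fall back to the default encoding when no mapping file is present
        if not os.path.isfile(encodings_file):
            return default
        mapping = pd.read_csv(encodings_file, sep=r'\s+', header=None,
                              index_col=0).iloc[:, 0]
        if not mapping.index.is_unique:
            return default  # ambiguous mapping: ignore the file
        return mapping.get(os.path.basename(csv_file), default)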
Example No. 2
    def _load_bgen_samples(self):
        if self.bgen_sample_file is None or not os.path.isfile(
                self.bgen_sample_file):
            logger.warning(
                'BGEN sample file not set or does not exist: {}'.format(
                    self.bgen_sample_file))
            return

        logger.info('Loading BGEN sample file: {}'.format(
            self.bgen_sample_file))

        create_table(
            BGEN_SAMPLES_TABLE,
            columns=[
                'index bigint NOT NULL',
                'eid bigint NOT NULL',
            ],
            constraints=[
                'pk_{} PRIMARY KEY (index, eid)'.format(BGEN_SAMPLES_TABLE)
            ],
            db_engine=self._get_db_engine())

        samples_data = pd.read_table(self.bgen_sample_file,
                                     sep=' ',
                                     header=0,
                                     usecols=['ID_1', 'ID_2'],
                                     skiprows=[1])
        samples_data.set_index(np.arange(1, samples_data.shape[0] + 1),
                               inplace=True)
        samples_data.drop('ID_2', axis=1, inplace=True)
        samples_data.rename(columns={'ID_1': 'eid'}, inplace=True)

        samples_data.to_sql(BGEN_SAMPLES_TABLE,
                            self._get_db_engine(),
                            if_exists='append')
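A standalone sketch of the same idea, assuming a standard .sample file with two header rows; the file name, table name and in-memory engine are illustrative:

    import numpy as np
    import pandas as pd
    import sqlalchemy

    engine = sqlalchemy.create_engine('sqlite://')  # in-memory DB just for the example
    samples = pd.read_csv('cohort.sample', sep=' ', header=0,
                          usecols=['ID_1', 'ID_2'], skiprows=[1])  # skip the types row
    samples.index = np.arange(1, samples.shape[0] + 1)  # 1-based position in the BGEN file
    samples = samples.drop(columns='ID_2').rename(columns={'ID_1': 'eid'})
    samples.to_sql('bgen_samples', engine, if_exists='append')  # index is stored as 'index'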
Example No. 3
    def _run_bgenix(self, arguments):
        random_bgen_file = get_temp_file_name('.bgen',
                                              tmpdir=get_tmpdir(self.tmpdir))

        with open(random_bgen_file, 'br+') as bgen_file:
            full_command = [self._get_bgenix_path()] + arguments

            logger.info(f'Running: {full_command}')

            run_status = subprocess.run(full_command,
                                        stdout=bgen_file,
                                        stderr=subprocess.PIPE)

            if run_status.returncode != 0:
                message = f'bgenix failed: {" ".join(run_status.args)}'
                output = run_status.stderr.decode()

                e = UkbRestProgramExecutionError(
                    message,
                    output,
                )

                logger.debug(output)

                raise e

        return random_bgen_file
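A generic sketch of the pattern above, running an external command with stdout redirected to a file and raising on a non-zero exit code; the function name is illustrative:

    import subprocess

    def run_to_file(command, output_path):
        # Write the command's stdout straight into the file; capture stderr for errors
        with open(output_path, 'wb') as out:
            status = subprocess.run(command, stdout=out, stderr=subprocess.PIPE)
        if status.returncode != 0:
            raise RuntimeError(f'{" ".join(command)} failed: {status.stderr.decode()}')
        return output_path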
Example No. 4
    def _vacuum(self):
        logger.info('Vacuuming')

        with self._get_db_engine().connect().execution_options(
                isolation_level="AUTOCOMMIT") as conn:
            conn.execute("""
                vacuum analyze;
            """)
Example No. 5
    def _load_events(self):
        if self.db_type == 'sqlite':
            logger.warning('Events loading is not supported in SQLite')
            return

        logger.info('Loading events table')

        # create table
        db_engine = self._get_db_engine()

        create_table(
            'events',
            columns=[
                'eid bigint NOT NULL',
                'field_id integer NOT NULL',
                'instance integer NOT NULL',
                'event text NOT NULL',
            ],
            constraints=[
                'pk_events PRIMARY KEY (eid, field_id, instance, event)'
            ],
            db_engine=db_engine)

        # insert data of categorical multiple fields
        categorical_variables = pd.read_sql(
            """
            select column_name, field_id, inst, table_name
            from fields
            where type = 'Categorical (multiple)'
        """, self._get_db_engine())

        for (field_id,
             field_instance), field_data in categorical_variables.groupby(
                 by=['field_id', 'inst']):
            sql_st = """
                insert into events (eid, field_id, instance, event)
                (
                    select distinct *
                    from (
                        select eid, {field_id}, {field_instance}, unnest(array[{field_columns}]) as event
                        from {tables}
                    ) t
                    where t.event is not null
                )
            """.format(
                field_id=field_id,
                field_instance=field_instance,
                field_columns=', '.join(
                    [cn for cn in set(field_data['column_name'])]),
                tables=self._create_joins(list(set(field_data['table_name'])),
                                          join_type='inner join'),
            )

            with db_engine.connect() as con:
                con.execute(sql_st)
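The SQL above unpivots several "Categorical (multiple)" columns into one row per (eid, event); a small pandas sketch of the same wide-to-long step, with made-up column names:

    import pandas as pd

    wide = pd.DataFrame({'eid': [1, 2],
                         'c100_0_0': ['A', None],
                         'c100_0_1': ['B', 'C']})
    events = (wide.melt(id_vars='eid', value_name='event')   # one row per (eid, column)
                  .dropna(subset=['event'])                   # mirrors "where event is not null"
                  .assign(field_id=100, instance=0)
                  .loc[:, ['eid', 'field_id', 'instance', 'event']]
                  .drop_duplicates())                         # mirrors "select distinct"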
Example No. 6
    def _load_csv(self):
        logger.info('Loading CSV files into database')

        if self.db_type != 'sqlite':
            self._close_db_engine()
            # parallel csv loading is only supported in databases different than sqlite
            Parallel(n_jobs=self.loading_n_jobs)(
                delayed(self._load_single_csv)(table_name, file_path)
                for table_name, file_path in self.table_csvs)
        else:
            for table_name, file_path in self.table_csvs:
                self._load_single_csv(table_name, file_path)
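A minimal sketch of the joblib Parallel/delayed pattern used here, with an illustrative worker and file list:

    from joblib import Parallel, delayed

    def load_one(table_name, file_path):
        # placeholder for the real per-table loading work
        print(f'{file_path} -> {table_name}')

    table_csvs = [('ukb_pheno_0_00', '/tmp/part_00.csv'),
                  ('ukb_pheno_0_01', '/tmp/part_01.csv')]
    Parallel(n_jobs=2)(delayed(load_one)(t, f) for t, f in table_csvs)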
Example No. 7
    def _create_temporary_csvs(self, csv_file, csv_file_idx):
        logger.info('Writing temporary CSV files')

        self._close_db_engine()
        self.table_csvs = Parallel(n_jobs=self.loading_n_jobs)(
            delayed(self._save_column_range)(csv_file, csv_file_idx,
                                             column_names_idx, column_names)
            for column_names_idx, column_names in
            self._loading_tmp['chunked_column_names'])

        self.table_list.update(table_name
                               for table_name, file_path in self.table_csvs)
Example No. 8
    def _load_single_csv(self, table_name, file_path):
        logger.info('{} -> {}'.format(file_path, table_name))

        if self.db_type == 'sqlite':
            statement = ('.mode csv\n' + '.separator ","\n' + '.headers on\n' +
                         '.import {file_path} {table_name}\n').format(
                             **locals())

            p = Popen(['sqlite3', self.db_file],
                      stdout=PIPE,
                      stdin=PIPE,
                      stderr=PIPE)
            stdout_data, stderr_data = p.communicate(
                input=str.encode(statement))

            if p.returncode != 0:
                raise Exception(stdout_data + b'\n' + stderr_data)

            # For each column, replace the literal string "nan" with NULL
            # FIXME: this code needs refactoring
            for col_name in self._loading_tmp['chunked_table_column_names'][
                    table_name]:
                statement = (
                    'update {table_name} set {col_name} = null where {col_name} == "nan";'
                ).format(**locals())

                p = Popen(['sqlite3', self.db_file],
                          stdout=PIPE,
                          stdin=PIPE,
                          stderr=PIPE)
                stdout_data, stderr_data = p.communicate(
                    input=str.encode(statement))

                if p.returncode != 0:
                    raise Exception(stdout_data + b'\n' + stderr_data)

        elif self.db_type == 'postgresql':
            statement = (
                "\copy {table_name} from '{file_path}' (format csv, header, null ('nan'))"
            ).format(**locals())

            self._run_psql(statement)

            if self.delete_temp_csv:
                logger.debug(f'Removing CSV already loaded: {file_path}')
                os.remove(file_path)
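A standalone sketch of the SQLite branch: dot-commands are piped into the sqlite3 command-line client through stdin; paths and table names are illustrative:

    from subprocess import Popen, PIPE

    script = ('.mode csv\n'
              '.headers on\n'
              '.import /tmp/part_00.csv ukb_pheno_0_00\n')
    p = Popen(['sqlite3', '/tmp/ukb.db'], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate(input=script.encode())
    if p.returncode != 0:
        raise RuntimeError(err.decode())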
Example No. 9
    def initialize(self):
        logger.info('Initializing')

        logger.info('Loading fields dtypes')
        self.init_field_dtypes()

        logger.info('Initialization finished!')
Example No. 10
    def load_codings(self, codings_dir):
        logger.info('Loading codings from {}'.format(codings_dir))
        db_engine = self._get_db_engine()

        create_table(
            'codings',
            columns=[
                'data_coding bigint NOT NULL',
                'coding text NOT NULL',
                'meaning text NOT NULL',
                'node_id bigint NULL',
                'parent_id bigint NULL',
                'selectable boolean NULL',
            ],
            constraints=[
                'pk_codings PRIMARY KEY (data_coding, coding, meaning)'
            ],
            db_engine=self._get_db_engine())

        for afile in glob(join(codings_dir, '*.tsv')):
            afile_base = basename(afile)

            logger.info('Processing coding file: {}'.format(afile_base))

            data = pd.read_table(afile,
                                 sep='\t+',
                                 na_filter=False,
                                 engine='python')

            data_coding = int(splitext(afile_base)[0].split('_')[1])
            data['data_coding'] = data_coding

            data.to_sql('codings', db_engine, if_exists='append', index=False)

        create_indexes(
            'codings',
            ['data_coding', 'coding', 'node_id', 'parent_id', 'selectable'],
            db_engine=db_engine)

        self._vacuum()
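A short sketch of how the data-coding id is derived from a file name such as coding_6.tsv; the path is hypothetical:

    from os.path import basename, splitext

    afile = '/codings/coding_6.tsv'                                # hypothetical path
    data_coding = int(splitext(basename(afile))[0].split('_')[1])  # -> 6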
Example No. 11
    def _create_constraints(self):
        if self.db_type == 'sqlite':
            logger.warning('Indexes are not supported for SQLite')
            return

        logger.info('Creating table constraints (indexes, primary keys, etc)')

        # bgen's samples table
        if self.bgen_sample_file is not None and os.path.isfile(
                self.bgen_sample_file):
            create_indexes(BGEN_SAMPLES_TABLE, ('index', 'eid'),
                           db_engine=self._get_db_engine())

        # fields table
        create_indexes(
            'fields',
            ('field_id', 'inst', 'arr', 'table_name', 'type', 'coding'),
            db_engine=self._get_db_engine())

        # events table
        create_indexes('events', ('eid', 'field_id', 'instance', 'event',
                                  ('field_id', 'event')),
                       db_engine=self._get_db_engine())
Example No. 12
    def _load_all_eids(self):
        logger.info('Loading all eids into table {}'.format(ALL_EIDS_TABLE))

        create_table(
            ALL_EIDS_TABLE,
            columns=[
                'eid bigint NOT NULL',
            ],
            constraints=['pk_{} PRIMARY KEY (eid)'.format(ALL_EIDS_TABLE)],
            db_engine=self._get_db_engine())

        select_eid_sql = ' UNION DISTINCT '.join(
            'select eid from {}'.format(table_name)
            for table_name in self.table_list)

        insert_eids_sql = """
            insert into {all_eids_table} (eid)
            (
                {sql_eids}
            )
        """.format(all_eids_table=ALL_EIDS_TABLE, sql_eids=select_eid_sql)

        with self._get_db_engine().connect() as con:
            con.execute(insert_eids_sql)
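A sketch of how the UNION DISTINCT statement is assembled from the list of per-chunk tables; table names are illustrative:

    table_list = ['ukb_pheno_0_00', 'ukb_pheno_0_01']   # illustrative table names
    select_eids = ' UNION DISTINCT '.join(f'select eid from {t}' for t in table_list)
    insert_sql = f'insert into all_eids (eid) ({select_eids})'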
Example No. 13
    def load_data(self, vacuum=False):
        """
        Load all specified CSV files into the configured database.
        :return:
        """
        logger.info('Loading phenotype data into database')

        try:
            for csv_file_idx, csv_file in enumerate(self.ukb_csvs):
                logger.info('Working on {}'.format(csv_file))

                self._create_tables_schema(csv_file, csv_file_idx)
                self._create_temporary_csvs(csv_file, csv_file_idx)
                self._load_csv()

            self._load_all_eids()
            self._load_bgen_samples()
            self._load_events()
            self._create_constraints()

            if vacuum:
                self._vacuum()

        except OperationalError as e:
            raise UkbRestSQLExecutionError(
                'There was an error with the database: ' + str(e))
        except UnicodeDecodeError as e:
            logger.debug(str(e))
            raise UkbRestProgramExecutionError(
                'Unicode decoding error when reading CSV file. Activate debug to show more details.'
            )

        # delete temporary variable
        del self._loading_tmp

        logger.info('Loading finished!')
Example No. 14
    def load_withdrawals(self, withdrawals_dir):
        db_engine = self._get_db_engine()

        # create table (if not exists)
        with db_engine.connect() as conn:
            logger.info('Creating withdrawals table')
            conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {WITHDRAWALS_TABLE} (
                    eid bigint primary key
                )
            """)

            for input_file in glob(join(withdrawals_dir, '*.csv')):
                logger.info(f'Reading input file {input_file}')

                data = pd.read_csv(input_file, header=None)
                data = data.rename(columns={0: 'eid'})

                n_data_before = data.shape[0]
                data = data.drop_duplicates()
                if n_data_before != data.shape[0]:
                    logger.warning(
                        f'Duplicate IDs in file were removed ({n_data_before} vs {data.shape[0]})'
                    )

                # remove duplicates already in DB
                current_eids = pd.read_sql(
                    f'select eid from {WITHDRAWALS_TABLE}', conn)['eid']
                data = data.loc[~data['eid'].isin(current_eids)]

                logger.info(
                    f'Writing to SQL table: {data.shape[0]} new sample IDs')
                data.to_sql(WITHDRAWALS_TABLE,
                            db_engine,
                            index=False,
                            if_exists='append')
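A small pandas sketch of the de-duplication step: new IDs are dropped if they are duplicated within the file or already present in the database; the values are made up:

    import pandas as pd

    new_ids = pd.DataFrame({'eid': [10, 11, 11, 12]}).drop_duplicates()
    already_in_db = pd.Series([11], name='eid')
    to_insert = new_ids.loc[~new_ids['eid'].isin(already_in_db)]   # keeps 10 and 12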
Example No. 15
    def load_samples_data(self,
                          data_dir,
                          identifier_columns={},
                          skip_columns={},
                          separators={}):
        db_engine = self._get_db_engine()

        for afile in glob(join(data_dir, '*.txt')):
            filename = basename(afile)
            logger.info('Loading samples data from file: {}'.format(filename))

            sep = separators[filename] if filename in separators else ' '

            data = pd.read_table(afile, sep=sep)

            if filename in skip_columns:
                logger.info('Dropping columns: {}'.format(','.join(
                    skip_columns[filename])))
                data = data.drop(skip_columns[filename], axis=1)

            eid_columns = identifier_columns[
                filename] if filename in identifier_columns else 'eid'
            if not isinstance(eid_columns, (list, tuple)):
                eid_columns = [eid_columns]

            if any(id_col not in data.columns for id_col in eid_columns):
                logger.error(
                    "File '{0}' has no identifier column ({1})".format(
                        filename, eid_columns))
                continue

            table_name = splitext(filename)[0]

            # rename columns
            columns_rename = {
                old_col: self._rename_column(old_col, eid_columns)
                for old_col in data.columns
            }

            if len(eid_columns) == 1:
                columns_rename[eid_columns[0]] = 'eid'
                eid_columns[0] = 'eid'

            data = data.rename(columns=columns_rename)

            data.to_sql(table_name,
                        db_engine,
                        if_exists='replace',
                        index=False)

            # add primary key
            logger.info('Adding primary key')
            with db_engine.connect() as conn:
                conn.execute("""
                    ALTER TABLE {table_name} ADD CONSTRAINT pk_{table_name} PRIMARY KEY ({id_cols});
                """.format(table_name=table_name,
                           id_cols=','.join(eid_columns)))

            # insert new data columns into fields table
            logger.info("Adding columns to 'fields' table")
            columns_to_fields = [x for x in data.columns if x != 'eid']
            columns_dtypes_to_fields = [
                self._get_column_type(x) for ix, x in enumerate(data.dtypes)
                if data.columns[ix] != 'eid'
            ]

            fields_table_data = pd.DataFrame({
                'column_name': columns_to_fields,
                'field_id': columns_to_fields,
                'table_name': table_name,
                'type': columns_dtypes_to_fields,
            })

            fields_table_data.to_sql('fields',
                                     db_engine,
                                     index=False,
                                     if_exists='append')
Example No. 16
    def _get_db_columns_dtypes(self, ukbcsv_file):
        """
        Returns four dictionaries keyed by column name: SQLAlchemy type, UK Biobank type, description and data coding.

        :param ukbcsv_file:
        :return:
        """

        logger.info('Getting columns types')

        filename = os.path.splitext(ukbcsv_file)[0] + '.html'

        logger.info('Reading data types from {}'.format(filename))
        with open(filename, 'r', encoding='latin1') as f:
            tmp = pd.read_html(f,
                               match='UDI',
                               header=0,
                               index_col=1,
                               flavor='html5lib')

        logger.debug('Filling NaN values')
        df_types = tmp[0].loc[:, 'Type']
        df_types = df_types.fillna(method='ffill')

        df_descriptions = tmp[0].loc[:, 'Description']
        df_descriptions = df_descriptions.fillna(method='ffill')
        del tmp

        db_column_types = {}
        column_types = {}
        column_descriptions = {}
        column_codings = {}

        # open just to get columns
        csv_df = pd.read_csv(ukbcsv_file, index_col=0, header=0, nrows=1)
        columns = csv_df.columns.tolist()
        del csv_df

        logger.debug('Reading columns')
        for col in columns:
            col_type = df_types[col]
            final_db_col_type = TEXT

            if col_type == 'Continuous':
                final_db_col_type = FLOAT

            elif col_type == 'Integer':
                final_db_col_type = INT

            elif col_type in ('Date', 'Time'):
                final_db_col_type = TIMESTAMP

            db_column_types[col] = final_db_col_type
            column_types[self._rename_columns(col)] = col_type
            column_descriptions[self._rename_columns(
                col)] = df_descriptions[col].split('Uses data-coding ')[0]

            # search for column coding
            coding_matches = re.search(Pheno2SQL.RE_FIELD_CODING,
                                       df_descriptions[col])
            if coding_matches is not None:
                column_codings[self._rename_columns(col)] = int(
                    coding_matches.group('coding'))

        return db_column_types, column_types, column_descriptions, column_codings
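A sketch of the type mapping applied in the loop above, using plain strings in place of the SQLAlchemy TEXT/FLOAT/INT/TIMESTAMP objects it references:

    def sql_type_for(ukb_type):
        # default to TEXT, as the method above does
        return {'Continuous': 'FLOAT',
                'Integer': 'INT',
                'Date': 'TIMESTAMP',
                'Time': 'TIMESTAMP'}.get(ukb_type, 'TEXT')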
Example No. 17
    def load_sql(self, sql_file):
        self._run_psql(sql_file, is_file=True)
        logger.info(f'SQL file loaded successfully: {sql_file}')
Example No. 18
    def _create_tables_schema(self, csv_file, csv_file_idx):
        """
        Reads the data types of each data field in csv_file and creates the necessary database tables.
        :return:
        """
        logger.info('Creating database tables')

        tmp = pd.read_csv(csv_file,
                          index_col=0,
                          header=0,
                          nrows=1,
                          low_memory=False)
        old_columns = tmp.columns.tolist()
        del tmp
        new_columns = [self._rename_columns(x) for x in old_columns]

        # Remove columns that were previously loaded in other datasets
        if 'existing_col_names' not in self._loading_tmp:
            # dictionary with data-field as key and csv file as value
            columns_and_csv_files = {}
        else:
            columns_and_csv_files = self._loading_tmp['existing_col_names']

        old_columns_clean = []
        new_columns_clean = []

        for old_col_name, new_col_name in tuple(zip(old_columns, new_columns)):
            if new_col_name in columns_and_csv_files:
                corresponding_csv_file = columns_and_csv_files[new_col_name]
                logger.warning(
                    f'Column {new_col_name} already loaded from {corresponding_csv_file}. Skipping.'
                )
                continue

            columns_and_csv_files[new_col_name] = csv_file

            old_columns_clean.append(old_col_name)
            new_columns_clean.append(new_col_name)

        self._loading_tmp['existing_col_names'] = columns_and_csv_files

        # keep only unique columns (not loaded in previous files)
        old_columns = old_columns_clean
        new_columns = new_columns_clean
        all_columns = tuple(zip(old_columns, new_columns))

        # FIXME: check if self.n_columns_per_table is greater than the real number of columns
        self._loading_tmp['chunked_column_names'] = tuple(
            enumerate(self._chunker(all_columns, self.n_columns_per_table)))
        self._loading_tmp['chunked_table_column_names'] = \
            {self._get_table_name(col_idx, csv_file_idx): [col[1] for col in col_names]
             for col_idx, col_names in self._loading_tmp['chunked_column_names']}

        # get columns dtypes (for PostgreSQL and standard ones)
        db_types_old_column_names, all_fields_dtypes, all_fields_description, all_fields_coding = self._get_db_columns_dtypes(
            csv_file)
        db_dtypes = {
            self._rename_columns(k): v
            for k, v in db_types_old_column_names.items()
        }
        self._fields_dtypes.update(all_fields_dtypes)

        data_sample = pd.read_csv(csv_file,
                                  index_col=0,
                                  header=0,
                                  nrows=1,
                                  dtype=str)
        data_sample = data_sample.rename(columns=self._rename_columns)

        # create fields table
        if csv_file_idx == 0:
            create_table('fields',
                         columns=[
                             'column_name text NOT NULL',
                             'table_name text',
                             'field_id text NOT NULL',
                             'description text',
                             'coding bigint',
                             'inst bigint',
                             'arr bigint',
                             'type text NOT NULL',
                         ],
                         constraints=['pk_fields PRIMARY KEY (column_name)'],
                         db_engine=self._get_db_engine(),
                         drop_if_exists=True)

        current_stop = 0
        for column_names_idx, column_names in self._loading_tmp[
                'chunked_column_names']:
            new_columns_names = [x[1] for x in column_names]

            fields_ids = []
            instances = []
            arrays = []
            fields_dtypes = []
            fields_descriptions = []
            fields_codings = []

            for col_name in new_columns_names:
                match = re.match(Pheno2SQL.RE_FIELD_INFO, col_name)

                fields_ids.append(match.group('field_id'))
                instances.append(int(match.group('instance')))
                arrays.append(int(match.group('array')))

                fields_dtypes.append(all_fields_dtypes[col_name])
                fields_descriptions.append(all_fields_description[col_name])

                if col_name in all_fields_coding:
                    fields_codings.append(all_fields_coding[col_name])
                else:
                    fields_codings.append(np.nan)

            # Create main table structure
            table_name = self._get_table_name(column_names_idx, csv_file_idx)
            logger.info('Table {} ({} columns)'.format(table_name,
                                                       len(new_columns_names)))
            data_sample.loc[[],
                            new_columns_names].to_sql(table_name,
                                                      self._get_db_engine(),
                                                      if_exists='replace',
                                                      dtype=db_dtypes)

            with self._get_db_engine().connect() as conn:
                conn.execute("""
                    ALTER TABLE {table_name} ADD CONSTRAINT pk_{table_name} PRIMARY KEY (eid);
                """.format(table_name=table_name))

            with self._get_db_engine().connect() as conn:
                conn.execute('DROP INDEX ix_{table_name}_eid;'.format(
                    table_name=table_name))

            # Create auxiliary table
            n_column_names = len(new_columns_names)
            current_start = current_stop
            current_stop = current_start + n_column_names

            aux_table = pd.DataFrame({
                'column_name': new_columns_names,
                'field_id': fields_ids,
                'inst': instances,
                'arr': arrays,
                'coding': fields_codings,
                'table_name': table_name,
                'type': fields_dtypes,
                'description': fields_descriptions
            })
            # aux_table = aux_table.set_index('column_name')
            aux_table.to_sql('fields',
                             self._get_db_engine(),
                             index=False,
                             if_exists='append')
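A generic sketch of the column-chunking step above; this is an assumption about what _chunker does, not the project's own implementation:

    def chunker(seq, size):
        # yield successive groups of `size` (old_name, new_name) pairs
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    all_columns = [('1-0.0', 'c1_0_0'), ('1-0.1', 'c1_0_1'), ('2-0.0', 'c2_0_0')]
    chunked = tuple(enumerate(chunker(all_columns, 2)))
    # -> ((0, [('1-0.0', 'c1_0_0'), ('1-0.1', 'c1_0_1')]), (1, [('2-0.0', 'c2_0_0')]))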