Example #1
    def _get_file_encoding(self, csv_file):
        csv_file_name = os.path.basename(csv_file)

        csv_file_full_path = os.path.realpath(csv_file)
        csv_file_dir = os.path.dirname(csv_file_full_path)

        encoding_file = os.path.join(csv_file_dir,
                                     self.csv_files_encoding_file)

        if os.path.isfile(encoding_file):
            enc_file = pd.read_table(encoding_file,
                                     index_col=0,
                                     header=None,
                                     delim_whitespace=True,
                                     squeeze=True)

            if not enc_file.index.is_unique:
                logger.error(
                    f'{self.csv_files_encoding_file} contains duplicated file names. Not using the file'
                )
                return self.csv_files_encoding

            if csv_file_name in enc_file.index:
                file_encoding = enc_file.at[csv_file_name]
                logger.info(
                    f'Encoding found in {self.csv_files_encoding_file}: {file_encoding}'
                )

                return file_encoding
        else:
            logger.warning(
                f'No {self.csv_files_encoding_file} found, assuming {self.csv_files_encoding}'
            )

        # fall back to the default encoding when the CSV file is not listed
        return self.csv_files_encoding
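For reference, the per-directory encodings file read above is a small whitespace-separated table mapping each CSV file name to its encoding, one file per line. A minimal sketch of its expected content (file names and encodings below are hypothetical):

    ukb_batch1.csv latin1
    ukb_batch2.csv utf-8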
Example #2
    def _load_bgen_samples(self):
        if self.bgen_sample_file is None or not os.path.isfile(
                self.bgen_sample_file):
            logger.warning(
                'BGEN sample file not set or does not exist: {}'.format(
                    self.bgen_sample_file))
            return

        logger.info('Loading BGEN sample file: {}'.format(
            self.bgen_sample_file))

        create_table(
            BGEN_SAMPLES_TABLE,
            columns=[
                'index bigint NOT NULL',
                'eid bigint NOT NULL',
            ],
            constraints=[
                'pk_{} PRIMARY KEY (index, eid)'.format(BGEN_SAMPLES_TABLE)
            ],
            db_engine=self._get_db_engine())

        samples_data = pd.read_table(self.bgen_sample_file,
                                     sep=' ',
                                     header=0,
                                     usecols=['ID_1', 'ID_2'],
                                     skiprows=[1])
        samples_data.set_index(np.arange(1, samples_data.shape[0] + 1),
                               inplace=True)
        samples_data.drop('ID_2', axis=1, inplace=True)
        samples_data.rename(columns={'ID_1': 'eid'}, inplace=True)

        samples_data.to_sql(BGEN_SAMPLES_TABLE,
                            self._get_db_engine(),
                            if_exists='append')
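The reader above expects a space-separated sample file with a header row containing at least the ID_1 and ID_2 columns, a second row (typically the column type codes of the Oxford .sample format) that is skipped via skiprows=[1], and then one row per sample. A minimal, hypothetical sketch:

    ID_1 ID_2 missing
    0 0 0
    1000101 1000101 0
    1000102 1000102 0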
Example #3
    def process_users_file(self):
        # process user/pass file
        yaml = YAML()

        if self.users_file is not None and os.path.isfile(self.users_file):
            users = self._read_yaml_file(self.users_file)

            # hash all non-hashed passwords
            new_users = {}
            for user, passw in users.items():
                if not ENCODED_PASSWORD_PATTERN.match(passw):
                    new_users[user] = generate_password_hash(
                        passw, method=self.method)
                else:
                    new_users[user] = passw

            with open(self.users_file, 'w') as f:
                yaml.dump(new_users, f)
        elif self.users_file is not None and not os.path.isfile(
                self.users_file):
            logger.warning(
                'Users file for authentication does not exist. No access will be allowed until the file is properly created.'
            )
        elif self.users_file is None:
            logger.warning(
                'No users file was specified, so HTTP Basic authentication is disabled.'
            )
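A minimal sketch of the users file this method processes, assuming a flat YAML mapping of user names to passwords (names and values are hypothetical). Any value that does not match ENCODED_PASSWORD_PATTERN is hashed in place with generate_password_hash, so plain-text entries only survive until the first processing pass; the hashed form shown is illustrative, since the exact format depends on the configured method:

    # users.yaml (hypothetical)
    admin: a-plain-text-password    # will be hashed in place
    reader: pbkdf2:sha256:...       # already hashed, left untouched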
Example #4
    def _load_events(self):
        if self.db_type == 'sqlite':
            logger.warning('Events loading is not supported in SQLite')
            return

        logger.info('Loading events table')

        # create table
        db_engine = self._get_db_engine()

        create_table(
            'events',
            columns=[
                'eid bigint NOT NULL',
                'field_id integer NOT NULL',
                'instance integer NOT NULL',
                'event text NOT NULL',
            ],
            constraints=[
                'pk_events PRIMARY KEY (eid, field_id, instance, event)'
            ],
            db_engine=db_engine)

        # insert data of categorical multiple fields
        categorical_variables = pd.read_sql(
            """
            select column_name, field_id, inst, table_name
            from fields
            where type = 'Categorical (multiple)'
        """, self._get_db_engine())

        for (field_id,
             field_instance), field_data in categorical_variables.groupby(
                 by=['field_id', 'inst']):
            sql_st = """
                insert into events (eid, field_id, instance, event)
                (
                    select distinct *
                    from (
                        select eid, {field_id}, {field_instance}, unnest(array[{field_columns}]) as event
                        from {tables}
                    ) t
                    where t.event is not null
                )
            """.format(
                field_id=field_id,
                field_instance=field_instance,
                field_columns=', '.join(
                    [cn for cn in set(field_data['column_name'])]),
                tables=self._create_joins(list(set(field_data['table_name'])),
                                          join_type='inner join'),
            )

            with db_engine.connect() as con:
                con.execute(sql_st)
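For a single 'Categorical (multiple)' field stored in one table, the loop above renders an INSERT roughly like the one below (field id, instance, column and table names are hypothetical; with several tables, _create_joins produces an inner-join expression instead of a single table name):

    insert into events (eid, field_id, instance, event)
    (
        select distinct *
        from (
            select eid, 20002, 0, unnest(array[c20002_0_0, c20002_0_1]) as event
            from ukb_pheno_0_00
        ) t
        where t.event is not null
    )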
Example #5
    def _create_constraints(self):
        if self.db_type == 'sqlite':
            logger.warning('Constraint/index creation is not supported for SQLite')
            return

        logger.info('Creating table constraints (indexes, primary keys, etc)')

        # bgen's samples table
        if self.bgen_sample_file is not None and os.path.isfile(
                self.bgen_sample_file):
            create_indexes(BGEN_SAMPLES_TABLE, ('index', 'eid'),
                           db_engine=self._get_db_engine())

        # fields table
        create_indexes(
            'fields',
            ('field_id', 'inst', 'arr', 'table_name', 'type', 'coding'),
            db_engine=self._get_db_engine())

        # events table
        create_indexes('events', ('eid', 'field_id', 'instance', 'event',
                                  ('field_id', 'event')),
                       db_engine=self._get_db_engine())
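Assuming create_indexes emits one index per entry, and a composite index for tuple entries, the events call above would correspond roughly to statements like these (index names are hypothetical):

    CREATE INDEX ix_events_eid ON events (eid);
    CREATE INDEX ix_events_field_id ON events (field_id);
    CREATE INDEX ix_events_instance ON events (instance);
    CREATE INDEX ix_events_event ON events (event);
    CREATE INDEX ix_events_field_id_event ON events (field_id, event);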
Example #6
    def load_withdrawals(self, withdrawals_dir):
        db_engine = self._get_db_engine()

        # create table (if not exists)
        with db_engine.connect() as conn:
            logger.info('Creating withdrawals table')
            conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {WITHDRAWALS_TABLE} (
                    eid bigint primary key
                )
            """)

            for input_file in glob(join(withdrawals_dir, '*.csv')):
                logger.info(f'Reading input file {input_file}')

                data = pd.read_csv(input_file, header=None)
                data = data.rename(columns={0: 'eid'})

                n_data_before = data.shape[0]
                data = data.drop_duplicates()
                if n_data_before != data.shape[0]:
                    logger.warning(
                        f'Duplicate IDs in file were removed ({n_data_before} vs {data.shape[0]})'
                    )

                # remove duplicates already in DB
                current_eids = pd.read_sql(
                    f'select eid from {WITHDRAWALS_TABLE}', conn)['eid']
                data = data.loc[~data['eid'].isin(current_eids)]

                logger.info(
                    f'Writing to SQL table: {data.shape[0]} new sample IDs')
                data.to_sql(WITHDRAWALS_TABLE,
                            db_engine,
                            index=False,
                            if_exists='append')
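Each withdrawals CSV read above is expected to contain a single, headerless column of participant IDs; a minimal sketch with hypothetical IDs:

    1000101
    1000102
    1000103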
Example #7
def _setup_genotype_path():
    genotype_path = environ.get(GENOTYPE_PATH_ENV, None)

    if genotype_path is None or not isdir(genotype_path):
        logger.warning(
            'The genotype directory is not set or does not exist. You have to mount it '
            'using the option "-v hostDir:{}" of "docker run"'.format(genotype_path))
        return

    bgen_files = [
        f for f in listdir(genotype_path) if f.lower().endswith('.bgen')
    ]
    if len(bgen_files) == 0:
        logger.warning('No .bgen files were found in the genotype directory')

    bgi_files = [
        f for f in listdir(genotype_path) if f.lower().endswith('.bgi')
    ]
    if len(bgi_files) == 0:
        logger.warning('No .bgi files were found in the genotype directory')
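A hedged sketch of the mount the warning above refers to, with a hypothetical host directory; the container path is whatever the GENOTYPE_PATH_ENV variable points to:

    docker run -v /home/me/ukb_genotype:/var/lib/genotype ...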
Example #8
    def __init__(self,
                 ukb_csvs,
                 db_uri,
                 bgen_sample_file=None,
                 table_prefix='ukb_pheno_',
                 n_columns_per_table=sys.maxsize,
                 loading_n_jobs=-1,
                 tmpdir=tempfile.mkdtemp(prefix='ukbrest'),
                 loading_chunksize=5000,
                 sql_chunksize=None,
                 delete_temp_csv=True):
        """
        :param ukb_csvs: files are loaded in the order they are specified
        :param db_uri:
        :param table_prefix:
        :param n_columns_per_table:
        :param loading_n_jobs:
        :param tmpdir:
        :param loading_chunksize: number of lines to read when loading CSV files to the SQL database.
        :param sql_chunksize: when an SQL query is submited to get phenotypes, this parameteres indicates the
        chunksize (number of rows).
        """

        super(Pheno2SQL, self).__init__(db_uri)

        if isinstance(ukb_csvs, (tuple, list)):
            self.ukb_csvs = ukb_csvs
        else:
            self.ukb_csvs = (ukb_csvs, )

        self.bgen_sample_file = bgen_sample_file

        parse_result = urlparse(self.db_uri)
        self.db_type = parse_result.scheme

        if self.db_type == 'sqlite':
            logger.warning('sqlite does not support parallel loading')
            self.db_file = self.db_uri.split(':///')[-1]
        elif self.db_type == 'postgresql':
            self.db_host = parse_result.hostname
            self.db_port = parse_result.port
            self.db_name = parse_result.path.split('/')[-1]
            self.db_user = parse_result.username
            self.db_pass = parse_result.password

        self.table_prefix = table_prefix
        self.n_columns_per_table = n_columns_per_table
        logger.debug("n_columns_per_table set to {}".format(
            self.n_columns_per_table))
        self.loading_n_jobs = loading_n_jobs
        self.tmpdir = tmpdir
        self.loading_chunksize = loading_chunksize

        self.sql_chunksize = sql_chunksize
        if self.sql_chunksize is None:
            logger.warning(
                '{} was not set, so no chunksize is used for SQL queries, which can '
                'lead to memory problems.'.format(SQL_CHUNKSIZE_ENV))

        self._fields_dtypes = {}

        # this is a temporary variable that holds information about loading
        self._loading_tmp = {}

        self.table_list = set()

        self.csv_files_encoding_file = 'encodings.txt'
        self.csv_files_encoding = 'utf-8'

        self.delete_temp_csv = delete_temp_csv
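A hedged usage sketch of this constructor; the file paths and connection string are hypothetical:

    p2sql = Pheno2SQL(
        ukb_csvs=['ukb_batch1.csv', 'ukb_batch2.csv'],
        db_uri='postgresql://user:pass@localhost:5432/ukb',
        bgen_sample_file='impv2.sample',
        n_columns_per_table=1500,
        loading_n_jobs=4,
    )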
Example #9
    def _create_tables_schema(self, csv_file, csv_file_idx):
        """
        Reads the data types of each data field in csv_file and creates the necessary database tables.
        :return:
        """
        logger.info('Creating database tables')

        tmp = pd.read_csv(csv_file,
                          index_col=0,
                          header=0,
                          nrows=1,
                          low_memory=False)
        old_columns = tmp.columns.tolist()
        del tmp
        new_columns = [self._rename_columns(x) for x in old_columns]

        # Remove columns that were previously loaded in other datasets
        if 'existing_col_names' not in self._loading_tmp:
            # dictionary with data-field as key and csv file as value
            columns_and_csv_files = {}
        else:
            columns_and_csv_files = self._loading_tmp['existing_col_names']

        old_columns_clean = []
        new_columns_clean = []

        for old_col_name, new_col_name in tuple(zip(old_columns, new_columns)):
            if new_col_name in columns_and_csv_files:
                corresponding_csv_file = columns_and_csv_files[new_col_name]
                logger.warning(
                    f'Column {new_col_name} already loaded from {corresponding_csv_file}. Skipping.'
                )
                continue

            columns_and_csv_files[new_col_name] = csv_file

            old_columns_clean.append(old_col_name)
            new_columns_clean.append(new_col_name)

        self._loading_tmp['existing_col_names'] = columns_and_csv_files

        # keep only unique columns (not loaded in previous files)
        old_columns = old_columns_clean
        new_columns = new_columns_clean
        all_columns = tuple(zip(old_columns, new_columns))

        # FIXME: check if self.n_columns_per_table is greater than the real number of columns
        self._loading_tmp['chunked_column_names'] = tuple(
            enumerate(self._chunker(all_columns, self.n_columns_per_table)))
        self._loading_tmp['chunked_table_column_names'] = \
            {self._get_table_name(col_idx, csv_file_idx): [col[1] for col in col_names]
             for col_idx, col_names in self._loading_tmp['chunked_column_names']}

        # get columns dtypes (for PostgreSQL and standard ones)
        db_types_old_column_names, all_fields_dtypes, all_fields_description, all_fields_coding = self._get_db_columns_dtypes(
            csv_file)
        db_dtypes = {
            self._rename_columns(k): v
            for k, v in db_types_old_column_names.items()
        }
        self._fields_dtypes.update(all_fields_dtypes)

        data_sample = pd.read_csv(csv_file,
                                  index_col=0,
                                  header=0,
                                  nrows=1,
                                  dtype=str)
        data_sample = data_sample.rename(columns=self._rename_columns)

        # create fields table
        if csv_file_idx == 0:
            create_table('fields',
                         columns=[
                             'column_name text NOT NULL',
                             'table_name text',
                             'field_id text NOT NULL',
                             'description text',
                             'coding bigint',
                             'inst bigint',
                             'arr bigint',
                             'type text NOT NULL',
                         ],
                         constraints=['pk_fields PRIMARY KEY (column_name)'],
                         db_engine=self._get_db_engine(),
                         drop_if_exists=True)

        current_stop = 0
        for column_names_idx, column_names in self._loading_tmp[
                'chunked_column_names']:
            new_columns_names = [x[1] for x in column_names]

            fields_ids = []
            instances = []
            arrays = []
            fields_dtypes = []
            fields_descriptions = []
            fields_codings = []

            for col_name in new_columns_names:
                match = re.match(Pheno2SQL.RE_FIELD_INFO, col_name)

                fields_ids.append(match.group('field_id'))
                instances.append(int(match.group('instance')))
                arrays.append(int(match.group('array')))

                fields_dtypes.append(all_fields_dtypes[col_name])
                fields_descriptions.append(all_fields_description[col_name])

                if col_name in all_fields_coding:
                    fields_codings.append(all_fields_coding[col_name])
                else:
                    fields_codings.append(np.nan)

            # Create main table structure
            table_name = self._get_table_name(column_names_idx, csv_file_idx)
            logger.info('Table {} ({} columns)'.format(table_name,
                                                       len(new_columns_names)))
            data_sample.loc[[],
                            new_columns_names].to_sql(table_name,
                                                      self._get_db_engine(),
                                                      if_exists='replace',
                                                      dtype=db_dtypes)

            with self._get_db_engine().connect() as conn:
                conn.execute("""
                    ALTER TABLE {table_name} ADD CONSTRAINT pk_{table_name} PRIMARY KEY (eid);
                """.format(table_name=table_name))

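            # pandas' to_sql (index=True) created an index named ix_<table>_eid for the eid
            # index column; it is redundant once the primary key above exists, so drop it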
            with self._get_db_engine().connect() as conn:
                conn.execute('DROP INDEX ix_{table_name}_eid;'.format(
                    table_name=table_name))

            # Create auxiliary table
            n_column_names = len(new_columns_names)
            current_start = current_stop
            current_stop = current_start + n_column_names

            aux_table = pd.DataFrame({
                'column_name': new_columns_names,
                'field_id': fields_ids,
                'inst': instances,
                'arr': arrays,
                'coding': fields_codings,
                'table_name': table_name,
                'type': fields_dtypes,
                'description': fields_descriptions
            })
            # aux_table = aux_table.set_index('column_name')
            aux_table.to_sql('fields',
                             self._get_db_engine(),
                             index=False,
                             if_exists='append')
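To illustrate the chunking above: with five renamed columns and n_columns_per_table=2, chunked_column_names pairs a chunk index with each group of (old, new) column names, and chunked_table_column_names maps each generated table name to its new column names. The names below are purely illustrative; the real ones come from _rename_columns and _get_table_name:

    chunked_column_names = (
        (0, [('21-0.0', 'c21_0_0'), ('21-1.0', 'c21_1_0')]),
        (1, [('21-2.0', 'c21_2_0'), ('31-0.0', 'c31_0_0')]),
        (2, [('34-0.0', 'c34_0_0')]),
    )
    chunked_table_column_names = {
        'ukb_pheno_0_00': ['c21_0_0', 'c21_1_0'],
        'ukb_pheno_1_00': ['c21_2_0', 'c31_0_0'],
        'ukb_pheno_2_00': ['c34_0_0'],
    }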