Example no. 1
    def test_fetch(self):
        ''' can fetch variants within a genomic region
        '''
        chrom, start, stop = '01', 5000, 50000
        bfile = BgenFile(self.folder / 'example.16bits.bgen')
        self.assertTrue(
            bfile._check_for_index(str(self.folder / 'example.16bits.bgen')))

        self.assertTrue(list(bfile.fetch('02')) == [])
Example no. 2
    def __init__(self,
                 bgen_file_path,
                 phenotype_file_path,
                 index_column_name,
                 covariate_file_path=None,
                 sample_file_path=None):
        """
        This software is meant to be run from the command line, so no detailed
        documentation is included here. Note that the code is deliberately
        verbose, in an attempt to minimize the number of function calls given
        that millions of calls are performed; it could likely be optimized
        further.
        """
        self.index_column_name = index_column_name

        assert os.path.isfile(bgen_file_path), "bgen file does not exist"

        if not os.path.isfile(bgen_file_path + '.bgi'):
            print(
                "Warning: no bgen index (.bgi) file found in the same directory "
                "as the bgen file. Initial reading of the bgen is MUCH faster "
                "with an index file.")

        if sample_file_path is not None:
            assert os.path.isfile(
                sample_file_path
            ), "sample file does not exist at provided location"
        else:
            # replace the '.bgen' extension with '.sample'; str.strip('bgen')
            # would strip matching characters from both ends, not the suffix
            sample_file_path = os.path.splitext(bgen_file_path)[0] + '.sample'
            if not os.path.isfile(sample_file_path):
                raise FileNotFoundError(
                    "No sample file at {0:s}. A sample file must be provided.".
                    format(sample_file_path))

        print(
            'Reading bgen file from {0:s} using sample file {1:s}. If these '
            'paths look wrong, kill the program.'.format(bgen_file_path,
                                                         sample_file_path))

        self.bgen_dataset = BgenFile(bgen_file_path,
                                     sample_path=sample_file_path)

        if os.path.isfile(phenotype_file_path):
            self.phenotype_dataset = pd.read_csv(phenotype_file_path,
                                                 sep='\t',
                                                 index_col=index_column_name)
        else:
            raise FileNotFoundError("No phenotype file at provided location")

        if covariate_file_path is not None:
            if os.path.isfile(covariate_file_path):
                self.covariate_dataset = pd.read_csv(
                    covariate_file_path, sep='\t', index_col=index_column_name)
            else:
                raise FileNotFoundError(
                    "No covariate file at provided location")
        else:
            print(
                "No covariate file provided. Will use phenotype file for covariates.\n",
                flush=True)
            self.covariate_dataset = self.phenotype_dataset
Example no. 3
    def test_fetch_whole_chrom(self):
        ''' fetching just with chrom gives all variants on chromosome
        '''
        chrom, start, stop = '01', 5000, 50000
        bfile = BgenFile(self.folder / 'example.16bits.bgen')

        # test fetching a whole chromosome
        sortkey = lambda x: (x.chrom, x.pos)
        for x, y in zip(sorted(bfile.fetch(chrom), key=sortkey),
                        sorted(self.gen_data, key=sortkey)):
            self.assertEqual(x.rsid, y.rsid)
            self.assertEqual(x.chrom, y.chrom)
            self.assertEqual(x.pos, y.pos)
Example no. 4
    def test_fetch_after_position(self):
        ''' fetching variants with chrom and start gives all variants after pos
        '''
        chrom, start, stop = '01', 5000, 50000
        bfile = BgenFile(self.folder / 'example.16bits.bgen')

        sortkey = lambda x: (x.chrom, x.pos)
        gen_vars = [
            x for x in sorted(self.gen_data, key=sortkey) if start <= x.pos
        ]
        for x, y in zip(sorted(bfile.fetch(chrom, start), key=sortkey),
                        gen_vars):
            self.assertEqual(x.rsid, y.rsid)
            self.assertEqual(x.chrom, y.chrom)
            self.assertEqual(x.pos, y.pos)
Example no. 5
    def test_context_handler_closed_bgen_length(self):
        ''' error raised if accessing length of exited BgenFile
        '''
        path = self.folder / 'example.16bits.zstd.bgen'
        with BgenFile(path) as bfile:
            self.assertTrue(len(bfile) > 0)

        with self.assertRaises(ValueError):
            len(bfile)
Example no. 6
    def test_context_handler_closed_bgen_slice(self):
        ''' error raised if slicing variant from exited BgenFile
        '''
        path = self.folder / 'example.16bits.zstd.bgen'
        with BgenFile(path) as bfile:
            self.assertTrue(len(bfile) > 0)

        with self.assertRaises(ValueError):
            var = bfile[0]
Example no. 7
    def test_context_handler_closed_bgen_at_position(self):
        ''' error raised if getting variant at position from exited BgenFile
        '''
        path = self.folder / 'example.16bits.zstd.bgen'
        with BgenFile(path) as bfile:
            self.assertTrue(len(bfile) > 0)

        with self.assertRaises(ValueError):
            var = bfile.at_position(100)
Example no. 8
    def test_context_handler_closed_bgen_with_rsid(self):
        ''' error raised if getting variant with rsid from exited BgenFile
        '''
        path = self.folder / 'example.16bits.zstd.bgen'
        with BgenFile(path) as bfile:
            self.assertTrue(len(bfile) > 0)

        with self.assertRaises(ValueError):
            var = bfile.with_rsid('rs111')
Example no. 9
 def test_zstd_compressed(self):
     ''' check we can parse genotypes from zstd compressed geno probabilities
     '''
     path = self.folder / 'example.16bits.zstd.bgen'
     bfile = BgenFile(str(path))
     for var, g in zip(bfile, self.gen_data):
         self.assertEqual(g, var)
         self.assertTrue(
             arrays_equal(g.probabilities, var.probabilities, 16))
Example no. 10
    def test_context_handler_closed_bgen_positions(self):
        ''' no positions available from exited BgenFile
        '''
        path = self.folder / 'example.16bits.zstd.bgen'
        with BgenFile(path) as bfile:
            self.assertTrue(len(bfile.positions()) > 0)

        with self.assertRaises(ValueError):
            bfile.positions()
Example no. 11
 def test_v11(self):
     ''' check we can open a bgen in v1.1 format, and parse genotypes correctly
     '''
     path = self.folder / 'example.v11.bgen'
     bfile = BgenFile(str(path))
     bit_depth = 16
     for var, g in zip(bfile, self.gen_data):
         self.assertEqual(g, var)
         self.assertTrue(
             arrays_equal(g.probabilities, var.probabilities, bit_depth))
Example no. 12
 def test_load_haplotypes_bgen(self):
     ''' check we can open a bgen with haplotypes, and parse genotypes correctly
     '''
     path = self.folder / 'haplotypes.bgen'
     bfile = BgenFile(str(path))
     bit_depth = 16
     for var, g in zip(bfile, self.haps_data):
         self.assertEqual(g, var)
         self.assertTrue(
             arrays_equal(g.probabilities, var.probabilities, bit_depth))
Example no. 13
 def test_load_example_genotypes_bit_depths(self):
     ''' check parsing genotypes from the example files with different bit depths
     '''
     for path in self.folder.glob('example.*bits.bgen'):
         bit_depth = int(path.stem.split('.')[1].strip('bits'))
         bfile = BgenFile(str(path))
         for var, g in zip(bfile, self.gen_data):
             self.assertEqual(g, var)
             self.assertTrue(
                 arrays_equal(g.probabilities, var.probabilities,
                              bit_depth))
Example no. 14
 def test_load_complex_file(self):
     ''' make sure we can open a complex bgen file
     '''
     path = self.folder / 'complex.bgen'
     bfile = BgenFile(path)
     bit_depth = 16
     for var, g in zip(bfile, self.vcf_data):
         self.assertEqual(g, var)
         self.assertTrue(
             arrays_equal(g.probabilities, var.probabilities, bit_depth))
         self.assertTrue(all(x == y for x, y in zip(g.ploidy, var.ploidy)))
Example no. 15
    def test_fetch_in_region(self):
        ''' fetching variants with chrom, start, stop gives variants in region
        '''
        chrom, start, stop = '01', 5000, 50000
        bfile = BgenFile(self.folder / 'example.16bits.bgen')

        sortkey = lambda x: (x.chrom, x.pos)
        gen_vars = [
            x for x in sorted(self.gen_data, key=sortkey)
            if start <= x.pos <= stop
        ]
        for x, y in zip(sorted(bfile.fetch(chrom, start, stop), key=sortkey),
                        gen_vars):
            self.assertEqual(x.rsid, y.rsid)
            self.assertEqual(x.chrom, y.chrom)
            self.assertEqual(x.pos, y.pos)

        # check that we don't get any variants in a region without any
        self.assertEqual(list(bfile.fetch(chrom, start * 1000, stop * 1000)),
                         [])
Example no. 16
    def test_load_complex_files(self):
        ''' make sure we can open the complex bgen files
        '''

        for path in self.folder.glob('complex.*.bgen'):
            bit_depth = int(path.stem.split('.')[1].strip('bits'))
            bfile = BgenFile(path)
            for var, g in zip(bfile, self.vcf_data):
                self.assertEqual(g, var)
                self.assertTrue(
                    arrays_equal(g.probabilities, var.probabilities,
                                 bit_depth))
Example no. 17
    def test_index_opens(self):
        ''' loads index when available
        '''
        bfile = BgenFile(self.folder / 'example.15bits.bgen')
        self.assertFalse(
            bfile._check_for_index(str(self.folder / 'example.15bits.bgen')))

        bfile = BgenFile(self.folder / 'example.16bits.bgen')
        self.assertTrue(
            bfile._check_for_index(str(self.folder / 'example.16bits.bgen')))
Example no. 18
 def test_pickling(self):
     ''' BgenVar should pickle and unpickle
     '''
     path = self.folder / 'example.16bits.zstd.bgen'
     with BgenFile(path) as bfile:
         for var in bfile:
             # this checks that we can pickle and unpickle a BgenVar
             pickled = pickle.dumps(var)
             unpickled = pickle.loads(pickled)
             
             # check attributes of the original and unpickled are identical
             self.assertEqual(var.varid, unpickled.varid)
             self.assertEqual(var.rsid, unpickled.rsid)
             self.assertEqual(var.chrom, unpickled.chrom)
             self.assertEqual(var.pos, unpickled.pos)
             self.assertEqual(var.alleles, unpickled.alleles)
Example no. 19
 def test_minor_allele_dosage_v11(self):
     ''' test we calculate minor_allele_dosage correctly with version 1 bgens
     '''
     path = self.folder / 'example.v11.bgen'
     with BgenFile(path) as bfile:
         for var in bfile:
             dose = var.minor_allele_dosage
             probs = var.probabilities
             
             # calculate dosages for each allele
             a1 = (probs[:, 0] * 2 + probs[:, 1])
             a2 = (probs[:, 2] * 2 + probs[:, 1])
             
             # get delta between var.minor_allele_dosage and values calculated here
             recomputed = a2 if np.nansum(a1) >= np.nansum(a2) else a1
             delta = abs(dose - recomputed)
             
             # check difference between the two estimates is sufficiently low
             self.assertTrue(np.nanmax(delta) < 7e-5)
Example no. 20
class QRankGWAS:
    def __init__(self,
                 bgen_file_path,
                 phenotype_file_path,
                 index_column_name,
                 covariate_file_path=None,
                 sample_file_path=None):
        """
        This software is meant to be run from the command line, so no detailed
        documentation is included here. Note that the code is deliberately
        verbose, in an attempt to minimize the number of function calls given
        that millions of calls are performed; it could likely be optimized
        further.
        """
        self.index_column_name = index_column_name

        assert os.path.isfile(bgen_file_path), "bgen file does not exist"

        if not os.path.isfile(bgen_file_path + '.bgi'):
            print(
                "Warning: no bgen index (.bgi) file found in the same directory "
                "as the bgen file. Initial reading of the bgen is MUCH faster "
                "with an index file.")

        if sample_file_path is not None:
            assert os.path.isfile(
                sample_file_path
            ), "sample file does not exist at provided location"
        else:
            # replace the '.bgen' extension with '.sample'; str.strip('bgen')
            # would strip matching characters from both ends, not the suffix
            sample_file_path = os.path.splitext(bgen_file_path)[0] + '.sample'
            if not os.path.isfile(sample_file_path):
                raise FileNotFoundError(
                    "No sample file at {0:s}. A sample file must be provided.".
                    format(sample_file_path))

        print(
            'Reading bgen file from {0:s} using sample file {1:s}. If these '
            'paths look wrong, kill the program.'.format(bgen_file_path,
                                                         sample_file_path))

        self.bgen_dataset = BgenFile(bgen_file_path,
                                     sample_path=sample_file_path)

        if os.path.isfile(phenotype_file_path):
            self.phenotype_dataset = pd.read_csv(phenotype_file_path,
                                                 sep='\t',
                                                 index_col=index_column_name)
        else:
            raise FileNotFoundError("No phenotype file at provided location")

        if covariate_file_path is not None:
            if os.path.isfile(covariate_file_path):
                self.covariate_dataset = pd.read_csv(
                    covariate_file_path, sep='\t', index_col=index_column_name)
            else:
                raise FileNotFoundError(
                    "No covariate file at provided location")
        else:
            print(
                "No covariate file provided. Will use phenotype file for covariates.\n",
                flush=True)
            self.covariate_dataset = self.phenotype_dataset

    def ConstructDataArrays(self,
                            phenotype_name,
                            covariate_cols=None,
                            included_subjects=None):
        if included_subjects is None:
            self.included_subjects = self.phenotype_dataset.index.to_numpy()
        else:
            self.included_subjects = np.intersect1d(
                included_subjects, self.phenotype_dataset.index.to_numpy())

        self.Y = self.phenotype_dataset.loc[self.included_subjects][[
            phenotype_name
        ]]
        if covariate_cols is not None:
            self.Z = self.covariate_dataset.loc[
                self.included_subjects][covariate_cols]
        else:
            self.Z = None

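        # map each included subject ID to its row index in the bgen sample
        # ordering, so dosage arrays can be subset to the analysed subjects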
        sample_vals_np = np.array(self.bgen_dataset.samples,
                                  dtype=self.included_subjects.dtype)
        sample_vals_np_sorted = np.sort(sample_vals_np)
        sample_vals_np_idx_sorted = np.argsort(sample_vals_np)
        conv_dict = dict(zip(sample_vals_np_sorted, sample_vals_np_idx_sorted))
        self.included_subjects_bgen_idx = np.array(
            [conv_dict[x] for x in self.included_subjects])

    def BuildQRank(self,
                   quantiles,
                   param_tol=1e-8,
                   max_fitting_iter=5000,
                   output_file_prefix=None,
                   randomize=False):
        self.qrank = QRank(self.Y,
                           covariate_matrix=self.Z,
                           quantiles=quantiles)
        self.qrank.FitNullModels(tol=param_tol,
                                 maxiter=max_fitting_iter,
                                 randomize=randomize)
        if output_file_prefix is not None:
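            # write the fitted null-model summaries, the serialized models, and
            # the per-quantile residuals alongside the output prefix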
            residual_table = pd.DataFrame(index=self.included_subjects)

            for q in quantiles:
                residual_table['q.{0:g}.residuals'.format(
                    q)] = self.qrank.null_model_results[q].resid
                with open(
                        output_file_prefix +
                        '.NullModelResults.{0:g}.txt'.format(q),
                        'w') as model_file:
                    model_file.write(
                        self.qrank.null_model_results[q].summary().as_text())
                    self.qrank.null_model_results[q].save(
                        output_file_prefix + '.NullModel.{0:g}.pth'.format(q))
            residual_table.to_csv(output_file_prefix +
                                  '.NullModelResiduals.txt',
                                  sep='\t')

    def PerformGWASAdditive(self,
                            output_file_prefix,
                            maf_cutoff,
                            print_freq=1000,
                            variant_list=None):

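        # three iteration strategies: iterate the whole bgen when no variant
        # list is given; for long lists, drop every other variant from the bgen
        # index up front; for short lists, look each rsid up on the fly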
        if variant_list is None:
            total_num_variants = len(self.bgen_dataset)
            variant_iterator = self.bgen_dataset
        elif len(variant_list) > 1000:
            print(
                "Adjusting bgen index to drop excluded variants from the analysis. This may take several minutes up front."
            )
            all_rsids = self.bgen_dataset.rsids()
            rsid_table = pd.DataFrame({
                'rsid': all_rsids,
                'bgen_index': np.arange(len(all_rsids))
            })
            rsid_table.set_index('rsid', inplace=True, drop=False)
            rsid_table = rsid_table.drop(
                np.intersect1d(variant_list, rsid_table.index.to_numpy()))
            self.bgen_dataset.drop_variants(rsid_table['bgen_index'].to_list())

            total_num_variants = len(self.bgen_dataset)

            def variant_iterator_func(num_var):
                for x in range(num_var):
                    yield self.bgen_dataset[x]

            variant_iterator = variant_iterator_func(total_num_variants)
        else:
            # small variant list: use a generator that loads each variant by
            # rsid on the fly
            def variant_iterator_func(v_list):
                for x in v_list:
                    yield self.bgen_dataset.with_rsid(x)

            variant_iterator = variant_iterator_func(variant_list)

        with open(output_file_prefix + '.Additive.QRankGWAS.txt',
                  'w',
                  buffering=io.DEFAULT_BUFFER_SIZE * 10) as output_file:
            output_file.write('snpid\trsid\tchrom\tpos\tmaj\tmin\tmaf\t')
            output_file.write(
                '\t'.join(['p.{0:g}'.format(x)
                           for x in self.qrank.quantiles]) + '\tp.comp\n')

            variant_counter = 0
            avg_elapsed_time = 0.0
            block_counter = 0
            start = time.time()

            for variant in variant_iterator:
                if len(variant.alleles) == 2:
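                    # restrict the dosage vector to the analysed subjects and
                    # compute the minor allele frequency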
                    dosage = variant.minor_allele_dosage[
                        self.included_subjects_bgen_idx]
                    maf = dosage.sum() / (dosage.shape[0] * 2.0)

                    if (maf >= maf_cutoff):
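                        # order the alleles so the major allele is written to
                        # the 'maj' column and the minor allele to 'min'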
                        if (variant.alleles.index(variant.minor_allele)
                                == 1) and (maf <= 0.5):
                            alleles = variant.alleles
                        else:
                            alleles = variant.alleles[::-1]

                        output_file.write('{0:s}'.format(variant.varid))
                        output_file.write('\t{0:s}'.format(variant.rsid))
                        output_file.write('\t{0:s}'.format(variant.chrom))
                        output_file.write('\t{0:d}'.format(variant.pos))
                        output_file.write('\t{0:s}'.format(alleles[0]))
                        output_file.write('\t{0:s}'.format(alleles[1]))
                        output_file.write('\t{0:.8g}'.format(maf))
                        pvals = self.qrank.ComputePValues(dosage)
                        for p in pvals[0]:
                            output_file.write('\t{0:.8g}'.format(p))
                        output_file.write('\t{0:.8g}'.format(pvals[1]))
                        output_file.write('\n')
                variant_counter += 1
                if (variant_counter) % print_freq == 0:
                    end = time.time()
                    block_counter += 1
                    elapsed = end - start
                    print(
                        'Processed {0:d} of {1:d} variants ({2:.1f}% of total)'
                        .format(
                            variant_counter, total_num_variants,
                            round((variant_counter / total_num_variants) *
                                  1000.0) / 10.0),
                        flush=True)
                    print('Elapsed time {0:.2f} sec'.format(elapsed))
                    avg_elapsed_time = ((avg_elapsed_time *
                                         (block_counter - 1) + elapsed) /
                                        block_counter)
                    print('Estimated Total Time Required: {0:.2f} hours\n'.
                          format(((total_num_variants / print_freq) *
                                  avg_elapsed_time) / 3600))
                    start = time.time()
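
A minimal driver sketch for the QRankGWAS class above; the file paths, the phenotype and covariate column names, the quantiles, and the output prefix are hypothetical placeholders rather than values taken from the original source:

    # hypothetical inputs; adjust paths and column names to the actual data
    gwas = QRankGWAS('/data/chr01.bgen',
                     '/data/phenotypes.txt',
                     'subject_id',
                     covariate_file_path='/data/covariates.txt')
    gwas.ConstructDataArrays('phenotype_of_interest',
                             covariate_cols=['age', 'sex'])
    gwas.BuildQRank([0.25, 0.5, 0.75], output_file_prefix='results/pheno')
    gwas.PerformGWASAdditive('results/pheno', maf_cutoff=0.001)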
Example no. 21
 def test_load_missing_file(self):
     ''' check passing in a path to a missing file fails gracefully
     '''
     with self.assertRaises(ValueError):
         BgenFile('/zzz/jjj/qqq.bgen')
Example no. 22
 def test_Path(self):
     ''' check we can open bgen files from Path objects
     '''
     path = self.folder / 'example.v11.bgen'
     bfile = BgenFile(path)