def testBoundariedMiddle(self):
        """Check recessive-encoded impute loci inside chr 4, bp 30734-33528.

        The parser output is walked in step with ``self.positions``; reference
        entries whose position lies outside the window are skipped and
        counted, and exactly 12 such entries are expected.
        """
        lower, upper = 30734, 33528
        BoundaryCheck.chrom = 4
        DataParser.boundary = BoundaryCheck(bp=[lower, upper])
        impute_parser.encoding = impute_parser.Encoding.Recessive
        PhenoCovar.sex_as_covariate = True
        pheno_covar = PhenoCovar()
        parser = impute_parser.Parser(
            self.fam_file, [self.gen_file, self.gen_file2], chroms=[3, 4])
        parser.load_family_details(pheno_covar)
        parser.load_genotypes()

        cursor = 0
        skipped = 0
        for snp in parser:
            # Step over reference positions that fall outside the window.
            while (self.positions[cursor] < lower
                   or self.positions[cursor] > upper):
                cursor += 1
                skipped += 1
            self.assertEqual(self.positions[cursor], snp.pos)
            for col, expected in enumerate(self.recessive_encoding[cursor]):
                self.assertAlmostEqual(expected,
                                       snp.genotype_data[col],
                                       places=3)
            cursor += 1
        self.assertEqual(12, skipped)
    def testTpedBounded(self):
        """Run the MV analysis on TPED data limited to chr 1, bp 2000-3000.

        Verifies the location of the first result and the p-values, betas
        and standard errors of the first two results.
        """
        BoundaryCheck.chrom = 1
        DataParser.boundary = BoundaryCheck(bp=[2000, 3000])
        pheno = PhenoCovar()
        dataset = TransposedPedigreeParser(self.tfam_filename,
                                           self.tped_filename)
        dataset.load_tfam(pheno)
        dataset.load_genotypes()

        results = list(mv_esteq.RunAnalysis(dataset, pheno))

        # First locus inside the window.
        hit = results[0]
        self.assertEqual(1, hit.chr)
        self.assertEqual(2000, hit.pos)
        self.assertAlmostEqual(0.57778118, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.02798537, hit.betas[1], places=6)
        self.assertAlmostEqual(0.033790691857, hit.beta_stderr[1], places=6)
        self.assertAlmostEqual(0.40755865, hit.beta_pvalues[1], places=6)
        self.assertAlmostEqual(0.03275892, hit.betas[3], places=6)
        self.assertAlmostEqual(0.0475661, hit.beta_stderr[3], places=6)
        self.assertAlmostEqual(0.49101013, hit.beta_pvalues[3], places=6)

        # Second locus inside the window.
        hit = results[1]
        self.assertAlmostEqual(0.44661276, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.01663975, hit.betas[1], places=6)
        self.assertAlmostEqual(0.03443300, hit.beta_stderr[1], places=6)
        self.assertAlmostEqual(0.62891811, hit.beta_pvalues[1], places=6)
        self.assertAlmostEqual(0.05712017, hit.betas[3], places=6)
        self.assertAlmostEqual(0.04783608, hit.beta_stderr[3], places=6)
        self.assertAlmostEqual(0.232446188, hit.beta_pvalues[3], places=6)
    def setUp(self):
        """Write the fixture files and snapshot the shared parser settings.

        Class-level settings on ``BoundaryCheck``, ``DataParser``,
        ``PhenoCovar`` and ``impute_parser`` are copied onto ``self``; a few
        of them are then overridden for the duration of the test.
        NOTE(review): presumably a matching tearDown restores the saved
        values -- it is not visible here.
        """
        # Allele strings are defined before WriteTestFiles() is called;
        # presumably that helper consumes them -- verify against its body.
        self.allele_1 = list("AAACCCGGGTCGTGTATACC")
        self.allele_2 = list("CGTGTATACCAAACCCGGGT")

        self.WriteTestFiles()

        self.phenotypes = [
            0.1, 0.4, 1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5, 0.9, 1.0
        ]
        self.sex = [1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1]

        # Snapshot the current class-level configuration.
        self.gen_ext = impute_parser.Parser.gen_ext
        self.chrom = BoundaryCheck.chrom
        self.boundary = DataParser.boundary
        # Replace the boundary with a default-constructed one (after saving).
        DataParser.boundary = BoundaryCheck()
        self.min_maf = DataParser.min_maf
        self.max_maf = DataParser.max_maf
        self.snp_miss_tol = DataParser.snp_miss_tol
        self.ind_miss_tol = DataParser.ind_miss_tol
        # Reset to an empty list; the previous value is not saved here.
        DataParser.ind_exclusions = []
        self.sex_as_covar = PhenoCovar.sex_as_covariate
        self.has_sex = DataParser.has_sex
        self.has_pheno = DataParser.has_pheno
        self.has_parents = DataParser.has_parents
        self.has_fid = DataParser.has_fid
        self.has_liability = DataParser.has_liability
        self.encoding = impute_parser.encoding
        # Save, then force, the compressed-pedigree flag.
        self.compression = DataParser.compressed_pedigree
        DataParser.compressed_pedigree = True
        # Save the info threshold, then set it to 0.0 for the test.
        self.parser_info_thresh = impute_parser.Parser.info_threshold
        impute_parser.Parser.info_threshold = 0.0
    def setUp(self):
        """Write the fixture files and snapshot the shared parser settings.

        Class-level settings on ``BoundaryCheck``, ``DataParser`` and
        ``PhenoCovar`` are copied onto ``self`` and the shared boundary is
        then replaced with a default-constructed ``BoundaryCheck``.
        NOTE(review): presumably a matching tearDown restores the saved
        values -- it is not visible here.
        """
        self.WriteTestFiles()

        self.phenotypes = [
            0.1, 0.4, 1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5, 0.9, 1.0
        ]
        self.sex = [1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1]

        # Snapshot the current class-level configuration.
        self.chrom = BoundaryCheck.chrom
        self.boundary = DataParser.boundary
        self.min_maf = DataParser.min_maf
        self.max_maf = DataParser.max_maf
        self.snp_miss_tol = DataParser.snp_miss_tol
        self.ind_miss_tol = DataParser.ind_miss_tol
        self.sex_as_covar = PhenoCovar.sex_as_covariate
        self.has_sex = DataParser.has_sex
        self.has_pheno = DataParser.has_pheno
        self.has_parents = DataParser.has_parents
        self.has_fid = DataParser.has_fid
        self.has_liability = DataParser.has_liability

        # the faked pheno/covariate non-missing
        # (a 12-element boolean array in which every entry is True)
        self.non_missing = numpy.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                        1]) == 1

        # Install a default-constructed boundary after the old one was saved.
        DataParser.boundary = BoundaryCheck()
Beispiel #5
0
    def setUp(self):
        """Write the fixture files and snapshot the shared parser settings.

        The pedigree file is read into ``self.ped`` as stripped lines, the
        class-level settings on ``BoundaryCheck``, ``DataParser`` and
        ``PhenoCovar`` are copied onto ``self``, and the standardizer is
        switched to ``NoStandardization`` after the current one is saved.
        NOTE(review): presumably a matching tearDown restores the saved
        values -- it is not visible here.
        """
        self.WriteTestFiles()

        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(self.ped_filename) as ped_file:
            self.ped = [line.strip() for line in ped_file]

        self.phenotypes = [0.1, 0.4, 1.0, 0.5, 0.9, 1.0,
                           0.1, 0.4, 1.0, 0.5, 0.9, 1.0]
        self.sex = [1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1]

        # Snapshot the current class-level configuration.
        self.chrom = BoundaryCheck.chrom
        self.boundary = DataParser.boundary
        DataParser.boundary = BoundaryCheck()
        self.min_maf = DataParser.min_maf
        self.max_maf = DataParser.max_maf
        self.snp_miss_tol = DataParser.snp_miss_tol
        self.ind_miss_tol = DataParser.ind_miss_tol
        # Individual filters are reset; their previous values are not saved.
        DataParser.ind_exclusions = []
        DataParser.ind_inclusions = []
        self.sex_as_covar = PhenoCovar.sex_as_covariate
        self.has_sex = DataParser.has_sex
        self.has_pheno = DataParser.has_pheno
        self.has_parents = DataParser.has_parents
        self.has_fid = DataParser.has_fid
        self.has_liability = DataParser.has_liability
        # Same value as ``sex_as_covar``; both attribute names are kept
        # because code outside this method may read either one.
        self.sex_as_covariate = PhenoCovar.sex_as_covariate
        self.standardizer = libgwas.standardizer.get_standardizer()
        libgwas.standardizer.set_standardizer(
            libgwas.standardizer.NoStandardization)
Beispiel #6
0
    def testBoundary(self):
        """Iterate the loci selected for chromosome 2 and compare to the map.

        Comparison starts at row 4 of ``self.nonmissing_mapdata`` and the row
        counter is expected to reach 7. Loci that raise ``TooMuchMissing`` or
        ``InvalidFrequency`` are skipped but still counted.
        """
        pheno_covar = PhenoCovar()
        DataParser.boundary = BoundaryCheck()
        BoundaryCheck.chrom = 2
        parser = Parser(self.nonmissing, data_field='GT')
        parser.init_subjects(pheno_covar)
        parser.load_genotypes()

        row = 4
        for snp in parser:
            # All-True filter: keep every individual.
            keep_all = numpy.ones(snp.missing_genotypes.shape[0]) == 1
            try:
                genodata = snp.get_genotype_data(keep_all)
                expected = self.nonmissing_mapdata[row]
                self.assertEqual(int(expected[0]), snp.chr)
                self.assertEqual(int(expected[1]), snp.pos)
                self.assertEqual(expected[2], snp.rsid)
                self.assertEqual(self.genotypes[row],
                                 list(genodata.genotypes))
            except (TooMuchMissing, InvalidFrequency):
                # Filtered loci are tolerated here; only the count matters.
                pass
            row += 1
        self.assertEqual(7, row)
    def testTpedAnalysis(self):
        """Run the MV analysis on the TPED data with chromosome 1 selected.

        A default-constructed boundary is used (no base-pair window), and the
        statistics of the first three results are verified.
        """
        BoundaryCheck.chrom = 1
        DataParser.boundary = BoundaryCheck()
        pheno = PhenoCovar()
        dataset = TransposedPedigreeParser(self.tfam_filename,
                                           self.tped_filename)
        dataset.load_tfam(pheno)
        dataset.load_genotypes()

        results = list(mv_esteq.RunAnalysis(dataset, pheno))

        # First result.
        hit = results[0]
        self.assertAlmostEqual(0.0034756155, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.1134684009, hit.betas[1], places=6)
        self.assertAlmostEqual(0.0337649965541, hit.beta_stderr[1], places=6)
        self.assertAlmostEqual(0.0007779211, hit.beta_pvalues[1], places=6)
        self.assertAlmostEqual(-0.0033479839, hit.betas[3], places=6)
        self.assertAlmostEqual(0.0492050029324, hit.beta_stderr[3], places=6)
        self.assertAlmostEqual(0.9457525716, hit.beta_pvalues[3], places=6)

        # Second result.
        hit = results[1]
        self.assertAlmostEqual(0.57778118, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.02798537, hit.betas[1], places=6)
        self.assertAlmostEqual(0.033790691857, hit.beta_stderr[1], places=6)
        self.assertAlmostEqual(0.40755865, hit.beta_pvalues[1], places=6)
        self.assertAlmostEqual(0.03275892, hit.betas[3], places=6)
        self.assertAlmostEqual(0.0475661, hit.beta_stderr[3], places=6)
        self.assertAlmostEqual(0.49101013, hit.beta_pvalues[3], places=6)

        # Third result.
        hit = results[2]
        self.assertAlmostEqual(0.44661276, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.01663975, hit.betas[1], places=6)
        self.assertAlmostEqual(0.03443300, hit.beta_stderr[1], places=6)
        self.assertAlmostEqual(0.62891811, hit.beta_pvalues[1], places=6)
        self.assertAlmostEqual(0.05712017, hit.betas[3], places=6)
        self.assertAlmostEqual(0.04783608, hit.beta_stderr[3], places=6)
        self.assertAlmostEqual(0.232446188, hit.beta_pvalues[3], places=6)
Beispiel #8
0
    def setUp(self):
        """Write the fixture files and snapshot the shared parser settings.

        Class-level settings on ``mach_parser``, ``BoundaryCheck``,
        ``DataParser`` and ``PhenoCovar`` are copied onto ``self``; the
        boundary, compressed-pedigree flag and standardizer are then
        overridden for the test.
        NOTE(review): presumably a matching tearDown restores the saved
        values -- it is not visible here.
        """
        # Allele strings are defined before WriteTestFiles() is called;
        # presumably that helper consumes them -- verify against its body.
        self.allele_1 = list("AAACCCGGGTCGTGTATACC")
        self.allele_2 = list("CGTGTATACCAAACCCGGGT")

        self.WriteTestFiles()

        self.phenotypes = [
            0.1, 0.4, 1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5, 0.9, 1.0
        ]
        self.sex = [1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1]
        # Snapshot the current class-level configuration.
        self.chrpos_encoding = mach_parser.Parser.chrpos_encoding
        self.dosage_ext = mach_parser.Parser.dosage_ext
        self.info_ext = mach_parser.Parser.info_ext
        self.chrom = BoundaryCheck.chrom
        self.boundary = DataParser.boundary
        # Replace the boundary with a default-constructed one (after saving).
        DataParser.boundary = BoundaryCheck()
        self.min_maf = DataParser.min_maf
        self.max_maf = DataParser.max_maf
        self.snp_miss_tol = DataParser.snp_miss_tol
        self.ind_miss_tol = DataParser.ind_miss_tol
        # Reset to an empty list; the previous value is not saved here.
        DataParser.ind_exclusions = []
        self.sex_as_covar = PhenoCovar.sex_as_covariate
        self.has_sex = DataParser.has_sex
        self.has_pheno = DataParser.has_pheno
        self.has_parents = DataParser.has_parents
        self.has_fid = DataParser.has_fid
        self.has_liability = DataParser.has_liability
        self.encoding = mach_parser.encoding
        # Save, then force, the compressed-pedigree flag.
        self.compression = DataParser.compressed_pedigree
        DataParser.compressed_pedigree = True
        #self.chunk_stride = mach_parser.chunk_stride
        # Save the active standardizer, then disable standardization.
        self.standardizer = libgwas.standardizer.get_standardizer()
        libgwas.standardizer.set_standardizer(
            libgwas.standardizer.NoStandardization)
Beispiel #9
0
 def testBoundaryX(self):
     """Selecting chromosome 'X' maps to 23 and accepts 'x', 'X' and 23."""
     BoundaryCheck.set_chrom('X')
     boundary = BoundaryCheck()
     self.assertEqual(23, BoundaryCheck.chrom)
     self.assertEqual('X', BoundaryCheck.chrom_name)
     accepted = (
         ('x', 100, "rs100"),
         ('X', 1000, "rs1000"),
         (23, 1000, "rs1000"),
     )
     for chrom, position, rsid in accepted:
         self.assertTrue(boundary.TestBoundary(chrom, position, rsid))
     self.assertFalse(boundary.TestBoundary('Y', 100, "rs100"))
Beispiel #10
0
 def testBoundaryChr10(self):
     """A 1-3 Mb window on 'chr10' accepts only chr 10 positions inside it."""
     BoundaryCheck.set_chrom('chr10')
     boundary = BoundaryCheck(mb=[1, 3])
     self.assertTrue(boundary.valid)
     self.assertFalse(boundary.NoExclusions())
     # Inside the window, both edges included.
     for position in (1000000, 1200000, 3000000):
         self.assertTrue(boundary.TestBoundary(10, position, ""))
     # Just outside either edge, or on a different chromosome.
     for chrom, position in ((10, 3000001), (10, 999999), (1, 1000500)):
         self.assertFalse(boundary.TestBoundary(chrom, position, ""))
Beispiel #11
0
 def testBoundaryInitMB(self):
     """A 1-3 Mb window on chromosome 10 accepts only positions inside it."""
     BoundaryCheck.chrom = 10
     boundary = BoundaryCheck(mb=[1, 3])
     self.assertTrue(boundary.valid)
     self.assertFalse(boundary.NoExclusions())
     # Inside the window, both edges included.
     for position in (1000000, 1200000, 3000000):
         self.assertTrue(boundary.TestBoundary(10, position, ""))
     # Just outside either edge, or on a different chromosome.
     for chrom, position in ((10, 3000001), (10, 999999), (1, 1000500)):
         self.assertFalse(boundary.TestBoundary(chrom, position, ""))
Beispiel #12
0
 def testBoundaryInitBP(self):
     """A bp window of 10000-500000 on chromosome 1, edges included."""
     BoundaryCheck.chrom = 1
     boundary = BoundaryCheck(bp=[10000, 500000])
     self.assertFalse(boundary.NoExclusions())
     self.assertTrue(boundary.valid)
     # (expected verdict, chromosome, position)
     cases = (
         (False, 1, 500),
         (True, 1, 10000),
         (True, 1, 500000),
         (True, 1, 250000),
         (False, 2, 250000),
         (False, 10, 10000),
     )
     for expected, chrom, position in cases:
         self.assertEqual(expected,
                          boundary.TestBoundary(chrom, position, ""))
Beispiel #13
0
 def testMapFileWithRegionAndSnpExclusion(self):
     """Load the map file with a chr 2, bp 0-10000 window and check the mask."""
     BoundaryCheck.chrom = 2
     DataParser.boundary = BoundaryCheck(bp=[0, 10000])
     parser = PedigreeParser(self.map_filename, self.ped_filename)
     parser.load_mapfile()
     self.assertEqual(2, len(parser.markers))
     first_column = parser.snp_mask[:, 0]
     self.assertEqual(7, len(first_column))
     self.assertEqual(2, parser.locus_count)
     # Masks are filters: 7 entries in total, 5 of which are set to 1,
     # leaving the 2 loci counted above unmasked.
     self.assertEqual(5, numpy.sum(parser.snp_mask[:, 0]))
     self.assertEqual(0, parser.snp_mask[4, 1])
     self.assertEqual(0, parser.snp_mask[5, 0])
Beispiel #14
0
    def setUp(self):
        """Resolve the bed/bim/fam fixture paths and snapshot shared settings.

        Paths are looked up with ``resource_filename`` inside the ``libgwas``
        package, expected genotype tables are defined, the bim files are read
        with ``libgwas.get_lines``, and the class-level settings on
        ``BoundaryCheck``, ``DataParser`` and ``PhenoCovar`` are copied onto
        ``self`` before the shared boundary is replaced.
        NOTE(review): presumably a matching tearDown restores the saved
        values -- it is not visible here.
        """
        # Fixture set that contains missing genotypes.
        self.missing = "tests/bedfiles/ped_missing"
        self.missing_bed = resource_filename("libgwas",
                                             "%s.bed" % (self.missing))
        self.missing_bim = resource_filename("libgwas",
                                             "%s.bim" % (self.missing))
        self.missing_fam = resource_filename("libgwas",
                                             "%s.fam" % (self.missing))
        # Expected genotypes: one row per locus, 12 values per row.
        self.genotypes = [[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
                          [1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1],
                          [0, 2, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0],
                          [0, 1, 2, 1, 1, 0, 0, 1, 2, 1, 1, 0],
                          [1, 2, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0],
                          [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                          [0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0]]
        # Fixture set without missing genotypes.
        self.nonmissing = "tests/bedfiles/ped_nomiss"
        self.nonmissing_bed = resource_filename("libgwas",
                                                "%s.bed" % (self.nonmissing))
        self.nonmissing_bim = resource_filename("libgwas",
                                                "%s.bim" % (self.nonmissing))
        self.nonmissing_fam = resource_filename("libgwas",
                                                "%s.fam" % (self.nonmissing))

        # Expected genotypes for the fixture with missing data. Row lengths
        # differ (2, 12 and 11 values); presumably missing calls are dropped
        # from each row -- TODO confirm against the fixture files.
        self.genotypes_w_missing = [[0, 1],
                                    [1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1],
                                    [0, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0],
                                    [0, 2, 1, 1, 0, 0, 1, 2, 1, 1, 0],
                                    [1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0],
                                    [1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                                    [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0]]
        # Bim contents, read with split=True.
        self.nonmissing_mapdata = libgwas.get_lines(self.nonmissing_bim,
                                                    split=True)
        self.missing_mapdata = libgwas.get_lines(self.missing_bim, split=True)

        self.phenotypes = [
            0.1, 0.4, 1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5, 0.9, 1.0
        ]
        self.sex = [1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1]

        # Snapshot the current class-level configuration.
        self.chrom = BoundaryCheck.chrom
        self.boundary = DataParser.boundary
        self.min_maf = DataParser.min_maf
        self.max_maf = DataParser.max_maf
        self.snp_miss_tol = DataParser.snp_miss_tol
        self.ind_miss_tol = DataParser.ind_miss_tol
        self.sex_as_covar = PhenoCovar.sex_as_covariate
        self.has_sex = DataParser.has_sex
        self.has_pheno = DataParser.has_pheno
        self.has_parents = DataParser.has_parents
        self.has_fid = DataParser.has_fid
        self.has_liability = DataParser.has_liability

        # Install a default-constructed boundary after the old one was saved.
        DataParser.boundary = BoundaryCheck()
Beispiel #15
0
 def testBoundaryInitKB(self):
     """A kb window of 20-50 on chromosome 5 spans bp 20000-50000 inclusive."""
     BoundaryCheck.chrom = 5
     boundary = BoundaryCheck(kb=[20, 50])
     self.assertFalse(boundary.NoExclusions())
     self.assertEqual(True, boundary.valid)
     # (expected verdict, chromosome, position)
     cases = (
         (False, 5, 15000),
         (True, 5, 20000),
         (True, 5, 30000),
         (True, 5, 50000),
         (False, 5, 50001),
         (False, 1, 25000),
         (False, 10, 20000),
     )
     for expected, chrom, position in cases:
         self.assertEqual(expected,
                          boundary.TestBoundary(chrom, position, ""))
Beispiel #16
0
 def testBoundaryInitBPWithExclusions(self):
     """Excluded rsids are rejected even when inside the bp window."""
     BoundaryCheck.chrom = 1
     boundary = BoundaryCheck(bp=[10000, 500000])
     boundary.LoadExclusions(["rs12345", "rs234567", "rs345678"])
     self.assertFalse(boundary.NoExclusions())
     self.assertTrue(boundary.valid)
     # (accepted?, chromosome, position, rsid)
     cases = (
         (False, 1, 500, ""),
         (True, 1, 10000, ""),
         (False, 1, 10010, "rs12345"),
         (True, 1, 24000, "rs9876"),
         (False, 1, 25000, "rs234567"),
         (True, 1, 250000, ""),
         (True, 1, 500000, ""),
         (False, 2, 250000, ""),
         (False, 10, 10000, ""),
     )
     for accepted, chrom, position, rsid in cases:
         check = self.assertTrue if accepted else self.assertFalse
         check(boundary.TestBoundary(chrom, position, rsid))
Beispiel #17
0
 def testBoundaryInitBPWithInclusions(self):
     """Explicitly loaded rsids are accepted even outside the bp window."""
     BoundaryCheck.chrom = 1
     boundary = BoundaryCheck(bp=[10000, 500000])
     boundary.LoadSNPs(["rs12345", "rs23456"])
     self.assertFalse(boundary.NoExclusions())
     self.assertTrue(boundary.valid)
     # Window behaviour without an rsid: (expected, chromosome, position)
     window_cases = (
         (False, 1, 500),
         (True, 1, 10000),
         (True, 1, 500000),
         (True, 1, 250000),
         (False, 2, 250000),
         (False, 10, 10000),
     )
     for expected, chrom, position in window_cases:
         self.assertEqual(expected,
                          boundary.TestBoundary(chrom, position, ""))
     # Loaded rsids pass beyond the window; an unlisted rsid does not.
     for position, rsid in ((1000000, "rs12345"), (1200000, "rs23456")):
         self.assertTrue(boundary.TestBoundary(1, position, rsid))
     self.assertFalse(boundary.TestBoundary(1, 1200011, "rs345678"))
Beispiel #18
0
    def testDefaultBoundaryInitialization(self):
        """A default BoundaryCheck is flagged invalid yet accepts every locus.

        With no chromosome selected (``chrom = -1``) and no bounds given, the
        object reports ``valid == False`` (which simplifies command line
        parsing), ``TestBoundary`` accepts any chromosome/position, and
        ``NoExclusions`` reports True.
        """
        # By default, it will identify as invalid, since it didn't find any
        # boundaries. This is just for simplifying command line parsing.
        BoundaryCheck.chrom = -1
        b = BoundaryCheck()
        self.assertEqual(False, b.valid)

        # At this point, any chromosome/position combination should be
        # accepted. The call result must be the first argument: in
        # assertTrue(True, x) the second argument is only the failure
        # message, so such an assertion can never fail.
        self.assertTrue(b.TestBoundary(1, 100, ""))
        self.assertTrue(b.TestBoundary(10, 1000000, ""))
        self.assertTrue(b.TestBoundary(25, 10000, ""))

        # We should test that our short circuit functionality works
        self.assertTrue(b.NoExclusions())
Beispiel #19
0
    def testBoundedMiddle(self):
        """Check MACH dosage loci inside chr 2, bp 1020-1137.

        The loci are compared to the reference tables starting at index 8,
        and the index is expected to reach 13 once the parser is exhausted.
        """
        mach_parser.Parser.chrpos_encoding = True
        BoundaryCheck.chrom = 2
        DataParser.boundary = BoundaryCheck(bp=[1020, 1137])
        PhenoCovar.sex_as_covariate = True
        pheno_covar = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file, self.gen_file2])
        parser.load_family_details(pheno_covar)
        parser.load_genotypes()

        cursor = 8
        for snp in parser:
            self.assertEqual(self.positions[cursor], snp.pos)
            for col, expected in enumerate(self.dosage_encoding[cursor]):
                self.assertAlmostEqual(expected,
                                       snp.genotype_data[col],
                                       places=3)
            cursor += 1
        self.assertEqual(13, cursor)
Beispiel #20
0
 def testBoundaryExceedPos(self):
     """``beyond_upper_bound`` is set only by a position past the upper edge.

     Uses a 1-3 Mb window on chromosome 10 and checks both the verdict and
     the flag after every call, including a later call below the window.
     """
     BoundaryCheck.chrom = 10
     boundary = BoundaryCheck(mb=[1, 3])
     self.assertTrue(boundary.valid)
     self.assertFalse(boundary.NoExclusions())
     # (position, accepted?, beyond_upper_bound afterwards?)
     steps = (
         (100, False, False),
         (1000000, True, False),
         (1200000, True, False),
         (3000000, True, False),
         (3000001, False, True),
         (999999, False, False),
     )
     for position, accepted, beyond in steps:
         check_verdict = self.assertTrue if accepted else self.assertFalse
         check_verdict(boundary.TestBoundary(10, position, ""))
         check_flag = self.assertTrue if beyond else self.assertFalse
         check_flag(boundary.beyond_upper_bound)
Beispiel #21
0
    def testPedBoundary(self):
        """Iterate the pedigree loci for chromosome 2 and compare to the map.

        Comparison starts at row 4 of the map file and the row counter is
        expected to reach 7 once the parser is exhausted.
        """
        pheno_covar = PhenoCovar()
        parser = PedigreeParser(self.map_filename, self.ped_filename)
        DataParser.boundary = BoundaryCheck()
        BoundaryCheck.chrom = 2
        parser.load_mapfile()
        parser.load_genotypes(pheno_covar)
        map_rows = get_lines(self.map_filename, split=True)

        row = 4
        for snp in parser:
            expected = map_rows[row]
            # Map columns used here: 0 = chromosome, 1 = rsid, 3 = position.
            self.assertEqual(int(expected[0]), snp.chr)
            self.assertEqual(int(expected[3]), snp.pos)
            self.assertEqual(expected[1], snp.rsid)
            self.assertEqual(self.genotypes[row], list(snp.genotype_data))
            row += 1
        self.assertEqual(7, row)
Beispiel #22
0
    def setUp(self):
        """Resolve the bed/bim/fam fixture paths and snapshot shared settings.

        Paths are looked up with ``resource_filename`` inside the ``tests``
        package; note that the "missing" and "nonmissing" names both point at
        the same ``bedfiles/analysis`` prefix. Class-level settings are then
        copied onto ``self`` before the standardizer and shared boundary are
        replaced.
        NOTE(review): presumably a matching tearDown restores the saved
        values -- it is not visible here.
        """
        self.missing = "bedfiles/analysis"
        self.missing_bed = resource_filename("tests",
                                             "%s.bed" % (self.missing))
        self.missing_bim = resource_filename("tests",
                                             "%s.bim" % (self.missing))
        self.missing_fam = resource_filename("tests",
                                             "%s.fam" % (self.missing))
        # Expected genotypes: one row per locus, 12 values per row.
        self.genotypes = [[2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2],
                          [1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1],
                          [2, 0, 1, 1, 2, 2, 2, 0, 1, 1, 2, 2],
                          [2, 1, 0, 1, 1, 2, 2, 1, 0, 1, 1, 2],
                          [1, 0, 2, 1, 2, 2, 1, 0, 2, 1, 2, 2],
                          [1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2],
                          [2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2]]

        self.nonmissing = "bedfiles/analysis"
        self.nonmissing_bed = resource_filename("tests",
                                                "%s.bed" % (self.nonmissing))
        self.nonmissing_bim = resource_filename("tests",
                                                "%s.bim" % (self.nonmissing))
        self.nonmissing_fam = resource_filename("tests",
                                                "%s.fam" % (self.nonmissing))
        # Expected genotypes with missing calls; -1 presumably marks a
        # missing genotype -- TODO confirm against the parser.
        self.genotypes_w_missing = [[
            2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1
        ], [1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1],
                                    [2, -1, 1, 1, 2, 2, 2, 0, 1, 1, 2, 2],
                                    [2, -1, 0, 1, 1, 2, 2, 1, 0, 1, 1, 2],
                                    [1, -1, 2, 1, 2, 2, 1, 0, 2, 1, 2, 2],
                                    [1, -1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2],
                                    [2, -1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2]]
        # Snapshot the current class-level configuration.
        self.chrom = BoundaryCheck.chrom
        self.boundary = DataParser.boundary
        self.min_maf = DataParser.min_maf
        self.max_maf = DataParser.max_maf
        self.snp_miss_tol = DataParser.snp_miss_tol
        self.ind_miss_tol = DataParser.ind_miss_tol
        self.sex_as_covar = PhenoCovar.sex_as_covariate
        # Save the active standardizer, then disable standardization.
        self.standardizer = libgwas.standardizer.get_standardizer()
        libgwas.standardizer.set_standardizer(
            libgwas.standardizer.NoStandardization)

        # Install a default-constructed boundary after the old one was saved.
        DataParser.boundary = BoundaryCheck()
Beispiel #23
0
    def testBedBounded(self):
        """Run the MV analysis on bed data limited to chr 1, bp 2000-3000.

        Verifies the location of the first result and the ``p_mvtest`` and
        ``lmpv`` values of the first two results.
        """
        BoundaryCheck.chrom = 1
        DataParser.boundary = BoundaryCheck(bp=[2000, 3000])
        pheno = PhenoCovar()
        parser = bed_parser.Parser(self.nonmissing_fam,
                                   self.nonmissing_bim,
                                   self.nonmissing_bed)
        parser.load_fam(pheno)
        parser.load_bim(map3=False)
        parser.load_genotypes()

        results = list(mv_esteq.RunAnalysis(parser, pheno))

        hit = results[0]
        self.assertEqual(1, hit.chr)
        self.assertEqual(2000, hit.pos)
        self.assertAlmostEqual(0.5777811, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.4221215, hit.lmpv, places=6)

        hit = results[1]
        self.assertAlmostEqual(0.4466128, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.6138634, hit.lmpv, places=6)
    def testBoundariedUpper(self):
        """Check recessive-encoded impute loci inside chr 3, bp 21000-50000.

        The loci are compared to the reference tables starting at index 6.
        NOTE(review): the number of loci yielded is not asserted, so an
        empty parser would pass -- confirm whether that is intended.
        """
        BoundaryCheck.chrom = 3
        DataParser.boundary = BoundaryCheck(bp=[21000, 50000])
        impute_parser.encoding = impute_parser.Encoding.Recessive
        PhenoCovar.sex_as_covariate = True
        pheno_covar = PhenoCovar()
        parser = impute_parser.Parser(
            self.fam_file, [self.gen_file, self.gen_file2], chroms=[3, 4])
        parser.load_family_details(pheno_covar)
        parser.load_genotypes()

        for cursor, snp in enumerate(parser, 6):
            self.assertEqual(self.positions[cursor], snp.pos)
            for col, expected in enumerate(self.recessive_encoding[cursor]):
                self.assertAlmostEqual(expected,
                                       snp.genotype_data[col],
                                       places=3)
    def testPedBoundaryTPed(self):
        """Iterate the TPED loci for chromosome 2 and compare to the file.

        First the loci returned by ``get_loci`` are checked against rows 4
        onward of the TPED file and the locus count is expected to be 3. Then
        the parser itself is iterated; loci raising ``TooMuchMissing`` or
        ``InvalidFrequency`` are skipped but still counted, and the row
        counter is expected to reach 7.
        """
        pheno_covar = PhenoCovar()
        parser = TransposedPedigreeParser(self.tfam_filename,
                                          self.tped_filename)
        DataParser.boundary = BoundaryCheck()
        BoundaryCheck.chrom = 2
        parser.load_tfam(pheno_covar)
        parser.load_genotypes()

        tped_rows = get_lines(self.tped_filename, split=True)

        # Pass 1: locus metadata only.
        for row, locus in enumerate(parser.get_loci(), 4):
            self.assertEqual(int(tped_rows[row][0]), locus.chr)
            self.assertEqual(int(tped_rows[row][3]), locus.pos)
        self.assertEqual(3, parser.locus_count)

        # Pass 2: full genotype data.
        row = 4
        for snp in parser:
            # All-True filter: keep every individual.
            keep_all = numpy.ones(snp.missing_genotypes.shape[0]) == 1
            try:
                genodata = snp.get_genotype_data(keep_all)
                expected = tped_rows[row]
                self.assertEqual(int(expected[0]), snp.chr)
                self.assertEqual(int(expected[3]), snp.pos)
                self.assertEqual(expected[1], snp.rsid)
                self.assertAlmostEqual(self.hetero_freq_tped[row],
                                       genodata.hetero_freq,
                                       places=4)
                self.assertEqual(self.genotypes[row],
                                 list(genodata.genotypes))
            except (TooMuchMissing, InvalidFrequency):
                # Filtered loci are tolerated here; only the count matters.
                pass
            row += 1
        self.assertEqual(7, row)
    def testTPedAnalysisCov(self):
        """Run the MV analysis on the TPED data with sex as a covariate.

        A default-constructed boundary is used, and the statistics of the
        first two results are verified.
        """
        PhenoCovar.sex_as_covariate = True
        DataParser.boundary = BoundaryCheck()
        pheno = PhenoCovar()
        dataset = TransposedPedigreeParser(self.tfam_filename,
                                           self.tped_filename)
        dataset.load_tfam(pheno)
        dataset.load_genotypes()

        results = list(mv_esteq.RunAnalysis(dataset, pheno))

        # First result.
        hit = results[0]
        self.assertAlmostEqual(0.00342380, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.11362883, hit.betas[1], places=6)
        self.assertAlmostEqual(0.0337610, hit.beta_stderr[1], places=6)
        self.assertAlmostEqual(0.00076356, hit.beta_pvalues[1], places=6)
        self.assertAlmostEqual(0.01911090, hit.betas[3], places=6)
        self.assertAlmostEqual(0.10143178, hit.beta_stderr[3], places=6)
        self.assertAlmostEqual(0.8505542, hit.beta_pvalues[3], places=6)

        # Second result.
        hit = results[1]
        self.assertAlmostEqual(0.584950593047, hit.p_mvtest, places=6)
        self.assertAlmostEqual(0.0276543736525, hit.betas[1], places=6)
        self.assertAlmostEqual(0.03383588, hit.beta_stderr[1], places=6)
        self.assertAlmostEqual(0.413751829881, hit.beta_pvalues[1], places=6)
Beispiel #27
0
    def testPedBoundaryBed(self):
        """Iterate the bed loci for chromosome 2 and compare to the bim data.

        For every locus and every phenotype the genotype data is fetched with
        that phenotype's non-missing filter. Loci raising ``TooMuchMissing``
        or ``InvalidFrequency`` are skipped. Three valid loci are expected
        and the row counter, which starts at 4, is expected to reach 7.
        """
        pheno_covar = PhenoCovar()
        DataParser.boundary = BoundaryCheck()
        BoundaryCheck.chrom = 2
        parser = bed_parser.Parser(self.nonmissing_fam,
                                   self.nonmissing_bim,
                                   self.nonmissing_bed)
        parser.load_fam(pheno_covar)
        parser.load_bim(map3=False)
        parser.load_genotypes()

        bim_rows = self.nonmissing_mapdata

        row = 4
        passed = 0
        for snp in parser:
            for test_vars in pheno_covar:
                _, _, nonmissing = test_vars.get_variables(
                    snp.missing_genotypes)

                try:
                    genodata = snp.get_genotype_data(nonmissing)
                    expected = bim_rows[row]
                    self.assertEqual(int(expected[0]), snp.chr)
                    self.assertEqual(int(expected[3]), snp.pos)
                    self.assertEqual(expected[1], snp.rsid)
                    self.assertEqual(self.genotypes[row],
                                     list(genodata.genotypes))
                    passed += 1
                except (TooMuchMissing, InvalidFrequency):
                    # Filtered loci are tolerated; they just are not counted.
                    pass

            row += 1
        self.assertEqual(3, passed)
        self.assertEqual(7, row)
Beispiel #28
0
    def LoadCmdLine(self, args=sys.argv[1:]):
        """Parse user arguments using argparse and set up components"""
        parser = argparse.ArgumentParser(description="MV Test: " + __version__,
                                         epilog="""
mvtest.py is uses many of the same arguments as plink, but there are a few
differences, so please consider the list above carefully.
        """)

        parser.add_argument("-v",
                            action='store_true',
                            help="Print version number")
        parser.add_argument(
            "--vall",
            action='store_true',
            help="Print version number along with each dependency")

        parser.add_argument("--chr",
                            type=int,
                            default=-1,
                            metavar="N",
                            help="Select Chromosome")
        parser.add_argument(
            "--snps",
            type=str,
            default="",
            help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6")
        parser.add_argument("--from-bp",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-bp",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument("--from-kb",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-kb",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument("--from-mb",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-mb",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument(
            "--exclude",
            type=str,
            default="",
            help="Comma-delimited list of rsids to be excluded")

        # For now, --keep is not implemented, since we don't have any real meaningful need to restrict
        # the analysis to selected individuals. PLINK does, but we don't do the QC stuff they do.
        parser.add_argument(
            "--keep",
            type=str,
            default="",
            help="Comma-delimited list of individuals to be analyzed")
        parser.add_argument(
            "--remove",
            type=str,
            default="",
            help=
            "Comma-delimited list of individuals to be removed from analysis")

        parser.add_argument("--file",
                            type=str,
                            help="Prefix for .ped and .map files")
        parser.add_argument("--ped",
                            type=argparse.FileType('r'),
                            help="PLINK compatible .ped file")
        parser.add_argument("--map",
                            type=argparse.FileType('r'),
                            help="PLINK compatible .map file")
        parser.add_argument("--map3",
                            action='store_true',
                            help="MAP file has only 3 columns")
        parser.add_argument("--no-sex",
                            action='store_true',
                            help="Pedigree file doesn't have column 5 (sex)")
        parser.add_argument(
            "--no-parents",
            action="store_true",
            help="Pedigree file doesn't have columns 3 and 4 (parents)")
        parser.add_argument(
            "--no-fid",
            action="store_true",
            help="Pedigree file doesn't have column 1 (family ID)")
        parser.add_argument(
            "--no-pheno",
            action="store_true",
            help="Pedigree file doesn't have column 6 (phenotype")
        parser.add_argument("--liability",
                            action="store_true",
                            help="Pedigree file has column 7 (liability)")

        parser.add_argument("--bfile",
                            type=str,
                            help="Prefix for .bed, .bim and .fam files")
        parser.add_argument("--bed",
                            type=argparse.FileType('r'),
                            help="Binary Ped file (.bed)")
        parser.add_argument("--bim",
                            type=argparse.FileType('r'),
                            help="Binary ped marker file (.bim)")
        parser.add_argument("--fam",
                            type=argparse.FileType('r'),
                            help="Binary ped family file (.fam)")

        parser.add_argument("--tfile",
                            type=str,
                            help="Prefix for .tped and .tfam files")
        parser.add_argument("--tped",
                            type=argparse.FileType('r'),
                            help="Transposed Pedigree file (.tped)")
        parser.add_argument("--tfam",
                            type=argparse.FileType('r'),
                            help="Transposed pedigre Family file (.tfam)")
        parser.add_argument(
            "--compressed",
            action="store_true",
            help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)")

        parser.add_argument(
            "--impute",
            type=argparse.FileType('r'),
            help="File containing list of impute output for analysis")
        parser.add_argument(
            "--impute-fam",
            type=argparse.FileType('r'),
            help="File containing family details for impute data")
        parser.add_argument(
            "--impute-offset",
            type=int,
            default=-1,
            help="Impute file index (1 based) to begin analysis")
        parser.add_argument(
            "--impute-count",
            type=int,
            default=-1,
            help="Number of impute files to process (for this node)")
        parser.add_argument(
            "--impute-uncompressed",
            action="store_true",
            help="Indicate that the impute input is not gzipped, but plain text"
        )
        parser.add_argument(
            "--impute-encoding",
            type=str,
            choices=['additive', 'dominant', 'recessive', 'genotype'],
            default='additive',
            help='Genetic model to be used')
        parser.add_argument("--impute-info-ext",
                            type=str,
                            default='info',
                            help="Portion of filename denotes info filename")
        parser.add_argument("--impute-gen-ext",
                            type=str,
                            default='gen.gz',
                            help="Portion of filename that denotes gen file")
        parser.add_argument(
            "--impute-info-thresh",
            type=float,
            default=0.4,
            help="Threshold for filtering imputed SNPs with poor 'info' values"
        )

        parser.add_argument(
            "--mach",
            type=argparse.FileType('r'),
            help="File containing list of MACH output for analysis")
        parser.add_argument("--mach-offset",
                            type=int,
                            default=-1,
                            help="Mach file index (1 based) to begin analysis")
        parser.add_argument(
            "--mach-count",
            type=int,
            default=-1,
            help="Number of mach files to process (for this node)")
        parser.add_argument("--mach-uncompressed",
                            action="store_true",
                            help="Indicate that the mach input is not gzipped")
        parser.add_argument(
            "--mach-chunk-size",
            type=int,
            default=100000,
            help=
            "Max number of loci to load at once (higher increases memory requirements with some speed benefits)"
        )
        parser.add_argument("--mach-info-ext",
                            type=str,
                            default="info.gz",
                            help="Portion of filename denotes info filenames")
        parser.add_argument("--mach-dose-ext",
                            type=str,
                            default="dose.gz",
                            help="Portion of filename that denotes dose files")
        parser.add_argument("--mach-min-rsquared",
                            type=float,
                            default=0.3,
                            help="Filter out loci with RSquared < this value")
        parser.add_argument(
            "--mach-chrpos",
            action="store_true",
            help=
            "When true, first col in .info file must be chr:pos (additional pieces allowed)"
        )

        parser.add_argument("--pheno",
                            type=argparse.FileType('r'),
                            help="File containing phenotypes")
        parser.add_argument("--sample-pheno",
                            type=argparse.FileType('r'),
                            help="(Mach) Sample file containing phenotypes")
        parser.add_argument(
            "--mphenos",
            type=str,
            default="",
            help=
            "Column number(s) for phenotype to be analyzed if number of columns > 1"
        )
        parser.add_argument(
            "--pheno-names",
            type=str,
            default="",
            help=
            "Name for phenotype(s) to be analyzed (must be in --pheno file)")
        parser.add_argument("--all-pheno",
                            action="store_true",
                            help="Analyze all columns from the phenotype file")
        #parser.add_argument("--all-pheno", action='store_true', help="Analyze each phenotype")

        parser.add_argument("--covar",
                            type=argparse.FileType('r'),
                            help="File containing covariates")
        parser.add_argument("--sample-covar",
                            type=argparse.FileType('r'),
                            help="(Mach) Sample file containing covariates")
        parser.add_argument("--covar-numbers",
                            type=str,
                            default="",
                            help="Comma-separated list of covariate indices")
        parser.add_argument("--covar-names",
                            type=str,
                            default="",
                            help="Comma-separated list of covariate names")
        parser.add_argument(
            "--sex",
            action='store_true',
            help="Use sex from the pedigree file as a covariate")
        parser.add_argument("--missing-phenotype",
                            type=float,
                            default=-9.0,
                            help="Encoding for missing phenotypes")

        parser.add_argument("--maf",
                            type=float,
                            default=0.0,
                            help="Minimum MAF allowed for analysis")
        parser.add_argument("--max-maf",
                            type=float,
                            default=1.0,
                            help="MAX MAF allowed for analysis")
        parser.add_argument("--geno",
                            type=float,
                            default=1.0,
                            help="MAX per-SNP missing for analysis")
        parser.add_argument("--mind",
                            type=float,
                            default=1.0,
                            help="MAX per-person missing")

        parser.add_argument("--verbose",
                            action='store_true',
                            help="Output additional data details")

        parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False)
        args = parser.parse_args(args)

        # Report version, if requested, and exit
        if args.v:
            print >> sys.stderr, "%s: %s" % (os.path.basename(__file__),
                                             __version__)
            sys.exit(0)

        if args.vall:
            print >> sys.stderr, "%s: %s" % (os.path.basename(__file__),
                                             __version__)
            print >> sys.stderr, "%s: %s" % (os.path.dirname(
                libgwas.__file__), libgwas.__version__)
            print >> sys.stderr, "%s: %s" % (os.path.dirname(
                scipy.__file__), scipy.__version__)
            print >> sys.stderr, "%s: %s" % (os.path.dirname(
                numpy.__file__), numpy.__version__)
            sys.exit(0)

        ###############################################################################################################
        # Here we deal with the various ways we filter SNPs in and out of analysis
        # We might handle MACH files differently. We'll default the chromosome
        # to be "NA" which is how those can be returned.
        if args.mach is None or args.mach_chrpos:
            BoundaryCheck.chrom = args.chr
        else:
            if args.chr != -1:
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))
            BoundaryCheck.chrom = "NA"
        snps = args.snps.split(",")
        try:
            b = BoundaryCheck(bp=(args.from_bp, args.to_bp),
                              kb=(args.from_kb, args.to_kb),
                              mb=(args.from_mb, args.to_mb))
        except InvalidBoundarySpec, e:
            print >> sys.stderr, "Invalid boundary spec associated: %s" % (
                e.malformed_boundary)
            sys.exit(1)
# Example #29
# (scraper metadata: 0)
    def LoadCmdLine(self, args=sys.argv[1:]):
        """Parse user arguments using argparse and set up components"""
        parser = argparse.ArgumentParser(description="MV Test: " + __version__,
                                         epilog="""
mvtest.py is uses many of the same arguments as plink, but there are a few
differences, so please consider the list above carefully.
        """)

        parser.add_argument("-v",
                            action='store_true',
                            help="Print version number")
        parser.add_argument(
            "--vall",
            action='store_true',
            help="Print version number along with each dependency")

        parser.add_argument("--chr",
                            type=int,
                            default=-1,
                            metavar="N",
                            help="Select Chromosome")
        parser.add_argument(
            "--snps",
            type=str,
            default="",
            help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6")
        parser.add_argument("--from-bp",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-bp",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument("--from-kb",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-kb",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument("--from-mb",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-mb",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument(
            "--exclude",
            type=str,
            default="",
            help="Comma-delimited list of rsids to be excluded")

        # For now, --keep is not implemented, since we don't have any real meaningful need to restrict
        # the analysis to selected individuals. PLINK does, but we don't do the QC stuff they do.
        parser.add_argument(
            "--keep",
            type=str,
            default="",
            help="Comma-delimited list of individuals to be analyzed")
        parser.add_argument(
            "--remove",
            type=str,
            default="",
            help=
            "Comma-delimited list of individuals to be removed from analysis")

        parser.add_argument("--file",
                            type=str,
                            help="Prefix for .ped and .map files")
        parser.add_argument("--ped",
                            type=argparse.FileType('r'),
                            help="PLINK compatible .ped file")
        parser.add_argument("--map",
                            type=argparse.FileType('r'),
                            help="PLINK compatible .map file")
        parser.add_argument("--map3",
                            action='store_true',
                            help="MAP file has only 3 columns")
        parser.add_argument("--no-sex",
                            action='store_true',
                            help="Pedigree file doesn't have column 5 (sex)")
        parser.add_argument(
            "--no-parents",
            action="store_true",
            help="Pedigree file doesn't have columns 3 and 4 (parents)")
        parser.add_argument(
            "--no-fid",
            action="store_true",
            help="Pedigree file doesn't have column 1 (family ID)")
        parser.add_argument(
            "--no-pheno",
            action="store_true",
            help="Pedigree file doesn't have column 6 (phenotype")
        parser.add_argument("--liability",
                            action="store_true",
                            help="Pedigree file has column 7 (liability)")

        parser.add_argument("--bfile",
                            type=str,
                            help="Prefix for .bed, .bim and .fam files")
        parser.add_argument("--bed",
                            type=argparse.FileType('r'),
                            help="Binary Ped file (.bed)")
        parser.add_argument("--bim",
                            type=argparse.FileType('r'),
                            help="Binary ped marker file (.bim)")
        parser.add_argument("--fam",
                            type=argparse.FileType('r'),
                            help="Binary ped family file (.fam)")

        parser.add_argument("--tfile",
                            type=str,
                            help="Prefix for .tped and .tfam files")
        parser.add_argument("--tped",
                            type=argparse.FileType('r'),
                            help="Transposed Pedigree file (.tped)")
        parser.add_argument("--tfam",
                            type=argparse.FileType('r'),
                            help="Transposed pedigre Family file (.tfam)")
        parser.add_argument(
            "--compressed",
            action="store_true",
            help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)")

        parser.add_argument(
            "--impute",
            type=argparse.FileType('r'),
            help="File containing list of impute output for analysis")
        parser.add_argument(
            "--impute-fam",
            type=argparse.FileType('r'),
            help="File containing family details for impute data")
        parser.add_argument(
            "--impute-offset",
            type=int,
            default=-1,
            help="Impute file index (1 based) to begin analysis")
        parser.add_argument(
            "--impute-count",
            type=int,
            default=-1,
            help="Number of impute files to process (for this node)")
        parser.add_argument(
            "--impute-uncompressed",
            action="store_true",
            help="Indicate that the impute input is not gzipped, but plain text"
        )
        parser.add_argument(
            "--impute-encoding",
            type=str,
            choices=['additive', 'dominant', 'recessive', 'genotype'],
            default='additive',
            help='Genetic model to be used')
        parser.add_argument("--impute-info-ext",
                            type=str,
                            default='info',
                            help="Portion of filename denotes info filename")
        parser.add_argument("--impute-gen-ext",
                            type=str,
                            default='gen.gz',
                            help="Portion of filename that denotes gen file")
        parser.add_argument(
            "--impute-info-thresh",
            type=float,
            default=0.4,
            help="Threshold for filtering imputed SNPs with poor 'info' values"
        )

        parser.add_argument(
            "--mach",
            type=argparse.FileType('r'),
            help="File containing list of MACH output for analysis")
        parser.add_argument("--mach-offset",
                            type=int,
                            default=-1,
                            help="Mach file index (1 based) to begin analysis")
        parser.add_argument(
            "--mach-count",
            type=int,
            default=-1,
            help="Number of mach files to process (for this node)")
        parser.add_argument("--mach-uncompressed",
                            action="store_true",
                            help="Indicate that the mach input is not gzipped")
        parser.add_argument(
            "--mach-chunk-size",
            type=int,
            default=100000,
            help=
            "Max number of loci to load at once (higher increases memory requirements with some speed benefits)"
        )
        parser.add_argument("--mach-info-ext",
                            type=str,
                            default="info.gz",
                            help="Portion of filename denotes info filenames")
        parser.add_argument("--mach-dose-ext",
                            type=str,
                            default="dose.gz",
                            help="Portion of filename that denotes dose files")
        parser.add_argument("--mach-min-rsquared",
                            type=float,
                            default=0.3,
                            help="Filter out loci with RSquared < this value")
        parser.add_argument(
            "--mach-chrpos",
            action="store_true",
            help=
            "When true, first col in .info file must be chr:pos (additional pieces allowed)"
        )

        parser.add_argument("--pheno",
                            type=argparse.FileType('r'),
                            help="File containing phenotypes")
        parser.add_argument("--sample-pheno",
                            type=argparse.FileType('r'),
                            help="(Mach) Sample file containing phenotypes")
        parser.add_argument(
            "--mphenos",
            type=str,
            default="",
            help=
            "Column number(s) for phenotype to be analyzed if number of columns > 1"
        )
        parser.add_argument(
            "--pheno-names",
            type=str,
            default="",
            help=
            "Name for phenotype(s) to be analyzed (must be in --pheno file)")
        parser.add_argument("--all-pheno",
                            action="store_true",
                            help="Analyze all columns from the phenotype file")
        #parser.add_argument("--all-pheno", action='store_true', help="Analyze each phenotype")

        parser.add_argument("--covar",
                            type=argparse.FileType('r'),
                            help="File containing covariates")
        parser.add_argument("--sample-covar",
                            type=argparse.FileType('r'),
                            help="(Mach) Sample file containing covariates")
        parser.add_argument("--covar-numbers",
                            type=str,
                            default="",
                            help="Comma-separated list of covariate indices")
        parser.add_argument("--covar-names",
                            type=str,
                            default="",
                            help="Comma-separated list of covariate names")
        parser.add_argument(
            "--sex",
            action='store_true',
            help="Use sex from the pedigree file as a covariate")
        parser.add_argument("--missing-phenotype",
                            type=float,
                            default=-9.0,
                            help="Encoding for missing phenotypes")

        parser.add_argument("--maf",
                            type=float,
                            default=0.0,
                            help="Minimum MAF allowed for analysis")
        parser.add_argument("--max-maf",
                            type=float,
                            default=1.0,
                            help="MAX MAF allowed for analysis")
        parser.add_argument("--geno",
                            type=float,
                            default=1.0,
                            help="MAX per-SNP missing for analysis")
        parser.add_argument("--mind",
                            type=float,
                            default=1.0,
                            help="MAX per-person missing")

        parser.add_argument("--verbose",
                            action='store_true',
                            help="Output additional data details")

        parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False)
        args = parser.parse_args(args)

        # Report version, if requested, and exit
        if args.v:
            print("%s: %s" % (os.path.basename(__file__), __version__),
                  file=sys.stderr)
            sys.exit(0)

        if args.vall:
            print("%s: %s" % (os.path.basename(__file__), __version__),
                  file=sys.stderr)
            print("%s: %s" %
                  (os.path.dirname(libgwas.__file__), libgwas.__version__),
                  file=sys.stderr)
            print("%s: %s" %
                  (os.path.dirname(scipy.__file__), scipy.__version__),
                  file=sys.stderr)
            print("%s: %s" %
                  (os.path.dirname(numpy.__file__), numpy.__version__),
                  file=sys.stderr)
            sys.exit(0)

        ###############################################################################################################
        # Here we deal with the various ways we filter SNPs in and out of analysis
        # We might handle MACH files differently. We'll default the chromosome
        # to be "NA" which is how those can be returned.
        if args.mach is None or args.mach_chrpos:
            BoundaryCheck.chrom = args.chr
        else:
            if args.chr != -1:
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))
            BoundaryCheck.chrom = "NA"
        snps = args.snps.split(",")
        try:
            b = BoundaryCheck(bp=(args.from_bp, args.to_bp),
                              kb=(args.from_kb, args.to_kb),
                              mb=(args.from_mb, args.to_mb))
        except InvalidBoundarySpec as e:
            print("Invalid boundary spec associated: %s" %
                  (e.malformed_boundary),
                  file=sys.stderr)
            sys.exit(1)
        try:
            s = SnpBoundaryCheck(snps=snps)
        except InvalidBoundarySpec as e:
            print("Invalid SNP boundary defined: %s" % (e.malformed_boundary),
                  file=sys.stderr)
            print(
                "SNPs must be either single or have be a range such as rs123-rs345",
                file=sys.stderr)
            sys.exit(1)

        if b.valid and s.valid:
            print(
                "Only one type of boundary conditions is permitted. Either use --from-bp, etc. or rs123-rs345. ",
                file=sys.stderr)
            sys.exit(1)

        if len(b.bounds) > 0 and not b.valid:
            if BoundaryCheck.chrom == "NA":
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))

        if s.valid:
            DataParser.boundary = s
        # If b isn't valid, we still want to potentially allow for chr and SNPs, it just won't have
        else:
            b.LoadSNPs(snps)
            # any actual boundary listings
            DataParser.boundary = b
        DataParser.boundary.LoadExclusions(snps=args.exclude.split(","))

        ###############################################################################################################
        # Setup the various Dataset filter criteria
        DataParser.min_maf = args.maf
        DataParser.max_maf = args.max_maf
        DataParser.snp_miss_tol = args.geno
        DataParser.ind_miss_tol = args.mind

        DataParser.ind_exclusions = ParseIndList(args.remove)

        PhenoCovar.sex_as_covariate = args.sex

        if args.compressed:
            DataParser.compressed_pedigree = True

        DataParser.has_sex = not args.no_sex
        DataParser.has_parents = not args.no_parents
        DataParser.has_fid = not args.no_fid
        DataParser.has_pheno = not args.no_pheno
        DataParser.has_liability = args.liability

        pheno_covar = PhenoCovar()
        self.verbose = False
        if args.verbose:
            self.verbose = True

        if args.file != None or args.ped or args.map:
            if args.ped and not args.map or args.map and not args.ped:
                print(
                    "When analyzing pedigree data, both .map and .ped must be specified",
                    file=sys.stderr)
                sys.exit(1)
            if args.ped:
                dataset = pedigree_parser.Parser(args.map.name, args.ped.name)
            else:
                dataset = pedigree_parser.Parser("%s.map" % (args.file),
                                                 "%s.ped" % (args.file))

            dataset.load_mapfile(map3=args.map3)
            dataset.load_genotypes(pheno_covar)
        elif args.tfile != None or args.tped or args.tfam:
            if args.tped and not args.tfam or args.tfam and not args.tped:
                print(
                    "When analyzing transposed pedigree data, both .tfam and .tped must be specified",
                    file=sys.stderr)
                sys.exit(1)
            if args.tped:
                dataset = transposed_pedigree_parser.Parser(
                    args.tfam.name, args.tped.name)
            else:
                dataset = transposed_pedigree_parser.Parser(
                    "%s.tfam" % (args.tfile), "%s.tped" % (args.tfile))
            dataset.load_tfam(pheno_covar)
            dataset.load_genotypes()
        elif args.bfile != None:
            dataset = bed_parser.Parser("%s.fam" % (args.bfile),
                                        "%s.bim" % (args.bfile),
                                        "%s.bed" % (args.bfile))
            dataset.load_bim(map3=args.map3)
            dataset.load_fam(pheno_covar)
            dataset.load_genotypes()
        elif args.bed or args.bim or args.fam:
            if (args.bed and not args.fam or not args.bim) or (
                    args.bim and not args.bed
                    or not args.fam) or (args.fam and not args.bed
                                         or not args.bim):
                print(
                    "When analyzing binary pedigree data, .bed, .bim and .fam files must be provided",
                    file=sys.stderr)
                sys.exit(1)
            dataset = bed_parser.Parser(args.fam, args.bim, args.bed)
            dataset.load_bim(map3=args.map3)
            dataset.load_fam(pheno_covar)
            dataset.load_genotypes()
        elif args.impute:
            DataParser.compressed_pedigree = not args.impute_uncompressed

            if (args.impute_offset > 0 and args.impute_count == -1) or (
                    args.impute_offset == -1 and args.impute_count > 0):
                print(
                    "--impute-count and --impute_offset must both > 0 if one is set other than -1.  ",
                    file=sys.stderr)
                sys.exit(1)
            if DataParser.snp_miss_tol != 1.0:
                print("--geno does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            if DataParser.ind_miss_tol != 1.0:
                print("--mind does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            impute_parser.SetEncoding(args.impute_encoding)
            impute_parser.Parser.info_ext = args.impute_info_ext
            impute_parser.Parser.info_threshold = args.impute_info_thresh
            libgwas.ExitIf(
                "--impute-fam is required for when processing imputed data",
                args.impute_fam == None)
            archives, chroms, infos = self.ParseImputeFile(
                args.impute.name, args.impute_offset, args.impute_count)
            dataset = impute_parser.Parser(args.impute_fam.name, archives,
                                           chroms, infos)
            dataset.load_family_details(pheno_covar)
            dataset.load_genotypes()
        elif args.mach:

            DataParser.compressed_pedigree = not args.mach_uncompressed
            if (args.mach_offset > 0
                    and args.mach_count == -1) or (args.mach_offset == -1
                                                   and args.impute_count > 0):
                print(
                    "--mach-count and --mach_offset must both be > 0 if one is set other than -1. ",
                    file=sys.stderr)
                sys.exit(1)
            if DataParser.snp_miss_tol != 1.0:
                print("--geno does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            if DataParser.ind_miss_tol != 1.0:
                print("--mind does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            if BoundaryCheck.chrom != "NA" and not args.mach_chrpos:
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))
            mach_parser.Parser.chrpos_encoding = args.mach_chrpos
            mach_parser.Parser.info_ext = args.mach_info_ext
            mach_parser.Parser.dosage_ext = args.mach_dose_ext
            mach_parser.Parser.chunk_stride = args.mach_chunk_size
            mach_parser.Parser.min_rsquared = args.mach_min_rsquared
            archives, infos = self.ParseMachFile(args.mach.name,
                                                 args.mach_offset,
                                                 args.mach_count)
            dataset = mach_parser.Parser(archives, infos)
            dataset.load_family_details(pheno_covar)
            dataset.load_genotypes()

        else:
            parser.print_usage(sys.stderr)
            print(
                "\nNo data has been specified. Users must specify either pedigree or transposed pedigree to continue",
                file=sys.stderr)
            sys.exit(1)

        if args.pheno or args.sample_pheno:
            mphenos = []
            if args.mphenos != "":
                mphenos = args.mphenos.split(",")

            nphenos = []
            if args.pheno_names != "":
                nphenos = args.pheno_names.split(",")

            if len(mphenos) + len(nphenos) == 0 and not args.all_pheno:
                libgwas.Exit("You must select one or more phenotypes when ")
            sample_file = False
            pheno_filename = args.pheno
            if args.sample_pheno:
                pheno_filename = args.sample_pheno
                sample_file = True
            pheno_covar.load_phenofile(pheno_filename, mphenos, nphenos,
                                       sample_file)

        if args.covar:
            pheno_covar.load_covarfile(args.covar,
                                       args.covar_numbers.split(","),
                                       args.covar_names.split(","))
        pheno_covar.do_standardize_variables = True
        return dataset, pheno_covar
# Example #30
# 0
    def setUp(self):
        """Build the fixture data and snapshot global parser settings.

        The first part stores literal reference data on the test instance
        (alleles, expected encodings, phenotypes, positions, MAFs, IDs).
        The second part copies the current class/module level settings of
        ``BoundaryCheck``, ``DataParser``, ``PhenoCovar`` and
        ``libgwas.bgen_parser`` onto ``self`` and then overrides a few of
        them for the test run.
        NOTE(review): the saved values are presumably restored by a matching
        tearDown, which is not visible here -- confirm.
        """
        # Two 20-element lists of single-character allele codes.
        self.allele_1 = list("AAACCCGGGTCGTGTATACC")
        self.allele_2 = list("CGTGTATACCAAACCCGGGT")

        # Paths to the bgen file and its sample file, resolved inside the
        # installed ``libgwas`` package.
        self.nomissing = resource_filename("libgwas",
                                           'tests/bedfiles/test.bgen')
        self.nomissing_sample = resource_filename(
            "libgwas", 'tests/bedfiles/test.bgen.sample')

        # 20 rows of 12 floats each.
        # NOTE(review): presumably one row per SNP and one column per
        # individual, holding the expected additive dosages -- confirm
        # against the tests that consume it.
        self.additive_encoding = [
            [
                0.000000, 0.127169, 0.000000, 0.026108, 0.151843, 0.104356,
                0.040452, 0.106310, 0.026459, 0.403708, 0.000000, 0.000000
            ],
            [
                0.379370, 0.568566, 0.938415, 0.404898, 0.627787, 0.835401,
                0.230304, 0.406455, 0.384497, 0.456489, 0.616754, 0.345785
            ],
            [
                0.819898, 1.012375, 0.567895, 0.703471, 0.915068, 0.590921,
                0.432792, 0.760784, 0.630350, 0.562997, 0.971618, 0.248157
            ],
            [
                0.607736, 0.702739, 0.381674, 0.360235, 0.203494, 0.641962,
                0.351629, 0.203464, 0.159060, 0.550835, 0.597894, 0.145480
            ],
            [
                0.580652, 0.666178, 0.798642, 0.815656, 0.881361, 0.369909,
                1.057939, 0.671199, 0.980896, 0.551324, 0.655192, 0.928801
            ],
            [
                0.677592, 0.623163, 0.691890, 0.922942, 0.992157, 0.338140,
                0.410758, 0.317647, 0.689769, 0.341741, 0.765316, 0.213367
            ],
            [
                0.402716, 0.352697, 0.111910, 0.208103, 0.022477, 0.166033,
                0.568475, 0.125948, 0.117266, 0.557259, 0.403021, 0.228489
            ],
            [
                0.332448, 0.555749, 0.589624, 0.275273, 0.762158, 0.189563,
                0.594308, 0.625116, 0.263355, 0.747143, 0.453483, 0.549889
            ],
            [
                0.659739, 0.697078, 0.876234, 0.766857, 0.345663, 0.712200,
                0.643534, 0.642817, 0.706020, 0.452323, 0.394064, 0.606271
            ],
            [
                1.424872, 1.197848, 1.489326, 1.362325, 1.172427, 1.503670,
                1.514473, 1.507042, 1.345663, 1.486091, 1.292332, 1.445747
            ],
            [
                0.284001, 0.000000, 0.000000, 0.208789, 0.000000, 0.068513,
                0.075715, 0.050599, 0.014786, 0.000000, 0.000000, 0.000000
            ],
            [
                0.506264, 0.224201, 0.860700, 0.196689, 0.476310, 0.524498,
                0.719203, 0.851087, 0.441306, 0.518334, 0.532097, 0.626398
            ],
            [
                0.774090, 0.838102, 0.662806, 0.449104, 0.685237, 0.807706,
                0.912200, 0.833310, 0.515007, 0.628107, 0.591669, 0.638407
            ],
            [
                0.411810, 0.125216, 0.350500, 0.838285, 0.281407, 0.657359,
                0.470710, 0.187610, 0.432212, 0.587961, 0.171679, 0.731609
            ],
            [
                0.513161, 0.909300, 0.455039, 1.007599, 1.106294, 0.633326,
                0.528801, 0.509804, 0.962905, 0.570611, 0.773098, 0.491997
            ],
            [
                0.763394, 0.872496, 0.506065, 0.640696, 0.291096, 0.655985,
                0.750210, 0.628885, 0.465293, 0.513451, 0.625498, 0.737087
            ],
            [
                0.345403, 0.168490, 0.484215, 0.169299, 0.511498, 0.295995,
                0.063813, 0.405005, 0.810697, 0.227893, 0.079698, 0.400641
            ],
            [
                0.296605, 0.445899, 0.180896, 0.886610, 0.728588, 0.685908,
                0.701930, 0.495094, 0.417212, 0.123598, 0.828504, 0.589837
            ],
            [
                0.297200, 0.624506, 0.787900, 0.609308, 0.674388, 0.695109,
                0.691096, 0.537652, 0.870207, 0.834089, 0.492699, 0.322377
            ],
            [
                1.807797, 1.595346, 1.613611, 1.205295, 1.541299, 1.484199,
                1.312688, 1.560998, 1.517891, 1.587198, 1.231907, 1.528298
            ]
        ]
        # 12 phenotype values and 12 sex codes (same length as a row of
        # ``additive_encoding``).
        self.phenotypes = [
            0.1, 0.4, 1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5, 0.9, 1.0
        ]
        self.sex = [1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1]

        # 20 positions, 20 MAF values and 20 rsids (same length as
        # ``additive_encoding``).
        self.positions = [
            13970, 15367, 16764, 18161, 19558, 20955, 22352, 23749, 25146,
            26543, 27940, 29337, 30734, 32131, 33528, 34925, 36322, 37719,
            39116, 40513
        ]
        self.mafs = [
            0.0411001754788, 0.258113349101, 0.342346964726, 0.20442511635,
            0.373239490349, 0.291020065614, 0.136016378017, 0.247421225299,
            0.312616667938, 0.697575722896, 0.0292668039979, 0.269878690776,
            0.34732267745, 0.218598204522, 0.352580682078, 0.310423183541,
            0.165110246433, 0.265861753262, 0.309855420768, 0.749438595153
        ]
        self.rsids = "rs1320,rs13267,rs132134,rs132201,rs132268,rs132335,rs132402,rs132469,rs132536,rs132603,rs132670,rs132737,rs132804,rs132871,rs132938,rs1321005,rs1321072,rs1321139,rs1321206,rs1321273".split(
            ",")
        # 12 individual IDs; note the prefixes are deliberately not uniform
        # in the literal (ID / IID / none).
        self.ind_ids = 'ID0001,ID20002,ID0003,ID0004,IID0005,0006,ID0007,ID0008,ID0009,ID0010,ID0011,IID0012'.split(
            ',')
        # For now, we aren't calculating the info scores and it always returns 1.0
        self.info = [
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
        ]

        # Snapshot the shared class/module level settings. Each value is
        # copied onto ``self`` BEFORE any override below, so the order of
        # these statements matters.
        self.chrom = BoundaryCheck.chrom
        self.boundary = DataParser.boundary
        # Replace the active boundary with a default-constructed one.
        DataParser.boundary = BoundaryCheck()
        self.min_maf = DataParser.min_maf
        self.max_maf = DataParser.max_maf
        self.snp_miss_tol = DataParser.snp_miss_tol
        self.ind_miss_tol = DataParser.ind_miss_tol
        # Clear the exclusion list (the previous value is not saved).
        DataParser.ind_exclusions = []
        self.sex_as_covar = PhenoCovar.sex_as_covariate
        self.has_sex = DataParser.has_sex
        self.has_pheno = DataParser.has_pheno
        self.has_parents = DataParser.has_parents
        self.has_fid = DataParser.has_fid
        self.has_liability = DataParser.has_liability
        self.encoding = libgwas.bgen_parser.encoding
        # Save the bgen parser's info threshold, then set it to 0.0.
        self.parser_info_thresh = libgwas.bgen_parser.Parser.info_threshold
        libgwas.bgen_parser.Parser.info_threshold = 0.0

        # Zero-filled array of shape (20, 12, 3).
        # NOTE(review): presumably SNPs x individuals x genotype
        # probabilities -- confirm against the tests that use it.
        self.raw = numpy.zeros((20, 12, 3))

        self.orig_id_encoding = PhenoCovar.id_encoding

        # This is required for the current bgen sample file format
        PhenoCovar.id_encoding = PhenoIdFormat.FID