def test_pheno_no_header(self):
    """Load the header-less phenotype fixture with sex disabled as a covariate.

    After loading ``self.no_header`` there must be exactly one phenotype,
    named "Pheno-1", holding six values, and no covariates at all.
    """
    # Sex is deliberately NOT used as a covariate in this test
    PhenoCovar.sex_as_covariate = False
    pc = PhenoCovar()
    load_pedigree(pc, self.ped)
    pc.load_phenofile(self.no_header)

    self.assertEqual(1, len(pc.phenotype_data))
    self.assertEqual(0, len(pc.covariate_data))
    self.assertEqual(6, len(pc.phenotype_data[0]))
    self.assertEqual(1, len(pc.phenotype_names))
    self.assertEqual("Pheno-1", pc.phenotype_names[0])

    phenotype_values = [0.9, 1.0, 0.4, 0.8, 1, 0.1]
    # enumerate() works on Python 2 and 3 alike (xrange is Python 2 only)
    for idx, expected in enumerate(phenotype_values):
        self.assertAlmostEqual(expected, pc.phenotype_data[0][idx])
def testPedWithMissingComplete(self):
    """Iterate the bed dataset that contains missing calls and compare each locus to the fixtures.

    TooMuchMissing, InvalidFrequency and InvariantVar raised while handling
    a locus are tolerated; seven loci must be visited overall.
    """
    pheno_covar = PhenoCovar()
    parser = bed_parser.Parser(self.missing_fam,
                               self.missing_bim,
                               self.missing_bed)
    parser.load_fam(pheno_covar)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.missing_mapdata
    index = 0
    for snp in parser:
        try:
            for y in pheno_covar:
                _pheno, _covars, nonmissing = y.get_variables(snp.missing_genotypes)
                genodata = snp.get_genotype_data(nonmissing)
                row = expected_map[index]
                self.assertEqual(int(row[0]), snp.chr)
                self.assertEqual(int(row[3]), snp.pos)
                self.assertEqual(row[1], snp.rsid)
                self.assertEqual(self.genotypes_w_missing[index],
                                 list(genodata.genotypes))
        except (TooMuchMissing, InvalidFrequency, InvariantVar):
            # These loci are simply skipped
            pass
        index += 1
    self.assertEqual(7, index)
def testTPedPhenoComplete(self):
    """Load the complete bed dataset with sex as covariate and verify every locus.

    Checks the covariate/phenotype sizes first, then compares chromosome,
    position, rsid and genotypes of each of the seven loci with the fixtures.
    """
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(pheno_covar)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    self.assertEqual(12, len(pheno_covar.covariate_data[0]))
    self.assertEqual(12, len(pheno_covar.phenotype_data[0]))
    self.assertEqual(1, len(pheno_covar.phenotype_names))

    expected_map = self.nonmissing_mapdata
    index = 0
    for snp in parser:
        for y in pheno_covar:
            _pheno, _covars, nonmissing = y.get_variables(snp.missing_genotypes)
            try:
                genodata = snp.get_genotype_data(nonmissing)
                row = expected_map[index]
                self.assertEqual(int(row[0]), snp.chr)
                self.assertEqual(int(row[3]), snp.pos)
                self.assertEqual(row[1], snp.rsid)
                self.assertEqual(self.genotypes[index], list(genodata.genotypes))
            except (TooMuchMissing, InvalidFrequency):
                # Loci rejected by the parser are skipped
                pass
        index += 1
    self.assertEqual(7, index)
def testEmptyIterator(self):
    """Iterating a freshly constructed PhenoCovar must produce no items."""
    pheno_covar = PhenoCovar()
    visited = sum(1 for _ in pheno_covar)
    self.assertEqual(0, visited)
def testPedRegionBoundaryTPed(self):
    """Restrict the bed dataset to the rs0005-rs0006 SNP range on chromosome 2.

    Exactly two loci are expected; they are compared against fixture rows
    4 and 5 of the non-missing map data.
    """
    pheno_covar = PhenoCovar()
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0005-rs0006"])
    BoundaryCheck.chrom = 2
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(pheno_covar)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.nonmissing_mapdata
    index = 4  # first fixture row covered by the boundary
    self.assertEqual(2, parser.locus_count)
    for snp in parser:
        for y in pheno_covar:
            _pheno, _covars, nonmissing = y.get_variables(snp.missing_genotypes)
            try:
                genodata = snp.get_genotype_data(nonmissing)
                row = expected_map[index]
                self.assertEqual(int(row[0]), snp.chr)
                self.assertEqual(int(row[3]), snp.pos)
                self.assertEqual(row[1], snp.rsid)
                self.assertEqual(self.genotypes[index], list(genodata.genotypes))
            except (TooMuchMissing, InvalidFrequency):
                pass
        index += 1
    self.assertEqual(6, index)
def load_pedigree(pc, ped):
    """Add one subject to *pc* per line of *ped*, then freeze the subject list.

    Every line must split on whitespace into exactly four fields: family id,
    individual id, sex and phenotype. Sex is passed on as int, phenotype as
    float, and the subject id is produced by PhenoCovar.build_id.
    """
    for record in ped:
        family, individual, sex_code, pheno_value = record.split()
        subject_id = PhenoCovar.build_id(
            [family, individual, sex_code, pheno_value])
        pc.add_subject(subject_id,
                       sex=int(sex_code),
                       phenotype=float(pheno_value))
    pc.freeze_subjects()
def testInfoFileUseNoChrPos(self):
    """Parse two MACH files with explicit info files and chr:pos encoding turned off.

    Each SNP must report "NA" for chromosome and position, carry a
    "chrom:position" rsid and match the expected dosages to 3 places.
    """
    # gen_ext is set to an invalid value so that we can be certain the
    # explicitly provided info files are the ones being used
    mach_parser.Parser.chrpos_encoding = False
    DataParser.boundary = SnpBoundaryCheck(self.locus_labels)
    mach_parser.Parser.gen_ext = 'asdf'
    PhenoCovar.sex_as_covariate = True

    pheno_covar = PhenoCovar()
    parser = mach_parser.Parser(
        [self.gen_file, self.gen_file2],
        info_files=[self.info_file1, self.info_file2])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    snp_count = 0
    for snp in parser:
        self.assertEqual("NA", snp.pos)
        self.assertEqual("NA", snp.chr)
        expected_rsid = "%s:%s" % (self.chroms[snp_count],
                                   self.positions[snp_count])
        self.assertEqual(expected_rsid, snp.rsid)
        for i, dosage in enumerate(self.dosage_encoding[snp_count]):
            self.assertAlmostEqual(dosage, snp.genotype_data[i], places=3)
        snp_count += 1
    self.assertEqual(20, snp_count)
def test_tped_standardization_w_missing1(self):
    """Compare raw and standardized variables with subjects 11:11 and 12:12 excluded.

    The dataset is loaded twice: first with NoStandardization (raw values
    expected), then with Standardizer and do_standardize_variables enabled
    (standardized values expected).
    """
    PhenoCovar.sex_as_covariate = True
    DataParser.ind_exclusions = ["11:11", "12:12"]

    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()

    # The numpy.bool alias was removed from NumPy; the builtin bool is the
    # same dtype and works on every NumPy version.
    nonmissing = numpy.ones(pc.phenotype_data[0].shape, dtype=bool)

    libgwas.standardizer.set_standardizer(
        libgwas.standardizer.NoStandardization)
    raw_pheno = [0.1, 0.4, 1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5]
    raw_cov = [1, 1, 2, 2, 1, 1, 1, 1, 2, 2]
    for pheno in pc:
        (y, c, total_nonmissing) = pheno.get_variables(numpy.invert(nonmissing))
        for i in range(len(raw_pheno)):
            self.assertAlmostEqual(raw_pheno[i], y[i])
            self.assertAlmostEqual(raw_cov[i], c[0][i])

    # Reload everything and repeat with standardization enabled
    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()
    pc.do_standardize_variables = True
    libgwas.standardizer.set_standardizer(Standardizer)

    std_pheno = [
        -1.43314068, -0.55570761, 1.19915853, -0.26322992, 0.90668084,
        1.19915853, -1.43314068, -0.55570761, 1.19915853, -0.26322992,
    ]
    std_cov = [
        -0.81649658, -0.81649658, 1.22474487, 1.22474487, -0.81649658,
        -0.81649658, -0.81649658, -0.81649658, 1.22474487, 1.22474487,
    ]
    for pheno in pc:
        (y, c, total_nonmissing) = pheno.get_variables(numpy.invert(nonmissing))
        for i in range(len(std_pheno)):
            self.assertAlmostEqual(std_pheno[i], y[i])
            self.assertAlmostEqual(std_cov[i], c[0][i])
def testIndIdsDefault(self):
    """With the default id encoding, pedigree keys take the form "family:individual"."""
    pheno_covar = PhenoCovar()
    load_pedigree(pheno_covar, self.ped)

    expected_ids = "Fam1:Ind1,Fam2:Ind2,Fam3:Ind3,Fam4:Ind4,Fam5:Ind5,Fam6:Ind6,Fam4:Ind7,Fam9:Ind1".split(",")
    expected_ids.sort()

    actual_ids = sorted(pheno_covar.pedigree_data.keys())
    self.assertEqual(expected_ids, actual_ids)
def testIndIdsFID(self):
    """With FID id encoding, pedigree keys are the family ids only."""
    PhenoCovar.id_encoding = PhenoIdFormat.FID
    pedigree_lines = get_lines(self.filenames[2])

    pheno_covar = PhenoCovar()
    load_pedigree(pheno_covar, pedigree_lines)

    expected_ids = "Fam1,Fam2,Fam3,Fam4,Fam5,Fam6".split(",")
    actual_ids = sorted(pheno_covar.pedigree_data.keys())
    self.assertEqual(expected_ids, actual_ids)
def test_basic_population(self):
    """Populate PhenoCovar by hand, once with and once without sex as a covariate.

    Both passes add the subjects from ``self.ped`` and then verify the
    pedigree index, the phenotype and (first pass only) the sex covariate
    of every subject.
    """
    # Indicate that we want to use sex as a covariate
    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    for line in self.ped:
        fam, ind, sex, ph = line.split()
        pc.add_subject("%s:%s" % (fam, ind), sex=int(sex), phenotype=float(ph))

    self.assertEqual(1, len(pc.covariate_data))
    self.assertEqual(1, len(pc.phenotype_data))
    self.assertEqual(6, len(pc.covariate_data[0]))
    self.assertEqual(6, len(pc.phenotype_data[0]))

    for i, line in enumerate(self.ped):
        fam, ind, sex, ph = line.split()
        iid = "%s:%s" % (fam, ind)
        self.assertEqual(pc.pedigree_data[iid], i)
        self.assertAlmostEqual(float(ph), pc.phenotype_data[0][i])
        self.assertEqual(int(sex), pc.covariate_data[0][i])

    # Indicate that we do not want to use sex as a covariate
    PhenoCovar.sex_as_covariate = False
    newpc = PhenoCovar()
    for line in self.ped:
        fam, ind, sex, ph = line.split()
        newpc.add_subject("%s:%s" % (fam, ind), sex=int(sex), phenotype=float(ph))

    # Test that sex wasn't loaded as a covariate due to the setting of
    # PhenoCovar.sex_as_covariate
    self.assertEqual(0, len(newpc.covariate_data))
    self.assertEqual(1, len(newpc.phenotype_data))
    self.assertEqual(6, len(newpc.pedigree_data))
    self.assertEqual(6, len(newpc.phenotype_data[0]))

    # Verify the NEW object here; the earlier version re-checked ``pc`` and
    # therefore never inspected the contents of ``newpc``.
    for i, line in enumerate(self.ped):
        fam, ind, sex, ph = line.split()
        iid = "%s:%s" % (fam, ind)
        self.assertEqual(newpc.pedigree_data[iid], i)
        self.assertAlmostEqual(float(ph), newpc.phenotype_data[0][i])
def testBasicWithCovar(self):
    """Load the BMI and MSA covariates with sex disabled and check the single test yielded.

    The class-level ``sex_as_covariate`` flag is switched back to True when
    the test finishes, including when an assertion fails.
    """
    PhenoCovar.sex_as_covariate = False
    try:
        pc = PhenoCovar()
        load_pedigree(pc, self.ped)
        pc.individual_mask = [0, 0, 0, 0, 0, 0, 1, 1]
        pc.load_covarfile(self.header, names=["BMI", "MSA"])
        pc.freeze_subjects()
        pheno = [0.1, 0.4, 1.0, 0.5, 0.9, 1.0]

        # First, test without sex as covariate
        count = 0
        for test in pc:
            test_pheno, covars, nonmissing = test.get_variables()
            self.assertEqual(6, numpy.sum(nonmissing))
            self.assertEqual(2, len(covars))
            # These two checks used to sit in a loop whose variable was never
            # used, so the same assertions simply ran twice.
            self.assertEqual(self.phenotypes[0], list(covars[0]))
            self.assertEqual(self.phenotypes[2], list(covars[1]))
            self.assertEqual("BMI", test.get_covariate_name(0))
            self.assertEqual("MSA", test.get_covariate_name(1))
            self.assertEqual(6, len(pheno))
            self.assertEqual("Pheno-1", test.get_phenotype_name())
            for i in range(len(pheno)):
                self.assertAlmostEqual(pheno[i], test_pheno[i])
            count += 1
        self.assertEqual(1, count)
    finally:
        # Restore the flag even if an assertion above failed
        PhenoCovar.sex_as_covariate = True
def testWithoutSample(self):
    """Load the bgen fixture and check that ids ID0001..ID0011 are in the pedigree."""
    pc = PhenoCovar()
    parser = libgwas.bgen_parser.Parser(self.nomissing)
    parser.load_family_details(pc)
    parser.load_genotypes()

    for number in range(1, 12):
        # Avoid shadowing the builtin ``id``; assertIn reports the missing
        # member on failure instead of a bare "False is not true".
        sample_id = "ID%s" % str(number).zfill(4)
        self.assertIn(sample_id, pc.pedigree_data)
def testIndIdsIID(self):
    """With IID id encoding, pedigree keys are the individual ids only."""
    PhenoCovar.id_encoding = PhenoIdFormat.IID
    pedigree_lines = get_lines(self.filenames[2])

    pheno_covar = PhenoCovar()
    load_pedigree(pheno_covar, pedigree_lines)

    expected_ids = "Ind1,Ind2,Ind3,Ind4,Ind5,Ind6".split(",")
    actual_ids = sorted(pheno_covar.pedigree_data.keys())
    self.assertEqual(expected_ids, actual_ids)
def testPedWithMissingMxSnpComplete(self):
    """Parse the tped with missing calls using a SNP missingness tolerance of 0.5.

    All seven loci are reported by get_loci(); during iteration one locus is
    expected to raise TooMuchMissing and the other six to validate.
    """
    pheno_covar = PhenoCovar()
    DataParser.snp_miss_tol = 0.5  # We should only lose 1
    parser = TransposedPedigreeParser(self.tfam_filename, self.miss_tped_filename)
    parser.load_tfam(pheno_covar)
    parser.load_genotypes()

    expected_map = libgwas.get_lines(self.miss_tped_filename, split=True)
    genotypes_w_missing = [
        [0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0],
        [1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1],
        [0, -1, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0],
        [0, -1, 2, 1, 1, 0, 0, 1, 2, 1, 1, 0],
        [1, -1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0],
        [1, -1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, -1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
    ]
    hetero_freq_tped = [0.3636, 0.5, 0.3636, 0.4545, 0.3636, 0.2727, 0.2727]

    self.assertEqual(7, parser.locus_count)

    # Pass 1: the locus listing only carries chromosome and position
    index = 0
    for locus in parser.get_loci():
        row = expected_map[index]
        self.assertEqual(int(row[0]), locus.chr)
        self.assertEqual(int(row[3]), locus.pos)
        index += 1
    self.assertEqual(7, index)

    # Pass 2: full iteration, counting rejected and accepted loci
    index, missing, valid = 0, 0, 0
    for snp in parser:
        for y in pheno_covar:
            _pheno, _covars, nonmissing = y.get_variables(snp.missing_genotypes)
            try:
                genodata = snp.get_genotype_data(nonmissing)
                row = expected_map[index]
                self.assertEqual(int(row[0]), snp.chr)
                self.assertEqual(int(row[3]), snp.pos)
                self.assertEqual(row[1], snp.rsid)
                self.assertEqual(genotypes_w_missing[index], list(snp.genotype_data))
                self.assertAlmostEqual(hetero_freq_tped[index],
                                       genodata.hetero_freq, places=4)
                valid += 1
            except TooMuchMissing:
                missing += 1
            except InvalidFrequency:
                pass
        index += 1

    self.assertEqual(1, missing)
    self.assertEqual(6, valid)
    self.assertEqual(7, index)
def test_tped_standardization_w_dbl_missing(self):
    """Compare raw and standardized variables when the first two retained subjects are masked.

    Subjects 11:11 and 12:12 are excluded through DataParser.ind_exclusions;
    in addition the first two entries of the non-missing mask are cleared
    before get_variables is called.
    """
    PhenoCovar.sex_as_covariate = True
    DataParser.ind_exclusions = ["11:11", "12:12"]

    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()

    # The numpy.bool alias was removed from NumPy; the builtin bool is the
    # same dtype and works on every NumPy version.
    nonmissing = numpy.ones(pc.phenotype_data[0].shape, dtype=bool)
    nonmissing[0] = False
    nonmissing[1] = False

    libgwas.standardizer.set_standardizer(libgwas.standardizer.NoStandardization)
    raw_pheno = [1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5]
    raw_cov = [2, 2, 1, 1, 1, 1, 2, 2]
    for pheno in pc:
        (y, c, total_nonmissing) = pheno.get_variables(numpy.invert(nonmissing))
        for i in range(len(raw_pheno)):
            self.assertAlmostEqual(raw_pheno[i], y[i])
            self.assertAlmostEqual(raw_cov[i], c[0][i])

    # Reload everything and repeat with standardization enabled
    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()
    pc.do_standardize_variables = True
    libgwas.standardizer.set_standardizer(Standardizer)

    std_pheno = [
        1.19915853, -0.26322992, 0.90668084, 1.19915853,
        -1.43314068, -0.55570761, 1.19915853, -0.26322992,
    ]
    std_cov = [
        1.22474487, 1.22474487, -0.81649658, -0.81649658,
        -0.81649658, -0.81649658, 1.22474487, 1.22474487,
    ]
    for pheno in pc:
        (y, c, total_nonmissing) = pheno.get_variables(numpy.invert(nonmissing))
        for i in range(len(std_pheno)):
            self.assertAlmostEqual(std_pheno[i], y[i])
            self.assertAlmostEqual(std_cov[i], c[0][i])
def testPedMultiPheno(self):
    """Load two phenotype columns (indices 2 and 3) and verify them for every SNP.

    For each of the seven loci the map details and genotypes are checked,
    then both phenotypes are fetched with subject 6 masked out and compared
    with the expected eleven values.
    """
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()
    parser = PedigreeParser(self.map_filename, self.ped_filename)
    parser.load_mapfile()
    parser.load_genotypes(pheno_covar)
    with open(self.pheno_file) as pheno_fh:
        pheno_covar.load_phenofile(pheno_fh, indices=[2, 3])

    expected_map = get_lines(self.map_filename, split=True)
    sex = [1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1]
    # Not referenced by the assertions below
    pheno_data = [[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  [1.0, 0.5, 0.6, 0.5, 1.0, 0.1],
                  [0.5, 1.0, 0.1, 0.5, 1.0, 0.2]]
    dual_pheno = [[1.0, 0.5, 0.6, 0.5, 1.0, 0.1, 0.5, 0.6, 0.5, 1.0, 0.1],
                  [0.5, 1.0, 0.1, 0.5, 1.0, 0.2, 1.0, 0.1, 0.5, 1.0, 0.2]]

    self.assertEqual(2, len(pheno_covar.phenotype_data))

    snp_index = 0
    for snp in parser:
        row = expected_map[snp_index]
        self.assertEqual(int(row[0]), snp.chr)
        self.assertEqual(int(row[3]), snp.pos)
        self.assertEqual(row[1], snp.rsid)
        self.assertEqual(self.genotypes[snp_index], list(snp.genotype_data))

        pheno_index = 0
        for y in pheno_covar:
            keep = numpy.ones(len(snp.genotype_data), dtype=bool)
            keep[6] = False
            pheno, covariates, nm_indata = y.get_variables(numpy.invert(keep))
            for i in range(11):
                self.assertEqual(sex[i], covariates[0][i])
                self.assertAlmostEqual(dual_pheno[pheno_index][i], pheno[i])
            pheno_index += 1
        self.assertEqual(2, pheno_index)
        snp_index += 1
    self.assertEqual(7, snp_index)
def test_exceed_column_count(self):
    """Requesting phenotype index 12 must raise InvalidSelection for both fixtures."""
    pheno_covar = PhenoCovar()
    load_pedigree(pheno_covar, self.ped)

    with self.assertRaises(InvalidSelection):
        pheno_covar.load_phenofile(self.header, indices=[12])

    with self.assertRaises(InvalidSelection):
        pheno_covar.load_phenofile(self.no_header, indices=[12])
def testCovarMissingAll(self):
    """Loading a covariate file must raise NoMatchedPhenoCovars for this fixture.

    A small tab-delimited covariate file is written to the working
    directory and then loaded with ``indices=[1]``.
    """
    # Indicate that we want to use sex as a covariate
    PhenoCovar.sex_as_covariate = True
    prefix = "__test_pheno"
    filename = "%s_miss.txt" % (prefix)

    rows = [
        "FID\tIID\tBMI\tIBM\tMSA",
        "F1\tI1\t-9\t1.0\t0.5",
        "F2\tI2\t0.2\t-9\t1.0",
        "F3\tI3\t0.3\t0.6\t-9",
        "F4\tI4\t0.4\t0.5\t0.5",
        "F4\tI5\t0.5\t1.0\t1.0",
        "F4\tI6\t0.6\t0.1\t0.2",
    ]
    # Context managers guarantee both handles are closed, even when the
    # expected exception propagates through the read below.
    with open(filename, "w") as out:
        out.write("\n".join(rows))

    with open(filename) as covar_file:
        with self.assertRaises(NoMatchedPhenoCovars):
            pc = PhenoCovar()
            load_pedigree(pc, self.ped)
            pc.load_covarfile(covar_file, indices=[1])
def test_tped_standardization2(self):
    """Compare raw and standardized variables for all twelve subjects.

    The dataset is loaded twice: first with NoStandardization (raw values
    expected), then with Standardizer and do_standardize_variables enabled
    (standardized values expected).
    """
    DataParser.has_sex = True
    DataParser.has_pheno = True
    PhenoCovar.sex_as_covariate = True

    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()

    # The numpy.bool alias was removed from NumPy; the builtin bool is the
    # same dtype and works on every NumPy version.
    nonmissing = numpy.ones(pc.phenotype_data[0].shape, dtype=bool)

    libgwas.standardizer.set_standardizer(libgwas.standardizer.NoStandardization)
    raw_pheno = [0.1, 0.4, 1.0, 0.5, 0.9, 1.0, 0.1, 0.4, 1.0, 0.5, 0.9, 1.0]
    raw_cov = [1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1]
    for pheno in pc:
        (y, c, total_nonmissing) = pheno.get_variables(numpy.invert(nonmissing))
        for i in range(len(raw_pheno)):
            self.assertAlmostEqual(raw_pheno[i], y[i])
            self.assertAlmostEqual(raw_cov[i], c[0][i])

    # Reload everything and repeat with standardization enabled
    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()
    pc.do_standardize_variables = True
    libgwas.standardizer.set_standardizer(Standardizer)

    std_pheno = [
        -1.61601695, -0.73455316, 1.02837442, -0.4407319,
        0.73455316, 1.02837442, -1.61601695, -0.73455316,
        1.02837442, -0.4407319, 0.73455316, 1.02837442,
    ]
    std_cov = [
        -0.70710678, -0.70710678, 1.41421356, 1.41421356,
        -0.70710678, -0.70710678, -0.70710678, -0.70710678,
        1.41421356, 1.41421356, -0.70710678, -0.70710678,
    ]
    for pheno in pc:
        (y, c, total_nonmissing) = pheno.get_variables(numpy.invert(nonmissing))
        for i in range(len(std_pheno)):
            self.assertAlmostEqual(std_pheno[i], y[i])
            self.assertAlmostEqual(std_cov[i], c[0][i])
def test_sample_pheno_with_header(self):
    """Load phenotypes from a sample-format file, first two columns and then three.

    The file ``self.filenames[8]`` is read with ``sample_file=True``; names
    and values of the loaded phenotypes are compared with the fixtures.
    """
    # Indicate that we want to use sex as a covariate
    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    load_pedigree(pc, self.ped)

    # Open through a context manager so the handle is closed after loading
    with open(self.filenames[8]) as sample_file:
        pc.load_phenofile(sample_file, indices=[1, 2], sample_file=True)
    self.assertEqual(2, len(pc.phenotype_data))
    self.assertEqual(1, len(pc.covariate_data))
    self.assertEqual(6, len(pc.phenotype_data[0]))
    self.assertEqual(6, len(pc.phenotype_data[1]))
    self.assertEqual(6, len(pc.covariate_data[0]))
    self.assertEqual(["BMI", "IBM"], pc.phenotype_names)
    for index, (p1, p2, p3) in enumerate(self.phenotypes):
        self.assertEqual(p1, pc.phenotype_data[0][index])
        self.assertEqual(p2, pc.phenotype_data[1][index])

    with open(self.filenames[8]) as sample_file:
        pc.load_phenofile(sample_file, indices=[1, 2, 3], sample_file=True)
    self.assertEqual(3, len(pc.phenotype_data))
    self.assertEqual(1, len(pc.covariate_data))
    self.assertEqual(6, len(pc.phenotype_data[0]))
    self.assertEqual(6, len(pc.phenotype_data[1]))
    self.assertEqual(6, len(pc.phenotype_data[2]))
    self.assertEqual(["BMI", "IBM", "MSA"], pc.phenotype_names)
    for index, (p1, p2, p3) in enumerate(self.phenotypes):
        self.assertEqual(p1, pc.phenotype_data[0][index])
        self.assertEqual(p2, pc.phenotype_data[1][index])
        self.assertEqual(p3, pc.phenotype_data[2][index])
def testPhenoNoHeader(self):
    """Load phenotypes by column index from ``self.filenames[3]``.

    Columns 2-3 are loaded first, then columns 1-3; the phenotypes are
    expected to be auto-named "Pheno-<index>".
    """
    # Indicate that we want to use sex as a covariate
    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    load_pedigree(pc, self.ped)

    # Open through a context manager so the handle is closed after loading
    with open(self.filenames[3]) as pheno_file:
        pc.load_phenofile(pheno_file, indices=[2, 3])
    self.assertEqual(2, len(pc.phenotype_data))
    self.assertEqual(1, len(pc.covariate_data))
    self.assertEqual(6, len(pc.phenotype_data[0]))
    self.assertEqual(6, len(pc.phenotype_data[1]))
    self.assertEqual(6, len(pc.covariate_data[0]))
    self.assertEqual(["Pheno-2", "Pheno-3"], pc.phenotype_names)
    for index, (p1, p2, p3) in enumerate(self.phenotypes):
        self.assertEqual(p2, pc.phenotype_data[0][index])
        self.assertEqual(p3, pc.phenotype_data[1][index])

    with open(self.filenames[3]) as pheno_file:
        pc.load_phenofile(pheno_file, indices=[1, 2, 3])
    self.assertEqual(3, len(pc.phenotype_data))
    self.assertEqual(1, len(pc.covariate_data))
    self.assertEqual(6, len(pc.phenotype_data[0]))
    self.assertEqual(6, len(pc.phenotype_data[1]))
    self.assertEqual(6, len(pc.phenotype_data[2]))
    self.assertEqual(["Pheno-1", "Pheno-2", "Pheno-3"], pc.phenotype_names)
    for index, (p1, p2, p3) in enumerate(self.phenotypes):
        self.assertEqual(p1, pc.phenotype_data[0][index])
        self.assertEqual(p2, pc.phenotype_data[1][index])
        self.assertEqual(p3, pc.phenotype_data[2][index])
def testPedMultiPheno(self):
    """Load two phenotype columns (indices 2 and 3) and verify them for every SNP.

    For each of the seven loci the map details and genotypes are checked,
    then both phenotypes are fetched with subject 6 masked out and compared
    with the expected eleven values.
    """
    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    ped_parser = PedigreeParser(self.map_filename, self.ped_filename)
    ped_parser.load_mapfile()
    ped_parser.load_genotypes(pc)

    # Both files are opened through context managers so no handle is leaked
    with open(self.pheno_file) as pheno_file:
        pc.load_phenofile(pheno_file, indices=[2, 3])
    with open(self.map_filename) as map_file:
        mapdata = [line.strip().split() for line in map_file]

    sex = [1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1]
    dual_pheno = [
        [1.0, 0.5, 0.6, 0.5, 1.0, 0.1, 0.5, 0.6, 0.5, 1.0, 0.1],
        [0.5, 1.0, 0.1, 0.5, 1.0, 0.2, 1.0, 0.1, 0.5, 1.0, 0.2],
    ]
    self.assertEqual(2, len(pc.phenotype_data))

    index = 0
    for snp in ped_parser:
        self.assertEqual(int(mapdata[index][0]), snp.chr)
        self.assertEqual(int(mapdata[index][3]), snp.pos)
        self.assertEqual(mapdata[index][1], snp.rsid)
        self.assertEqual(self.genotypes[index], list(snp.genotype_data))

        idx = 0
        for y in pc:
            non_missing = numpy.ones(len(snp.genotype_data), dtype=bool)
            non_missing[6] = False
            (pheno, covariates, nm_indata) = y.get_variables(numpy.invert(non_missing))
            for i in range(11):
                self.assertEqual(sex[i], covariates[0][i])
                self.assertAlmostEqual(dual_pheno[idx][i], pheno[i])
            idx += 1
        self.assertEqual(2, idx)
        index += 1
    self.assertEqual(7, index)
def testBasicWithMask(self):
    """Mask two of the eight subjects and check the variables of the remaining six.

    The individual mask is handed to get_variables as a boolean array; one
    test with the "SEX" covariate and the "Pheno-1" phenotype is expected.
    """
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()
    load_pedigree(pheno_covar, self.ped)
    pheno_covar.individual_mask = [0, 0, 1, 0, 0, 0, 1, 0]

    expected_pheno = [0.1, 0.4, 0.5, 0.9, 1.0, 0.9]
    expected_sex = [1, 1, 2, 1, 1, 1]

    count = 0
    for test in pheno_covar:
        mask = numpy.array(pheno_covar.individual_mask, dtype=bool)
        pheno, covars, nonmissing = test.get_variables(mask)
        self.assertEqual(6, numpy.sum(nonmissing))
        self.assertEqual(1, len(covars))
        self.assertEqual(expected_sex, list(covars[count]))
        self.assertEqual("SEX", test.get_covariate_name(count))
        self.assertEqual(6, len(pheno))
        self.assertEqual("Pheno-1", test.get_phenotype_name())
        for idx in range(len(pheno)):
            self.assertEqual(expected_pheno[idx], pheno[idx])
        count += 1
    self.assertEqual(1, count)
def testAllelesIteration(self):
    """Each of the seven tped loci must report the expected minor and major allele."""
    pheno_covar = PhenoCovar()
    parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    parser.load_tfam(pheno_covar)
    parser.load_genotypes()

    seen = 0
    for snp in parser:
        alleles = self.tped1_alleles[seen]
        # Fixture layout: position 0 is the major allele, position 1 the minor
        self.assertEqual(alleles[1], snp.minor_allele)
        self.assertEqual(alleles[0], snp.major_allele)
        seen += 1
    self.assertEqual(7, seen)
def testPedNegativePositions(self):
    """Parse with the ``map_miss`` map file and compare loci starting at fixture row 2.

    Expected values come from the regular map file; iteration must end with
    the index at 7.
    """
    pheno_covar = PhenoCovar()
    parser = PedigreeParser(self.map_miss_filename, self.ped_filename)
    parser.load_mapfile()
    parser.load_genotypes(pheno_covar)

    expected_map = get_lines(self.map_filename, split=True)
    index = 2  # the first two fixture rows are not expected from the parser
    for snp in parser:
        row = expected_map[index]
        self.assertEqual(int(row[0]), snp.chr)
        self.assertEqual(int(row[3]), snp.pos)
        self.assertEqual(row[1], snp.rsid)
        self.assertEqual(self.genotypes[index], list(snp.genotype_data))
        index += 1
    self.assertEqual(7, index)
def testFamilyData(self):
    """Load family details through the impute parser and verify each subject.

    Every id in ``self.ind_ids`` must be present in the pedigree, with the
    matching phenotype and sex covariate at the same position.
    """
    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=["3", "4"])
    parser.load_family_details(pc)
    parser.load_genotypes()

    # enumerate replaces the manual counter, and the loop variable no longer
    # shadows the builtin ``id``; assertIn names the missing id on failure.
    for idx, ind_id in enumerate(self.ind_ids):
        self.assertIn(ind_id, pc.pedigree_data)
        self.assertEqual(self.phenotypes[idx], pc.phenotype_data[0][idx])
        self.assertEqual(self.sex[idx], pc.covariate_data[0][idx])
def testForInvariant(self):
    """Fetching variables for a constant phenotype column must raise InvariantVar.

    A phenotype file whose MSA column is 1.0 for every subject is written to
    ``self.pheno_file`` and loaded with ``indices=[3]``.
    """
    prefix = "__test_pedigree"
    self.pheno_file = "%s_mch.txt" % (prefix)
    rows = [
        "FID\tIID\tBMI\tIBM\tMSA",
        "1\t1\t0.1\t1.0\t1.0",
        "2\t2\t0.2\t0.5\t1.0",
        "3\t3\t0.3\t0.6\t1.0",
        "4\t4\t0.4\t0.5\t1.0",
        "5\t5\t0.5\t1.0\t1.0",
        "6\t6\t0.6\t0.1\t1.0",
        "17\t7\t0.1\t1.0\t1.0",
        "8\t8\t0.2\t0.5\t1.0",
        "9\t9\t0.3\t0.6\t1.0",
        "10\t10\t0.4\t0.5\t1.0",
        "11\t11\t0.5\t1.0\t1.0",
        "12\t12\t0.6\t0.1\t1.0",
    ]
    # Context managers make sure the handles are closed
    with open(self.pheno_file, "w") as out:
        out.write("\n".join(rows))

    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    ped_parser = PedigreeParser(self.map_filename, self.ped_filename)
    ped_parser.load_mapfile()
    ped_parser.load_genotypes(pc)
    with open(self.pheno_file) as pheno_file:
        pc.load_phenofile(pheno_file, indices=[3])

    with self.assertRaises(InvariantVar):
        for snp in ped_parser:
            for y in pc:
                non_missing = numpy.ones(len(snp.genotype_data), dtype=bool)
                (pheno, covariates, nonmissing) = y.get_variables(numpy.invert(non_missing))
def testCovarHeader(self):
    """Load a covariate file with a header on top of the sex covariate.

    The pedigree-derived phenotype and sex values are verified first; after
    loading ``self.header`` the covariate labels must be "SEX" and "BMI",
    and the last two subjects carry the missing encoding for BMI.
    """
    # Indicate that we want to use sex as a covariate
    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    load_pedigree(pc, self.ped)

    self.assertEqual(1, len(pc.phenotype_data))
    self.assertEqual(1, len(pc.covariate_data))
    self.assertEqual(8, len(pc.phenotype_data[0]))
    self.assertEqual(1, len(pc.phenotype_names))
    self.assertEqual("Pheno-1", pc.phenotype_names[0])

    # range() exists on both Python 2 and 3 (xrange is Python 2 only)
    for idx in range(len(self.pheno)):
        self.assertAlmostEqual(self.pheno[idx], pc.phenotype_data[0][idx])
        self.assertEqual(self.sex[idx], pc.covariate_data[0][idx])

    pc.load_covarfile(self.header)
    self.assertEqual("BMI", pc.covariate_labels[1])
    self.assertEqual("SEX", pc.covariate_labels[0])

    covar = [0.9, 1.0, 0.4, 0.8, 1, 0.1,
             PhenoCovar.missing_encoding, PhenoCovar.missing_encoding]
    for idx in range(len(covar)):
        self.assertAlmostEqual(self.sex[idx], pc.covariate_data[0][idx])
        self.assertAlmostEqual(covar[idx], pc.covariate_data[1][idx])
        self.assertAlmostEqual(self.pheno[idx], pc.phenotype_data[0][idx])
def testForInvariant(self):
    """Fetching variables for a constant phenotype column must raise InvariantVar.

    A phenotype file whose MSA column is 1.0 for every subject is written to
    ``self.pheno_file`` and loaded with ``indices=[3]``.
    """
    prefix = "__test_pedigree"
    self.pheno_file = "%s_mch.txt" % (prefix)
    header = "FID\tIID\tBMI\tIBM\tMSA"
    records = [
        "1\t1\t0.1\t1.0\t1.0",
        "2\t2\t0.2\t0.5\t1.0",
        "3\t3\t0.3\t0.6\t1.0",
        "4\t4\t0.4\t0.5\t1.0",
        "5\t5\t0.5\t1.0\t1.0",
        "6\t6\t0.6\t0.1\t1.0",
        "17\t7\t0.1\t1.0\t1.0",
        "8\t8\t0.2\t0.5\t1.0",
        "9\t9\t0.3\t0.6\t1.0",
        "10\t10\t0.4\t0.5\t1.0",
        "11\t11\t0.5\t1.0\t1.0",
        "12\t12\t0.6\t0.1\t1.0",
    ]
    # Write through a context manager so the handle is closed on any exit
    with open(self.pheno_file, "w") as out:
        out.write("\n".join([header] + records))

    PhenoCovar.sex_as_covariate = True
    pc = PhenoCovar()
    ped_parser = PedigreeParser(self.map_filename, self.ped_filename)
    ped_parser.load_mapfile()
    ped_parser.load_genotypes(pc)
    # The phenotype file handle is closed as soon as loading is done
    with open(self.pheno_file) as pheno_file:
        pc.load_phenofile(pheno_file, indices=[3])

    with self.assertRaises(InvariantVar):
        for snp in ped_parser:
            for y in pc:
                non_missing = numpy.ones(len(snp.genotype_data), dtype=bool)
                (pheno, covariates, nonmissing) = y.get_variables(numpy.invert(non_missing))
def testEmptyIterator(self):
    """Iterating an empty PhenoCovar must raise TooMuchMissingpPhenoCovar once and yield nothing."""
    pheno_covar = PhenoCovar()
    count = invalid = missingness = 0

    try:
        for _ in pheno_covar:
            count += 1
    except InvariantVar:
        invalid += 1
    except TooMuchMissingpPhenoCovar:
        missingness += 1

    self.assertEqual(0, invalid)
    self.assertEqual(1, missingness)
    self.assertEqual(0, count)
def testMAF(self):
    """Check position and minor allele frequency of the ten SNPs in the MACH file.

    The expected frequency is the mean of the halved genotype data and must
    match snp.maf to 3 places.
    """
    mach_parser.Parser.chrpos_encoding = True
    pheno_covar = PhenoCovar()
    parser = mach_parser.Parser([self.gen_file])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    snp_count = 0
    for snp in parser:
        self.assertEqual(self.positions[snp_count], snp.pos)
        expected_maf = numpy.mean(snp.genotype_data/2)
        self.assertAlmostEqual(expected_maf, snp.maf, places=3)
        snp_count += 1
    self.assertEqual(10, snp_count)
def testPedWithMissingMxSnpComplete(self):
    """Parse the tped with missing calls using a SNP missingness tolerance of 0.5.

    Genotype data is requested with an all-True filter for every locus; loci
    raising TooMuchMissing or InvalidFrequency are skipped, and all seven
    loci must be visited.
    """
    pheno_covar = PhenoCovar()
    DataParser.snp_miss_tol = 0.5  # We should only lose 1
    parser = TransposedPedigreeParser(self.tfam_filename, self.miss_tped_filename)
    parser.load_tfam(pheno_covar)
    parser.load_genotypes()

    expected_map = get_lines(self.miss_tped_filename, split=True)
    genotypes_w_missing = [
        [0, 0],
        [1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1],
        [0, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0],
        [0, 2, 1, 1, 0, 0, 1, 2, 1, 1, 0],
        [1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
    ]
    hetero_freq_tped = [0.3636, 0.5, 0.3636, 0.4545, 0.3636, 0.2727, 0.2727]

    self.assertEqual(7, parser.locus_count)

    # Pass 1: the locus listing only carries chromosome and position
    index = 0
    for locus in parser.get_loci():
        row = expected_map[index]
        self.assertEqual(int(row[0]), locus.chr)
        self.assertEqual(int(row[3]), locus.pos)
        index += 1

    # Pass 2: full iteration with an all-True subject filter
    index = 0
    for snp in parser:
        snp_filter = numpy.ones(snp.missing_genotypes.shape[0]) == 1
        try:
            genodata = snp.get_genotype_data(snp_filter)
            row = expected_map[index]
            self.assertEqual(int(row[0]), snp.chr)
            self.assertEqual(int(row[3]), snp.pos)
            self.assertEqual(row[1], snp.rsid)
            self.assertAlmostEqual(hetero_freq_tped[index],
                                   genodata.hetero_freq, places=4)
            self.assertEqual(genotypes_w_missing[index], list(genodata.genotypes))
        except (TooMuchMissing, InvalidFrequency):
            pass
        index += 1
    self.assertEqual(7, index)
DataParser.ind_miss_tol = args.mind DataParser.ind_exclusions = ParseIndList(args.remove) PhenoCovar.sex_as_covariate = args.sex if args.compressed: DataParser.compressed_pedigree = True DataParser.has_sex = not args.no_sex DataParser.has_parents = not args.no_parents DataParser.has_fid = not args.no_fid DataParser.has_pheno = not args.no_pheno DataParser.has_liability = args.liability pheno_covar = PhenoCovar() self.verbose=False if args.verbose: self.verbose = True if args.file != None or args.ped or args.map: if args.ped and not args.map or args.map and not args.ped: print >> sys.stderr, "When analyzing pedigree data, both .map and .ped must be specified" sys.exit(1) if args.ped: dataset = pedigree_parser.Parser(args.map.name, args.ped.name) else: dataset = pedigree_parser.Parser("%s.map" % (args.file), "%s.ped" % (args.file)) dataset.load_mapfile(map3=args.map3) dataset.load_genotypes(pheno_covar)