Esempio n. 1
0
    def testInfoFileUseNoChrPos(self):
        # With chr:pos decoding switched off and explicit info files supplied,
        # every locus is expected to report "NA" for both chr and pos while
        # its rsid carries the "chrom:position" label.
        mach_parser.Parser.chrpos_encoding = False
        DataParser.boundary = SnpBoundaryCheck(self.locus_labels)
        # An invalid gen_ext is set on purpose so we can be certain the
        # parser is using the info files handed to it.
        mach_parser.Parser.gen_ext = 'asdf'
        PhenoCovar.sex_as_covariate = True

        pheno = PhenoCovar()
        dose_files = [self.gen_file, self.gen_file2]
        info_files = [self.info_file1, self.info_file2]
        parser = mach_parser.Parser(dose_files, info_files=info_files)
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual("NA", locus.pos)
            self.assertEqual("NA", locus.chr)
            expected_id = "%s:%s" % (self.chroms[seen], self.positions[seen])
            self.assertEqual(expected_id, locus.rsid)
            # Compare the parsed dosages against the fixture, column by column.
            for col, expected in enumerate(self.dosage_encoding[seen]):
                self.assertAlmostEqual(expected,
                                       locus.genotype_data[col],
                                       places=3)
            seen += 1
        # Both dose files together must yield exactly 20 loci.
        self.assertEqual(20, seen)
Esempio n. 2
0
    def testMAF(self):
        # The MAF reported for each locus must agree (to 3 places) with the
        # mean of its halved dosage values.
        mach_parser.Parser.chrpos_encoding = True
        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual(self.positions[seen], locus.pos)
            expected_maf = numpy.mean(locus.genotype_data/2)
            self.assertAlmostEqual(expected_maf, locus.maf, places=3)
            seen += 1
        # A single dose file is expected to yield 10 loci.
        self.assertEqual(10, seen)
Esempio n. 3
0
    def testFilterMAF(self):
        # With a minimum MAF of 0.45, loci whose fixture MAF is below the
        # threshold must not be yielded; the remaining ones keep their
        # position and allele assignments.
        mach_parser.Parser.chrpos_encoding = True
        DataParser.min_maf = 0.45
        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        cursor = 0
        for locus in parser:
            # Advance past fixture loci that fall under the threshold so the
            # cursor lines up with the locus the parser actually yielded.
            while numpy.mean(self.mafs[cursor]) < DataParser.min_maf:
                cursor += 1
            self.assertEqual(self.positions[cursor], locus.pos)
            self.assertEqual(locus.major_allele, self.allele_2[cursor])
            self.assertEqual(locus.minor_allele, self.allele_1[cursor])
            cursor += 1
Esempio n. 4
0
    def testAlelles(self):
        # Major/minor alleles reported by the parser must match the fixture's
        # allele_2/allele_1 lists respectively, locus by locus.
        mach_parser.Parser.chrpos_encoding = True

        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual(self.positions[seen], locus.pos)
            self.assertEqual(locus.major_allele, self.allele_2[seen])
            self.assertEqual(locus.minor_allele, self.allele_1[seen])
            seen += 1
        # A single dose file is expected to yield 10 loci.
        self.assertEqual(10, seen)
Esempio n. 5
0
    def testChromosomesNoChrPos(self):
        # Without chr:pos decoding, chr and pos are both expected to be "NA"
        # and the rsid is expected to be the "chrom:position" label.
        mach_parser.Parser.chrpos_encoding = False

        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file, self.gen_file2])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual("NA", locus.pos)
            self.assertEqual("NA", locus.chr)
            expected_id = "%d:%d" % (self.chroms[seen], self.positions[seen])
            self.assertEqual(expected_id, locus.rsid)
            seen += 1
        # Both dose files together must yield exactly 20 loci.
        self.assertEqual(20, seen)
Esempio n. 6
0
    def testBoundedMiddle(self):
        # Restrict analysis to chromosome 2, bp 1020-1137, and check that the
        # loci yielded line up with fixture indices 8 through 12.
        mach_parser.Parser.chrpos_encoding = True
        BoundaryCheck.chrom = 2
        DataParser.boundary = BoundaryCheck(bp=[1020, 1137])
        PhenoCovar.sex_as_covariate = True

        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file, self.gen_file2])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        # Index of the first fixture locus expected inside the boundary.
        cursor = 8
        for locus in parser:
            self.assertEqual(self.positions[cursor], locus.pos)
            for col, expected in enumerate(self.dosage_encoding[cursor]):
                self.assertAlmostEqual(expected, locus.genotype_data[col], places=3)
            cursor += 1
        # Five loci (8..12) should have been visited.
        self.assertEqual(13, cursor)
Esempio n. 7
0
    def testFilteredInd(self):
        # Exclude the first two individuals and check that each locus carries
        # 10 dosages matching the fixture values from the third individual on.
        mach_parser.Parser.chrpos_encoding = True
        DataParser.ind_exclusions = self.ind_ids[0:2]
        PhenoCovar.sex_as_covariate = True

        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file, self.gen_file2])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual(self.positions[seen], locus.pos)
            self.assertEqual(10, locus.genotype_data.shape[0])
            # Fixture columns 2.. map onto parsed columns 0.. once the first
            # two individuals have been dropped.
            for col, expected in enumerate(self.dosage_encoding[seen][2:]):
                self.assertAlmostEqual(expected, locus.genotype_data[col], places=3)
            seen += 1
        self.assertEqual(20, seen)
Esempio n. 8
0
    def testValues(self):
        # Parse both compressed dose files and verify position, chromosome
        # and dosage values for every locus against the fixture.
        mach_parser.Parser.chrpos_encoding = True
        PhenoCovar.sex_as_covariate = True

        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file, self.gen_file2])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual(self.positions[seen], locus.pos)
            self.assertEqual(self.chroms[seen], locus.chr)

            for col, expected in enumerate(self.dosage_encoding[seen]):
                self.assertAlmostEqual(expected, locus.genotype_data[col], places=3)
            seen += 1
        # Both dose files together must yield exactly 20 loci.
        self.assertEqual(20, seen)
Esempio n. 9
0
    def testLongerList(self):
        # Feed the same pair of dose files three times over and check that
        # the parser walks all 60 loci in order.
        mach_parser.Parser.chrpos_encoding = True
        PhenoCovar.sex_as_covariate = True

        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.gen_file, self.gen_file2]*3)
        parser.load_family_details(pheno)
        parser.load_genotypes()

        # Expected values are simply the fixture repeated three times.
        expected_dosages = numpy.vstack((self.dosage_encoding,) * 3)
        expected_positions = self.positions * 3

        seen = 0
        for locus in parser:
            self.assertEqual(expected_positions[seen], locus.pos)
            for col, expected in enumerate(expected_dosages[seen]):
                self.assertAlmostEqual(expected, locus.genotype_data[col], places=3)
            seen += 1
        self.assertEqual(60, seen)
Esempio n. 10
0
    def testInfoFileUse(self):
        # Supply the info files explicitly and verify positions and dosages
        # for every locus across both dose files.
        mach_parser.Parser.chrpos_encoding = True

        # An invalid gen_ext is set on purpose so we can be certain the
        # parser is using the info files handed to it.
        mach_parser.Parser.gen_ext = 'asdf'
        PhenoCovar.sex_as_covariate = True

        pheno = PhenoCovar()
        dose_files = [self.gen_file, self.gen_file2]
        info_files = [self.info_file1, self.info_file2]
        parser = mach_parser.Parser(dose_files, info_files=info_files)
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual(self.positions[seen], locus.pos)
            for col, expected in enumerate(self.dosage_encoding[seen]):
                self.assertAlmostEqual(expected, locus.genotype_data[col], places=3)
            seen += 1
        # Both dose files together must yield exactly 20 loci.
        self.assertEqual(20, seen)
Esempio n. 11
0
    def testValuesUncompressed(self):
        # Same checks as for the gzipped inputs, but reading the plain-text
        # dose files with compressed_pedigree switched off.
        mach_parser.Parser.chrpos_encoding = True
        DataParser.compressed_pedigree = False
        mach_parser.Parser.gen_ext = "gen"
        PhenoCovar.sex_as_covariate = True

        pheno = PhenoCovar()
        parser = mach_parser.Parser([self.uncmp_1, self.uncmp_2])
        parser.load_family_details(pheno)
        parser.load_genotypes()

        seen = 0
        for locus in parser:
            self.assertEqual(self.positions[seen], locus.pos)
            self.assertEqual(self.chroms[seen], locus.chr)

            for col, expected in enumerate(self.dosage_encoding[seen]):
                self.assertAlmostEqual(expected, locus.genotype_data[col], places=3)
            seen += 1
        # Both dose files together must yield exactly 20 loci.
        self.assertEqual(20, seen)
Esempio n. 12
0
    def WriteTestFiles(self, prefix = "__test_imputed"):

        self.ind_ids = ["ID0001->FAM001",
                        "ID0002->FAM002",
                        "ID0003->FAM003",
                        "ID0004->FAM004",
                        "ID0005->FAM005",
                        "ID0006->FAM006",
                        "ID0007->FAM007",
                        "ID0008->FAM008",
                        "ID0009->FAM009",
                        "ID0010->FAM010",
                        "ID0011->FAM011",
                        "ID0012->FAM012"]

        self.gen_file = "%s.dose.gz" % (prefix)
        self.gen_file2 = "%s-2.dose.gz" % (prefix)
        self.info_file1 = "%s.info.gz" % (prefix)
        self.info_file2 = "%s-2.info.gz" % (prefix)
        self.uncmp_1 = "%s.dose" % (prefix)
        self.uncmp_2 = "%s-2.dose" % (prefix)
        self.info_ucmp1 = "%s.info" % (prefix)
        self.info_ucmp2 = "%s-2.info" % (prefix)
        self.mach_file = "%s.mach" % (prefix)

        self.pheno_covar = "__phenocovar.txt"
        with open(self.pheno_covar, "w") as file:
            file.write("""FID\tIID\tSEX\tAGE\tBMI
ID0001\tFAM001\t1\t30\t28.54
ID0002\tFAM002\t1\t33\t30.10
ID0003\tFAM003\t2\t28\t24.00
ID0004\tFAM004\t2\t40\t29.21
ID0005\tFAM005\t1\t50\t31.23
ID0006\tFAM006\t1\t30\t29.54
ID0007\tFAM007\t1\t33\t33.10
ID0008\tFAM008\t2\t28\t27.00
ID0009\tFAM009\t2\t40\t27.21
ID0010\tFAM010\t1\t50\t30.23
ID0011\tFAM011\t2\t28\t24.00
ID0012\tFAM012\t2\t40\t29.21
""")

        with open(self.mach_file, "w") as file:
            print("%s.dose.gz %s.info.gz" % (prefix, prefix), file=file)
            print("%s-2.dose.gz %s-2.info.gz" % (prefix, prefix), file=file)

        gen_file = gzip.open(self.gen_file, 'wt')
        uncmp_file = open(self.uncmp_1, 'w')
        idx = 0
        self.dosage_encoding = numpy.zeros((20, 12))
        self.positions = []
        self.mafs = numpy.zeros(len(base_freq) * 2)

        info_file = open(self.info_file1, 'w')
        print("snp_id rs_id position exp_freq_a1 info certainty type info_type0 concord_type0 r2_type0", file=info_file)

        self.chroms = [ int(x) for x in ['1'] * 7 + ['2'] * 7 + ['3'] * 6]
        self.positions = [1012, 1020, 1026, 1032, 1100, 1137, 1149] * 2 + [1012, 1020, 1026, 1032, 1100, 1137]
        self.alleles = [list(numpy.random.choice(['A','C','G','T'], 2, replace=False)) for x in range(0, 20)]
        idx = 0

        mafs = numpy.zeros((10))
        dosages = numpy.zeros((12, 10))
        for ind in self.ind_ids:
            f = numpy.random.normal(base_freq, scale=0.1)
            f[f>=1.0] = 0.99
            maf = 1.0 - f
            AA = f * f
            Aa = 2 * f * maf
            aa = maf * maf
            dosages[idx] = Aa + 2*AA
            mafs += dosages[idx] / 2
            print("\t".join([
                ind,
                "DOSE"] +
                ["%.3f" % x for x in dosages[idx]]
            ), file=gen_file)
            print("\t".join([
                ind,
                "DOSE"] +
                ["%.3f" % x for x in dosages[idx]]
            ), file=uncmp_file)
            idx += 1
        self.mafs[0:10] = mafs/10
        self.dosage_encoding[0:10,:] = numpy.transpose(dosages)
        info_file.close()
        gen_file.close()
        uncmp_file.close()
        info_file = gzip.open(self.info_file1, 'wt')
        info_ufile = open(self.info_ucmp1, 'w')
        print("SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tdose2", file=info_file)
        print("SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tdose2", file=info_ufile)
        for idx in range(0, 10):
            print("\t".join([
                "%s:%d" % (self.chroms[idx],self.positions[idx]),
                self.allele_1[idx],
                self.allele_2[idx],
                str(1.0-self.mafs[idx]),
                str(self.mafs[idx]),
                '0.99912',
                '0.8',
                "\t".join(['-'] * 6)
            ]), file=info_file)
            print("\t".join([
                "%s:%d" % (self.chroms[idx],self.positions[idx]),
                self.allele_1[idx],
                self.allele_2[idx],
                str(1.0-self.mafs[idx]),
                str(self.mafs[idx]),
                '0.99912',
                '0.8',
                "\t".join(['-'] * 6)
            ]), file=info_ufile)
        info_file.close()
        info_ufile.close()


        gen_file = gzip.open(self.gen_file2, 'wt')
        uncmp_file = open(self.uncmp_2, 'w')

        idx = 0
        mafs = numpy.zeros((10))
        dosages = numpy.zeros((12, 10))
        for ind in self.ind_ids:
            f = numpy.random.normal(base_freq, scale=0.1)
            f[f>=1.0] = 0.99
            maf = 1.0 - f
            mafs += maf
            AA = f * f
            Aa = 2 * f * maf
            aa = maf * maf
            dosages[idx] = Aa + 2*aa
            print("\t".join([
                ind,
                "DOSE"] +
                ["%.3f" % x for x in dosages[idx]]
            ), file=gen_file)
            print("\t".join([
                ind,
                "DOSE"] +
                ["%.3f" % x for x in dosages[idx]]
            ), file=uncmp_file)
            idx += 1
        self.mafs[10:] = mafs/10
        self.dosage_encoding[10:,:] = numpy.transpose(dosages)

        gen_file.close()
        uncmp_file.close()

        info_file = gzip.open(self.info_file2, 'wt')
        info_cfile = open(self.info_ucmp2, 'w')
        print("SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tdose2", file=info_file)
        print("SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tdose2", file=info_cfile)
        for idx in range(10, 20):
            print("\t".join([
                "%s:%d" % (self.chroms[idx],self.positions[idx]),
                self.allele_1[idx],
                self.allele_2[idx],
                str(1.0-self.mafs[idx]),
                str(self.mafs[idx]),
                '0.99912',
                '0.8',
                "\t".join(['-'] * 6)
            ]), file=info_file)
            print("\t".join([
                "%s:%d" % (self.chroms[idx],self.positions[idx]),
                self.allele_1[idx],
                self.allele_2[idx],
                str(1.0-self.mafs[idx]),
                str(self.mafs[idx]),
                '0.99912',
                '0.8',
                "\t".join(['-'] * 6)
            ]), file=info_cfile)
        info_cfile.close()
        info_file.close()

        self.mach_parser = mach_parser.Parser([self.gen_file])
Esempio n. 13
0
    def LoadCmdLine(self, args=sys.argv[1:]):
        """Parse user arguments using argparse and set up components"""
        parser = argparse.ArgumentParser(description="MV Test: " + __version__,
                                         epilog="""
mvtest.py is uses many of the same arguments as plink, but there are a few
differences, so please consider the list above carefully.
        """)

        parser.add_argument("-v",
                            action='store_true',
                            help="Print version number")
        parser.add_argument(
            "--vall",
            action='store_true',
            help="Print version number along with each dependency")

        parser.add_argument("--chr",
                            type=int,
                            default=-1,
                            metavar="N",
                            help="Select Chromosome")
        parser.add_argument(
            "--snps",
            type=str,
            default="",
            help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6")
        parser.add_argument("--from-bp",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-bp",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument("--from-kb",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-kb",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument("--from-mb",
                            type=int,
                            metavar="START",
                            help="SNP range start")
        parser.add_argument("--to-mb",
                            type=int,
                            metavar="END",
                            help="SNP range end")
        parser.add_argument(
            "--exclude",
            type=str,
            default="",
            help="Comma-delimited list of rsids to be excluded")

        # For now, I'm not implementing keep, since we don't have any real meaningful need for analyzing individuals
        # PLINK does, but we don't do the QC stuff they do.
        parser.add_argument(
            "--keep",
            type=str,
            default="",
            help="Comma-delimited list of individuals to be analyzed")
        parser.add_argument(
            "--remove",
            type=str,
            default="",
            help=
            "Comma-delimited list of individuals to be removed from analysis")

        parser.add_argument("--file",
                            type=str,
                            help="Prefix for .ped and .map files")
        parser.add_argument("--ped",
                            type=argparse.FileType('r'),
                            help="PLINK compatible .ped file")
        parser.add_argument("--map",
                            type=argparse.FileType('r'),
                            help="PLINK compatible .map file")
        parser.add_argument("--map3",
                            action='store_true',
                            help="MAP file has only 3 columns")
        parser.add_argument("--no-sex",
                            action='store_true',
                            help="Pedigree file doesn't have column 5 (sex)")
        parser.add_argument(
            "--no-parents",
            action="store_true",
            help="Pedigree file doesn't have columns 3 and 4 (parents)")
        parser.add_argument(
            "--no-fid",
            action="store_true",
            help="Pedigree file doesn't have column 1 (family ID)")
        parser.add_argument(
            "--no-pheno",
            action="store_true",
            help="Pedigree file doesn't have column 6 (phenotype")
        parser.add_argument("--liability",
                            action="store_true",
                            help="Pedigree file has column 7 (liability)")

        parser.add_argument("--bfile",
                            type=str,
                            help="Prefix for .bed, .bim and .fam files")
        parser.add_argument("--bed",
                            type=argparse.FileType('r'),
                            help="Binary Ped file (.bed)")
        parser.add_argument("--bim",
                            type=argparse.FileType('r'),
                            help="Binary ped marker file (.bim)")
        parser.add_argument("--fam",
                            type=argparse.FileType('r'),
                            help="Binary ped family file (.fam)")

        parser.add_argument("--tfile",
                            type=str,
                            help="Prefix for .tped and .tfam files")
        parser.add_argument("--tped",
                            type=argparse.FileType('r'),
                            help="Transposed Pedigree file (.tped)")
        parser.add_argument("--tfam",
                            type=argparse.FileType('r'),
                            help="Transposed pedigre Family file (.tfam)")
        parser.add_argument(
            "--compressed",
            action="store_true",
            help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)")

        parser.add_argument(
            "--impute",
            type=argparse.FileType('r'),
            help="File containing list of impute output for analysis")
        parser.add_argument(
            "--impute-fam",
            type=argparse.FileType('r'),
            help="File containing family details for impute data")
        parser.add_argument(
            "--impute-offset",
            type=int,
            default=-1,
            help="Impute file index (1 based) to begin analysis")
        parser.add_argument(
            "--impute-count",
            type=int,
            default=-1,
            help="Number of impute files to process (for this node)")
        parser.add_argument(
            "--impute-uncompressed",
            action="store_true",
            help="Indicate that the impute input is not gzipped, but plain text"
        )
        parser.add_argument(
            "--impute-encoding",
            type=str,
            choices=['additive', 'dominant', 'recessive', 'genotype'],
            default='additive',
            help='Genetic model to be used')
        parser.add_argument("--impute-info-ext",
                            type=str,
                            default='info',
                            help="Portion of filename denotes info filename")
        parser.add_argument("--impute-gen-ext",
                            type=str,
                            default='gen.gz',
                            help="Portion of filename that denotes gen file")
        parser.add_argument(
            "--impute-info-thresh",
            type=float,
            default=0.4,
            help="Threshold for filtering imputed SNPs with poor 'info' values"
        )

        parser.add_argument(
            "--mach",
            type=argparse.FileType('r'),
            help="File containing list of MACH output for analysis")
        parser.add_argument("--mach-offset",
                            type=int,
                            default=-1,
                            help="Mach file index (1 based) to begin analysis")
        parser.add_argument(
            "--mach-count",
            type=int,
            default=-1,
            help="Number of mach files to process (for this node)")
        parser.add_argument("--mach-uncompressed",
                            action="store_true",
                            help="Indicate that the mach input is not gzipped")
        parser.add_argument(
            "--mach-chunk-size",
            type=int,
            default=100000,
            help=
            "Max number of loci to load at once (higher increases memory requirements with some speed benefits)"
        )
        parser.add_argument("--mach-info-ext",
                            type=str,
                            default="info.gz",
                            help="Portion of filename denotes info filenames")
        parser.add_argument("--mach-dose-ext",
                            type=str,
                            default="dose.gz",
                            help="Portion of filename that denotes dose files")
        parser.add_argument("--mach-min-rsquared",
                            type=float,
                            default=0.3,
                            help="Filter out loci with RSquared < this value")
        parser.add_argument(
            "--mach-chrpos",
            action="store_true",
            help=
            "When true, first col in .info file must be chr:pos (additional pieces allowed)"
        )

        parser.add_argument("--pheno",
                            type=argparse.FileType('r'),
                            help="File containing phenotypes")
        parser.add_argument("--sample-pheno",
                            type=argparse.FileType('r'),
                            help="(Mach) Sample file containing phenotypes")
        parser.add_argument(
            "--mphenos",
            type=str,
            default="",
            help=
            "Column number(s) for phenotype to be analyzed if number of columns > 1"
        )
        parser.add_argument(
            "--pheno-names",
            type=str,
            default="",
            help=
            "Name for phenotype(s) to be analyzed (must be in --pheno file)")
        parser.add_argument("--all-pheno",
                            action="store_true",
                            help="Analyze all columns from the phenotype file")
        #parser.add_argument("--all-pheno", action='store_true', help="Analyze each phenotype")

        parser.add_argument("--covar",
                            type=argparse.FileType('r'),
                            help="File containing covariates")
        parser.add_argument("--sample-covar",
                            type=argparse.FileType('r'),
                            help="(Mach) Sample file containing covariates")
        parser.add_argument("--covar-numbers",
                            type=str,
                            default="",
                            help="Comma-separated list of covariate indices")
        parser.add_argument("--covar-names",
                            type=str,
                            default="",
                            help="Comma-separated list of covariate names")
        parser.add_argument(
            "--sex",
            action='store_true',
            help="Use sex from the pedigree file as a covariate")
        parser.add_argument("--missing-phenotype",
                            type=float,
                            default=-9.0,
                            help="Encoding for missing phenotypes")

        parser.add_argument("--maf",
                            type=float,
                            default=0.0,
                            help="Minimum MAF allowed for analysis")
        parser.add_argument("--max-maf",
                            type=float,
                            default=1.0,
                            help="MAX MAF allowed for analysis")
        parser.add_argument("--geno",
                            type=float,
                            default=1.0,
                            help="MAX per-SNP missing for analysis")
        parser.add_argument("--mind",
                            type=float,
                            default=1.0,
                            help="MAX per-person missing")

        parser.add_argument("--verbose",
                            action='store_true',
                            help="Output additional data details")

        parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False)
        args = parser.parse_args(args)

        # Report version, if requested, and exit
        if args.v:
            print("%s: %s" % (os.path.basename(__file__), __version__),
                  file=sys.stderr)
            sys.exit(0)

        if args.vall:
            print("%s: %s" % (os.path.basename(__file__), __version__),
                  file=sys.stderr)
            print("%s: %s" %
                  (os.path.dirname(libgwas.__file__), libgwas.__version__),
                  file=sys.stderr)
            print("%s: %s" %
                  (os.path.dirname(scipy.__file__), scipy.__version__),
                  file=sys.stderr)
            print("%s: %s" %
                  (os.path.dirname(numpy.__file__), numpy.__version__),
                  file=sys.stderr)
            sys.exit(0)

        ###############################################################################################################
        # Here we deal with the various ways we filter SNPs in and out of anlysis
        # We might handle MACH files differently. We'll default the chromosome
        # to be "NA" which is how those can be returned.
        if args.mach is None or args.mach_chrpos:
            BoundaryCheck.chrom = args.chr
        else:
            if args.chr != -1:
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))
            BoundaryCheck.chrom = "NA"
        snps = args.snps.split(",")
        try:
            b = BoundaryCheck(bp=(args.from_bp, args.to_bp),
                              kb=(args.from_kb, args.to_kb),
                              mb=(args.from_mb, args.to_mb))
        except InvalidBoundarySpec as e:
            print("Invalid boundary spec associated: %s" %
                  (e.malformed_boundary),
                  file=sys.stderr)
            sys.exit(1)
        try:
            s = SnpBoundaryCheck(snps=snps)
        except InvalidBoundarySpec as e:
            print("Invalid SNP boundary defined: %s" % (e.malformed_boundary),
                  file=sys.stderr)
            print(
                "SNPs must be either single or have be a range such as rs123-rs345",
                file=sys.stderr)
            sys.exit(1)

        if b.valid and s.valid:
            print(
                "Only one type of boundary conditions is permitted. Either use --from-bp, etc. or rs123-rs345. ",
                file=sys.stderr)
            sys.exit(1)

        if len(b.bounds) > 0 and not b.valid:
            if BoundaryCheck.chrom == "NA":
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))

        if s.valid:
            DataParser.boundary = s
        # If b isn't valid, we still want to potentially allow for chr and SNPs, it just won't have
        else:
            b.LoadSNPs(snps)
            # any actual boundary listings
            DataParser.boundary = b
        DataParser.boundary.LoadExclusions(snps=args.exclude.split(","))

        ###############################################################################################################
        # Setup the various Dataset filter criteria
        DataParser.min_maf = args.maf
        DataParser.max_maf = args.max_maf
        DataParser.snp_miss_tol = args.geno
        DataParser.ind_miss_tol = args.mind

        DataParser.ind_exclusions = ParseIndList(args.remove)

        PhenoCovar.sex_as_covariate = args.sex

        if args.compressed:
            DataParser.compressed_pedigree = True

        DataParser.has_sex = not args.no_sex
        DataParser.has_parents = not args.no_parents
        DataParser.has_fid = not args.no_fid
        DataParser.has_pheno = not args.no_pheno
        DataParser.has_liability = args.liability

        pheno_covar = PhenoCovar()
        self.verbose = False
        if args.verbose:
            self.verbose = True

        if args.file != None or args.ped or args.map:
            if args.ped and not args.map or args.map and not args.ped:
                print(
                    "When analyzing pedigree data, both .map and .ped must be specified",
                    file=sys.stderr)
                sys.exit(1)
            if args.ped:
                dataset = pedigree_parser.Parser(args.map.name, args.ped.name)
            else:
                dataset = pedigree_parser.Parser("%s.map" % (args.file),
                                                 "%s.ped" % (args.file))

            dataset.load_mapfile(map3=args.map3)
            dataset.load_genotypes(pheno_covar)
        elif args.tfile != None or args.tped or args.tfam:
            if args.tped and not args.tfam or args.tfam and not args.tped:
                print(
                    "When analyzing transposed pedigree data, both .tfam and .tped must be specified",
                    file=sys.stderr)
                sys.exit(1)
            if args.tped:
                dataset = transposed_pedigree_parser.Parser(
                    args.tfam.name, args.tped.name)
            else:
                dataset = transposed_pedigree_parser.Parser(
                    "%s.tfam" % (args.tfile), "%s.tped" % (args.tfile))
            dataset.load_tfam(pheno_covar)
            dataset.load_genotypes()
        elif args.bfile != None:
            dataset = bed_parser.Parser("%s.fam" % (args.bfile),
                                        "%s.bim" % (args.bfile),
                                        "%s.bed" % (args.bfile))
            dataset.load_bim(map3=args.map3)
            dataset.load_fam(pheno_covar)
            dataset.load_genotypes()
        elif args.bed or args.bim or args.fam:
            if (args.bed and not args.fam or not args.bim) or (
                    args.bim and not args.bed
                    or not args.fam) or (args.fam and not args.bed
                                         or not args.bim):
                print(
                    "When analyzing binary pedigree data, .bed, .bim and .fam files must be provided",
                    file=sys.stderr)
                sys.exit(1)
            dataset = bed_parser.Parser(args.fam, args.bim, args.bed)
            dataset.load_bim(map3=args.map3)
            dataset.load_fam(pheno_covar)
            dataset.load_genotypes()
        elif args.impute:
            DataParser.compressed_pedigree = not args.impute_uncompressed

            if (args.impute_offset > 0 and args.impute_count == -1) or (
                    args.impute_offset == -1 and args.impute_count > 0):
                print(
                    "--impute-count and --impute_offset must both > 0 if one is set other than -1.  ",
                    file=sys.stderr)
                sys.exit(1)
            if DataParser.snp_miss_tol != 1.0:
                print("--geno does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            if DataParser.ind_miss_tol != 1.0:
                print("--mind does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            impute_parser.SetEncoding(args.impute_encoding)
            impute_parser.Parser.info_ext = args.impute_info_ext
            impute_parser.Parser.info_threshold = args.impute_info_thresh
            libgwas.ExitIf(
                "--impute-fam is required for when processing imputed data",
                args.impute_fam == None)
            archives, chroms, infos = self.ParseImputeFile(
                args.impute.name, args.impute_offset, args.impute_count)
            dataset = impute_parser.Parser(args.impute_fam.name, archives,
                                           chroms, infos)
            dataset.load_family_details(pheno_covar)
            dataset.load_genotypes()
        elif args.mach:

            DataParser.compressed_pedigree = not args.mach_uncompressed
            if (args.mach_offset > 0
                    and args.mach_count == -1) or (args.mach_offset == -1
                                                   and args.impute_count > 0):
                print(
                    "--mach-count and --mach_offset must both be > 0 if one is set other than -1. ",
                    file=sys.stderr)
                sys.exit(1)
            if DataParser.snp_miss_tol != 1.0:
                print("--geno does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            if DataParser.ind_miss_tol != 1.0:
                print("--mind does not have any impact on imputed data",
                      file=sys.stderr)
                sys.exit(1)
            if BoundaryCheck.chrom != "NA" and not args.mach_chrpos:
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))
            mach_parser.Parser.chrpos_encoding = args.mach_chrpos
            mach_parser.Parser.info_ext = args.mach_info_ext
            mach_parser.Parser.dosage_ext = args.mach_dose_ext
            mach_parser.Parser.chunk_stride = args.mach_chunk_size
            mach_parser.Parser.min_rsquared = args.mach_min_rsquared
            archives, infos = self.ParseMachFile(args.mach.name,
                                                 args.mach_offset,
                                                 args.mach_count)
            dataset = mach_parser.Parser(archives, infos)
            dataset.load_family_details(pheno_covar)
            dataset.load_genotypes()

        else:
            parser.print_usage(sys.stderr)
            print(
                "\nNo data has been specified. Users must specify either pedigree or transposed pedigree to continue",
                file=sys.stderr)
            sys.exit(1)

        if args.pheno or args.sample_pheno:
            mphenos = []
            if args.mphenos != "":
                mphenos = args.mphenos.split(",")

            nphenos = []
            if args.pheno_names != "":
                nphenos = args.pheno_names.split(",")

            if len(mphenos) + len(nphenos) == 0 and not args.all_pheno:
                libgwas.Exit("You must select one or more phenotypes when ")
            sample_file = False
            pheno_filename = args.pheno
            if args.sample_pheno:
                pheno_filename = args.sample_pheno
                sample_file = True
            pheno_covar.load_phenofile(pheno_filename, mphenos, nphenos,
                                       sample_file)

        if args.covar:
            pheno_covar.load_covarfile(args.covar,
                                       args.covar_numbers.split(","),
                                       args.covar_names.split(","))
        pheno_covar.do_standardize_variables = True
        return dataset, pheno_covar
Esempio n. 14
0
    def WriteTestFiles(self, prefix = "__test_imputed"):
        """Write the dose/info fixture files and record the expected values.

        Two data sets of 10 SNPs x 12 individuals are generated.  Each one is
        written four times: a gzipped and a plain-text dose file, plus a
        gzipped and a plain-text info file.  File names are derived from
        ``prefix`` and stored on ``self`` (``gen_file``, ``gen_file2``,
        ``info_file1``, ``info_file2``, ``uncmp_1``, ``uncmp_2``,
        ``info_ucmp1``, ``info_ucmp2``).

        The method also sets ``self.ind_ids``, ``self.chroms``,
        ``self.positions``, ``self.alleles``, ``self.dosage_encoding``
        (20 x 12, one row per SNP), ``self.mafs`` and ``self.mach_parser``.

        :param prefix: leading part of every file name that is written.
        """
        self.ind_ids = ["ID%04d:FAM%03d" % (n, n) for n in range(1, 13)]

        self.gen_file = "%s.dose.gz" % (prefix)
        self.gen_file2 = "%s-2.dose.gz" % (prefix)
        self.info_file1 = "%s.info.gz" % (prefix)
        self.info_file2 = "%s-2.info.gz" % (prefix)
        self.uncmp_1 = "%s.dose" % (prefix)
        self.uncmp_2 = "%s-2.dose" % (prefix)
        self.info_ucmp1 = "%s.info" % (prefix)
        self.info_ucmp2 = "%s-2.info" % (prefix)

        self.dosage_encoding = numpy.zeros((20, 12))
        self.mafs = numpy.zeros(len(base_freq) * 2)

        self.chroms = [ int(x) for x in ['1'] * 7 + ['2'] * 7 + ['3'] * 6]
        self.positions = [1012, 1020, 1026, 1032, 1100, 1137, 1149] * 2 + [1012, 1020, 1026, 1032, 1100, 1137]
        # NOTE(review): self.alleles is generated here, but the info rows are
        # built from self.allele_1 / self.allele_2, which this method never
        # assigns -- presumably they are set elsewhere in the test case;
        # confirm.
        self.alleles = [list(numpy.random.choice(['A','C','G','T'], 2, replace=False)) for x in range(0, 20)]

        # First data set: SNP rows 0-9.
        dosages, freq_sum = self._write_mach_dose_files(
            self.gen_file, self.uncmp_1, True)
        # NOTE(review): the sum runs over all 12 individuals but is divided by
        # 10.  Kept unchanged because existing assertions may depend on these
        # exact values -- verify whether len(self.ind_ids) was intended.
        self.mafs[0:10] = freq_sum/10
        self.dosage_encoding[0:10,:] = numpy.transpose(dosages)
        self._write_mach_info_files(
            self.info_file1, self.info_ucmp1, range(0, 10))

        # Second data set: SNP rows 10-19.
        dosages, freq_sum = self._write_mach_dose_files(
            self.gen_file2, self.uncmp_2, False)
        self.mafs[10:] = freq_sum/10
        self.dosage_encoding[10:,:] = numpy.transpose(dosages)
        self._write_mach_info_files(
            self.info_file2, self.info_ucmp2, range(10, 20))

        self.mach_parser = mach_parser.Parser([self.gen_file])

    def _write_mach_dose_files(self, gz_path, plain_path, count_f_allele):
        """Write one random dose data set to a gzipped and a plain file.

        For every entry of ``self.ind_ids`` a frequency vector ``f`` is drawn
        around ``base_freq`` (values >= 1.0 are replaced by 0.99) and one
        tab separated line ``<id> DOSE <10 dosages, %.3f>`` is written to both
        files.

        :param gz_path: path of the gzip-compressed dose file.
        :param plain_path: path of the uncompressed dose file.
        :param count_f_allele: if true the dosage is ``2*f*(1-f) + 2*f*f`` and
            half of it is accumulated; otherwise the dosage is
            ``2*f*(1-f) + 2*(1-f)*(1-f)`` and ``1-f`` is accumulated.
        :return: tuple ``(dosages, freq_sum)`` where ``dosages`` is the
            12 x 10 array that was written and ``freq_sum`` the per-SNP
            accumulated value (not yet divided).
        """
        freq_sum = numpy.zeros((10))
        dosages = numpy.zeros((12, 10))

        # Context managers guarantee both files are flushed and closed, even
        # when an exception interrupts the loop.
        with gzip.open(gz_path, 'wb') as gz_file, \
                open(plain_path, 'w') as plain_file:
            for row, ind in enumerate(self.ind_ids):
                f = numpy.random.normal(base_freq, scale=0.1)
                f[f>=1.0] = 0.99
                maf = 1.0 - f
                Aa = 2 * f * maf
                if count_f_allele:
                    dosages[row] = Aa + 2*(f * f)
                    freq_sum += dosages[row] / 2
                else:
                    freq_sum += maf
                    dosages[row] = Aa + 2*(maf * maf)

                line = "\t".join(
                    [ind, "DOSE"] + ["%.3f" % x for x in dosages[row]]
                ) + "\n"
                # The gzip handle is binary; encoding keeps the bytes the same
                # on Python 2 and makes the write valid on Python 3.
                gz_file.write(line.encode("ascii"))
                plain_file.write(line)

        return dosages, freq_sum

    def _write_mach_info_files(self, gz_path, plain_path, snp_indices):
        """Write the info table for the given SNP rows to both file flavours.

        The table has a fixed header followed by one tab separated row per
        index: ``chrom:pos``, the two alleles, ``1 - maf``, ``maf``, two
        constant columns and six ``-`` placeholders.

        :param gz_path: path of the gzip-compressed info file.
        :param plain_path: path of the uncompressed info file.
        :param snp_indices: indices into ``self.chroms``, ``self.positions``,
            ``self.allele_1``, ``self.allele_2`` and ``self.mafs``.
        """
        lines = ["SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tdose2"]
        for idx in snp_indices:
            lines.append("\t".join([
                "%s:%d" % (self.chroms[idx],self.positions[idx]),
                self.allele_1[idx],
                self.allele_2[idx],
                str(1.0-self.mafs[idx]),
                str(self.mafs[idx]),
                '0.99912',
                '0.8',
                "\t".join(['-'] * 6)
            ]))
        text = "\n".join(lines) + "\n"

        # The whole table is built before any file is opened, so a failure
        # while formatting a row cannot leave a half-written file behind.
        with gzip.open(gz_path, 'wb') as gz_file, \
                open(plain_path, 'w') as plain_file:
            gz_file.write(text.encode("ascii"))
            plain_file.write(text)
Esempio n. 15
0
            if DataParser.ind_miss_tol != 1.0:
                print >> sys.stderr, "--mind does not have any impact on imputed data"
                sys.exit(1)
            if BoundaryCheck.chrom != "NA" and not args.mach_chrpos:
                libgwas.Exit(
                    ("Positional based filtering (--chr, --from/--to)" +
                     " only work with mach_chrpos. See manual for details."))
            mach_parser.Parser.chrpos_encoding = args.mach_chrpos
            mach_parser.Parser.info_ext = args.mach_info_ext
            mach_parser.Parser.dosage_ext = args.mach_dose_ext
            mach_parser.Parser.chunk_stride = args.mach_chunk_size
            mach_parser.Parser.min_rsquared = args.mach_min_rsquared
            archives, infos = self.ParseMachFile(args.mach.name,
                                                 args.mach_offset,
                                                 args.mach_count)
            dataset = mach_parser.Parser(archives, infos)
            dataset.load_family_details(pheno_covar)
            dataset.load_genotypes()

        else:
            parser.print_usage(sys.stderr)
            print >> sys.stderr, "\nNo data has been specified. Users must specify either pedigree or transposed pedigree to continue"
            sys.exit(1)

        if args.pheno or args.sample_pheno:
            mphenos = []
            if args.mphenos != "":
                mphenos = args.mphenos.split(",")

            nphenos = []
            if args.pheno_names != "":