def testBoundaryInitBPWithInclusions(self):
    # A base-pair window on chromosome 1 combined with two SNPs loaded via
    # LoadSNPs: the named SNPs are expected to pass even though their
    # positions lie outside the window.
    BoundaryCheck.chrom = 1
    checker = BoundaryCheck(bp=[10000, 500000])
    checker.LoadSNPs(["rs12345", "rs23456"])

    self.assertFalse(checker.NoExclusions())
    self.assertTrue(checker.valid)

    # (expected result, chromosome, position) for anonymous loci.
    positional_cases = (
        (False, 1, 500),
        (True, 1, 10000),
        (True, 1, 500000),
        (True, 1, 250000),
        (False, 2, 250000),
        (False, 10, 10000),
    )
    for expected, chrom, pos in positional_cases:
        self.assertEqual(expected, checker.TestBoundary(chrom, pos, ""))

    # Loci identified by rsid: only the two loaded names are accepted.
    self.assertTrue(checker.TestBoundary(1, 1000000, "rs12345"))
    self.assertTrue(checker.TestBoundary(1, 1200000, "rs23456"))
    self.assertFalse(checker.TestBoundary(1, 1200011, "rs345678"))
def LoadCmdLine(self, args=sys.argv[1:]):
    """Parse user arguments using argparse and set up components.

    Builds the command-line parser, pushes the SNP / individual filtering
    options onto the shared ``BoundaryCheck``, ``DataParser`` and
    ``PhenoCovar`` class attributes, constructs the dataset parser that
    matches the supplied input format, loads it, and finally loads any
    phenotype / covariate files. ``self.verbose`` is set from ``--verbose``.

    :param args: list of command-line tokens; the default is
        ``sys.argv[1:]`` as evaluated when this function was defined
    :return: tuple ``(dataset, pheno_covar)``

    ``-v`` and ``--vall`` print version details to stderr and call
    ``sys.exit(0)``. Invalid or incomplete option combinations are either
    reported on stderr followed by ``sys.exit(1)`` or handed to
    ``libgwas.Exit`` / ``libgwas.ExitIf``.
    """
    # NOTE: argparse's default formatter re-wraps the epilog, so the
    # whitespace inside this literal is not significant.
    parser = argparse.ArgumentParser(
        description="MV Test: " + __version__,
        epilog="""
mvtest.py is uses many of the same arguments as plink, but there are a few
differences, so please consider the list above carefully.
        """)

    parser.add_argument("-v", action='store_true',
                        help="Print version number")
    parser.add_argument(
        "--vall", action='store_true',
        help="Print version number along with each dependency")

    # --- SNP / locus selection -------------------------------------------
    parser.add_argument("--chr", type=int, default=-1, metavar="N",
                        help="Select Chromosome")
    parser.add_argument(
        "--snps", type=str, default="",
        help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6")
    parser.add_argument("--from-bp", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-bp", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-kb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-kb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-mb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-mb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument(
        "--exclude", type=str, default="",
        help="Comma-delimited list of rsids to be excluded")

    # For now, I'm not implementing keep, since we don't have any real
    # meaningful need for analyzing individuals. PLINK does, but we don't
    # do the QC stuff they do.
    parser.add_argument(
        "--keep", type=str, default="",
        help="Comma-delimited list of individuals to be analyzed")
    parser.add_argument(
        "--remove", type=str, default="",
        help="Comma-delimited list of individuals to be removed from analysis")

    # --- Standard pedigree input -----------------------------------------
    parser.add_argument("--file", type=str,
                        help="Prefix for .ped and .map files")
    parser.add_argument("--ped", type=argparse.FileType('r'),
                        help="PLINK compatible .ped file")
    parser.add_argument("--map", type=argparse.FileType('r'),
                        help="PLINK compatible .map file")
    parser.add_argument("--map3", action='store_true',
                        help="MAP file has only 3 columns")
    parser.add_argument("--no-sex", action='store_true',
                        help="Pedigree file doesn't have column 5 (sex)")
    parser.add_argument(
        "--no-parents", action="store_true",
        help="Pedigree file doesn't have columns 3 and 4 (parents)")
    parser.add_argument(
        "--no-fid", action="store_true",
        help="Pedigree file doesn't have column 1 (family ID)")
    parser.add_argument(
        "--no-pheno", action="store_true",
        help="Pedigree file doesn't have column 6 (phenotype")
    parser.add_argument("--liability", action="store_true",
                        help="Pedigree file has column 7 (liability)")

    # --- Binary pedigree input -------------------------------------------
    parser.add_argument("--bfile", type=str,
                        help="Prefix for .bed, .bim and .fam files")
    parser.add_argument("--bed", type=argparse.FileType('r'),
                        help="Binary Ped file (.bed)")
    parser.add_argument("--bim", type=argparse.FileType('r'),
                        help="Binary ped marker file (.bim)")
    parser.add_argument("--fam", type=argparse.FileType('r'),
                        help="Binary ped family file (.fam)")

    # --- Transposed pedigree input ---------------------------------------
    parser.add_argument("--tfile", type=str,
                        help="Prefix for .tped and .tfam files")
    parser.add_argument("--tped", type=argparse.FileType('r'),
                        help="Transposed Pedigree file (.tped)")
    parser.add_argument("--tfam", type=argparse.FileType('r'),
                        help="Transposed pedigre Family file (.tfam)")
    parser.add_argument(
        "--compressed", action="store_true",
        help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)")

    # --- IMPUTE input -----------------------------------------------------
    parser.add_argument(
        "--impute", type=argparse.FileType('r'),
        help="File containing list of impute output for analysis")
    parser.add_argument(
        "--impute-fam", type=argparse.FileType('r'),
        help="File containing family details for impute data")
    parser.add_argument(
        "--impute-offset", type=int, default=-1,
        help="Impute file index (1 based) to begin analysis")
    parser.add_argument(
        "--impute-count", type=int, default=-1,
        help="Number of impute files to process (for this node)")
    parser.add_argument(
        "--impute-uncompressed", action="store_true",
        help="Indicate that the impute input is not gzipped, but plain text")
    parser.add_argument(
        "--impute-encoding", type=str,
        choices=['additive', 'dominant', 'recessive', 'genotype'],
        default='additive', help='Genetic model to be used')
    parser.add_argument("--impute-info-ext", type=str, default='info',
                        help="Portion of filename denotes info filename")
    parser.add_argument("--impute-gen-ext", type=str, default='gen.gz',
                        help="Portion of filename that denotes gen file")
    parser.add_argument(
        "--impute-info-thresh", type=float, default=0.4,
        help="Threshold for filtering imputed SNPs with poor 'info' values")

    # --- MACH input -------------------------------------------------------
    parser.add_argument(
        "--mach", type=argparse.FileType('r'),
        help="File containing list of MACH output for analysis")
    parser.add_argument("--mach-offset", type=int, default=-1,
                        help="Mach file index (1 based) to begin analysis")
    parser.add_argument(
        "--mach-count", type=int, default=-1,
        help="Number of mach files to process (for this node)")
    parser.add_argument("--mach-uncompressed", action="store_true",
                        help="Indicate that the mach input is not gzipped")
    parser.add_argument(
        "--mach-chunk-size", type=int, default=100000,
        help="Max number of loci to load at once (higher increases memory requirements with some speed benefits)")
    parser.add_argument("--mach-info-ext", type=str, default="info.gz",
                        help="Portion of filename denotes info filenames")
    parser.add_argument("--mach-dose-ext", type=str, default="dose.gz",
                        help="Portion of filename that denotes dose files")
    parser.add_argument("--mach-min-rsquared", type=float, default=0.3,
                        help="Filter out loci with RSquared < this value")
    parser.add_argument(
        "--mach-chrpos", action="store_true",
        help="When true, first col in .info file must be chr:pos (additional pieces allowed)")

    # --- Phenotypes and covariates ---------------------------------------
    parser.add_argument("--pheno", type=argparse.FileType('r'),
                        help="File containing phenotypes")
    parser.add_argument("--sample-pheno", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing phenotypes")
    parser.add_argument(
        "--mphenos", type=str, default="",
        help="Column number(s) for phenotype to be analyzed if number of columns > 1")
    parser.add_argument(
        "--pheno-names", type=str, default="",
        help="Name for phenotype(s) to be analyzed (must be in --pheno file)")
    parser.add_argument("--all-pheno", action="store_true",
                        help="Analyze all columns from the phenotype file")
    parser.add_argument("--covar", type=argparse.FileType('r'),
                        help="File containing covariates")
    parser.add_argument("--sample-covar", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing covariates")
    parser.add_argument("--covar-numbers", type=str, default="",
                        help="Comma-separated list of covariate indices")
    parser.add_argument("--covar-names", type=str, default="",
                        help="Comma-separated list of covariate names")
    parser.add_argument(
        "--sex", action='store_true',
        help="Use sex from the pedigree file as a covariate")
    parser.add_argument("--missing-phenotype", type=float, default=-9.0,
                        help="Encoding for missing phenotypes")

    # --- Quality filters --------------------------------------------------
    parser.add_argument("--maf", type=float, default=0.0,
                        help="Minimum MAF allowed for analysis")
    parser.add_argument("--max-maf", type=float, default=1.0,
                        help="MAX MAF allowed for analysis")
    parser.add_argument("--geno", type=float, default=1.0,
                        help="MAX per-SNP missing for analysis")
    parser.add_argument("--mind", type=float, default=1.0,
                        help="MAX per-person missing")
    parser.add_argument("--verbose", action='store_true',
                        help="Output additional data details")

    parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False)
    args = parser.parse_args(args)

    # Report version, if requested, and exit
    if args.v:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        sys.exit(0)

    if args.vall:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        print("%s: %s" % (os.path.dirname(libgwas.__file__),
                          libgwas.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(scipy.__file__),
                          scipy.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(numpy.__file__),
                          numpy.__version__), file=sys.stderr)
        sys.exit(0)

    ###########################################################################
    # Here we deal with the various ways we filter SNPs in and out of analysis
    # We might handle MACH files differently. We'll default the chromosome
    # to be "NA" which is how those can be returned.
    chrpos_required_msg = (
        "Positional based filtering (--chr, --from/--to)" +
        " only work with mach_chrpos. See manual for details.")

    if args.mach is None or args.mach_chrpos:
        BoundaryCheck.chrom = args.chr
    else:
        if args.chr != -1:
            libgwas.Exit(chrpos_required_msg)
        BoundaryCheck.chrom = "NA"

    snps = args.snps.split(",")
    try:
        b = BoundaryCheck(bp=(args.from_bp, args.to_bp),
                          kb=(args.from_kb, args.to_kb),
                          mb=(args.from_mb, args.to_mb))
    except InvalidBoundarySpec as e:
        print("Invalid boundary spec associated: %s" % (e.malformed_boundary),
              file=sys.stderr)
        sys.exit(1)

    try:
        s = SnpBoundaryCheck(snps=snps)
    except InvalidBoundarySpec as e:
        print("Invalid SNP boundary defined: %s" % (e.malformed_boundary),
              file=sys.stderr)
        print(
            "SNPs must be either single or have be a range such as rs123-rs345",
            file=sys.stderr)
        sys.exit(1)

    if b.valid and s.valid:
        print(
            "Only one type of boundary conditions is permitted. Either use --from-bp, etc. or rs123-rs345. ",
            file=sys.stderr)
        sys.exit(1)

    if len(b.bounds) > 0 and not b.valid:
        if BoundaryCheck.chrom == "NA":
            libgwas.Exit(chrpos_required_msg)

    if s.valid:
        DataParser.boundary = s
    else:
        # If b isn't valid, we still want to potentially allow for chr and
        # SNPs, it just won't have any actual boundary listings
        b.LoadSNPs(snps)
        DataParser.boundary = b
    DataParser.boundary.LoadExclusions(snps=args.exclude.split(","))

    ###########################################################################
    # Setup the various Dataset filter criteria
    DataParser.min_maf = args.maf
    DataParser.max_maf = args.max_maf
    DataParser.snp_miss_tol = args.geno
    DataParser.ind_miss_tol = args.mind

    DataParser.ind_exclusions = ParseIndList(args.remove)

    PhenoCovar.sex_as_covariate = args.sex

    if args.compressed:
        DataParser.compressed_pedigree = True

    DataParser.has_sex = not args.no_sex
    DataParser.has_parents = not args.no_parents
    DataParser.has_fid = not args.no_fid
    DataParser.has_pheno = not args.no_pheno
    DataParser.has_liability = args.liability

    pheno_covar = PhenoCovar()
    self.verbose = bool(args.verbose)

    # ---------------------------------------------------------------------
    # Pick the dataset parser from whichever input family was supplied.
    if args.file is not None or args.ped or args.map:
        if args.ped and not args.map or args.map and not args.ped:
            print(
                "When analyzing pedigree data, both .map and .ped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.ped:
            dataset = pedigree_parser.Parser(args.map.name, args.ped.name)
        else:
            dataset = pedigree_parser.Parser("%s.map" % (args.file),
                                             "%s.ped" % (args.file))
        dataset.load_mapfile(map3=args.map3)
        dataset.load_genotypes(pheno_covar)
    elif args.tfile is not None or args.tped or args.tfam:
        if args.tped and not args.tfam or args.tfam and not args.tped:
            print(
                "When analyzing transposed pedigree data, both .tfam and .tped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.tped:
            dataset = transposed_pedigree_parser.Parser(
                args.tfam.name, args.tped.name)
        else:
            dataset = transposed_pedigree_parser.Parser(
                "%s.tfam" % (args.tfile), "%s.tped" % (args.tfile))
        dataset.load_tfam(pheno_covar)
        dataset.load_genotypes()
    elif args.bfile is not None:
        dataset = bed_parser.Parser("%s.fam" % (args.bfile),
                                    "%s.bim" % (args.bfile),
                                    "%s.bed" % (args.bfile))
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.bed or args.bim or args.fam:
        # All three files are required once any one of them is given.
        if not (args.bed and args.bim and args.fam):
            print(
                "When analyzing binary pedigree data, .bed, .bim and .fam files must be provided",
                file=sys.stderr)
            sys.exit(1)
        # Pass file names (as the --bfile branch does), not the open handles
        # that argparse.FileType produced.
        dataset = bed_parser.Parser(args.fam.name, args.bim.name,
                                    args.bed.name)
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.impute:
        DataParser.compressed_pedigree = not args.impute_uncompressed
        if (args.impute_offset > 0 and args.impute_count == -1) or (
                args.impute_offset == -1 and args.impute_count > 0):
            print(
                "--impute-count and --impute_offset must both > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        impute_parser.SetEncoding(args.impute_encoding)
        impute_parser.Parser.info_ext = args.impute_info_ext
        impute_parser.Parser.info_threshold = args.impute_info_thresh
        libgwas.ExitIf(
            "--impute-fam is required for when processing imputed data",
            args.impute_fam is None)
        archives, chroms, infos = self.ParseImputeFile(
            args.impute.name, args.impute_offset, args.impute_count)
        dataset = impute_parser.Parser(args.impute_fam.name, archives,
                                       chroms, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    elif args.mach:
        DataParser.compressed_pedigree = not args.mach_uncompressed
        # Offset and count must be supplied together; this checks the MACH
        # count (the IMPUTE count is unrelated to this branch).
        if (args.mach_offset > 0 and args.mach_count == -1) or (
                args.mach_offset == -1 and args.mach_count > 0):
            print(
                "--mach-count and --mach_offset must both be > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if BoundaryCheck.chrom != "NA" and not args.mach_chrpos:
            libgwas.Exit(chrpos_required_msg)
        mach_parser.Parser.chrpos_encoding = args.mach_chrpos
        mach_parser.Parser.info_ext = args.mach_info_ext
        mach_parser.Parser.dosage_ext = args.mach_dose_ext
        mach_parser.Parser.chunk_stride = args.mach_chunk_size
        mach_parser.Parser.min_rsquared = args.mach_min_rsquared
        archives, infos = self.ParseMachFile(args.mach.name,
                                             args.mach_offset,
                                             args.mach_count)
        dataset = mach_parser.Parser(archives, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    else:
        parser.print_usage(sys.stderr)
        print(
            "\nNo data has been specified. Users must specify either pedigree or transposed pedigree to continue",
            file=sys.stderr)
        sys.exit(1)

    # ---------------------------------------------------------------------
    # Optional phenotype / covariate files.
    if args.pheno or args.sample_pheno:
        mphenos = []
        if args.mphenos != "":
            mphenos = args.mphenos.split(",")

        nphenos = []
        if args.pheno_names != "":
            nphenos = args.pheno_names.split(",")

        if len(mphenos) + len(nphenos) == 0 and not args.all_pheno:
            libgwas.Exit("You must select one or more phenotypes when ")

        # --sample-pheno takes precedence over --pheno when both are given.
        sample_file = False
        pheno_filename = args.pheno
        if args.sample_pheno:
            pheno_filename = args.sample_pheno
            sample_file = True
        pheno_covar.load_phenofile(pheno_filename, mphenos, nphenos,
                                   sample_file)

    # NOTE(review): --sample-covar is parsed above but never consulted
    # here; confirm whether that is intentional.
    if args.covar:
        pheno_covar.load_covarfile(args.covar,
                                   args.covar_numbers.split(","),
                                   args.covar_names.split(","))
    pheno_covar.do_standardize_variables = True
    return dataset, pheno_covar