def LoadReaders(vcffiles):
    """
    Validate the input VCF paths and open one reader per file.

    Every path must end in ".vcf.gz", exist on disk, and have a
    companion "<path>.tbi" file next to it. An empty list or any
    failed check is reported through common.ERROR.

    Input:
    - vcffiles (list of str): paths of the VCF files to open

    Output:
    - list of vcf.Reader objects, in the same order as vcffiles
    """
    if len(vcffiles) < 1:
        common.ERROR("No VCF files found")
    for path in vcffiles:
        has_bgzip_suffix = path.endswith(".vcf.gz")
        if not has_bgzip_suffix:
            common.ERROR("Make sure %s is bgzipped and indexed"%path)
        if not os.path.isfile(path):
            common.ERROR("Could not find VCF file %s"%path)
        index_path = path + ".tbi"
        if not os.path.isfile(index_path):
            common.ERROR("Could not find VCF index %s.tbi"%path)
    # Only open the files once every path has passed validation
    readers = []
    for path in vcffiles:
        readers.append(vcf.Reader(open(path, "rb")))
    return readers
def WriteMergedHeader(vcfw, args, readers, cmd):
    """
    Write the merged VCF header for the readers being merged.

    Also checks that merging is appropriate: every reader must report
    the same contigs as the first reader, otherwise the problem is
    reported through common.ERROR.

    Input:
    - vcfw (file-like): open handle the header lines are written to
    - args (argparse namespace): merge_ggl and update_sample_from_file are read
    - readers (list of vcf.Reader): readers for the VCFs being merged
    - cmd (str): command line of this run, recorded as a ##command line
    """
    # Check contigs the same for all readers
    contigs = readers[0].contigs
    for reader in readers[1:]:
        if reader.contigs != contigs:
            common.ERROR("Different contigs found across VCF files. Make sure all files used the same reference")
    # Write VCF format, commands, and contigs
    vcfw.write("##fileformat=VCFv4.1\n")
    for reader in readers:
        vcfw.write("##command="+reader.metadata["command"][0]+"\n")
    vcfw.write("##command="+cmd+"\n")
    for contig in contigs.values():
        vcfw.write("##contig=<ID=%s,length=%s>\n"%(contig.id, contig.length))
    # Write GangSTR specific INFO fields.
    # STUTTERUP was previously missing and STUTTERP was listed twice.
    info_fields = ["END", "PERIOD", "RU", "REF", "STUTTERUP", "STUTTERDOWN", "STUTTERP", "EXPTHRESH"]
    for field in info_fields:
        vcfw.write(GetInfoString(readers[0].infos[field])+"\n")
    if args.merge_ggl:
        vcfw.write(GetInfoString(readers[0].infos["GRID"])+"\n")
    # Write GangSTR specific FORMAT fields
    format_fields = ["GT", "DP", "Q", "REPCN", "REPCI", "RC", "ML", "INS", "STDERR", "QEXP"]
    for field in format_fields:
        vcfw.write(GetFormatString(readers[0].formats[field])+"\n")
    if args.merge_ggl:
        vcfw.write(GetFormatString(readers[0].formats["GGL"])+"\n")
    # Write sample list
    samples = GetSamples(readers, usefilenames=args.update_sample_from_file)
    header_fields = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    vcfw.write("#"+"\t".join(header_fields+samples)+"\n")
def LoadRegions(self, filename):
    """
    Load a BED file of regions into self.regions.

    If the loaded BedTool does not report itself as tabixed, a tabix
    index is created and self.regions is rebound to the BedTool that
    tabix() returns, so later lookups use the indexed file.

    Input:
    - filename (str): path of the regions file

    A missing file is reported through common.ERROR.
    """
    if not os.path.exists(filename):
        common.ERROR("%s not found" % filename)
    self.regions = BedTool(filename)
    if not self.regions._tabixed():
        sys.stderr.write("Creating tabix index for %s\n" % filename)
        # tabix() returns a BedTool for the compressed, indexed file;
        # keep it instead of discarding it.
        self.regions = self.regions.tabix(force=True)
def GetSamples(readers, usefilenames=False):
    """
    Collect the sample names of all readers, in reader order.

    Input:
    - readers (list of vcf.Reader): readers whose samples are merged
    - usefilenames (bool): if True, each sample is reported as
      "<filename without .vcf.gz>:<sample>" so identical sample names
      coming from different files stay distinct

    Output:
    - list of str: sample names

    Duplicate names in the result are reported through common.ERROR.
    """
    suffix = ".vcf.gz"
    samples = []
    for r in readers:
        if usefilenames:
            prefix = r.filename
            # Remove the suffix explicitly: str.strip(".vcf.gz") would drop
            # any of those characters from BOTH ends of the name.
            if prefix.endswith(suffix):
                prefix = prefix[:-len(suffix)]
            samples.extend(prefix + ":" + s for s in r.samples)
        else:
            samples.extend(r.samples)
    if len(set(samples)) != len(samples):
        common.ERROR("Duplicate samples found. Quitting")
    return samples
def ParseFam(args):
    """
    Read the fam file and split its samples into affected/unaffected.

    Lines are split on tabs; column 2 is used as the sample ID and
    column 6 as the phenotype. A phenotype of '2' marks the sample as
    affected, any other value as unaffected.

    Input:
    - args (namespace from parser.parse_args): fam,
      affec_min_call_count and unaff_min_call_count are read

    Output:
    - isAffected ({str: bool}): affected status per sample ID
    - min_affec (int): args.affec_min_call_count, or the number of
      affected lines in the fam file when that option is -1
    - min_unaff (int): args.unaff_min_call_count, or the number of
      unaffected lines in the fam file when that option is -1

    Short lines and minimum counts exceeding the available samples are
    reported through common.ERROR.
    """
    fam_path = args.fam
    min_affec = args.affec_min_call_count
    min_unaff = args.unaff_min_call_count
    isAffected = {}
    n_affected = 0
    n_unaffected = 0
    with open(fam_path, 'r') as fam:
        for lineno, line in enumerate(fam, 1):
            fields = line.strip().split('\t')
            if len(fields) < 6:
                common.ERROR("Insufficient number of columns in line " +
                             str(lineno) + " of fam file: " + fam_path)
            sample_id = fields[1]
            affected = fields[5] == '2'
            isAffected[sample_id] = affected
            # Counters are per line, so a repeated sample ID counts each time
            if affected:
                n_affected += 1
            else:
                n_unaffected += 1

    # -1 means "not set": default to requiring every sample of the group
    if min_affec == -1:
        min_affec = n_affected
    elif n_affected < min_affec:
        common.ERROR("Minimum number of affected calls (" + str(min_affec) +
                     ") larger than number of affected samples in fam file (" +
                     str(n_affected) + ")")
    if min_unaff == -1:
        min_unaff = n_unaffected
    elif n_unaffected < min_unaff:
        common.ERROR("Minimum number of unaffected calls (" + str(min_unaff) +
                     ") larger than number of unaffected samples in fam file (" +
                     str(n_unaffected) + ")")
    return isAffected, min_affec, min_unaff
def LoadCondition(vcffile, condition, sample_order):
    """
    Load the genotypes of the variant to condition on.

    Input:
    - vcffile (str): path of the VCF file to search
    - condition (str): position to condition on, formatted "chrom:start"
    - sample_order (list): passed through to LoadGT

    Output:
    - the value returned by LoadGT (called with is_str=False) for the
      first fetched record whose start equals the requested start

    If no such record is found this is reported through common.ERROR.
    """
    chrom, start = condition.split(":")
    target = int(start)
    region = "%s:%s-%s" % (chrom, start, target + 1)
    # Close the VCF handle once the lookup is finished
    with open(vcffile, "rb") as vcf_handle:
        reader2 = vcf.Reader(vcf_handle)
        reader2.fetch(region)
        for record in reader2:
            # Diagnostics go to stderr so they cannot end up in results
            # written to stdout.
            sys.stderr.write("%s %s %s\n" % (record.start, target, record.ID))
            if record.start == target:
                return LoadGT(record, sample_order, is_str=False)
    common.ERROR("Could not find SNP to condition on")
def GetRefAllele(current_records, mergelist):
    """
    Return the upper-cased reference allele shared by the selected records.

    Input:
    - current_records (list): records, indexed in parallel with mergelist
    - mergelist (list of bool): True for each record taking part in the merge

    Output:
    - str: REF of the first selected record, upper-cased

    If the selected records do not share exactly one upper-cased REF,
    this is reported through common.ERROR with the CHROM:POS of the
    last selected record.
    """
    chrom = ""
    pos = -1
    refs = []
    for idx, selected in enumerate(mergelist):
        if not selected:
            continue
        rec = current_records[idx]
        chrom = rec.CHROM
        pos = rec.POS
        refs.append(rec.REF.upper())
    distinct_refs = set(refs)
    if len(distinct_refs) != 1:
        common.ERROR("Conflicting refs found at %s:%s"%(chrom, pos))
    return refs[0]
def GetInfoItem(current_records, mergelist, info_field, fail=True):
    """
    Build the "<field>=<value>" INFO entry for the records being merged.

    Input:
    - current_records (list): records, indexed in parallel with mergelist
    - mergelist (list of bool): True for each record taking part in the merge
    - info_field (str): key looked up in each selected record's INFO
    - fail (bool): if True, disagreement between records is reported
      through common.ERROR

    Output:
    - str "<info_field>=<value>" when the selected records hold exactly
      one distinct value; otherwise a warning line is written to stderr
      and None is returned
    """
    vals = set()
    for idx, selected in enumerate(mergelist):
        if selected:
            vals.add(current_records[idx].INFO[info_field])
    if len(vals) == 1:
        return "%s=%s"%(info_field, vals.pop())
    if fail:
        common.ERROR("More than one value found for %s"%info_field)
    # Terminate the warning with a newline so consecutive warnings do not
    # run together on stderr.
    sys.stderr.write("WARNING more than one value found for %s\n"%info_field)
    return None
def main():
    """
    Command-line entry point: merge the VCFs given in --vcfs.

    Parses the arguments, opens a reader per input file, writes the
    merged header to "<out>.vcf" and then walks the readers in step,
    merging the current minimum records until DoneReading reports that
    all readers are finished.
    """
    parser = argparse.ArgumentParser(__doc__)
    ### Required arguments ###
    req_group = parser.add_argument_group("Required arguments")
    req_group.add_argument("--vcfs", help="Comma-separated list of VCF files to merge (must be sorted, bgzipped and indexed)", type=str, required=True)
    req_group.add_argument("--out", help="Prefix to name output files", type=str, required=True)
    ### Special merge options ###
    spec_group = parser.add_argument_group("Special merge options")
    spec_group.add_argument("--update-sample-from-file", help="Use file names, rather than sample header names, when merging", action="store_true")
    spec_group.add_argument("--merge-ggl", help="Merge GGL fields", action="store_true")
    ### Optional arguments ###
    opt_group = parser.add_argument_group("Optional arguments")
    opt_group.add_argument("--verbose", help="Print out extra info", action="store_true")
    opt_group.add_argument("--quiet", help="Don't print out anything", action="store_true")
    ### Parse args ###
    args = parser.parse_args()
    if args.merge_ggl:
        common.ERROR("--merge-ggl not implemented yet") # TODO remove

    ### Load readers ###
    vcfreaders = LoadReaders(args.vcfs.split(","))
    contigs = vcfreaders[0].contigs
    chroms = list(contigs)

    ### Set up VCF writer ###
    # The context manager guarantees the merged VCF is flushed and closed,
    # even if merging stops early.
    with open(args.out + ".vcf", "w") as vcfw:
        WriteMergedHeader(vcfw, args, vcfreaders, " ".join(sys.argv))

        ### Walk through sorted readers, merging records as we go ###
        current_records = [next(reader) for reader in vcfreaders]
        is_min = GetMinRecords(current_records, chroms, debug=args.verbose)
        done = DoneReading(current_records)
        while not done:
            if args.verbose:
                PrintCurrentRecords(current_records, is_min)
            CheckMin(is_min)
            MergeRecords(vcfreaders, current_records, is_min, vcfw, args)
            current_records = GetNextRecords(vcfreaders, current_records, is_min)
            is_min = GetMinRecords(current_records, chroms)
            done = DoneReading(current_records)
def CheckMin(is_min):
    """
    Check that at least one entry of is_min is set.

    Input:
    - is_min (list): per-reader flags; they are summed

    When the flags sum to zero this is reported through common.ERROR.
    """
    n_selected = sum(is_min)
    if n_selected != 0:
        return
    common.ERROR("Unexpected error. Stuck in infinite loop and exiting.")
def _CheckExpansionProbRange(args, group, kind):
    """Validate the min/max expansion-probability filters of one sample group and genotype kind."""
    max_val = getattr(args, "%s_max_expansion_prob_%s" % (group, kind))
    min_val = getattr(args, "%s_min_expansion_prob_%s" % (group, kind))
    max_flag = "--%s-max-expansion-prob-%s" % (group, kind)
    min_flag = "--%s-min-expansion-prob-%s" % (group, kind)
    if max_val is not None:
        if max_val < 0 or max_val > 1:
            common.ERROR("%s must be between 0 and 1" % max_flag)
    if min_val is not None:
        if min_val < 0 or min_val > 1:
            common.ERROR("%s must be between 0 and 1" % min_flag)
    if min_val is not None and max_val is not None:
        # The bounds are inconsistent only when the minimum exceeds the maximum
        if min_val > max_val:
            common.ERROR("%s must be less than %s" % (min_flag, max_flag))


def CheckFilters(args):
    """
    Perform checks on user input for filters

    Input:
    - args (argparse namespace)

    For each of the het, hom and total expansion-probability filters,
    and for both the affec and unaff groups, the min and max values
    (when given) must lie between 0 and 1 and the min must not exceed
    the max. The minimum call counts must be -1 or at least 0.

    Failed checks are reported through common.ERROR.
    """
    # Same order as the original checks: kind first, then sample group
    for kind in ("het", "hom", "total"):
        for group in ("affec", "unaff"):
            _CheckExpansionProbRange(args, group, kind)

    if args.affec_min_call_count != -1 and args.affec_min_call_count < 0:
        common.ERROR("Minimum number of affected calls (" + str(args.affec_min_call_count) +
                     ") must be 0 or more")
    if args.unaff_min_call_count != -1 and args.unaff_min_call_count < 0:
        common.ERROR("Minimum number of unaffected calls (" + str(args.unaff_min_call_count) +
                     ") must be 0 or more")
def main():
    """
    Command-line entry point: run association tests on a VCF.

    Steps, in order: parse arguments; load phenotypes from --fam or
    --pheno; add covariates; restrict the sample set; keep only samples
    present in the VCF (matched as FID_IID); optionally load a variant
    to condition on; then, for every record passing the MAF and SNP/STR
    filters, run the regression and write the result to --out
    ("stdout" writes to standard output).
    """
    parser = argparse.ArgumentParser(__doc__)
    inout_group = parser.add_argument_group("Input/output")
    inout_group.add_argument("--vcf", help="Input VCF file", type=str)
    inout_group.add_argument("--out", help="Output prefix", type=str)
    inout_group.add_argument("--fam", help="FAM file with phenotype info", type=str)
    inout_group.add_argument("--samples", help="File with list of samples to include", type=str)
    inout_group.add_argument("--exclude-samples", help="File with list of samples to exclude", type=str)
    pheno_group = parser.add_argument_group("Phenotypes")
    pheno_group.add_argument("--pheno", help="Phenotypes file (to use instead of --fam)", type=str)
    pheno_group.add_argument("--mpheno", help="Use (n+2)th column from --pheno", type=int, default=1)
    pheno_group.add_argument("--missing-phenotype", help="Missing phenotype code", type=str, default="-9")
    covar_group = parser.add_argument_group("Covariates")
    covar_group.add_argument("--covar", help="Covariates file", type=str)
    covar_group.add_argument("--covar-name", help="Names of covariates to load. Comma-separated", type=str)
    covar_group.add_argument("--covar-number", help="Column number of covariates to load. Comma-separated", type=str)
    covar_group.add_argument("--sex", help="Include sex from fam file as covariate", action="store_true")
    covar_group.add_argument("--cohort-pgc", help="Use cohort from PGC FIDs as a covariate", action="store_true")
    assoc_group = parser.add_argument_group("Association testing")
    assoc_group.add_argument("--linear", help="Perform linear regression", action="store_true")
    assoc_group.add_argument("--logistic", help="Perform logistic regression", action="store_true")
    assoc_group.add_argument("--region", help="Only process this region (chrom:start-end)", type=str)
    assoc_group.add_argument("--infer-snpstr", help="Infer which positions are SNPs vs. STRs", action="store_true")
    assoc_group.add_argument("--allele-tests", help="Also perform allele-based tests using each separate allele", action="store_true")
    assoc_group.add_argument("--allele-tests-length", help="Also perform allele-based tests using allele length", action="store_true")
    assoc_group.add_argument("--minmaf", help="Ignore bi-allelic sites with low MAF", type=float, default=0.01)
    assoc_group.add_argument("--str-only", help="Used with --infer-snpstr, only analyze STRs", action="store_true")
    assoc_group.add_argument("--remove-rare-str-alleles", help="Remove genotypes with alleles less than this freq", default=0.0, type=float)
    assoc_group.add_argument("--max-iter", help="Maximum number of iterations for logistic regression", default=100, type=int)
    fm_group = parser.add_argument_group("Fine mapping")
    fm_group.add_argument("--condition", help="Condition on this position chrom:start", type=str)
    args = parser.parse_args()

    # Some initial checks
    if int(args.linear) + int(args.logistic) != 1:
        common.ERROR("Must choose one of --linear or --logistic")

    # Load phenotype information
    common.MSG("Loading phenotype information...")
    if args.fam is not None:
        pdata = LoadPhenoData(args.fam, fam=True, missing=args.missing_phenotype, sex=args.sex)
    elif args.pheno is not None:
        if args.sex:
            common.ERROR("--sex only works when using --fam (not --pheno)")
        pdata = LoadPhenoData(args.pheno, fam=False, missing=args.missing_phenotype, mpheno=args.mpheno)
    else:
        common.ERROR("Must specify phenotype using either --fam or --pheno")
    common.MSG("Loaded %s samples..." % pdata.shape[0])

    # Load covariate information
    common.MSG("Loading covariate information...")
    covarcols = []
    if args.covar is not None:
        pdata, covarcols = AddCovars(pdata, args.covar, args.covar_name, args.covar_number)
    if args.sex:
        covarcols.append("sex")
    if args.cohort_pgc:
        pdata["cohort"] = pdata["FID"].apply(lambda x: x.split("*")[0])
        covarcols.append("cohort")
    common.MSG("Loaded %s samples..." % pdata.shape[0])

    # Include/exclude samples
    common.MSG("Loading sample information...")
    if args.samples is not None:
        pdata = RestrictSamples(pdata, args.samples, include=True)
    if args.exclude_samples is not None:
        pdata = RestrictSamples(pdata, args.exclude_samples, include=False)
    common.MSG("Left with %s samples..." % pdata.shape[0])

    # Setup VCF reader
    common.MSG("Set up VCF reader")
    reader = vcf.Reader(open(args.vcf, "rb"))

    # Set sample ID to FID_IID to match vcf
    common.MSG("Set up sample info")
    pdata["sample"] = pdata.apply(lambda x: x["FID"] + "_" + x["IID"], 1)
    reader_samples = set(reader.samples)
    pdata = pdata[pdata["sample"].apply(lambda x: x in reader_samples)]
    sample_order = list(pdata["sample"])
    pdata = pdata[["phenotype", "sample"] + covarcols]
    common.MSG("Left with %s samples..." % pdata.shape[0])

    # Get data to condition on
    if args.condition is not None:
        cond_gt = LoadCondition(args.vcf, args.condition, sample_order)
        pdata["condition"] = cond_gt[0]
        covarcols.append("condition")

    # Prepare output file
    if args.out == "stdout":
        outf = sys.stdout
    else:
        outf = open(args.out, "w")

    try:
        PrintHeader(outf, case_control=args.logistic, quant=args.linear, comment_lines=[" ".join(sys.argv)])

        # Perform association test for each record
        common.MSG("Perform associations... with covars %s" % str(covarcols))
        if args.region:
            reader = reader.fetch(args.region)
        for record in reader:
            # Check MAF
            aaf = sum(record.aaf)
            aaf = min([aaf, 1 - aaf])
            if aaf < args.minmaf:
                continue
            # Infer whether we should treat as a SNP or STR
            is_str = True  # by default, assume all data is STRs
            if args.infer_snpstr:
                if len(record.REF) == 1 and len(record.ALT) == 1 and len(record.ALT[0]) == 1:
                    is_str = False
                if is_str and len(record.REF) < MIN_STR_LENGTH:
                    continue  # probably an indel
            if not is_str and args.str_only:
                continue
            # Extract genotypes in sample order, perform regression, and output
            common.MSG(" Load genotypes...")
            gts, exclude_samples = LoadGT(record, sample_order, is_str=is_str, rmrare=args.remove_rare_str_alleles)
            pdata["GT"] = gts
            if is_str:
                minmaf = 1
            else:
                minmaf = args.minmaf
            common.MSG(" Perform association...")
            assoc = PerformAssociation(pdata, covarcols, case_control=args.logistic, quant=args.linear,
                                       minmaf=minmaf, exclude_samples=exclude_samples, maxiter=args.max_iter)
            common.MSG(" Output association...")
            OutputAssoc(record.CHROM, record.POS, assoc, outf, assoc_type=GetAssocType(is_str, name=record.ID))
            # Allele based tests
            common.MSG(" Allele based tests...")
            if is_str and args.allele_tests:
                alleles = [record.REF] + record.ALT
                for i in range(len(record.ALT) + 1):
                    gts, exclude_samples = LoadGT(record, sample_order, is_str=True, use_alt_num=i)
                    pdata["GT"] = gts
                    assoc = PerformAssociation(pdata, covarcols, case_control=args.logistic, quant=args.linear,
                                               exclude_samples=exclude_samples, maxiter=args.max_iter)
                    OutputAssoc(record.CHROM, record.POS, assoc, outf,
                                assoc_type=GetAssocType(is_str, alt=alleles[i]))
            if is_str and args.allele_tests_length:
                for length in set([len(record.REF)] + [len(alt) for alt in record.ALT]):
                    gts, exclude_samples = LoadGT(record, sample_order, is_str=True, use_alt_length=length)
                    pdata["GT"] = gts
                    assoc = PerformAssociation(pdata, covarcols, case_control=args.logistic, quant=args.linear,
                                               exclude_samples=exclude_samples, maxiter=args.max_iter)
                    OutputAssoc(record.CHROM, record.POS, assoc, outf,
                                assoc_type=GetAssocType(is_str, alt_len=length))
    finally:
        # Flush and close a real output file; never close the shared stdout
        if outf is not sys.stdout:
            outf.close()