def main(args):
    if not os.path.exists(args.vcf):
        common.WARNING("%s does not exist" % args.vcf)
        return 1
    # Set up reader and harmonizer
    invcf = vcf.Reader(filename=args.vcf)
    if args.vcftype != 'auto':
        vcftype = trh.VCFTYPES[args.vcftype]
    else:
        vcftype = trh.InferVCFType(invcf)
    # Load samples
    if args.samples:
        samplelist = [item.strip() for item in open(args.samples, "r").readlines()]
    else:
        samplelist = invcf.samples
    # Set up data to keep track of
    sample_calls = dict([(sample, 0) for sample in samplelist])  # sample->numcalls
    contigs = invcf.contigs
    if len(contigs) == 0:
        common.MSG("Warning: no contigs found in VCF file.")
    chrom_calls = dict([(chrom, 0) for chrom in contigs])  # chrom->numcalls
    diffs_from_ref = []       # for each allele call, keep track of diff (bp) from ref
    diffs_from_ref_unit = []  # for each allele call, keep track of diff (units) from ref
    reflens = []              # for each allele call, keep track of reference length (bp)
    numrecords = 0
    for record in invcf:
        if args.numrecords is not None and numrecords >= args.numrecords:
            break
        chrom = record.CHROM
        trrecord = trh.HarmonizeRecord(vcftype, record)
        if args.period is not None and len(trrecord.motif) != args.period:
            continue
        # Extract stats
        rl = len(trrecord.ref_allele)
        allele_counts = trrecord.GetAlleleCounts(uselength=False, samplelist=samplelist)
        called_samples = [item.sample for item in record if item.called]
        # Update data
        num_calls = 0
        for s in called_samples:
            try:
                sample_calls[s] += 1
                num_calls += 1
            except KeyError:
                pass
        chrom_calls[chrom] = chrom_calls.get(chrom, 0) + num_calls
        for allele in allele_counts.keys():
            allelediff = len(allele) - rl
            count = allele_counts[allele]
            reflens.extend([rl] * count)
            diffs_from_ref.extend([allelediff] * count)
            diffs_from_ref_unit.extend([allelediff / len(trrecord.motif)] * count)
        numrecords += 1
    OutputDiffRefHistogram(diffs_from_ref_unit, args.out + "-diffref-histogram.pdf")
    OutputDiffRefBias(diffs_from_ref, reflens, args.out + "-diffref-bias.pdf")
    OutputSampleCallrate(sample_calls, args.out + "-sample-callnum.pdf")
    OutputChromCallrate(chrom_calls, args.out + "-chrom-callnum.pdf")
    return 0
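# --- Hedged usage sketch (not part of the original file) ---
# A minimal way to drive this QC entry point programmatically. The
# attribute names mirror the args this main() actually reads; all
# values below are placeholders.
def _example_run_qc():
    from argparse import Namespace
    args = Namespace(
        vcf="example.vcf",   # placeholder input path
        vcftype="auto",      # let trh.InferVCFType detect the genotyper
        samples=None,        # optional file listing samples to keep
        out="example",       # prefix for the four output PDFs
        period=None,         # optional motif-length filter
        numrecords=None,     # optional cap on records processed
    )
    return main(args)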
def main(args):
    # Load VCF file
    if not os.path.exists(args.vcf):
        common.WARNING("%s does not exist" % args.vcf)
        return 1
    invcf = vcf.Reader(filename=args.vcf)
    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf)
    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype):
        return 1
    # Set up locus-level filter list
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)
    # Set up call-level filters
    call_filters = BuildCallFilters(args)
    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String", "Call-level filter")
    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC", -1, "Integer", "Alternate allele counts", source=None, version=None)
    invcf.infos["REFAC"] = _Info("REFAC", 1, "Integer", "Reference allele count", source=None, version=None)
    invcf.infos["HET"] = _Info("HET", 1, "Float", "Heterozygosity", source=None, version=None)
    invcf.infos["HWEP"] = _Info("HWEP", 1, "Float", "HWE p-value for obs. vs. exp. het rate", source=None, version=None)
    invcf.infos["HRUN"] = _Info("HRUN", 1, "Integer", "Length of longest homopolymer run", source=None, version=None)
    # Set up output files
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Output directory does not exist")
        return 1
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None:
        return 1
    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0
    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0
    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING("Skipping TR that couldn't be parsed by PyVCF. Check VCF format")
            if args.die_on_warning:
                return 1
            continue  # skip to the next record; otherwise `record` is stale or unbound
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break
        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)
        # Locus-level filters
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            if filt(record) is None:
                continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered:
            if record.call_rate == 0:
                output_record = False
        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(allele_freqs, genotype_counts)
                # allele count = frequency * 2N for diploid samples, matching REFAC below
                record.INFO["AC"] = [int(item * (2 * record.num_called)) for item in record.aaf]
                record.INFO["REFAC"] = int((1 - sum(record.aaf)) * (2 * record.num_called))
            else:
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)
    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")
    return 0
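# --- Hedged interface sketch (not part of the original file) ---
# BuildLocusFilters returns the real filter objects; this hypothetical
# class only illustrates the interface the loop above relies on:
# filter_name() names the filter, the first line of __doc__ becomes the
# VCF header description, and calling the filter on a record returns
# None when the locus passes.
class MaxRefLengthFilter:
    """Filter loci whose reference allele exceeds a maximum length."""

    def __init__(self, max_len):
        self.max_len = max_len

    def filter_name(self):
        return "MAXREFLEN%d" % self.max_len

    def __call__(self, record):
        if len(record.REF) <= self.max_len:
            return None           # None means the locus passes
        return len(record.REF)    # any non-None value marks it filtered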
def main(args):
    ### Check and load VCF files ###
    vcfreaders = mergeutils.LoadReaders([args.vcf1, args.vcf2], region=args.region)
    contigs = vcfreaders[0].contigs
    chroms = list(contigs)
    ### Load shared samples ###
    samples = mergeutils.GetSharedSamples(vcfreaders)
    if len(samples) == 0:
        common.WARNING("No shared samples found between vcf readers")
        return 1
    if args.samples:
        usesamples = set([item.strip() for item in open(args.samples, "r").readlines()])
        samples = list(set(samples).intersection(usesamples))
    if len(samples) == 0:
        common.WARNING("No shared samples found between files")
        return 1
    ### Determine FORMAT fields we should look for ###
    if args.stratify_file is not None and args.stratify_file not in [0, 1, 2]:
        common.MSG("--stratify-file must be 0, 1, or 2")
        return 1
    format_fields, format_binsizes = GetFormatFields(args.stratify_fields, args.stratify_binsizes,
                                                     args.stratify_file, vcfreaders)
    ### Keep track of data to summarize at the end ###
    results_dir = {
        "chrom": [],
        "start": [],
        "period": [],
        "sample": [],
        "gtstring1": [],
        "gtstring2": [],
        "gtsum1": [],
        "gtsum2": [],
        "metric-conc-seq": [],
        "metric-conc-len": [],
    }
    for ff in format_fields:
        results_dir[ff + "1"] = []
        results_dir[ff + "2"] = []
    vcftype1 = trh.VCFTYPES[args.vcftype1]
    vcftype2 = trh.VCFTYPES[args.vcftype2]
    ### Walk through sorted readers, merging records as we go ###
    current_records = [next(reader) for reader in vcfreaders]
    is_min = mergeutils.GetMinRecords(current_records, chroms)
    done = mergeutils.DoneReading(current_records)
    num_records = 0
    while not done:
        if any([item is None for item in current_records]):
            break
        if args.numrecords is not None and num_records >= args.numrecords:
            break
        if args.verbose:
            mergeutils.PrintCurrentRecords(current_records, is_min)
        if mergeutils.CheckMin(is_min):
            return 1
        if all(is_min):  # both readers are at the same (minimum) position
            if (current_records[0].CHROM == current_records[1].CHROM and
                    current_records[0].POS == current_records[1].POS):
                UpdateComparisonResults(trh.HarmonizeRecord(vcftype1, current_records[0]),
                                        trh.HarmonizeRecord(vcftype2, current_records[1]),
                                        format_fields, samples, results_dir)
        current_records = mergeutils.GetNextRecords(vcfreaders, current_records, is_min)
        is_min = mergeutils.GetMinRecords(current_records, chroms)
        done = mergeutils.DoneReading(current_records)
        num_records += 1
    ### Load all results to a dataframe and output full results ###
    data = pd.DataFrame(results_dir)
    data.to_csv(args.out + "-callcompare.tab", sep="\t", index=False)
    ### Overall metrics ###
    OutputOverallMetrics(data, format_fields, format_binsizes, args.stratify_file, args.period, args.out)
    if not args.noplot:
        OutputBubblePlot(data, args.period, args.out, minval=args.bubble_min, maxval=args.bubble_max)
    ### Per-locus metrics ###
    OutputLocusMetrics(data, args.out, args.noplot)
    ### Per-sample metrics ###
    OutputSampleMetrics(data, args.out, args.noplot)
    return 0
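# --- Hedged follow-up sketch (not part of the original file) ---
# The per-call table written above can be summarized with pandas. The
# column names come from results_dir and the path from the to_csv call,
# with "example" standing in for args.out.
def _example_summarize_concordance():
    import pandas as pd
    data = pd.read_csv("example-callcompare.tab", sep="\t")
    # Mean sequence- and length-level concordance per sample
    return data.groupby("sample")[["metric-conc-seq", "metric-conc-len"]].mean()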
def main():
    parser = argparse.ArgumentParser(__doc__)
    inout_group = parser.add_argument_group("Input/output")
    inout_group.add_argument("--vcf", help="Input VCF file", type=str)
    inout_group.add_argument("--out", help="Output prefix", type=str)
    inout_group.add_argument("--fam", help="FAM file with phenotype info", type=str)
    inout_group.add_argument("--samples", help="File with list of samples to include", type=str)
    inout_group.add_argument("--exclude-samples", help="File with list of samples to exclude", type=str)
    inout_group.add_argument("--vcf-samples-delim", help="FID and IID delimiter in VCF", type=str)
    pheno_group = parser.add_argument_group("Phenotypes")
    pheno_group.add_argument("--pheno", help="Phenotypes file (to use instead of --fam)", type=str)
    pheno_group.add_argument("--mpheno", help="Use (n+2)th column from --pheno", type=int, default=1)
    pheno_group.add_argument("--missing-phenotype", help="Missing phenotype code", type=str, default="-9")
    covar_group = parser.add_argument_group("Covariates")
    covar_group.add_argument("--covar", help="Covariates file", type=str)
    covar_group.add_argument("--covar-name", help="Names of covariates to load. Comma-separated", type=str)
    covar_group.add_argument("--covar-number", help="Column number of covariates to load. Comma-separated", type=str)
    covar_group.add_argument("--sex", help="Include sex from fam file as covariate", action="store_true")
    covar_group.add_argument("--cohort-pgc", help="Use cohort from PGC FIDs as a covariate", action="store_true")
    assoc_group = parser.add_argument_group("Association testing")
    assoc_group.add_argument("--linear", help="Perform linear regression", action="store_true")
    assoc_group.add_argument("--logistic", help="Perform logistic regression", action="store_true")
    assoc_group.add_argument("--region", help="Only process this region (chrom:start-end)", type=str)
    assoc_group.add_argument("--infer-snpstr", help="Infer which positions are SNPs vs. STRs", action="store_true")
    assoc_group.add_argument("--allele-tests", help="Also perform allele-based tests using each separate allele", action="store_true")
    assoc_group.add_argument("--allele-tests-length", help="Also perform allele-based tests using allele length", action="store_true")
    assoc_group.add_argument("--minmaf", help="Ignore bi-allelic sites with low MAF.", type=float, default=0.01)
    assoc_group.add_argument("--str-only", help="Used with --infer-snpstr, only analyze STRs", action="store_true")
    assoc_group.add_argument("--remove-rare-str-alleles", help="Remove genotypes with alleles less than this freq", default=0.0, type=float)
    assoc_group.add_argument("--max-iter", help="Maximum number of iterations for logistic regression", default=100, type=int)
    assoc_group.add_argument("--use-gp", help="Use GP field from Beagle output", action="store_true")
    assoc_group.add_argument("--iqr-outliers", help="Filter outliers using IQR (GP-based regression only)", action="store_true")
    assoc_group.add_argument("--iqr-outliers-min-samples",
                             help="Minimum number of samples allowed per dosage. Use -1 to disable this.",
                             default=100, type=int)
    fm_group = parser.add_argument_group("Fine mapping")
    fm_group.add_argument("--condition", help="Condition on this position chrom:start", type=str)
    fm_group.add_argument("--condition-file", help="Load condition from this file", type=str)
    fm_group.add_argument("--condition-sample-column", help="Column name for samples in the condition file", type=str)
    args = parser.parse_args()

    # Some initial checks
    if int(args.linear) + int(args.logistic) != 1:
        common.ERROR("Must choose exactly one of --linear or --logistic")

    # Load phenotype information
    common.MSG("Loading phenotype information...")
    if args.fam is not None:
        pdata = LoadPhenoData(args.fam, fam=True, missing=args.missing_phenotype, sex=args.sex)
    elif args.pheno is not None:
        if args.sex:
            common.ERROR("--sex only works when using --fam (not --pheno)")
        pdata = LoadPhenoData(args.pheno, fam=False, missing=args.missing_phenotype, mpheno=args.mpheno)
    else:
        common.ERROR("Must specify phenotype using either --fam or --pheno")
    common.MSG("Loaded %s samples..." % pdata.shape[0])

    # Load covariate information
    common.MSG("Loading covariate information...")
    covarcols = []
    if args.covar is not None:
        pdata, covarcols = AddCovars(pdata, args.covar, args.covar_name, args.covar_number)
    if args.sex:
        covarcols.append("sex")
    if args.cohort_pgc:
        pdata["cohort"] = pdata["FID"].apply(lambda x: "_".join(x.split("*")[0].split("_")[1:4]))
        pdata["cohort"] = pdata["cohort"].astype('category')
        cohortdata = pd.get_dummies(pdata["cohort"])
        for col in cohortdata.columns:
            pdata[col] = cohortdata[col]
            covarcols.append(col)
    common.MSG("Loaded %s samples..." % pdata.shape[0])

    # Include/exclude samples
    common.MSG("Loading sample information...")
    if args.samples is not None:
        pdata = RestrictSamples(pdata, args.samples, include=True)
    if args.exclude_samples is not None:
        pdata = RestrictSamples(pdata, args.exclude_samples, include=False)
    common.MSG("Left with %s samples..." % pdata.shape[0])

    # Set up VCF reader
    common.MSG("Set up VCF reader")
    reader = VCF(args.vcf)
    samples_list = reader.samples
    if args.region:
        reader = reader(args.region)
        # reader.fetch(args.region)

    # Set sample ID to FID_IID to match vcf
    common.MSG("Set up sample info")
    if args.vcf_samples_delim is not None:
        pdata["sample"] = pdata.apply(lambda x: x["FID"] + args.vcf_samples_delim + x["IID"], 1)
    else:
        pdata["sample"] = pdata.apply(lambda x: x["IID"], 1)
    reader_samples = set(samples_list)
    pdata = pdata[pdata["sample"].apply(lambda x: x in reader_samples)]
    sample_order = list(pdata["sample"])
    pdata = pdata[["phenotype", "sample"] + covarcols]
    common.MSG("Left with %s samples..." % pdata.shape[0])

    # Get data to condition on
    if args.condition is not None:
        if args.condition_file is not None:
            cond_gt = LoadConditionFromFile(args.condition_file, args.condition, sample_order,
                                            args.condition_sample_column)
        else:
            cond_gt = LoadCondition(args.vcf, args.condition, sample_order)
        pdata["condition"] = cond_gt[0]
        pdata.to_csv("as3mt.phenotype.txt", index=False)  # NOTE: writes conditioned phenotypes to a hard-coded path
        covarcols.append("condition")

    # Prepare output file
    if args.out == "stdout":
        outf = sys.stdout
    else:
        outf = open(args.out, "w")
    PrintHeader(outf, case_control=args.logistic, quant=args.linear, comment_lines=[" ".join(sys.argv)])

    # Perform association test for each record
    common.MSG("Perform associations... with covars %s" % str(covarcols))
    for record in reader:
        if record.call_rate == 0:
            continue
        aaf = record.aaf
        aaf = min([aaf, 1 - aaf])
        if len(record.ALT) == 1:
            if aaf < args.minmaf and args.minmaf != 1:
                continue
        # Infer whether we should treat as a SNP or STR
        is_str = True  # by default, assume all data is STRs
        if args.infer_snpstr:
            if len(record.REF) == 1 and len(record.ALT) == 1 and len(record.ALT[0]) == 1:
                is_str = False
            # if is_str and len(record.REF) < MIN_STR_LENGTH: continue  # probably an indel
            if not is_str and args.str_only:
                continue
        # Extract genotypes in sample order, perform regression, and output
        common.MSG("   Load genotypes...")
        gts, exclude_samples, aaf = LoadGT(record, sample_order, is_str=is_str,
                                           rmrare=args.remove_rare_str_alleles, use_gp=args.use_gp,
                                           vcf_samples=samples_list, iqr_outliers=args.iqr_outliers,
                                           iqr_outliers_min_samples=args.iqr_outliers_min_samples)
        pdata["GT"] = gts
        pdata["intercept"] = 1
        minmaf = args.minmaf
        common.MSG("   Perform association...")
        assoc = PerformAssociation(pdata, covarcols, case_control=args.logistic, quant=args.linear,
                                   maf=aaf, exclude_samples=exclude_samples, maxiter=args.max_iter)
        assoc["REF"] = str(record.REF)
        if not is_str:
            assoc["ALT"] = str(record.ALT[0])
        else:
            assoc["ALT"] = ",".join([str(i) for i in record.ALT])
        common.MSG("   Output association...")
        OutputAssoc(record.CHROM, record.POS, assoc, outf,
                    assoc_type=GetAssocType(is_str, name=record.ID))
        # Allele based tests
        common.MSG("   Allele based tests...")
        if is_str and args.allele_tests:
            alleles = [record.REF] + record.ALT
            for i in range(len(record.ALT) + 1):
                gts, exclude_samples, aaf = LoadGT(record, sample_order, is_str=True,
                                                   use_alt_num=i, vcf_samples=samples_list)
                pdata["GT"] = gts
                if pdata.shape[0] == 0:
                    continue
                allele_maf = sum(pdata["GT"]) * 1.0 / (2 * pdata.shape[0])
                if allele_maf == 0:
                    continue
                assoc = PerformAssociation(pdata, covarcols, case_control=args.logistic, quant=args.linear,
                                           maf=allele_maf, exclude_samples=exclude_samples,
                                           maxiter=args.max_iter)
                assoc["REF"] = str(record.REF)
                assoc["ALT"] = alleles[i]
                OutputAssoc(record.CHROM, record.POS, assoc, outf,
                            assoc_type=GetAssocType(is_str, alt=alleles[i], name=record.ID))
        if is_str and args.allele_tests_length:
            for length in set([len(record.REF)] + [len(alt) for alt in record.ALT]):
                gts, exclude_samples, aaf = LoadGT(record, sample_order, is_str=True,
                                                   use_alt_length=length, vcf_samples=samples_list)
                pdata["GT"] = gts
                if pdata.shape[0] == 0:
                    continue
                allele_maf = sum(pdata["GT"]) * 1.0 / (2 * pdata.shape[0])
                if allele_maf == 0:
                    continue
                assoc = PerformAssociation(pdata, covarcols, case_control=args.logistic, quant=args.linear,
                                           maf=allele_maf, exclude_samples=exclude_samples,
                                           maxiter=args.max_iter)
                assoc["REF"] = "Length-%d" % (len(str(record.REF)))
                assoc["ALT"] = "Length-%d" % (length)
                OutputAssoc(record.CHROM, record.POS, assoc, outf,
                            assoc_type=GetAssocType(is_str, alt_len=length, name=record.ID))
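# --- Hedged arithmetic check (not part of the original file) ---
# The allele-based tests above compute a frequency as sum(dosages) / 2N,
# treating each diploid sample as contributing two chromosomes. A tiny
# made-up genotype vector illustrates the formula.
def _example_allele_maf():
    gts = [0, 1, 2, 0, 1]  # per-sample dosage of one allele (illustrative)
    allele_maf = sum(gts) * 1.0 / (2 * len(gts))
    assert allele_maf == 0.4
    return allele_maf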
def main(args):
    if not os.path.exists(args.vcf):
        common.WARNING("%s does not exist" % args.vcf)
        return 1
    # Load samples
    sample_lists = []
    sample_prefixes = []
    if args.samples:
        sfiles = args.samples.split(",")
        if args.sample_prefixes:
            sample_prefixes = args.sample_prefixes.split(",")
        else:
            sample_prefixes = [str(item) for item in range(1, len(sfiles) + 1)]
        if len(sfiles) != len(sample_prefixes):
            common.MSG("--sample-prefixes must be same length as --samples")
            return 1
        for sf in sfiles:
            sample_lists.append([item.strip() for item in open(sf, "r").readlines()])
    invcf = vcf.Reader(filename=args.vcf)
    if args.vcftype != 'auto':
        vcftype = trh.VCFTYPES[args.vcftype]
    else:
        vcftype = trh.InferVCFType(invcf)
    header = ["chrom", "start", "end"]
    if args.thresh:
        header.extend(GetHeader("thresh", sample_prefixes))
    if args.afreq:
        header.extend(GetHeader("afreq", sample_prefixes))
    if args.acount:
        header.extend(GetHeader("acount", sample_prefixes))
    if args.hwep:
        header.extend(GetHeader("hwep", sample_prefixes))
    if args.het:
        header.extend(GetHeader("het", sample_prefixes))
    if args.mean:
        header.extend(GetHeader("mean", sample_prefixes))
    if args.mode:
        header.extend(GetHeader("mode", sample_prefixes))
    if args.var:
        header.extend(GetHeader("var", sample_prefixes))
    if args.numcalled:
        header.extend(GetHeader("numcalled", sample_prefixes))
    if args.out == "stdout":
        if args.plot_afreq:
            common.MSG("Cannot use --out stdout when generating plots")
            return 1
        outf = sys.stdout
    else:
        outf = open(args.out + ".tab", "w")
    outf.write("\t".join(header) + "\n")
    if args.region:
        if not os.path.isfile(args.vcf + ".tbi"):
            common.MSG("Make sure %s is bgzipped and indexed" % args.vcf)
            return 1
        regions = invcf.fetch(args.region)
    else:
        regions = invcf
    num_plotted = 0
    for record in regions:
        trrecord = trh.HarmonizeRecord(vcftype, record)
        if args.plot_afreq and num_plotted <= MAXPLOTS:
            PlotAlleleFreqs(trrecord, args.out, samplelists=sample_lists, sampleprefixes=sample_prefixes)
            num_plotted += 1
        items = [record.CHROM, record.POS, record.INFO["END"]]
        if args.thresh:
            items.extend(GetThresh(trrecord, samplelists=sample_lists))
        if args.afreq:
            items.extend(GetAFreq(trrecord, samplelists=sample_lists, uselength=args.use_length))
        if args.acount:
            items.extend(GetAFreq(trrecord, samplelists=sample_lists, uselength=args.use_length, count=True))
        if args.hwep:
            items.extend(GetHWEP(trrecord, samplelists=sample_lists, uselength=args.use_length))
        if args.het:
            items.extend(GetHet(trrecord, samplelists=sample_lists, uselength=args.use_length))
        if args.mean:
            items.extend(GetMean(trrecord, samplelists=sample_lists))
        if args.mode:
            items.extend(GetMode(trrecord, samplelists=sample_lists))
        if args.var:
            items.extend(GetVariance(trrecord, samplelists=sample_lists))
        if args.numcalled:
            items.extend(GetNumSamples(trrecord, samplelists=sample_lists))
        outf.write("\t".join([str(item) for item in items]) + "\n")
    outf.close()
    return 0
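# --- Hedged preparation sketch (not part of the original file) ---
# --region requires a bgzipped, tabix-indexed VCF (the .tbi check above).
# One way to prepare it, assuming the external bgzip and tabix tools are
# installed; "example.vcf" is a placeholder.
def _example_index_vcf():
    import subprocess
    subprocess.run(["bgzip", "example.vcf"], check=True)
    subprocess.run(["tabix", "-p", "vcf", "example.vcf.gz"], check=True)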
def test_MSG():
    common.MSG("Writing a test message", debug=False)
    common.MSG("Writing a test message", debug=True)