def _chunk_bounds(npos, chunksize):
    """Return slice boundaries splitting ``npos`` items into contiguous chunks.

    The number of chunks is ``npos / chunksize`` rounded half-up, clamped to
    the range ``1..npos`` so that no chunk is ever empty.  The result has one
    more element than there are chunks and chunk ``i`` covers
    ``items[bounds[i]:bounds[i + 1]]``; together the chunks cover every index
    in ``range(npos)`` exactly once.

    ``[0]`` (i.e. no chunks at all) is returned when ``npos`` is not positive,
    and a ``chunksize`` below 1 yields a single chunk.
    """
    if npos <= 0:
        return [0]
    if chunksize < 1:
        nchunk = 1
    else:
        # int(x + 0.5) rounds half-up on every Python version (the builtin
        # round() switched to banker's rounding in Python 3).
        nchunk = int(float(npos) / float(chunksize) + 0.5)
    nchunk = min(max(1, nchunk), npos)
    # Integer boundaries guarantee exactly nchunk slices ending at npos, so
    # trailing positions can no longer be dropped by independent rounding.
    return [(i * npos) // nchunk for i in range(nchunk + 1)]


def main():
    """Command-line entry point for Chiamante.

    Parses the command line, reads sample ids and SNP positions through
    ``chiamante_io`` and then, for every chromosome in 1-22 that has
    positions (or only the one given with ``-chromosome``):

    * writes the VCF header with ``chiamante_io.vcfWriter``,
    * splits the sorted positions into chunks of roughly ``-chunksize`` SNPs,
    * runs ``process_chunk`` on each chunk, one after another, writing into a
      timestamped working directory,
    * appends the per-chunk ``.vcf.gz`` files to the output file and merges
      the per-chunk ``.log`` files,
    * removes the working directory unless ``--debug`` is set.

    Exits with status 1 on invalid argument combinations.

    All ``print`` calls use the single-argument parenthesised form so that
    their output is the same whether ``print`` is a statement or a function.
    """
    parser = argparse.ArgumentParser(
        description="Chiamante v"
        + version
        + "\nCalls genotypes for Illumina Bead chips (possibly augmented with sequence genotype likelihoods for improved accuracy)"
    )
    parser.add_argument("arrayfilename", metavar="arrayfilename", type=str, help="the micro array data")

    # OPTIONAL
    parser.add_argument("-output", metavar="output.vcf.gz", default="", type=str, help="output file")
    parser.add_argument(
        "-chromosome",
        dest="chromosome",
        action="store",
        default=None,
        type=str,
        help="Only call loci on this chromosome",
    )
    parser.add_argument("--nohwe", dest="nohwe", action="store_true", help="")
    parser.add_argument(
        "-seq",
        dest="seqfilename",
        metavar="genotype_likelihoods.vcf.gz",
        default=None,
        type=str,
        help="the genotype likelihoods from sequence data",
    )
    parser.add_argument(
        "-pop",
        dest="pop",
        action="store",
        default="",
        type=str,
        help="text file containing space delimited population of each sample eg. YRI YRI CEU CHB...",
    )
    parser.add_argument(
        "-hwe_filter",
        dest="hwe_filter",
        action="store",
        default=5,
        type=float,
        help="-log10 HWE filter (values greater than this will be flagged in the FILTER field of the vcf",
    )
    parser.add_argument(
        "-aqual_filter",
        dest="aqual_filter",
        action="store",
        default=10,
        type=float,
        help="AQUAL filter (values lower than this will be flagged in the FILTER field of the vcf",
    )
    parser.add_argument(
        "-squal_filter",
        dest="squal_filter",
        action="store",
        default=10,
        type=float,
        help="SQUAL filter (values lower than this will be flagged in the FILTER field of the vcf",
    )

    # EXPERIMENTAL/DEPRECATED - there be dragons here.
    # SNP information file - required if you are using Broad or Illuminus style input
    parser.add_argument(
        "-snpinfo",
        dest="snpinfo",
        metavar="snpinfo.txt(.gz)",
        action="store",
        default=None,
        type=str,
        help=argparse.SUPPRESS,
    )
    parser.add_argument("-pp_filter", dest="pp_filter", action="store", default=0.9, type=float, help=argparse.SUPPRESS)
    # optional list of genomic positions to call at (default: everything in arrayfile)
    parser.add_argument(
        "-pos", dest="positionfile", action="store", default=None, type=str, help=argparse.SUPPRESS
    )
    parser.add_argument("--broad", dest="broad", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--illuminus", dest="illuminus", action="store_true", help=argparse.SUPPRESS)
    # sequence fail likelihood (as a rule dont touch this)
    parser.add_argument("-sf", dest="sf", action="store", default=1.0 / 3.0, type=float, help=argparse.SUPPRESS)
    # array fail likelihood (as a rule dont touch this)
    parser.add_argument("-af", dest="af", action="store", default=0.005, type=float, help=argparse.SUPPRESS)
    parser.add_argument(
        "-nprocess",
        dest="nprocess",
        action="store",
        default=1,
        type=int,
        help="Number of processes to use for genotyping calling (set this to the number of avaiable CPU cores)",
    )
    # job index; not read anywhere in this function
    parser.add_argument("-thisjob", dest="thisjob", action="store", default=0, type=int, help=argparse.SUPPRESS)
    # Maximum number of ECM iterations
    parser.add_argument(
        "-niteration", dest="niteration", metavar="30", action="store", default=30, type=int, help=argparse.SUPPRESS
    )
    # Maximum number of ECM iterations per SNP
    parser.add_argument(
        "-nsubiteration",
        dest="nsubiteration",
        metavar="30",
        action="store",
        default=30,
        type=int,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        "-sigfig", dest="sigfig", metavar="3", action="store", default=3, type=int, help="signifciant figures in output"
    )
    # convergence tolerance
    parser.add_argument(
        "-tolerance", dest="tolerance", action="store", default=0.1, type=float, help=argparse.SUPPRESS
    )
    parser.add_argument("-chunksize", dest="chunksize", action="store", default=10000, type=int, help=argparse.SUPPRESS)
    # The warning below tells users to "run with -f", so a bare -f must be
    # accepted.  nargs="?" with const=True allows that while still parsing the
    # historical "-f VALUE" form exactly as before (bool(VALUE)).
    parser.add_argument(
        "-f", dest="f", action="store", nargs="?", const=True, default=False, type=bool, help=argparse.SUPPRESS
    )
    parser.add_argument("--vcfin", dest="vcfin", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--debug", dest="debug", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--plot", dest="plot", action="store_true", help=argparse.SUPPRESS)
    # generate GLs
    parser.add_argument("--GL", dest="gl", action="store_true", help=argparse.SUPPRESS)

    print("Chiamante version %s" % (version,))
    args = parser.parse_args()
    args.arrayonly = False
    args.renormalise = True
    buf = 500 * 2 ** 20  # buffer for concatenating files

    if args.debug:
        print("DEBUG MODE!")

    chromosomes = set(str(val) for val in range(1, 23))
    if args.chromosome is not None:
        if args.chromosome not in chromosomes:
            print("Invalid chromosome specified %s" % (args.chromosome,))
            sys.exit(1)
        chromosomes = set([args.chromosome])

    if args.chromosome is None and args.seqfilename is not None and not args.f:
        print(
            "WARNING: you have specified sequence likelihoods but without the -chrom argument.\n This is no problem, but sequence data is usually split between chromosomes.\nMaybe you meant to use the -chrom argument.\nRun with -f to ignore this warning.\nExiting..."
        )
        sys.exit(1)

    if args.output == "":
        args.output = args.arrayfilename.split(".")[0] + "_chiamante"

    ids = chiamante_io.get_arrayids(args)

    if args.positionfile is None:
        print("Parsing %s ..." % (args.arrayfilename,))
        positions = chiamante_io.get_positions(args)
    elif args.chromosome is None:
        print("You need to specificy a chromosome if positions file is specified")
        sys.exit(1)
    else:
        # Builtin int replaces the removed np.int alias (they were identical).
        tmp = np.array(chiamante_io.ropen(args.positionfile).read().split(), int)
        positions = [np.array([args.chromosome] * len(tmp)), tmp]
    print("%s SNPs and %s samples in %s" % (len(positions[0]), len(ids), args.arrayfilename))

    if args.pop != "":
        print("Reading populations from %s" % (args.pop,))
        # The result is not used in this function; the call is kept so that a
        # problem with the -pop file surfaces before any genotyping starts.
        chiamante_io.setup_populations(chiamante_io.ropen(args.pop).read().split(), True)
    else:
        print("No population information provide I am assuming all samples are from the same population")

    start_time0 = time.time()
    chromosomes = chromosomes.intersection(set(positions[0].tolist()))

    if args.seqfilename is None:
        seqfilename = "None"
    else:
        seqfilename = args.seqfilename
        # Presumably an early check that the sequence file can be opened;
        # the returned handle is not used here.
        chiamante_io.ropen(seqfilename)

    for chrom in chromosomes:
        if len(chromosomes) == 1 and ("chr" + str(chrom)) in args.output:
            outfilename = args.output
        else:
            outfilename = args.output + "_chr" + str(chrom)

        outfile = chiamante_io.vcfWriter(
            outfilename,
            ids,
            header=[
                "fileformat=VCFv4.0",
                "source=chiamante_v" + version,
                "arrayfile=" + args.arrayfilename,
                "seqfile=" + seqfilename,
                "timestamp=" + str(datetime.datetime.now()),
                "FILTER=<ID=a"
                + str(args.aqual_filter)
                + ',Description="Array data quality below'
                + str(args.aqual_filter)
                + '">',
                "FILTER=<ID=s"
                + str(args.squal_filter)
                + ',Description="Sequence data quality below'
                + str(args.squal_filter)
                + '">',
                "FILTER=<ID=hwe"
                + str(args.hwe_filter)
                + ',Description="-log10 HWE exact test p-value above'
                + str(args.hwe_filter)
                + '">',
                'FILTER=<ID=NOT_POLY,Description="loci does not appear to be polymorphic in this sample">',
                'INFO=<ID=HWE,Number=1,Type=Float,Description="-log10 HWE exact test p-value">',
                'INFO=<ID=AQUAL,Number=1,Type=Integer,Description="phred scaled array failure rate">',
                'INFO=<ID=SQUAL,Number=1,Type=Integer,Description="phred scaled sequence failure rate">',
                'FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
                'FORMAT=<ID=GP,Number=3,Type=Float,Description="Genotype probabilities">',
            ],
        )
        # Drop the writer before the same file is reopened in append mode
        # below (presumably this is what flushes the header - TODO confirm).
        del outfile

        positions_on_this_chrom = positions[1][positions[0] == str(chrom)]
        positions_on_this_chrom.sort()
        bounds = _chunk_bounds(len(positions_on_this_chrom), args.chunksize)
        new_positions = [positions_on_this_chrom[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

        start_time1 = time.time()
        # NOTE: chunks are processed one after another in this function;
        # args.nprocess only appears in this message.
        print("Calling genotypes for chromosome %s using %s processes..." % (chrom, args.nprocess))
        timestamp = str(datetime.datetime.now()).replace(" ", "_")
        workdir = args.output + ".working_" + timestamp
        try:
            os.mkdir(workdir)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(workdir):
                raise
        workdir += "/"
        print("Chromosome %s split into %s chunks" % (chrom, len(new_positions)))

        for chunk, pos in enumerate(new_positions):
            if args.debug:
                print("Processing chromosome %s Chunk %s ..." % (chrom, chunk))
            tmpargs = deepcopy(args)
            tmpargs.chrom = str(chrom)
            tmpargs.output = workdir + "_" + str(chrom) + "_" + str(chunk)
            process_chunk(tmpargs, pos)

        print("Chromosome %s took %s  seconds" % (chrom, time.time() - start_time1))
        print("Calling finished. Writing output...")

        # Append every chunk's VCF body after the header and merge the logs.
        with open(outfilename + ".vcf.gz", "ab", buf) as merged_vcf:
            with open(args.output + "_" + str(chrom) + ".log", "wb") as merged_log:
                for chunk in range(len(new_positions)):
                    chunk_prefix = workdir + "_" + str(chrom) + "_" + str(chunk)
                    with open(chunk_prefix + ".log", "rb") as chunk_log:
                        merged_log.write(chunk_log.read())
                    with open(chunk_prefix + ".vcf.gz", "rb") as chunk_vcf:
                        shutil.copyfileobj(chunk_vcf, merged_vcf, buf)

        if not args.debug:
            shutil.rmtree(workdir, ignore_errors=True)

    print("Took  %s  seconds" % (time.time() - start_time0,))
def _transform_signal(signal, renormalise):
    """Return the log2-transformed (and optionally renormalised) intensities.

    NaNs are set to 0 and values below 8 are shifted up by 8 (both in place on
    the array passed in) before taking log2.  With ``renormalise`` set, any
    sample whose mean over axis 0 lies more than 4 standard deviations from
    the mean of all samples is re-centred on the mean of the remaining
    samples.  Negative values are finally clamped to 0.

    Assumes ``signal`` is indexed (snp, sample, channel) - TODO confirm the
    meaning of the last axis against ``chiamante_io.array_setup``.
    """
    signal[np.isnan(signal)] = 0
    signal[signal < 8] += 8
    signal = np.log2(signal)
    if renormalise:
        # if some samples have intensities with mean >4 SD away, just apply simple normalisation
        sample_means = signal.mean(0)
        atypical = np.abs((sample_means - sample_means.mean(0)) / sample_means.std(0)).max(1) > 4
        if atypical.sum() > 0:
            mu = sample_means[np.logical_not(atypical)].mean(0)
            for idx in np.where(atypical)[0]:
                signal[:, idx] = (signal[:, idx] - signal[:, idx].mean(0)) + mu
    signal[signal < 0] = 0.0
    return signal


def _format_info_filter(hwe, aqual, squal, doseq, max_het_prob, hwe_filter, aqual_filter, squal_filter, pp_filter):
    """Build the VCF INFO and FILTER strings for one SNP.

    ``squal`` is ``None`` when the SNP has no sequence likelihoods; in that
    case ``SQUAL=.`` is reported if sequence data is in use (``doseq``) and no
    SQUAL field at all otherwise.  Filter ids are emitted in the order
    s / a / hwe / NOT_POLY and joined with ``;``; ``PASS`` is returned when no
    filter fires.

    Returns:
        tuple ``(infostring, filterstring)``.
    """
    info = "HWE=%.4f" % hwe + ";AQUAL=" + str(int(aqual))
    flags = []
    if doseq:
        if squal is not None:
            info += ";SQUAL=" + str(int(squal))
            if squal < squal_filter:
                flags.append("s" + str(squal_filter))
        else:
            info += ";SQUAL=."
    if aqual < aqual_filter:
        flags.append("a" + str(aqual_filter))
    if hwe > hwe_filter:
        flags.append("hwe" + str(hwe_filter))
    if max_het_prob < pp_filter:
        flags.append("NOT_POLY")
    return info, (";".join(flags) if flags else "PASS")


def _call_chunk(args, positions, logfile, plot):
    """Run the iterative genotype calling for one chunk and write its VCF rows.

    Loads the array data with ``chiamante_io.array_setup``, then makes up to
    ``args.niteration`` passes over all SNPs with a single ECM sub-iteration
    each, rebuilding the priors from the fitted means between passes.  The
    final pass (pass number ``args.niteration``, entered early on
    convergence) lets every SNP run ``args.nsubiteration`` sub-iterations and
    writes the result through ``chiamante_io.vcfWriter``.

    ``plot`` is the already imported ``plot_functions`` module or ``None``.
    """
    sorted_positions, ids, pop, snpinfo, array_chromosomes, array_positions, signal = chiamante_io.array_setup(
        args, positions
    )
    out = chiamante_io.vcfWriter(args.output, ids=None, header=None, significant_figures=args.sigfig)

    signal = _transform_signal(signal, args.renormalise)
    nsnp = signal.shape[0]
    nsample = signal.shape[1]

    if args.seqfilename is not None:
        doseq = True
        seqlik = chiamante_io.get_sequence_likelihoods(
            args.seqfilename, array_chromosomes, sorted_positions, ids, snpinfo, logfile
        )
    else:
        doseq = False
        seqlik = {}

    pops = set(pop)
    popidx = [np.where(pop == popid)[0] for popid in pops]

    start, priors = chiamante_init.setup_omni25(
        3,
        len(np.unique(pop)),
        sigma_mu_multiplier=1.0,
        rafprior=[1.01, 1.01],
        sigma_multiplier=10,
        hwe_var=-1,
        illumina=True,
        s0_multiplier=1,
        nocorrelation=False,
    )
    nit = args.niteration

    # Builtin float replaces the removed np.float alias (they were identical).
    gprob = np.empty((nsample, 4), float)
    new_means = np.empty((nsnp, 6), float)
    old_means = np.ones((nsnp, 6), float)
    parms = {}

    if doseq:
        # The likelihoods of the first available position are handed to the
        # work-array constructor (an empty seqlik raises here, as before).
        first_key = next(iter(seqlik))
        working_arrays = chiamante_ecm.chiamante_work_arrays(nsample, 1.0 / 3.0, seqlik[first_key]["lik"])
    else:
        working_arrays = chiamante_ecm.chiamante_work_arrays(nsample)

    i = 0
    start_time = time.time()
    while i <= nit:
        print("Iteration %s" % (i,))
        snpnumber = 0
        for k, pos in enumerate(sorted_positions):  # LOOP OVER EVERY SNP
            if k % 1000 == 0:
                print("Iteration %s SNP %s Position %s" % (i, k, pos))
            for snp in snpinfo[pos]:
                x1 = signal[snp[9]]
                x2 = seqlik[pos]["lik"] if pos in seqlik else None

                if i == 0:
                    start["mu"] = chiamante_init.get_mustart(x1, illumina=True)
                else:
                    start = parms[snp[2]]

                # on the last inter-SNP iteration we let each loci converge
                subiterations = 1 if i < nit else args.nsubiteration

                fit = chiamante_ecm.chiamante_mainloop(
                    x1,
                    x2,
                    priors,
                    start,
                    seqfaildens=args.sf,
                    arrfaildens=args.af,
                    popidx=popidx,
                    working=working_arrays,
                    df=[5, 5, 5],
                    niteration=subiterations,
                    hwe_prior=not args.nohwe,
                    tolerance=0.01,
                    C=False,
                    genotype_likelihoods=args.gl,
                )

                if i == nit:  # ON THE LAST ITERATION WE WRITE OUT TO THE VCF
                    if pos in seqlik:
                        flip = seqlik[pos]["flip"]
                    else:
                        flip = snp[8] == "ALT:REF"

                    chiamante_ecm.corrected_posteriors(working_arrays, gprob, doseq)

                    # FILTER and INFO parameters
                    try:
                        hwe = chiamante_statfunc.hwe_test(gprob[:, :3], popidx)
                    except Exception as err:
                        print("ERROR!\nExiting...\nPlease report to [email protected]")
                        sys.stderr.write("hwe_test failed at position %s: %s\n" % (pos, err))
                        sys.exit(1)
                    aqual = -10 * np.log10(fit["parameters"]["eta_array"])
                    squal = None
                    # "is not None": x2 may be an array, for which "!= None"
                    # is an elementwise comparison on current NumPy.
                    if doseq and x2 is not None:
                        squal = -10 * np.log10(fit["parameters"]["eta_seq"])
                    infostring, filterstring = _format_info_filter(
                        hwe,
                        aqual,
                        squal,
                        doseq,
                        gprob.max(0)[1],
                        args.hwe_filter,
                        args.aqual_filter,
                        args.squal_filter,
                        args.pp_filter,
                    )

                    if flip:
                        # Swap the two homozygote columns together with REF/ALT.
                        posterior_probs = gprob[:, [2, 1, 0, 3]]
                        ref, alt = snp[4], snp[3]
                    else:
                        posterior_probs = gprob
                        ref, alt = snp[3], snp[4]

                    data = {"GP": posterior_probs}
                    if args.gl:
                        data["GL"] = fit["gl"][:, [2, 1, 0]] if flip else fit["gl"]

                    out.writerow(
                        snp[0], snp[1], snp[2], ref, alt, data=data, info=infostring, filterstring=filterstring
                    )

                    if args.plot:
                        plot.debug_plot_snp(
                            x1,
                            "figs/" + str(pos) + "_" + str(i),
                            x2,
                            fit["gprobs"],
                            gprob,
                            snp[3],
                            snp[4],
                            snp[2],
                            priors,
                            start,
                            fit["parameters"],
                            flip,
                            2,
                        )
                else:
                    parms[snp[2]] = fit["parameters"]  # store parameters for next pass

                for j in range(3):
                    new_means[snpnumber, (j * 2) : ((j + 1) * 2)] = fit["parameters"]["mu"][j]
                snpnumber += 1

        # CALCULATE PRIORS FOR MU_0 AND SIGMA_MU
        # NOTE(review): convergence looks at the largest *signed* change in the
        # means, not the largest absolute change - confirm this is intended.
        old_means -= new_means
        print(old_means.max())
        if i == nit:
            i += 1
        elif i > 0 and old_means.max() < args.tolerance:
            print("Converged.")
            i = nit  # jump straight to the final, writing pass
        else:
            i += 1
        if nsnp > 1:
            chiamante_init.build_priors(new_means, priors)
        old_means, new_means = new_means, old_means

    print(
        "\nChr %s : %s - %s finished in %s  seconds"
        % (args.chrom, min(sorted_positions), max(sorted_positions), time.time() - start_time)
    )


def process_chunk(args, positions=None, stdout=None):
    """Call genotypes for one chunk of positions on chromosome ``args.chrom``.

    Args:
        args: parsed command-line namespace; must also carry ``chrom`` and
            ``renormalise``.  ``args.output`` is the output prefix: the chunk
            writes ``<output>.vcf.gz`` (via ``chiamante_io.vcfWriter``) and
            ``<output>.log``.
        positions: positions to call; must be non-empty because its minimum
            and maximum are reported before anything else happens.
        stdout: optional file name; unless ``args.debug`` is set,
            ``sys.stdout`` is redirected to it for the duration of the call.

    Side effects: sets numpy to raise on invalid floating point operations and
    limits the address space of the process to 8 GiB (``RLIMIT_AS``).  The log
    file is closed and ``sys.stdout`` restored when the call ends, also on
    error.  Exits with status 1 if the HWE test fails.

    All ``print`` calls use the single-argument parenthesised form so that
    their output is the same whether ``print`` is a statement or a function.
    """
    np.seterr(invalid="raise")
    ngig = 8
    resource.setrlimit(resource.RLIMIT_AS, (ngig * 2 ** 30, ngig * 2 ** 30))
    print("Processing Chr %s : %s - %s ..." % (args.chrom, min(positions), max(positions)))

    plot = None
    if args.plot:
        import plot_functions as plot

    saved_stdout = sys.stdout
    redirected = None
    if not args.debug and stdout is not None:
        redirected = open(stdout, "w")
        sys.stdout = redirected
    try:
        if args.output == "":
            args.output = args.arrayfilename.split(".")[0] + "_chiamante"
        chiamante_io.checkfiledoesnotexist(args.output + ".vcf.gz")
        logfile = open(args.output + ".log", "wb")
        try:
            _call_chunk(args, positions, logfile, plot)
        finally:
            logfile.close()
    finally:
        if redirected is not None:
            sys.stdout = saved_stdout
            redirected.close()