def bcfmpileup(flist, ref, bcname, regionrestrict, diploid, q, threads, cmdfile, logfile):
    """
    Call genotypes for all samples with a `bcftools mpileup | bcftools call` pipeline.

    :param flist: Sample base names; ".bam" is appended to each one.
    :param ref: Reference genome, passed to mpileup with -f.
    :param bcname: Base name; the calls are written to <bcname>-samples.vcf.gz.
    :param regionrestrict: Region passed to mpileup with -r; skipped when falsy.
    :param diploid: When falsy, "--ploidy 1" is appended to the command.
    :param q: Value for mpileup's -q option (must be a string, it is concatenated).
    :param threads: Value for --threads (must be a string, it is concatenated).
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Name of merged sample VCF.
    """
    print("\nCreating mpileup consensus using BCFTools and writing as a VCF...")

    # Collect the command fragments in order and glue them together once.
    parts = ["bcftools mpileup --threads " + threads + " -C 50 -d 8000 -Ov -f " + ref + " -q " + q + " "]
    for sample in flist:
        parts.append(sample + ".bam" + " ")
    if regionrestrict:
        parts.append(" -r " + regionrestrict)
    parts.append(" | bcftools call --threads " + threads + " -Oz -m -o " + bcname + "-samples.vcf.gz - ")
    if not diploid:
        parts.append(" --ploidy 1 ")

    upa_util.bash_command("".join(parts), False, cmdfile, logfile)
    return bcname + "-samples.vcf.gz"
def haplogrep_gen_hsd(flist, ref, bcname, regdic, cmdfile, logfile):
    """
    Run bcftools mpileup/call over all samples to produce the VCF used for HaploGrep.

    The variant calls are written to <bcname>-4hgrp.vcf.

    :param flist: Sample base names; ".bam" is appended to each one.
    :param ref: Reference genome, passed to mpileup with -f.
    :param bcname: Base name of the output file.
    :param regdic: Region dictionary; handed back to the caller untouched.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: The regdic argument, unchanged.
    """
    print("\nCreating mpileup consensus and writing as a VCF...")
    progress = progressbar.ProgressBar()
    bam_names = [flist[idx] + ".bam" for idx in progress(range(len(flist)))]

    pileup_part = "bcftools mpileup -I -d 8000 -Ov -f " + ref + " " + "".join(name + " " for name in bam_names)
    call_part = "| bcftools call -V indels --ploidy 1 -Ov -m -v -o " + bcname + "-4hgrp.vcf"

    upa_util.bash_command(pileup_part + call_part, False, cmdfile, logfile)
    return regdic
def addreadgroup(flist, binloc, verbose, cmdfile, logfile):
    """
    Add read-group information to each sample BAM with Picard AddOrReplaceReadGroups.

    For every sample the original <sample>.bam is kept as <sample>.norg.bam and
    the Picard output takes over the <sample>.bam name.

    :param flist: Sample base names; ".bam" is appended to each one.
    :param binloc: Prefix prepended directly to "picard.jar" (no separator is added).
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: None
    """
    for sample in flist:
        source_bam = sample + ".bam"
        tagged_bam = sample + ".intermediate.bam"
        picard_args = [
            "java -jar " + binloc + "picard.jar AddOrReplaceReadGroups",
            "I=" + source_bam,
            "O=" + tagged_bam,
            "RGID=4",
            "RGLB=" + sample,
            "RGPL=illumina",
            "RGPU=BerkeleyHiSeq",
            "RGSM=" + upa_util.name_strip(sample),
            "VALIDATION_STRINGENCY=LENIENT",
        ]
        upa_util.bash_command(" ".join(picard_args), verbose, cmdfile, logfile)

        # Keep the untagged BAM under a new name, then promote the tagged one.
        shutil.move(source_bam, sample + ".norg.bam")
        shutil.move(tagged_bam, source_bam)
def stripchr(flist, verbose, cmdfile, logfile):
    """
    Rewrite BAM headers so that sequence names lose their 'chr' prefix.

    chr1..chr22, chrX and chrY become 1..22, X and Y; chrM becomes MT. For every
    sample the original <sample>.bam is kept as <sample>.wchr.bam and the
    reheadered file takes over the <sample>.bam name.

    :param flist: Sample base names; ".bam" is appended to each one.
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: None
    """
    # (old suffix, new suffix) pairs, in the order the sed stages are chained.
    renames = [(str(num), str(num)) for num in range(1, 23)]
    renames.extend([("X", "X"), ("Y", "Y"), ("M", "MT")])
    sed_chain = " | ".join(
        "sed -e 's/SN:chr" + old + "/SN:" + new + "/'" for old, new in renames)

    print("\nStripping chr from chromosome names...")
    for sample in flist:
        source_bam = sample + ".bam"
        reheadered_bam = sample + ".intermediate.bam"
        stripchrcmd = ("samtools view -H " + source_bam + " | " + sed_chain
                       + " | samtools reheader - " + source_bam + " > " + reheadered_bam)
        upa_util.bash_command(stripchrcmd, verbose, cmdfile, logfile)

        # Keep the original under a new name, then promote the reheadered file.
        shutil.move(source_bam, sample + ".wchr.bam")
        shutil.move(reheadered_bam, source_bam)
def gen_reg_line(sample, mindepth, maxgap, cmdfile, logfile):
    """
    Generate a line describing the regions of a sample covered at sufficient depth.

    Runs `samtools depth` on the sample and walks its tab-separated output
    (column 2 = position, column 3 = depth). A region is a run of positions
    whose depth is at least mindepth and whose consecutive positions are at
    most maxgap apart. A position below mindepth closes the open region and is
    never part of any region. The region still open when the input ends is
    emitted as well.

    The sequence-name column is ignored, so positions from different sequences
    are not told apart (NOTE(review): presumably used on single-sequence input
    such as mtDNA - confirm with the caller).

    :param sample: Argument appended to "samtools depth " (presumably a BAM path).
    :param mindepth: Minimum depth of coverage to include.
    :param maxgap: Maximum gap size to allow a region to continue.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: String of "start-end;" entries, one per region; "" if there are none.
    """
    depthinfo = upa_util.bash_command("samtools depth " + sample, False, cmdfile, logfile)

    regions = []
    region_start = None  # None means no region is currently open
    region_end = None
    for dline in depthinfo.split("\n"):
        dcols = dline.split("\t")
        if len(dcols) < 3:
            continue  # blank or malformed line
        curpos = int(dcols[1])
        curdepth = int(dcols[2])

        if curdepth >= mindepth:
            if region_start is not None and curpos <= region_end + maxgap:
                # Close enough to the previous good position: extend the region.
                region_end = curpos
                continue
            if region_start is not None:
                # Gap too large: close the old region before opening a new one.
                regions.append(str(region_start) + "-" + str(region_end) + ";")
            region_start = curpos
            region_end = curpos
        elif region_start is not None:
            # Depth too small: the region stops at the last good position.
            regions.append(str(region_start) + "-" + str(region_end) + ";")
            region_start = None

    if region_start is not None:
        # Flush the region that was still open at the end of the input.
        regions.append(str(region_start) + "-" + str(region_end) + ";")
    return "".join(regions)
def genocaller(flist, bedfile, bcname, indent, ref, regionrestrict, threads, verbose, cmdfile, logfile):
    """
    Call genotypes with Krishna Veeramah's GenoCaller_indent and merge the results.

    For each sample, GenoCaller_indent.py is run, its VCF is BGZF-compressed and
    indexed, and finally all per-sample VCFs are merged with `bcftools merge`
    into <bcname>-samples.vcf.gz.

    :param flist: Sample base names; ".bam" is appended to each one.
    :param bedfile: UCSC-style BED file.
    :param bcname: Base name of the merged output.
    :param indent: Indent depth to each end of read (string or number).
    :param ref: Reference genome.
    :param regionrestrict: Region passed to `bcftools merge` with -r; skipped when falsy.
    :param threads: Value for --threads (must be a string, it is concatenated).
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Name of merged sample VCF (<bcname>-samples.vcf.gz, the file the
             merge command actually writes).
    """
    print("\nGenoCaller...")
    # Normalise once so every use below accepts an int as well as a string.
    indent = str(indent)
    mergedvcfname = bcname + "-samples.vcf.gz"

    samplevcfnames = []
    for sample in flist:
        gccmd = "GenoCaller_indent.py " + sample + ".bam " + bedfile + " " + ref + " " + indent
        upa_util.bash_command(gccmd, verbose, cmdfile, logfile)

        rawvcfname = sample + "." + bedfile + ".indent" + indent + ".vcf"
        samplevcfname = rawvcfname + ".gz"

        # Must compress to allow bcftools to merge. Binary mode: the data is
        # copied byte-for-byte into the BGZF writer.
        with open(rawvcfname, 'rb') as f_in, bgzf.open(samplevcfname, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # NOTE(review): open() above already raises when GenoCaller produced no
        # VCF, so this guard can only catch a missing compressed file.
        if os.path.isfile(samplevcfname):
            upa_util.vcf_name_strip(samplevcfname)
            upa_util.bash_command("bcftools index --threads " + threads + " " + samplevcfname,
                                  verbose, cmdfile, logfile)
            samplevcfnames.append(samplevcfname)
        else:
            print("ERROR: Cannot find " + samplevcfname)

    # Merge the resulting VCFs together using bcftools
    bcfmergecmd = "bcftools merge --threads " + threads + " -Oz -o " + mergedvcfname + " "
    if regionrestrict:
        # The trailing space keeps the region from fusing with the first VCF name.
        bcfmergecmd = bcfmergecmd + " -r " + regionrestrict + " "
    bcfmergecmd = bcfmergecmd + "".join(name + " " for name in samplevcfnames)
    upa_util.bash_command(bcfmergecmd, verbose, cmdfile, logfile)

    return mergedvcfname
def run_ad():
    """
    Invoke ADMIXTURE repeatedly for every K between the chosen low and high K.

    Takes no arguments; reads the module-level names file, filebase, rdir,
    bestdir, lowk, hik, reps, threads, rng, termcrit, optmethod, verbose,
    cmdfile, logfile and start_time.

    Steps:
      * write <filebase>/<filebase>.ind2pop from the first column of <file>.h.fam;
      * for each K and replicate, run `admixture --cv`, save its output to a
        per-replicate .log, move the .P/.Q files into rdir and add a line to
        the pong file map;
      * for each K, copy the replicate with the highest log-likelihood into bestdir;
      * write the CV errors to <filebase>-cvout.csv and plot them with cvsplot.R;
      * write bestdir/bests.csv and print instructions for running pong.

    :return: None
    """
    print("\nRunning Admixture...")
    famfilename = file + ".h.fam"
    ind2popname = filebase + "/" + filebase + ".ind2pop"
    with open(famfilename, 'r') as famfile, open(ind2popname, 'w') as ind2pop:
        for famline in famfile:
            ind2pop.write(famline.split()[0])
            ind2pop.write("\n")

    kcvss = []
    klogls = []
    kbests = {}
    with open(rdir + "pong_filemap", 'w') as pongfile:
        for k in range(lowk, hik + 1):
            print("\n*********** K:" + str(k))
            kreplist = []
            kcvs = []
            logls = []
            bar = progressbar.ProgressBar()
            # NOTE(review): this runs reps + 1 replicates (0..reps), while the
            # best-run selection and the CV table below only read the first
            # `reps` of them. Confirm which count is intended.
            for j in bar(range(0, reps + 1)):
                runbase = filebase + ".h." + str(k) + ".r" + str(j)
                stdoutfilename = rdir + runbase + ".log"
                with open(stdoutfilename, 'w') as stdoutfile:
                    stdoutfile.write(
                        upa_util.bash_command(
                            "admixture --cv " + file + ".h.bed " + str(k) + " -j" + threads
                            + " -s " + str(rng.getrandbits(32)) + " -C " + str(termcrit)
                            + " -m " + optmethod, verbose, cmdfile, logfile))

                # ADMIXTURE's output names carry no replicate number, so move
                # them away before the next replicate overwrites them.
                shutil.move(filebase + ".h." + str(k) + ".P", rdir + runbase + ".P")
                shutil.move(filebase + ".h." + str(k) + ".Q", rdir + runbase + ".Q")

                grepline = upa_util.bash_command("grep -h CV " + stdoutfilename, True, cmdfile, logfile)
                cverror = float(grepline.split()[3])
                kcvs.append(cverror)
                jreplist = [cverror, runbase]

                with open(stdoutfilename, 'r') as stdreadout:
                    for stdoutline in stdreadout:
                        if stdoutline.startswith("Loglikelihood:"):
                            logl = stdoutline.split()[1]
                            logls.append(logl)
                            jreplist.append(logl)

                jreplist.append(str(j))
                kreplist.append(jreplist)

                pongfile.write("run" + str(j) + "_K" + str(k))
                pongfile.write("\t")
                pongfile.write(str(k))
                pongfile.write("\t")
                pongfile.write(runbase + ".Q")
                pongfile.write("\n")

            # Pick the replicate with the highest log-likelihood. The values are
            # compared as numbers: comparing the raw strings only gives the same
            # answer while every value has the same number of digits.
            llbest = 0
            for i in range(reps):
                if float(logls[i]) > float(logls[llbest]):
                    llbest = i
            print("For K = " + str(k) + " best index: " + str(llbest))

            bestbase = rdir + filebase + ".h." + str(k) + ".r" + str(llbest)
            shutil.copy(bestbase + ".P", bestdir)
            shutil.copy(bestbase + ".Q", bestdir)
            shutil.move(bestbase + ".log", bestdir)

            kbests[k] = kreplist[llbest]
            kcvss.append(kcvs)
            klogls.append(logls)

    # CV table: one header row of K values, then one row per replicate.
    cvfileoutname = rdir + filebase + "-cvout.csv"
    with open(cvfileoutname, 'w') as cvfileout:
        for k in range(lowk, hik + 1):
            cvfileout.write(str(k))
            cvfileout.write(",")
        cvfileout.write("\n")
        for j in range(reps):
            for i in range(len(kcvss)):
                cvfileout.write(str(kcvss[i][j]))
                cvfileout.write(",")
            cvfileout.write("\n")

    upa_util.bash_command("Rscript cvsplot.R " + cvfileoutname, verbose, cmdfile, logfile)

    bestname = bestdir + "/bests.csv"
    with open(bestname, 'w') as bestout:
        bestout.write("K,CV,filebase,logL, Rep\n")
        for k, bestrep in kbests.items():
            bestout.write(str(k))
            bestout.write(",")
            for value in bestrep:
                bestout.write(str(value))
                bestout.write(",")
            bestout.write("\n")

    pongcmd = "pong -f -m " + rdir + "pong_filemap -i " + ind2popname
    print("\n\n\n\n Copy and paste the following to run pong:\n")
    print(pongcmd)
    print("\nYou will need a separate terminal logged in with ssh -X to visualize.")
    print("On that terminal type :")
    print("\nfirefox &\n")
    print("And go to http://localhost:4000")
    print("\nAlternatively, you can simply download the folder " + filebase + " to your own machine")
    print("and run pong there. See http://brown.edu/Research/Ramachandran_Lab/projects/")

    elapsed_time = time.time() - start_time
    print("Number threads: " + str(threads) + " Elapsed time: " + str(elapsed_time))
def converttohaploid():
    """
    Produce a pseudo-haploid copy of the PLINK dataset named by the global `file`.

    Takes no arguments; reads the module-level names file, tvonly, verbose,
    cmdfile and logfile.

    Steps:
      * transpose <file>.bed/.bim/.fam to <file>.tped with plink;
      * for each variant, pick one of the two alleles of every individual at
        random and write it twice; when tvonly is set, variants that show both
        A and G, or both C and T, are dropped;
      * write the result to <file>.h.tped, copy <file>.fam to <file>.h.tfam and
        convert back to bed format as <file>.h.

    :return: None
    """
    print("\nConverting to tped format...")
    upa_util.bash_command(
        "plink --bed " + file + ".bed --bim " + file + ".bim --fam " + file
        + ".fam --alleleACGT --recode transpose --out " + file,
        verbose, cmdfile, logfile)

    print("\nConverting to haploid...")
    with open(file + ".tped", 'r') as tpedfile:
        oldtped = tpedfile.readlines()

    newtped = []
    progress = progressbar.ProgressBar()
    for idx in progress(range(len(oldtped))):
        cols = oldtped[idx].rstrip().split()
        genotypes = cols[4:]

        # Alleles actually observed at this variant ('0' marks a missing call).
        observed = set(genotypes)
        observed.discard('0')
        if tvonly and (('A' in observed and 'G' in observed)
                       or ('C' in observed and 'T' in observed)):
            continue

        fields = list(cols[0:4])
        for pos in range(0, len(genotypes), 2):
            picked = random.choice([genotypes[pos], genotypes[pos + 1]])
            fields.extend([picked, picked])
        # The newline is joined like any other field, as in the input format
        # written before: each output line ends with " \n".
        fields.append("\n")
        newtped.append(' '.join(fields))

    with open(file + ".h.tped", 'w') as toutfile:
        toutfile.writelines(newtped)
    shutil.copy(file + ".fam", file + ".h.tfam")

    print("\nConverting to bed format...")
    upa_util.bash_command(
        "plink --tfile " + file + ".h --make-bed --out " + file + ".h",
        True, cmdfile, logfile)
def haplogrep_java(invcf, scriptsloc, cmdfile, logfile):
    """
    Run the local HaploGrep 2.1.1 jar on a VCF file.

    The jar is invoked with "--format vcf --phylotree 17" and writes its result
    next to the input, with the input's extension replaced by ".hsd".

    An earlier two-pass variant (write a first HSD, patch in per-sample ranges
    from a region dictionary, then re-run in HSD format) was disabled in this
    function and is not performed.

    :param invcf: Input VCF file.
    :param scriptsloc: Prefix prepended directly to "haplogrep-2.1.1.jar" (no separator is added).
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: None
    """
    filebase = os.path.splitext(invcf)[0]
    hgcmd = ("java -jar " + scriptsloc + "haplogrep-2.1.1.jar --format vcf --in " + invcf
             + " --out " + filebase + ".hsd --phylotree 17")
    upa_util.bash_command(hgcmd, False, cmdfile, logfile)
# ----------------------------- print "\nProcessing input files..." if bampreprocess: if stripchr: print("\nStripping chr") upa_input.stripchr(flist, verbose, cmdfile, logfile) if addreadgroup: print("\nAdding read group...") upa_input.addreadgroup(flist, binloc, verbose, cmdfile, logfile) if samindex: print "\nIndexing..." bar = progressbar.ProgressBar() for i in bar(range(flength)): sample = flist[i] upa_util.bash_command("samtools index " + sample + ".bam", verbose, cmdfile, logfile) if vcf_file: if os.path.isfile(vcf_file): samplevcffile = vcf_file #User submitting a VCF file else: print "ERROR: Cannot find " + vcf_file exit(1) elif callmethod == 'bcf': samplevcffile = upa_input.bcfmpileup(flist, ref, bcname, regionrestrict, diploid, q, threads, cmdfile, logfile) elif callmethod == 'genocaller': samplevcffile = upa_input.genocaller(flist, gcbedfile, bcname, gcindent, ref, regionrestrict, threads, verbose, cmdfile,