def main():
    """Run the command-line probe design pipeline.

    Steps, in order:

    1. Build (or reuse) the Jellyfish k-mer index and the BWA index of
       ``args.genome`` inside ``args.saved``.  If an index already exists
       and ``args.docker`` is false the user is asked whether to rebuild it;
       with ``args.docker`` set no question is asked and the index is
       rebuilt.
    2. Cut every sequence of ``args.input`` into blocks of at most 10 Mb and
       run ``jellyfish.kmerfilterprobe`` on each block in a worker pool.
    3. Write the surviving probes to ``<input>_tmp_probe.fa`` and pass them
       through ``bwa.bwafilter``.
    4. Run ``probefilter`` on every remaining probe in a second worker pool.
    5. Write ``<input>.len``, ``<input>_all.bed`` (every kept probe) and
       ``<input>.bed`` (kept probes thinned with a sliding window).

    Exits with status 1 when the Jellyfish count step reports a failure.
    """

    def ask_rebuild(message):
        """Print *message* and wait for Y/N; return True for 'rebuild'."""
        print(message)
        while True:
            char = getch()
            # Compare case-insensitively: the prompt asks for "Y or N", so
            # an upper-case answer must be honoured as well.
            answer = char.lower()
            if answer in ("y", "n"):
                print(char)
                return answer == "y"

    args = check_options(get_options())

    # Genome size in decimal megabytes, estimated from the file size.
    genomesize = int(os.path.getsize(args.genome) / 1e6)

    # k-mer length follows log4 of the genome size with a floor of 17.
    # Files smaller than 1 MB give genomesize == 0, for which the logarithm
    # is undefined, so use the floor directly in that case.
    if genomesize > 0:
        kmer = max(17, int(log(genomesize, 4) + 1))
    else:
        kmer = 17

    # jellyfish parameters
    lowercount = 2
    jfsize = '100M'

    # sequences longer than this are split into blocks
    spsize = 10000000

    step = args.step
    maxkmerscore = int(args.length * args.homology / 100) - kmer

    # Decide whether the k-mer index has to be (re)built.
    jfkmerfile = os.path.join(
        args.saved,
        os.path.basename(args.genome) + '_' + str(kmer) + 'mer.jf')
    kmerbuild = True
    if os.path.isfile(jfkmerfile) and not args.docker:
        print("find:", jfkmerfile)
        kmerbuild = ask_rebuild(
            "Found kmerfile " + jfkmerfile
            + ". Do you want rebuild it? Press Y or N to continue:")

    # Decide whether the bwa index has to be (re)built.
    bwaindexfile = os.path.basename(args.genome)
    bwatestindex = os.path.join(args.saved, bwaindexfile + '.sa')
    bwaindex = os.path.join(args.saved, bwaindexfile)
    bwabuild = True
    if os.path.isfile(bwatestindex) and not args.docker:
        print('find:', bwatestindex)
        bwabuild = ask_rebuild(
            "Found bwa index file " + bwatestindex
            + ". Do you want rebuild it? Press Y or N to continue:")

    print("genomesize:", genomesize, "kmer:", kmer,
          "jfkmerfile:", jfkmerfile, "kmerbuild:", kmerbuild,
          "bwabuild:", bwabuild, "threads:", args.threads)

    # Build Jellyfish index
    if kmerbuild:
        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer,
                                    infile=args.genome, output=jfkmerfile,
                                    threads=args.threads,
                                    lowercount=lowercount, size=jfsize)
        if jfcount:
            print("JellyFish Count finished ...")
        else:
            print("JellyFish Count Error!!!")
            sys.exit(1)
    else:
        print("Use ", jfkmerfile)

    # Build bwa index
    if bwabuild:
        bwa.bwaindex(args.bwa, args.genome, args.saved)
        print("bwa index build finished ...")
    else:
        print("Use", bwatestindex)

    # One k-mer filter job per sequence block.
    fastain = Fasta(args.input)
    jffpbrunerlist = list()
    for seqname in fastain.keys():
        chrlen = len(fastain[seqname])
        if chrlen < spsize:
            blocks = [(0, chrlen - 1)]
        else:
            blocks = list()
            for i in range(int(chrlen / spsize) + 1):
                start = i * spsize
                end = min(start + spsize - 1, chrlen - 1)
                blocks.append((start, end))
        for start, end in blocks:
            jffpbrunerlist.append(
                jellyfish.JFfpbruner(jfpath=args.jellyfish,
                                     jfkmerfile=jfkmerfile, mer=kmer,
                                     pyfasta=fastain, seqname=seqname,
                                     pblength=args.length,
                                     maxkmerscore=maxkmerscore,
                                     start=start, end=end, step=step))

    jffilteredprobe = list()
    jffinished = 0
    jfpool = Pool(args.threads)
    try:
        for curpblist in jfpool.imap_unordered(jellyfish.kmerfilterprobe,
                                               jffpbrunerlist):
            jffilteredprobe.extend(curpblist)
            jffinished += 1
            print("Jellyfish filter: ", jffinished, '/',
                  len(jffpbrunerlist), sep='')
    finally:
        # Always release the worker processes, also when a job fails.
        jfpool.close()
        jfpool.join()
    print('Jellyfish filter finished!!')

    # Temporary FASTA holding the probes that passed the k-mer filter.
    tmppbfa = os.path.join(args.saved,
                           os.path.basename(args.input) + '_tmp_probe.fa')
    with open(tmppbfa, 'w') as tmppbfaio:
        for seqnum, tmppb in enumerate(jffilteredprobe):
            print('>', 'seq', seqnum, sep='', file=tmppbfaio)
            print(tmppb, file=tmppbfaio)

    # Drop the big list before the alignment step to release memory.
    del jffilteredprobe

    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex,
                                inputfile=tmppbfa, minas=args.length,
                                maxxs=int(args.length * args.homology / 100),
                                threadnumber=args.threads)

    inputbase = os.path.basename(args.input)
    tmpbwaftlist = os.path.join(args.saved, inputbase + '.bed')
    alltmpbwaftlist = os.path.join(args.saved, inputbase + '_all.bed')
    seqlenfile = os.path.join(args.saved, inputbase + '.len')

    # Reference sequence lengths, one "name<TAB>length" line each.
    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)
    with open(seqlenfile, 'w') as seqlenio:
        for seqname in seqlength:
            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    # Job descriptions for probefilter.
    oligobefortmf = [
        {'seq': pbtmp, 'dTm': args.dtm, 'rprimer': args.primer}
        for pbtmp in bwafiltedpb
    ]

    keepedprobe = list()
    ctedpb = 0
    oligobefortmflen = len(oligobefortmf)
    print("oligobefortmflen:", oligobefortmflen)

    pbftpool = Pool()
    try:
        for (pb, keep) in pbftpool.imap_unordered(probefilter,
                                                  oligobefortmf):
            if keep:
                keepedprobe.append(pb)
            ctedpb += 1
            if ctedpb % 10000 == 0:
                print(ctedpb, '/', oligobefortmflen)
    finally:
        pbftpool.close()
        pbftpool.join()

    # Group kept probes as {chromosome: {start: sequence}}.
    pbdictbychr = dict()
    for pb in keepedprobe:
        seq, chro, start = pb.split('\t')
        pbdictbychr.setdefault(chro, dict())[int(start)] = seq

    # Minimum spacing between probes written to the thinned bed file.
    lenrprimer = len(args.primer)
    if lenrprimer == 0:
        lenrprimer = 5
    slidwindow = lenrprimer + args.length

    with open(tmpbwaftlist, 'w') as tmpbwaftlistio, \
            open(alltmpbwaftlist, 'w') as allbwaftlistio:
        for chro in pbdictbychr:
            startn = 0
            for startnow in sorted(pbdictbychr[chro]):
                endnow = startnow + args.length - 1
                print(chro, startnow, endnow, pbdictbychr[chro][startnow],
                      file=allbwaftlistio, sep='\t')
                # Keep a probe only if it starts more than one window after
                # the previously kept one.
                if startnow > startn + slidwindow:
                    startn = startnow
                    print(chro, startnow, endnow,
                          pbdictbychr[chro][startnow],
                          file=tmpbwaftlistio, sep='\t')

    print("Job finshed!!")
def run(self):
    """Run the probe design pipeline in this worker thread.

    Progress is reported through ``self.notifyProgress`` (a number up to
    100) and status text through ``self.notifyMessage``.  The steps are:
    optional Jellyfish count, optional aligner index build, k-mer filtering
    of the input split into blocks of at most 10 Mb, BWA filtering,
    ``probefilter`` on every remaining probe, and finally writing
    ``<input>.len``, ``<input>_all.bed`` and ``<input>.bed`` into
    ``self.samplefolder``.

    If the Jellyfish count step reports a failure the error message is
    emitted and the method returns without running the remaining steps.
    """
    if self.kmerbuild:
        jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath,
                                      mer=self.kmer,
                                      infile=self.genomefile,
                                      output=self.jfkmerfile,
                                      threads=self.threadsnumber,
                                      lowercount=self.lowercount,
                                      size=self.size)
        # check that the jellyfish count ran correctly
        if not jfcounter:
            self.notifyMessage.emit("JellyFish Count Error!!!")
            # Without a k-mer index none of the later steps can produce a
            # valid result, so stop here (the command-line entry point
            # aborts at the same place).
            return
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)
        self.notifyMessage.emit("JellyFish Count finished...")
    else:
        jfcountmess = "Use " + self.jfkmerfile
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)
        self.notifyMessage.emit(jfcountmess)

    if self.indexbuild:
        if self.aligner == 'BWA':
            bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)
            self.notifyMessage.emit("BWA Index build finished...")
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
        elif self.aligner == 'BLAT':
            # TODO: add code for BLAT
            pass
    else:
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)

    # Load the input file; sequences longer than 10M are split into blocks.
    spsize = 10000000
    maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer

    fastain = Fasta(self.inputfile)
    jffpbrunerlist = list()
    for seqname in fastain.keys():
        chrlen = len(fastain[seqname])
        if chrlen < spsize:
            blocks = [(0, chrlen - 1)]
        else:
            blocks = list()
            for i in range(int(chrlen / spsize) + 1):
                start = i * spsize
                end = min(start + spsize - 1, chrlen - 1)
                blocks.append((start, end))
        for start, end in blocks:
            jffpbrunerlist.append(
                jellyfish.JFfpbruner(jfpath=self.jellyfishpath,
                                     jfkmerfile=self.jfkmerfile,
                                     mer=self.kmer,
                                     pyfasta=fastain,
                                     seqname=seqname,
                                     pblength=self.pblength,
                                     maxkmerscore=maxkmerscore,
                                     start=start,
                                     end=end,
                                     step=self.step))

    jffilteredprobe = list()
    jffinished = 0
    for curpblist in self.pool.imap_unordered(jellyfish.kmerfilterprobe,
                                              jffpbrunerlist):
        jffilteredprobe.extend(curpblist)
        # Count the job first so the reported progress includes it.
        jffinished += 1
        tmpprogress = float(
            format(
                self.progressnumber +
                (jffinished / len(jffpbrunerlist) * 40), ".2f"))
        self.notifyProgress.emit(tmpprogress)
        if self.isRunning():
            print("running")
        else:
            print("not running")

    self.notifyMessage.emit('kmer filter finished!!')
    self.progressnumber = 50.0
    self.notifyProgress.emit(self.progressnumber)

    # Temporary FASTA holding the probes that passed the k-mer filter.
    tmppbfa = os.path.join(
        self.samplefolder,
        os.path.basename(self.inputfile) + '_tmp_probes.fa')
    with open(tmppbfa, 'w') as tmppbfaio:
        for seqnum, tmppb in enumerate(jffilteredprobe):
            print('>', 'seq', seqnum, sep='', file=tmppbfaio)
            print(tmppb, file=tmppbfaio)

    # delete jffilteredprobe and release memory
    del jffilteredprobe

    bwaindexfile = os.path.join(self.samplefolder,
                                os.path.basename(self.genomefile))
    bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath,
                                reffile=bwaindexfile,
                                inputfile=tmppbfa,
                                minas=self.pblength,
                                maxxs=int(self.pblength * self.homology / 100),
                                threadnumber=self.threadsnumber)

    inputbase = os.path.basename(self.inputfile)
    tmpbwaftlist = os.path.join(self.samplefolder, inputbase + '.bed')
    alltmpbwaftlist = os.path.join(self.samplefolder,
                                   inputbase + '_all.bed')
    seqlenfile = os.path.join(self.samplefolder, inputbase) + '.len'

    # Reference sequence lengths, one "name<TAB>length" line each.
    seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)
    with open(seqlenfile, 'w') as seqlenio:
        for seqname in seqlength:
            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    # Job descriptions for probefilter.
    oligobefortmf = [
        {'seq': pbtmp, 'dTm': self.dTm, 'rprimer': self.rprimer}
        for pbtmp in bwafiltedpb
    ]

    keepedprobe = list()
    self.progressnumber = 55
    self.notifyProgress.emit(self.progressnumber)
    ctedpb = 0
    oligobefortmflen = len(oligobefortmf)

    for (pb, keep) in self.pool.imap_unordered(probefilter, oligobefortmf):
        if keep:
            keepedprobe.append(pb)
        ctedpb += 1
        if ctedpb % 10000 == 0:
            tmpprogress = float(
                format(
                    self.progressnumber +
                    (ctedpb / oligobefortmflen * 30), ".2f"))
            self.notifyProgress.emit(tmpprogress)

    self.notifyProgress.emit(90)

    # load kept probes as {chromosome: {start: sequence}}
    pbdictbychr = dict()
    for pb in keepedprobe:
        seq, chro, start = pb.split('\t')
        pbdictbychr.setdefault(chro, dict())[int(start)] = seq

    # length of the primer decides the minimum spacing between probes
    lenrprimer = len(self.rprimer)
    if lenrprimer == 0:
        lenrprimer = 5
    slidwindow = lenrprimer + self.pblength

    with open(tmpbwaftlist, 'w') as tmpbwaftlistio, \
            open(alltmpbwaftlist, 'w') as allbwaftlistio:
        for chro in pbdictbychr:
            startn = 0
            for startnow in sorted(pbdictbychr[chro]):
                endnow = startnow + self.pblength - 1
                print(chro, startnow, endnow, pbdictbychr[chro][startnow],
                      file=allbwaftlistio, sep='\t')
                # Keep a probe only if it starts more than one window after
                # the previously kept one.
                if startnow > startn + slidwindow:
                    startn = startnow
                    print(chro, startnow, endnow,
                          pbdictbychr[chro][startnow],
                          file=tmpbwaftlistio, sep='\t')

    # The temporary FASTA file (tmppbfa) is intentionally left on disk.
    self.notifyProgress.emit(100)
    self.notifyMessage.emit('all finished!!')
def run(self):
    """Execute the whole probe design workflow inside the worker thread.

    ``self.notifyProgress`` receives a progress number (ending at 100) and
    ``self.notifyMessage`` receives status text.  Workflow: optional
    Jellyfish count, optional aligner index build, k-mer filtering of the
    input (sequences are processed in blocks of at most 10 Mb), BWA
    filtering, ``probefilter`` on each remaining probe, then writing
    ``<input>.len``, ``<input>_all.bed`` and ``<input>.bed`` to
    ``self.samplefolder``.

    When the Jellyfish count step reports a failure, the error message is
    emitted and the method returns immediately.
    """
    if self.kmerbuild:
        jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath,
                                      mer=self.kmer,
                                      infile=self.genomefile,
                                      output=self.jfkmerfile,
                                      threads=self.threadsnumber,
                                      lowercount=self.lowercount,
                                      size=self.size)
        # check that the jellyfish count ran correctly
        if not jfcounter:
            self.notifyMessage.emit("JellyFish Count Error!!!")
            # The following steps all need the k-mer index, so there is
            # nothing useful left to do (the command-line entry point also
            # aborts on this error).
            return
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)
        self.notifyMessage.emit("JellyFish Count finished...")
    else:
        jfcountmess = "Use " + self.jfkmerfile
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)
        self.notifyMessage.emit(jfcountmess)

    if self.indexbuild:
        if self.aligner == 'BWA':
            bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)
            self.notifyMessage.emit("BWA Index build finished...")
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
        elif self.aligner == 'BLAT':
            # TODO: add code for BLAT
            pass
    else:
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)

    # Load the input file; sequences longer than 10M are split into blocks.
    spsize = 10000000
    maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer

    fastain = Fasta(self.inputfile)
    jffpbrunerlist = list()
    for seqname in fastain.keys():
        chrlen = len(fastain[seqname])
        if chrlen < spsize:
            blocks = [(0, chrlen - 1)]
        else:
            blocks = list()
            for i in range(int(chrlen / spsize) + 1):
                start = i * spsize
                end = min(start + spsize - 1, chrlen - 1)
                blocks.append((start, end))
        for start, end in blocks:
            jffpbrunerlist.append(
                jellyfish.JFfpbruner(jfpath=self.jellyfishpath,
                                     jfkmerfile=self.jfkmerfile,
                                     mer=self.kmer,
                                     pyfasta=fastain,
                                     seqname=seqname,
                                     pblength=self.pblength,
                                     maxkmerscore=maxkmerscore,
                                     start=start,
                                     end=end,
                                     step=self.step))

    jffilteredprobe = list()
    jffinished = 0
    for curpblist in self.pool.imap_unordered(jellyfish.kmerfilterprobe,
                                              jffpbrunerlist):
        jffilteredprobe.extend(curpblist)
        # Count the job first so the reported progress includes it.
        jffinished += 1
        tmpprogress = float(format(
            self.progressnumber + (jffinished / len(jffpbrunerlist) * 40),
            ".2f"))
        self.notifyProgress.emit(tmpprogress)
        if self.isRunning():
            print("running")
        else:
            print("not running")

    self.notifyMessage.emit('jelly fish finished!!')
    self.progressnumber = 50.0
    self.notifyProgress.emit(self.progressnumber)

    # Temporary FASTA holding the probes that passed the k-mer filter.
    tmppbfa = os.path.join(
        self.samplefolder,
        os.path.basename(self.inputfile) + '_tmp_probes.fa')
    with open(tmppbfa, 'w') as tmppbfaio:
        for seqnum, tmppb in enumerate(jffilteredprobe):
            print('>', 'seq', seqnum, sep='', file=tmppbfaio)
            print(tmppb, file=tmppbfaio)

    # delete jffilteredprobe and release memory
    del jffilteredprobe

    bwaindexfile = os.path.join(self.samplefolder,
                                os.path.basename(self.genomefile))
    bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath,
                                reffile=bwaindexfile,
                                inputfile=tmppbfa,
                                minas=self.pblength,
                                maxxs=int(self.pblength * self.homology / 100),
                                threadnumber=self.threadsnumber)

    inputbase = os.path.basename(self.inputfile)
    tmpbwaftlist = os.path.join(self.samplefolder, inputbase + '.bed')
    alltmpbwaftlist = os.path.join(self.samplefolder,
                                   inputbase + '_all.bed')
    seqlenfile = os.path.join(self.samplefolder, inputbase) + '.len'

    # Reference sequence lengths, one "name<TAB>length" line each.
    seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)
    with open(seqlenfile, 'w') as seqlenio:
        for seqname in seqlength:
            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    # Job descriptions for probefilter.
    oligobefortmf = [
        {'seq': pbtmp, 'dTm': self.dTm, 'rprimer': self.rprimer}
        for pbtmp in bwafiltedpb
    ]

    keepedprobe = list()
    self.progressnumber = 55
    self.notifyProgress.emit(self.progressnumber)
    ctedpb = 0
    oligobefortmflen = len(oligobefortmf)

    for (pb, keep) in self.pool.imap_unordered(probefilter, oligobefortmf):
        if keep:
            keepedprobe.append(pb)
        ctedpb += 1
        if ctedpb % 10000 == 0:
            tmpprogress = float(format(
                self.progressnumber + (ctedpb / oligobefortmflen * 30),
                ".2f"))
            self.notifyProgress.emit(tmpprogress)

    self.notifyProgress.emit(90)

    # load kept probes as {chromosome: {start: sequence}}
    pbdictbychr = dict()
    for pb in keepedprobe:
        seq, chro, start = pb.split('\t')
        pbdictbychr.setdefault(chro, dict())[int(start)] = seq

    # length of the primer decides the minimum spacing between probes
    lenrprimer = len(self.rprimer)
    if lenrprimer == 0:
        lenrprimer = 5
    slidwindow = lenrprimer + self.pblength

    with open(tmpbwaftlist, 'w') as tmpbwaftlistio, \
            open(alltmpbwaftlist, 'w') as allbwaftlistio:
        for chro in pbdictbychr:
            startn = 0
            for startnow in sorted(pbdictbychr[chro]):
                endnow = startnow + self.pblength - 1
                print(chro, startnow, endnow, pbdictbychr[chro][startnow],
                      file=allbwaftlistio, sep='\t')
                # Keep a probe only if it starts more than one window after
                # the previously kept one.
                if startnow > startn + slidwindow:
                    startn = startnow
                    print(chro, startnow, endnow,
                          pbdictbychr[chro][startnow],
                          file=tmpbwaftlistio, sep='\t')

    # The temporary FASTA file (tmppbfa) is intentionally left on disk.
    self.notifyProgress.emit(100)
    self.notifyMessage.emit('all finished!!')
def main():
    """Run the command-line probe design pipeline (ploidy-aware version).

    Steps, in order:

    1. Build (or reuse) the Jellyfish k-mer index and the BWA index of
       ``args.genome`` inside ``args.saved``.  If an index already exists
       and ``args.docker`` is false the user is asked whether to rebuild it;
       with ``args.docker`` set no question is asked and the index is
       rebuilt.
    2. Filter candidate probes of ``args.input`` by k-mer score in a worker
       pool.  Sequences are processed in blocks of at most 10 Mb.  When the
       genome file is 1000 MB or larger the input is first split with
       ``spgenome.spgenome`` and each resulting file is processed in turn.
    3. Write the surviving probes to ``<input>_tmp_probe.fa`` and pass them
       through ``bwa.bwafilter``.
    4. Run ``probefilter`` on every remaining probe in a second worker pool.
    5. Write ``<input>.len``, ``<input>_all.bed`` (every kept probe) and
       ``<input>.bed`` (kept probes thinned with a sliding window).

    Exits with status 1 when the Jellyfish count step reports a failure.
    """

    def ask_rebuild(message):
        """Print *message* and wait for Y/N; return True for 'rebuild'."""
        print(message)
        while True:
            char = getch()
            # Compare case-insensitively: the prompt asks for "Y or N", so
            # an upper-case answer must be honoured as well.
            answer = char.lower()
            if answer in ("y", "n"):
                print(char)
                return answer == "y"

    def build_jobs(fasta):
        """Return one k-mer filter job per <=10 Mb block of *fasta*."""
        jobs = list()
        for seqname in fasta.keys():
            chrlen = len(fasta[seqname])
            if chrlen < spsize:
                blocks = [(0, chrlen - 1)]
            else:
                blocks = list()
                for i in range(int(chrlen / spsize) + 1):
                    start = i * spsize
                    end = min(start + spsize - 1, chrlen - 1)
                    blocks.append((start, end))
            for start, end in blocks:
                jobs.append(
                    jellyfish.JFfpbruner(jfpath=args.jellyfish,
                                         jfkmerfile=jfkmerfile, mer=kmer,
                                         pyfasta=fasta, seqname=seqname,
                                         pblength=args.length,
                                         maxkmerscore=maxkmerscore,
                                         start=start, end=end, step=step))
        return jobs

    def run_kmer_filter(pool, fastafile, prefix):
        """Filter *fastafile* in *pool*; extend jffilteredprobe in place."""
        jobs = build_jobs(Fasta(fastafile))
        finished = 0
        print(len(jobs))
        for curpblist in pool.imap_unordered(jellyfish.kmerfilterprobe,
                                             jobs):
            jffilteredprobe.extend(curpblist)
            finished += 1
            print(prefix + "Jellyfish filter: ", finished, '/', len(jobs),
                  sep='')

    args = check_options(get_options())

    # Genome size in decimal megabytes, estimated from the file size.
    genomesize = int(os.path.getsize(args.genome) / 1e6)

    # k-mer length follows log4 of the genome size with a floor of 17.
    # Files smaller than 1 MB give genomesize == 0, for which the logarithm
    # is undefined, so use the floor directly in that case.
    if genomesize > 0:
        kmer = max(17, int(log(genomesize, 4) + 1))
    else:
        kmer = 17

    # jellyfish parameters
    lowercount = 2
    jfsize = '100M'

    # sequences longer than this are split into blocks
    spsize = 10000000

    step = args.step

    # Allowed k-mer score scales with ploidy/2; +0.5 rounds to nearest.
    maxkmerscore = int(
        ((args.length * args.homology / 100) - kmer) * args.ploidy / 2 + 0.5)

    # Decide whether the k-mer index has to be (re)built.
    jfkmerfile = os.path.join(
        args.saved,
        os.path.basename(args.genome) + '_' + str(kmer) + 'mer.jf')
    kmerbuild = True
    if os.path.isfile(jfkmerfile) and not args.docker:
        print("find:", jfkmerfile)
        kmerbuild = ask_rebuild(
            "Found kmerfile " + jfkmerfile
            + ". Do you want rebuild it? Press Y or N to continue:")

    # Decide whether the bwa index has to be (re)built.
    bwaindexfile = os.path.basename(args.genome)
    bwatestindex = os.path.join(args.saved, bwaindexfile + '.sa')
    bwaindex = os.path.join(args.saved, bwaindexfile)
    bwabuild = True
    if os.path.isfile(bwatestindex) and not args.docker:
        print('find:', bwatestindex)
        bwabuild = ask_rebuild(
            "Found bwa index file " + bwatestindex
            + ". Do you want rebuild it? Press Y or N to continue:")

    print("genomesize:", genomesize, "kmer:", kmer,
          "jfkmerfile:", jfkmerfile, "kmerbuild:", kmerbuild,
          "bwabuild:", bwabuild, "threads:", args.threads)

    # Build Jellyfish index
    if kmerbuild:
        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer,
                                    infile=args.genome, output=jfkmerfile,
                                    threads=args.threads,
                                    lowercount=lowercount, size=jfsize)
        if jfcount:
            print("JellyFish Count finished ...")
        else:
            print("JellyFish Count Error!!!")
            sys.exit(1)
    else:
        print("Use ", jfkmerfile)

    # Build bwa index
    if bwabuild:
        bwa.bwaindex(args.bwa, args.genome, args.saved)
        print("bwa index build finished ...")
    else:
        print("Use", bwatestindex)

    jffilteredprobe = list()
    jfpool = Pool(args.threads)
    try:
        if genomesize < 1000:
            run_kmer_filter(jfpool, args.input, "")
        else:
            # split the fasta file when the genome is larger than 1 Gb
            print("genome size > 1G")
            subFas = spgenome.spgenome(args.input, args.saved)
            for subFafile in subFas:
                print(subFafile)
                run_kmer_filter(jfpool, subFafile, subFafile + " ")
    finally:
        # Always release the worker processes, also when a job fails.
        jfpool.close()
        jfpool.join()
    print('Jellyfish filter finished!!')

    # Temporary FASTA holding the probes that passed the k-mer filter.
    tmppbfa = os.path.join(args.saved,
                           os.path.basename(args.input) + '_tmp_probe.fa')
    with open(tmppbfa, 'w') as tmppbfaio:
        for seqnum, tmppb in enumerate(jffilteredprobe):
            print('>', 'seq', seqnum, sep='', file=tmppbfaio)
            print(tmppb, file=tmppbfaio)

    # Drop the big list before the alignment step to release memory.
    del jffilteredprobe

    print("run bwafilter")
    print("maxxs:", int(args.length * args.homology / 100))
    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex,
                                inputfile=tmppbfa, minas=args.length,
                                maxxs=int(args.length * args.homology / 100),
                                threadnumber=args.threads)
    print("bwafiltedpb len", len(bwafiltedpb))
    print(bwafiltedpb[0:10])

    inputbase = os.path.basename(args.input)
    tmpbwaftlist = os.path.join(args.saved, inputbase + '.bed')
    alltmpbwaftlist = os.path.join(args.saved, inputbase + '_all.bed')
    seqlenfile = os.path.join(args.saved, inputbase + '.len')

    # Reference sequence lengths, one "name<TAB>length" line each.
    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)
    with open(seqlenfile, 'w') as seqlenio:
        for seqname in seqlength:
            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    # Job descriptions for probefilter.
    oligobefortmf = [
        {'seq': pbtmp, 'dTm': args.dtm, 'rprimer': args.primer}
        for pbtmp in bwafiltedpb
    ]

    keepedprobe = list()
    ctedpb = 0
    oligobefortmflen = len(oligobefortmf)
    print("oligobefortmflen:", oligobefortmflen)

    pbftpool = Pool()
    try:
        for (pb, keep) in pbftpool.imap_unordered(probefilter,
                                                  oligobefortmf):
            if keep:
                keepedprobe.append(pb)
            ctedpb += 1
            if ctedpb % 10000 == 0:
                print(ctedpb, '/', oligobefortmflen)
    finally:
        pbftpool.close()
        pbftpool.join()

    # Group kept probes as {chromosome: {start: sequence}}.
    pbdictbychr = dict()
    for pb in keepedprobe:
        seq, chro, start = pb.split('\t')
        pbdictbychr.setdefault(chro, dict())[int(start)] = seq

    # Minimum spacing between probes written to the thinned bed file.
    lenrprimer = len(args.primer)
    if lenrprimer == 0:
        lenrprimer = 5
    slidwindow = lenrprimer + args.length

    with open(tmpbwaftlist, 'w') as tmpbwaftlistio, \
            open(alltmpbwaftlist, 'w') as allbwaftlistio:
        for chro in pbdictbychr:
            startn = 0
            for startnow in sorted(pbdictbychr[chro]):
                endnow = startnow + args.length - 1
                print(chro, startnow, endnow, pbdictbychr[chro][startnow],
                      file=allbwaftlistio, sep='\t')
                # Keep a probe only if it starts more than one window after
                # the previously kept one.
                if startnow > startn + slidwindow:
                    startn = startnow
                    print(chro, startnow, endnow,
                          pbdictbychr[chro][startnow],
                          file=tmpbwaftlistio, sep='\t')

    print("Job finshed!!")
def main():
    """Build per-sample consensus probes and a cross-sample consensus table.

    Steps, in order:

    1. Reuse the BWA index of ``args.genome`` in ``args.tmp`` when its
       ``.sa`` file exists, otherwise build it.
    2. For every sample (comma-separated ``args.names`` / ``args.reads1`` /
       ``args.reads2``): align the read pair with ``bwa.bwamem_paired``,
       write a minimum-depth bed file, a bcf file and a Jellyfish count of
       the reads.
    3. Keep only the probes of ``args.probe`` that intersect the
       minimum-depth regions of every sample.
    4. For every sample, obtain the consensus sequence of each probe via
       ``bcftools.probestrtoconsensus``; consensus sequences whose length
       differs from ``args.length`` go to ``<name>_indel_probe.txt``, those
       containing ``N`` are dropped, the rest are scored with
       ``jellyfish.jfquerylist`` and written to ``<name>_probe.txt`` when
       the score lies between ``args.minkmer`` and the 0.9 quantile of the
       sample's scores.
    5. Write ``<names>_cns_probe.csv`` with one row per probe that is
       present for all samples with an equal-length consensus sequence.

    Raises:
        ValueError: if ``args.names``, ``args.reads1`` and ``args.reads2``
            do not contain the same number of comma-separated entries.
    """
    args = check_options(get_options())

    # jellyfish hash size
    jfsize = '100M'

    # Reuse the bwa index when its .sa file is already present.
    bwaindexfile = os.path.basename(args.genome)
    tmpfolder = args.tmp
    bwatestindex = os.path.join(tmpfolder, bwaindexfile + '.sa')
    bwaindex = os.path.join(tmpfolder, bwaindexfile)

    if os.path.isfile(bwatestindex):
        print("Use", bwatestindex)
    else:
        bwa.bwaindex(args.bwa, args.genome, tmpfolder)
        print("bwa index build finished ...")

    names = args.names.split(',')
    reads1 = args.reads1.split(',')
    reads2 = args.reads2.split(',')

    # Fail before any long-running alignment starts instead of hitting an
    # IndexError half-way through the sample list.
    if not (len(names) == len(reads1) == len(reads2)):
        raise ValueError(
            "names, reads1 and reads2 must have the same number of entries "
            "(got %d, %d and %d)" % (len(names), len(reads1), len(reads2)))

    cnsfile = os.path.join(args.saved, '_'.join(names) + '_cns_probe.csv')
    print(cnsfile)

    sampleinfor = dict()
    for name, read1, read2 in zip(names, reads1, reads2):
        if name in sampleinfor:
            # A repeated sample name is reported and not processed again.
            print("error same name:", name)
            continue

        bamfile = os.path.join(tmpfolder, name + '.bam')
        bcffile = os.path.join(tmpfolder, name + '.bcf')
        jffile = os.path.join(tmpfolder, name + '.jf')
        cnsprobe = os.path.join(args.saved, name + '_probe.txt')
        indelNprobe = os.path.join(args.saved, name + '_indel_probe.txt')
        mindepth = os.path.join(tmpfolder, name + '_mindepth.bed')

        sampleinfor[name] = {
            'read1': read1,
            'read2': read2,
            'bamfile': bamfile,
            'bcffile': bcffile,
            'jffile': jffile,
            'cnsprobe': cnsprobe,
            'indelNprobe': indelNprobe,
            'indelNprobelist': list(),
            'mindepth': mindepth,
        }

        # run bwa mem
        bwa.bwamem_paired(bwabin=args.bwa, samtoolsbin=args.samtools,
                          reffile=bwaindex, outfile=bamfile,
                          inputfile1=read1, inputfile2=read2,
                          samplename=name, threadnumber=args.threads)
        print("bwa mem", name, 'finished')

        # get min depth bed file
        bamdepth.bamdepthtobed(bamfile=bamfile, outbed=mindepth,
                               mindepth=args.mindepth, minlength=200)
        print(mindepth, 'done')

        # generate bcf file from bam file
        bcftools.bamtobcf(bcfbin=args.bcftools, reffile=bwaindex,
                          bamfile=bamfile, outbcf=bcffile)
        print(bcffile, "done")

        # generate jf file from the reads
        jellyfish.makegenerator(filenames=[read1, read2], type='gz',
                                generators='generators')
        jellyfish.jfgeneratorscount(jfpath=args.jellyfish, mer=args.length,
                                    output=jffile, generators='generators',
                                    threads=args.threads, size=jfsize)
        print(jffile, "done")

    # Keep probes that overlap the min-depth regions of every sample.
    probe = BedTool(args.probe).sort()
    for name in sampleinfor:
        nowprobe = BedTool(sampleinfor[name]['mindepth']).sort()
        probe = probe.intersect(nowprobe, wa=True, u=True)

    # The probe lines are the same for every sample; read them once.
    probestrlist = [str(i).rstrip() for i in probe]

    for name in sampleinfor:
        sample = sampleinfor[name]

        bcfrunerlist = [
            bcftools.BcfConsensusRuner(probestr=probestr,
                                       bcftoolspath=args.bcftools,
                                       bcffile=sample['bcffile'],
                                       sample=name)
            for probestr in probestrlist
        ]

        consensusprobelist = list()
        reslist = list()
        bcfpool = Pool(args.threads)
        try:
            for res in bcfpool.imap_unordered(bcftools.probestrtoconsensus,
                                              bcfrunerlist):
                if len(res['consensusprobe']) != args.length:
                    # length changed: reported separately as indel probe
                    sample['indelNprobelist'].append(res)
                elif 'N' in res['consensusprobe']:
                    continue
                else:
                    consensusprobelist.append(res['consensusprobe'])
                    reslist.append(res)
        finally:
            # Always release the worker processes, also when a job fails.
            bcfpool.close()
            bcfpool.join()

        consensusprobekmerscore = jellyfish.jfquerylist(
            jfkmerfile=sample['jffile'], jfpath=args.jellyfish,
            seqlist=consensusprobelist)

        # Each returned entry is "sequence,score".
        kmerscoredict = dict()
        kmerscorelist = list()
        for score in consensusprobekmerscore:
            (subseq, kmerscore) = score.split(',')
            if 'N' not in subseq:
                kmerscoredict[subseq] = int(kmerscore)
                kmerscorelist.append(int(kmerscore))

        # Upper score limit is the 0.9 quantile of this sample's scores.
        maxkmer = pd.Series(kmerscorelist).quantile(0.9)
        minkmer = args.minkmer

        with open(sample['cnsprobe'], 'w') as cnsprobeio:
            for consensusprobe in reslist:
                probestr = consensusprobe['probestr']
                consensusprobeseq = consensusprobe['consensusprobe']
                if consensusprobeseq not in kmerscoredict:
                    continue
                probescore = kmerscoredict[consensusprobeseq]
                if probescore <= maxkmer and probescore >= minkmer:
                    print(probestr, consensusprobeseq, probescore,
                          sep='\t', file=cnsprobeio)

        with open(sample['indelNprobe'], 'w') as indelNprobeio:
            for res in sample['indelNprobelist']:
                print(res['probestr'], name, res['consensusprobe'],
                      sep='\t', file=indelNprobeio)

    # Collect the per-sample probe lines keyed by original probe sequence
    # (4th tab-separated field): {probe sequence: {sample name: line}}.
    probdict = dict()
    for name in sampleinfor:
        with open(sampleinfor[name]['cnsprobe']) as inio:
            for infor in inio:
                infor = infor.rstrip()
                orgprb = infor.split('\t')[3]
                probdict.setdefault(orgprb, dict())[name] = infor

    with open(cnsfile, 'w') as cnsio:
        print('chrom', 'start', 'end', 'refseq', ','.join(sampleinfor),
              'consensusprobe', 'consensusscore', 'consensussite',
              'consensusdiff', sep=',', file=cnsio)

        for orgprb in probdict:
            # Only probes that were kept for every sample are reported.
            if len(probdict[orgprb]) != len(sampleinfor):
                continue

            first_value = next(iter(probdict[orgprb].values())).split('\t')
            outinfo = first_value[0:3]

            probelist = [orgprb]
            for name in sampleinfor:
                infor = probdict[orgprb][name].split('\t')
                # second-to-last field is the sample's consensus sequence
                speciesprobe = infor[-2]
                if len(speciesprobe) == len(orgprb):
                    probelist.append(speciesprobe)

            # Reference plus one equal-length sequence per sample required.
            if len(probelist) == len(sampleinfor) + 1:
                res = probecompare.getconsensusprobe(probelist)
                outinfo.extend(probelist)
                print(','.join(outinfo), res['consensusprobe'],
                      res['consensusscore'], res['consensussite'],
                      res['consensusdiff'], sep=',', file=cnsio)

    print("finished")