def producer(info):
    try:
        inputdata = "%s.fasta" % (str(info[0]))
        consensus = str(info[1])
        seqs = [">%s\n%s" % (str(i), str(s))
                for i, s in zip(info[2].split("\t"), info[3].split("\t"))]
        with open(inputdata, "w") as o:
            print >> o, ">%(seqID)s\n%(seq)s" % dict(seqID=CONSENSUS_NAME, seq=consensus)
            print >> o, "%s" % ("\n".join(seqs))
        cline = """trimal -in %s -fasta -gt 0.8 -st 0.001 -cons 60 -colnumbering""" % (inputdata)
        child = subprocess.Popen(str(cline),
                                 stdout=subprocess.PIPE,
                                 universal_newlines=True,
                                 shell=(sys.platform != "win32"))
        sout, serr = child.communicate()
        removeFiles([inputdata])
        sout = filter(None, sout.splitlines())  # strip empty lines
        fasta = "\n".join(sout[:-1])
        log = sout[-1]
        return fasta, log, info[0]
    except:
        return None
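# Note: most snippets in this listing call a removeFiles() helper that is not
# shown here (the genGo() snippet further down uses a different utils.removeFiles
# that takes a directory plus glob patterns). A minimal sketch, assuming the
# helper simply unlinks each path and ignores files that are already gone; the
# real implementation may differ:
import os

def removeFiles(paths):
    # best-effort cleanup of temporary files
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            pass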
def samToBamBuffers(samdat, fprefix):
    cline = """samtools view -bS -"""
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"),
                             close_fds=True)
    sout, serr = child.communicate(samdat)
    bamout = "%s.tmp.bam" % (fprefix)
    cline = """samtools sort - -o %s.tmp""" % (fprefix)
    child2 = subprocess.Popen(str(cline),
                              stdin=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              shell=(sys.platform != "win32"),
                              close_fds=True)
    bamdat, berr = child2.communicate(sout)
    with open(bamout, "wb") as o:
        o.write(bamdat)
    os.system("""samtools index %s > /dev/null 2> /dev/null""" % (bamout))
    bamidxdat = open("%s.bai" % (bamout), "rb").read()
    removeFiles([bamout, "%s.tmp.bam.bai" % (fprefix)])
    return bamdat, bamidxdat
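# A minimal usage sketch for samToBamBuffers(), assuming samtools is on PATH and
# "example.sam" is an existing SAM file with a header (the file names here are
# illustrative only):
#
#     samdat = open("example.sam", "rb").read()
#     bamdat, bamidxdat = samToBamBuffers(samdat, "example")
#     with open("example.bam", "wb") as o:
#         o.write(bamdat)
#     with open("example.bam.bai", "wb") as o:
#         o.write(bamidxdat)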
def run(self):
    samout = "%s.sam" % (self.name)
    bamout = "%s.bam" % (self.name)
    msaInfo = self.parseMSA(StringIO.StringIO(self.data))
    refs = [(r.id, len(r.seq)) for r in SeqIO.parse(StringIO.StringIO(self.data), "fasta")]
    readgroups, refIDToReadgroup, groupdata = self.generateReadGroups(msaInfo.keys())
    header = dict(HD=dict(VN='1.0'),
                  SQ=[{'LN': refs[0][1], 'SN': refs[0][0]}],
                  RG=readgroups)
    outfile = pysam.Samfile(samout, "wh", header=header)
    for refName, refVals in msaInfo.iteritems():
        isReversed = True
        if len(refVals) == 3:
            isReversed = False
        samid = os.path.join(self.samdir, "%s.bam" % refName)
        samfile = pysam.Samfile(samid, "rb")
        coverage = self.processSingleRef(samfile, refName, refVals[0], refVals[1], refVals[2],
                                         isReversed, outfile, header, refIDToReadgroup[refName])
        groupdata[refName].append(coverage)
        samfile.close()
    outfile.close()
    # need it in memory anyways...
    samdat = open(samout, "rb").read()
    cline = """samtools view -bS -"""
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"),
                             close_fds=True)
    sout, serr = child.communicate(samdat)
    cline = """samtools sort - -o %s""" % (self.name)
    child2 = subprocess.Popen(str(cline),
                              stdin=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              shell=(sys.platform != "win32"),
                              close_fds=True)
    bamdat, berr = child2.communicate(sout)
    with open(bamout, "wb") as o:
        o.write(bamdat)
    os.system("""samtools index %s > /dev/null 2> /dev/null""" % (bamout))
    bamidxdat = open("%s.bai" % (bamout), "rb").read()
    newcon = self.updateConsensus(bamout)
    if len(newcon) != int(refs[0][1]):
        print >> sys.stderr, "Con length mismatch : %s" % (bamout)
        print >> sys.stderr, self.data
    removeFiles([samout, bamout, "%s.bai" % (bamout)])
    return samdat, bamdat, bamidxdat, self.name, groupdata.values(), newcon
def find_shared_regions(args):
    tmpname = mkstemp(dir=".")
    os.close(tmpname[0])
    tmpname = tmpname[1]
    qaction = ""
    if args.quiet:
        qaction = " > /dev/null 2> /dev/null "
    cline = USRCH % dict(input=args.input1,
                         database=args.input2,
                         output=tmpname,
                         threads=args.threads,
                         tail=qaction)
    #print cline
    child = subprocess.Popen(str(cline), shell=(sys.platform != "win32"))
    child.wait()
    data = [l.strip().split() for l in open(tmpname)]
    removeFiles([tmpname])
    return data
def genGo(self):
    self.genPbBin()
    os.chdir(XlsToolDir)
    # generate xxx.pb.go
    os.system("protoc --version")
    os.system("protoc -I . --go_out=. ./*.proto")
    # move the generated files to the destination folder
    DstGoPbDir = os.path.join(SelfPath, "../gen")
    utils.moveFiles(XlsToolDir, DstGoPbDir, ["*.pb.go"])
    utils.moveFiles(XlsToolDir, DstGoPbDir, ["*.bytes"])
    # clean up leftover files
    utils.removeFiles(XlsToolDir, ["*_pb2.py", "*.pyc", "*.log", "*.txt", "*.proto"])
def producer(info):
    fileidx = str(info[0])
    inputdata = "%s.con.fasta" % (fileidx)
    consensus = str(info[1])
    with open(inputdata, "w") as o:
        print >> o, ">%(seqID)s\n%(seq)s" % dict(seqID=CONSENSUS_NAME, seq=consensus)
    os.system("""samtools faidx %s""" % (inputdata))
    bamfile = "%s.bam" % (fileidx)
    # write one BAM per read group by filtering the SAM stream with samtools view -r,
    # and record the per-group BAM names in a list file
    for g in info[3].split("\t"):
        cline = """samtools view -bhr "%s" - > %s_%s""" % (g, g, bamfile)
        child = subprocess.Popen(str(cline),
                                 stdin=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=(sys.platform != "win32"),
                                 close_fds=True)
        child.communicate(info[2])
        with open("%s.ids" % (fileidx), "a") as o:
            print >> o, "%s_%s" % (g, bamfile)
    # run varscan.sh over the per-group BAM list against the consensus reference
    cline = """varscan.sh %s %s %s 2> /dev/null""" % ("%s.ids" % (fileidx), bamfile, inputdata)
    child = subprocess.Popen(str(cline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"),
                             close_fds=True)
    dat, err = child.communicate()
    # map each read group name to its sample column from the VCF header line
    hdr = []
    for l in StringIO.StringIO(dat):
        if l[0] == '#' and l[1] != '#':
            hdr = [(k, v) for k, v in zip(info[3].split("\t"), l.split()[9:])]
            break
    removeFiles([inputdata, "%s.fai" % (inputdata)])
    return fileidx, dat, hdr
def producer(info):
    fileid = str(info[0])
    try:
        vcfInput = vcf.Reader(StringIO.StringIO(info[1]))
    except:
        return None
    line = None
    try:
        line = vcfInput.next()
    except:
        return None
    if not line:
        return None
    bamfile = "%s.bam" % (fileid)
    bamidxfile = "%s.bam.bai" % (fileid)
    with open(bamfile, "wb") as o:
        o.write(info[2])
    with open(bamidxfile, "wb") as o:
        o.write(info[3])
    vcfInput = vcf.Reader(StringIO.StringIO(info[1]))
    vcfohndl = StringIO.StringIO()
    vcfOutput = vcf.Writer(vcfohndl, vcfInput)
    jsonhndl = StringIO.StringIO()
    data = computeData(vcfInput, vcfOutput, bamfile, 0)
    json.dump(data, jsonhndl, separators=(',', ':'))
    jsonhndl.flush()
    jsonstr = jsonhndl.getvalue()
    jsonhndl.close()
    vcfohndl.flush()
    modvcf = vcfohndl.getvalue()
    vcfohndl.close()
    removeFiles([bamfile, bamidxfile])
    return info[0], modvcf, jsonstr
def producer(info):
    fileidx = str(info[0])
    consensus = ">%(seqID)s\n%(seq)s\n" % dict(seqID=CONSENSUS_NAME, seq=str(info[2]))
    seqs = [">%s\n%s" % (str(i), str(s))
            for i, s in zip(info[3].split("\t"), info[4].split("\t"))]
    fastafile = StringIO.StringIO(consensus + "\n".join(seqs))
    bamfile = "%s.bam" % (fileidx)
    bamidxfile = "%s.bam.bai" % (fileidx)
    with open(bamfile, "wb") as o:
        o.write(info[5])
    with open(bamidxfile, "wb") as o:
        o.write(info[6])
    refs = [(r.id, len(r.seq)) for r in SeqIO.parse(fastafile, "fasta")]
    sfile = pysam.Samfile(bamfile)
    hdr = sfile.header.copy()
    hdr['SQ'] = [{'LN': refs[0][1], 'SN': refs[0][0]}]
    total = 0
    indices = eval("[" + info[1].replace("#ColumnsMap", "") + "]")
    #indices = eval("[" + info[1] + "]")
    maxidx = max(indices) + 1
    # shift[i] = number of trimmed (removed) alignment columns before position i
    shift = [0] * (maxidx)
    for i in xrange(len(shift)):
        shift[i] = total
        if i not in indices:
            total += 1
    outfile = pysam.Samfile("%s.trim.sam" % (fileidx), "wh", header=hdr)
    newcoverage = defaultdict(int)
    # DSL 20160113 -- Since the outfile already has the header, a None check won't work.
    # Use a boolean flag for now to verify we have sequences.
    hasSeqs = False
    for read in sfile.fetch():
        newseq = ""
        newcig = ""
        newqual = ""
        # mahdi: causing the program to crash. Cause still unknown
        if read.cigarstring == None:
            # DLS 20160113 -- instead of tossing the entire sam file out, just skip the bad sequence...
            #break;
            continue
        # DLS 20160113 -- added to notify later steps that the SAM file we are building does in fact contain at least 1 sequence.
        hasSeqs = True
        cigar = expandCigar(read.cigarstring)
        back = 0
        pos = read.pos
        for idx, c in enumerate(cigar):
            # stop if we exceed the reference length
            if pos >= maxidx:
                break
            pos += 1
            # only process bases that we haven't removed in cleanup;
            # also, count any D's that we missed to shift us back in the query and qual
            if (idx + read.pos) not in indices:
                # an idx not in indices indicates that trimal has removed this particular position
                if cigar[idx].upper() == 'D':
                    back += 1
                continue
            # if the idx exists in the logs of trimAl, the base should be kept.
            newcig += cigar[idx]
            # D means we don't have a base to add.
            if cigar[idx].upper() != 'D':
                newseq += read.query[idx - back]
                if read.qqual:
                    newqual += read.qqual[idx - back]
            else:
                back += 1
        # if a sequence has nothing to add, why add it?!?
        if newseq:
            read.pos = max((read.pos - shift[read.pos]), 0)
            read.seq = newseq
            read.cigarstring = compressCigar(newcig)
            if not newqual:
                read.qual = "I" * len(newseq)
            else:
                read.qual = newqual
            newcoverage[read.qname.split("_")[-1]] += 1
            outfile.write(read)
    outfile.close()
    samout = "%s.trim.sam" % (fileidx)
    bamout = "%s.trim.bam" % (fileidx)
    samdat = open(samout).read()
    removeFiles([bamfile, bamidxfile, samout])
    # DLS 20160113 -- The consumer should handle the None case correctly, by skipping the INSERT/UPDATE action.
    # I am speculating an issue occurs in the samToBam when the samfile is empty.
    if hasSeqs == False:
        return None
    bamdat, bamidxdat = samToBam(samdat, "%s.trim" % (fileidx), buffers=True)
    return fileidx, samdat, bamdat, bamidxdat, newcoverage
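# The trimming code above relies on expandCigar()/compressCigar() helpers that are
# not shown in this listing. A minimal sketch of what they appear to do (per-base
# expansion of a CIGAR string and run-length re-encoding); the real helpers may differ:
import re

def expandCigar(cigarstring):
    # '3M1D2M' -> 'MMMDMM': one operation character per aligned position
    return "".join(op * int(n)
                   for n, op in re.findall(r"(\d+)([MIDNSHP=X])", cigarstring))

def compressCigar(expanded):
    # 'MMMDMM' -> '3M1D2M': run-length encode consecutive identical operations
    return "".join("%d%s" % (len(m.group(0)), m.group(1))
                   for m in re.finditer(r"(.)\1*", expanded))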