def selectRefBySampeSize(minsampleSize, tempDir, copysequence, reference):
    """Split ``copysequence`` into reference, re-query and non-chimera FASTA files.

    Each sequence of ``copysequence`` whose ``sampleSize`` attribute (read
    from the sequence name by ``getAttriValueFromSeqName``) is at least
    ``minsampleSize`` is written to both the ``.ref`` and the ``.nonchimera``
    file; every other sequence is written to the ``.requry`` file.  All
    sequences of ``reference`` are then appended to the ``.ref`` file.  The
    three output paths are built from ``copysequence`` with its extension
    replaced.

    ``tempDir`` is accepted for interface compatibility; it is not used.

    Returns:
        The tuple ``(totalRef, requry, nonchimera)`` of output file paths.
    """
    stem = os.path.splitext(copysequence)[0]
    totalRef = stem + ".ref"
    requry = stem + ".requry"
    nonchimera = stem + ".nonchimera"
    print(totalRef)

    fref = open(totalRef, 'w')      # used as reference for chimera detection
    fqury = open(requry, 'w')       # used as re-query for chimera detection
    fnonch = open(nonchimera, 'w')  # sequences output directly as non-chimera
    try:
        requrySeqDict = fasta2dict(copysequence)
        for name in requrySeqDict:
            record = ">%s\n%s\n" % (name, requrySeqDict[name])
            if int(getAttriValueFromSeqName(name, "sampleSize")) >= minsampleSize:
                fref.write(record)
                fnonch.write(record)
            else:
                fqury.write(record)

        refdict = fasta2dict(reference)
        for name in refdict:
            fref.write(">%s\n%s\n" % (name, refdict[name]))
    finally:
        # Close explicitly so everything is flushed to disk before the paths
        # are handed back to the caller.
        fnonch.close()
        fqury.close()
        fref.close()
    return totalRef, requry, nonchimera
def outputRetainedCandidate(sourceFasta, candidateList, candidateSeq):
    """Write the sequences named in ``candidateList`` to ``candidateSeq``.

    Sequences are looked up in the dictionary returned by
    ``fasta2dict(sourceFasta)``.  A name whose lookup yields nothing (or an
    empty value) is reported with a warning on standard output and skipped.

    Returns:
        Always ``True``.
    """
    out = open(candidateSeq, "w")
    records = fasta2dict(sourceFasta)
    for label in candidateList:
        residues = records.get(label, False)
        if not residues:
            # Nothing to write for this name: warn and move on.
            print("warnning, %s is not found in %s." % (label, sourceFasta))
            continue
        out.write(">" + label + "\n" + residues + "\n")
    out.close()
    return True
def selectRefBySampeSize(minsampleSize, tempDir, copysequence, reference):
    """Split ``copysequence`` into reference, re-query and non-chimera FASTA files.

    Each sequence of ``copysequence`` whose ``sampleSize`` attribute (read
    from the sequence name by ``getAttriValueFromSeqName``) is at least
    ``minsampleSize`` is written to both the ``.ref`` and the ``.nonchimera``
    file; every other sequence is written to the ``.requry`` file.  All
    sequences of ``reference`` are then appended to the ``.ref`` file.  The
    three output paths are built from ``copysequence`` with its extension
    replaced.

    ``tempDir`` is accepted for interface compatibility; it is not used.

    Returns:
        The tuple ``(totalRef, requry, nonchimera)`` of output file paths.
    """
    stem = os.path.splitext(copysequence)[0]
    totalRef = stem + ".ref"
    requry = stem + ".requry"
    nonchimera = stem + ".nonchimera"
    print(totalRef)

    fref = open(totalRef, 'w')      # used as reference for chimera detection
    fqury = open(requry, 'w')       # used as re-query for chimera detection
    fnonch = open(nonchimera, 'w')  # sequences output directly as non-chimera
    try:
        requrySeqDict = fasta2dict(copysequence)
        for name in requrySeqDict:
            record = ">%s\n%s\n" % (name, requrySeqDict[name])
            if int(getAttriValueFromSeqName(name, "sampleSize")) >= minsampleSize:
                fref.write(record)
                fnonch.write(record)
            else:
                fqury.write(record)

        refdict = fasta2dict(reference)
        for name in refdict:
            fref.write(">%s\n%s\n" % (name, refdict[name]))
    finally:
        # Close explicitly so everything is flushed to disk before the paths
        # are handed back to the caller.
        fnonch.close()
        fqury.close()
        fref.close()
    return totalRef, requry, nonchimera
def outputRetainedCandidate(sourceFasta, candidateList, candidateSeq):
    """Copy the requested candidate sequences into a new FASTA file.

    ``sourceFasta`` is loaded with ``fasta2dict``; every name in
    ``candidateList`` that maps to a non-empty sequence is written to
    ``candidateSeq``.  Names without a usable sequence only trigger a warning
    on standard output.

    Returns:
        Always ``True``.
    """
    handle = open(candidateSeq, "w")
    lookup = fasta2dict(sourceFasta)
    for wanted in candidateList:
        found = lookup.get(wanted, False)
        if not found:
            # Absent or empty entry: report it and continue with the rest.
            print("warnning, %s is not found in %s." % (wanted, sourceFasta))
            continue
        handle.write(">" + wanted + "\n" + found + "\n")
    handle.close()
    return True
def getCandidateSequence(candidateSeqNameList, sourceFasta, candidateFasta):
    """Write the named candidate sequences from ``sourceFasta`` to ``candidateFasta``.

    ``sourceFasta`` is loaded with ``fasta2dict``.  Each name in
    ``candidateSeqNameList`` is written as a two-line FASTA record.

    Returns:
        ``True`` once every requested sequence has been written.

    Raises:
        SystemExit: with status 1 when a name has no (or an empty) sequence
            in ``sourceFasta``; the offending name is printed first.
    """
    with open(candidateFasta, "w") as fcand:
        sourceDict = fasta2dict(sourceFasta)
        for sequenceName in candidateSeqNameList:
            candSequence = sourceDict.get(sequenceName, False)
            if not candSequence:
                # The original message had a bare "%" and no argument, so the
                # missing name was never shown.
                print("Error, %s not found." % (sequenceName,))
                sys.exit(1)
            fcand.write(">%s\n" % (sequenceName,))
            fcand.write("%s\n" % (candSequence,))
    return True
def getSeedSequence(seedSeqNameList, sourceFasta, seedFasta):
    """Write the named seed sequences from ``sourceFasta`` to ``seedFasta``.

    ``sourceFasta`` is loaded with ``fasta2dict``.  Each name in
    ``seedSeqNameList`` is written as a two-line FASTA record.

    Returns:
        ``True`` once every requested sequence has been written.

    Raises:
        SystemExit: with status 1 when a name has no (or an empty) sequence
            in ``sourceFasta``; the offending name is printed first.
    """
    with open(seedFasta, "w") as fseed:
        sourceDict = fasta2dict(sourceFasta)
        for sequenceName in seedSeqNameList:
            seedSequence = sourceDict.get(sequenceName, False)
            if not seedSequence:
                # The original message had a bare "%" and no argument, so the
                # missing name was never shown.
                print("Error, %s not found." % (sequenceName,))
                sys.exit(1)
            fseed.write(">%s\n" % (sequenceName,))
            fseed.write("%s\n" % (seedSequence,))
    return True
def distanceCalculate(seed, candidate, filerDistance, proc, tempDir, script_loc):
    """Run the seed-vs-candidate distance tool in parallel and merge its output.

    The sequences of ``seed`` (loaded with ``fasta2dict``) are split into
    chunks of ``len(seed) // proc + 1`` records, each written to
    ``<tempDir>/inputseed_<k>.fasta``.  For every chunk, ``multi`` is
    dispatched on a pool of ``proc`` worker processes with the executable
    ``<script_loc>/lib/distanceCalculateWithKmer``, the chunk, ``candidate``,
    the per-chunk output ``<tempDir>/outdistance_<k>.distance`` and
    ``filerDistance``.  The per-chunk outputs are then concatenated, in chunk
    order, into ``<tempDir>/seed_candidate.distance``.

    Returns:
        The path of the merged distance file.

    Raises:
        Any exception raised inside a worker is re-raised here instead of
        being silently dropped.
    """
    output = tempDir + "/seed_candidate.distance"
    exepath = script_loc + "/lib/distanceCalculateWithKmer"
    fastaDict = fasta2dict(seed)

    # Reversed so that slicing from the front consumes names in the same
    # order as repeatedly popping from the end of the key list.
    pending = list(fastaDict)
    pending.reverse()
    oneLen = len(pending) // proc + 1

    subSeedPathList = []
    for start in range(0, len(pending), oneLen):
        subfile = tempDir + "/inputseed_" + str(len(subSeedPathList) + 1) + ".fasta"
        subSeedPathList.append(subfile)
        with open(subfile, "w") as fout:
            for name in pending[start:start + oneLen]:
                fout.write(">%s\n%s\n" % (name, fastaDict[name]))

    outputfileList = []
    asyncResults = []
    pool = multiprocessing.Pool(processes=proc)
    for k, subfile in enumerate(subSeedPathList, 1):
        outputfile = tempDir + "/outdistance_" + str(k) + ".distance"
        outputfileList.append(outputfile)
        asyncResults.append(pool.apply_async(
            multi, (exepath, subfile, candidate, outputfile, filerDistance)))
    pool.close()
    pool.join()
    # apply_async stores worker exceptions in the result object; fetch each
    # result so a failed chunk is reported rather than merged as if complete.
    for result in asyncResults:
        result.get()

    with open(output, 'w') as fout:
        for suboutput in outputfileList:
            with open(suboutput, "r") as fin:
                for line in fin:
                    fout.write(line)
    return output
def distanceCalculate(seed, candidate, filerDistance, proc, tempDir, script_loc):
    """Run the seed-vs-candidate distance tool in parallel and merge its output.

    The sequences of ``seed`` (loaded with ``fasta2dict``) are split into
    chunks of ``len(seed) // proc + 1`` records, each written to
    ``<tempDir>/inputseed_<k>.fasta``.  For every chunk, ``multi`` is
    dispatched on a pool of ``proc`` worker processes with the executable
    ``<script_loc>/lib/distanceCalculateWithKmer``, the chunk, ``candidate``,
    the per-chunk output ``<tempDir>/outdistance_<k>.distance`` and
    ``filerDistance``.  The per-chunk outputs are then concatenated, in chunk
    order, into ``<tempDir>/seed_candidate.distance``.

    Returns:
        The path of the merged distance file.

    Raises:
        Any exception raised inside a worker is re-raised here instead of
        being silently dropped.
    """
    output = tempDir + "/seed_candidate.distance"
    exepath = script_loc + "/lib/distanceCalculateWithKmer"
    fastaDict = fasta2dict(seed)

    # Reversed so that slicing from the front consumes names in the same
    # order as repeatedly popping from the end of the key list.
    pending = list(fastaDict)
    pending.reverse()
    oneLen = len(pending) // proc + 1

    subSeedPathList = []
    for start in range(0, len(pending), oneLen):
        subfile = tempDir + "/inputseed_" + str(len(subSeedPathList) + 1) + ".fasta"
        subSeedPathList.append(subfile)
        with open(subfile, "w") as fout:
            for name in pending[start:start + oneLen]:
                fout.write(">%s\n%s\n" % (name, fastaDict[name]))

    outputfileList = []
    asyncResults = []
    pool = multiprocessing.Pool(processes=proc)
    for k, subfile in enumerate(subSeedPathList, 1):
        outputfile = tempDir + "/outdistance_" + str(k) + ".distance"
        outputfileList.append(outputfile)
        asyncResults.append(pool.apply_async(
            multi, (exepath, subfile, candidate, outputfile, filerDistance)))
    pool.close()
    pool.join()
    # apply_async stores worker exceptions in the result object; fetch each
    # result so a failed chunk is reported rather than merged as if complete.
    for result in asyncResults:
        result.get()

    with open(output, 'w') as fout:
        for suboutput in outputfileList:
            with open(suboutput, "r") as fin:
                for line in fin:
                    fout.write(line)
    return output
sys.exit(1) if i in ('-p', '--processors'): try: processors = int(a) except: print "***Error, the processors (--processors/-p) must be integer.***\n" print usage sys.exit(1) otulist = "heuristic_search_OTU.list" outseq = "heuristic_search_OTU.fa" script_loc = os.path.split(os.path.realpath(sys.argv[0]))[0] exepath = script_loc + "/lib/distanceCalculateWithKmerOneFile" tempDir = script_loc + "/temp" + str(random.randint(10000, 99999)) os.mkdir(tempDir) fastaDict = fasta2dict(seqfile) singleDict = fasta2dict(singleton) #Executable permissions muthurPath = script_loc + "/lib/Mothur.cen_64/mothur/mothur" distance1 = script_loc + "/lib/distanceCalculateWithKmer" distance2 = script_loc + "/lib/distanceCalculateWithKmerOneFile" try: os.chmod(muthurPath, stat.S_IRWXU) except: commands = "chmod a+x %s" % (muthurPath) print "Please give executable permission to %s by this terminal commands:\n%s" % ( muthurPath, commands) sys.exit(1) try:
# Output files of the taxonomy-guided step, all placed under oldWorkDir.
outputOTUinfo=oldWorkDir+"/taxonomy_guided_OTU.list"
outputOTUseq=oldWorkDir+"/taxonomy_guided_OTU.fa"
outputTAX=oldWorkDir+"/taxonomy_guided_OTU.genus"
# Calculate the distances between seed and candidate sequences, then cluster.
annotated_unannotated_distances=distanceCalculate(seedFasta,sourceFasta,KmerFilter,processors,tempDir,script_loc)
seedDict,candidateList=clustSeedCandidate(seedFasta,annotated_unannotated_distances,threshold,sourceFasta)
# Assign singleton sequences to annotated sequences.
# NOTE(review): if distanceCalculate always writes to one fixed file name
# inside tempDir, this second call overwrites the file of the first call;
# that is harmless only because the first result is consumed above -- confirm.
singleton_annotated_distances=distanceCalculate(seedFasta,singletonfile,KmerFilter,processors,tempDir,script_loc)
seedDict,singletonList=assignedSingleton2Annotated(seedDict,singleton_annotated_distances,threshold,singletonfile)
# Calculate distances between seeds, grouped by genus.
SameGenusDict=createSeedTaxonomyDict(taxonomy,seedDict)
seedSeqDict=fasta2dict(seedFasta)
distanceEXE=script_loc+"/lib/distanceCalculateWithKmerOneFile"
# The working directory is switched to tempDir from here on.
os.chdir(tempDir)
k=0
pool=multiprocessing.Pool(processes=processors)
calculateGenus=[]
bigGunus=[]
for genus in SameGenusDict:
    # Genera with fewer than 100 entries are dispatched to the pool, each with
    # its own numbered FASTA path; larger genera are only recorded in bigGunus.
    if len(SameGenusDict[genus])<100:
        k+=1
        fastaFile=tempDir+"/OneGenus_%d.fa"%(k)
        # NOTE(review): the AsyncResult is discarded, so an exception raised
        # inside multiOTUcall would go unnoticed here.
        pool.apply_async(multiOTUcall,(fastaFile,SameGenusDict[genus],distanceEXE,muthurPath))
    else:
        bigGunus.append(genus)
# Wait for every dispatched genus job to finish.
pool.close()
pool.join()
# Build the argument list for mothur's chimera.uchime command and run it.
useParaList.append("fasta=%s" % (copysequence))
useParaList.append("reference=%s" % (reference))
useParaList.append("processors=%s" % (processors))
# The whole command is passed to mothur as one quoted "#command(...)" argument.
muthurParaStr = "\"#chimera.uchime(" + ",".join(useParaList) + ")\""
muthurPath = script_loc + "/lib/Mothur.cen_64/mothur/mothur"
# NOTE(review): the command line is built by string concatenation and its
# output is discarded, so a failed mothur run is not detected at this point.
commands.getoutput(muthurPath + " " + muthurParaStr)
# Put the chimera sequence names into a list (one name per line of the
# .uchime.accnos file that sits next to copysequence).
accons = os.path.splitext(copysequence)[0] + ".uchime.accnos"
acconsList = []
for line in open(accons, "r"):
    acconsList.append(line.strip())
faDict = fasta2dict(copysequence)
fout1 = open(nonchimera, "w")
fout2 = open(chimera, "w")
# Split the input: names listed in the accnos file go to the chimera file,
# everything else to the non-chimera file.
# NOTE(review): membership is tested against a list, which is a linear scan
# per sequence; a set would give the same result faster.
for line in faDict:
    if line in acconsList:
        fout2.write(">%s\n%s\n" % (line, faDict[line]))
    else:
        fout1.write(">%s\n%s\n" % (line, faDict[line]))
# When minsamplesize is set, the sequences of subnonchimera are appended to
# the non-chimera output as well.
if minsamplesize:
    subnonDict = fasta2dict(subnonchimera)
    for name in subnonDict:
        fout1.write(">%s\n%s\n" % (name, subnonDict[name]))
# NOTE(review): fout1/fout2 are not closed within this excerpt -- confirm they
# are closed afterwards.
# NOTE(review): if this string is later executed as a shell command, the
# "temp*" glob matches every temp directory under script_loc, including those
# of other runs -- confirm that is intended.
remove_intermediate_file = r"rm -rf " + script_loc + r"/temp* "
#calculate distance of seed and candidate annotated_unannotated_distances = distanceCalculate( seedFasta, sourceFasta, KmerFilter, processors, tempDir, script_loc) seedDict, candidateList = clustSeedCandidate( seedFasta, annotated_unannotated_distances, threshold, sourceFasta) #assign singleton sequences to annotated sequences singleton_annotated_distances = distanceCalculate(seedFasta, singletonfile, KmerFilter, processors, tempDir, script_loc) seedDict, singletonList = assignedSingleton2Annotated( seedDict, singleton_annotated_distances, threshold, singletonfile) #calculate distance between seed SameGenusDict = createSeedTaxonomyDict(taxonomy, seedDict) seedSeqDict = fasta2dict(seedFasta) distanceEXE = script_loc + "/lib/distanceCalculateWithKmerOneFile" os.chdir(tempDir) k = 0 pool = multiprocessing.Pool(processes=processors) calculateGenus = [] bigGunus = [] for genus in SameGenusDict: if len(SameGenusDict[genus]) < 100: k += 1 fastaFile = tempDir + "/OneGenus_%d.fa" % (k) pool.apply_async( multiOTUcall, (fastaFile, SameGenusDict[genus], distanceEXE, muthurPath)) else: bigGunus.append(genus)
sys.exit(1) if i in ('-p','--processors'): try: processors=int(a) except: print "***Error, the processors (--processors/-p) must be integer.***\n" print usage sys.exit(1) otulist="heuristic_search_OTU.list" outseq="heuristic_search_OTU.fa" script_loc=os.path.split(os.path.realpath(sys.argv[0]))[0] exepath=script_loc+"/lib/distanceCalculateWithKmerOneFile" tempDir=script_loc+"/temp"+str(random.randint(10000,99999)) os.mkdir(tempDir) fastaDict=fasta2dict(seqfile) singleDict=fasta2dict(singleton) #Executable permissions muthurPath=script_loc+"/lib/Mothur.cen_64/mothur/mothur" distance1=script_loc+"/lib/distanceCalculateWithKmer" distance2=script_loc+"/lib/distanceCalculateWithKmerOneFile" try: os.chmod(muthurPath,stat.S_IRWXU) except: commands="chmod a+x %s"%(muthurPath) print "Please give executable permission to %s by this terminal commands:\n%s"%(muthurPath,commands) sys.exit(1) try: os.chmod(distance1,stat.S_IRWXU)
# Build the argument list for mothur's chimera.uchime command and run it.
useParaList.append("fasta=%s"%(copysequence))
useParaList.append("reference=%s"%(reference))
useParaList.append("processors=%s"%(processors))
# The whole command is passed to mothur as one quoted "#command(...)" argument.
muthurParaStr="\"#chimera.uchime("+",".join(useParaList)+")\""
muthurPath=script_loc+"/lib/Mothur.cen_64/mothur/mothur"
# NOTE(review): the command line is built by string concatenation and its
# output is discarded, so a failed mothur run is not detected at this point.
commands.getoutput(muthurPath+" "+muthurParaStr)
# Put the chimera sequence names into a list (one name per line of the
# .uchime.accnos file that sits next to copysequence).
accons=os.path.splitext(copysequence)[0]+".uchime.accnos"
acconsList=[]
for line in open(accons,"r"):
    acconsList.append(line.strip())
faDict=fasta2dict(copysequence)
fout1=open(nonchimera,"w")
fout2=open(chimera,"w")
# Split the input: names listed in the accnos file go to the chimera file,
# everything else to the non-chimera file.
# NOTE(review): membership is tested against a list, which is a linear scan
# per sequence; a set would give the same result faster.
for line in faDict:
    if line in acconsList:
        fout2.write(">%s\n%s\n"%(line,faDict[line]))
    else:
        fout1.write(">%s\n%s\n"%(line,faDict[line]))
# When minsamplesize is set, the sequences of subnonchimera are appended to
# the non-chimera output as well.
if minsamplesize:
    subnonDict=fasta2dict(subnonchimera)
    for name in subnonDict:
        fout1.write(">%s\n%s\n"%(name,subnonDict[name]))
# NOTE(review): fout1/fout2 are not closed within this excerpt -- confirm they
# are closed afterwards.
# NOTE(review): if this string is later executed as a shell command, the
# "temp*" glob matches every temp directory under script_loc, including those
# of other runs -- confirm that is intended.
remove_intermediate_file=r"rm -rf "+script_loc+r"/temp* "