def sequenceFilter(seqDict, minSize, minAbundance):
    """Drop sequences that fall below the sample-size or abundance cutoffs.

    Every key of seqDict is a sequence name from which the "sampleSize" and
    "abundance" attributes are read with getAttriValueFromSeqName and
    converted to int.  An entry is removed when its sample size is below
    minSize or its abundance is below minAbundance.

    seqDict is modified in place and the same object is returned.
    """
    # Iterate over a snapshot of the names: deleting from a dict while
    # looping over its live key view raises RuntimeError on Python 3.
    for name in list(seqDict):
        size = int(getAttriValueFromSeqName(name, "sampleSize"))
        abundance = int(getAttriValueFromSeqName(name, "abundance"))
        if size < minSize or abundance < minAbundance:
            del seqDict[name]
    return seqDict
def sequenceFilter(seqDict, minSize, minAbundance):
    """Drop sequences that fall below the sample-size or abundance cutoffs.

    Every key of seqDict is a sequence name from which the "sampleSize" and
    "abundance" attributes are read with getAttriValueFromSeqName and
    converted to int.  An entry is removed when its sample size is below
    minSize or its abundance is below minAbundance.

    seqDict is modified in place and the same object is returned.
    """
    # Iterate over a snapshot of the names: deleting from a dict while
    # looping over its live key view raises RuntimeError on Python 3.
    for name in list(seqDict):
        size = int(getAttriValueFromSeqName(name, "sampleSize"))
        abundance = int(getAttriValueFromSeqName(name, "abundance"))
        if size < minSize or abundance < minAbundance:
            del seqDict[name]
    return seqDict
def listbySampleSize(fastafile):
    """Group the header names of a FASTA file by their "sampleSize" value.

    Returns a dict that maps each value returned by
    getAttriValueFromSeqName(headerLine, "sampleSize") to the list of header
    names (leading ">" and trailing whitespace removed), in file order.
    Lines that do not start with ">" are ignored.
    """
    reDict = {}
    # The context manager closes the file even if a header fails to parse.
    with open(fastafile, "r") as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            # The helper is given the raw header line, ">" included.
            size = getAttriValueFromSeqName(line, "sampleSize")
            reDict.setdefault(size, []).append(line.lstrip(">").rstrip())
    return reDict
def listbySampleSize(fastafile):
    """Group the header names of a FASTA file by their "sampleSize" value.

    Returns a dict that maps each value returned by
    getAttriValueFromSeqName(headerLine, "sampleSize") to the list of header
    names (leading ">" and trailing whitespace removed), in file order.
    Lines that do not start with ">" are ignored.
    """
    reDict = {}
    # The context manager closes the file even if a header fails to parse.
    with open(fastafile, "r") as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            # The helper is given the raw header line, ">" included.
            size = getAttriValueFromSeqName(line, "sampleSize")
            reDict.setdefault(size, []).append(line.lstrip(">").rstrip())
    return reDict
def assignedSingleton2Annotated(seedDict, distanceFile, threshold, singletonfile):
    """Attach singleton sequences to their nearest seed sequence.

    distanceFile holds whitespace-separated records whose first three fields
    are: seed name, candidate name, distance.  For every candidate the
    numerically smallest distance is located; when it is <= threshold the
    candidate is appended to seedDict[seed] for the seed at that distance.
    If several seeds share that distance, the one with the largest
    "sampleSize" attribute (read with getAttriValueFromSeqName) is chosen,
    and the seed listed first wins ties.

    singletonfile is a FASTA file; every header name that was not assigned
    to a seed is collected in file order.

    Returns (seedDict, singletonList); seedDict is modified in place.
    Raises KeyError when a chosen seed is not a key of seedDict.
    """
    def seedSampleSize(seqName):
        return int(getAttriValueFromSeqName(seqName, "sampleSize"))

    # candidate name -> {distance as float -> [seed names at that distance]}
    seedsByCandidate = {}
    with open(distanceFile, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Tolerate blank lines such as a trailing empty line.
                continue
            seedName, candidateName = fields[0], fields[1]
            # Distances are keyed as numbers: comparing the raw strings is
            # lexicographic and would rank "0.02" below "1e-05".
            distance = float(fields[2])
            byDistance = seedsByCandidate.setdefault(candidateName, {})
            byDistance.setdefault(distance, []).append(seedName)

    maxDistance = float(threshold)
    assignments = []  # (seed name, candidate name) pairs, applied at the end
    assigned = set()
    for candidateName, byDistance in seedsByCandidate.items():
        minDistance = min(byDistance)
        if minDistance > maxDistance:
            continue
        seedList = byDistance[minDistance]
        if len(seedList) > 1:
            parent = max(seedList, key=seedSampleSize)
        else:
            parent = seedList[0]
        assignments.append((parent, candidateName))
        assigned.add(candidateName)

    singletonList = []
    with open(singletonfile, 'r') as handle:
        for line in handle:
            if line.startswith('>'):
                name = line[1:].strip()
                if name not in assigned:
                    singletonList.append(name)

    # seedDict is only touched once both input files were read successfully.
    for parent, candidateName in assignments:
        seedDict[parent].append(candidateName)
    return seedDict, singletonList
def assignedSingleton2Annotated(seedDict, distanceFile, threshold, singletonfile):
    """Attach singleton sequences to their nearest seed sequence.

    distanceFile holds whitespace-separated records whose first three fields
    are: seed name, candidate name, distance.  For every candidate the
    numerically smallest distance is located; when it is <= threshold the
    candidate is appended to seedDict[seed] for the seed at that distance.
    If several seeds share that distance, the one with the largest
    "sampleSize" attribute (read with getAttriValueFromSeqName) is chosen,
    and the seed listed first wins ties.

    singletonfile is a FASTA file; every header name that was not assigned
    to a seed is collected in file order.

    Returns (seedDict, singletonList); seedDict is modified in place.
    Raises KeyError when a chosen seed is not a key of seedDict.
    """
    def seedSampleSize(seqName):
        return int(getAttriValueFromSeqName(seqName, "sampleSize"))

    # candidate name -> {distance as float -> [seed names at that distance]}
    seedsByCandidate = {}
    with open(distanceFile, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Tolerate blank lines such as a trailing empty line.
                continue
            seedName, candidateName = fields[0], fields[1]
            # Distances are keyed as numbers: comparing the raw strings is
            # lexicographic and would rank "0.02" below "1e-05".
            distance = float(fields[2])
            byDistance = seedsByCandidate.setdefault(candidateName, {})
            byDistance.setdefault(distance, []).append(seedName)

    maxDistance = float(threshold)
    assignments = []  # (seed name, candidate name) pairs, applied at the end
    assigned = set()
    for candidateName, byDistance in seedsByCandidate.items():
        minDistance = min(byDistance)
        if minDistance > maxDistance:
            continue
        seedList = byDistance[minDistance]
        if len(seedList) > 1:
            parent = max(seedList, key=seedSampleSize)
        else:
            parent = seedList[0]
        assignments.append((parent, candidateName))
        assigned.add(candidateName)

    singletonList = []
    with open(singletonfile, 'r') as handle:
        for line in handle:
            if line.startswith('>'):
                name = line[1:].strip()
                if name not in assigned:
                    singletonList.append(name)

    # seedDict is only touched once both input files were read successfully.
    for parent, candidateName in assignments:
        seedDict[parent].append(candidateName)
    return seedDict, singletonList
def selectRefBySampeSize(minsampleSize, tempDir, copysequence, reference):
    """Split a FASTA file into reference and query sets for chimera detection.

    Three files are written next to copysequence, sharing its path without
    the extension:
      <base>.ref        - sequences whose "sampleSize" attribute is
                          >= minsampleSize, followed by every sequence of the
                          reference file (used as chimera-detection reference)
      <base>.requry     - the remaining sequences (used as the query set)
      <base>.nonchimera - the same high-sample-size sequences as in .ref

    tempDir is kept for interface compatibility and is not used.
    Returns the three paths as (totalRef, requry, nonchimera).
    """
    base = os.path.splitext(copysequence)[0]
    totalRef = base + ".ref"
    requry = base + ".requry"
    nonchimera = base + ".nonchimera"
    # The single-argument call form prints the same text on Python 2 and 3.
    print(totalRef)

    # Context managers guarantee the outputs are flushed and closed before
    # their paths are handed back to the caller.
    with open(totalRef, 'w') as fref, \
            open(requry, 'w') as fqury, \
            open(nonchimera, 'w') as fnonch:
        requrySeqDict = fasta2dict(copysequence)
        for name in requrySeqDict:
            record = ">%s\n%s\n" % (name, requrySeqDict[name])
            if int(getAttriValueFromSeqName(name, "sampleSize")) >= minsampleSize:
                fref.write(record)
                fnonch.write(record)
            else:
                fqury.write(record)
        refdict = fasta2dict(reference)
        for name in refdict:
            fref.write(">%s\n%s\n" % (name, refdict[name]))
    return totalRef, requry, nonchimera
def selectRefBySampeSize(minsampleSize, tempDir, copysequence, reference):
    """Split a FASTA file into reference and query sets for chimera detection.

    Three files are written next to copysequence, sharing its path without
    the extension:
      <base>.ref        - sequences whose "sampleSize" attribute is
                          >= minsampleSize, followed by every sequence of the
                          reference file (used as chimera-detection reference)
      <base>.requry     - the remaining sequences (used as the query set)
      <base>.nonchimera - the same high-sample-size sequences as in .ref

    tempDir is kept for interface compatibility and is not used.
    Returns the three paths as (totalRef, requry, nonchimera).
    """
    base = os.path.splitext(copysequence)[0]
    totalRef = base + ".ref"
    requry = base + ".requry"
    nonchimera = base + ".nonchimera"
    # The single-argument call form prints the same text on Python 2 and 3.
    print(totalRef)

    # Context managers guarantee the outputs are flushed and closed before
    # their paths are handed back to the caller.
    with open(totalRef, 'w') as fref, \
            open(requry, 'w') as fqury, \
            open(nonchimera, 'w') as fnonch:
        requrySeqDict = fasta2dict(copysequence)
        for name in requrySeqDict:
            record = ">%s\n%s\n" % (name, requrySeqDict[name])
            if int(getAttriValueFromSeqName(name, "sampleSize")) >= minsampleSize:
                fref.write(record)
                fnonch.write(record)
            else:
                fqury.write(record)
        refdict = fasta2dict(reference)
        for name in refdict:
            fref.write(">%s\n%s\n" % (name, refdict[name]))
    return totalRef, requry, nonchimera
ftable=open(outtable,'w') fmulti=open(outmultiple,'w') fsingle=open(outsingleton,'w') flist=open(unique_list,'w') lenDict,tagList=readFasta(fastaList) ftable.write("sequenceName\tsampleSize\ttotalAbundance\taverageAbundance\t%s\n"%'\t'.join(tagList)) n=0 for lens in lenDict: for seq in lenDict[lens]: if len(lenDict[lens])==0: continue n+=1 nameList=[] for seqname in lenDict[lens][seq]: sample=getAttriValueFromSeqName(seqname,'SampleTag') nameList.append(sample) distriList=[] for tag in tagList: count=nameList.count(tag) distriList.append(count) samplesize=len(tagList)-distriList.count(0) abundance=sum(distriList) average=float(sum(distriList))/len(distriList) flist.write(">unique%d;sampleSize=%d;abundance=%d;\t%s\n"%(n,samplesize,abundance,','.join(lenDict[lens][seq]))) fout.write(">unique%d;sampleSize=%d;abundance=%d;\n%s\n"%(n,samplesize,abundance,seq)) if character=="sampleSize": char=samplesize elif character=="abundance": char=abundance if char>1: