Ejemplo n.º 1
0
def sequenceFilter(seqDict, minSize, minAbundance):
    """Drop entries of ``seqDict`` that fall below the given thresholds.

    For every key, the "sampleSize" and "abundance" attributes are read
    through ``getAttriValueFromSeqName`` and converted to ``int``. An entry
    is removed when either value is below its minimum.

    Args:
        seqDict: Dictionary keyed by sequence name. It is modified in place.
        minSize: Smallest "sampleSize" value an entry may have to be kept.
        minAbundance: Smallest "abundance" value an entry may have to be kept.

    Returns:
        The same ``seqDict`` object, after filtering.
    """
    # Iterate over a snapshot of the keys: deleting from a dict while
    # walking its live key view raises RuntimeError on Python 3.
    for name in list(seqDict):
        size = int(getAttriValueFromSeqName(name, "sampleSize"))
        abundance = int(getAttriValueFromSeqName(name, "abundance"))
        if size < minSize or abundance < minAbundance:
            del seqDict[name]
    return seqDict
Ejemplo n.º 2
0
def sequenceFilter(seqDict, minSize, minAbundance):
    """Remove under-represented sequences from ``seqDict`` in place.

    The "sampleSize" and "abundance" attributes of each key are obtained
    from ``getAttriValueFromSeqName`` and compared as integers against the
    supplied minimums; failing either check removes the entry.

    Args:
        seqDict: Dictionary keyed by sequence name (mutated by this call).
        minSize: Minimum accepted "sampleSize" value.
        minAbundance: Minimum accepted "abundance" value.

    Returns:
        ``seqDict`` itself, with the rejected entries deleted.
    """
    # list(...) freezes the key set first; on Python 3 ``dict.keys()`` is a
    # live view and deleting during iteration would raise RuntimeError.
    for name in list(seqDict):
        size = int(getAttriValueFromSeqName(name, "sampleSize"))
        abundance = int(getAttriValueFromSeqName(name, "abundance"))
        if size < minSize or abundance < minAbundance:
            del seqDict[name]
    return seqDict
Ejemplo n.º 3
0
def listbySampleSize(fastafile):
    """Group the header names of a FASTA file by their "sampleSize" value.

    Only lines starting with ">" are considered. For each such line the
    "sampleSize" attribute is read with ``getAttriValueFromSeqName`` and the
    header (leading ">" and trailing whitespace removed) is appended to the
    list stored under that value.

    Args:
        fastafile: Path of the FASTA file to read.

    Returns:
        Dictionary mapping each "sampleSize" value, exactly as returned by
        ``getAttriValueFromSeqName``, to the list of header names carrying
        it, in file order.
    """
    reDict = {}
    # The context manager closes the file even if attribute parsing fails.
    with open(fastafile, "r") as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            size = getAttriValueFromSeqName(line, "sampleSize")
            reDict.setdefault(size, []).append(line.lstrip(">").rstrip())
    return reDict
Ejemplo n.º 4
0
def listbySampleSize(fastafile):
    """Index FASTA header names by the "sampleSize" attribute they carry.

    Every line beginning with ">" is treated as a header. Its "sampleSize"
    attribute is looked up via ``getAttriValueFromSeqName`` and the header
    text, stripped of the leading ">" and trailing whitespace, is collected
    under that value.

    Args:
        fastafile: Path of the FASTA file to read.

    Returns:
        Dictionary whose keys are the values returned by
        ``getAttriValueFromSeqName`` and whose values are lists of header
        names in the order they appear in the file.
    """
    reDict = {}
    # Use a context manager so the handle is released deterministically.
    with open(fastafile, "r") as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            size = getAttriValueFromSeqName(line, "sampleSize")
            reDict.setdefault(size, []).append(line.lstrip(">").rstrip())
    return reDict
Ejemplo n.º 5
0
def assignedSingleton2Annotated(seedDict, distanceFile, threshold,
                                singletonfile):
    """Attach singleton sequences to their nearest seed sequence.

    Each non-blank line of ``distanceFile`` is split on whitespace and read
    as ``<seed name> <candidate name> <distance>``. For every candidate the
    numerically smallest distance is selected; when it does not exceed
    ``threshold`` the candidate is appended to that seed's list in
    ``seedDict``. If several seeds share the smallest distance, the one with
    the largest "sampleSize" (read with ``getAttriValueFromSeqName``) wins,
    the first one listed being kept on ties.

    Args:
        seedDict: Dictionary mapping seed names to lists; assigned candidate
            names are appended to these lists in place.
        distanceFile: Path of the whitespace-separated distance table.
        threshold: Largest distance (anything ``float()`` accepts) at which
            a candidate is still assigned to a seed.
        singletonfile: Path of a FASTA file; names on its ">" header lines
            that were not assigned are reported back.

    Returns:
        Tuple ``(seedDict, singletonList)`` where ``singletonList`` holds the
        header names from ``singletonfile`` that were not assigned, in file
        order.

    Raises:
        KeyError: If a selected seed name is not a key of ``seedDict``.
        ValueError: If a distance field cannot be converted to ``float``.
    """
    limit = float(threshold)

    # candidate name -> {distance as float -> [seed names at that distance]}
    # Distances are stored as floats so the minimum is numeric; comparing
    # the raw strings would rank "10.0" below "2.0".
    candidateSeeds = {}
    with open(distanceFile, 'r') as handle:
        for line in handle:
            fields = line.strip().split()
            if not fields:
                continue  # tolerate blank lines in the table
            seedName, candidate, distance = fields[0], fields[1], fields[2]
            byDistance = candidateSeeds.setdefault(candidate, {})
            byDistance.setdefault(float(distance), []).append(seedName)

    # candidate name -> chosen seed, only for candidates within the limit.
    assignment = {}
    for candidate in candidateSeeds:
        byDistance = candidateSeeds[candidate]
        minDistance = min(byDistance)
        if minDistance > limit:
            continue
        seedList = byDistance[minDistance]
        if len(seedList) > 1:
            # max() keeps the first of equally large seeds and always
            # yields a real seed name, even when no size is positive.
            chosen = max(
                seedList,
                key=lambda seqName: int(
                    getAttriValueFromSeqName(seqName, "sampleSize")))
        else:
            chosen = seedList[0]
        assignment[candidate] = chosen

    singletonList = []
    with open(singletonfile, 'r') as handle:
        for line in handle:
            if line.startswith('>'):
                name = line[1:].strip()
                if name not in assignment:
                    singletonList.append(name)

    for candidate in assignment:
        seedDict[assignment[candidate]].append(candidate)
    return seedDict, singletonList
Ejemplo n.º 6
0
def assignedSingleton2Annotated(seedDict, distanceFile, threshold, singletonfile):
    """Assign singleton sequences to the closest seed within a distance limit.

    ``distanceFile`` is read line by line; each non-blank line is split on
    whitespace into ``<seed name> <candidate name> <distance>``. A candidate
    whose numerically smallest distance is at most ``threshold`` is appended
    to the list of the corresponding seed in ``seedDict``. When more than one
    seed lies at that smallest distance, the seed with the largest
    "sampleSize" (obtained from ``getAttriValueFromSeqName``) is used, the
    first one listed being kept on ties.

    Args:
        seedDict: Dictionary of seed name -> list; updated in place with the
            names of the candidates assigned to each seed.
        distanceFile: Path of the whitespace-separated distance table.
        threshold: Maximum distance (any value ``float()`` accepts) for an
            assignment to take place.
        singletonfile: Path of a FASTA file whose ">" header names are
            checked against the assigned candidates.

    Returns:
        Tuple ``(seedDict, singletonList)``; ``singletonList`` contains the
        header names of ``singletonfile`` that stayed unassigned, in file
        order.

    Raises:
        KeyError: If a selected seed name is not a key of ``seedDict``.
        ValueError: If a distance field cannot be converted to ``float``.
    """
    limit = float(threshold)

    # candidate name -> {distance as float -> [seed names at that distance]}
    # Float keys make min() numeric; on the raw strings "10.0" would sort
    # before "2.0" and the wrong seed would be picked.
    candidateSeeds = {}
    with open(distanceFile, 'r') as handle:
        for line in handle:
            fields = line.strip().split()
            if not fields:
                continue  # tolerate blank lines in the table
            seedName, candidate, distance = fields[0], fields[1], fields[2]
            byDistance = candidateSeeds.setdefault(candidate, {})
            byDistance.setdefault(float(distance), []).append(seedName)

    # candidate name -> chosen seed, only for candidates within the limit.
    assignment = {}
    for candidate in candidateSeeds:
        byDistance = candidateSeeds[candidate]
        minDistance = min(byDistance)
        if minDistance > limit:
            continue
        seedList = byDistance[minDistance]
        if len(seedList) > 1:
            # max() returns the first of equally large seeds and never
            # falls back to an empty name when no size is positive.
            chosen = max(
                seedList,
                key=lambda seqName: int(
                    getAttriValueFromSeqName(seqName, "sampleSize")))
        else:
            chosen = seedList[0]
        assignment[candidate] = chosen

    singletonList = []
    with open(singletonfile, 'r') as handle:
        for line in handle:
            if line.startswith('>'):
                name = line[1:].strip()
                if name not in assignment:
                    singletonList.append(name)

    for candidate in assignment:
        seedDict[assignment[candidate]].append(candidate)
    return seedDict, singletonList
Ejemplo n.º 7
0
def selectRefBySampeSize(minsampleSize, tempDir, copysequence, reference):
    """Split sequences into chimera-check reference, query and non-chimera files.

    Three files are written next to ``copysequence`` (same path with the
    extension replaced):

    * ``.ref``: sequences of ``copysequence`` whose "sampleSize" is at least
      ``minsampleSize``, followed by every sequence of ``reference``.
    * ``.requry``: the remaining sequences of ``copysequence``.
    * ``.nonchimera``: the same ``copysequence`` sequences that went into
      ``.ref``.

    Both inputs are loaded with ``fasta2dict`` and the "sampleSize" of each
    name is read with ``getAttriValueFromSeqName``.

    Args:
        minsampleSize: Minimum "sampleSize" for a sequence to be used as
            reference.
        tempDir: Unused; kept so existing callers keep working.
        copysequence: Path of the FASTA file to split.
        reference: Path of an additional FASTA file appended to ``.ref``.

    Returns:
        Tuple ``(totalRef, requry, nonchimera)`` with the three output paths.
    """
    prefix = os.path.splitext(copysequence)[0]
    totalRef = prefix + ".ref"
    requry = prefix + ".requry"
    nonchimera = prefix + ".nonchimera"
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(totalRef)
    # Closing the handles before returning guarantees the data is flushed
    # by the time the caller uses the returned paths.
    with open(totalRef, 'w') as fref, \
            open(requry, 'w') as fqury, \
            open(nonchimera, 'w') as fnonch:
        requrySeqDict = fasta2dict(copysequence)
        for name in requrySeqDict:
            record = ">%s\n%s\n" % (name, requrySeqDict[name])
            if int(getAttriValueFromSeqName(name, "sampleSize")) >= minsampleSize:
                fref.write(record)
                fnonch.write(record)
            else:
                fqury.write(record)
        refdict = fasta2dict(reference)
        for name in refdict:
            fref.write(">%s\n%s\n" % (name, refdict[name]))
    return totalRef, requry, nonchimera
Ejemplo n.º 8
0
def selectRefBySampeSize(minsampleSize, tempDir, copysequence, reference):
    """Write the reference, query and non-chimera FASTA files for chimera detection.

    Output paths are derived from ``copysequence`` by replacing its
    extension:

    * ``.ref`` receives every ``copysequence`` entry whose "sampleSize" is
      at least ``minsampleSize`` plus all entries of ``reference``.
    * ``.requry`` receives the ``copysequence`` entries below that size.
    * ``.nonchimera`` receives the same ``copysequence`` entries as ``.ref``.

    Inputs are parsed with ``fasta2dict``; "sampleSize" values come from
    ``getAttriValueFromSeqName``.

    Args:
        minsampleSize: Minimum "sampleSize" for an entry to serve as
            reference.
        tempDir: Unused; retained for backward compatibility with callers.
        copysequence: Path of the FASTA file to split.
        reference: Path of an extra FASTA file appended to ``.ref``.

    Returns:
        Tuple ``(totalRef, requry, nonchimera)`` of the written file paths.
    """
    prefix = os.path.splitext(copysequence)[0]
    totalRef = prefix + ".ref"
    requry = prefix + ".requry"
    nonchimera = prefix + ".nonchimera"
    # print(...) with one argument behaves identically on Python 2 and 3.
    print(totalRef)
    # The with-block flushes and closes all three outputs before the paths
    # are handed back to the caller.
    with open(totalRef, 'w') as fref, \
            open(requry, 'w') as fqury, \
            open(nonchimera, 'w') as fnonch:
        requrySeqDict = fasta2dict(copysequence)
        for name in requrySeqDict:
            record = ">%s\n%s\n" % (name, requrySeqDict[name])
            if int(getAttriValueFromSeqName(name, "sampleSize")) >= minsampleSize:
                fref.write(record)
                fnonch.write(record)
            else:
                fqury.write(record)
        refdict = fasta2dict(reference)
        for name in refdict:
            fref.write(">%s\n%s\n" % (name, refdict[name]))
    return totalRef, requry, nonchimera
Ejemplo n.º 9
0
 ftable=open(outtable,'w')
 fmulti=open(outmultiple,'w')
 fsingle=open(outsingleton,'w')
 flist=open(unique_list,'w')
 
 lenDict,tagList=readFasta(fastaList)
 ftable.write("sequenceName\tsampleSize\ttotalAbundance\taverageAbundance\t%s\n"%'\t'.join(tagList))
 n=0
 for lens in lenDict:
     for seq in lenDict[lens]:
         if len(lenDict[lens])==0:
             continue
         n+=1
         nameList=[]
         for seqname in lenDict[lens][seq]:
             sample=getAttriValueFromSeqName(seqname,'SampleTag')
             nameList.append(sample)
         distriList=[]
         for tag in tagList:
             count=nameList.count(tag)
             distriList.append(count)
         samplesize=len(tagList)-distriList.count(0)
         abundance=sum(distriList)
         average=float(sum(distriList))/len(distriList)
         flist.write(">unique%d;sampleSize=%d;abundance=%d;\t%s\n"%(n,samplesize,abundance,','.join(lenDict[lens][seq])))
         fout.write(">unique%d;sampleSize=%d;abundance=%d;\n%s\n"%(n,samplesize,abundance,seq))
         if character=="sampleSize":
             char=samplesize
         elif character=="abundance":
             char=abundance
         if char>1: