コード例 #1
0
ファイル: chimera_detection.py プロジェクト: chengroup/bioOTU
def selectRefBySampeSize(minsampleSize,tempDir,copysequence,reference):
    """Split copysequence by sample size to prepare chimera-detection inputs.

    Three files are written next to copysequence (same path with the
    extension replaced):
      * .ref        - sequences whose "sampleSize" attribute is at least
                      minsampleSize, followed by every sequence of reference
                      (used as reference for chimera detection);
      * .nonchimera - the same high-sample-size sequences;
      * .requry     - all remaining sequences (used as query).

    tempDir is kept for interface compatibility; it is not used here.

    Returns the tuple (ref_path, requry_path, nonchimera_path).
    """
    prefix=os.path.splitext(copysequence)[0]
    totalRef=prefix+".ref"
    requry=prefix+".requry"
    nonchimera=prefix+".nonchimera"
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(totalRef)
    # The context managers guarantee that all three files are flushed and
    # closed before their paths are handed back to the caller.
    with open(totalRef,'w') as fref:
        with open(requry,'w') as fqury:
            with open(nonchimera,'w') as fnonch:
                requrySeqDict=fasta2dict(copysequence)
                for name in requrySeqDict:
                    record=">%s\n%s\n"%(name,requrySeqDict[name])
                    if int(getAttriValueFromSeqName(name,"sampleSize"))>=minsampleSize:
                        fref.write(record)
                        fnonch.write(record)
                    else:
                        fqury.write(record)
                # The external reference sequences are appended after the
                # high-sample-size sequences.
                refdict=fasta2dict(reference)
                for name in refdict:
                    fref.write(">%s\n%s\n"%(name,refdict[name]))
    return totalRef,requry,nonchimera
コード例 #2
0
def outputRetainedCandidate(sourceFasta,candidateList,candidateSeq):
    """Write the sequences named in candidateList to the file candidateSeq.

    Sequences are looked up in the dictionary returned by
    fasta2dict(sourceFasta). A name that is missing (or maps to an empty
    value) is reported with a warning on stdout and skipped.

    Always returns True.
    """
    # "with" closes the output file even if reading the source or a write fails.
    with open(candidateSeq,"w") as fcand:
        sourceDict=fasta2dict(sourceFasta)
        for seqName in candidateList:
            sequence=sourceDict.get(seqName,False)
            if sequence:
                fcand.write(">"+seqName+"\n"+sequence+"\n")
            else:
                # Single-argument print(...) is valid on Python 2 and 3.
                print("warnning, %s is not found in %s."%(seqName,sourceFasta))
    return True
コード例 #3
0
def selectRefBySampeSize(minsampleSize, tempDir, copysequence, reference):
    """Prepare reference/query FASTA files for chimera detection.

    Every sequence of ``copysequence`` whose "sampleSize" attribute is
    greater than or equal to ``minsampleSize`` is written to both the
    ``.ref`` and the ``.nonchimera`` file; all other sequences go to the
    ``.requry`` file. Afterwards every sequence of ``reference`` is appended
    to the ``.ref`` file. The three files share the path of ``copysequence``
    with its extension replaced.

    ``tempDir`` is not used; it is kept so existing callers keep working.

    Returns:
        Tuple ``(ref_path, requry_path, nonchimera_path)``.
    """
    base = os.path.splitext(copysequence)[0]
    totalRef = base + ".ref"
    requry = base + ".requry"
    nonchimera = base + ".nonchimera"
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(totalRef)

    # Nested context managers make sure each file is closed (and therefore
    # flushed) before the paths are returned.
    with open(totalRef, 'w') as fref:
        with open(requry, 'w') as fqury:
            with open(nonchimera, 'w') as fnonch:
                requrySeqDict = fasta2dict(copysequence)
                for name in requrySeqDict:
                    entry = ">%s\n%s\n" % (name, requrySeqDict[name])
                    size = int(getAttriValueFromSeqName(name, "sampleSize"))
                    if size >= minsampleSize:
                        fref.write(entry)
                        fnonch.write(entry)
                    else:
                        fqury.write(entry)

                refdict = fasta2dict(reference)
                for name in refdict:
                    fref.write(">%s\n%s\n" % (name, refdict[name]))
    return totalRef, requry, nonchimera
コード例 #4
0
def outputRetainedCandidate(sourceFasta, candidateList, candidateSeq):
    """Copy the retained candidate sequences into ``candidateSeq``.

    Each name in ``candidateList`` is looked up in the dictionary produced
    by ``fasta2dict(sourceFasta)``. Found sequences are written in FASTA
    form; a name that is absent (or has an empty sequence) only triggers a
    warning on stdout.

    Returns:
        ``True`` in all cases.
    """
    # The context manager closes the file on every exit path, including
    # exceptions raised while parsing the source or writing.
    with open(candidateSeq, "w") as fcand:
        sourceDict = fasta2dict(sourceFasta)
        for seqName in candidateList:
            sequence = sourceDict.get(seqName, False)
            if not sequence:
                # Single-argument print(...) is valid on Python 2 and 3.
                print("warnning, %s is not found in %s." % (seqName,
                                                            sourceFasta))
                continue
            fcand.write(">" + seqName + "\n" + sequence + "\n")
    return True
コード例 #5
0
def getCandidateSequence(candidateSeqNameList,sourceFasta,candidateFasta):
    """Write the named candidate sequences from sourceFasta to candidateFasta.

    Each name in candidateSeqNameList is looked up in the dictionary
    returned by fasta2dict(sourceFasta). If a name is missing (or maps to an
    empty value) an error naming it is printed and the process exits with
    status 1.

    Returns True when every sequence was written.
    """
    # "with" closes the output file even on the sys.exit() path.
    with open(candidateFasta,"w") as fcand:
        sourceDict=fasta2dict(sourceFasta)
        for sequenceName in candidateSeqNameList:
            candSequence=sourceDict.get(sequenceName,False)
            if candSequence:
                fcand.write(">%s\n"%(sequenceName))
                fcand.write("%s\n"%(candSequence))
            else:
                # The original message was never formatted, so the missing
                # name was not reported; include it now.
                print("Error, %s not found."%(sequenceName))
                sys.exit(1)
    return True
コード例 #6
0
def getSeedSequence(seedSeqNameList,sourceFasta,seedFasta):
    """Write the named seed sequences from sourceFasta to seedFasta.

    Each name in seedSeqNameList is looked up in the dictionary returned by
    fasta2dict(sourceFasta). If a name is missing (or maps to an empty
    value) an error naming it is printed and the process exits with
    status 1.

    Returns True when every sequence was written.
    """
    # "with" closes the output file even on the sys.exit() path.
    with open(seedFasta,"w") as fseed:
        sourceDict=fasta2dict(sourceFasta)
        for sequenceName in seedSeqNameList:
            seedSequence=sourceDict.get(sequenceName,False)
            if seedSequence:
                fseed.write(">%s\n"%(sequenceName))
                fseed.write("%s\n"%(seedSequence))
            else:
                # The original message was never formatted, so the missing
                # name was not reported; include it now.
                print("Error, %s not found."%(sequenceName))
                sys.exit(1)
    return True
コード例 #7
0
def distanceCalculate(seed, candidate, filerDistance, proc, tempDir,
                      script_loc):
    """Run the seed-vs-candidate distance step in parallel and merge results.

    The sequences of ``seed`` are split into numbered chunk files
    (``inputseed_<k>.fasta``) under ``tempDir``. For every chunk, ``multi``
    (defined elsewhere) is scheduled on a ``multiprocessing`` pool with the
    path of the ``distanceCalculateWithKmer`` executable, the chunk,
    ``candidate``, a per-chunk output path (``outdistance_<k>.distance``)
    and ``filerDistance``. After all workers have finished, the per-chunk
    outputs are concatenated into ``seed_candidate.distance``.

    Args:
        seed: path of the seed FASTA file.
        candidate: path passed through unchanged to ``multi``.
        filerDistance: value passed through unchanged to ``multi``.
        proc: number of worker processes; also determines the chunk size.
        tempDir: directory that receives all intermediate and result files.
        script_loc: directory that contains ``lib/distanceCalculateWithKmer``.

    Returns:
        Path of the merged distance file.
    """
    output = tempDir + "/seed_candidate.distance"
    exepath = script_loc + "/lib/distanceCalculateWithKmer"

    fastaDict = fasta2dict(seed)
    # The original consumed the key list with pop(), i.e. from the end;
    # reversing an explicit list keeps that order and works on Python 3,
    # where dict.keys() has no pop().
    names = list(fastaDict)
    names.reverse()
    # Floor division keeps the chunk size an int on Python 3 too. The +1
    # makes the size non-zero and yields at most ``proc`` chunks.
    oneLen = len(names) // proc + 1

    subSeedPathList = []
    for k, start in enumerate(range(0, len(names), oneLen), 1):
        subfile = tempDir + "/inputseed_" + str(k) + ".fasta"
        subSeedPathList.append(subfile)
        with open(subfile, "w") as fout:
            for name in names[start:start + oneLen]:
                fout.write(">%s\n%s\n" % (name, fastaDict[name]))

    outputfileList = []
    pool = multiprocessing.Pool(processes=proc)
    for k, subfile in enumerate(subSeedPathList, 1):
        outputfile = tempDir + "/outdistance_" + str(k) + ".distance"
        outputfileList.append(outputfile)
        # NOTE(review): the AsyncResult is discarded, so an exception raised
        # inside a worker is not re-raised here.
        pool.apply_async(
            multi, (exepath, subfile, candidate, outputfile, filerDistance))
    pool.close()
    pool.join()

    # Concatenate the per-chunk results; each input is closed after copying.
    with open(output, 'w') as fout:
        for suboutput in outputfileList:
            with open(suboutput, "r") as fin:
                for line in fin:
                    fout.write(line)
    return output
コード例 #8
0
def distanceCalculate(seed,candidate,filerDistance,proc,tempDir,script_loc):
    """Compute distances between seed and candidate sequences in parallel.

    The seed sequences are split into numbered chunk files
    (inputseed_<k>.fasta) under tempDir. For each chunk, multi (defined
    elsewhere) is scheduled on a multiprocessing pool with the path of the
    distanceCalculateWithKmer executable, the chunk, candidate, a per-chunk
    output path (outdistance_<k>.distance) and filerDistance. Once all
    workers are done, the per-chunk outputs are concatenated into
    seed_candidate.distance under tempDir.

    proc is both the number of worker processes and the divisor that
    determines the chunk size. candidate and filerDistance are passed
    through to multi unchanged.

    Returns the path of the merged distance file.
    """
    output=tempDir+"/seed_candidate.distance"
    exepath=script_loc+"/lib/distanceCalculateWithKmer"

    fastaDict=fasta2dict(seed)
    # list(...) is required on Python 3, where dict.keys() has no pop().
    keyList=list(fastaDict.keys())
    # "//" keeps the chunk size an int on Python 3; "+1" keeps it non-zero.
    oneLen=len(fastaDict)//proc+1
    k=0
    subSeedPathList=[]
    while keyList:
        k+=1
        subfile=tempDir+"/inputseed_"+str(k)+".fasta"
        subSeedPathList.append(subfile)
        with open(subfile,"w") as fout:
            for n in range(oneLen):
                if not keyList:
                    break
                name=keyList.pop()
                fout.write(">%s\n%s\n"%(name,fastaDict[name]))

    outputfileList=[]
    pool=multiprocessing.Pool(processes=proc)
    for k,subfile in enumerate(subSeedPathList,1):
        outputfile=tempDir+"/outdistance_"+str(k)+".distance"
        outputfileList.append(outputfile)
        # NOTE(review): the AsyncResult is discarded, so an exception raised
        # inside a worker is not re-raised here.
        pool.apply_async(multi,(exepath,subfile,candidate,outputfile,filerDistance))
    pool.close()
    pool.join()

    # Merge the per-chunk outputs; each input is closed once it is copied.
    with open(output,'w') as fout:
        for suboutput in outputfileList:
            with open(suboutput,"r") as fin:
                for line in fin:
                    fout.write(line)
    return output
コード例 #9
0
                sys.exit(1)
        # Handle the -p/--processors option: its value must parse as an int.
        # NOTE(review): `i` and `a` come from an enclosing loop that is not
        # visible here - presumably (option, value) pairs; confirm.
        if i in ('-p', '--processors'):
            try:
                processors = int(a)
            # NOTE(review): a bare except also catches KeyboardInterrupt and
            # SystemExit; int() signals bad input with ValueError/TypeError.
            except:
                print "***Error, the processors (--processors/-p) must be integer.***\n"
                print usage
                sys.exit(1)

    # Names of the result files (relative paths).
    otulist = "heuristic_search_OTU.list"
    outseq = "heuristic_search_OTU.fa"
    # Directory that contains the running script, with symlinks resolved.
    script_loc = os.path.split(os.path.realpath(sys.argv[0]))[0]
    exepath = script_loc + "/lib/distanceCalculateWithKmerOneFile"
    # Scratch directory next to the script, named "temp" plus a random
    # five-digit number; os.mkdir raises if that directory already exists.
    tempDir = script_loc + "/temp" + str(random.randint(10000, 99999))
    os.mkdir(tempDir)
    fastaDict = fasta2dict(seqfile)
    singleDict = fasta2dict(singleton)

    # Paths of the bundled executables whose permissions are adjusted below.
    muthurPath = script_loc + "/lib/Mothur.cen_64/mothur/mothur"
    distance1 = script_loc + "/lib/distanceCalculateWithKmer"
    distance2 = script_loc + "/lib/distanceCalculateWithKmerOneFile"

    # stat.S_IRWXU grants read/write/execute to the owner only; os.chmod
    # replaces the whole mode, so group/other permission bits are cleared.
    try:
        os.chmod(muthurPath, stat.S_IRWXU)
    # NOTE(review): bare except; os.chmod reports failures as OSError.
    except:
        # `commands` is a local string here. NOTE(review): if this module
        # also imports the `commands` module, this name shadows it.
        commands = "chmod a+x %s" % (muthurPath)
        print "Please give executable permission to %s by this terminal commands:\n%s" % (
            muthurPath, commands)
        sys.exit(1)
    try:
コード例 #10
0
    # Paths of the result files, located under oldWorkDir.
    outputOTUinfo=oldWorkDir+"/taxonomy_guided_OTU.list"
    outputOTUseq=oldWorkDir+"/taxonomy_guided_OTU.fa"
    outputTAX=oldWorkDir+"/taxonomy_guided_OTU.genus"

    # Calculate the distances between seed and candidate sequences, then
    # cluster candidates with the seeds.
    annotated_unannotated_distances=distanceCalculate(seedFasta,sourceFasta,KmerFilter,processors,tempDir,script_loc)
    seedDict,candidateList=clustSeedCandidate(seedFasta,annotated_unannotated_distances,threshold,sourceFasta)

    # Assign singleton sequences to annotated sequences; seedDict is rebound
    # to the dictionary returned by assignedSingleton2Annotated.
    singleton_annotated_distances=distanceCalculate(seedFasta,singletonfile,KmerFilter,processors,tempDir,script_loc)
    seedDict,singletonList=assignedSingleton2Annotated(seedDict,singleton_annotated_distances,threshold,singletonfile)


    # Calculate the distances between seeds, grouped by genus.
    SameGenusDict=createSeedTaxonomyDict(taxonomy,seedDict)
    seedSeqDict=fasta2dict(seedFasta)
    distanceEXE=script_loc+"/lib/distanceCalculateWithKmerOneFile"
    # NOTE: os.chdir changes the working directory of the whole process.
    os.chdir(tempDir)
    k=0
    pool=multiprocessing.Pool(processes=processors)
    # NOTE(review): calculateGenus is not used in the lines visible here.
    calculateGenus=[]
    bigGunus=[]
    for genus in SameGenusDict:
        # Genera with fewer than 100 entries are dispatched to the pool, each
        # with its own numbered FASTA path; the others are only recorded in
        # bigGunus.
        if len(SameGenusDict[genus])<100:
            k+=1
            fastaFile=tempDir+"/OneGenus_%d.fa"%(k)
            # NOTE(review): the AsyncResult is discarded, so an exception
            # raised inside multiOTUcall is not re-raised here.
            pool.apply_async(multiOTUcall,(fastaFile,SameGenusDict[genus],distanceEXE,muthurPath))
        else:
            bigGunus.append(genus)
    # Wait until every scheduled task has finished.
    pool.close()
    pool.join()
コード例 #11
0
        # (The enclosing branch header is not visible in this excerpt.)
        # Register the input FASTA and the reference as mothur parameters.
        useParaList.append("fasta=%s" % (copysequence))
        useParaList.append("reference=%s" % (reference))

    useParaList.append("processors=%s" % (processors))

    # Build the quoted mothur batch argument "#chimera.uchime(<params>)" and
    # run the bundled mothur binary through the shell; the captured output
    # is discarded.
    # NOTE(review): the command line is assembled by string concatenation,
    # so paths containing spaces or shell metacharacters are not escaped.
    muthurParaStr = "\"#chimera.uchime(" + ",".join(useParaList) + ")\""
    muthurPath = script_loc + "/lib/Mothur.cen_64/mothur/mothur"
    commands.getoutput(muthurPath + " " + muthurParaStr)

    # Collect the names listed in the ".uchime.accnos" file next to
    # copysequence (presumably written by the mothur run above - confirm);
    # they are treated as chimeric below.
    accons = os.path.splitext(copysequence)[0] + ".uchime.accnos"
    acconsList = []
    for line in open(accons, "r"):
        acconsList.append(line.strip())

    faDict = fasta2dict(copysequence)

    # fout1 receives non-chimeric sequences, fout2 the chimeric ones.
    # NOTE(review): neither file is closed within the lines visible here.
    fout1 = open(nonchimera, "w")
    fout2 = open(chimera, "w")

    # `line` is a sequence name here (a key of faDict).
    # NOTE(review): acconsList is a list, so each membership test is linear.
    for line in faDict:
        if line in acconsList:
            fout2.write(">%s\n%s\n" % (line, faDict[line]))
        else:
            fout1.write(">%s\n%s\n" % (line, faDict[line]))
    # When minsamplesize is truthy, the sequences of subnonchimera are
    # appended to the non-chimera output as well.
    if minsamplesize:
        subnonDict = fasta2dict(subnonchimera)
        for name in subnonDict:
            fout1.write(">%s\n%s\n" % (name, subnonDict[name]))

    # Shell command string matching every "temp*" entry under script_loc.
    # NOTE(review): the glob is not limited to this run's temp directory;
    # where the command is executed is not visible in this excerpt.
    remove_intermediate_file = r"rm -rf " + script_loc + r"/temp* "
コード例 #12
0
    # Calculate the distances between seed and candidate sequences, then
    # cluster candidates with the seeds.
    annotated_unannotated_distances = distanceCalculate(
        seedFasta, sourceFasta, KmerFilter, processors, tempDir, script_loc)
    seedDict, candidateList = clustSeedCandidate(
        seedFasta, annotated_unannotated_distances, threshold, sourceFasta)

    # Assign singleton sequences to annotated sequences; seedDict is rebound
    # to the dictionary returned by assignedSingleton2Annotated.
    singleton_annotated_distances = distanceCalculate(seedFasta, singletonfile,
                                                      KmerFilter, processors,
                                                      tempDir, script_loc)
    seedDict, singletonList = assignedSingleton2Annotated(
        seedDict, singleton_annotated_distances, threshold, singletonfile)

    # Calculate the distances between seeds, grouped by genus.
    SameGenusDict = createSeedTaxonomyDict(taxonomy, seedDict)
    seedSeqDict = fasta2dict(seedFasta)
    distanceEXE = script_loc + "/lib/distanceCalculateWithKmerOneFile"
    # NOTE: os.chdir changes the working directory of the whole process.
    os.chdir(tempDir)
    k = 0
    pool = multiprocessing.Pool(processes=processors)
    # NOTE(review): calculateGenus is not used in the lines visible here.
    calculateGenus = []
    bigGunus = []
    for genus in SameGenusDict:
        # Genera with fewer than 100 entries are dispatched to the pool, each
        # with its own numbered FASTA path; the others are only recorded in
        # bigGunus.
        if len(SameGenusDict[genus]) < 100:
            k += 1
            fastaFile = tempDir + "/OneGenus_%d.fa" % (k)
            # NOTE(review): the AsyncResult is discarded, so an exception
            # raised inside multiOTUcall is not re-raised here.
            pool.apply_async(
                multiOTUcall,
                (fastaFile, SameGenusDict[genus], distanceEXE, muthurPath))
        else:
            bigGunus.append(genus)
コード例 #13
0
                sys.exit(1)
        # Handle the -p/--processors option: its value must parse as an int.
        # NOTE(review): `i` and `a` come from an enclosing loop that is not
        # visible here - presumably (option, value) pairs; confirm.
        if i in ('-p','--processors'):
            try:
                processors=int(a)
            # NOTE(review): a bare except also catches KeyboardInterrupt and
            # SystemExit; int() signals bad input with ValueError/TypeError.
            except:
                print "***Error, the processors (--processors/-p) must be integer.***\n"
                print usage
                sys.exit(1)

    # Names of the result files (relative paths).
    otulist="heuristic_search_OTU.list"
    outseq="heuristic_search_OTU.fa"
    # Directory that contains the running script, with symlinks resolved.
    script_loc=os.path.split(os.path.realpath(sys.argv[0]))[0]
    exepath=script_loc+"/lib/distanceCalculateWithKmerOneFile"
    # Scratch directory next to the script, named "temp" plus a random
    # five-digit number; os.mkdir raises if that directory already exists.
    tempDir=script_loc+"/temp"+str(random.randint(10000,99999))
    os.mkdir(tempDir)
    fastaDict=fasta2dict(seqfile)
    singleDict=fasta2dict(singleton)

    # Paths of the bundled executables whose permissions are adjusted below.
    muthurPath=script_loc+"/lib/Mothur.cen_64/mothur/mothur"
    distance1=script_loc+"/lib/distanceCalculateWithKmer"
    distance2=script_loc+"/lib/distanceCalculateWithKmerOneFile"

    # stat.S_IRWXU grants read/write/execute to the owner only; os.chmod
    # replaces the whole mode, so group/other permission bits are cleared.
    try:
        os.chmod(muthurPath,stat.S_IRWXU)
    # NOTE(review): bare except; os.chmod reports failures as OSError.
    except:
        # `commands` is a local string here. NOTE(review): if this module
        # also imports the `commands` module, this name shadows it.
        commands="chmod a+x %s"%(muthurPath)
        print "Please give executable permission to %s by this terminal commands:\n%s"%(muthurPath,commands)
        sys.exit(1)
    try:
        os.chmod(distance1,stat.S_IRWXU)
コード例 #14
0
ファイル: chimera_detection.py プロジェクト: chengroup/bioOTU
        # (The enclosing branch header is not visible in this excerpt.)
        # Register the input FASTA and the reference as mothur parameters.
        useParaList.append("fasta=%s"%(copysequence))
        useParaList.append("reference=%s"%(reference))

    useParaList.append("processors=%s"%(processors))

    # Build the quoted mothur batch argument "#chimera.uchime(<params>)" and
    # run the bundled mothur binary through the shell; the captured output
    # is discarded.
    # NOTE(review): the command line is assembled by string concatenation,
    # so paths containing spaces or shell metacharacters are not escaped.
    muthurParaStr="\"#chimera.uchime("+",".join(useParaList)+")\""
    muthurPath=script_loc+"/lib/Mothur.cen_64/mothur/mothur"
    commands.getoutput(muthurPath+" "+muthurParaStr)

    # Collect the names listed in the ".uchime.accnos" file next to
    # copysequence (presumably written by the mothur run above - confirm);
    # they are treated as chimeric below.
    accons=os.path.splitext(copysequence)[0]+".uchime.accnos"
    acconsList=[]
    for line in open(accons,"r"):
        acconsList.append(line.strip())

    faDict=fasta2dict(copysequence)

    # fout1 receives non-chimeric sequences, fout2 the chimeric ones.
    # NOTE(review): neither file is closed within the lines visible here.
    fout1=open(nonchimera,"w")
    fout2=open(chimera,"w")

    # `line` is a sequence name here (a key of faDict).
    # NOTE(review): acconsList is a list, so each membership test is linear.
    for line in faDict:
        if line in acconsList:
            fout2.write(">%s\n%s\n"%(line,faDict[line]))
        else:
            fout1.write(">%s\n%s\n"%(line,faDict[line]))
    # When minsamplesize is truthy, the sequences of subnonchimera are
    # appended to the non-chimera output as well.
    if minsamplesize:
        subnonDict=fasta2dict(subnonchimera)
        for name in subnonDict:
            fout1.write(">%s\n%s\n"%(name,subnonDict[name]))

    # Shell command string matching every "temp*" entry under script_loc.
    # NOTE(review): the glob is not limited to this run's temp directory;
    # where the command is executed is not visible in this excerpt.
    remove_intermediate_file=r"rm -rf "+script_loc+r"/temp* "