# Standard library imports used below. The project-internal helpers referenced
# throughout (fas/fasta, csv, cami, taxonomy_ncbi, accuracy, confusion_matrix,
# consistency, ncbitax2sqlite, Args, and functions such as getReadsTaxonIdList,
# getCommunityId, getShortContigId, getCoverage, createEvalMetaFile and the
# taxonomy wrapper classes) are assumed to be provided by the surrounding package.
import os
import sys
import argparse
import subprocess


def getLenStat(fileName, minLength=1000):
    """
        Gets basic statistics concerning the lengths of the sequences.

        @param fileName: fasta file
        @type fileName: str
    """
    buf = ""
    c = 0
    bp = 0
    minLen = sys.maxint
    maxLen = 0
    totalBp = 0
    totalCount = 0
    for k, l in fas.getSequenceToBpDict(fileName).iteritems():
        totalCount += 1
        totalBp += l
        if l >= minLength:
            c += 1
            bp += l
            # track the min and max independently (the original "elif" skipped
            # the max update whenever the min was updated, e.g. for the first entry)
            if l < minLen:
                minLen = l
            if l > maxLen:
                maxLen = l
    buf += 'Bigger than %sbp (sequences: %s, Mbp: %s)\n' % (
        minLength, c, round(float(bp) / 1000000.0, 3))
    if c > 0:  # avoid division by zero when no sequence passes the threshold
        buf += 'Bigger than %sbp (min: %s, max: %s, avg: %s bp)\n' % (
            minLength, minLen, maxLen, round(float(bp) / c))
    buf += 'Total (sequences: %s, Mbp: %s)\n' % (
        totalCount, round(float(totalBp) / 1000000.0, 3))
    return buf
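
# A minimal usage sketch for getLenStat(); 'contigs.fna' is a hypothetical
# fasta file name, and the result is a short human-readable summary string.
def _demoGetLenStat():
    print getLenStat('contigs.fna', minLength=500)
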
def getMaxLen(fastaFilePath):
    """
        Gets the length of the longest sequence in a fasta file.
    """
    maxLen = 0
    for val in fasta.getSequenceToBpDict(fastaFilePath).itervalues():
        if maxLen < int(val):
            maxLen = int(val)
    return maxLen
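
# A hedged equivalence note: with the same fasta helper, getMaxLen() can be
# written as a one-liner (returning 0 for an empty file, like the loop above).
def _demoGetMaxLen(fastaFilePath):
    return max(map(int, fasta.getSequenceToBpDict(fastaFilePath).itervalues()) or [0])
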
def toContigsLabelList(inFastaFileName, readsF, readsR, readOnContig, community, outMappingFileName):
    """
        Gets mapping from contigIds to lists of taxonIds of the individual reads of the contigs.

        @param inFastaFileName: fasta file containing the contigs
        @param readsF: fasta file containing the forward reads
        @param readsR: fasta file containing the reverse reads
        @param readOnContig: tab separated file mapping reads onto contigs
        @param community: community file
        @param outMappingFileName: output file (contigId tab comma-separated taxonIds)
    """
    # contigIds
    contigIdToBp = fas.getSequenceToBpDict(inFastaFileName)
    # map: contigId -> list of readIds
    contigIdToReadList = csv.getMapping(readOnContig, 1, 0, sep='\t', comment='r')
    # taxonIds as a list for reads
    readFTaxonIdList = getReadsTaxonIdList(readsF, community)
    print 's1'
    readRTaxonIdList = getReadsTaxonIdList(readsR, community)
    print 's2'
    if len(readFTaxonIdList) != len(readRTaxonIdList):
        print('toContigsLabels: different number of reads in the reads files, exit')
        return
    for i in range(1, len(readFTaxonIdList)):
        if readFTaxonIdList[i] != readRTaxonIdList[i]:
            print('toContigsLabels: at index %s different taxon ids %s and %s'
                  % (i, readFTaxonIdList[i], readRTaxonIdList[i]))
        if readFTaxonIdList[i] is None or readRTaxonIdList[i] is None:
            print('toContigsLabels: at index %s, one is None %s or %s'
                  % (i, readFTaxonIdList[i], readRTaxonIdList[i]))
    print 's3'
    #
    out = csv.OutFileBuffer(outMappingFileName)
    for contigId in contigIdToBp:
        try:
            readList = contigIdToReadList[contigId]
            taxonIdList = []
            for readId in readList:
                taxonIdList.append(readFTaxonIdList[int(readId)])
            out.writeText(str(contigId) + '\t' + ','.join(map(str, taxonIdList)) + '\n')
        except KeyError:
            print("No label for contigId: %s" % contigId)
    out.close()
    print 's4'
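
# A hedged usage sketch for toContigsLabelList(); every file name below is a
# hypothetical placeholder for the inputs described in the docstring.
def _demoToContigsLabelList():
    toContigsLabelList('contigs.fna', 'reads_f.fna', 'reads_r.fna',
                       'read_on_contig.tsv', 'community.txt',
                       'contig_to_taxon_ids.tsv')
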
def filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSeqIdSet):
    """
        For each fasta file (named '<taxonId>.1.fna') in directory srcDir, filters out
        the sequences whose ids appear in notAllowedSeqIdSet and stores the result
        under the same file name in dstDir.
    """
    for taxonId in taxonIdSet:
        srcFilePath = os.path.join(srcDir, str(str(taxonId) + '.1.fna'))
        dstFilePath = os.path.join(dstDir, str(str(taxonId) + '.1.fna'))
        seqIdDict = fasta.getSequenceToBpDict(srcFilePath)
        allowedNamesSet = set()
        for seqId in seqIdDict.iterkeys():
            if seqId not in notAllowedSeqIdSet:
                allowedNamesSet.add(seqId)
        fasta.filterOutSequences(srcFilePath, dstFilePath, allowedNamesSet)
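
# Usage sketch: keep everything except two blacklisted sequence ids; the taxon
# ids, directories, and sequence ids below are hypothetical.
def _demoFilterOutSequencesBatch():
    filterOutSequencesBatch(set([562, 1280]), 'src_fasta_dir', 'dst_fasta_dir',
                            set(['seq_1', 'seq_2']))
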
# Initializer of the accuracy.Accuracy class (variant reading the prediction
# files in the CAMI binning format via cami.readAssignments).
def __init__(self, seqIdToBp, seqIdToPred, seqIdToTruePred, taxonomy, correctLabelThreshold=None):
    """
        Initializes the accuracy object.

        @param seqIdToBp: dictionary or a fasta file
        @param seqIdToPred: dictionary or a prediction file
        @param seqIdToTruePred: dictionary or a true prediction file
        @param taxonomy: database file in the sqlite3 format, or a taxonomy object retrieved from a not closed Accuracy
    """
    if isinstance(seqIdToBp, dict):
        self._seqToBp = seqIdToBp
    else:
        assert os.path.isfile(seqIdToBp)
        self._seqToBp = fasta.getSequenceToBpDict(seqIdToBp)

    if isinstance(seqIdToPred, dict):
        self._seqToPred = seqIdToPred
    else:
        assert os.path.isfile(seqIdToPred)
        self._seqToPred = cami.readAssignments(seqIdToPred)

    if isinstance(seqIdToTruePred, dict):
        self._seqToTrue = seqIdToTruePred
    else:
        assert os.path.isfile(seqIdToTruePred)
        self._seqToTrue = cami.readAssignments(seqIdToTruePred)

    if isinstance(taxonomy, _TaxonomyWrapperA):
        self._taxonomy = taxonomy
    else:
        assert os.path.isfile(taxonomy)
        self._taxonomy = _TaxonomyWrapperA(taxonomy)

    # correct the predictions self._seqToPred
    if correctLabelThreshold is not None:
        self._seqToPred = self._correctPredictions(
            self._seqToBp, self._seqToPred, self._seqToTrue, self._taxonomy, correctLabelThreshold)
# Initializer of the accuracy.Accuracy class (variant reading the prediction
# files via the project's csv helper, csv.predToDict).
def __init__(self, seqIdToBp, seqIdToPred, seqIdToTruePred, taxonomy, correctLabelThreshold=None):
    """
        Initializes the accuracy object.

        @param seqIdToBp: dictionary or a fasta file
        @param seqIdToPred: dictionary or a prediction file
        @param seqIdToTruePred: dictionary or a true prediction file
        @param taxonomy: database file in the sqlite3 format, or a taxonomy object retrieved from a not closed Accuracy
    """
    if isinstance(seqIdToBp, dict):
        self._seqToBp = seqIdToBp
    else:
        assert os.path.isfile(seqIdToBp)
        self._seqToBp = fasta.getSequenceToBpDict(seqIdToBp)

    if isinstance(seqIdToPred, dict):
        self._seqToPred = seqIdToPred
    else:
        assert os.path.isfile(seqIdToPred)
        self._seqToPred = csv.predToDict(seqIdToPred)

    if isinstance(seqIdToTruePred, dict):
        self._seqToTrue = seqIdToTruePred
    else:
        assert os.path.isfile(seqIdToTruePred)
        self._seqToTrue = csv.predToDict(seqIdToTruePred)

    if isinstance(taxonomy, _TaxonomyWrapperA):
        self._taxonomy = taxonomy
    else:
        assert os.path.isfile(taxonomy)
        self._taxonomy = _TaxonomyWrapperA(taxonomy)

    # correct the predictions self._seqToPred
    if correctLabelThreshold is not None:
        self._seqToPred = self._correctPredictions(
            self._seqToBp, self._seqToPred, self._seqToTrue, self._taxonomy, correctLabelThreshold)
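
# Usage sketch for the Accuracy initializer (hypothetical file paths); the
# getAccuracyPrint() call mirrors how the class is used elsewhere in this file.
def _demoAccuracy():
    acc = accuracy.Accuracy('contigs.fna', 'assignments.csv', 'labels.csv',
                            'taxonomy_ncbi.db')
    print acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                               minFracClade=None, minFracPred=None, overview=True)
    acc.close()
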
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data are fragmented (via PPS) and contained
        in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir,
              ppsInstallDir, ppsScripts, os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    # label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    newSeqId = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    newSeqId = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + newSeqId + '\n' + seq + '\n')
                seqIdToTruePred[newSeqId] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
        # print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir,
                                       stdout=logOut, stderr=subprocess.STDOUT)
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                            % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for leafId in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, taxonomyS.getParentsNcbidSet(leafId))))
    taxonomyS.close()

    # get only sequences with the true taxonId defined at a leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among the PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]
        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                           minFracClade=None, minFracPred=None, overview=True))
        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)
        out.close()
        acc.close(closeTaxonomy=False)

    taxonomyA.close()
    taxonomyCM.close()
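
# Hedged invocation sketch for computeTrainingAccuracy(); every path below is
# a hypothetical placeholder for the directory layout described in the docstring.
def _demoComputeTrainingAccuracy():
    computeTrainingAccuracy(
        workingDir='pps_working', taWorkingDir='pps_working/train_accuracy',
        sampleSpecificDir='pps_working/sample_specific',
        ppsTrainDataDir='pps_working/sampled_fasta', outputDir='pps_working/output',
        ppsInstallDir='/opt/pps', ppsScripts='/opt/pps/scripts',
        ppsConfigFilePath='pps.cfg', predictLogFileName='pps_working/predict.log',
        modelTaxonIdFilePath='model_taxon_ids.txt', databaseFile='taxonomy_ncbi.db')
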
# Initializer of the confusion_matrix.ConfusionMatrix class (variant reading
# the prediction files via the project's csv helper, csv.predToDict).
def __init__(self, seqNameToBp, seqNameToPred, seqNameToRefPred, taxonomy, ranksList=None):
    """
        Initializes the main class that computes the confusion matrices.

        @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
        @type seqNameToBp: dict; or a fasta file
        @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
        @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
        @param seqNameToRefPred: contains mapping, sequence name to taxonId; or a tab separated reference file
        @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
        @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks)
        @type ranksList: list of str
        @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
    """
    # Check input options and read in the data (if appropriate)
    self._initFailed = False  # replace this with exceptions!
    if isinstance(seqNameToBp, dict):
        self._seqNameToBp = seqNameToBp
    elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
        self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
    else:
        print("Can't get sequence info from:", seqNameToBp)
        self._initFailed = True
        return
    if isinstance(seqNameToPred, dict):
        self._seqNameToPred = seqNameToPred
    elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
        self._seqNameToPred = csv.predToDict(seqNameToPred)
    else:
        print("Can't get prediction info from:", seqNameToPred)
        self._initFailed = True
        return
    if isinstance(seqNameToRefPred, dict):
        self._seqNameToRefPred = seqNameToRefPred
    elif isinstance(seqNameToRefPred, str) and os.path.isfile(seqNameToRefPred):
        self._seqNameToRefPred = csv.predToDict(seqNameToRefPred)
    else:
        print("Can't get reference prediction info from:", seqNameToRefPred)
        self._initFailed = True
        return
    if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
        self._taxonomy = _TaxonomyWrapCM(taxonomy)
    elif isinstance(taxonomy, _TaxonomyWrapCM):
        self._taxonomy = taxonomy
    else:
        print("Can't use taxonomy: ", taxonomy)
        self._initFailed = True  # fail here, as in the branches above
        return
    if ranksList is None:
        ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
    else:
        allowedRanksSet = set(taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
        for rank in ranksList:
            if rank not in allowedRanksSet:
                print('Rank: "' + str(rank) + '" is not allowed!')
                self._initFailed = True
                return
    rankIdsList = []  # rankIds that will be considered
    for rank in ranksList:
        rankIdsList.append(self._taxonomy.getRankId(rank))
    self._allowedRankIdsSet = set(rankIdsList)

    # get predictions at different taxonomic ranks
    # rankId -> (seqId -> taxonIdAtRank)
    self._rankIdToPredMap = {}
    self._rankIdToRefMap = {}
    for rankId in rankIdsList:
        self._rankIdToPredMap[rankId] = {}
        self._rankIdToRefMap[rankId] = {}
    # get predictions at given ranks
    for seqId, taxonId in self._seqNameToPred.iteritems():
        while (taxonId is not None) and (taxonId != 1):
            rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
            if rankId in self._allowedRankIdsSet:
                self._rankIdToPredMap[rankId][seqId] = taxonId
            taxonId = self._taxonomy.getParent(taxonId)
    # get reference predictions at given ranks
    for seqId, taxonId in self._seqNameToRefPred.iteritems():
        while (taxonId is not None) and (taxonId != 1):
            rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
            if rankId in self._allowedRankIdsSet:
                self._rankIdToRefMap[rankId][seqId] = taxonId
            taxonId = self._taxonomy.getParent(taxonId)
# Initializer of the confusion_matrix.ConfusionMatrix class (variant reading
# the prediction files in the CAMI binning format via cami.readAssignments).
def __init__(self, seqNameToBp, seqNameToPred, seqNameToRefPred, taxonomy, ranksList=None):
    """
        Initializes the main class that computes the confusion matrices.

        @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
        @type seqNameToBp: dict; or a fasta file
        @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
        @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
        @param seqNameToRefPred: contains mapping, sequence name to taxonId; or a tab separated reference file
        @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
        @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks)
        @type ranksList: list of str
        @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
    """
    # Check input options and read in the data (if appropriate)
    self._initFailed = False  # replace this with exceptions!
    if isinstance(seqNameToBp, dict):
        self._seqNameToBp = seqNameToBp
    elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
        self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
    else:
        print("Can't get sequence info from:", seqNameToBp)
        self._initFailed = True
        return
    if isinstance(seqNameToPred, dict):
        self._seqNameToPred = seqNameToPred
    elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
        self._seqNameToPred = cami.readAssignments(seqNameToPred)
    else:
        print("Can't get prediction info from:", seqNameToPred)
        self._initFailed = True
        return
    if isinstance(seqNameToRefPred, dict):
        self._seqNameToRefPred = seqNameToRefPred
    elif isinstance(seqNameToRefPred, str) and os.path.isfile(seqNameToRefPred):
        self._seqNameToRefPred = cami.readAssignments(seqNameToRefPred)
    else:
        print("Can't get reference prediction info from:", seqNameToRefPred)
        self._initFailed = True
        return
    if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
        self._taxonomy = _TaxonomyWrapCM(taxonomy)
    elif isinstance(taxonomy, _TaxonomyWrapCM):
        self._taxonomy = taxonomy
    else:
        print("Can't use taxonomy: ", taxonomy)
        self._initFailed = True  # fail here, as in the branches above
        return
    if ranksList is None:
        ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
    else:
        allowedRanksSet = set(taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
        for rank in ranksList:
            if rank not in allowedRanksSet:
                print('Rank: "' + str(rank) + '" is not allowed!')
                self._initFailed = True
                return
    rankIdsList = []  # rankIds that will be considered
    for rank in ranksList:
        rankIdsList.append(self._taxonomy.getRankId(rank))
    self._allowedRankIdsSet = set(rankIdsList)

    # get predictions at different taxonomic ranks
    # rankId -> (seqId -> taxonIdAtRank)
    self._rankIdToPredMap = {}
    self._rankIdToRefMap = {}
    for rankId in rankIdsList:
        self._rankIdToPredMap[rankId] = {}
        self._rankIdToRefMap[rankId] = {}
    # get predictions at given ranks
    for seqId, taxonId in self._seqNameToPred.iteritems():
        while (taxonId is not None) and (taxonId != 1):
            rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
            if rankId in self._allowedRankIdsSet:
                self._rankIdToPredMap[rankId][seqId] = taxonId
            taxonId = self._taxonomy.getParent(taxonId)
    # get reference predictions at given ranks
    for seqId, taxonId in self._seqNameToRefPred.iteritems():
        while (taxonId is not None) and (taxonId != 1):
            rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
            if rankId in self._allowedRankIdsSet:
                self._rankIdToRefMap[rankId][seqId] = taxonId
            taxonId = self._taxonomy.getParent(taxonId)
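
# Usage sketch for the ConfusionMatrix initializer (hypothetical file paths),
# mirroring how the class is instantiated elsewhere in this file.
def _demoConfusionMatrix():
    cm = confusion_matrix.ConfusionMatrix('contigs.fna', 'assignments.csv',
                                          'labels.csv', 'taxonomy_ncbi.db',
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, 'output_dir/confusion_matrix')
    cm.close()
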
# Initializer of the consistency.Consistency class.
def __init__(self, contigNameToBp, contigNameToNcbid, scaffToContigList, taxonomy, minScaffContigCount=None,
             minScaffBpLen=None, cladesSet=None, considerContigWithNoScaff=True, ignoreScaffPredToRoot=True):
    """
        Initializes the main Consistency class.

        @param contigNameToBp: dictionary that maps contig names to bp (int);
            or a fasta file that contains the contigs
        @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
            or a prediction file - first column contig name, last column ncbid
        @param scaffToContigList: dictionary that maps scaffold names to lists of contig names;
            or a file - first column scaffold name, second column contig name
        @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
        @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
        @param cladesSet: consider only scaffolds that contain at least one contig from this set
        @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
            (as artificial scaffolds)
        @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
    """
    # check input options
    assert minScaffContigCount is None or isinstance(minScaffContigCount, int)
    assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
    assert cladesSet is None or isinstance(cladesSet, set)
    assert isinstance(considerContigWithNoScaff, bool)
    assert isinstance(ignoreScaffPredToRoot, bool)

    if isinstance(contigNameToBp, dict):
        self._contigNameToBp = contigNameToBp
    elif isinstance(contigNameToBp, str) and os.path.isfile(contigNameToBp):
        self._contigNameToBp = getSequenceToBpDict(contigNameToBp)
    else:
        print("Can't get contig info from: ", contigNameToBp)
        return
    if isinstance(contigNameToNcbid, dict):
        self._contigToPred = contigNameToNcbid
    elif isinstance(contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid):
        self._contigToPred = cami.readAssignments(contigNameToNcbid)
    else:
        print("Can't get prediction info from: ", contigNameToNcbid)
        return
    if isinstance(scaffToContigList, dict):
        self._scaffToContigsList = scaffToContigList
    elif isinstance(scaffToContigList, str) and os.path.isfile(scaffToContigList):
        self._scaffToContigsList = getMapping(scaffToContigList, 0, 1, '\t')
    else:
        print("Can't get scaffold contig mapping from: ", scaffToContigList)
        return
    if isinstance(taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed()):
        self._taxonomy = taxonomy
    elif isinstance(taxonomy, str) and os.path.isfile(taxonomy):
        self._taxonomy = _TaxonomyWrapper(taxonomy)
    else:
        print("Can't use taxonomy:", taxonomy)
        return

    # check the consistency of the data!
    # if a contig that is defined in the mapping doesn't exist (in the fasta file), remove it
    for scaff, contigsList in self._scaffToContigsList.iteritems():
        removeList = []
        for contig in contigsList:
            if contig not in self._contigNameToBp:
                removeList.append(contig)
        for contig in removeList:
            contigsList.remove(contig)

    # if a contig was predicted but there is no scaffold assigned to it, then this
    # contig is assigned to an "artificial scaffold"
    if considerContigWithNoScaff:
        scaffContigSet = set()
        for s, l in self._scaffToContigsList.iteritems():
            for c in l:
                scaffContigSet.add(c)
        aloneContigSet = set()
        for c in self._contigToPred:
            if c not in scaffContigSet:
                aloneContigSet.add(c)
        for c in aloneContigSet:
            scaffName = str('scaffold_' + c)  # make up a scaffold name
            assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
            self._scaffToContigsList[scaffName] = [c]

    # filter out scaffolds according to the input constraints
    self._scaffolds = dict()
    for scaffName, contigsList in self._scaffToContigsList.iteritems():
        if minScaffContigCount is not None:
            if len(contigsList) < minScaffContigCount:
                continue
        if minScaffBpLen is not None:
            bpSum = 0
            for contig in contigsList:
                bpSum += self._contigNameToBp[contig]
            if bpSum < minScaffBpLen:
                continue
        if cladesSet is not None:
            passScaff = False
            for contig in contigsList:
                if (contig in self._contigToPred) and (self._contigToPred[contig] in cladesSet):
                    passScaff = True
                    break
            if not passScaff:
                continue
        # process the scaffold; if everything in the scaffold was assigned to the root, ignore it!
        s = self._processScaffold(scaffName)
        if not ((s.getNcbid() == 1) and ignoreScaffPredToRoot):
            self._scaffolds[scaffName] = s
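
# Usage sketch for the Consistency initializer (hypothetical file paths),
# mirroring how the class is used for the scaffold-contig consistency
# computation in _main() below.
def _demoConsistency():
    cons = consistency.Consistency('contigs.fna', 'assignments.csv',
                                   'scaffold_to_contig.tsv', 'taxonomy_ncbi.db')
    print cons.getGroupedScaffoldsPrint()
    cons.close()
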
def getProfile(readsFFastaFile, communityFile, contigMFastaFile, contigLFastaFile, taxonomyMFile,
               taxonomyDbFile, outProfileFile):
    """
        Gets the profile of the dataset.

        @param readsFFastaFile: fasta file containing the (forward) reads
        @param communityFile: community file describing the dataset
        @param contigMFastaFile: fasta file containing the contigs (short ids)
        @param contigLFastaFile: fasta file containing the contigs (whole ids, incl. coverage)
        @param taxonomyMFile: file mapping contig ids to taxon ids
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(readsFFastaFile, communityFile,
                                       readHeaderToCommunityId=getCommunityId)[1:]:
        if taxonId in taxonIdToReadCount:
            taxonIdToReadCount[taxonId] += 1
        else:
            taxonIdToReadCount[taxonId] = 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        if taxonId in taxonIdToContigBp:
            taxonIdToContigBp[taxonId] += bp
        else:
            taxonIdToContigBp[taxonId] = bp
        if taxonId in taxonIdToContigCount:
            taxonIdToContigCount[taxonId] += 1
        else:
            taxonIdToContigCount[taxonId] = 1

    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        tupleList.append((taxonId,
                          round(100 * (readCount / float(readTotalCount)), 1),
                          round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)), 1),
                          round(taxonIdToAvgCov.get(taxonId, 0), 2),
                          round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
                          taxonIdToContigCount.get(taxonId, 0),
                          taxonomy.getScientificName(taxonId),
                          scName.getNameAtRank('phylum'),
                          scName.getNameAtRank('class'),
                          scName.getNameAtRank('order'),
                          scName.getNameAtRank('family'),
                          scName.getNameAtRank('genus'),
                          scName.getNameAtRank('species')))  # this could be done in a nicer way
        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(taxonId, 0)
    avgCoverage /= float(totalBp)
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText('#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
                  + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')
    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', '
                  + str(round(totalBp / 1000000.0, 2)) + ', ' + str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
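
# Hedged usage sketch for getProfile(); the file names are hypothetical
# placeholders for the inputs described in the docstring.
def _demoGetProfile():
    getProfile('reads_f.fna', 'community.txt', 'contigs_m.fna', 'contigs_l.fna',
               'contig_taxonomy.tsv', 'taxonomy_ncbi.db', 'profile.csv')
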
def _main():
    # define arguments
    parser = argparse.ArgumentParser(description='Default task: PPS+ evaluation', epilog='')

    parser.add_argument('-b', '--cont-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing labels assigned to contigs.',
                        metavar='assignments.csv', dest='b')
    parser.add_argument('-t', '--cont-true-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing true labels for the contigs.',
                        metavar='labels.csv', dest='t')
    parser.add_argument('-f', '--cont-contigs-file-listing', nargs=1, type=file, required=False,
                        help='A list of paths of FASTA contigs files.',
                        metavar='fasta_listing.txt', dest='f')
    parser.add_argument('-m', '--cont-scaffold-contig-mapping', nargs=1, type=file, required=False,
                        help='Scaffold contig mapping, tab separated.',
                        metavar='mapping.csv', dest='m')
    parser.add_argument('-n', '--cont-ncbi-taxonomy', nargs=1, required=False,
                        help='Directory containing the NCBI names.dmp and nodes.dmp files.',
                        metavar='taxonomy_dir', dest='n')
    parser.add_argument('-o', '--cont-output-dir', nargs=1, required=True,
                        help='Output directory.', metavar='output_dir', dest='o')
    parser.add_argument('-j', '--default-job', nargs='+',
                        help='What task/job should be performed (p~precision/recall, '
                             's~scaff-contig consistency, c~confusion tables; '
                             'if not specified, compute all)',
                        metavar='', dest='j')
    args = parser.parse_args()

    # read and check the arguments
    seqIdToBp = None
    scaffToContig = None
    binning = None
    trueBinning = None
    outputDir = None
    job = None
    if args.o and len(args.o) == 1 and os.path.isdir(args.o[0]):
        outputDir = args.o[0]
    if args.b and len(args.b) == 1 and os.path.isfile(args.b[0].name):
        binningFile = args.b[0].name
        binning = cami.readAssignments(binningFile)
    if args.t and len(args.t) == 1 and os.path.isfile(args.t[0].name):
        trueBinningFile = args.t[0].name
        trueBinning = cami.readAssignments(trueBinningFile)
    if args.f and len(args.f) == 1 and os.path.isfile(args.f[0].name):
        seqIdToBp = fasta.getSequenceToBpDict(args.f[0].name)
        # contigsFileListing = args.f[0].name
        # for line in open(contigsFileListing):
        #     if os.path.isfile(line.strip()):
        #         d = fasta.getSequenceToBpDict(line.strip())
        #         if seqIdToBp is None:
        #             seqIdToBp = d
        #         else:
        #             count = len(d) + len(seqIdToBp)
        #             seqIdToBp.update(d)
        #             if count > len(seqIdToBp):
        #                 sys.stderr.write('The fasta files contain duplicate entries!')
    if args.m and len(args.m) == 1 and os.path.isfile(args.m[0].name):
        scaffoldContigMapping = args.m[0].name
        scaffToContig = csv.getMapping(scaffoldContigMapping, 0, 1, '\t')
    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and len(args.n) == 1 and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in the case it doesn't exist
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None
    if args.j and len(args.j) > 0 and len(set(args.j).intersection(set(['p', 's', 'c']))) > 0:
        job = set(args.j)
    # print job
    # print args.j
    # print len(seqIdToBp)
    # print len(binning)
    # print len(trueBinning)
    # print taxonomyPath
    # print outputDir

    # compute precision/recall
    if (job is None or 'p' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing precision/recall')
        # precision/recall - no correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()
        # precision/recall - with correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath, CORRECT_LABEL_THRESHOLD)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall_correction.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

    # compute confusion matrices
    if (job is None or 'c' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        for rank in RANKS:
            confusionMatrix.generateConfusionMatrix(rank, os.path.join(outputDir, 'confusion_matrix'))
        confusionMatrix.close()

    # compute scaffold contig consistency
    if (job is None or 's' in args.j) and seqIdToBp and binning and scaffToContig and taxonomyPath \
            and outputDir:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'consistency.txt'))
        out.writeText(cons.getGroupedScaffoldsPrint())
        cons.close()
        out.close()

    createEvalMetaFile(outputDir)
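
# Hedged CLI sketch: how _main() might be invoked (option names are taken from
# the parser above; the script and file names are hypothetical):
#
#   python evaluate.py -b assignments.csv -t labels.csv -f fasta_listing.txt \
#       -m mapping.csv -n taxonomy_dir -o output_dir -j p s c
#
if __name__ == '__main__':
    _main()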