Example #1
def getLenStat(fileName, minLength=1000):
    """
        Get basic statistics concerning the lengths of the sequences.

        @param fileName: fasta file
        @type fileName: str
    """
    buf = ""
    c = 0
    bp = 0
    minLen = sys.maxint
    maxLen = 0
    totalBp = 0
    totalCount = 0
    for k, l in fas.getSequenceToBpDict(fileName).iteritems():
        totalCount += 1
        totalBp += l
        if l >= minLength:
            c += 1
            bp += l
            if l < minLen:
                minLen = l
            if l > maxLen:  # not `elif`: the first long sequence must set both minLen and maxLen
                maxLen = l

    buf += 'Bigger than %sbp (sequences: %s, Mbp: %s)\n' % (
        minLength, c, round(float(bp) / 1000000.0, 3))
    buf += 'Bigger than %sbp (min: %s, max %s, avg %s bp)\n' % (
        minLength, minLen, maxLen, round(float(bp) / max(c, 1)))  # max() guards against an empty selection
    buf += 'Total (sequences: %s, Mbp: %s)\n' % (
        totalCount, round(float(totalBp) / 1000000.0, 3))
    return buf
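
Every example on this page revolves around getSequenceToBpDict, whose source is not shown here. A minimal sketch of what it presumably does, assuming plain fasta input where the sequence id is the first token of the header line:

def getSequenceToBpDict(fastaFilePath):
    """Sketch (assumed behaviour): map each sequence id to its length in bp."""
    seqIdToBp = {}
    seqId = None
    with open(fastaFilePath) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                seqId = line[1:].split()[0]  # id = first token of the header
                seqIdToBp[seqId] = 0
            elif seqId is not None:
                seqIdToBp[seqId] += len(line)  # a record may span several lines
    return seqIdToBp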
Example #2
def getMaxLen(fastaFilePath):
    """
        Gets the length of the longest sequence in a fasta file.
    """
    maxLen = 0
    for val in fasta.getSequenceToBpDict(fastaFilePath).itervalues():
        if maxLen < int(val):
            maxLen = int(val)
    return maxLen
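
Because the dict values are already the lengths, the same result can be had with the built-in max; a one-line sketch (unlike the loop above, this raises ValueError on an empty fasta file instead of returning 0):

maxLen = max(fasta.getSequenceToBpDict(fastaFilePath).itervalues())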
Example #3
def toContigsLabelList(inFastaFileName, readsF, readsR, readOnContig,
                       community, outMappingFileName):
    """
        Gets a mapping from contigIds to lists of taxonIds of the individual reads of the contigs.

        @param inFastaFileName: fasta file containing the contigs
        @param readsF: fasta file containing the forward reads
        @param readsR: fasta file containing the reverse reads
        @param readOnContig: tab separated file mapping reads to contigs
        @param community: community file (used to resolve the taxonIds of the reads)
        @param outMappingFileName: output file: contigId (tab) comma-separated taxonIds
    """
    # contigIds
    contigIdToBp = fas.getSequenceToBpDict(inFastaFileName)

    # map: contigId -> list of readIds
    contigIdToReadList = csv.getMapping(readOnContig,
                                        1,
                                        0,
                                        sep='\t',
                                        comment='r')

    # taxonIds as a list for reads
    readFTaxonIdList = getReadsTaxonIdList(readsF, community)
    print 's1'
    readRTaxonIdList = getReadsTaxonIdList(readsR, community)
    print 's2'

    if len(readFTaxonIdList) != len(readRTaxonIdList):
        print(
            'toContigsLabels: different number of reads in the reads files, exit'
        )
        return

    for i in range(1, len(readFTaxonIdList)):
        if readFTaxonIdList[i] != readRTaxonIdList[i]:
            print(
                'toContigsLabels: at index %s different taxon ids %s and %s' %
                (i, readFTaxonIdList[i], readRTaxonIdList[i]))
        if readFTaxonIdList[i] is None or readRTaxonIdList[i] is None:
            print('toContigsLabels: at index %s, one is None %s or %s' %
                  (i, readFTaxonIdList[i], readRTaxonIdList[i]))
    print 's3'
    #
    out = csv.OutFileBuffer(outMappingFileName)
    for contigId in contigIdToBp:
        try:
            readList = contigIdToReadList[contigId]
            taxonIdList = []
            for readId in readList:
                taxonIdList.append(readFTaxonIdList[int(readId)])
            out.writeText(
                str(contigId) + '\t' + ','.join(map(str, taxonIdList)) + '\n')
        except KeyError:
            print("No label for contigId: %s" % contigId)
    out.close()
    print 's4'
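
The contig -> reads mapping above comes from csv.getMapping, which is also not shown. Judging from the call (key column 1, value column 0, tab separated, lines starting with 'r' skipped as comments), a minimal sketch might be:

def getMapping(filePath, keyColNum, valColNum, sep='\t', comment='#'):
    """Sketch (assumed behaviour): map entries of one column to lists of entries of another."""
    mapping = {}
    with open(filePath) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line or line.startswith(comment):
                continue
            tokens = line.split(sep)
            mapping.setdefault(tokens[keyColNum], []).append(tokens[valColNum])
    return mapping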
Example #4
def filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSeqIdSet):
    """
        For each fasta file in the directory srcDir, filters out the sequences whose ids are in notAllowedSeqIdSet.
    """
    for taxonId in taxonIdSet:
        srcFilePath = os.path.join(srcDir,str(str(taxonId) + '.1.fna'))
        dstFilePath = os.path.join(dstDir,str(str(taxonId) + '.1.fna'))

        seqIdDict = fasta.getSequenceToBpDict(srcFilePath)
        allowedNamesSet = set()
        for id in seqIdDict.iterkeys():
            if id not in notAllowedSeqIdSet:
                allowedNamesSet.add(id)

        fasta.filterOutSequences(srcFilePath, dstFilePath, allowedNamesSet)
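
The actual filtering is delegated to fasta.filterOutSequences, not shown here. A minimal sketch, assuming it copies exactly the records whose ids appear in allowedNamesSet:

def filterOutSequences(srcFilePath, dstFilePath, allowedNamesSet):
    """Sketch (assumed behaviour): write only the allowed fasta records to dstFilePath."""
    keep = False
    with open(srcFilePath) as src, open(dstFilePath, 'w') as dst:
        for line in src:
            if line.startswith('>'):
                keep = line[1:].split()[0] in allowedNamesSet  # id = first header token
            if keep:
                dst.write(line)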
Example #5
    def __init__(self,
                 seqIdToBp,
                 seqIdToPred,
                 seqIdToTruePred,
                 taxonomy,
                 correctLabelThreshold=None):
        """
            Initializes the accuracy object.
            @param seqIdToBp: dictionary or a fasta file
            @param seqIdToPred: dictionary or a prediction file
            @param seqIdToTruePred: dictionary or a true prediction file
            @param taxonomy: database file in the sqlite3 format, or a taxonomy object retrieved from an Accuracy instance that has not been closed
        """
        if isinstance(seqIdToBp, dict):
            self._seqToBp = seqIdToBp
        else:
            assert os.path.isfile(seqIdToBp)
            self._seqToBp = fasta.getSequenceToBpDict(seqIdToBp)

        if isinstance(seqIdToPred, dict):
            self._seqToPred = seqIdToPred
        else:
            assert os.path.isfile(seqIdToPred)
            self._seqToPred = cami.readAssignments(seqIdToPred)

        if isinstance(seqIdToTruePred, dict):
            self._seqToTrue = seqIdToTruePred
        else:
            assert os.path.isfile(seqIdToTruePred)
            self._seqToTrue = cami.readAssignments(seqIdToTruePred)

        if isinstance(taxonomy, _TaxonomyWrapperA):
            self._taxonomy = taxonomy
        else:
            assert os.path.isfile(taxonomy)
            self._taxonomy = _TaxonomyWrapperA(taxonomy)

        # correct the predictions self._seqToPred
        if correctLabelThreshold is not None:
            self._seqToPred = self._correctPredictions(self._seqToBp,
                                                       self._seqToPred,
                                                       self._seqToTrue,
                                                       self._taxonomy,
                                                       correctLabelThreshold)
Example #6
    def __init__(self, seqIdToBp, seqIdToPred, seqIdToTruePred, taxonomy, correctLabelThreshold=None):
        """
            Initializes the accuracy object.
            @param seqIdToBp: dictionary or a fasta file
            @param seqIdToPred: dictionary or a prediction file
            @param seqIdToTruePred: dictionary or a true prediction file
            @param taxonomy: database file in the sqlite3 format, or a taxonomy object retrieved from an Accuracy instance that has not been closed
        """
        if isinstance(seqIdToBp, dict):
            self._seqToBp = seqIdToBp
        else:
            assert os.path.isfile(seqIdToBp)
            self._seqToBp = fasta.getSequenceToBpDict(seqIdToBp)

        if isinstance(seqIdToPred, dict):
            self._seqToPred = seqIdToPred
        else:
            assert os.path.isfile(seqIdToPred)
            self._seqToPred = csv.predToDict(seqIdToPred)

        if isinstance(seqIdToTruePred, dict):
            self._seqToTrue = seqIdToTruePred
        else:
            assert os.path.isfile(seqIdToTruePred)
            self._seqToTrue = csv.predToDict(seqIdToTruePred)

        if isinstance(taxonomy, _TaxonomyWrapperA):
            self._taxonomy = taxonomy
        else:
            assert os.path.isfile(taxonomy)
            self._taxonomy = _TaxonomyWrapperA(taxonomy)

        # correct the predictions self._seqToPred
        if correctLabelThreshold is not None:
            self._seqToPred = self._correctPredictions(
                self._seqToBp, self._seqToPred, self._seqToTrue, self._taxonomy, correctLabelThreshold)
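
Both variants accept each input either as an already parsed dict or as a path to the corresponding file, so the object can be built straight from files. A hypothetical call (all file names are placeholders):

acc = Accuracy('contigs.fna', 'assignments.csv', 'true_labels.csv', 'ncbi_taxonomy.db')
print(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None, minFracPred=None))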
Example #7
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir,
                            ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName,
                            modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider the training data used to train intermediate (misc) nodes!
        The training data that correspond to the sample specific data are fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [
            workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir,
            outputDir, ppsInstallDir, ppsScripts,
            os.path.dirname(predictLogFileName)
    ]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    # label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    id = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(
            predictCmd,
            shell=True,
            bufsize=-1,
            cwd=ppsInstallDir,
            stdout=logOut,
            stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception(
                "PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int,
                                csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(
            set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred,
                            seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc,
                            seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred,
                                          seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(
            rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub,
                                seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(
            acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                 minFracClade=None,
                                 minFracPred=None,
                                 overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(
            seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
            taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
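
The composite sequence ids built above carry both the source directory and the true label, which the function later recovers by string splitting. An illustration on a made-up id:

# hypothetical id, in the format produced for the sample specific data above
seqId = '562|sample_specific_dir|contig_7|label:562'
label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])  # -> 562
dName = str(seqId).split('|', 2)[1]                              # -> 'sample_specific_dir'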
Example #8
    def __init__(self, seqNameToBp, seqNameToPred, seqNameToRefPred, taxonomy, ranksList=None):
        """
            Initializes the main class that computes the confusion matrices.

            @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
                @type seqNameToBp: dict; or a fasta file
            @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
                @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param seqNameToRefPred: contains mapping, sequence name to taxon Id; or a tab separated reference file
                @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks)
                @type ranksList: list of str
            @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
        """
        # Check input options and read in the data (if appropriate)
        self._initFailed = False  # replace this with exceptions!
        if isinstance(seqNameToBp, dict):
            self._seqNameToBp = seqNameToBp
        elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
            self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
        else:
            print("Can't get sequence info from:", seqNameToBp)
            self._initFailed = True
            return
        if isinstance(seqNameToPred, dict):
            self._seqNameToPred = seqNameToPred
        elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
            self._seqNameToPred = csv.predToDict(seqNameToPred)
        else:
            print("Can't get prediction info from:", seqNameToPred)
            self._initFailed = True
            return
        if isinstance(seqNameToRefPred, dict):
            self._seqNameToRefPred = seqNameToRefPred
        elif isinstance(seqNameToRefPred, str) and os.path.isfile(seqNameToRefPred):
            self._seqNameToRefPred = csv.predToDict(seqNameToRefPred)
        else:
            print("Can't get reference prediction info from:", seqNameToRefPred)
            self._initFailed = True
            return
        if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapCM(taxonomy)
        elif isinstance(taxonomy, _TaxonomyWrapCM):
            self._taxonomy = taxonomy
        else:
            print("Can't use taxonomy: ", taxonomy)
        if ranksList is None:
            ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
        else:
            allowedRanksSet = set(taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
            for rank in ranksList:
                if rank not in allowedRanksSet:
                    print('Rank: "' + str(rank) + '" is not allowed!')
                    self._initFailed = True
                    return
        rankIdsList = []  # rankIds that will be considered
        for rank in ranksList:
            rankIdsList.append(self._taxonomy.getRankId(rank))
        self._allowedRankIdsSet = set(rankIdsList)

        # get predictions at different taxonomic ranks
        # rankId -> (seqId -> taxonIdAtRank)
        self._rankIdToPredMap = {}
        self._rankIdToRefMap = {}
        for rankId in rankIdsList:
            self._rankIdToPredMap[rankId] = {}
            self._rankIdToRefMap[rankId] = {}

        # get predictions at given ranks
        for seqId, taxonId in self._seqNameToPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToPredMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)

        # get reference predictions at given ranks
        for seqId, taxonId in self._seqNameToRefPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToRefMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)
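
The two loops at the end project every assignment to each requested rank by climbing to the parent taxon until the root (taxonId 1) is reached. A toy illustration with a hard-coded taxonomy (hypothetical data):

parentOf = {7: 4, 4: 2, 2: 1}  # taxonId -> parent taxonId, 1 is the root
rankOf = {7: 'species', 4: 'genus', 2: 'phylum'}

def projectToRank(taxonId, rank):
    while (taxonId is not None) and (taxonId != 1):
        if rankOf.get(taxonId) == rank:
            return taxonId
        taxonId = parentOf.get(taxonId)
    return None

assert projectToRank(7, 'genus') == 4  # a species is mapped to its genus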
Example #9
    def __init__(self,
                 seqNameToBp,
                 seqNameToPred,
                 seqNameToRefPred,
                 taxonomy,
                 ranksList=None):
        """
            Initializes the main class that computes the confusion matrices.

            @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
                @type seqNameToBp: dict; or a fasta file
            @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
                @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param seqNameToRefPred: contains mapping, sequence name to taxon Id; or a tab separated reference file
                @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks)
                @type ranksList: list of str
            @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
        """
        # Check input options and read in the data (if appropriate)
        self._initFailed = False  # replace this with exceptions!
        if isinstance(seqNameToBp, dict):
            self._seqNameToBp = seqNameToBp
        elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
            self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
        else:
            print("Can't get sequence info from:", seqNameToBp)
            self._initFailed = True
            return
        if isinstance(seqNameToPred, dict):
            self._seqNameToPred = seqNameToPred
        elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
            self._seqNameToPred = cami.readAssignments(seqNameToPred)
        else:
            print("Can't get prediction info from:", seqNameToPred)
            self._initFailed = True
            return
        if isinstance(seqNameToRefPred, dict):
            self._seqNameToRefPred = seqNameToRefPred
        elif isinstance(seqNameToRefPred,
                        str) and os.path.isfile(seqNameToRefPred):
            self._seqNameToRefPred = cami.readAssignments(seqNameToRefPred)
        else:
            print("Can't get reference prediction info from:",
                  seqNameToRefPred)
            self._initFailed = True
            return
        if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapCM(taxonomy)
        elif isinstance(taxonomy, _TaxonomyWrapCM):
            self._taxonomy = taxonomy
        else:
            print("Can't use taxonomy: ", taxonomy)
        if ranksList is None:
            ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
        else:
            allowedRanksSet = set(
                taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
            for rank in ranksList:
                if rank not in allowedRanksSet:
                    print('Rank: "' + str(rank) + '" is not allowed!')
                    self._initFailed = True
                    return
        rankIdsList = []  # rankIds that will be considered
        for rank in ranksList:
            rankIdsList.append(self._taxonomy.getRankId(rank))
        self._allowedRankIdsSet = set(rankIdsList)

        # get predictions at different taxonomic ranks
        # rankId -> (seqId -> taxonIdAtRank)
        self._rankIdToPredMap = {}
        self._rankIdToRefMap = {}
        for rankId in rankIdsList:
            self._rankIdToPredMap[rankId] = {}
            self._rankIdToRefMap[rankId] = {}

        # get predictions at given ranks
        for seqId, taxonId in self._seqNameToPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToPredMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)

        # get reference predictions at given ranks
        for seqId, taxonId in self._seqNameToRefPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToRefMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)
Example #10
    def __init__(self, contigNameToBp, contigNameToNcbid, scaffToContigList, taxonomy,
                 minScaffContigCount=None, minScaffBpLen=None, cladesSet=None, considerContigWithNoScaff=True,
                 ignoreScaffPredToRoot=True):
        """
            Initializes the main Consistency class.

            @param contigNameToBp: dictionary that maps contig names to bp (int);
                or a fasta file that contain contigs
            @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
                or a prediction file - first column contig name, last column ncbid
            @param scaffToContigList: dictionary that maps scaffold names to list of contig names;
                or a file - first column scaffold name, second column contig name
            @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
            @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
            @param cladesSet: consider only scaffolds that contain at least one contig from this set
            @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
                (as artificial scaffolds)
            @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
        """
        # check input options
        assert minScaffContigCount is None or isinstance(minScaffContigCount, int)
        assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
        assert cladesSet is None or isinstance(cladesSet, set)
        assert isinstance(considerContigWithNoScaff, bool)
        assert isinstance(ignoreScaffPredToRoot, bool)

        if isinstance(contigNameToBp, dict):
            self._contigNameToBp = contigNameToBp
        elif isinstance(contigNameToBp, str) and os.path.isfile(contigNameToBp):
            self._contigNameToBp = getSequenceToBpDict(contigNameToBp)
        else:
            print("Can't get contig info from: ", contigNameToBp)
            return
        if isinstance(contigNameToNcbid, dict):
            self._contigToPred = contigNameToNcbid
        elif isinstance(contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid):
            self._contigToPred = cami.readAssignments(contigNameToNcbid)
        else:
            print("Can't get prediction info from: ", contigNameToNcbid)
            return
        if isinstance(scaffToContigList, dict):
            self._scaffToContigsList = scaffToContigList
        elif isinstance(scaffToContigList, str) and os.path.isfile(scaffToContigList):
            self._scaffToContigsList = getMapping(scaffToContigList, 0, 1, '\t')
        else:
            print("Can't get scaffold config mapping from: ", scaffToContigList)
            return

        if isinstance(taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed()):
            self._taxonomy = taxonomy
        elif isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapper(taxonomy)
        else:
            print("Can't use taxonomy:", taxonomy)
            return

        # check the consistency of the data!

        # if a contig that is defined in the mapping doesn't exist (in the fasta file) we remove it
        for scaff, contigsList in self._scaffToContigsList.iteritems():
            removeList = []
            for contig in contigsList:
                if contig not in self._contigNameToBp:
                    removeList.append(contig)

            for contig in removeList:
                contigsList.remove(contig)

        # if a contig was predicted but there is no scaffold assigned to it then this
        # contig is assigned to an "artificial scaffold"
        if considerContigWithNoScaff:
            scaffContigSet = set()
            for s, l in self._scaffToContigsList.iteritems():
                for c in l:
                    scaffContigSet.add(c)
            aloneContigSet = set()
            for c in self._contigToPred:
                if c not in scaffContigSet:
                    aloneContigSet.add(c)

            for c in aloneContigSet:
                scaffName = str('scaffold_' + c)  # make up a scaffold name
                assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
                self._scaffToContigsList[scaffName] = [c]

        # filter out scaffolds according to the input constrains
        self._scaffolds = dict()
        for scaffName, contigsList in self._scaffToContigsList.iteritems():
            if minScaffContigCount is not None:
                if len(contigsList) < minScaffContigCount:
                    continue

            if minScaffBpLen is not None:
                bpSum = 0  # renamed from `sum` to avoid shadowing the builtin
                for contig in contigsList:
                    bpSum += self._contigNameToBp[contig]
                if bpSum < minScaffBpLen:
                    continue

            if cladesSet is not None:
                passScaff = False
                for contig in contigsList:
                    if (contig in self._contigToPred) and (self._contigToPred[contig] in cladesSet):
                        passScaff = True
                        break
                if not passScaff:
                    continue

            # process the scaffold, but if everything in the scaffold was assigned to the root, then ignore it!
            s = self._processScaffold(scaffName)
            if not ((s.getNcbid() == 1) and ignoreScaffPredToRoot):
                self._scaffolds[scaffName] = s
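
As with the other classes on this page, each input of Consistency can be given either as a parsed structure or as a file. A hypothetical call (file names are placeholders):

cons = Consistency('contigs.fna', 'assignments.csv', 'scaff_to_contig.tsv', 'ncbi_taxonomy.db',
                   minScaffContigCount=2, minScaffBpLen=10000)
print(cons.getGroupedScaffoldsPrint())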
Example #11
def getProfile(readsFFastaFile,
               communityFile, contigMFastaFile, contigLFastaFile, taxonomyMFile, taxonomyDbFile, outProfileFile):
    """
        Gets the profile of the dataset.

        @param readsFFastaFile: fasta file containing the forward reads
        @param communityFile: community file of the simulated dataset
        @param contigMFastaFile: fasta file containing the contigs
        @param contigLFastaFile: fasta file whose sequence names also encode the contig coverage
        @param taxonomyMFile: tab separated file mapping contigs to taxonIds
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(readsFFastaFile, communityFile, readHeaderToCommunityId=getCommunityId)[1:]:
        if taxonId in taxonIdToReadCount:
            taxonIdToReadCount[taxonId] += 1
        else:
            taxonIdToReadCount[taxonId] = 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        if taxonId in taxonIdToContigBp:
            taxonIdToContigBp[taxonId] += bp
        else:
            taxonIdToContigBp[taxonId] = bp
        if taxonId in taxonIdToContigCount:
            taxonIdToContigCount[taxonId] += 1
        else:
            taxonIdToContigCount[taxonId] = 1

    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        tupleList.append((taxonId,
                          round(100 * (readCount / float(readTotalCount)), 1),
                          round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)), 1),
                          round(taxonIdToAvgCov.get(taxonId, 0), 2),
                          round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
                          taxonIdToContigCount.get(taxonId, 0),
                          taxonomy.getScientificName(taxonId),
                          scName.getNameAtRank('phylum'),
                          scName.getNameAtRank('class'),
                          scName.getNameAtRank('order'),
                          scName.getNameAtRank('family'),
                          scName.getNameAtRank('genus'),
                          scName.getNameAtRank('species')  # this could be done in a nicer way
        ))

        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(taxonId, 0)
    avgCoverage /= float(totalBp)
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText('#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, ' +
                  ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')

    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', ' + str(round(totalBp / 1000000.0, 2)) +
                  ', ' + str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
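
The '#Sum/Avg.' line reports the bp-weighted average coverage, i.e. sum(avgCov_i * bp_i) / totalBp. With toy numbers:

taxa = [(10.0, 2000000), (2.0, 500000)]  # hypothetical (average coverage, total bp) per taxon
totalBp = float(sum(bp for _, bp in taxa))
avgCoverage = sum(cov * bp for cov, bp in taxa) / totalBp  # -> 8.4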
Example #12
def _main():
    # define arguments
    parser = argparse.ArgumentParser(description='Default task: PPS+ evaluation', epilog='')

    parser.add_argument('-b', '--cont-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing labels assigned to contigs.', metavar='assignments.csv', dest='b')

    parser.add_argument('-t', '--cont-true-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing true labels for the contigs.', metavar='labels.csv', dest='t')

    parser.add_argument('-f', '--cont-contigs-file-listing', nargs=1, type=file, required=False,
                        help='Fasta file containing the contigs.', metavar='contigs.fna', dest='f')

    parser.add_argument('-m', '--cont-scaffold-contig-mapping', nargs=1, type=file, required=False,
                        help='Scaffold contig mapping, tab separated.', metavar='mapping.csv', dest='m')

    parser.add_argument('-n', '--cont-ncbi-taxonomy', nargs=1, required=False,
                        help='Directory containing the NCBI names.dmp and nodes.dmp files.', metavar='taxonomy_dir',
                        dest='n')

    parser.add_argument('-o', '--cont-output-dir', nargs=1, required=True,
                        help='Output directory.', metavar='output_dir', dest='o')

    parser.add_argument('-j', '--default-job', nargs='+',
                        help='What task/job should be performed (p~precision/recall, s~scaff-contig consistency, '
                             'c~confusion tables, default - if not spec compute all)', metavar='', dest='j')

    args = parser.parse_args()

    # read and check the arguments
    seqIdToBp = None
    scaffToContig = None
    binning = None
    trueBinning = None
    outputDir = None
    job = None

    if args.o and len(args.o) == 1 and os.path.isdir(args.o[0]):
        outputDir = args.o[0]

    if args.b and len(args.b) == 1 and os.path.isfile(args.b[0].name):
        binningFile = args.b[0].name
        binning = cami.readAssignments(binningFile)

    if args.t and len(args.t) == 1 and os.path.isfile(args.t[0].name):
        trueBinningFile = args.t[0].name
        trueBinning = cami.readAssignments(trueBinningFile)

    if args.f and len(args.f) == 1 and os.path.isfile(args.f[0].name):
        seqIdToBp = fasta.getSequenceToBpDict(args.f[0].name)

        # contigsFileListing = args.f[0].name
        # for line in open(contigsFileListing):
        #     if os.path.isfile(line.strip()):
        #         d = fasta.getSequenceToBpDict(line.strip())
        #         if seqIdToBp is None:
        #             seqIdToBp = d
        #         else:
        #             count = len(d) + len(seqIdToBp)
        #             seqIdToBp.update(d)
        #             if count > len(seqIdToBp):
        #                 sys.stderr.write('The fasta files contain duplicate entries!')

    if args.m and len(args.m) == 1 and os.path.isfile(args.m[0].name):
        scaffoldContigMapping = args.m[0].name
        scaffToContig = csv.getMapping(scaffoldContigMapping, 0, 1, '\t')

    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and len(args.n) == 1 and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in the case it doesn't exist
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None

    if args.j and len(args.j) > 0 and len(set(args.j).intersection(set(['p', 's', 'c']))) > 0:
        job = set(args.j)

    # print job
    # print args.j
    # print len(seqIdToBp)
    # print len(binning)
    # print len(trueBinning)
    # print taxonomyPath
    # print outputDir

    if (job is None or 'p' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing precision/recall')
        # precision/recall - no correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

        # precision/recall - with correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath, CORRECT_LABEL_THRESHOLD)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall_correction.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

    # compute confusion matrices
    if (job is None or 'c' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        for rank in RANKS:
            confusionMatrix.generateConfusionMatrix(rank, os.path.join(outputDir, 'confusion_matrix'))
        confusionMatrix.close()

    # compute scaffold contig consistency
    if (job is None or 's' in args.j) and seqIdToBp and binning and scaffToContig and taxonomyPath \
            and outputDir:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'consistency.txt'))
        out.writeText(cons.getGroupedScaffoldsPrint())
        cons.close()
        out.close()

    createEvalMetaFile(outputDir)
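
A hypothetical invocation of this evaluation task (the script and file names are placeholders; '-j' selects precision/recall, confusion tables and scaffold-contig consistency):

# python pps_evaluate.py -b assignments.csv -t labels.csv -f contigs.fna \
#     -m scaffold_contig_mapping.tsv -n taxonomy_dir -o output_dir -j p c s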
def getProfile(readsFFastaFile, communityFile, contigMFastaFile,
               contigLFastaFile, taxonomyMFile, taxonomyDbFile,
               outProfileFile):
    """
        Gets the profile of the dataset.

        @param readsFFastaFile:
        @param communityFile:
        @param contigMFastaFile:
        @param contigLFastaFile:
        @param taxonomyMFile:
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(
            readsFFastaFile, communityFile,
            readHeaderToCommunityId=getCommunityId)[1:]:
        if taxonId in taxonIdToReadCount:
            taxonIdToReadCount[taxonId] += 1
        else:
            taxonIdToReadCount[taxonId] = 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        if taxonId in taxonIdToContigBp:
            taxonIdToContigBp[taxonId] += bp
        else:
            taxonIdToContigBp[taxonId] = bp
        if taxonId in taxonIdToContigCount:
            taxonIdToContigCount[taxonId] += 1
        else:
            taxonIdToContigCount[taxonId] = 1

    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        tupleList.append((
            taxonId,
            round(100 * (readCount / float(readTotalCount)), 1),
            round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)),
                  1),
            round(taxonIdToAvgCov.get(taxonId, 0), 2),
            round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
            taxonIdToContigCount.get(taxonId, 0),
            taxonomy.getScientificName(taxonId),
            scName.getNameAtRank('phylum'),
            scName.getNameAtRank('class'),
            scName.getNameAtRank('order'),
            scName.getNameAtRank('family'),
            scName.getNameAtRank('genus'),
            scName.getNameAtRank(
                'species')  # this could be done in a nicer way
        ))

        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(
            taxonId, 0)
    avgCoverage /= float(totalBp)
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText(
        '#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
        + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')

    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', ' +
                  str(round(totalBp / 1000000.0, 2)) + ', ' +
                  str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
Exemple #19
0
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data is fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir,
              ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts, os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    mergedId = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    mergedId = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + mergedId + '\n' + seq + '\n')
                seqIdToTruePred[mergedId] = taxonId
    out.close()

    # predict the merged file using the generated model
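    # (PPS is invoked from its install dir, presumably so that relative paths
    # in the configuration resolve correctly)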
    if os.name == 'posix':
        predictCmd = os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir,
                                       stdout=logOut, stderr=subprocess.STDOUT)
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception("PPS 'predict' on the training data returned a non-zero status: %s, cmd: %s" %
                            (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for leafTaxonId in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, taxonomyS.getParentsNcbidSet(leafTaxonId))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
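    # sequences whose true label is an inner (misc) node go into the *Misc
    # dictionaries and are evaluated separately below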
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
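        # each merged id is assumed to end with "label:<taxonId>" (appended
        # above for the sample specific data; produced by PPS for the
        # fragmented training data)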
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                           minFracClade=None, minFracPred=None, overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
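
The merged ids built above carry the true label inside the id string; a small
encode/parse round-trip sketch (taxon id, directory and sequence names are
made up):

# encode as "<taxonId>|<dirName>|<origSeqId>|label:<taxonId>" (the form used
# for the sample specific data above), then parse the label back the same way
taxonId = 83333  # hypothetical
mergedId = str(taxonId) + '|sample_specific|seq42|label:' + str(taxonId)
label = int(str(mergedId).rsplit('|', 1)[1].split(':', 1)[1])
assert label == taxonId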
Exemple #20
def _main():
    # define arguments
    parser = argparse.ArgumentParser(description='Default task: PPS+ evaluation')

    parser.add_argument(
        '-b',
        '--cont-binning-file',
        nargs=1,
        type=file,
        required=True,
        help='Binning file containing labels assigned to contigs.',
        metavar='assignments.csv',
        dest='b')

    parser.add_argument(
        '-t',
        '--cont-true-binning-file',
        nargs=1,
        type=file,
        required=True,
        help='Binning file containing true labels for the contigs.',
        metavar='labels.csv',
        dest='t')

    parser.add_argument('-f',
                        '--cont-contigs-file-listing',
                        nargs=1,
                        type=file,
                        required=False,
                        help='FASTA file with contigs (the file-listing variant '
                             'is commented out below).',
                        metavar='contigs.fna',
                        dest='f')

    parser.add_argument('-m',
                        '--cont-scaffold-contig-mapping',
                        nargs=1,
                        type=file,
                        required=False,
                        help='Scaffold contig mapping, tab separated.',
                        metavar='mapping.csv',
                        dest='m')

    parser.add_argument(
        '-n',
        '--cont-ncbi-taxonomy',
        nargs=1,
        required=False,
        help='Directory containing the NCBI names.dmp and nodes.dmp files.',
        metavar='taxonomy_dir',
        dest='n')

    parser.add_argument('-o',
                        '--cont-output-dir',
                        nargs=1,
                        required=True,
                        help='Output directory.',
                        metavar='output_dir',
                        dest='o')

    parser.add_argument(
        '-j',
        '--default-job',
        nargs='+',
        help='Which tasks to perform: p~precision/recall, s~scaffold-contig '
             'consistency, c~confusion tables (default: compute all).',
        metavar='',
        dest='j')

    args = parser.parse_args()

    # read and check the arguments
    seqIdToBp = None
    scaffToContig = None
    binning = None
    trueBinning = None
    outputDir = None
    job = None

    if args.o and len(args.o) == 1 and os.path.isdir(args.o[0]):
        outputDir = args.o[0]
    else:
        parser.error("The output directory doesn't exist: %s" % args.o)

    if args.b and len(args.b) == 1 and os.path.isfile(args.b[0].name):
        binningFile = args.b[0].name
        binning = cami.readAssignments(binningFile)

    if args.t and len(args.t) == 1 and os.path.isfile(args.t[0].name):
        trueBinningFile = args.t[0].name
        trueBinning = cami.readAssignments(trueBinningFile)

    if args.f and len(args.f) == 1 and os.path.isfile(args.f[0].name):
        seqIdToBp = fasta.getSequenceToBpDict(args.f[0].name)

        # contigsFileListing = args.f[0].name
        # for line in open(contigsFileListing):
        #     if os.path.isfile(line.strip()):
        #         d = fasta.getSequenceToBpDict(line.strip())
        #         if seqIdToBp is None:
        #             seqIdToBp = d
        #         else:
        #             count = len(d) + len(seqIdToBp)
        #             seqIdToBp.update(d)
        #             if count > len(seqIdToBp):
        #                 sys.stderr.write('The fasta files contain duplicate entries!')

    if args.m and len(args.m) == 1 and os.path.isfile(args.m[0].name):
        scaffoldContigMapping = args.m[0].name
        scaffToContig = csv.getMapping(scaffoldContigMapping, 0, 1, '\t')

    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and len(args.n) == 1 and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in the case it doesn't exist
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None

    if args.j and set(args.j).intersection(['p', 's', 'c']):
        job = set(args.j)


    if (job is None or 'p' in job) and seqIdToBp and binning and trueBinning \
            and taxonomyPath and outputDir:
        print('Computing precision/recall')
        # precision/recall - no correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir,
                                             'precision_recall.csv'))
        out.writeText(
            acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

        # precision/recall - with correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath,
                                CORRECT_LABEL_THRESHOLD)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'precision_recall_correction.csv'))
        out.writeText(
            acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

    # compute confusion matrices
    if (job is None or 'c' in job) and seqIdToBp and binning and trueBinning \
            and taxonomyPath and outputDir:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(
            seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        for rank in RANKS:
            confusionMatrix.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'confusion_matrix'))
        confusionMatrix.close()

    # compute scaffold contig consistency
    if (job is None or 's' in job) and seqIdToBp and binning and scaffToContig and taxonomyPath \
            and outputDir:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig,
                                       taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'consistency.txt'))
        out.writeText(cons.getGroupedScaffoldsPrint())
        cons.close()
        out.close()

    createEvalMetaFile(outputDir)
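
A minimal invocation sketch based on the argparse flags defined above (the
script and file names are hypothetical):

# python pps_evaluate.py -b assignments.csv -t labels.csv -f contigs.fna \
#     -n taxonomy_dir -o output_dir -j p c s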