예제 #1
0
def scafToContigOutput(scaffContigMapFile, scaffPPSOutFile, contigPPSOutFile):
    """
        Takes scaffold-contigs mapping and scaffold placement (.out file), outputs contigs placement (.out file).

        @param scaffContigMapFile: tab separated scaffold-contigs mapping (scaffoldName \t contigName)
        @param scaffPPSOutFile: scaffold predictions (PPS output file)
        @param contigPPSOutFile: contigs predictions (as if it was a PPS output file)
    """
    # init output
    out = csv.OutFileBuffer(contigPPSOutFile)

    # read scaffold predictions
    scaffNameToTaxonId = csv.predToDict(scaffPPSOutFile)

    # read mapping: scaffName -> contigNameList
    scaffNameToContigNameList = csv.getMapping(scaffContigMapFile, 0, 1, sep='\t')

    # store contigs' predictions (according to scaffolds' predictions);
    # items() instead of py2-only iteritems() keeps this py2/py3 compatible
    for scaffName, contigNameList in scaffNameToContigNameList.items():
        # scaffolds without a prediction default to the taxonomy root (taxonId 1)
        taxonId = scaffNameToTaxonId.get(scaffName, 1)
        for contigName in contigNameList:
            out.writeText(contigName + '\t' + str(taxonId) + '\n')
    out.close()
예제 #2
0
def getReadsTaxonIdList(readsFile,
                        communityFile,
                        readHeaderToCommunityId=getCommunityId):
    """
        Gets list of taxonIds in the same order as they are in the readOnContig file.
        The first taxonId is at index 1 (index 0 holds a None placeholder).

        @param readsFile: fasta file; only header lines (starting with '>') are considered
        @param communityFile: tab separated file mapping communityId -> taxonId
        @param communityFile: read by csv.predToDict
        @param readHeaderToCommunityId: function extracting the communityId from a read header line
        @return: list with None at index 0 and two taxonId entries per read header
    """
    communityIdToTaxonId = csv.predToDict(communityFile)
    d = [None]
    rowList = csv.getColumnAsList(readsFile, colNum=0, sep='\n')
    for line in rowList:
        if str(line).startswith('>'):
            try:
                # get() returns None for an unknown header; int(None) raises TypeError
                taxonId = int(
                    communityIdToTaxonId.get(readHeaderToCommunityId(line)))
            except TypeError:
                # fixed: the original handler printed 'taxonId' here, which is unbound
                # when the very first header fails (NameError would mask the real
                # error); also 'ex.message' does not exist in python 3
                print("No taxonId mapping found for read header: %s" % line)
                raise
            # appended twice — presumably one entry per read of a pair (TODO confirm)
            d.append(taxonId)
            d.append(taxonId)
    return d
예제 #3
0
def toWellMappedContigs(inFastaFile,
                        inTaxonomyWFile,
                        outFastaFile,
                        outFastaMisAssembledFile,
                        outTaxonomyFile,
                        weightThreshold=0.99):
    """
        Creates the fasta and mapping files that contain well assembled contigs (filter out misassembled contigs).

        @param inFastaFile: input fasta file with contigs
        @param inTaxonomyWFile: input file that contains taxonomy with weights (seqId, weight, taxonId)
        @param outFastaFile: fasta file containing well assembled sequences
        @param outFastaMisAssembledFile: fasta file containing misassembled contigs
        @param outTaxonomyFile: resulting taxonomy of the well assembled sequences (seqId, taxonId)
        @param weightThreshold: only contigs the weight of which is at least this value will be taken
        @return: statistics (human readable summary string)
    """
    seqIdToTaxonId = csv.predToDict(inTaxonomyWFile)
    seqIdToWeight = csv.getMapping(inTaxonomyWFile, 0, 1, '\t')
    outFastaOk = csv.OutFileBuffer(outFastaFile)
    outFastaMis = csv.OutFileBuffer(outFastaMisAssembledFile)
    outTaxonomyOk = csv.OutFileBuffer(outTaxonomyFile)

    totalBp = 0.0
    totalCount = 0.0
    okBp = 0.0
    okCount = 0.0
    avgSumBp = 0.0  # coverage-weighted bp sum of the accepted contigs

    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFile).iteritems():
        bp = len(seq)
        totalBp += bp
        totalCount += 1
        # the weight/taxonomy files are keyed by the name up to the first space
        seqIdPrefix = str(seqId).split(' ')[0]
        weight = seqIdToWeight[seqIdPrefix][0]
        fastaEntry = '>' + str(seqIdPrefix) + '\n' + str(seq) + '\n'
        if float(weight) >= weightThreshold:
            outFastaOk.writeText(fastaEntry)
            outTaxonomyOk.writeText(
                str(seqIdPrefix) + '\t' + str(seqIdToTaxonId[seqIdPrefix]) +
                '\n')
            okBp += bp
            okCount += 1
            avgSumBp += getCoverage(seqId) * bp
        else:
            outFastaMis.writeText(fastaEntry)

    outFastaOk.close()
    outFastaMis.close()
    outTaxonomyOk.close()

    # fixed: guard the ratios — an empty input fasta (totalBp/totalCount == 0) or
    # no contig passing the threshold (okBp == 0) raised ZeroDivisionError before
    bpPct = round((okBp / totalBp) * 100, 2) if totalBp > 0 else 0.0
    seqPct = round((okCount / totalCount) * 100, 2) if totalCount > 0 else 0.0
    avgCov = round(avgSumBp / okBp, 3) if okBp > 0 else 0.0
    return 'Taken: %s/%sMB, %s/%sseq, %s%% bp %s%% seq, avg coverage %s' % (
        round(okBp / 1000000, 2), round(totalBp / 1000000, 2), okCount,
        totalCount, bpPct, seqPct, avgCov)
예제 #4
0
파일: cami.py 프로젝트: algbioi/docker_ppsp
def readAssignments(assignmentFile):
    """
        Reads an assignment file, either in the cami format or in the PPS output (out) format

        @rtype: dict
        @return: mapping(name->taxonId)
    """
    # dispatch on the file name extension: '*.binning' files hold the cami format
    extension = os.path.basename(assignmentFile).split('.')[-1]
    if extension == 'binning':
        return readCami(assignmentFile)
    return csv.predToDict(assignmentFile)
예제 #5
0
def readAssignments(assignmentFile):
    """
        Reads an assignment file, either in the cami format or in the PPS output (out) format

        @rtype: dict
        @return: mapping(name->taxonId)
    """
    # a '.cami' extension selects the cami reader, anything else is PPS output
    suffix = os.path.basename(assignmentFile).rsplit('.', 1)[-1]
    return readCami(assignmentFile) if suffix == 'cami' else csv.predToDict(assignmentFile)
예제 #6
0
def toWellMappedContigs(inFastaFile, inTaxonomyWFile,
                        outFastaFile, outFastaMisAssembledFile, outTaxonomyFile, weightThreshold=0.99):
    """
        Creates the fasta and mapping files that contain well assembled contigs (filter out misassembled contigs).

        @param inFastaFile: input fasta file with contigs
        @param inTaxonomyWFile: input file that contains taxonomy with weights (seqId, weight, taxonId)
        @param outFastaFile: fasta file containing well assembled sequences
        @param outFastaMisAssembledFile: fasta file containing misassembled contigs
        @param outTaxonomyFile: resulting taxonomy of the well assembled sequences (seqId, taxonId)
        @param weightThreshold: only contigs the weight of which is at least this value will be taken
        @return: statistics (human readable summary string)
    """
    seqIdToTaxonId = csv.predToDict(inTaxonomyWFile)
    seqIdToWeight = csv.getMapping(inTaxonomyWFile, 0, 1, '\t')
    outFastaOk = csv.OutFileBuffer(outFastaFile)
    outFastaMis = csv.OutFileBuffer(outFastaMisAssembledFile)
    outTaxonomyOk = csv.OutFileBuffer(outTaxonomyFile)

    totalBp = 0.0
    totalCount = 0.0
    okBp = 0.0
    okCount = 0.0
    avgSumBp = 0.0  # coverage-weighted bp sum of the accepted contigs

    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFile).iteritems():
        bp = len(seq)
        totalBp += bp
        totalCount += 1
        # the weight/taxonomy files are keyed by the name up to the first space
        seqIdPrefix = str(seqId).split(' ')[0]
        weight = seqIdToWeight[seqIdPrefix][0]
        fastaEntry = '>' + str(seqIdPrefix) + '\n' + str(seq) + '\n'
        if float(weight) >= weightThreshold:
            outFastaOk.writeText(fastaEntry)
            outTaxonomyOk.writeText(str(seqIdPrefix) + '\t' + str(seqIdToTaxonId[seqIdPrefix]) + '\n')
            okBp += bp
            okCount += 1
            avgSumBp += getCoverage(seqId) * bp
        else:
            outFastaMis.writeText(fastaEntry)

    outFastaOk.close()
    outFastaMis.close()
    outTaxonomyOk.close()

    # fixed: guard the ratios — an empty input fasta (totalBp/totalCount == 0) or
    # no contig passing the threshold (okBp == 0) raised ZeroDivisionError before
    bpPct = round((okBp / totalBp) * 100, 2) if totalBp > 0 else 0.0
    seqPct = round((okCount / totalCount) * 100, 2) if totalCount > 0 else 0.0
    avgCov = round(avgSumBp / okBp, 3) if okBp > 0 else 0.0
    return 'Taken: %s/%sMB, %s/%sseq, %s%% bp %s%% seq, avg coverage %s' % (round(okBp / 1000000, 2),
                                                                            round(totalBp / 1000000, 2),
                                                                            okCount, totalCount,
                                                                            bpPct, seqPct, avgCov)
예제 #7
0
    def __init__(self, seqIdToBp, seqIdToPred, seqIdToTruePred, taxonomy, correctLabelThreshold=None):
        """
            Initializes the accuracy object.
            @param seqIdToBp: dictionary or a fasta file
            @param seqIdToPred: dictionary or a prediction file
            @param seqIdToTruePred: dictionary or a true prediction file
            @param taxonomy: database file in the sqlite3 format, or taxonomy object retrieved from not closed Accuracy
        """
        def asDict(source, readFun):
            # dictionaries pass through; anything else must be an existing file
            # that is read via the supplied reader function
            if isinstance(source, dict):
                return source
            assert os.path.isfile(source)
            return readFun(source)

        self._seqToBp = asDict(seqIdToBp, fasta.getSequenceToBpDict)
        self._seqToPred = asDict(seqIdToPred, csv.predToDict)
        self._seqToTrue = asDict(seqIdToTruePred, csv.predToDict)

        # the taxonomy may be shared with another (not closed) Accuracy instance
        if isinstance(taxonomy, _TaxonomyWrapperA):
            self._taxonomy = taxonomy
        else:
            assert os.path.isfile(taxonomy)
            self._taxonomy = _TaxonomyWrapperA(taxonomy)

        # optionally correct the predictions self._seqToPred
        if correctLabelThreshold is not None:
            self._seqToPred = self._correctPredictions(
                self._seqToBp, self._seqToPred, self._seqToTrue, self._taxonomy, correctLabelThreshold)
예제 #8
0
def filterOutContigs(inFastaFile, inTaxFile, outFastaFile, outTaxFile, notAllowedTaxonIdList):
    """
        Copies sequences whose taxonId is allowed to the output fasta/taxonomy
        files and counts, per not-allowed taxonId, how many sequences were dropped.
    """
    outFasta = csv.OutFileBuffer(outFastaFile)
    outTax = csv.OutFileBuffer(outTaxFile)
    seqIdToTaxonId = csv.predToDict(inTaxFile)
    notAllowedTaxonIdSet = set(notAllowedTaxonIdList)
    # per-taxonId counter of filtered-out sequences, initialized to zero
    taxonIdToFilteredSeq = dict.fromkeys(notAllowedTaxonIdSet, 0)
    for seqId, seq in fas.fastaFileToDict(inFastaFile).iteritems():
        taxonId = int(seqIdToTaxonId[seqId])
        if taxonId in notAllowedTaxonIdSet:
            taxonIdToFilteredSeq[taxonId] += 1
        else:
            outFasta.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
            outTax.writeText(str(seqId) + '\t' + str(taxonId) + '\n')
    outFasta.close()
    outTax.close()
    print("filtered taxonId -> seqCount: " + str(taxonIdToFilteredSeq))
예제 #9
0
def filterOutContigs(inFastaFile, inTaxFile, outFastaFile, outTaxFile,
                     notAllowedTaxonIdList):
    """
        Writes out only the sequences whose taxonId is not blacklisted; reports
        how many sequences were filtered out for each blacklisted taxonId.
    """
    blacklist = set(notAllowedTaxonIdList)
    filteredCounts = {}
    for tid in blacklist:
        filteredCounts[tid] = 0
    outFasta = csv.OutFileBuffer(outFastaFile)
    outTax = csv.OutFileBuffer(outTaxFile)
    seqIdToTaxonId = csv.predToDict(inTaxFile)
    for seqId, seq in fas.fastaFileToDict(inFastaFile).iteritems():
        tid = int(seqIdToTaxonId[seqId])
        if tid not in blacklist:
            outFasta.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
            outTax.writeText(str(seqId) + '\t' + str(tid) + '\n')
        else:
            filteredCounts[tid] += 1
    outFasta.close()
    outTax.close()
    print("filtered taxonId -> seqCount: " + str(filteredCounts))
예제 #10
0
def getReadsTaxonIdList(readsFile, communityFile, readHeaderToCommunityId=getCommunityId):
    """
        Gets list of taxonIds in the same order as they are in the readOnContig file.
        The first taxonId is at index 1 (index 0 holds a None placeholder).

        @param readsFile: fasta file; only header lines (starting with '>') are considered
        @param communityFile: tab separated file mapping communityId -> taxonId (read by csv.predToDict)
        @param readHeaderToCommunityId: function extracting the communityId from a read header line
        @return: list with None at index 0 and two taxonId entries per read header
    """
    communityIdToTaxonId = csv.predToDict(communityFile)
    d = [None]
    rowList = csv.getColumnAsList(readsFile, colNum=0, sep='\n')
    for line in rowList:
        if str(line).startswith('>'):
            try:
                # get() returns None for an unknown header; int(None) raises TypeError
                taxonId = int(communityIdToTaxonId.get(readHeaderToCommunityId(line)))
            except TypeError:
                # fixed: the original handler printed 'taxonId' here, which is unbound
                # when the very first header fails (NameError would mask the real
                # error); also 'ex.message' does not exist in python 3
                print("No taxonId mapping found for read header: %s" % line)
                raise
            # appended twice — presumably one entry per read of a pair (TODO confirm)
            d.append(taxonId)
            d.append(taxonId)
    return d
예제 #11
0
파일: pps.py 프로젝트: algbioi/ppsplus
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data is fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    # fail early if any required directory or file is missing
    for d in [workingDir, taWorkingDir, sampleSpecificDir,
              ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts, os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file;
    # merged ids are encoded as taxonId|dirName|seqId, with an extra
    # '|label:<taxonId>' suffix for the sample specific data only
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            # the file name is expected to start with the taxonId ("<taxonId>.…")
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    #label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    # NOTE(review): 'id' shadows the builtin
                    id = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        # the PPS predict script is run via a shell, from the PPS install directory
        predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir, stdout=logOut,
                                       stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s" %
                            (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    # ('.nox.fna.out' is presumably the suffix PPS appends to its output — TODO confirm)
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    # collect all ancestors of the modelled leaves, i.e. the internal (misc) nodes
    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        # true label parsed from the 'label:<taxonId>' suffix after the last '|'
        # (assumes every merged id carries one — TODO confirm for PPS-sampled dirs)
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            # sequences labeled at internal (misc) nodes are evaluated separately
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        # the directory name is the second '|'-separated field of the merged id
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    # keep the taxonomy object alive (closeTaxonomy=False) so the subsequent
    # Accuracy instances can reuse it instead of re-opening the database
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    # the confusion-matrix taxonomy is likewise kept open for reuse below
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                           minFracClade=None, minFracPred=None, overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    # the shared taxonomy objects are only closed at the very end
    taxonomyA.close()
    taxonomyCM.close()
예제 #12
0
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir,
                            ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName,
                            modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data is fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    # fail early if any required directory or file is missing
    for d in [
            workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir,
            outputDir, ppsInstallDir, ppsScripts,
            os.path.dirname(predictLogFileName)
    ]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file;
    # merged ids are encoded as taxonId|dirName|seqId, with an extra
    # '|label:<taxonId>' suffix for the sample specific data only
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            # the file name is expected to start with the taxonId ("<taxonId>.…")
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(
                    d, f)).iteritems():
                if d == sampleSpecificDir:
                    #label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    # NOTE(review): 'id' shadows the builtin
                    id = str(
                        taxonId) + '|' + dName + '|' + seqId + '|label:' + str(
                            taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        # the PPS predict script is run via a shell, from the PPS install directory
        predictCmd = str(
            os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile +
            ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(
            predictCmd,
            shell=True,
            bufsize=-1,
            cwd=ppsInstallDir,
            stdout=logOut,
            stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception(
                "PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    # ('.nox.fna.out' is presumably the suffix PPS appends to its output — TODO confirm)
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int,
                                csv.getColumnAsList(modelTaxonIdFilePath)))

    # collect all ancestors of the modelled leaves, i.e. the internal (misc) nodes
    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(
            set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        # true label parsed from the 'label:<taxonId>' suffix after the last '|'
        # (assumes every merged id carries one — TODO confirm for PPS-sampled dirs)
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            # sequences labeled at internal (misc) nodes are evaluated separately
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        # the directory name is the second '|'-separated field of the merged id
        if str(seqId).split(
                '|',
                2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred,
                            seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    # keep the taxonomy object alive (closeTaxonomy=False) so the subsequent
    # Accuracy instances can reuse it instead of re-opening the database
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc,
                            seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred,
                                          seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(
            rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    # the confusion-matrix taxonomy is likewise kept open for reuse below
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub,
                                seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(
            acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                 minFracClade=None,
                                 minFracPred=None,
                                 overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(
            seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
            taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    # the shared taxonomy objects are only closed at the very end
    taxonomyA.close()
    taxonomyCM.close()
    def __init__(self,
                 seqNameToBp,
                 seqNameToPred,
                 seqNameToRefPred,
                 taxonomy,
                 ranksList=None):
        """
            Initializes the main class that computes the confusion matrices.

            @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
                @type seqNameToBp: dict; or a fasta file
            @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
                @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param seqNameToRefPred: contains mapping, sequence name to taxon Id; or a tab separated reference file
                @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks)
                @type ranksList: list of str
            @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
        """
        # Check input options and read in the data (if appropriate)
        self._initFailed = False  # replace this with exceptions!
        if isinstance(seqNameToBp, dict):
            self._seqNameToBp = seqNameToBp
        elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
            self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
        else:
            print("Can't get sequence info from:", seqNameToBp)
            self._initFailed = True
            return
        if isinstance(seqNameToPred, dict):
            self._seqNameToPred = seqNameToPred
        elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
            self._seqNameToPred = csv.predToDict(seqNameToPred)
        else:
            print("Can't get prediction info from:", seqNameToPred)
            self._initFailed = True
            return
        if isinstance(seqNameToRefPred, dict):
            self._seqNameToRefPred = seqNameToRefPred
        elif isinstance(seqNameToRefPred,
                        str) and os.path.isfile(seqNameToRefPred):
            self._seqNameToRefPred = csv.predToDict(seqNameToRefPred)
        else:
            print("Can't get reference prediction info from:",
                  seqNameToRefPred)
            self._initFailed = True
            return
        if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapCM(taxonomy)
        elif isinstance(taxonomy, _TaxonomyWrapCM):
            self._taxonomy = taxonomy
        else:
            print("Can't use taxonomy: ", taxonomy)
            # fixed: unlike the sibling branches above, this branch only printed a
            # warning; execution then continued and crashed later with an
            # AttributeError on self._taxonomy — fail consistently instead
            self._initFailed = True
            return
        if ranksList is None:
            ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
        else:
            allowedRanksSet = set(
                taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
            for rank in ranksList:
                if rank not in allowedRanksSet:
                    print('Rank: "' + str(rank) + '" is not allowed!')
                    self._initFailed = True
                    return
        rankIdsList = []  # rankIds that will be considered
        for rank in ranksList:
            rankIdsList.append(self._taxonomy.getRankId(rank))
        self._allowedRankIdsSet = set(rankIdsList)

        # get predictions at different taxonomic ranks
        # rankId -> (seqId -> taxonIdAtRank)
        self._rankIdToPredMap = {}
        self._rankIdToRefMap = {}
        for rankId in rankIdsList:
            self._rankIdToPredMap[rankId] = {}
            self._rankIdToRefMap[rankId] = {}

        # get predictions at given ranks: walk each taxonId up towards the root
        # (taxonId 1) and record it at every rank of interest along the path
        for seqId, taxonId in self._seqNameToPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToPredMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)

        # get reference predictions at given ranks (same root-ward walk)
        for seqId, taxonId in self._seqNameToRefPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToRefMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)
예제 #14
0
def getProfile(readsFFastaFile, communityFile, contigMFastaFile,
               contigLFastaFile, taxonomyMFile, taxonomyDbFile,
               outProfileFile):
    """
        Gets the profile of the dataset and writes it as a CSV file.

        For each taxonId the profile contains: % of reads, % of contig bp,
        average coverage (weighted by contig length), contig MB, contig count,
        the scientific name, and names at the major taxonomic ranks.

        @param readsFFastaFile: fasta file containing the reads
        @param communityFile: community file (community id -> taxonId)
        @param contigMFastaFile: fasta file containing the contigs
        @param contigLFastaFile: fasta file whose headers carry coverage info
        @param taxonomyMFile: tab separated file (contig name -> taxonId)
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    # (entry 0 of the returned list is a placeholder, hence the [1:])
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(
            readsFFastaFile, communityFile,
            readHeaderToCommunityId=getCommunityId)[1:]:
        taxonIdToReadCount[taxonId] = taxonIdToReadCount.get(taxonId, 0) + 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        taxonIdToContigBp[taxonId] = taxonIdToContigBp.get(taxonId, 0) + bp
        taxonIdToContigCount[taxonId] = \
            taxonIdToContigCount.get(taxonId, 0) + 1

    # init per-taxon accumulators for coverage-weighted bp sums
    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    # the "L" fasta headers carry the coverage; match them to the "M" contigs
    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    # average coverage per taxonId, weighted by the contig lengths
    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        # guard against an empty dataset: totalBp may be 0 when no contig
        # from the "L" file matched, which previously caused ZeroDivisionError
        if totalBp > 0:
            percentBp = round(
                100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)), 1)
        else:
            percentBp = 0.0
        tupleList.append((
            taxonId,
            round(100 * (readCount / float(readTotalCount)), 1),
            percentBp,
            round(taxonIdToAvgCov.get(taxonId, 0), 2),
            round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
            taxonIdToContigCount.get(taxonId, 0),
            taxonomy.getScientificName(taxonId),
            scName.getNameAtRank('phylum'),
            scName.getNameAtRank('class'),
            scName.getNameAtRank('order'),
            scName.getNameAtRank('family'),
            scName.getNameAtRank('genus'),
            scName.getNameAtRank(
                'species')  # this could be done in a nicer way
        ))

        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(
            taxonId, 0)
    if totalBp > 0:  # guard: avoid ZeroDivisionError on an empty dataset
        avgCoverage /= float(totalBp)
    # sort by the "% contigs" column, descending
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText(
        '#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
        + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')

    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', ' +
                  str(round(totalBp / 1000000.0, 2)) + ', ' +
                  str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
예제 #15
0
    def __init__(self, seqNameToBp, seqNameToPred, seqNameToRefPred, taxonomy, ranksList=None):
        """
            Initializes the main class that computes the confusion matrices.

            @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
                @type seqNameToBp: dict; or a fasta file
            @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
                @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param seqNameToRefPred: contains mapping, sequence name to taxon Id; or a tab separated reference file
                @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks)
                @type ranksList: list of str
            @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
        """
        # Check input options and read in the data (if appropriate)
        self._initFailed = False  # replace this with exceptions!
        if isinstance(seqNameToBp, dict):
            self._seqNameToBp = seqNameToBp
        elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
            self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
        else:
            print("Can't get sequence info from:", seqNameToBp)
            self._initFailed = True
            return
        if isinstance(seqNameToPred, dict):
            self._seqNameToPred = seqNameToPred
        elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
            self._seqNameToPred = csv.predToDict(seqNameToPred)
        else:
            print("Can't get prediction info from:", seqNameToPred)
            self._initFailed = True
            return
        if isinstance(seqNameToRefPred, dict):
            self._seqNameToRefPred = seqNameToRefPred
        elif isinstance(seqNameToRefPred, str) and os.path.isfile(seqNameToRefPred):
            self._seqNameToRefPred = csv.predToDict(seqNameToRefPred)
        else:
            print("Can't get reference prediction info from:", seqNameToRefPred)
            self._initFailed = True
            return
        if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapCM(taxonomy)
        elif isinstance(taxonomy, _TaxonomyWrapCM):
            self._taxonomy = taxonomy
        else:
            print("Can't use taxonomy: ", taxonomy)
            # BUG FIX: previously this branch fell through without marking the
            # failure, so the code below crashed on the unset self._taxonomy
            self._initFailed = True
            return
        if ranksList is None:
            ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
        else:
            allowedRanksSet = set(taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
            for rank in ranksList:
                if rank not in allowedRanksSet:
                    print('Rank: "' + str(rank) + '" is not allowed!')
                    self._initFailed = True
                    return
        rankIdsList = []  # rankIds that will be considered
        for rank in ranksList:
            rankIdsList.append(self._taxonomy.getRankId(rank))
        self._allowedRankIdsSet = set(rankIdsList)

        # get predictions at different taxonomic ranks
        # rankId -> (seqId -> taxonIdAtRank)
        self._rankIdToPredMap = {}
        self._rankIdToRefMap = {}
        for rankId in rankIdsList:
            self._rankIdToPredMap[rankId] = {}
            self._rankIdToRefMap[rankId] = {}

        # get predictions at given ranks: walk each prediction up the taxonomy
        # towards the root (taxonId 1) and record it at every allowed rank
        for seqId, taxonId in self._seqNameToPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToPredMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)

        # get reference predictions at given ranks (same walk as above)
        for seqId, taxonId in self._seqNameToRefPred.iteritems():
            while (taxonId is not None) and (taxonId != 1):
                rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                if rankId in self._allowedRankIdsSet:
                    self._rankIdToRefMap[rankId][seqId] = taxonId
                taxonId = self._taxonomy.getParent(taxonId)
예제 #16
0
    def __init__(self, contigNameToBp, contigNameToNcbid, scaffToContigList, taxonomy,
                 minScaffContigCount=None, minScaffBpLen=None, cladesSet=None, considerContigWithNoScaff=True,
                 ignoreScaffPredToRoot=True):
        """
            Initializes the main Consistency class.

            @param contigNameToBp: dictionary that maps contig names to bp (int);
                or a fasta file that contain contigs
            @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
                or a prediction file - first column contig name, last column ncbid
            @param scaffToContigList: dictionary that maps scaffold names to list of contig names;
                or a file - first column scaffold name, second column contig name
            @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
            @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
            @param cladesSet: consider only scaffolds that contain at least one contig from this set
            @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
                (as artificial scaffolds)
            @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
        """
        # check input options
        assert minScaffContigCount is None or isinstance(minScaffContigCount, int)
        assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
        assert cladesSet is None or isinstance(cladesSet, set)
        assert isinstance(considerContigWithNoScaff, bool)
        assert isinstance(ignoreScaffPredToRoot, bool)

        if isinstance(contigNameToBp, dict):
            self._contigNameToBp = contigNameToBp
        elif isinstance(contigNameToBp, str) and os.path.isfile(contigNameToBp):
            self._contigNameToBp = getSequenceToBpDict(contigNameToBp)
        else:
            print("Can't get contig info from: ", contigNameToBp)
            return
        if isinstance(contigNameToNcbid, dict):
            self._contigToPred = contigNameToNcbid
        elif isinstance(contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid):
            self._contigToPred = predToDict(contigNameToNcbid)
        else:
            print("Can't get prediction info from: ", contigNameToNcbid)
            return
        if isinstance(scaffToContigList, dict):
            self._scaffToContigsList = scaffToContigList
        elif isinstance(scaffToContigList, str) and os.path.isfile(scaffToContigList):
            self._scaffToContigsList = getMapping(scaffToContigList, 0, 1, '\t')
        else:
            print("Can't get scaffold config mapping from: ", scaffToContigList)
            return

        if isinstance(taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed()):
            self._taxonomy = taxonomy
        elif isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapper(taxonomy)
        else:
            print("Can't use taxonomy:", taxonomy)
            return

        # check the consistency of the data!

        # if a contig that is defined in the mapping doesn't exist (in the fasta file) we remove it
        # (collect first, then remove - never mutate a list while iterating it)
        for scaff, contigsList in self._scaffToContigsList.iteritems():
            removeList = [contig for contig in contigsList if contig not in self._contigNameToBp]
            for contig in removeList:
                contigsList.remove(contig)

        # if a contig was predicted but there is no scaffold assigned to it then this
        # contig is assigned to an "artificial scaffold"
        if considerContigWithNoScaff:
            scaffContigSet = set()
            for s, l in self._scaffToContigsList.iteritems():
                scaffContigSet.update(l)
            aloneContigSet = set(c for c in self._contigToPred if c not in scaffContigSet)

            for c in aloneContigSet:
                scaffName = str('scaffold_' + c)  # make up a scaffold name
                assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
                self._scaffToContigsList[scaffName] = [c]

        # filter out scaffolds according to the input constrains
        self._scaffolds = dict()
        for scaffName, contigsList in self._scaffToContigsList.iteritems():
            if minScaffContigCount is not None:
                if len(contigsList) < minScaffContigCount:
                    continue

            if minScaffBpLen is not None:
                # FIX: use the builtin sum() instead of shadowing it with a local
                scaffBpLen = sum(self._contigNameToBp[contig] for contig in contigsList)
                if scaffBpLen < minScaffBpLen:
                    continue

            if cladesSet is not None:
                # keep the scaffold only if at least one contig is predicted to a clade from cladesSet
                passScaff = any(
                    (contig in self._contigToPred) and (self._contigToPred[contig] in cladesSet)
                    for contig in contigsList)
                if not passScaff:
                    continue

            # process the scaffold, but if everything in the scaffold was assigned to the root, then ignore it!
            s = self._processScaffold(scaffName)
            if not ((s.getNcbid() == 1) and ignoreScaffPredToRoot):
                self._scaffolds[scaffName] = s
예제 #17
0
def getProfile(readsFFastaFile,
               communityFile, contigMFastaFile, contigLFastaFile, taxonomyMFile, taxonomyDbFile, outProfileFile):
    """
        Gets the profile of the dataset and writes it as a CSV file.

        For each taxonId the profile contains: % of reads, % of contig bp,
        average coverage (weighted by contig length), contig MB, contig count,
        the scientific name, and names at the major taxonomic ranks.

        @param readsFFastaFile: fasta file containing the reads
        @param communityFile: community file (community id -> taxonId)
        @param contigMFastaFile: fasta file containing the contigs
        @param contigLFastaFile: fasta file whose headers carry coverage info
        @param taxonomyMFile: tab separated file (contig name -> taxonId)
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    # (entry 0 of the returned list is a placeholder, hence the [1:])
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(readsFFastaFile, communityFile, readHeaderToCommunityId=getCommunityId)[1:]:
        taxonIdToReadCount[taxonId] = taxonIdToReadCount.get(taxonId, 0) + 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        taxonIdToContigBp[taxonId] = taxonIdToContigBp.get(taxonId, 0) + bp
        taxonIdToContigCount[taxonId] = taxonIdToContigCount.get(taxonId, 0) + 1

    # init per-taxon accumulators for coverage-weighted bp sums
    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    # the "L" fasta headers carry the coverage; match them to the "M" contigs
    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    # average coverage per taxonId, weighted by the contig lengths
    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        # guard against an empty dataset: totalBp may be 0 when no contig
        # from the "L" file matched, which previously caused ZeroDivisionError
        if totalBp > 0:
            percentBp = round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)), 1)
        else:
            percentBp = 0.0
        tupleList.append((taxonId,
                          round(100 * (readCount / float(readTotalCount)), 1),
                          percentBp,
                          round(taxonIdToAvgCov.get(taxonId, 0), 2),
                          round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
                          taxonIdToContigCount.get(taxonId, 0),
                          taxonomy.getScientificName(taxonId),
                          scName.getNameAtRank('phylum'),
                          scName.getNameAtRank('class'),
                          scName.getNameAtRank('order'),
                          scName.getNameAtRank('family'),
                          scName.getNameAtRank('genus'),
                          scName.getNameAtRank('species')  # this could be done in a nicer way
        ))

        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(taxonId, 0)
    if totalBp > 0:  # guard: avoid ZeroDivisionError on an empty dataset
        avgCoverage /= float(totalBp)
    # sort by the "% contigs" column, descending
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText('#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, ' +
                  ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')

    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', ' + str(round(totalBp / 1000000.0, 2)) +
                  ', ' + str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
예제 #18
0
    def __init__(self,
                 contigNameToBp,
                 contigNameToNcbid,
                 scaffToContigList,
                 taxonomy,
                 minScaffContigCount=None,
                 minScaffBpLen=None,
                 cladesSet=None,
                 considerContigWithNoScaff=True,
                 ignoreScaffPredToRoot=True):
        """
            Initializes the main Consistency class.

            @param contigNameToBp: dictionary that maps contig names to bp (int);
                or a fasta file that contain contigs
            @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
                or a prediction file - first column contig name, last column ncbid
            @param scaffToContigList: dictionary that maps scaffold names to list of contig names;
                or a file - first column scaffold name, second column contig name
            @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
            @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
            @param cladesSet: consider only scaffolds that contain at least one contig from this set
            @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
                (as artificial scaffolds)
            @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
        """
        # check input options
        assert minScaffContigCount is None or isinstance(
            minScaffContigCount, int)
        assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
        assert cladesSet is None or isinstance(cladesSet, set)
        assert isinstance(considerContigWithNoScaff, bool)
        assert isinstance(ignoreScaffPredToRoot, bool)

        if isinstance(contigNameToBp, dict):
            self._contigNameToBp = contigNameToBp
        elif isinstance(contigNameToBp,
                        str) and os.path.isfile(contigNameToBp):
            self._contigNameToBp = getSequenceToBpDict(contigNameToBp)
        else:
            print("Can't get contig info from: ", contigNameToBp)
            return
        if isinstance(contigNameToNcbid, dict):
            self._contigToPred = contigNameToNcbid
        elif isinstance(contigNameToNcbid,
                        str) and os.path.isfile(contigNameToNcbid):
            self._contigToPred = predToDict(contigNameToNcbid)
        else:
            print("Can't get prediction info from: ", contigNameToNcbid)
            return
        if isinstance(scaffToContigList, dict):
            self._scaffToContigsList = scaffToContigList
        elif isinstance(scaffToContigList,
                        str) and os.path.isfile(scaffToContigList):
            self._scaffToContigsList = getMapping(scaffToContigList, 0, 1,
                                                  '\t')
        else:
            print("Can't get scaffold config mapping from: ",
                  scaffToContigList)
            return

        if isinstance(taxonomy,
                      _TaxonomyWrapper) and (not taxonomy.isClosed()):
            self._taxonomy = taxonomy
        elif isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapper(taxonomy)
        else:
            print("Can't use taxonomy:", taxonomy)
            return

        # check the consistency of the data!

        # if a contig that is defined in the mapping doesn't exist (in the fasta file) we remove it
        # (collect first, then remove - never mutate a list while iterating it)
        for scaff, contigsList in self._scaffToContigsList.iteritems():
            removeList = [
                contig for contig in contigsList
                if contig not in self._contigNameToBp
            ]
            for contig in removeList:
                contigsList.remove(contig)

        # if a contig was predicted but there is no scaffold assigned to it then this
        # contig is assigned to an "artificial scaffold"
        if considerContigWithNoScaff:
            scaffContigSet = set()
            for s, l in self._scaffToContigsList.iteritems():
                scaffContigSet.update(l)
            aloneContigSet = set(c for c in self._contigToPred
                                 if c not in scaffContigSet)

            for c in aloneContigSet:
                scaffName = str('scaffold_' + c)  # make up a scaffold name
                assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
                self._scaffToContigsList[scaffName] = [c]

        # filter out scaffolds according to the input constrains
        self._scaffolds = dict()
        for scaffName, contigsList in self._scaffToContigsList.iteritems():
            if minScaffContigCount is not None:
                if len(contigsList) < minScaffContigCount:
                    continue

            if minScaffBpLen is not None:
                # FIX: use the builtin sum() instead of shadowing it with a local
                scaffBpLen = sum(self._contigNameToBp[contig]
                                 for contig in contigsList)
                if scaffBpLen < minScaffBpLen:
                    continue

            if cladesSet is not None:
                # keep the scaffold only if at least one contig is predicted
                # to a clade from cladesSet
                passScaff = any((contig in self._contigToPred) and (
                    self._contigToPred[contig] in cladesSet)
                                for contig in contigsList)
                if not passScaff:
                    continue

            # process the scaffold, but if everything in the scaffold was assigned to the root, then ignore it!
            s = self._processScaffold(scaffName)
            if not ((s.getNcbid() == 1) and ignoreScaffPredToRoot):
                self._scaffolds[scaffName] = s