Ejemplo n.º 1
0
    def __init__(self,
                 seqIdToBp,
                 seqIdToPred,
                 seqIdToTruePred,
                 taxonomy,
                 correctLabelThreshold=None):
        """
            Set up the accuracy object from in-memory data or from files.

            Each of the first three arguments is used as is when it is a dict; otherwise it
            must name an existing file (checked with an assert) that is parsed into a dict.

            @param seqIdToBp: dictionary; or a fasta file (read via fasta.getSequenceToBpDict)
            @param seqIdToPred: dictionary; or a prediction file (read via cami.readAssignments)
            @param seqIdToTruePred: dictionary; or a true prediction file (read via cami.readAssignments)
            @param taxonomy: database file in the sqlite3 format, or taxonomy object
                (a _TaxonomyWrapperA instance) retrieved from not closed Accuracy
            @param correctLabelThreshold: if not None, the stored predictions are replaced by the
                result of self._correctPredictions called with this threshold
        """
        # sequence lengths
        if not isinstance(seqIdToBp, dict):
            assert os.path.isfile(seqIdToBp)
            seqIdToBp = fasta.getSequenceToBpDict(seqIdToBp)
        self._seqToBp = seqIdToBp

        # predicted labels
        if not isinstance(seqIdToPred, dict):
            assert os.path.isfile(seqIdToPred)
            seqIdToPred = cami.readAssignments(seqIdToPred)
        self._seqToPred = seqIdToPred

        # true labels
        if not isinstance(seqIdToTruePred, dict):
            assert os.path.isfile(seqIdToTruePred)
            seqIdToTruePred = cami.readAssignments(seqIdToTruePred)
        self._seqToTrue = seqIdToTruePred

        # taxonomy: reuse an existing wrapper, otherwise wrap the given file
        if not isinstance(taxonomy, _TaxonomyWrapperA):
            assert os.path.isfile(taxonomy)
            taxonomy = _TaxonomyWrapperA(taxonomy)
        self._taxonomy = taxonomy

        # without a threshold the predictions are kept untouched
        if correctLabelThreshold is None:
            return

        # correct the predictions self._seqToPred
        self._seqToPred = self._correctPredictions(self._seqToBp,
                                                   self._seqToPred,
                                                   self._seqToTrue,
                                                   self._taxonomy,
                                                   correctLabelThreshold)
Ejemplo n.º 2
0
    def __init__(self, seqIdToBp, seqIdToPred, seqIdToTruePred, taxonomy, correctLabelThreshold=None):
        """
            Set up the accuracy object from in-memory data or from files.

            Each of the first three arguments is used as is when it is a dict; otherwise it
            must name an existing file (checked with an assert) that is parsed into a dict.

            @param seqIdToBp: dictionary; or a fasta file (read via fasta.getSequenceToBpDict)
            @param seqIdToPred: dictionary; or a prediction file (read via cami.readAssignments)
            @param seqIdToTruePred: dictionary; or a true prediction file (read via cami.readAssignments)
            @param taxonomy: database file in the sqlite3 format, or taxonomy object
                (a _TaxonomyWrapperA instance) retrieved from not closed Accuracy
            @param correctLabelThreshold: if not None, the stored predictions are replaced by the
                result of self._correctPredictions called with this threshold
        """
        # sequence lengths
        if not isinstance(seqIdToBp, dict):
            assert os.path.isfile(seqIdToBp)
            seqIdToBp = fasta.getSequenceToBpDict(seqIdToBp)
        self._seqToBp = seqIdToBp

        # predicted labels
        if not isinstance(seqIdToPred, dict):
            assert os.path.isfile(seqIdToPred)
            seqIdToPred = cami.readAssignments(seqIdToPred)
        self._seqToPred = seqIdToPred

        # true labels
        if not isinstance(seqIdToTruePred, dict):
            assert os.path.isfile(seqIdToTruePred)
            seqIdToTruePred = cami.readAssignments(seqIdToTruePred)
        self._seqToTrue = seqIdToTruePred

        # taxonomy: reuse an existing wrapper, otherwise wrap the given file
        if not isinstance(taxonomy, _TaxonomyWrapperA):
            assert os.path.isfile(taxonomy)
            taxonomy = _TaxonomyWrapperA(taxonomy)
        self._taxonomy = taxonomy

        # without a threshold the predictions are kept untouched
        if correctLabelThreshold is None:
            return

        # correct the predictions self._seqToPred
        self._seqToPred = self._correctPredictions(
            self._seqToBp, self._seqToPred, self._seqToTrue, self._taxonomy, correctLabelThreshold)
Ejemplo n.º 3
0
    def __init__(self,
                 seqNameToBp,
                 seqNameToPred,
                 seqNameToRefPred,
                 taxonomy,
                 ranksList=None):
        """
            Initializes the main class that computes the confusion matrices.

            If any argument cannot be used, a message is printed, self._initFailed is set to True
            and the constructor returns early (the object is then only partially initialised).

            @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
                @type seqNameToBp: dict; or a fasta file
            @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
                @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param seqNameToRefPred: contains mapping, sequence name to taxon Id; or a tab separated reference file
                @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks);
                every entry must be one of taxonomy_ncbi.TAXONOMIC_RANKS[1:]
                @type ranksList: list of str
            @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
                (a _TaxonomyWrapCM instance)
        """
        # Check input options and read in the data (if appropriate)
        self._initFailed = False  # replace this with exceptions!
        if isinstance(seqNameToBp, dict):
            self._seqNameToBp = seqNameToBp
        elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
            self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
        else:
            print("Can't get sequence info from:", seqNameToBp)
            self._initFailed = True
            return
        if isinstance(seqNameToPred, dict):
            self._seqNameToPred = seqNameToPred
        elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
            self._seqNameToPred = cami.readAssignments(seqNameToPred)
        else:
            print("Can't get prediction info from:", seqNameToPred)
            self._initFailed = True
            return
        if isinstance(seqNameToRefPred, dict):
            self._seqNameToRefPred = seqNameToRefPred
        elif isinstance(seqNameToRefPred,
                        str) and os.path.isfile(seqNameToRefPred):
            self._seqNameToRefPred = cami.readAssignments(seqNameToRefPred)
        else:
            print("Can't get reference prediction info from:",
                  seqNameToRefPred)
            self._initFailed = True
            return
        if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapCM(taxonomy)
        elif isinstance(taxonomy, _TaxonomyWrapCM):
            self._taxonomy = taxonomy
        else:
            print("Can't use taxonomy: ", taxonomy)
            # The rank lookups below need self._taxonomy, so fail here in the same
            # way as for every other unusable input instead of falling through.
            self._initFailed = True
            return
        if ranksList is None:
            ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
        else:
            allowedRanksSet = set(
                taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
            for rank in ranksList:
                if rank not in allowedRanksSet:
                    print('Rank: "' + str(rank) + '" is not allowed!')
                    self._initFailed = True
                    return

        # rankIds that will be considered
        rankIdsList = [self._taxonomy.getRankId(rank) for rank in ranksList]
        self._allowedRankIdsSet = set(rankIdsList)

        # get predictions at different taxonomic ranks
        # rankId -> (seqId -> taxonIdAtRank)
        self._rankIdToPredMap = {}
        self._rankIdToRefMap = {}
        for rankId in rankIdsList:
            self._rankIdToPredMap[rankId] = {}
            self._rankIdToRefMap[rankId] = {}

        # Fill the maps for the predictions first and for the reference second: each
        # assigned taxon is followed up via getParent until None or taxon id 1 is
        # reached, and every taxon found at an allowed rank is recorded for the sequence.
        for seqToTaxonId, rankIdToMap in (
                (self._seqNameToPred, self._rankIdToPredMap),
                (self._seqNameToRefPred, self._rankIdToRefMap)):
            for seqId, taxonId in seqToTaxonId.iteritems():
                while (taxonId is not None) and (taxonId != 1):
                    rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                    if rankId in self._allowedRankIdsSet:
                        rankIdToMap[rankId][seqId] = taxonId
                    taxonId = self._taxonomy.getParent(taxonId)
Ejemplo n.º 4
0
    def __init__(self, contigNameToBp, contigNameToNcbid, scaffToContigList, taxonomy,
                 minScaffContigCount=None, minScaffBpLen=None, cladesSet=None, considerContigWithNoScaff=True,
                 ignoreScaffPredToRoot=True):
        """
            Build the Consistency object: load the inputs, clean up the scaffold to contig
            mapping and process every scaffold that passes the given filters.

            If one of the four data inputs cannot be used, a message is printed and the
            constructor returns early, leaving the object only partially initialised.

            @param contigNameToBp: dictionary that maps contig names to bp (int);
                or a fasta file that contain contigs
            @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
                or a prediction file - first column contig name, last column ncbid
            @param scaffToContigList: dictionary that maps scaffold names to list of contig names;
                or a file - first column scaffold name, second column contig name
            @param taxonomy: a _TaxonomyWrapper instance that is not closed; or a path (str) to an
                existing file from which a _TaxonomyWrapper is created
            @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
            @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
            @param cladesSet: consider only scaffolds that contain at least one contig from this set
            @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
                (as artificial scaffolds)
            @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
        """
        # check input options
        assert minScaffContigCount is None or isinstance(minScaffContigCount, int)
        assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
        assert cladesSet is None or isinstance(cladesSet, set)
        assert isinstance(considerContigWithNoScaff, bool)
        assert isinstance(ignoreScaffPredToRoot, bool)

        # contig lengths: a dict is taken as is, a path to an existing file is parsed
        if not isinstance(contigNameToBp, dict):
            if not (isinstance(contigNameToBp, str) and os.path.isfile(contigNameToBp)):
                print("Can't get contig info from: ", contigNameToBp)
                return
            contigNameToBp = getSequenceToBpDict(contigNameToBp)
        self._contigNameToBp = contigNameToBp

        # contig predictions
        if not isinstance(contigNameToNcbid, dict):
            if not (isinstance(contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid)):
                print("Can't get prediction info from: ", contigNameToNcbid)
                return
            contigNameToNcbid = cami.readAssignments(contigNameToNcbid)
        self._contigToPred = contigNameToNcbid

        # scaffold -> contigs mapping
        if not isinstance(scaffToContigList, dict):
            if not (isinstance(scaffToContigList, str) and os.path.isfile(scaffToContigList)):
                print("Can't get scaffold config mapping from: ", scaffToContigList)
                return
            scaffToContigList = getMapping(scaffToContigList, 0, 1, '\t')
        self._scaffToContigsList = scaffToContigList

        # taxonomy: reuse an open wrapper, otherwise create one from the file
        reusableTaxonomy = isinstance(taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed())
        if not reusableTaxonomy:
            if not (isinstance(taxonomy, str) and os.path.isfile(taxonomy)):
                print("Can't use taxonomy:", taxonomy)
                return
            taxonomy = _TaxonomyWrapper(taxonomy)
        self._taxonomy = taxonomy

        # check the consistency of the data!

        # drop (in place) every contig of the mapping that has no entry in self._contigNameToBp
        for _unusedName, members in self._scaffToContigsList.iteritems():
            unknownContigs = [contig for contig in members if contig not in self._contigNameToBp]
            for contig in unknownContigs:
                members.remove(contig)

        # a predicted contig that belongs to no scaffold gets its own "artificial scaffold"
        # NOTE(review): such a contig is not checked against self._contigNameToBp, so the bp
        # sum below raises KeyError for it when minScaffBpLen is set - confirm this is intended.
        if considerContigWithNoScaff:
            scaffContigSet = set()
            for _unusedName, members in self._scaffToContigsList.iteritems():
                scaffContigSet.update(members)
            aloneContigSet = set(contig for contig in self._contigToPred if contig not in scaffContigSet)

            for contig in aloneContigSet:
                scaffName = str('scaffold_' + contig)  # make up a scaffold name
                assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
                self._scaffToContigsList[scaffName] = [contig]

        # filter out scaffolds according to the input constrains
        self._scaffolds = dict()
        for scaffName, members in self._scaffToContigsList.iteritems():
            # too few contigs
            if minScaffContigCount is not None and len(members) < minScaffContigCount:
                continue

            # too short in total
            if minScaffBpLen is not None:
                if sum(self._contigNameToBp[contig] for contig in members) < minScaffBpLen:
                    continue

            # no contig predicted to one of the requested clades
            if cladesSet is not None:
                if not any((contig in self._contigToPred) and (self._contigToPred[contig] in cladesSet)
                           for contig in members):
                    continue

            # process the scaffold, but if everything in the scaffold was assigned to the root, then ignore it!
            scaffold = self._processScaffold(scaffName)
            assignedToRoot = (scaffold.getNcbid() == 1)
            if not (assignedToRoot and ignoreScaffPredToRoot):
                self._scaffolds[scaffName] = scaffold
Ejemplo n.º 5
0
    def __init__(self, seqNameToBp, seqNameToPred, seqNameToRefPred, taxonomy, ranksList=None):
        """
            Initializes the main class that computes the confusion matrices.

            If any argument cannot be used, a message is printed, self._initFailed is set to True
            and the constructor returns early (the object is then only partially initialised).

            @param seqNameToBp: contains mapping, sequence name to bp (as int); or a fasta file
                @type seqNameToBp: dict; or a fasta file
            @param seqNameToPred: contains mapping, sequence name to taxonId; or a tab separated prediction file
                @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param seqNameToRefPred: contains mapping, sequence name to taxon Id; or a tab separated reference file
                @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column taxonId
            @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks);
                every entry must be one of taxonomy_ncbi.TAXONOMIC_RANKS[1:]
                @type ranksList: list of str
            @param taxonomy: database file in the sqlite3 format; or taxonomy returned by function "getTaxonomy"
                (a _TaxonomyWrapCM instance)
        """
        # Check input options and read in the data (if appropriate)
        self._initFailed = False  # replace this with exceptions!
        if isinstance(seqNameToBp, dict):
            self._seqNameToBp = seqNameToBp
        elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
            self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
        else:
            print("Can't get sequence info from:", seqNameToBp)
            self._initFailed = True
            return
        if isinstance(seqNameToPred, dict):
            self._seqNameToPred = seqNameToPred
        elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
            self._seqNameToPred = cami.readAssignments(seqNameToPred)
        else:
            print("Can't get prediction info from:", seqNameToPred)
            self._initFailed = True
            return
        if isinstance(seqNameToRefPred, dict):
            self._seqNameToRefPred = seqNameToRefPred
        elif isinstance(seqNameToRefPred, str) and os.path.isfile(seqNameToRefPred):
            self._seqNameToRefPred = cami.readAssignments(seqNameToRefPred)
        else:
            print("Can't get reference prediction info from:", seqNameToRefPred)
            self._initFailed = True
            return
        if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapCM(taxonomy)
        elif isinstance(taxonomy, _TaxonomyWrapCM):
            self._taxonomy = taxonomy
        else:
            print("Can't use taxonomy: ", taxonomy)
            # The rank lookups below need self._taxonomy, so fail here in the same
            # way as for every other unusable input instead of falling through.
            self._initFailed = True
            return
        if ranksList is None:
            ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
        else:
            allowedRanksSet = set(taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
            for rank in ranksList:
                if rank not in allowedRanksSet:
                    print('Rank: "' + str(rank) + '" is not allowed!')
                    self._initFailed = True
                    return

        # rankIds that will be considered
        rankIdsList = [self._taxonomy.getRankId(rank) for rank in ranksList]
        self._allowedRankIdsSet = set(rankIdsList)

        # get predictions at different taxonomic ranks
        # rankId -> (seqId -> taxonIdAtRank)
        self._rankIdToPredMap = {}
        self._rankIdToRefMap = {}
        for rankId in rankIdsList:
            self._rankIdToPredMap[rankId] = {}
            self._rankIdToRefMap[rankId] = {}

        # Fill the maps for the predictions first and for the reference second: each
        # assigned taxon is followed up via getParent until None or taxon id 1 is
        # reached, and every taxon found at an allowed rank is recorded for the sequence.
        for seqToTaxonId, rankIdToMap in ((self._seqNameToPred, self._rankIdToPredMap),
                                          (self._seqNameToRefPred, self._rankIdToRefMap)):
            for seqId, taxonId in seqToTaxonId.iteritems():
                while (taxonId is not None) and (taxonId != 1):
                    rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
                    if rankId in self._allowedRankIdsSet:
                        rankIdToMap[rankId][seqId] = taxonId
                    taxonId = self._taxonomy.getParent(taxonId)
Ejemplo n.º 6
0
    def __init__(self,
                 contigNameToBp,
                 contigNameToNcbid,
                 scaffToContigList,
                 taxonomy,
                 minScaffContigCount=None,
                 minScaffBpLen=None,
                 cladesSet=None,
                 considerContigWithNoScaff=True,
                 ignoreScaffPredToRoot=True):
        """
            Build the Consistency object: load the inputs, clean up the scaffold to contig
            mapping and process every scaffold that passes the given filters.

            If one of the four data inputs cannot be used, a message is printed and the
            constructor returns early, leaving the object only partially initialised.

            @param contigNameToBp: dictionary that maps contig names to bp (int);
                or a fasta file that contain contigs
            @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
                or a prediction file - first column contig name, last column ncbid
            @param scaffToContigList: dictionary that maps scaffold names to list of contig names;
                or a file - first column scaffold name, second column contig name
            @param taxonomy: a _TaxonomyWrapper instance that is not closed; or a path (str) to an
                existing file from which a _TaxonomyWrapper is created
            @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
            @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
            @param cladesSet: consider only scaffolds that contain at least one contig from this set
            @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
                (as artificial scaffolds)
            @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
        """
        # check input options
        assert minScaffContigCount is None or isinstance(
            minScaffContigCount, int)
        assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
        assert cladesSet is None or isinstance(cladesSet, set)
        assert isinstance(considerContigWithNoScaff, bool)
        assert isinstance(ignoreScaffPredToRoot, bool)

        # contig lengths: a dict is taken as is, a path to an existing file is parsed
        if not isinstance(contigNameToBp, dict):
            bpFileGiven = isinstance(contigNameToBp,
                                     str) and os.path.isfile(contigNameToBp)
            if not bpFileGiven:
                print("Can't get contig info from: ", contigNameToBp)
                return
            contigNameToBp = getSequenceToBpDict(contigNameToBp)
        self._contigNameToBp = contigNameToBp

        # contig predictions
        if not isinstance(contigNameToNcbid, dict):
            predFileGiven = isinstance(
                contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid)
            if not predFileGiven:
                print("Can't get prediction info from: ", contigNameToNcbid)
                return
            contigNameToNcbid = cami.readAssignments(contigNameToNcbid)
        self._contigToPred = contigNameToNcbid

        # scaffold -> contigs mapping
        if not isinstance(scaffToContigList, dict):
            mappingFileGiven = isinstance(
                scaffToContigList, str) and os.path.isfile(scaffToContigList)
            if not mappingFileGiven:
                print("Can't get scaffold config mapping from: ",
                      scaffToContigList)
                return
            scaffToContigList = getMapping(scaffToContigList, 0, 1, '\t')
        self._scaffToContigsList = scaffToContigList

        # taxonomy: reuse an open wrapper, otherwise create one from the file
        reusableTaxonomy = isinstance(
            taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed())
        if not reusableTaxonomy:
            if not (isinstance(taxonomy, str) and os.path.isfile(taxonomy)):
                print("Can't use taxonomy:", taxonomy)
                return
            taxonomy = _TaxonomyWrapper(taxonomy)
        self._taxonomy = taxonomy

        # check the consistency of the data!

        # drop (in place) every contig of the mapping that has no entry in self._contigNameToBp
        for _unusedName, members in self._scaffToContigsList.iteritems():
            unknownContigs = [
                contig for contig in members
                if contig not in self._contigNameToBp
            ]
            for contig in unknownContigs:
                members.remove(contig)

        # a predicted contig that belongs to no scaffold gets its own "artificial scaffold"
        # NOTE(review): such a contig is not checked against self._contigNameToBp, so the bp
        # sum below raises KeyError for it when minScaffBpLen is set - confirm this is intended.
        if considerContigWithNoScaff:
            scaffContigSet = set()
            for _unusedName, members in self._scaffToContigsList.iteritems():
                scaffContigSet.update(members)
            aloneContigSet = set(contig for contig in self._contigToPred
                                 if contig not in scaffContigSet)

            for contig in aloneContigSet:
                scaffName = str('scaffold_' +
                                contig)  # make up a scaffold name
                assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
                self._scaffToContigsList[scaffName] = [contig]

        # filter out scaffolds according to the input constrains
        self._scaffolds = dict()
        for scaffName, members in self._scaffToContigsList.iteritems():
            # too few contigs
            if minScaffContigCount is not None and len(
                    members) < minScaffContigCount:
                continue

            # too short in total
            if minScaffBpLen is not None:
                totalBp = sum(self._contigNameToBp[contig]
                              for contig in members)
                if totalBp < minScaffBpLen:
                    continue

            # no contig predicted to one of the requested clades
            if cladesSet is not None:
                hasCladeContig = any(
                    (contig in self._contigToPred) and
                    (self._contigToPred[contig] in cladesSet)
                    for contig in members)
                if not hasCladeContig:
                    continue

            # process the scaffold, but if everything in the scaffold was assigned to the root, then ignore it!
            scaffold = self._processScaffold(scaffName)
            assignedToRoot = (scaffold.getNcbid() == 1)
            if not (assignedToRoot and ignoreScaffPredToRoot):
                self._scaffolds[scaffName] = scaffold
def _main():
    """
        Command line entry point of the PPS+ evaluation.

        Parses the command line, loads whatever inputs are available and runs the requested
        jobs (-j): p ~ precision/recall, c ~ confusion matrices, s ~ scaffold-contig consistency.
        If none of these letters is given, all jobs are run. A job is skipped without a message
        when one of the inputs it needs is missing or empty. The results are written to the
        output directory, which must already exist; otherwise the program exits with a usage error.
    """
    # define arguments
    # NOTE(review): type=file relies on the Python 2 builtin `file` (unless the module
    # defines its own); argparse opens the given files while parsing.
    parser = argparse.ArgumentParser(description='Default task: PPS+ evaluation', epilog='')

    parser.add_argument('-b', '--cont-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing labels assigned to contigs.', metavar='assignments.csv', dest='b')

    parser.add_argument('-t', '--cont-true-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing true labels for the contigs.', metavar='labels.csv', dest='t')

    parser.add_argument('-f', '--cont-contigs-file-listing', nargs=1, type=file, required=False,
                        help='A list of paths of FASTA contigs files.', metavar='fasta_listing.txt', dest='f')

    parser.add_argument('-m', '--cont-scaffold-contig-mapping', nargs=1, type=file, required=False,
                        help='Scaffold contig mapping, tab separated.', metavar='mapping.csv', dest='m')

    parser.add_argument('-n', '--cont-ncbi-taxonomy', nargs=1, required=False,
                        help='Directory containing the NCBI names.dmp and nodes.dmp files.', metavar='taxonomy_dir',
                        dest='n')

    parser.add_argument('-o', '--cont-output-dir', nargs=1, required=True,
                        help='Output directory.', metavar='output_dir', dest='o')

    parser.add_argument('-j', '--default-job', nargs='+',
                        help='What task/job should be performed (p~precision/recall, s~scaff-contig consistency, '
                             'c~confusion tables, default - if not spec compute all)', metavar='', dest='j')

    args = parser.parse_args()

    def _existingPath(values):
        """
            Return the path of a single file argument opened by argparse, or None if the
            argument is missing or its path is not a file; the open handle is closed.
        """
        if values and len(values) == 1:
            handle = values[0]
            # only the path is passed on to the readers below, so the handle is not needed
            handle.close()
            if os.path.isfile(handle.name):
                return handle.name
        return None

    # read and check the arguments
    seqIdToBp = None
    scaffToContig = None
    binning = None
    trueBinning = None
    job = None

    # Everything below writes into the output directory; without it the taxonomy path
    # cannot even be built, so stop with a clear message instead of failing in os.path.join.
    if args.o and len(args.o) == 1 and os.path.isdir(args.o[0]):
        outputDir = args.o[0]
    else:
        parser.error('The output directory does not exist: ' + str(args.o))

    binningFile = _existingPath(args.b)
    if binningFile is not None:
        binning = cami.readAssignments(binningFile)

    trueBinningFile = _existingPath(args.t)
    if trueBinningFile is not None:
        trueBinning = cami.readAssignments(trueBinningFile)

    # NOTE(review): the option name and help text describe a listing of FASTA files, but
    # the given file itself is read as FASTA here - confirm which one is intended.
    contigsFile = _existingPath(args.f)
    if contigsFile is not None:
        seqIdToBp = fasta.getSequenceToBpDict(contigsFile)

    scaffoldContigMapping = _existingPath(args.m)
    if scaffoldContigMapping is not None:
        scaffToContig = csv.getMapping(scaffoldContigMapping, 0, 1, '\t')

    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and len(args.n) == 1 and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in the case it doesn't exist
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None

    # job stays None (~ run everything) unless at least one known job letter was given
    if args.j and len(set(args.j).intersection(('p', 's', 'c'))) > 0:
        job = set(args.j)

    def _requested(letter):
        """Return True if the job with the given letter should be run."""
        return job is None or letter in job

    if _requested('p') and seqIdToBp and binning and trueBinning and taxonomyPath:
        print('Computing precision/recall')
        # precision/recall - no correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

        # precision/recall - with correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath, CORRECT_LABEL_THRESHOLD)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall_correction.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

    # compute confusion matrices
    if _requested('c') and seqIdToBp and binning and trueBinning and taxonomyPath:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        for rank in RANKS:
            confusionMatrix.generateConfusionMatrix(rank, os.path.join(outputDir, 'confusion_matrix'))
        confusionMatrix.close()

    # compute scaffold contig consistency
    if _requested('s') and seqIdToBp and binning and scaffToContig and taxonomyPath:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'consistency.txt'))
        out.writeText(cons.getGroupedScaffoldsPrint())
        cons.close()
        out.close()

    createEvalMetaFile(outputDir)
Ejemplo n.º 8
0
def _main():
    """
        Command line entry point of the default task: PPS+ evaluation.

        Parses the command line, loads the inputs and, depending on the selected jobs (option -j) and on which
        inputs are available, writes into the output directory:
         - precision/recall tables, without and with label correction (job 'p')
         - one confusion matrix per rank in RANKS (job 'c')
         - the scaffold-contig consistency report (job 's')
        If no recognised job letter is given, all jobs are attempted. A job whose inputs are incomplete
        (missing or empty) is skipped without a message. The evaluation meta file is created at the end.

        Exits via an argparse usage error (status 2) if a given input file or the output directory
        does not exist.
    """

    def existingFile(path):
        # argparse "type" callable: validates the path without opening (and leaking) a file handle,
        # and without relying on the Python 2-only builtin "file"
        if not os.path.isfile(path):
            raise argparse.ArgumentTypeError("file '%s' does not exist" % path)
        return path

    # define arguments
    parser = argparse.ArgumentParser(description='Default task: PPS+ evaluation', epilog='')

    parser.add_argument('-b', '--cont-binning-file', nargs=1, type=existingFile, required=True,
                        help='Binning file containing labels assigned to contigs.',
                        metavar='assignments.csv', dest='b')

    parser.add_argument('-t', '--cont-true-binning-file', nargs=1, type=existingFile, required=True,
                        help='Binning file containing true labels for the contigs.',
                        metavar='labels.csv', dest='t')

    # NOTE(review): the help text describes a listing of FASTA paths, but the file is read directly as a
    # FASTA file below - confirm which one is intended
    parser.add_argument('-f', '--cont-contigs-file-listing', nargs=1, type=existingFile, required=False,
                        help='A list of paths of FASTA contigs files.',
                        metavar='fasta_listing.txt', dest='f')

    parser.add_argument('-m', '--cont-scaffold-contig-mapping', nargs=1, type=existingFile, required=False,
                        help='Scaffold contig mapping, tab separated.',
                        metavar='mapping.csv', dest='m')

    parser.add_argument('-n', '--cont-ncbi-taxonomy', nargs=1, required=False,
                        help='Directory containing the NCBI names.dmp and nodes.dmp files.',
                        metavar='taxonomy_dir', dest='n')

    parser.add_argument('-o', '--cont-output-dir', nargs=1, required=True,
                        help='Output directory.',
                        metavar='output_dir', dest='o')

    parser.add_argument('-j', '--default-job', nargs='+',
                        help='What task/job should be performed (p~precision/recall, s~scaff-contig consistency, '
                             'c~confusion tables, default - if not spec compute all)',
                        metavar='', dest='j')

    args = parser.parse_args()

    # everything is written below the output directory, so fail early and cleanly if it is missing
    outputDir = args.o[0]
    if not os.path.isdir(outputDir):
        parser.error("output directory '%s' does not exist" % outputDir)

    # read the inputs (required options always hold exactly one validated path)
    binning = cami.readAssignments(args.b[0])
    trueBinning = cami.readAssignments(args.t[0])

    seqIdToBp = None
    if args.f:
        seqIdToBp = fasta.getSequenceToBpDict(args.f[0])

    scaffToContig = None
    if args.m:
        scaffToContig = csv.getMapping(args.m[0], 0, 1, '\t')

    # the taxonomy database is cached in the output directory
    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in the case it doesn't exist
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None

    # restrict the jobs only if at least one recognised job letter was given, otherwise run all of them
    job = None
    if args.j and set(args.j) & {'p', 's', 'c'}:
        job = set(args.j)

    def isSelected(letter):
        return job is None or letter in job

    def writeReport(fileName, text):
        # write the text to a file in the output directory, closing the buffer even if writing fails
        out = csv.OutFileBuffer(os.path.join(outputDir, fileName))
        try:
            out.writeText(text)
        finally:
            out.close()

    trueLabelInputs = seqIdToBp and binning and trueBinning and taxonomyPath

    # compute precision/recall
    if isSelected('p') and trueLabelInputs:
        print('Computing precision/recall')
        # first without the correction (threshold None), then with the correction
        for fileName, threshold in (('precision_recall.csv', None),
                                    ('precision_recall_correction.csv', CORRECT_LABEL_THRESHOLD)):
            acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath, threshold)
            try:
                text = acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE)
            finally:
                acc.close()
            writeReport(fileName, text)

    # compute confusion matrices
    if isSelected('c') and trueLabelInputs:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        try:
            for rank in RANKS:
                confusionMatrix.generateConfusionMatrix(rank, os.path.join(outputDir, 'confusion_matrix'))
        finally:
            confusionMatrix.close()

    # compute scaffold contig consistency
    if isSelected('s') and seqIdToBp and binning and scaffToContig and taxonomyPath:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig, taxonomyPath)
        try:
            text = cons.getGroupedScaffoldsPrint()
        finally:
            cons.close()
        writeReport('consistency.txt', text)

    createEvalMetaFile(outputDir)