Example #1
0
    def test_consistency_unrooted(self):
        """Consistency of a taxon that is monophyletic only in the unrooted tree.

        Tips a, b and c are labelled s__Bacteroides pectinophilus and tips
        d and e s__Bacteroides acidifaciens on the tree ((a,b),(c,(d,e))).
        The test expects the pectinophilus index to be ~0.667 when the tree
        is evaluated as rooted and 1.0 when it is evaluated as unrooted;
        every other taxon is expected to score 1.0 in both modes.
        """
        nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

        pectinophilus = 's__Bacteroides pectinophilus'
        acidifaciens = 's__Bacteroides acidifaciens'
        species_of_tip = [('a', pectinophilus), ('b', pectinophilus),
                          ('c', pectinophilus), ('d', acidifaciens),
                          ('e', acidifaciens)]

        # Build a separate lineage list per tip so no list object is shared
        # between tips.
        tipname_map = {}
        for tip, species in species_of_tip:
            tipname_map[tip] = ['f__Lachnospiraceae', 'g__Bacteroides',
                                species]

        tree = nl.load_tree(StringIO(u'((a,b),(c,(d,e)));'), tipname_map)

        counts = nl.collect_names_at_ranks_counts(tree)
        nl.decorate_ntips_rank(tree)
        nl.decorate_name_counts(tree)

        consistency = Consistency(counts, len(nl.RANK_ORDER))

        # (rooted?, expected index for s__Bacteroides pectinophilus);
        # the rooted evaluation runs first, then the unrooted one.
        for rooted, pectinophilus_index in ((True, 0.66666666), (False, 1.0)):
            observed = consistency.calculate(tree, rooted=rooted)

            self.assertAlmostEqual(observed[0]['f__Lachnospiraceae'], 1.0)
            self.assertAlmostEqual(observed[1]['g__Bacteroides'], 1.0)
            self.assertAlmostEqual(observed[2][pectinophilus],
                                   pectinophilus_index)
            self.assertAlmostEqual(observed[2][acidifaciens], 1.0)
Example #2
0
    def test_consistency_missing(self):
        """Consistency of taxa in a tree where some tips lack taxonomy.

        Tips e and g carry no name at any rank, and tips a, b and f carry
        no species name. Every named taxon is expected to have a
        consistency index of 1.0 whether the tree is evaluated as rooted
        or as unrooted.
        """
        nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

        family = 'f__Lachnospiraceae'
        bacteroides = 'g__Bacteroides'
        lachnospira = 'g__Lachnospira'
        pectinophilus = 's__Bacteroides pectinophilus'

        tipname_map = {
            'a': [family, bacteroides, None],
            'c': [family, bacteroides, pectinophilus],
            'b': [family, bacteroides, None],
            'e': [None, None, None],
            'd': [family, bacteroides, pectinophilus],
            'g': [None, None, None],
            'f': [family, lachnospira, None],
            'h': [family, lachnospira, pectinophilus],
        }
        newick = StringIO(u'(((a,b),(c,d)),((e,f),(g,h)));')
        tree = nl.load_tree(newick, tipname_map)

        counts = nl.collect_names_at_ranks_counts(tree)
        nl.decorate_ntips_rank(tree)
        nl.decorate_name_counts(tree)

        consistency = Consistency(counts, len(nl.RANK_ORDER))

        # (rank position, taxon name) pairs, each expected to score 1.0
        fully_consistent = [(0, family), (1, bacteroides), (1, lachnospira),
                            (2, pectinophilus)]

        # rooted evaluation first, then unrooted
        for rooted in (True, False):
            observed = consistency.calculate(tree, rooted=rooted)
            for rank, taxon in fully_consistent:
                self.assertAlmostEqual(observed[rank][taxon], 1.0)
Example #3
0
    def test_consistency_unrooted(self):
        """Check a taxon that is only monophyletic when the tree is unrooted.

        On the tree ((a,b),(c,(d,e))) the species s__Bacteroides
        pectinophilus (tips a, b, c) is expected to score ~0.667 in rooted
        mode and 1.0 in unrooted mode. The family, the genus and
        s__Bacteroides acidifaciens (tips d, e) are expected to score 1.0
        in both modes.
        """
        nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

        def lineage(species):
            # a new list on every call, so tips never share a list object
            return ['f__Lachnospiraceae', 'g__Bacteroides', species]

        tipname_map = {
            'a': lineage('s__Bacteroides pectinophilus'),
            'b': lineage('s__Bacteroides pectinophilus'),
            'c': lineage('s__Bacteroides pectinophilus'),
            'd': lineage('s__Bacteroides acidifaciens'),
            'e': lineage('s__Bacteroides acidifaciens'),
        }

        tree = nl.load_tree('((a,b),(c,(d,e)));', tipname_map)

        counts = nl.collect_names_at_ranks_counts(tree)
        nl.decorate_ntips_rank(tree)
        nl.decorate_name_counts(tree)

        scorer = Consistency(counts, len(nl.RANK_ORDER))

        def check(index, expected):
            # expected: ordered (rank position, taxon name, value) triples
            for rank, taxon, value in expected:
                self.assertAlmostEqual(index[rank][taxon], value)

        # taxonomic consistency with the tree treated as rooted
        check(scorer.calculate(tree, rooted=True), [
            (0, 'f__Lachnospiraceae', 1.0),
            (1, 'g__Bacteroides', 1.0),
            (2, 's__Bacteroides pectinophilus', 0.66666666),
            (2, 's__Bacteroides acidifaciens', 1.0),
        ])

        # taxonomic consistency with the tree treated as unrooted
        check(scorer.calculate(tree, rooted=False), [
            (0, 'f__Lachnospiraceae', 1.0),
            (1, 'g__Bacteroides', 1.0),
            (2, 's__Bacteroides pectinophilus', 1.0),
            (2, 's__Bacteroides acidifaciens', 1.0),
        ])
Example #4
0
    def test_generate_constrings_valid_input(self):
        """Run generate_constrings on the standard valid fixtures.

        The rank order is seeded from the second tab-separated field of the
        first consensus line; the generated constrings must then equal
        test_results (i.e. mirror nlevel, tax2tree's interface).
        """
        seed_consensus = test_cons[0].split('\t')[1]
        determine_rank_order(seed_consensus)

        consensus_map = load_consensus_map(test_cons, False)
        decorated_tree = load_tree(test_tree, consensus_map)
        observed = generate_constrings(decorated_tree, consensus_map)

        self.assertEqual(observed, test_results)
Example #5
0
    def test_generate_constrings_valid_input(self):
        """generate_constrings on valid input reproduces the expected output.

        Checks that our output mirrors nlevel (tax2tree's interface): the
        result must equal the test_results fixture.
        """
        # the first consensus line's lineage column seeds the rank order
        _, seed_lineage = test_cons[0].split('\t')[:2]
        determine_rank_order(seed_lineage)

        cons_map = load_consensus_map(test_cons, False)
        obs = generate_constrings(load_tree(test_tree, cons_map), cons_map)

        self.assertEqual(obs, test_results)
Example #6
0
    def test_consistency_missing(self):
        """Consistency indices when part of the taxonomy is missing.

        Tips e and g are unnamed at every rank; tips a, b and f have no
        species. All named taxa are expected to score 1.0 in both the
        rooted and the unrooted evaluation.
        """
        nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

        def bacteroides(species):
            # fresh Bacteroides lineage list; species may be None
            return ['f__Lachnospiraceae', 'g__Bacteroides', species]

        def lachnospira(species):
            # fresh Lachnospira lineage list; species may be None
            return ['f__Lachnospiraceae', 'g__Lachnospira', species]

        tipname_map = {
            'a': bacteroides(None),
            'c': bacteroides('s__Bacteroides pectinophilus'),
            'b': bacteroides(None),
            'e': [None] * 3,
            'd': bacteroides('s__Bacteroides pectinophilus'),
            'g': [None] * 3,
            'f': lachnospira(None),
            'h': lachnospira('s__Bacteroides pectinophilus'),
        }
        tree = nl.load_tree('(((a,b),(c,d)),((e,f),(g,h)));', tipname_map)

        counts = nl.collect_names_at_ranks_counts(tree)
        nl.decorate_ntips_rank(tree)
        nl.decorate_name_counts(tree)

        scorer = Consistency(counts, len(nl.RANK_ORDER))

        def assert_all_consistent(index):
            # every named taxon should have a consistency index of 1.0
            self.assertAlmostEqual(index[0]['f__Lachnospiraceae'], 1.0)
            self.assertAlmostEqual(index[1]['g__Bacteroides'], 1.0)
            self.assertAlmostEqual(index[1]['g__Lachnospira'], 1.0)
            self.assertAlmostEqual(
                index[2]['s__Bacteroides pectinophilus'], 1.0)

        # rooted tree
        assert_all_consistent(scorer.calculate(tree, rooted=True))

        # unrooted tree
        assert_all_consistent(scorer.calculate(tree, rooted=False))
Example #7
0
    def __call__(self, seq_path=None, result_path=None, log_path=None):
        """Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq

        Keep in mind, "confidence" is only done for consistency and in fact
        all assignments will have a score of 0 because a method for determining
        confidence is not currently implemented.

        Parameters:
        seq_path: path to file of sequences. The sequences themselves are
            never actually used, but they are needed for their ids.
        result_path: path to file of results. If specified, the results
            are additionally written there as tab-separated
            ``seq_id<TAB>lineage<TAB>confidence`` lines; they are still
            returned to the caller.
        log_path: path to log, which should include dump of params.

        The taxonomy mapping is read from self.Params['id_to_taxonomy_fp']
        and the tree from self.Params['tree_fp'].
        """

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        with open(seq_path, 'U') as f:
            seqs = dict(MinimalFastaParser(f))

        # Context managers close the taxonomy and tree files as soon as they
        # have been consumed, rather than leaving that to the garbage
        # collector.
        with open(self.Params['id_to_taxonomy_fp']) as taxonomy_f:
            consensus_map = tax2tree.prep_consensus(taxonomy_f, seqs.keys())

        # the lineage column of the first consensus line fixes the rank order
        seed_con = consensus_map[0].strip().split('\t')[1]
        determine_rank_order(seed_con)

        tipnames_map = load_consensus_map(consensus_map, False)

        with open(self.Params['tree_fp']) as tree_f:
            tree = load_tree(tree_f, tipnames_map)

        results = tax2tree.generate_constrings(tree, tipnames_map)
        results = tax2tree.clean_output(results, seqs.keys())

        if result_path:
            # if the user provided a result_path, write the
            # results to file
            with open(result_path, 'w') as f:
                for seq_id, (lineage, confidence) in results.iteritems():
                    f.write('%s\t%s\t%s\n' % (seq_id, lineage, confidence))
            logger.info('Result path: %s' % result_path)

        return results
Example #8
0
    def __call__(self, seq_path=None, result_path=None, log_path=None):
        """Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq

        Keep in mind, "confidence" is only done for consistency and in fact
        all assignments will have a score of 0 because a method for determining
        confidence is not currently implemented.

        Parameters:
        seq_path: path to file of sequences. The sequences themselves are
            never actually used, but they are needed for their ids.
        result_path: path to file of results. If specified, the results
            are additionally written there as tab-separated
            ``seq_id<TAB>lineage<TAB>confidence`` lines; they are still
            returned to the caller.
        log_path: path to log, which should include dump of params.

        The taxonomy mapping is read from self.Params['id_to_taxonomy_fp']
        and the tree from self.Params['tree_fp'].
        """

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        with open(seq_path, 'U') as f:
            seqs = dict(parse_fasta(f))

        # Context managers close the taxonomy and tree files as soon as they
        # have been consumed, rather than leaving that to the garbage
        # collector.
        with open(self.Params['id_to_taxonomy_fp']) as taxonomy_f:
            consensus_map = tax2tree.prep_consensus(taxonomy_f, seqs.keys())

        # the lineage column of the first consensus line fixes the rank order
        seed_con = consensus_map[0].strip().split('\t')[1]
        determine_rank_order(seed_con)

        tipnames_map = load_consensus_map(consensus_map, False)

        with open(self.Params['tree_fp']) as tree_f:
            tree = load_tree(tree_f, tipnames_map)

        results = tax2tree.generate_constrings(tree, tipnames_map)
        results = tax2tree.clean_output(results, seqs.keys())

        if result_path:
            # if the user provided a result_path, write the
            # results to file
            with open(result_path, 'w') as f:
                for seq_id, (lineage, confidence) in results.iteritems():
                    f.write('%s\t%s\t%s\n' % (seq_id, lineage, confidence))
            logger.info('Result path: %s' % result_path)

        return results
Example #9
0
def flat_errors(tax_lines):
    """Collect formatting errors found in a flat taxonomy file.

    The rank order is derived from the second tab-separated field of the
    first line. Each line is then checked for incorrect prefixes, an
    incorrect number of levels and gaps in the taxonomy.

    Returns a defaultdict(list) keyed by error description. For each
    distinct offending lineage only the id of the first line showing it is
    recorded. Gap errors are de-duplicated on the lineage prefix that ends
    with the taxon following the gap.
    """
    rank_order = determine_rank_order(tax_lines[0].strip().split('\t')[1])
    n_levels = len(rank_order)

    errors = defaultdict(list)
    already_reported = defaultdict(set)

    def report_once(category, signature, seq_id):
        # record seq_id only the first time this signature shows the error
        seen = already_reported[category]
        if signature in seen:
            return
        seen.add(signature)
        errors[category].append(seq_id)

    for line in tax_lines:
        seq_id, lineage = check_parse(line)

        if not check_prefixes(lineage, rank_order):
            report_once('Incorrect prefixes', lineage, seq_id)

        if not check_n_levels(lineage, n_levels):
            report_once('Incorrect number of levels', lineage, seq_id)

        if not check_gap(lineage):
            # find_gap is taken to give the gap's position; +1 reaches the
            # taxon after it and another +1 makes the slice include it
            through_gap = lineage[:find_gap(lineage) + 2]
            report_once('Gaps in taxonomy', through_gap, seq_id)

    return errors
Example #10
0
def flat_errors(tax_lines):
    """Report flat-file taxonomy errors, grouped by kind.

    The first line's lineage column (second tab-separated field) seeds the
    rank order used to validate every line. The result is a
    defaultdict(list) mapping an error description ('Incorrect prefixes',
    'Incorrect number of levels', 'Gaps in taxonomy') to the ids of the
    lines that first exhibited each distinct problem.
    """
    first_lineage = tax_lines[0].strip().split('\t')[1]
    rank_order = determine_rank_order(first_lineage)
    expected_depth = len(rank_order)

    errors = defaultdict(list)

    # one set of already-reported lineages per error category
    seen_prefix = set()
    seen_nlevel = set()
    seen_gap = set()

    for line in tax_lines:
        id_, parsed = check_parse(line)

        if not check_prefixes(parsed, rank_order) and \
                parsed not in seen_prefix:
            seen_prefix.add(parsed)
            errors['Incorrect prefixes'].append(id_)

        if not check_n_levels(parsed, expected_depth) and \
                parsed not in seen_nlevel:
            seen_nlevel.add(parsed)
            errors['Incorrect number of levels'].append(id_)

        if check_gap(parsed):
            continue

        # Keep the lineage up to and including the taxon after the gap:
        # gap position + 1 is that taxon, and the slice end is exclusive.
        taxon_after_gap = find_gap(parsed) + 1
        gap_prefix = parsed[:taxon_after_gap + 1]
        if gap_prefix not in seen_gap:
            seen_gap.add(gap_prefix)
            errors['Gaps in taxonomy'].append(id_)

    return errors