Beispiel #1
0
 def test_extra_features_in_sequences(self):
     """Clustering must raise a ValueError whose message names feature4.

     The table built here only holds feature1-feature3; presumably
     ``self.input_sequences`` additionally contains feature4 (the fixture
     is defined outside this view).
     """
     counts = np.array([[0, 1, 3], [1, 1, 2], [4, 5, 6]])
     feature_ids = ['feature1', 'feature2', 'feature3']
     sample_ids = ['sample1', 'sample2', 'sample3']
     table_without_feature4 = biom.Table(counts, feature_ids, sample_ids)

     expect_failure = self.assertRaisesRegex(
         ValueError, expected_regex='Some feat.*feature4.*')
     with expect_failure:
         cluster_features_closed_reference(
             sequences=self.input_sequences,
             table=table_without_feature4,
             reference_sequences=self.ref_sequences_1,
             perc_identity=1.0)
Beispiel #2
0
 def test_no_matches(self):
     """Clustering against ref_sequences_2 without a strand option fails.

     A VSearchError matching 'No matches were iden' is expected.
     """
     # self.ref_sequences_2 are rev comps of self.ref_sequences_1, so if
     # strand='both' is not passed, there should be no matches
     expect_failure = self.assertRaisesRegex(
         VSearchError, expected_regex='No matches were iden')
     with expect_failure, redirected_stdio(stderr=os.devnull):
         cluster_features_closed_reference(
             sequences=self.input_sequences,
             table=self.input_table,
             reference_sequences=self.ref_sequences_2,
             perc_identity=1.0)
 def test_no_overlapping_feature_ids(self):
     """A table with ids f1-f5 must trigger a ValueError mentioning f1.

     Presumably none of these ids occur in ``self.input_sequences`` (the
     fixture is defined outside this view).
     """
     counts = np.array(
         [[0, 1, 3], [1, 1, 2], [4, 5, 6], [7, 8, 9], [1, 1, 1]])
     feature_ids = ['f1', 'f2', 'f3', 'f4', 'f5']
     sample_ids = ['sample1', 'sample2', 'sample3']
     disjoint_table = biom.Table(counts, feature_ids, sample_ids)

     expect_failure = self.assertRaisesRegex(
         ValueError, expected_regex='Some feat.*f1.*')
     with expect_failure:
         cluster_features_closed_reference(
             sequences=self.input_sequences, table=disjoint_table,
             reference_sequences=self.ref_sequences_1, perc_identity=1.0)
Beispiel #4
0
    def test_1_percent_clustering(self):
        """At perc_identity=0.01 every feature lands in cluster r1 or r2."""
        # Expected grouping: feature1 + feature3 -> r1 and
        # feature2 + feature4 -> r2.
        expected_counts = np.array([[104, 106, 109],
                                    [8, 9, 11]])
        expected_table = biom.Table(expected_counts,
                                    ['r1', 'r2'],
                                    ['sample1', 'sample2', 'sample3'])

        with redirected_stdio(stderr=os.devnull):
            observed_table, matched_fp, unmatched_fp = (
                cluster_features_closed_reference(
                    sequences=self.input_sequences,
                    table=self.input_table,
                    reference_sequences=self.ref_sequences_1,
                    perc_identity=0.01))

        # biom.Table equality is sensitive to identifier order, so align
        # the observed observations with the expected ones first.
        expected_order = expected_table.ids(axis='observation')
        observed_table = observed_table.sort_order(expected_order,
                                                   axis='observation')
        self.assertEqual(observed_table, expected_table)

        observed_matched = _read_seqs(matched_fp)
        # Representative sequences: feature1 (index 0) for r1 and feature4
        # (index 3) for r2. Each is picked over its cluster mate (feature3
        # and feature2 respectively) because it has the higher count.
        expected_matched = [self.input_sequences_list[idx] for idx in (0, 3)]
        _relabel_seqs(expected_matched, ['r1', 'r2'])
        self.assertEqual(observed_matched, expected_matched)

        # Everything matched, so the unmatched-sequences file is empty.
        unmatched_size = os.path.getsize(str(unmatched_fp))
        self.assertEqual(unmatched_size, 0)
Beispiel #5
0
    def test_97_percent_clustering(self):
        """At perc_identity=0.97 feature2 is left unmatched."""
        # Expected grouping: feature1 + feature3 -> r1, feature4 alone -> r2,
        # feature2 does not cluster at all.
        expected_counts = np.array([[104, 106, 109],
                                    [7, 8, 9]])
        expected_table = biom.Table(expected_counts,
                                    ['r1', 'r2'],
                                    ['sample1', 'sample2', 'sample3'])

        with redirected_stdio(stderr=os.devnull):
            observed_table, matched_fp, unmatched_fp = (
                cluster_features_closed_reference(
                    sequences=self.input_sequences,
                    table=self.input_table,
                    reference_sequences=self.ref_sequences_1,
                    perc_identity=0.97))

        # biom.Table equality is sensitive to identifier order, so align
        # the observed observations with the expected ones first.
        expected_order = expected_table.ids(axis='observation')
        observed_table = observed_table.sort_order(expected_order,
                                                   axis='observation')
        self.assertEqual(observed_table, expected_table)

        observed_matched = _read_seqs(matched_fp)
        # Representative sequences: feature1 (index 0) for r1 and feature4
        # (index 3) for r2. feature1 shares its cluster with feature3 but
        # is chosen because it has the higher count.
        expected_matched = [self.input_sequences_list[idx] for idx in (0, 3)]
        _relabel_seqs(expected_matched, ['r1', 'r2'])
        self.assertEqual(observed_matched, expected_matched)

        # Only feature2 (index 1) should be reported as unmatched.
        observed_unmatched = _read_seqs(unmatched_fp)
        expected_unmatched = [self.input_sequences_list[1]]
        self.assertEqual(observed_unmatched, expected_unmatched)
Beispiel #6
0
    def test_100_percent_clustering_strand(self):
        """With strand='both', exact matching against ref_sequences_2 works.

        feature2 and feature3 are expected to stay unclustered.
        """
        expected_counts = np.array([[100, 101, 103],
                                    [7, 8, 9]])
        expected_table = biom.Table(expected_counts,
                                    ['r1', 'r2'],
                                    ['sample1', 'sample2', 'sample3'])

        with redirected_stdio(stderr=os.devnull):
            observed_table, matched_fp, unmatched_fp = (
                cluster_features_closed_reference(
                    sequences=self.input_sequences,
                    table=self.input_table,
                    reference_sequences=self.ref_sequences_2,
                    perc_identity=1.0,
                    strand='both'))

        # biom.Table equality is sensitive to identifier order, so align
        # the observed observations with the expected ones first.
        expected_order = expected_table.ids(axis='observation')
        observed_table = observed_table.sort_order(expected_order,
                                                   axis='observation')
        self.assertEqual(observed_table, expected_table)

        observed_matched = _read_seqs(matched_fp)
        # Representative sequences: feature1 (index 0) for r1 and feature4
        # (index 3) for r2. Each cluster holds a single feature, so no
        # count-based selection takes place.
        expected_matched = [self.input_sequences_list[idx] for idx in (0, 3)]
        _relabel_seqs(expected_matched, ['r1', 'r2'])
        self.assertEqual(observed_matched, expected_matched)

        # Unmatched sequences are expected in the order feature3, feature2.
        observed_unmatched = _read_seqs(unmatched_fp)
        expected_unmatched = [self.input_sequences_list[idx] for idx in (2, 1)]
        self.assertEqual(observed_unmatched, expected_unmatched)
Beispiel #7
0
    def test_features_with_same_counts(self):
        """Tied counts within a cluster must resolve to a stable rep seq.

        feature1 and feature3 cluster into r1 (15 total counts each);
        feature2 and feature4 cluster into r2 (6 total counts each). The
        query in _fasta_from_sqlite should break such ties by taking the
        first feature when the tied features are sorted alphabetically by
        id.
        """
        sample_ids = ['sample1', 'sample2', 'sample3']
        tied_counts = np.array([[4, 5, 6],
                                [1, 2, 3],
                                [4, 6, 5],
                                [2, 1, 3]])
        tied_table = biom.Table(
            tied_counts,
            ['feature1', 'feature2', 'feature3', 'feature4'],
            sample_ids)
        expected_counts = np.array([[8, 11, 11],
                                    [3, 3, 6]])
        expected_table = biom.Table(expected_counts, ['r1', 'r2'], sample_ids)

        with redirected_stdio(stderr=os.devnull):
            observed_table, matched_fp, unmatched_fp = (
                cluster_features_closed_reference(
                    sequences=self.input_sequences,
                    table=tied_table,
                    reference_sequences=self.ref_sequences_1,
                    perc_identity=0.01))

        # biom.Table equality is sensitive to identifier order, so align
        # the observed observations with the expected ones first.
        expected_order = expected_table.ids(axis='observation')
        observed_table = observed_table.sort_order(expected_order,
                                                   axis='observation')
        self.assertEqual(observed_table, expected_table)

        observed_matched = _read_seqs(matched_fp)
        # Representative sequences: feature1 (index 0) for r1 and feature2
        # (index 1) for r2. In both clusters the counts are tied, so the
        # feature that comes first alphabetically is expected to win.
        expected_matched = [self.input_sequences_list[idx] for idx in (0, 1)]
        _relabel_seqs(expected_matched, ['r1', 'r2'])
        self.assertEqual(observed_matched, expected_matched)

        # Everything matched, so the unmatched-sequences file is empty.
        unmatched_size = os.path.getsize(str(unmatched_fp))
        self.assertEqual(unmatched_size, 0)
def cluster_features(query_table: biom.Table, closed_reference_table: biom.Table,
                                         query_sequences: DNAFASTAFormat,
                                         reference_sequences: pd.Series, thr: float = 0.97,
                                         threads: int = 1, output_log_file: str = None) -> (
        biom.Table, DNAFASTAFormat, DNAFASTAFormat):
    """Cluster query features against reference features and merge the tables.

    The query features are clustered with
    ``cluster_features_closed_reference`` against the reference sequences
    selected by ``get_reference_seqs_from_ids``. Summary statistics are
    logged, and the clustered table is merged into
    ``closed_reference_table``, keeping only the samples of the clustered
    query table.

    Parameters
    ----------
    query_table
        Feature table of the query sequences.
    closed_reference_table
        Feature table whose observations are the reference features.
    query_sequences
        Sequences of the query features.
    reference_sequences
        Reference sequences handed to ``get_reference_seqs_from_ids``
        together with ``closed_reference_table``.
    thr
        Forwarded as ``perc_identity`` to the clustering call.
    threads
        Forwarded as ``threads`` to the clustering call.
    output_log_file
        When truthy, the log written during this call is copied to this path.

    Returns
    -------
    tuple
        The merged table restricted to the query samples, followed by the
        second and third results of the clustering call (matched and
        unmatched sequences).

    Raises
    ------
    ValueError
        If any observation of ``closed_reference_table`` is missing from the
        merged table.
    """
    # Function-scope import so the block does not rely on a module-level
    # ``import os`` that is not visible from here.
    import os

    reference_sequences_fasta = get_reference_seqs_from_ids(closed_reference_table, reference_sequences)
    results = cluster_features_closed_reference(sequences=query_sequences, table=query_table,
                                                reference_sequences=reference_sequences_fasta,
                                                perc_identity=thr, threads=threads)

    clustered_table_biom = results[0]

    # NOTE(review): results[1] / results[2] are returned under a
    # DNAFASTAFormat annotation; confirm that Artifact.load accepts these
    # paths (kept exactly as in the original).
    clustered_sequences_pd = Artifact.load(str(results[1])).view(pd.Series)
    unmatched_sequences_pd = Artifact.load(str(results[2])).view(pd.Series)

    # Compute the totals once, before the clustered table is merged below.
    total_before = np.sum(query_table.sum())
    total_after = np.sum(clustered_table_biom.sum())
    # Retained share is "after / before"; guard against an empty query table.
    percent_retained = total_after / total_before * 100 if total_before else 0.0

    # tempfile.mktemp() returns a plain string (not a context manager) and is
    # race-prone, so the log lives inside a private temporary directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_fp = os.path.join(tmp_dir, 'clustering_features.log')
        logger_ins = LOG(tmp_fp).get_logger('clustering_features')
        # Each message is passed as one pre-formatted string: extra positional
        # arguments without a matching %-placeholder break log formatting.
        # presumably _15 converts the FASTA format to a pd.Series — verify.
        logger_ins.info(f"The number of OTUs in the reference database is "
                        f"{_15(reference_sequences_fasta).size}")
        logger_ins.info(f"The number of unmatched sequence to the reference alignment is "
                        f"{unmatched_sequences_pd.size}")
        logger_ins.info(f"The number of matched sequences to the reference alignment is "
                        f"{clustered_sequences_pd.size}")
        logger_ins.info(f"Before applying clustering, the total number of counts "
                        f"in the original feature table was {total_before}")
        logger_ins.info(f"Before applying clustering, the number of non-zero elements"
                        f" of the underlying feature table is {query_table.nnz}")
        logger_ins.info(f"After applying clustering, the total number of counts "
                        f"in the clustered feature table was {total_after}")
        logger_ins.info(f"After applying clustering, the number of non-zero elements"
                        f" of the underlying feature table is {clustered_table_biom.nnz}")
        logger_ins.info(f"The percent of total counts retained is {percent_retained:.2f}%")

        query_samples = clustered_table_biom.ids('sample')
        closed_reference_features = closed_reference_table.ids('observation')
        clustered_table_biom = closed_reference_table.merge(clustered_table_biom)
        clustered_table_biom.filter(ids_to_keep=query_samples, axis='sample', inplace=True)
        # Feature ids must be compared with the observation axis of the merged
        # table; comparing them with sample ids would flag every feature.
        missing_features = set(closed_reference_features) - set(clustered_table_biom.ids('observation'))
        if missing_features:
            raise ValueError(
                "Merging two tables failed! There are less features in the final table than expected!"
            )
        if output_log_file:
            shutil.copy(tmp_fp, output_log_file)
    return clustered_table_biom, results[1], results[2]
Beispiel #9
0
    def test_short_sequences(self):
        input_sequences_fp = self.get_data_path('dna-sequences-short.fasta')
        input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')

        input_table = biom.Table(np.array([[0, 1, 3], [1, 1, 2]]),
                                 ['feature1', 'feature2'],
                                 ['sample1', 'sample2', 'sample3'])

        exp_table = biom.Table(np.array([[1, 2, 5]]), ['r2'],
                               ['sample1', 'sample2', 'sample3'])

        with redirected_stdio(stderr=os.devnull):
            obs_table, matched_seqs, unmatched_seqs = \
                cluster_features_closed_reference(
                    sequences=input_sequences, table=input_table,
                    reference_sequences=self.ref_sequences_1,
                    perc_identity=0.01)

        # order of identifiers is important for biom.Table equality
        obs_table = \
            obs_table.sort_order(exp_table.ids(axis='observation'),
                                 axis='observation')