def test_extra_features_in_sequences(self):
    # The table only carries feature1-feature3; the ValueError raised by
    # the clustering call is expected to name feature4.
    counts = np.array([[0, 1, 3], [1, 1, 2], [4, 5, 6]])
    feature_ids = ['feature1', 'feature2', 'feature3']
    sample_ids = ['sample1', 'sample2', 'sample3']
    input_table = biom.Table(counts, feature_ids, sample_ids)

    error_pattern = 'Some feat.*feature4.*'
    with self.assertRaisesRegex(ValueError, expected_regex=error_pattern):
        cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=input_table,
            reference_sequences=self.ref_sequences_1,
            perc_identity=1.0)
def test_no_matches(self):
    # self.ref_sequences_2 are rev comps of self.ref_sequences_1, so
    # unless strand='both' is passed there should be no matches and a
    # VSearchError is expected.
    expect_error = self.assertRaisesRegex(
        VSearchError, expected_regex='No matches were iden')
    # Same nesting order as before: the assertion context is entered
    # first, then stderr is redirected to os.devnull.
    with expect_error, redirected_stdio(stderr=os.devnull):
        cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences_2,
            perc_identity=1.0)
def test_no_overlapping_feature_ids(self):
    # The table is keyed by f1-f5; the ValueError raised by the
    # clustering call is expected to mention f1.
    counts = np.array([[0, 1, 3],
                       [1, 1, 2],
                       [4, 5, 6],
                       [7, 8, 9],
                       [1, 1, 1]])
    feature_ids = ['f1', 'f2', 'f3', 'f4', 'f5']
    sample_ids = ['sample1', 'sample2', 'sample3']
    input_table = biom.Table(counts, feature_ids, sample_ids)

    error_pattern = 'Some feat.*f1.*'
    with self.assertRaisesRegex(ValueError, expected_regex=error_pattern):
        cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=input_table,
            reference_sequences=self.ref_sequences_1,
            perc_identity=1.0)
def test_1_percent_clustering(self):
    # At 1% identity feature1 and feature3 are expected to collapse into
    # r1, and feature2 and feature4 into r2.
    sample_ids = ['sample1', 'sample2', 'sample3']
    reference_ids = ['r1', 'r2']
    exp_table = biom.Table(np.array([[104, 106, 109],
                                     [8, 9, 11]]),
                           reference_ids, sample_ids)

    with redirected_stdio(stderr=os.devnull):
        results = cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences_1,
            perc_identity=0.01)
    obs_table, matched_seqs, unmatched_seqs = results

    # biom.Table equality is sensitive to identifier order, so align the
    # observed table with the expected observation order first.
    expected_order = exp_table.ids(axis='observation')
    obs_table = obs_table.sort_order(expected_order, axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_matched_seqs = _read_seqs(matched_seqs)
    # Expected rep seqs: feature1 for r1 and feature4 for r2. Within each
    # cluster the member with the higher count is chosen as the rep seq
    # (feature1 over feature3, feature4 over feature2).
    feature1 = self.input_sequences_list[0]
    feature4 = self.input_sequences_list[3]
    exp_matched_seqs = [feature1, feature4]
    _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
    self.assertEqual(obs_matched_seqs, exp_matched_seqs)

    # Every sequence matched, so the unmatched-sequences file is empty.
    unmatched_size = os.path.getsize(str(unmatched_seqs))
    self.assertEqual(unmatched_size, 0)
def test_97_percent_clustering(self):
    # At 97% identity feature1 and feature3 are expected to share r1,
    # feature4 to sit alone in r2, and feature2 to stay unmatched.
    sample_ids = ['sample1', 'sample2', 'sample3']
    reference_ids = ['r1', 'r2']
    exp_table = biom.Table(np.array([[104, 106, 109],
                                     [7, 8, 9]]),
                           reference_ids, sample_ids)

    with redirected_stdio(stderr=os.devnull):
        results = cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences_1,
            perc_identity=0.97)
    obs_table, matched_seqs, unmatched_seqs = results

    # biom.Table equality is sensitive to identifier order, so align the
    # observed table with the expected observation order first.
    expected_order = exp_table.ids(axis='observation')
    obs_table = obs_table.sort_order(expected_order, axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_matched_seqs = _read_seqs(matched_seqs)
    # Expected rep seqs: feature1 for r1 (it has a higher count than
    # feature3, its cluster mate) and feature4 for r2.
    feature1 = self.input_sequences_list[0]
    feature4 = self.input_sequences_list[3]
    exp_matched_seqs = [feature1, feature4]
    _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
    self.assertEqual(obs_matched_seqs, exp_matched_seqs)

    # feature2 is the only sequence expected in the unmatched output.
    obs_unmatched_seqs = _read_seqs(unmatched_seqs)
    feature2 = self.input_sequences_list[1]
    exp_unmatched_seqs = [feature2]
    self.assertEqual(obs_unmatched_seqs, exp_unmatched_seqs)
def test_100_percent_clustering_strand(self):
    # With strand='both' against self.ref_sequences_2 at 100% identity,
    # feature2 and feature3 are expected not to cluster.
    sample_ids = ['sample1', 'sample2', 'sample3']
    reference_ids = ['r1', 'r2']
    exp_table = biom.Table(np.array([[100, 101, 103],
                                     [7, 8, 9]]),
                           reference_ids, sample_ids)

    with redirected_stdio(stderr=os.devnull):
        results = cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=self.input_table,
            reference_sequences=self.ref_sequences_2,
            perc_identity=1.0,
            strand='both')
    obs_table, matched_seqs, unmatched_seqs = results

    # biom.Table equality is sensitive to identifier order, so align the
    # observed table with the expected observation order first.
    expected_order = exp_table.ids(axis='observation')
    obs_table = obs_table.sort_order(expected_order, axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_matched_seqs = _read_seqs(matched_seqs)
    # Expected rep seqs: feature1 for r1 and feature4 for r2. Each
    # cluster has a single member, so no count-based selection happens.
    feature1 = self.input_sequences_list[0]
    feature4 = self.input_sequences_list[3]
    exp_matched_seqs = [feature1, feature4]
    _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
    self.assertEqual(obs_matched_seqs, exp_matched_seqs)

    # The unmatched output is expected to list feature3, then feature2.
    obs_unmatched_seqs = _read_seqs(unmatched_seqs)
    feature3 = self.input_sequences_list[2]
    feature2 = self.input_sequences_list[1]
    exp_unmatched_seqs = [feature3, feature2]
    self.assertEqual(obs_unmatched_seqs, exp_unmatched_seqs)
def test_features_with_same_counts(self):
    # feature1 and feature3 cluster into r1; feature2 and feature4
    # cluster into r2. Members of each cluster have equal total counts
    # (15 and 15, 6 and 6), so this checks the rep-seq tie-break: the
    # query in _fasta_from_sqlite should pick the first feature when the
    # tied features are sorted alphabetically by id.
    sample_ids = ['sample1', 'sample2', 'sample3']
    input_counts = np.array([[4, 5, 6],
                             [1, 2, 3],
                             [4, 6, 5],
                             [2, 1, 3]])
    feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']
    input_table = biom.Table(input_counts, feature_ids, sample_ids)

    exp_table = biom.Table(np.array([[8, 11, 11],
                                     [3, 3, 6]]),
                           ['r1', 'r2'], sample_ids)

    with redirected_stdio(stderr=os.devnull):
        results = cluster_features_closed_reference(
            sequences=self.input_sequences,
            table=input_table,
            reference_sequences=self.ref_sequences_1,
            perc_identity=0.01)
    obs_table, matched_seqs, unmatched_seqs = results

    # biom.Table equality is sensitive to identifier order, so align the
    # observed table with the expected observation order first.
    expected_order = exp_table.ids(axis='observation')
    obs_table = obs_table.sort_order(expected_order, axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_matched_seqs = _read_seqs(matched_seqs)
    # Expected rep seqs: feature1 for r1 and feature2 for r2. In both
    # clusters the counts are tied, so the id that sorts first
    # alphabetically wins (feature1 before feature3, feature2 before
    # feature4).
    feature1 = self.input_sequences_list[0]
    feature2 = self.input_sequences_list[1]
    exp_matched_seqs = [feature1, feature2]
    _relabel_seqs(exp_matched_seqs, ['r1', 'r2'])
    self.assertEqual(obs_matched_seqs, exp_matched_seqs)

    # Every sequence matched, so the unmatched-sequences file is empty.
    unmatched_size = os.path.getsize(str(unmatched_seqs))
    self.assertEqual(unmatched_size, 0)
def cluster_features(query_table: biom.Table,
                     closed_reference_table: biom.Table,
                     query_sequences: DNAFASTAFormat,
                     reference_sequences: pd.Series,
                     thr: float = 0.97,
                     threads: int = 1,
                     output_log_file: str = None) -> (
        biom.Table, DNAFASTAFormat, DNAFASTAFormat):
    """Cluster query features against reference OTUs and merge the tables.

    The reference sequences are obtained with
    ``get_reference_seqs_from_ids`` from ``closed_reference_table`` and
    ``reference_sequences``; the query table and sequences are clustered
    against them with ``cluster_features_closed_reference``. Summary
    statistics are logged, the clustered table is merged into
    ``closed_reference_table``, and the merged table is restricted to the
    samples of the clustered query table.

    Parameters
    ----------
    query_table : biom.Table
        Feature table passed to the clustering step as ``table``.
    closed_reference_table : biom.Table
        Table whose observations select the reference sequences and into
        which the clustered table is merged.
    query_sequences : DNAFASTAFormat
        Sequences passed to the clustering step as ``sequences``.
    reference_sequences : pd.Series
        Reference sequences handed to ``get_reference_seqs_from_ids``.
    thr : float, optional
        Forwarded to the clustering step as ``perc_identity``.
    threads : int, optional
        Forwarded to the clustering step as ``threads``.
    output_log_file : str, optional
        When truthy, the log file is copied to this path.

    Returns
    -------
    tuple
        ``(merged_table, results[1], results[2])`` where ``results`` is
        the return value of ``cluster_features_closed_reference``.

    Raises
    ------
    ValueError
        If any observation id of ``closed_reference_table`` is missing
        from the merged table.
    """
    reference_sequences_fasta = get_reference_seqs_from_ids(
        closed_reference_table, reference_sequences)
    results = cluster_features_closed_reference(
        sequences=query_sequences,
        table=query_table,
        reference_sequences=reference_sequences_fasta,
        perc_identity=thr,
        threads=threads)
    clustered_table_biom = results[0]
    # NOTE(review): Artifact.load is applied to the paths of results[1] and
    # results[2]; confirm these are loadable artifacts rather than bare
    # FASTA files.
    clustered_sequences_pd = Artifact.load(str(results[1])).view(pd.Series)
    unmatched_sequences_pd = Artifact.load(str(results[2])).view(pd.Series)

    counts_before = np.sum(query_table.sum())
    counts_after = np.sum(clustered_table_biom.sum())
    # Retained share is clustered / original; guard against an empty
    # original table to avoid dividing by zero.
    if counts_before:
        percent_retained = counts_after / counts_before * 100
    else:
        percent_retained = 0.0

    statistics = [
        ("The number of OTUs in the reference database is",
         _15(reference_sequences_fasta).size),
        ("The number of unmatched sequence to the reference alignment is",
         unmatched_sequences_pd.size),
        ("The number of matched sequences to the reference alignment is",
         clustered_sequences_pd.size),
        ("Before applying clustering, the total number of counts "
         "in the original feature table was", counts_before),
        ("Before applying clustering, the number of non-zero elements"
         " of the underlying feature table is", query_table.nnz),
        ("After applying clustering, the total number of counts "
         "in the clustered feature table was", counts_after),
        ("After applying clustering, the number of non-zero elements"
         " of the underlying feature table is", clustered_table_biom.nnz),
    ]

    # tempfile.mktemp() returns a plain path string (not a context manager)
    # and is race-prone; NamedTemporaryFile creates the file and removes it
    # when the block exits.
    with tempfile.NamedTemporaryFile(suffix='.log') as tmp_file:
        tmp_fp = tmp_file.name
        logger_ins = LOG(tmp_fp).get_logger('clustering_features')
        # Messages are formatted up front and passed as one string, so the
        # values appear in the log regardless of how the logger treats
        # extra positional arguments.
        for label, value in statistics:
            logger_ins.info("%s %s" % (label, value))
        logger_ins.info("The percent of total counts retained is %s%%"
                        % percent_retained)

        query_samples = clustered_table_biom.ids('sample')
        closed_reference_features = closed_reference_table.ids('observation')
        clustered_table_biom = closed_reference_table.merge(
            clustered_table_biom)
        clustered_table_biom.filter(ids_to_keep=query_samples,
                                    axis='sample', inplace=True)

        # Reference features must be looked up on the observation axis
        # (the original compared them against sample ids).
        missing_features = (set(closed_reference_features)
                            - set(clustered_table_biom.ids('observation')))
        if missing_features:
            raise ValueError(
                "Merging two tables failed! There are less features in the final table than expected!"
            )

        # Copy while the temporary log still exists.
        if output_log_file:
            shutil.copy(tmp_fp, output_log_file)

    return clustered_table_biom, results[1], results[2]
def test_short_sequences(self): input_sequences_fp = self.get_data_path('dna-sequences-short.fasta') input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r') input_table = biom.Table(np.array([[0, 1, 3], [1, 1, 2]]), ['feature1', 'feature2'], ['sample1', 'sample2', 'sample3']) exp_table = biom.Table(np.array([[1, 2, 5]]), ['r2'], ['sample1', 'sample2', 'sample3']) with redirected_stdio(stderr=os.devnull): obs_table, matched_seqs, unmatched_seqs = \ cluster_features_closed_reference( sequences=input_sequences, table=input_table, reference_sequences=self.ref_sequences_1, perc_identity=0.01) # order of identifiers is important for biom.Table equality obs_table = \ obs_table.sort_order(exp_table.ids(axis='observation'), axis='observation')