def test_typical(self):
    """Demultiplex paired-end reads using a single barcode column.

    Checks the per-sample forward/reverse output and that the read matching
    neither barcode (``@id6``) shows up in both untrimmed files.
    """
    sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='Barcode', index=sample_ids))

    sample_a_fwd = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                    '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_a_rev = ('@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
                    '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n')
    sample_b_fwd = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                    '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                    '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b_rev = ('@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
                    '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
                    '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n')
    expected = [sample_a_fwd, sample_a_rev, sample_b_fwd, sample_b_rev]

    # Forward file first, then reverse file.
    expected_untrimmed = [
        '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
        '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n',
    ]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_paired_fn(
            self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), expected, demuxed)
    self.assert_untrimmed_results(expected_untrimmed, untrimmed)
def test_multiple_orientations_dual_indices(self):
    """Dual-indexed barcodes together with ``mixed_orientation`` must fail.

    The call is expected to raise ``ValueError``, so its return value is
    deliberately not bound to any name.
    """
    forward_barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))
    reverse_barcodes = CategoricalMetadataColumn(
        pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    mixed_orientation_sequences_f_fp = self.get_data_path(
        'mixed-orientation/forward.fastq.gz')
    mixed_orientation_sequences_r_fp = self.get_data_path(
        'mixed-orientation/reverse.fastq.gz')

    # These files have forward and reverse reads mixed together in the same
    # file. Both are staged into one directory so they can be imported as a
    # single paired-end artifact.
    with tempfile.TemporaryDirectory() as temp:
        shutil.copy(mixed_orientation_sequences_f_fp, temp)
        shutil.copy(mixed_orientation_sequences_r_fp, temp)
        mixed_orientation_sequences = Artifact.import_data(
            'MultiplexedPairedEndBarcodeInSequence', temp)

    with self.assertRaisesRegex(
            ValueError, 'Dual-indexed barcodes for mixed '
                        'orientation reads are not supported.'):
        self.demux_paired_fn(mixed_orientation_sequences,
                             forward_barcodes=forward_barcodes,
                             reverse_barcodes=reverse_barcodes,
                             mixed_orientation=True)
def test_batch_size(self):
    """Demultiplexing with ``batch_size=1`` gives the unbatched results."""
    sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='Barcode', index=sample_ids))

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    expected = [sample_a, sample_b]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata, batch_size=1)

    # Batching is an implementation detail: the output must match what
    # test_typical expects for the same inputs.
    self.assert_demux_results(metadata.to_series(), expected, demuxed)
    self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                  untrimmed)
def test_batch_size_odd_number_of_samples(self):
    """Three samples with ``batch_size=2`` (a short final batch) all demux."""
    sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
                  index=sample_ids))

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_c = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
    expected = [sample_a, sample_b, sample_c]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata, batch_size=2)

    self.assert_demux_results(metadata.to_series(), expected, demuxed)
    # Every read matched a barcode, so nothing is left untrimmed.
    self.assert_untrimmed_results('', untrimmed)
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Collapse ``table`` along ``axis`` by the group values in ``metadata``.

    Parameters
    ----------
    table : biom.Table
        Table to group; must not be empty.
    axis : str
        Axis to group on. ``'feature'`` is translated to biom's
        ``'observation'``; any other value is passed to biom unchanged.
    metadata : qiime2.CategoricalMetadataColumn
        Maps each ID on the chosen axis to its group value.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        The collapsed table, with the grouped axis ordered by first
        appearance of each group value in the metadata.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    def _group_of(axis_id, _):
        # biom passes (id, metadata); only the id is needed for the lookup.
        return metadata.get_value(axis_id)

    collapsed = table.collapse(_group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = metadata.to_series().unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
def test_variable_length_barcodes(self):
    """Barcodes of lengths 5, 6 and 4 are each matched and trimmed."""
    sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAAA', 'CCCCCC', 'GGGG'], name='Barcode',
                  index=sample_ids))

    muxed_sequences = Artifact.import_data(
        'MultiplexedSingleEndBarcodeInSequence',
        self.get_data_path('variable_length.fastq.gz'))

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_c = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
    expected = [sample_a, sample_b, sample_c]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), expected, demuxed)
    self.assert_untrimmed_results('', untrimmed)
def heatmap(output_dir: str, ranks: pd.DataFrame,
            microbe_metadata: qiime2.CategoricalMetadataColumn = None,
            metabolite_metadata: qiime2.CategoricalMetadataColumn = None,
            method: str = 'average', metric: str = 'euclidean',
            color_palette: str = 'seismic',
            margin_palette: str = 'cubehelix',
            x_labels: bool = False, y_labels: bool = False,
            level: int = -1) -> None:
    """Render a rank heatmap visualization into ``output_dir``.

    The optional metadata columns are converted to pandas Series, the figure
    is produced by ``ranks_heatmap`` and saved as ``heatmap.pdf`` and
    ``heatmap.png``, and the index template is rendered alongside them.
    All plotting options are forwarded to ``ranks_heatmap`` unchanged.
    """
    microbe_series = (
        None if microbe_metadata is None else microbe_metadata.to_series())
    metabolite_series = (
        None if metabolite_metadata is None
        else metabolite_metadata.to_series())

    figure = ranks_heatmap(ranks, microbe_series, metabolite_series,
                           method, metric, color_palette, margin_palette,
                           x_labels, y_labels, level)

    for filename in ('heatmap.pdf', 'heatmap.png'):
        figure.savefig(join(output_dir, filename), bbox_inches='tight')

    template = join(TEMPLATES, 'index.html')
    context = {
        'title': 'Rank Heatmap',
        'pdf_fp': 'heatmap.pdf',
        'png_fp': 'heatmap.png'
    }
    q2templates.render(template, output_dir, context=context)
def test_mixed_orientation_success(self):
    """Mixed-orientation reads demultiplex with forward barcodes only."""
    sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
    forward_barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
                  index=sample_ids))

    fastq_paths = (
        self.get_data_path('mixed-orientation/forward.fastq.gz'),
        self.get_data_path('mixed-orientation/reverse.fastq.gz'),
    )

    # Stage both files in one directory so they import as a single artifact.
    with tempfile.TemporaryDirectory() as temp:
        for fastq_path in fastq_paths:
            shutil.copy(fastq_path, temp)
        mixed_orientation_sequences = Artifact.import_data(
            'MultiplexedPairedEndBarcodeInSequence', temp)

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_paired_fn(
            mixed_orientation_sequences,
            forward_barcodes=forward_barcodes,
            mixed_orientation=True)

    self.assert_demux_results(forward_barcodes.to_series(), demuxed)
    # Everything should match
    self.assert_untrimmed_results([b'', b''], untrimmed)
def test_min_length(self):
    """A barcode covering a whole read leaves that sample's output empty."""
    sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id')
    # The third barcode is meant to completely remove the only GGGG
    # coded sequence
    barcodes = pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'], name='Barcode',
                         index=sample_ids)
    metadata = CategoricalMetadataColumn(barcodes)

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    # sample c is empty because the barcode matched the entire read, which
    # removed everything.
    sample_c = ''
    expected = [sample_a, sample_b, sample_c]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), expected, demuxed)
    self.assert_untrimmed_results('', untrimmed)
def subsample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                           start_date: str = None,
                           samples_per_interval: int = 7,
                           days_per_interval: int = 7,
                           seed: int = None) -> IDSelection:
    """Select IDs by sampling within consecutive fixed-length date windows.

    Parameters
    ----------
    dates : qiime2.CategoricalMetadataColumn
        Per-ID date strings. Values that cannot be parsed become ``NaT``
        (``errors='coerce'``).
    start_date : str, optional
        When given, only IDs dated on or after it are considered and the
        first window is forced to begin at this date.
    samples_per_interval : int
        Forwarded to ``_sample_group``.
    days_per_interval : int
        Window length in days.
    seed : int, optional
        Forwarded to ``_sample_group``.

    Returns
    -------
    IDSelection
        Boolean selection over every ID in ``dates`` (``True`` for the IDs
        returned by the per-window sampler), together with the metadata.
    """
    window_size = '%dD' % days_per_interval

    # Convert once; the same series supplies the ids and the output index.
    date_strings = dates.to_series()
    dt_series = pd.to_datetime(date_strings, errors='coerce')
    df = pd.DataFrame({'ids': date_strings.index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size, convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))

    selection = pd.Series(False, index=date_strings.index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'subsample_longitudinal')
def test_mixed_orientation_success(self):
    """Mixed-orientation demux yields valid, fully matched per-sample output.

    sample_a and sample_b have reads in both fwd and rev directions,
    sample_c only has reads in the fwd direction, and sample_d only has
    reads in the rev direction.
    """
    sample_ids = pd.Index(
        ['sample_a', 'sample_b', 'sample_c', 'sample_d'], name='id')
    forward_barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='ForwardBarcode',
                  index=sample_ids))

    fastq_paths = (
        self.get_data_path('mixed-orientation/forward.fastq.gz'),
        self.get_data_path('mixed-orientation/reverse.fastq.gz'),
    )

    # Stage both files in one directory so they import as a single artifact.
    with tempfile.TemporaryDirectory() as temp:
        for fastq_path in fastq_paths:
            shutil.copy(fastq_path, temp)
        mixed_orientation_sequences = Artifact.import_data(
            'MultiplexedPairedEndBarcodeInSequence', temp)

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_paired_fn(
            mixed_orientation_sequences,
            forward_barcodes=forward_barcodes,
            mixed_orientation=True)

    expected = [
        # sample_a fwd
        '@id1\nACGTACGT\n+\nyyyyyyyy\n'
        '@id3\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_a rev
        '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
        '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
        # sample_b fwd
        '@id4\nACGTACGT\n+\nyyyyyyyy\n'
        '@id2\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_b rev
        '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
        '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
        # sample_c fwd
        '@id5\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_c rev
        '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
        # sample_d fwd
        '@id6\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_d rev
        '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
    ]

    # We want to be sure that the validation is 100%, not just `min`.
    demuxed.validate(level='max')
    # Checkpoint for the `validate` call above: reaching this line means it
    # did not raise.
    self.assertTrue(True)

    self.assert_demux_results(forward_barcodes.to_series(), expected,
                              demuxed)
    # Everything should match, so untrimmed should be empty
    self.assert_untrimmed_results(['', ''], untrimmed)
def test_typical(self):
    """Single-end demux: two barcodes match, the ``@id6`` read is left over."""
    sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='Barcode', index=sample_ids))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), demuxed)
    expected_untrimmed = b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n'
    self.assert_untrimmed_results(expected_untrimmed, untrimmed)
def test_all_matched(self):
    """With a barcode for every read, the untrimmed output is empty."""
    sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
                  index=sample_ids))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), demuxed)
    # obs_untrimmed should be empty, since everything matched
    self.assert_untrimmed_results(b'', untrimmed)
def setUp(self):
    """Build the shared fixtures: output dir, tree, taxonomy, table, metadata.

    ``self.table`` is derived from ``self.balances`` through ``ilr_inv`` using
    the basis computed from ``self.tree``. All metadata columns share one
    seven-sample index (``s1``..``s7``).
    """
    self.results = "results"
    if not os.path.exists(self.results):
        os.mkdir(self.results)

    self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])
    self.taxonomy = pd.DataFrame(
        [['foo;barf;a;b;c;d;e', 1],
         ['foo;bark;f;g;h;i;j', 1],
         ['foo;bark;f;g;h;w;j', 1],
         ['nom;tu;k;l;m;n;o', 0.9],
         ['nom;tu;k;l;m;t;o', 0.9]],
        columns=['Taxon', 'Confidence'],
        index=['x', 'y', 'z', 'k', 'q'])

    # Rows are the internal nodes (balances); transposed so that samples
    # become the rows.
    self.balances = pd.DataFrame(
        [[1, 2, 3, 4, 5, 6, 7],
         [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
         [1, 1, 1, 1, 1, 1, 1],
         [3, 2, 1, 0, -1, -2, -3]],
        index=['d', 'a', 'b', 'c'],
        columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7']).T

    basis, _ = balance_basis(self.tree)
    self.table = pd.DataFrame(
        ilr_inv(self.balances, basis),
        columns=['x', 'y', 'z', 'k', 'q'],
        index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'])

    index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'], name='id')
    self.categorical = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                  index=index, name='categorical'))
    self.multi_categorical = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'],
                  index=index, name='multi_categorical'))
    # NOTE(review): this column reuses the name 'multi_categorical'; it may
    # be a copy-paste leftover -- confirm before renaming, since tests could
    # depend on the name.
    self.partial_numerical_categorical = CategoricalMetadataColumn(
        pd.Series(['1', '1', '1', '2', '2', '2', 'a'],
                  index=index, name='multi_categorical'))
    self.full_numerical_categorical = CategoricalMetadataColumn(
        pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'],
                  index=index, name='numerical_categorical'))
    self.continuous = NumericMetadataColumn(
        pd.Series(np.arange(7), index=index, name='continuous'))
def test_error_tolerance_high_enough_to_prevent_filtering(self):
    """A one-base barcode mismatch is tolerated at ``error_rate=0.25``."""
    sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAG', 'CCCC'], name='Barcode', index=sample_ids))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata, error_rate=0.25)

    # This test should yield the same results as test_typical, above
    self.assert_demux_results(metadata.to_series(), demuxed)
    expected_untrimmed = b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n'
    self.assert_untrimmed_results(expected_untrimmed, untrimmed)
def test_di_mismatched_barcodes(self):
    """A sample present only in the forward barcodes raises ``ValueError``."""
    forward_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id')
    forward_barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'ACGT'], name='ForwardBarcode',
                  index=forward_ids))

    # sample_c is intentionally absent from the reverse barcodes.
    reverse_ids = pd.Index(['sample_a', 'sample_b'], name='id')
    reverse_barcodes = CategoricalMetadataColumn(
        pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode',
                  index=reverse_ids))

    with self.assertRaisesRegex(ValueError, 'do not have.*sample_c'):
        self.demux_paired_fn(self.muxed_sequences,
                             forward_barcodes=forward_barcodes,
                             reverse_barcodes=reverse_barcodes)
def test_heatmap_extra_tips(self):
    """``dendrogram_heatmap`` renders when the tree has more tips than
    the table has features (22 tips versus 11 columns).

    Uses the builtin ``str`` as the dtype: the ``np.str`` alias was
    deprecated in NumPy 1.20 and later removed.
    """
    np.random.seed(0)
    num_otus = 11  # otus
    index = pd.Index(np.arange(5).astype(str), name='id')
    table = pd.DataFrame(np.random.random((len(index), num_otus)),
                         index=index,
                         columns=np.arange(num_otus).astype(str))

    # Twice as many points as features, so the tree gets the extra tips.
    x = np.random.rand(num_otus * 2)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))

    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand() * 3

    md = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'a', 'b', 'b'], index=index,
                  name='column-name'))

    dendrogram_heatmap(self.results, table, t, md)

    index_fp = os.path.join(self.results, 'index.html')
    self.assertTrue(os.path.exists(index_fp))

    with open(index_fp, 'r') as fh:
        html = fh.read()
    self.assertIn('<h1>Dendrogram heatmap</h1>', html)
def test_visualization_garbage_metadata(self):
    """``dendrogram_heatmap`` renders when the metadata lists more IDs
    (seven) than the table has samples (five).

    Uses the builtin ``str`` as the dtype: the ``np.str`` alias was
    deprecated in NumPy 1.20 and later removed.
    """
    # tests the scenario where ndim > number of tips
    np.random.seed(0)
    num_otus = 10  # otus
    num_samples = 5
    table = pd.DataFrame(np.random.random((num_samples, num_otus)),
                         index=np.arange(num_samples).astype(str),
                         columns=np.arange(num_otus).astype(str))

    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))

    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand() * 3

    md = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'a', 'b', 'b', 'foo', 'foo'],
                  index=pd.Index(np.arange(7).astype(str), name='id'),
                  name='column-name'))

    dendrogram_heatmap(self.results, table, t, md)

    index_fp = os.path.join(self.results, 'index.html')
    self.assertTrue(os.path.exists(index_fp))

    with open(index_fp, 'r') as fh:
        html = fh.read()
    self.assertIn('<h1>Dendrogram heatmap</h1>', html)
def test_extra_barcode_in_metadata(self):
    """A barcode that no read carries produces an empty sample output."""
    sample_ids = pd.Index(
        ['sample_a', 'sample_b', 'sample_c', 'sample_d'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='Barcode',
                  index=sample_ids))

    sample_a = ('@id1\nACGTACGT\n+\nzzzzzzzz\n'
                '@id3\nACGTACGT\n+\nzzzzzzzz\n')
    sample_b = ('@id2\nACGTACGT\n+\nzzzzzzzz\n'
                '@id4\nACGTACGT\n+\nzzzzzzzz\n'
                '@id5\nACGTACGT\n+\nzzzzzzzz\n')
    sample_c = '@id6\nACGTACGT\n+\nzzzzzzzz\n'
    # sample d is empty bc no reads matched the barcode TTTT
    sample_d = ''
    expected = [sample_a, sample_b, sample_c, sample_d]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata)

    expected_samples_and_barcodes = pd.Series(
        ['AAAA', 'CCCC', 'GGGG', 'TTTT'],
        index=['sample_a', 'sample_b', 'sample_c', 'sample_d'])
    self.assert_demux_results(expected_samples_and_barcodes, expected,
                              demuxed)
    self.assert_untrimmed_results('', untrimmed)
def setUp(self):
    """Create the rank artifact, taxonomy column and two feature tables."""
    rank_values = [[4.1, 1.3, 2.1],
                   [0.1, 0.3, 0.2],
                   [2.2, 4.3, 3.2],
                   [-6.3, -4.4, 2.1]]
    _ranks = pd.DataFrame(rank_values,
                          index=pd.Index(list('ABCD'), name='id'),
                          columns=['m1', 'm2', 'm3'])
    self.ranks = Artifact.import_data('FeatureData[Conditional]', _ranks)

    taxon_strings = [
        'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
        'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
        'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
        'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
        'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
        'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
        'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina'
    ]
    self.taxa = CategoricalMetadataColumn(
        pd.Series(taxon_strings,
                  index=pd.Index(list('ABCD'), name='feature-id'),
                  name='Taxon'))

    # Both tables cover the same three samples; rows are features.
    metabolite_counts = np.array([[9, 8, 2],
                                  [2, 1, 2],
                                  [9, 4, 5],
                                  [8, 8, 7]])
    metabolites = biom.Table(metabolite_counts,
                             sample_ids=['s1', 's2', 's3'],
                             observation_ids=['m1', 'm2', 'm3', 'm4'])
    self.metabolites = Artifact.import_data('FeatureTable[Frequency]',
                                            metabolites)

    microbe_counts = np.array([[1, 2, 3],
                               [3, 6, 3],
                               [1, 9, 9],
                               [8, 8, 7]])
    microbes = biom.Table(microbe_counts,
                          sample_ids=['s1', 's2', 's3'],
                          observation_ids=list('ABCD'))
    self.microbes = Artifact.import_data('FeatureTable[Frequency]',
                                         microbes)
def test_invalid_batch_size(self):
    """A ``batch_size`` of 5 with only two samples raises ``ValueError``."""
    sample_ids = pd.Index(['sample_a', 'sample_b'], name='id')
    barcodes = pd.Series(['AAAA', 'CCCC'], name='Barcode', index=sample_ids)
    metadata = CategoricalMetadataColumn(barcodes)

    with self.assertRaisesRegex(ValueError, '5.*cannot be greater.*2'):
        self.demux_single_fn(self.muxed_sequences, metadata, batch_size=5)
def classify_samples(output_dir: str, table: pd.DataFrame,
                     metadata: qiime2.CategoricalMetadataColumn,
                     test_size: float = defaults['test_size'],
                     step: float = defaults['step'],
                     cv: int = defaults['cv'], random_state: int = None,
                     n_jobs: int = defaults['n_jobs'],
                     n_estimators: int = defaults['n_estimators'],
                     estimator: str = defaults['estimator_r'],
                     optimize_feature_selection: bool = False,
                     parameter_tuning: bool = False,
                     palette: str = defaults['palette']) -> None:
    """Train and evaluate a sample classifier, writing a visualization.

    The target column name is taken from ``metadata``. The work itself is
    delegated, in order, to ``_disable_feature_selection``,
    ``_set_parameters_and_estimator`` (with ``classification=True``),
    ``split_optimize_classify`` and ``_visualize``, which writes into
    ``output_dir``.
    """
    # extract column name from CategoricalMetadataColumn
    column = metadata.to_series().name

    # disable feature selection for unsupported estimators
    (optimize_feature_selection,
     calc_feature_importance) = _disable_feature_selection(
        estimator, optimize_feature_selection)

    # specify parameters and distributions to sample from for parameter tuning
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator, table, metadata, column, n_estimators, n_jobs, cv,
        random_state, parameter_tuning, classification=True)

    fit_options = dict(
        test_size=test_size, step=step, cv=cv, random_state=random_state,
        n_jobs=n_jobs,
        optimize_feature_selection=optimize_feature_selection,
        parameter_tuning=parameter_tuning, param_dist=param_dist,
        calc_feature_importance=calc_feature_importance, palette=palette)
    estimator, confusion, accuracy, importances = split_optimize_classify(
        table, metadata, column, estimator, output_dir, **fit_options)

    _visualize(output_dir, estimator, confusion, accuracy, importances,
               optimize_feature_selection,
               title='classification predictions')
def test_none_matched(self):
    """With no matching barcode, every read lands in the untrimmed output."""
    barcodes = pd.Series(['TTTT'], name='Barcode',
                         index=pd.Index(['sample_d'], name='id'))
    metadata = CategoricalMetadataColumn(barcodes)

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(
            self.muxed_sequences, metadata)

    # The only sample gets an empty file ...
    self.assert_demux_results(metadata.to_series(), [''], demuxed)
    # ... and all six input reads are expected back, barcodes intact.
    expected_untrimmed = (
        '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n')
    self.assert_untrimmed_results(expected_untrimmed, untrimmed)
def test_none_matched(self):
    """Demultiplexing with a barcode no read carries raises ``ValueError``."""
    barcodes = pd.Series(['TTTT'], name='Barcode',
                         index=pd.Index(['sample_d'], name='id'))
    metadata = CategoricalMetadataColumn(barcodes)

    # stderr is silenced first, then the failure is asserted inside it.
    with redirected_stdio(stderr=os.devnull), \
            self.assertRaisesRegex(ValueError, 'demultiplexed'):
        self.demux_single_fn(self.muxed_sequences, metadata)
def test_variable_length_barcodes(self):
    """Barcodes of differing lengths all match; nothing is left untrimmed."""
    sample_ids = pd.Index(['sample_a', 'sample_b', 'sample_c'], name='id')
    metadata = CategoricalMetadataColumn(
        pd.Series(['AAAAA', 'CCCCCC', 'GGGG'], name='Barcode',
                  index=sample_ids))

    muxed_sequences = Artifact.import_data(
        'MultiplexedSingleEndBarcodeInSequence',
        self.get_data_path('variable_length.fastq.gz'))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(muxed_sequences, metadata)

    # This test should yield the same results as test_typical, above, just
    # with variable length barcodes
    self.assert_demux_results(metadata.to_series(), demuxed)
    self.assert_untrimmed_results(b'', untrimmed)
def setUp(self):
    """Demultiplex the small EMP single-end dataset once for the tests."""
    sample_index = pd.Index(['sample1', 'sample2', 'sample3'],
                            name="sample_name")
    barcode_series = pd.Series(['GTCA', 'TCAG', 'GGGG'],
                               index=sample_index, name="aname")
    barcode_map = CategoricalMetadataColumn(barcode_series)

    seqs = Artifact.import_data("EMPSingleEndSequences",
                                dir_path + "/data/small/")

    # emp_single returns a one-element result; unpack it directly.
    (self.demuxed,) = emp_single(seqs, barcode_map)
    self.exp = 1
def setUp(self):
    """Create the rank artifact plus microbe and metabolite metadata."""
    rank_values = [[4.1, 1.3, 2.1],
                   [0.1, 0.3, 0.2],
                   [2.2, 4.3, 3.2],
                   [-6.3, -4.4, 2.1]]
    _ranks = pd.DataFrame(rank_values,
                          index=pd.Index(list('ABCD'), name='id'),
                          columns=['m1', 'm2', 'm3'])
    self.ranks = Artifact.import_data('FeatureData[Conditional]', _ranks)

    taxon_strings = [
        'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
        'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
        'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
        'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
        'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
        'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
        'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina'
    ]
    self.taxa = CategoricalMetadataColumn(
        pd.Series(taxon_strings,
                  index=pd.Index(list('ABCD'), name='feature-id'),
                  name='Taxon'))

    # One pathway label per rank column (m1..m3).
    pathway_index = pd.Index(['m1', 'm2', 'm3'], name='feature-id')
    self.metabolites = CategoricalMetadataColumn(
        pd.Series(['amino acid', 'carbohydrate', 'drug metabolism'],
                  index=pathway_index,
                  name='Super Pathway'))
def paired_heatmap(output_dir: str, ranks: pd.DataFrame,
                   microbes_table: biom.Table,
                   metabolites_table: biom.Table,
                   features: str = None, top_k_microbes: int = 2,
                   keep_top_samples: bool = True,
                   microbe_metadata: qiime2.CategoricalMetadataColumn = None,
                   normalize: str = 'log10', color_palette: str = 'magma',
                   top_k_metabolites: int = 50, level: int = -1,
                   row_center: bool = True) -> None:
    """Render paired feature-abundance heatmaps into ``output_dir``.

    ``ranks`` is transposed and, when ``row_center`` is true, has its
    column means subtracted before being handed to ``paired_heatmaps``.
    The figure is written as PDF and PNG, the two selected-abundance tables
    as TSV, and finally the index template is rendered.
    """
    microbe_series = (
        None if microbe_metadata is None else microbe_metadata.to_series())

    transposed = ranks.T
    if row_center:
        transposed = transposed - transposed.mean(axis=0)

    select_microbes, select_metabolites, figure = paired_heatmaps(
        transposed, microbes_table, metabolites_table, microbe_series,
        features, top_k_microbes, top_k_metabolites, keep_top_samples,
        level, normalize, color_palette)

    for filename in ('heatmap.pdf', 'heatmap.png'):
        figure.savefig(join(output_dir, filename), bbox_inches='tight')

    tsv_outputs = ((select_microbes, 'select_microbes.tsv'),
                   (select_metabolites, 'select_metabolites.tsv'))
    for frame, filename in tsv_outputs:
        frame.to_csv(join(output_dir, filename), sep='\t')

    template = join(TEMPLATES, 'index.html')
    context = {
        'title': 'Paired Feature Abundance Heatmaps',
        'pdf_fp': 'heatmap.pdf',
        'png_fp': 'heatmap.png',
        'table1_fp': 'select_microbes.tsv',
        'download1_text': 'Download microbe abundances as TSV',
        'table2_fp': 'select_metabolites.tsv',
        'download2_text': 'Download top k metabolite abundances as TSV'
    }
    q2templates.render(template, output_dir, context=context)
def aldex2(table: pd.DataFrame,
           metadata: qiime2.CategoricalMetadataColumn,
           mc_samples: int = 128,
           test: str = 't',
           denom: str = 'all') -> pd.DataFrame:
    """Run the ``run_aldex2.R`` script on ``table`` and return its summary.

    Parameters
    ----------
    table : pd.DataFrame
        Input table; its index selects and orders the metadata rows.
    metadata : qiime2.CategoricalMetadataColumn
        Condition for each row of ``table``. The column name is passed to
        the script as the condition.
    mc_samples : int
        Passed to the R script as its fourth argument.
    test : str
        Passed to the R script as its fifth argument.
    denom : str
        Passed to the R script as its sixth argument.

    Returns
    -------
    pd.DataFrame
        The summary written by the script, indexed by ``featureid`` with
        string index labels.

    Raises
    ------
    Exception
        If the R script exits with a non-zero return code. The original
        ``subprocess.CalledProcessError`` is attached as ``__cause__``.
    """
    # create series from the metadata column
    meta = metadata.to_series()

    # The condition is just the only column in the passed metadata column
    condition = metadata.name

    # filter the metadata so only the samples present in the table are used
    # this also reorders it for the correct condition selection
    # it has to be re ordered for aldex to correctly input the conditions
    meta = meta.loc[list(table.index)]

    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.tsv.biom')
        map_fp = os.path.join(temp_dir_name, 'input.map.txt')
        summary_fp = os.path.join(temp_dir_name, 'output.summary.txt')

        # Need to manually specify header=True for Series (i.e. "meta"). It's
        # already the default for DataFrames (i.e. "table"), but we manually
        # specify it here anyway to alleviate any potential confusion.
        table.to_csv(biom_fp, sep='\t', header=True)
        meta.to_csv(map_fp, sep='\t', header=True)

        cmd = [
            'run_aldex2.R', biom_fp, map_fp, condition, mc_samples, test,
            denom, summary_fp
        ]
        # Every argument must be a string before it is handed to the runner.
        cmd = list(map(str, cmd))

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ALDEx2"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more."
                            % e.returncode) from e

        # Must be read before the temporary directory is removed.
        summary = pd.read_csv(summary_fp, index_col=0)

    # hack to fix column name for features because aldex removes
    # it in R because of row.names = 1
    summary.index.name = "featureid"
    summary.rename(index=str, inplace=True)
    return summary
def test_di_typical(self): forward_barcodes = CategoricalMetadataColumn( pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode', index=pd.Index(['sample_a', 'sample_b'], name='id'))) reverse_barcodes = CategoricalMetadataColumn( pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode', index=pd.Index(['sample_a', 'sample_b'], name='id'))) with redirected_stdio(stderr=os.devnull): obs_demuxed_art, obs_untrimmed_art = \ self.demux_paired_fn(self.muxed_sequences, forward_barcodes=forward_barcodes, reverse_barcodes=reverse_barcodes) self.assert_demux_results(forward_barcodes.to_series(), obs_demuxed_art) exp_untrimmed = [ b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n', b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n' ] self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test whether groups of samples differ and render a visualization.

    Parameters
    ----------
    output_dir : str
        Directory receiving the boxplots (PNG and PDF per group),
        ``raw_data.tsv``, the optional ``<method>-pairwise.csv`` and the
        rendered index template.
    distance_matrix : skbio.DistanceMatrix
        Pairwise distances between samples.
    metadata : qiime2.CategoricalMetadataColumn
        Group label per sample. IDs with missing values are dropped from
        both the metadata and the distance matrix.
    method : str
        Key into ``_beta_group_significance_fns``.
    pairwise : bool
        When true, additionally test every pair of groups and add
        Benjamini-Hochberg (``fdr_bh``) q-values.
    permutations : int
        Passed through to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a known significance method.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    # From here on `metadata` is a pandas Series, not a metadata column.
    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix,
                                        metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])

    # Accumulates the per-pair distances of every group for raw_data.tsv.
    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2', 'Group1',
                                          'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # The group id is URL-quoted so it is safe to use as a file name.
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        # Wipe the figure so the next group's plot starts clean.
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # Index [1] of the multipletests result holds the corrected p-values.
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    # Shrink the row width until it divides the number of groups evenly.
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM on ``table`` and write the visualization to ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory that receives the copied ``ancom`` template assets,
        ``ancom.tsv``, ``percent-abundances.tsv`` and the rendered
        ``index.html``.
    table : pd.DataFrame
        Table passed to ``skbio_ancom``. Its index supplies the IDs used to
        filter ``metadata``.
    metadata : qiime2.CategoricalMetadataColumn
        Grouping column. It is filtered to the IDs in ``table.index`` and
        must not have missing values for any of them.
    transform_function : str, optional
        Key into ``_transform_functions``. The selected function is applied
        to every row of ``table``, and the key is reused as the x-axis field
        name and title of the volcano plot.
    difference_function : str, optional
        Key into ``_difference_functions``. When ``None``,
        ``'mean_difference'`` is used if there are exactly two categories
        and ``'f_statistic'`` otherwise.

    Raises
    ------
    ValueError
        If the filtered metadata column has missing values.
    """
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)

    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_df = ancom_results[0]
    percent_abundances = ancom_results[1]

    # Both calls mutate the first result frame in place, so the TSV written
    # at the end carries the sorted order and the renamed column.
    ancom_df.sort_values(by='W', ascending=False, inplace=True)
    ancom_df.rename(columns={'reject': 'Reject null hypothesis'},
                    inplace=True)
    significant_features = ancom_df[ancom_df['Reject null hypothesis']]

    context = {}
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            percent_abundances.loc[significant_features.index])

    metadata = metadata.to_series()
    # Sort the categories: iterating a bare set of strings yields an order
    # that can differ between interpreter runs (hash randomization), and the
    # order is passed positionally to the difference function below. For an
    # order-sensitive function that would make the output irreproducible.
    cats = sorted(set(metadata))

    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        # Split one column into per-category groups and compare them. Some
        # difference functions return a tuple; only its first element is
        # kept in that case.
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        return args

    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)

    # The volcano plot is only emitted when at least one difference value
    # is non-null.
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_df.W})
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
                {'type': 'symbol',
                 'from': {'data': 'values'},
                 'encode': {
                     'hover': {
                         'fill': {'value': '#FF0000'},
                         'opacity': {'value': 1}},
                     'enter': {
                         'x': {'scale': 'xScale',
                               'field': transform_function_name},
                         'y': {'scale': 'yScale', 'field': 'W'}},
                     'update': {
                         'fill': {'value': 'black'},
                         'opacity': {'value': 0.3},
                         'tooltip': {
                             # The doubled braces are str.format escapes for
                             # the literal braces of the signal expression.
                             'signal': "{{'title': datum['index'], '{0}': "
                                       "datum['{0}'], 'W': datum['W']}}".format(
                                           transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_df.to_csv(os.path.join(output_dir, 'ancom.tsv'),
                    header=True, index=True, sep='\t')
    percent_abundances.to_csv(os.path.join(output_dir,
                                           'percent-abundances.tsv'),
                              header=True, index=True, sep='\t')

    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)