def test_combine_exclude_ids_and_features_filters(self):
    """Combine ``exclude_ids=True`` with per-sample feature-count bounds.

    Each scenario excludes the sample IDs found in the metadata and, in the
    same call, applies ``min_features`` and/or ``max_features``. The output
    of ``filter_samples`` is compared with a hand-built expected table.
    """
    obs_ids = ['O1', 'O2']
    samp_ids = ['S1', 'S2', 'S3']

    # (ID listed in metadata, table counts, feature-count bounds,
    #  expected counts, expected feature IDs, expected sample IDs)
    scenarios = [
        # exclude one, min_features filter none
        ('S1', [[0, 1, 3], [1, 1, 2]], {'min_features': 2},
         [[1, 3], [1, 2]], ['O1', 'O2'], ['S2', 'S3']),
        # exclude one, min_features filter one
        ('S2', [[0, 1, 3], [1, 1, 2]], {'min_features': 2},
         [[3], [2]], ['O1', 'O2'], ['S3']),
        # exclude one, min_features filter for same one
        # NOTE(review): S2 has one non-zero count here, so min_features=1
        # presumably would not drop it on its own -- confirm the threshold.
        ('S2', [[4, 1, 3], [6, 0, 2]], {'min_features': 1},
         [[4, 3], [6, 2]], ['O1', 'O2'], ['S1', 'S3']),
        # exclude one, max_features filter none
        ('S1', [[0, 1, 3], [1, 1, 2]], {'max_features': 3},
         [[1, 3], [1, 2]], ['O1', 'O2'], ['S2', 'S3']),
        # exclude one, max_features filter one
        ('S2', [[0, 1, 3], [1, 1, 2]], {'max_features': 1},
         [[1]], ['O2'], ['S1']),
        # exclude one, max_features filter for same one
        # NOTE(review): max_features=9 looks like a frequency threshold
        # rather than a feature count -- confirm this case tests what the
        # label says.
        ('S2', [[0, 1, 3], [1, 10, 2]], {'max_features': 9},
         [[0, 3], [1, 2]], ['O1', 'O2'], ['S1', 'S3']),
        # exclude one, max_features filter none, min_features filter none
        ('S2', [[0, 1, 3], [0, 1, 2]],
         {'min_features': 0, 'max_features': 5},
         [[0, 3], [0, 2]], ['O1', 'O2'], ['S1', 'S3']),
        # exclude one, max_features filter one, min_features filter one
        ('S2', [[0, 1, 3], [0, 1, 2]],
         {'min_features': 1, 'max_features': 1},
         [], [], []),
    ]

    for md_id, counts, bounds, exp_counts, exp_obs, exp_samples in scenarios:
        frame = pd.DataFrame(
            {'Subject': ['subject-1'], 'SampleType': ['gut']},
            index=pd.Index([md_id], name='id'))
        metadata = qiime2.Metadata(frame)
        table = Table(np.array(counts), list(obs_ids), list(samp_ids))
        actual = filter_samples(table, metadata=metadata,
                                exclude_ids=True, **bounds)
        expected = Table(np.array(exp_counts), exp_obs, exp_samples)
        self.assertEqual(actual, expected)

    # where filter one -> exclude one,
    # max_features filter one, min_features filter one
    frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-2'],
         'SampleType': ['gut', 'tongue']},
        index=pd.Index(['S1', 'S2'], name='id'))
    metadata = qiime2.Metadata(frame)
    table = Table(np.array([[0, 1, 3], [0, 1, 2]]),
                  list(obs_ids), list(samp_ids))
    where = "SampleType='tongue'"
    actual = filter_samples(table, metadata=metadata, where=where,
                            exclude_ids=True, min_features=1,
                            max_features=1)
    expected = Table(np.array([]), [], [])
    self.assertEqual(actual, expected)
def _3(ff: CoordinatesFormat) -> (qiime2.Metadata):
    """Read a CoordinatesFormat file and wrap the frame as Metadata."""
    with ff.open() as handle:
        frame = _read_dataframe(handle)
        return qiime2.Metadata(frame)
def _3(ff: FirstDifferencesFormat) -> qiime2.Metadata:
    """Read a FirstDifferencesFormat file and wrap the result as Metadata."""
    with ff.open() as handle:
        differences = _read_first_differences(handle)
        return qiime2.Metadata(differences)
def test_nmit_missing_table_ids(self):
    """nmit should raise ValueError('Missing samples') for this metadata.

    The metadata holds a single row with ID '20' and one column 'i'.
    """
    frame = pd.DataFrame(
        [[1]], columns=['i'], index=pd.Index(['20'], name='id'))
    md = qiime2.Metadata(frame)
    with self.assertRaisesRegex(ValueError, 'Missing samples'):
        nmit(table=self.table_taxa_fp,
             metadata=md,
             individual_id_column='studyid')
def test_first_distances_single_sample(self):
    """first_distances should raise RuntimeError('Output is empty')."""
    with self.assertRaisesRegex(RuntimeError, "Output is empty"):
        # Arguments are assembled inside the context manager so that the
        # Metadata construction stays under the same exception check.
        call_args = dict(
            distance_matrix=dm_single_sample,
            metadata=qiime2.Metadata(md),
            state_column='Time',
            individual_id_column='ind',
            replicate_handling='drop',
        )
        first_distances(**call_args)
def _6(ff: PredictionsFormat) -> (qiime2.Metadata):
    """Read a PredictionsFormat file into Metadata.

    Every column that can be parsed as numeric is converted; columns that
    cannot be parsed are kept exactly as read.
    """
    def _numeric_where_possible(column):
        # Same semantics as the deprecated ``errors='ignore'`` mode of
        # ``pd.to_numeric``: on a parsing failure, return the input
        # unchanged.
        try:
            return pd.to_numeric(column, errors='raise')
        except (ValueError, TypeError):
            return column

    with ff.open() as fh:
        return qiime2.Metadata(
            _read_dataframe(fh).apply(_numeric_where_possible))
def test_id_based_filtering_with_extra_ids(self):
    """Filter by metadata IDs when the metadata also lists the ID 'foo'.

    The expected result contains only 'O1' and 'O3'.
    """
    metadata_ids = pd.Index(['O1', 'O3', 'foo'], name='id')
    md = qiime2.Metadata(pd.DataFrame([], index=metadata_ids))
    exp = pd.Series(['ACGT', 'CCCC'], index=['O1', 'O3'])
    self.filter_and_assertEqual(exp, md=md)
def _6(ff: ModuleMembershipTSVFormat) -> qiime2.Metadata:
    """Load a header-less module-membership TSV as Metadata.

    The first column becomes the index (named '#OTU ID', values coerced to
    ``str``) and the remaining column is labelled 'module_membership'.
    """
    membership = pd.read_table(
        str(ff), index_col=0, header=None, dtype=str)
    membership.columns = ['module_membership']
    membership.index = list(map(str, membership.index))
    membership.index.name = '#OTU ID'
    return qiime2.Metadata(membership)
def synthetic_under_sampling(table: biom.Table,
                             metadata: NumericMetadataColumn,
                             concatenate_meta_fp: Str,
                             method: Str = 'RandomUnderSampler',
                             voting: Str = 'auto',
                             n_jobs: Int = 1,
                             sampling_strategy: Str = 'auto',
                             random_state: Int = 42,
                             output_log_fp: Str = None) -> biom.Table:
    """Under-sample the samples of ``table`` and save the matching labels.

    Parameters
    ----------
    table : biom.Table
        Input feature table. It is normalised in place when ``method`` is
        not 'RandomUnderSampler'.
    metadata : NumericMetadataColumn
        Labels, handed to ``_read_inputs`` together with ``table``.
    concatenate_meta_fp : str
        Path where the labels of the retained samples are saved.
    method : str
        Key into ``dispatcher``; only 'RandomUnderSampler' is implemented.
    voting, n_jobs : logged only; not used by the implemented method.
    sampling_strategy, random_state : forwarded to the sampler.
    output_log_fp : str, optional
        If given, the temporary log file is copied to this path.

    Returns
    -------
    biom.Table
        Table restricted to the samples chosen by the sampler.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``dispatcher``, or if the resampled
        matrix differs from the corresponding rows of the input.
    NotImplementedError
        If ``method`` is known but is not 'RandomUnderSampler'.
    """
    # Validate first: indexing ``dispatcher`` with an unknown method would
    # otherwise raise a bare KeyError before this check is reached.
    if method not in dispatcher:
        raise ValueError('The optional methods for over sampling are',
                         dispatcher.keys(), "instead it received", method)
    cls = dispatcher[method]

    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # and race-prone tempfile.mktemp. delete=False keeps it for the logger.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_log:
        log_fp = tmp_log.name
    print("The log file will be writen into", log_fp)

    logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_over_sampling')
    # Each message is formatted eagerly into one string. Passing the value
    # as an extra positional argument without a placeholder breaks
    # %-style log formatting.
    logger_ins.info("The parameters used for under-sampling are")
    logger_ins.info(
        'voting (will be used with ClusterCentroids only): {}'.format(voting))
    logger_ins.info('Sampling method: {}'.format(method))
    logger_ins.info('Output log file path: {}'.format(log_fp))
    logger_ins.info('sampling_strategy: {}'.format(sampling_strategy))
    logger_ins.info('n_jobs: {}'.format(n_jobs))
    logger_ins.info('random_state: {}'.format(random_state))

    if method != 'RandomUnderSampler':
        table.norm(inplace=True)
        logger_ins.info(
            "The input table is normalized before using it for "
            "under-sampling"
        )

    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    matrix_data = sorted_table.matrix_data.transpose().todense()

    if method == 'RandomUnderSampler':
        under_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 replacement=False)
    else:
        raise NotImplementedError("Method", method, "is not implemented yet")

    X_resampled, y_resampled = under_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    logger_ins.info("The under-sampling finished successfully!")
    logger_ins.info(
        "Overall, the size of data is {}".format(len(X_resampled)))

    # Map the sampler's row indices back to the original sample IDs.
    orig_samples = sorted_table.ids('sample')
    dummy_samples = [orig_samples[idx]
                     for idx in under_sampling_cls.sample_indices_]

    under_sampling_dummy = sorted_table.filter(ids_to_keep=dummy_samples,
                                               inplace=False)
    under_sampling_dummy = under_sampling_dummy.sort_order(
        order=dummy_samples, axis='sample')

    # Sanity check: the sampler must only select rows, never alter them.
    # Absolute differences are summed so that positive and negative
    # deviations cannot cancel each other out.
    deviation = under_sampling_dummy.matrix_data.transpose() - X_resampled
    if np.abs(deviation).sum() != 0:
        raise ValueError("The undersampling changed the matrix data")

    undersampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)

    undersampled_metadata = pd.DataFrame(index=dummy_samples,
                                         data=y_resampled)
    undersampled_metadata.index.names = ['#SampleID']
    undersampled_metadata.columns = ['label']
    undersampled_meta = qiime2.Metadata(undersampled_metadata)
    undersampled_meta.save(concatenate_meta_fp)

    # output_log_fp defaults to None; copying to None raises TypeError.
    if output_log_fp:
        shutil.copy(log_fp, output_log_fp)

    return undersampled_table
def maturity_index(ctx, table, metadata, state_column, group_by, control,
                   individual_id_column=None,
                   estimator='RandomForestRegressor', n_estimators=100,
                   test_size=0.5, step=0.05, cv=5, random_state=None,
                   n_jobs=1, parameter_tuning=False,
                   optimize_feature_selection=False, stratify=False,
                   missing_samples='error'):
    """Pipeline: train a regressor on control samples and derive MAZ scores.

    Steps, in order:

    1. Validate the metadata (superset check only when
       ``missing_samples == 'error'``) and the requested columns.
    2. Keep the samples where ``group_by`` equals ``control``, split them
       into train/test and fit a regressor on ``state_column``.
    3. Drop the training samples from ``table`` and predict every other
       sample.
    4. Summarise the estimator and plot accuracy for the control test
       samples only.
    5. Compute the ``'<state_column> MAZ score'`` column via ``_maz_score``.
    6. Build a heatmap of the important features, grouped by
       ``group_by``/``state_column``.
    7. Build a volatility plot of the MAZ score against ``state_column``.

    Returns
    -------
    tuple
        ``(sample_estimator, importance, predictions, summary,
        accuracy_results, maz_scores, clustermap, lineplots)``.
    """
    # Look up every action this pipeline chains together.
    filter_samples = ctx.get_action('feature_table', 'filter_samples')
    filter_features = ctx.get_action('feature_table', 'filter_features')
    group_table = ctx.get_action('feature_table', 'group')
    heatmap = ctx.get_action('feature_table', 'heatmap')
    split = ctx.get_action('sample_classifier', 'split_table')
    fit = ctx.get_action('sample_classifier', 'fit_regressor')
    predict_test = ctx.get_action('sample_classifier', 'predict_regression')
    summarize_estimator = ctx.get_action('sample_classifier', 'summarize')
    scatter = ctx.get_action('sample_classifier', 'scatterplot')
    volatility = ctx.get_action('longitudinal', 'volatility')

    # we must perform metadata superset validation here before we start
    # slicing and dicing.
    md_as_frame = metadata.to_dataframe()
    if missing_samples == 'error':
        _validate_metadata_is_superset(md_as_frame, table.view(biom.Table))

    # Let's also validate metadata columns before we get busy
    _validate_input_columns(
        md_as_frame, individual_id_column, group_by, state_column, None)

    # train regressor on subset of control samples
    control_table, = filter_samples(
        table, metadata=metadata, where="{0}='{1}'".format(group_by, control))

    md_column = metadata.get_column(state_column)
    # missing_samples='ignore' from here on: the control subset is expected
    # to cover fewer samples than the metadata.
    X_train, X_test = split(control_table, md_column, test_size, random_state,
                            stratify, missing_samples='ignore')

    sample_estimator, importance = fit(
        X_train, md_column, step, cv, random_state, n_jobs, n_estimators,
        estimator, optimize_feature_selection, parameter_tuning,
        missing_samples='ignore')

    # drop training samples from rest of dataset; we will predict all others
    # (an ID-only Metadata built from the training table drives exclude_ids)
    control_ids = pd.DataFrame(index=X_train.view(biom.Table).ids())
    control_ids.index.name = 'id'
    control_ids = qiime2.Metadata(control_ids)
    test_table, = filter_samples(table, metadata=control_ids, exclude_ids=True)

    # predict test samples
    predictions, = predict_test(test_table, sample_estimator, n_jobs)

    # summarize estimator params
    summary, = summarize_estimator(sample_estimator)

    # only report accuracy on control test samples
    test_ids = X_test.view(biom.Table).ids()
    accuracy_md = metadata.filter_ids(test_ids).get_column(state_column)
    accuracy_results, = scatter(predictions, accuracy_md, 'ignore')

    # calculate MAZ score
    # merge is inner join by default, so training samples are dropped (good!)
    pred_md = metadata.merge(predictions.view(qiime2.Metadata)).to_dataframe()
    pred_md['prediction'] = pd.to_numeric(pred_md['prediction'])
    pred_md = _maz_score(
        pred_md, 'prediction', state_column, group_by, control)
    maz = '{0} MAZ score'.format(state_column)
    maz_scores = ctx.make_artifact('SampleData[RegressorPredictions]',
                                   pred_md[maz])

    # make heatmap
    # trim table to important features for viewing as heatmap
    table, = filter_features(table, metadata=importance.view(qiime2.Metadata))
    # make sure IDs match between table and metadata
    cluster_table, = filter_samples(table, metadata=metadata)
    # need to group table by two columns together, so do this ugly hack
    # (a synthetic '<group_by>-<state_column>' column is added to the frame)
    cluster_by = group_by + '-' + state_column
    md_as_frame[cluster_by] = (md_as_frame[group_by].astype(str) + '-' +
                               md_as_frame[state_column].astype(str))
    cluster_md = qiime2.CategoricalMetadataColumn(md_as_frame[cluster_by])
    cluster_table, = group_table(cluster_table, axis='sample',
                                 metadata=cluster_md, mode='median-ceiling')
    # group metadata to match grouped sample IDs and sort by group/column
    clust_md = md_as_frame.groupby(cluster_by).first()
    clust_md = clust_md.sort_values([group_by, state_column])
    # sort table using clustered/sorted metadata as guide
    sorted_table = cluster_table.view(biom.Table).sort_order(clust_md.index)
    sorted_table = ctx.make_artifact('FeatureTable[Frequency]', sorted_table)
    clustermap, = heatmap(sorted_table, cluster='features')

    # visualize MAZ vs. time (column)
    lineplots, = volatility(
        qiime2.Metadata(pred_md), state_column=state_column,
        individual_id_column=individual_id_column,
        default_group_column=group_by, default_metric=maz, yscale='linear')

    return (
        sample_estimator, importance, predictions, summary, accuracy_results,
        maz_scores, clustermap, lineplots)
def multinomial(
    table: biom.Table,
    metadata: Metadata,
    formula: str,
    training_column: str = None,
    num_random_test_examples: int = 5,
    epochs: int = 1000,
    batch_size: int = 5,
    differential_prior: float = 1,
    learning_rate: float = 1e-3,
    clipnorm: float = 10,
    min_sample_count: int = 1000,
    min_feature_count: int = 10,
    summary_interval: int = 60
) -> (pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults):
    """Fit a multinomial regression and report differentials and a biplot.

    The table and metadata are matched and filtered with
    ``match_and_filter``, split into training and test sets with
    ``split_training``, and fitted with ``MultRegression``.

    Returns
    -------
    differentials : pd.DataFrame
        CLR-transformed coefficients, indexed by observation ID with one
        column per design-matrix column.
    convergence_stats : qiime2.Metadata
        Columns 'loglikehood', 'cross-validation' and 'iteration' as
        returned by ``model.fit``.
    biplot : OrdinationResults
        SVD-based biplot of the differentials, or an empty ordination when
        there is a single differential column.
    """
    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them
    table, metadata, design = match_and_filter(
        table, metadata, formula, min_sample_count, min_feature_count)

    # convert to dense representation
    # NOTE(review): DataFrame.to_dense() no longer exists in pandas >= 1.0;
    # left untouched because the supported biom/pandas versions are not
    # visible from here -- confirm before upgrading.
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epochs=epochs,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # Prepend a zero column before the CLR round trip.
    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )
    convergence_stats = pd.DataFrame({
        'loglikehood': loss,
        'cross-validation': cv,
        'iteration': its
    })

    convergence_stats.index.name = 'id'
    # Builtin str/float/int replace the np.str/np.float/np.int aliases,
    # which were removed in NumPy 1.24; the resulting dtypes are identical.
    convergence_stats.index = convergence_stats.index.astype(str)
    for column, dtype in (('loglikehood', float),
                          ('cross-validation', float),
                          ('iteration', int)):
        convergence_stats[column] = convergence_stats[column].astype(dtype)

    # regression biplot
    if differentials.shape[-1] > 1:
        # Reduced SVD: only the leading len(s) singular vectors are used, so
        # skip the (n_features x n_features) ``u`` the full decomposition
        # would allocate.
        u, s, v = np.linalg.svd(differentials, full_matrices=False)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids, index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids, index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts
        biplot = OrdinationResults('', '', pd.Series(), pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
def _volatility(output_dir, metadata, state_column, individual_id_column,
                default_group_column, default_metric, table, yscale,
                importances):
    """Validate inputs, build the Vega spec and write the volatility viz.

    Writes ``data.tsv`` and the rendered ``index.html`` into ``output_dir``,
    plus ``feature_metadata.tsv`` when ``importances`` is given (the
    feature-volatility variant).

    Raises
    ------
    ValueError
        If ``individual_id_column`` equals ``state_column``, or the state
        column has fewer than two unique values.
    TypeError
        If the state column is not numeric.
    """
    if individual_id_column == state_column:
        raise ValueError('individual_id_column & state_column must be set to '
                         'unique values.')

    # verify that individual_id_column exists in metadata
    # other metadata columns are validated later (to ensure correct types)
    if individual_id_column is not None:
        individual_ids = metadata.get_column(
            individual_id_column).to_dataframe()

    is_feat_vol_plot = importances is not None

    if is_feat_vol_plot:
        # We don't want to include any MD columns in the metric select in the
        # feature volatility variant of the viz, except for the state column:
        # keep the categorical columns and merge the state column back in.
        state_md_col = metadata.get_column(state_column).to_dataframe()
        metadata = metadata.filter_columns(column_type='categorical')
        metadata = metadata.merge(qiime2.Metadata(state_md_col))
        # Compile first differences and other stats on feature data
        stats_chart_data = _summarize_feature_stats(table, state_md_col)
        stats_chart_data = importances.join(stats_chart_data, how='inner')
        qiime2.Metadata(stats_chart_data).save(
            os.path.join(output_dir, 'feature_metadata.tsv'))
        stats_chart_data = stats_chart_data.reset_index(drop=False)
        # convert np.nan to None (nans and vega don't mix)
        stats_chart_data = _convert_nan_to_none(stats_chart_data)

    # Convert table to metadata and merge, if present.
    # Note: this renames the index of the caller's ``table`` in place.
    if table is not None:
        table.index.name = 'id'
        table_md = qiime2.Metadata(table)
        metadata = metadata.merge(table_md)

    # Partition the metadata into constituent types and assign defaults.
    categorical = metadata.filter_columns(column_type='categorical')
    numeric = metadata.filter_columns(column_type='numeric')
    # NOTE(review): the [0] lookups below raise IndexError when there is no
    # column of the required type -- confirm callers guarantee at least one.
    if default_group_column is None:
        default_group_column = list(categorical.columns.keys())[0]
    if default_metric is None:
        default_metric = list(numeric.columns.keys())[0]

    # Ensure the default_* columns are members of their respective groups.
    # This will raise a uniform framework error on our behalf if necessary.
    categorical.get_column(default_group_column)
    numeric.get_column(default_metric)

    # We don't need to do any additional validation on the
    # individual_id_column after this point, since it doesn't matter if it is
    # categorical, numeric, only one value, etc.

    # Verify states column is numeric
    states = metadata.get_column(state_column)
    if not isinstance(states, qiime2.NumericMetadataColumn):
        raise TypeError('state_column must be numeric.')

    # Verify that the state column has more than one value present
    uniq_states = states.to_series().unique()
    if len(uniq_states) < 2:
        raise ValueError('state_column must contain at least two unique '
                         'values.')

    group_columns = list(categorical.columns.keys())
    if individual_id_column and individual_id_column not in group_columns:
        group_columns += [individual_id_column]
        # The column may have been dropped by the categorical-only filter
        # above; restore it from the copy taken at the top of the function.
        if individual_id_column not in metadata.columns.keys():
            metadata = metadata.merge(qiime2.Metadata(individual_ids))

    metric_columns = list(numeric.columns.keys())

    control_chart_data = metadata.to_dataframe()
    # convert np.nan to None (nans and vega don't mix)
    control_chart_data = _convert_nan_to_none(control_chart_data)

    # In the feature-volatility variant the state column is not offered as a
    # metric.
    if is_feat_vol_plot:
        metric_columns.remove(state_column)

    # If we made it this far that means we can let Vega do its thing!
    vega_spec = render_spec_volatility(control_chart_data,
                                       (stats_chart_data if is_feat_vol_plot
                                        else None),
                                       individual_id_column,
                                       state_column, default_group_column,
                                       group_columns, default_metric,
                                       metric_columns, yscale,
                                       is_feat_vol_plot)

    # Order matters here - need to render the template *after* copying the
    # directory tree, otherwise we will overwrite the index.html
    metadata.save(os.path.join(output_dir, 'data.tsv'))

    copy_tree(os.path.join(TEMPLATES, 'volatility'), output_dir)
    index = os.path.join(TEMPLATES, 'volatility', 'index.html')
    q2templates.render(index, output_dir,
                       context={'vega_spec': vega_spec,
                                'is_feat_vol_plot': is_feat_vol_plot})
def _load_md(md_fp):
    """Load a tab-separated metadata file from the test data directory.

    ``self`` is not a parameter; it is presumably captured from an
    enclosing test method (verify against the surrounding code).

    Returns the file's contents as ``qiime2.Metadata``, with the first
    column used as the index.
    """
    md_fp = self.get_data_path(md_fp)
    # DataFrame.from_csv was removed in pandas 1.0. read_csv with
    # index_col=0 and parse_dates=True is its documented equivalent.
    md = pd.read_csv(md_fp, sep='\t', index_col=0, parse_dates=True)
    return qiime2.Metadata(md)
def _3(ff: AlphaDiversityFormat) -> qiime2.Metadata:
    """Read an AlphaDiversityFormat file as Metadata indexed by 'Sample ID'."""
    with ff.open() as handle:
        diversity = _read_alpha_diversity(handle)
        diversity.index.name = 'Sample ID'
        result = qiime2.Metadata(diversity)
    return result
def _12(ff: ProbabilitiesFormat) -> (qiime2.Metadata):
    """Read a ProbabilitiesFormat file as Metadata with numeric columns.

    Every column must parse as numeric; a column that does not makes
    ``pd.to_numeric`` raise.
    """
    def _as_numeric(column):
        return pd.to_numeric(column, errors='raise')

    with ff.open() as handle:
        frame = _read_dataframe(handle)
        numeric_frame = frame.apply(_as_numeric)
        return qiime2.Metadata(numeric_frame)
def _29(ff: TSVTaxonomyFormat) -> qiime2.Metadata:
    """Convert a TSV taxonomy file (with header row) into Metadata."""
    path = str(ff)
    return qiime2.Metadata(
        _taxonomy_formats_to_dataframe(path, has_header=True))
def _3(ff: BooleanSeriesFormat) -> (qiime2.Metadata):
    """Read a BooleanSeriesFormat file and wrap the frame as Metadata."""
    with ff.open() as handle:
        frame = _read_dataframe(handle)
        return qiime2.Metadata(frame)
def _dnafastaformats_to_metadata(ff):
    """Convert DNA FASTA content into single-column Metadata.

    The series from ``_dnafastaformats_to_series`` becomes a string-typed
    frame with index name 'Feature ID' and one column, 'Sequence'.
    """
    sequences = _dnafastaformats_to_series(ff)
    frame = sequences.to_frame().astype(str)
    frame.index.name = 'Feature ID'
    frame.columns = ['Sequence']
    return qiime2.Metadata(frame)
def _9(ff: ImportanceFormat) -> (qiime2.Metadata):
    """Read an ImportanceFormat file as Metadata with numeric columns.

    Every column must parse as numeric; a column that does not makes
    ``pd.to_numeric`` raise.
    """
    def _as_numeric(column):
        return pd.to_numeric(column, errors='raise')

    with ff.open() as handle:
        frame = _read_dataframe(handle)
        numeric_frame = frame.apply(_as_numeric)
        return qiime2.Metadata(numeric_frame)
def _224(data: pd.DataFrame) -> DifferentialFormat:
    """Write a DataFrame to a new DifferentialFormat via Metadata.save."""
    fmt = DifferentialFormat()
    as_metadata = qiime2.Metadata(data)
    as_metadata.save(str(fmt))
    return fmt
def _3(ff: TSVMolecules) -> qiime2.Metadata:
    """Convert a TSVMolecules file into Metadata."""
    molecules = _tsvmolecules_to_df(ff)
    return qiime2.Metadata(molecules)
def _3(ff: OrdinationFormat) -> qiime2.Metadata:
    """Convert an OrdinationFormat file into Metadata."""
    return qiime2.Metadata(_ordination_format_to_dataframe(ff))
def test_first_differences_baseline_invalid_baseline(self):
    """first_differences should reject baseline=27 with a ValueError."""
    with self.assertRaisesRegex(ValueError, "must be a valid state"):
        # Arguments are assembled inside the context manager so that the
        # Metadata construction stays under the same exception check.
        call_args = dict(
            metadata=qiime2.Metadata(md_one_subject_many_times),
            state_column='Time',
            individual_id_column='ind',
            metric='Value',
            replicate_handling='drop',
            baseline=27,
        )
        first_differences(**call_args)
def test_sample_metadata(self):
    """Filter samples by the IDs present in the metadata.

    Covers retaining the listed IDs (no filtering, filter one, filter all)
    and excluding them with ``exclude_ids=True`` (exclude none, one, two,
    all).

    Every case builds a fresh input table and an independent expected
    table. The assertions therefore stay meaningful even if
    ``filter_samples`` modifies or returns the object it was given, and no
    case depends on a table left over from an earlier one.
    """
    def make_table():
        return Table(np.array([[0, 1, 3], [1, 1, 2]]),
                     ['O1', 'O2'], ['S1', 'S2', 'S3'])

    def make_metadata(columns, ids):
        frame = pd.DataFrame(columns, index=pd.Index(ids, name='id'))
        return qiime2.Metadata(frame)

    # Expected tables as (counts, feature IDs, sample IDs).
    unfiltered = ([[0, 1, 3], [1, 1, 2]], ['O1', 'O2'], ['S1', 'S2', 'S3'])
    nothing_left = ([], [], [])

    three_samples = {'Subject': ['subject-1', 'subject-1', 'subject-2'],
                     'SampleType': ['gut', 'tongue', 'gut']}
    one_sample = {'Subject': ['subject-1'], 'SampleType': ['gut']}
    exclude = {'exclude_ids': True}

    # (label, metadata columns, metadata IDs, extra kwargs, expected)
    cases = [
        ('no filtering', three_samples, ['S1', 'S2', 'S3'], {},
         unfiltered),
        ('filter one',
         {'Subject': ['subject-1', 'subject-2'],
          'SampleType': ['tongue', 'gut']},
         ['S2', 'S3'], {},
         ([[1, 3], [1, 2]], ['O1', 'O2'], ['S2', 'S3'])),
        ('filter all', {}, ['foo'], {}, nothing_left),
        ('exclude none', one_sample, ['S90'], exclude, unfiltered),
        ('exclude one', one_sample, ['S1'], exclude,
         ([[1, 3], [1, 2]], ['O1', 'O2'], ['S2', 'S3'])),
        ('exclude two',
         {'Subject': ['subject-1', 'subject-1'],
          'SampleType': ['gut', 'tongue']},
         ['S1', 'S2'], exclude,
         ([[3], [2]], ['O1', 'O2'], ['S3'])),
        ('exclude all', three_samples, ['S1', 'S2', 'S3'], exclude,
         nothing_left),
    ]

    for label, columns, ids, options, (counts, obs, samples) in cases:
        with self.subTest(case=label):
            actual = filter_samples(make_table(),
                                    metadata=make_metadata(columns, ids),
                                    **options)
            expected = Table(np.array(counts), obs, samples)
            self.assertEqual(actual, expected)
def _3(ff: QualityFilterStatsFmt) -> qiime2.Metadata:
    """Convert a QualityFilterStatsFmt file into Metadata."""
    stats = _stats_to_df(ff)
    return qiime2.Metadata(stats)
def test_where(self):
    """Filter samples with a ``where`` clause, retaining or excluding.

    All cases share the same three-sample metadata (index name
    '#SampleID') and the same 2x3 input table; only the ``where`` clause
    and ``exclude_ids`` differ.

    Every case builds a fresh input table and an independent expected
    table, so the assertions stay meaningful even if ``filter_samples``
    modifies or returns the object it was given.
    """
    def make_table():
        return Table(np.array([[0, 1, 3], [1, 1, 2]]),
                     ['O1', 'O2'], ['S1', 'S2', 'S3'])

    def make_metadata():
        frame = pd.DataFrame(
            {'Subject': ['subject-1', 'subject-1', 'subject-2'],
             'SampleType': ['gut', 'tongue', 'gut']},
            index=pd.Index(['S1', 'S2', 'S3'], name='#SampleID'))
        return qiime2.Metadata(frame)

    # Expected tables as (counts, feature IDs, sample IDs).
    unfiltered = ([[0, 1, 3], [1, 1, 2]], ['O1', 'O2'], ['S1', 'S2', 'S3'])
    nothing_left = ([], [], [])
    exclude = {'exclude_ids': True}

    # (label, where clause, extra kwargs, expected)
    cases = [
        ('no filtering',
         "Subject='subject-1' OR Subject='subject-2'", {},
         unfiltered),
        ('filter one',
         "Subject='subject-1'", {},
         ([[0, 1], [1, 1]], ['O1', 'O2'], ['S1', 'S2'])),
        ('filter two',
         "Subject='subject-1' AND SampleType='gut'", {},
         ([[1]], ['O2'], ['S1'])),
        ('filter all',
         "Subject='subject-1' AND Subject='subject-2'", {},
         nothing_left),
        ('filter none -> exclude none',
         "Subject='subject-1' AND SampleType='elbow'", exclude,
         unfiltered),
        ('filter one -> exclude one',
         "Subject='subject-1' AND SampleType='gut'", exclude,
         ([[1, 3], [1, 2]], ['O1', 'O2'], ['S2', 'S3'])),
        ('filter two -> exclude two',
         "Subject='subject-1'", exclude,
         ([[3], [2]], ['O1', 'O2'], ['S3'])),
        ('filter all -> exclude all',
         "Subject='subject-1' OR Subject='subject-2'", exclude,
         nothing_left),
    ]

    for label, where, options, (counts, obs, samples) in cases:
        with self.subTest(case=label):
            actual = filter_samples(make_table(),
                                    metadata=make_metadata(),
                                    where=where, **options)
            expected = Table(np.array(counts), obs, samples)
            self.assertEqual(actual, expected)
def _6(ff: QuadTreeFormat) -> qiime2.Metadata:
    """Read a QuadTreeFormat file and wrap the frame as Metadata."""
    with ff.open() as handle:
        frame = _read_dataframe(handle)
        return qiime2.Metadata(frame)
def _8(ff: TaxonomyFormat) -> qiime2.Metadata:
    """Read a TaxonomyFormat file and wrap the result as Metadata."""
    path = str(ff)
    return qiime2.Metadata(_read_taxonomy(path))