def test_combine_exclude_ids_and_features_filters(self):
        # Exercises filter_samples with exclude_ids=True combined with the
        # min_features / max_features thresholds (and, in the final scenario,
        # a `where` clause). Each scenario builds its own metadata/table pair
        # and compares the filtered table with a hand-written expectation.

        # exclude one, min_features filter none
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S1'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                min_features=2)
        expected = Table(np.array([[1, 3], [1, 2]]),
                         ['O1', 'O2'],
                         ['S2', 'S3'])
        self.assertEqual(actual, expected)

        # exclude one, min_features filter one
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                min_features=2)
        expected = Table(np.array([[3], [2]]),
                         ['O1', 'O2'],
                         ['S3'])
        self.assertEqual(actual, expected)

        # exclude one, min_features filter for same one
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[4, 1, 3], [6, 0, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                min_features=1)
        expected = Table(np.array([[4, 3], [6, 2]]),
                         ['O1', 'O2'],
                         ['S1', 'S3'])
        self.assertEqual(actual, expected)

        # exclude one, max_features filter none
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S1'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                max_features=3)
        expected = Table(np.array([[1, 3], [1, 2]]),
                         ['O1', 'O2'],
                         ['S2', 'S3'])
        self.assertEqual(actual, expected)

        # exclude one, max_features filter one
        # (the expectation also lacks O1, whose only remaining count is 0)
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                max_features=1)
        expected = Table(np.array([[1]]), ['O2'], ['S1'])
        self.assertEqual(actual, expected)

        # exclude one, max_features filter for same one
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 10, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                max_features=9)
        expected = Table(np.array([[0, 3], [1, 2]]),
                         ['O1', 'O2'],
                         ['S1', 'S3'])
        self.assertEqual(actual, expected)

        # exclude one, max_features filter none,
        # min_features filter none
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [0, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                min_features=0,
                                max_features=5)
        expected = Table(np.array([[0, 3], [0, 2]]),
                         ['O1', 'O2'],
                         ['S1', 'S3'])
        self.assertEqual(actual, expected)

        # exclude one, max_features filter one,
        # min_features filter one
        # (nothing survives, so the expectation is an empty table)
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [0, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table,
                                metadata=metadata,
                                exclude_ids=True,
                                min_features=1,
                                max_features=1)
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)

        # where filter one -> exclude one,
        # max_features filter one,
        # min_features filter one
        df = pd.DataFrame({'Subject': ['subject-1', 'subject-2'],
                           'SampleType': ['gut', 'tongue']},
                          index=pd.Index(['S1', 'S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [0, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        where = "SampleType='tongue'"
        actual = filter_samples(table,
                                metadata=metadata,
                                where=where,
                                exclude_ids=True,
                                min_features=1,
                                max_features=1)
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)
Esempio n. 2
0
def _3(ff: CoordinatesFormat) -> (qiime2.Metadata):
    """Read ``ff`` with ``_read_dataframe`` and wrap it in Metadata."""
    with ff.open() as handle:
        frame = _read_dataframe(handle)
        return qiime2.Metadata(frame)
Esempio n. 3
0
def _3(ff: FirstDifferencesFormat) -> qiime2.Metadata:
    """Parse ``ff`` with ``_read_first_differences``; return Metadata."""
    with ff.open() as handle:
        parsed = _read_first_differences(handle)
        return qiime2.Metadata(parsed)
Esempio n. 4
0
 def test_nmit_missing_table_ids(self):
     # The metadata lists a single ID ('20'); nmit is expected to fail with
     # a ValueError whose message matches 'Missing samples'.
     sample_index = pd.Index(['20'], name='id')
     frame = pd.DataFrame([[1]], columns=['i'], index=sample_index)
     lone_sample_md = qiime2.Metadata(frame)
     with self.assertRaisesRegex(ValueError, 'Missing samples'):
         nmit(individual_id_column='studyid',
              metadata=lone_sample_md,
              table=self.table_taxa_fp)
Esempio n. 5
0
 def test_first_distances_single_sample(self):
     # first_distances must raise RuntimeError matching "Output is empty"
     # for the dm_single_sample fixture.
     with self.assertRaisesRegex(RuntimeError, "Output is empty"):
         sample_md = qiime2.Metadata(md)
         first_distances(
             replicate_handling='drop',
             individual_id_column='ind',
             state_column='Time',
             metadata=sample_md,
             distance_matrix=dm_single_sample)
Esempio n. 6
0
def _6(ff: PredictionsFormat) -> (qiime2.Metadata):
    """Read ``ff`` into Metadata, converting columns to numeric when possible.

    The file is parsed with ``_read_dataframe``; each column is passed to
    ``pd.to_numeric`` and columns that cannot be converted are kept as read.
    """
    def _numeric_if_possible(column):
        # Explicit equivalent of ``pd.to_numeric(column, errors='ignore')``,
        # which is deprecated in recent pandas: on a parsing failure the
        # input column is returned unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    with ff.open() as fh:
        return qiime2.Metadata(
            _read_dataframe(fh).apply(_numeric_if_possible))
 def test_id_based_filtering_with_extra_ids(self):
     # The metadata carries an ID ('foo') that is absent from the expected
     # result; only 'O1' and 'O3' are expected back.
     expected_seqs = pd.Series(['ACGT', 'CCCC'], index=['O1', 'O3'])
     id_index = pd.Index(['O1', 'O3', 'foo'], name='id')
     id_only_md = qiime2.Metadata(pd.DataFrame([], index=id_index))
     self.filter_and_assertEqual(expected_seqs, md=id_only_md)
Esempio n. 8
0
def _6(ff: ModuleMembershipTSVFormat) -> qiime2.Metadata:
    """Load a header-less, tab-separated file into Metadata.

    The first column becomes the (string) index, named '#OTU ID'; the
    remaining column is labelled 'module_membership'.
    """
    membership = pd.read_table(
        str(ff), index_col=0, header=None, dtype=str)
    membership.columns = ['module_membership']
    # Force every index label to ``str`` before naming the index.
    membership.index = list(map(str, membership.index))
    membership.index.name = '#OTU ID'
    return qiime2.Metadata(membership)
def synthetic_under_sampling(table: biom.Table,
                             metadata: NumericMetadataColumn,
                             concatenate_meta_fp: Str,
                             method: Str = 'RandomUnderSampler',
                             voting: Str = 'auto',
                             n_jobs: Int = 1,
                             sampling_strategy: Str = 'auto',
                             random_state: Int = 42,
                             output_log_fp: Str = None) -> biom.Table:
    """Under-sample the samples of ``table`` and save the matching labels.

    Parameters
    ----------
    table : biom.Table
        Input feature table; it is aligned with ``metadata`` through
        ``_read_inputs``.
    metadata : NumericMetadataColumn
        Labels passed to the sampler's ``fit_resample``.
    concatenate_meta_fp : Str
        Path where the labels of the retained samples are saved (single
        column named 'label', index named '#SampleID').
    method : Str
        Key into ``dispatcher``. Only 'RandomUnderSampler' is implemented.
    voting, n_jobs : Str, Int
        Only written to the log; no implemented method consumes them.
    sampling_strategy, random_state : Str, Int
        Forwarded to the sampler constructor.
    output_log_fp : Str, optional
        When given, the temporary log file is copied to this path.

    Returns
    -------
    biom.Table
        Table restricted to the retained samples.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``dispatcher``, or if the resampled
        matrix differs from the matching columns of the input table.
    NotImplementedError
        If ``method`` is known but is not 'RandomUnderSampler'.
    """
    # Validate *before* indexing ``dispatcher`` so an unknown method yields
    # the intended ValueError instead of a bare KeyError.
    if method not in dispatcher:
        raise ValueError(
            'The optional methods for under sampling are {}, instead it '
            'received {}'.format(list(dispatcher.keys()), method))
    # Fail fast for unimplemented methods: previously the caller's table was
    # normalized in place right before this error was raised anyway.
    if method != 'RandomUnderSampler':
        raise NotImplementedError(
            'Method {} is not implemented yet'.format(method))
    cls = dispatcher[method]

    # NOTE(review): tempfile.mktemp() is deprecated and race-prone; kept
    # because it is unclear whether LOG expects a not-yet-existing path.
    log_fp = tempfile.mktemp()
    print("The log file will be writen into", log_fp)
    logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_over_sampling')
    # Messages are formatted up front: passing extra positional arguments
    # without %-placeholders makes standard logging fail to render them.
    logger_ins.info("The parameters used for oversampling are")
    logger_ins.info(
        'voting (will be used with ClusterCentroids only): {}'.format(voting))
    logger_ins.info('Sampling method: {}'.format(method))
    logger_ins.info('Output log file path: {}'.format(log_fp))
    logger_ins.info('sampling_strategy: {}'.format(sampling_strategy))
    logger_ins.info('n_jobs: {}'.format(n_jobs))
    logger_ins.info('random_state: {}'.format(random_state))

    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    # Transposed so that rows are samples, which fit_resample resamples.
    matrix_data = sorted_table.matrix_data.transpose().todense()
    under_sampling_cls = cls(sampling_strategy=sampling_strategy,
                             random_state=random_state,
                             replacement=False)
    X_resampled, y_resampled = under_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    logger_ins.info("The under-sampling finished successfully!")
    logger_ins.info(
        "Overall, the size of data is {}".format(len(X_resampled)))

    # Translate the sampler's positional indices back into sample IDs.
    orig_samples = sorted_table.ids('sample')
    dummy_samples = [orig_samples[sample_idx]
                     for sample_idx in under_sampling_cls.sample_indices_]

    under_sampling_dummy = sorted_table.filter(ids_to_keep=dummy_samples,
                                               inplace=False)
    under_sampling_dummy = under_sampling_dummy.sort_order(order=dummy_samples,
                                                           axis='sample')
    # Sanity check: the sampler must only select rows, never alter them.
    # Absolute differences are summed so opposite-sign errors cannot cancel.
    mismatch = under_sampling_dummy.matrix_data.transpose() - X_resampled
    if np.abs(mismatch).sum() != 0:
        raise ValueError("The undersampling changed the matrix data")

    undersampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    undersampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    undersampled_metadata.index.names = ['#SampleID']
    undersampled_metadata.columns = ['label']
    undersampled_meta = qiime2.Metadata(undersampled_metadata)
    undersampled_meta.save(concatenate_meta_fp)

    # Copy the log only when a destination was requested; copying to the
    # default ``None`` used to raise TypeError.
    if output_log_fp:
        shutil.copy(log_fp, output_log_fp)

    return undersampled_table
Esempio n. 10
0
def maturity_index(ctx,
                   table,
                   metadata,
                   state_column,
                   group_by,
                   control,
                   individual_id_column=None,
                   estimator='RandomForestRegressor',
                   n_estimators=100,
                   test_size=0.5,
                   step=0.05,
                   cv=5,
                   random_state=None,
                   n_jobs=1,
                   parameter_tuning=False,
                   optimize_feature_selection=False,
                   stratify=False,
                   missing_samples='error'):
    """Pipeline: fit a regressor on control samples and derive MAZ scores.

    Steps, in order:

    1. Validate metadata (superset check only when ``missing_samples`` is
       'error') and the requested columns.
    2. Keep samples where ``group_by`` equals ``control``, split them into
       train/test, and fit a regressor on ``state_column``.
    3. Predict every sample that was not used for training.
    4. Report accuracy on the held-out control samples only.
    5. Compute MAZ scores via ``_maz_score`` and wrap them in an artifact.
    6. Build a heatmap of the important features, grouped by the
       ``group_by``/``state_column`` combination.
    7. Build a volatility plot of the MAZ scores.

    Parameters
    ----------
    ctx
        Pipeline context; provides ``get_action`` and ``make_artifact``.
    table
        Feature table artifact (viewed as ``biom.Table``).
    metadata
        Sample metadata providing ``to_dataframe``, ``get_column``,
        ``filter_ids`` and ``merge``.
    state_column, group_by, control, individual_id_column
        Column names (and the control value of ``group_by``) used as
        described above.
    estimator, n_estimators, test_size, step, cv, random_state, n_jobs,
    parameter_tuning, optimize_feature_selection, stratify
        Forwarded to the ``split_table`` / ``fit_regressor`` /
        ``predict_regression`` actions.
    missing_samples
        'error' triggers the metadata superset validation.

    Returns
    -------
    tuple
        ``(sample_estimator, importance, predictions, summary,
        accuracy_results, maz_scores, clustermap, lineplots)``.
    """

    filter_samples = ctx.get_action('feature_table', 'filter_samples')
    filter_features = ctx.get_action('feature_table', 'filter_features')
    group_table = ctx.get_action('feature_table', 'group')
    heatmap = ctx.get_action('feature_table', 'heatmap')
    split = ctx.get_action('sample_classifier', 'split_table')
    fit = ctx.get_action('sample_classifier', 'fit_regressor')
    predict_test = ctx.get_action('sample_classifier', 'predict_regression')
    summarize_estimator = ctx.get_action('sample_classifier', 'summarize')
    scatter = ctx.get_action('sample_classifier', 'scatterplot')
    volatility = ctx.get_action('longitudinal', 'volatility')

    # we must perform metadata superset validation here before we start
    # slicing and dicing.
    md_as_frame = metadata.to_dataframe()
    if missing_samples == 'error':
        _validate_metadata_is_superset(md_as_frame, table.view(biom.Table))

    # Let's also validate metadata columns before we get busy
    _validate_input_columns(
        md_as_frame, individual_id_column, group_by, state_column, None)

    # train regressor on subset of control samples
    control_table, = filter_samples(
        table, metadata=metadata, where="{0}='{1}'".format(group_by, control))

    md_column = metadata.get_column(state_column)
    # missing_samples='ignore' here: validation already happened above
    X_train, X_test = split(control_table, md_column, test_size, random_state,
                            stratify, missing_samples='ignore')

    sample_estimator, importance = fit(
        X_train, md_column, step, cv, random_state, n_jobs, n_estimators,
        estimator, optimize_feature_selection, parameter_tuning,
        missing_samples='ignore')

    # drop training samples from rest of dataset; we will predict all others
    control_ids = pd.DataFrame(index=X_train.view(biom.Table).ids())
    control_ids.index.name = 'id'
    control_ids = qiime2.Metadata(control_ids)
    test_table, = filter_samples(table, metadata=control_ids, exclude_ids=True)

    # predict test samples
    predictions, = predict_test(test_table, sample_estimator, n_jobs)

    # summarize estimator params
    summary, = summarize_estimator(sample_estimator)

    # only report accuracy on control test samples
    test_ids = X_test.view(biom.Table).ids()
    accuracy_md = metadata.filter_ids(test_ids).get_column(state_column)
    accuracy_results, = scatter(predictions, accuracy_md, 'ignore')

    # calculate MAZ score
    # merge is inner join by default, so training samples are dropped (good!)
    pred_md = metadata.merge(predictions.view(qiime2.Metadata)).to_dataframe()
    pred_md['prediction'] = pd.to_numeric(pred_md['prediction'])
    pred_md = _maz_score(
        pred_md, 'prediction', state_column, group_by, control)
    maz = '{0} MAZ score'.format(state_column)
    maz_scores = ctx.make_artifact('SampleData[RegressorPredictions]',
                                   pred_md[maz])

    # make heatmap
    # trim table to important features for viewing as heatmap
    table, = filter_features(table, metadata=importance.view(qiime2.Metadata))
    # make sure IDs match between table and metadata
    cluster_table, = filter_samples(table, metadata=metadata)
    # need to group table by two columns together, so do this ugly hack
    cluster_by = group_by + '-' + state_column
    md_as_frame[cluster_by] = (md_as_frame[group_by].astype(str) + '-' +
                               md_as_frame[state_column].astype(str))
    cluster_md = qiime2.CategoricalMetadataColumn(md_as_frame[cluster_by])
    cluster_table, = group_table(cluster_table, axis='sample',
                                 metadata=cluster_md, mode='median-ceiling')
    # group metadata to match grouped sample IDs and sort by group/column
    clust_md = md_as_frame.groupby(cluster_by).first()
    clust_md = clust_md.sort_values([group_by, state_column])
    # sort table using clustered/sorted metadata as guide
    sorted_table = cluster_table.view(biom.Table).sort_order(clust_md.index)
    sorted_table = ctx.make_artifact('FeatureTable[Frequency]', sorted_table)
    clustermap, = heatmap(sorted_table, cluster='features')

    # visualize MAZ vs. time (column)
    lineplots, = volatility(
        qiime2.Metadata(pred_md), state_column=state_column,
        individual_id_column=individual_id_column,
        default_group_column=group_by, default_metric=maz, yscale='linear')

    return (
        sample_estimator, importance, predictions, summary, accuracy_results,
        maz_scores, clustermap, lineplots)
Esempio n. 11
0
def multinomial(
    table: biom.Table,
    metadata: Metadata,
    formula: str,
    training_column: str = None,
    num_random_test_examples: int = 5,
    epochs: int = 1000,
    batch_size: int = 5,
    differential_prior: float = 1,
    learning_rate: float = 1e-3,
    clipnorm: float = 10,
    min_sample_count: int = 1000,
    min_feature_count: int = 10,
    summary_interval: int = 60
) -> (pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults):
    """Fit a ``MultRegression`` model and return its differentials.

    Parameters
    ----------
    table : biom.Table
        Feature table; matched and filtered against ``metadata`` with
        ``match_and_filter``.
    metadata : Metadata
        Sample metadata, converted with ``to_dataframe()``.
    formula : str
        Passed to ``match_and_filter`` to build the design matrix.
    training_column, num_random_test_examples
        Passed to ``split_training`` to form the train/test split.
    epochs, summary_interval
        Passed to ``model.fit``.
    batch_size, differential_prior, learning_rate, clipnorm
        Passed to the ``MultRegression`` constructor
        (``differential_prior`` as ``beta_mean``).
    min_sample_count, min_feature_count
        Passed to ``match_and_filter``.

    Returns
    -------
    differentials : pd.DataFrame
        Indexed by observation ID, one column per design-matrix column.
    convergence_stats : qiime2.Metadata
        Columns 'loglikehood', 'cross-validation' and 'iteration'.
    biplot : skbio.OrdinationResults
        SVD-based biplot of the differentials; an empty placeholder when
        there is only one differential column.
    """

    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them
    table, metadata, design = match_and_filter(table, metadata, formula,
                                               min_sample_count,
                                               min_feature_count)

    # convert to dense representation
    # NOTE(review): DataFrame.to_dense() no longer exists in pandas >= 1.0;
    # confirm the supported pandas/biom versions before changing this call.
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(dense_table, metadata,
                                                  design, training_column,
                                                  num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(epochs=epochs,
                                  summary_interval=summary_interval,
                                  checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # Prepend a zero column to model.B, then map through clr_inv and clr.
    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    differentials = pd.DataFrame(
        beta_.T,
        columns=md_ids,
        index=obs_ids,
    )
    convergence_stats = pd.DataFrame({
        'loglikehood': loss,
        'cross-validation': cv,
        'iteration': its
    })

    convergence_stats.index.name = 'id'
    # The builtins replace np.str / np.float / np.int, which were plain
    # aliases of them and have been removed from NumPy.
    convergence_stats.index = convergence_stats.index.astype(str)
    for column, dtype in (('loglikehood', float),
                          ('cross-validation', float),
                          ('iteration', int)):
        convergence_stats[column] = convergence_stats[column].astype(dtype)

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids,
                               index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids,
                                index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(short_method_name,
                                   long_method_name,
                                   eigvals,
                                   samples=samples,
                                   features=features,
                                   proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts
        # Explicit dtype keeps the empty eigenvalue Series float on every
        # pandas version and avoids the empty-Series dtype warning.
        biplot = OrdinationResults('', '', pd.Series(dtype=float),
                                   pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
Esempio n. 12
0
def _volatility(output_dir, metadata, state_column, individual_id_column,
                default_group_column, default_metric, table, yscale,
                importances):
    """Render the volatility visualization into ``output_dir``.

    Parameters
    ----------
    output_dir
        Directory receiving 'data.tsv', the template files and, for the
        feature-volatility variant, 'feature_metadata.tsv'.
    metadata
        Metadata object; optionally merged with ``table``.
    state_column
        Name of the state column; must be numeric with at least two unique
        values.
    individual_id_column
        Optional column identifying individuals; must differ from
        ``state_column``.
    default_group_column, default_metric
        Defaults for the plot; when None, the first categorical / numeric
        column is used.
    table
        Optional frame that is converted to Metadata and merged in.
    yscale
        Passed through to ``render_spec_volatility``.
    importances
        Optional feature importances; when given, the feature-volatility
        variant of the plot is produced.

    Raises
    ------
    ValueError
        If ``individual_id_column`` equals ``state_column`` or the state
        column has fewer than two unique values.
    TypeError
        If the state column is not numeric.
    """
    if individual_id_column == state_column:
        raise ValueError('individual_id_column & state_column must be set to '
                         'unique values.')

    # verify that individual_id_column exists in metadata
    # other metadata columns are validated later (to ensure correct types)
    if individual_id_column is not None:
        individual_ids = metadata.get_column(
            individual_id_column).to_dataframe()

    is_feat_vol_plot = importances is not None

    if is_feat_vol_plot:
        # We don't want to include any MD columns in the metric select in the
        # feature volatility variant of the viz, except for the state column
        state_md_col = metadata.get_column(state_column).to_dataframe()
        metadata = metadata.filter_columns(column_type='categorical')
        metadata = metadata.merge(qiime2.Metadata(state_md_col))

        # Compile first differences and other stats on feature data
        stats_chart_data = _summarize_feature_stats(table, state_md_col)
        stats_chart_data = importances.join(stats_chart_data, how='inner')
        qiime2.Metadata(stats_chart_data).save(
            os.path.join(output_dir, 'feature_metadata.tsv'))
        stats_chart_data = stats_chart_data.reset_index(drop=False)
        # convert np.nan to None (nans and vega don't mix)
        stats_chart_data = _convert_nan_to_none(stats_chart_data)

    # Convert table to metadata and merge, if present.
    if table is not None:
        table.index.name = 'id'
        table_md = qiime2.Metadata(table)
        metadata = metadata.merge(table_md)

    # Partition the metadata into constituent types and assign defaults.
    categorical = metadata.filter_columns(column_type='categorical')
    numeric = metadata.filter_columns(column_type='numeric')
    if default_group_column is None:
        default_group_column = list(categorical.columns.keys())[0]
    if default_metric is None:
        default_metric = list(numeric.columns.keys())[0]

    # Ensure the default_* columns are members of their respective groups.
    # This will raise a uniform framework error on our behalf if necessary.
    categorical.get_column(default_group_column)
    numeric.get_column(default_metric)

    # We don't need to do any additional validation on the
    # individual_id_column after this point, since it doesn't matter if it is
    # categorical, numeric, only one value, etc.

    # Verify states column is numeric
    states = metadata.get_column(state_column)
    if not isinstance(states, qiime2.NumericMetadataColumn):
        raise TypeError('state_column must be numeric.')

    # Verify that the state column has more than one value present
    uniq_states = states.to_series().unique()
    if len(uniq_states) < 2:
        raise ValueError('state_column must contain at least two unique '
                         'values.')

    group_columns = list(categorical.columns.keys())
    if individual_id_column and individual_id_column not in group_columns:
        group_columns += [individual_id_column]
        # Re-attach the individual ID column if an earlier filter dropped it
        if individual_id_column not in metadata.columns.keys():
            metadata = metadata.merge(qiime2.Metadata(individual_ids))
    metric_columns = list(numeric.columns.keys())
    control_chart_data = metadata.to_dataframe()
    # convert np.nan to None (nans and vega don't mix)
    control_chart_data = _convert_nan_to_none(control_chart_data)

    if is_feat_vol_plot:
        metric_columns.remove(state_column)

    # If we made it this far that means we can let Vega do its thing!
    vega_spec = render_spec_volatility(control_chart_data,
                                       (stats_chart_data if is_feat_vol_plot
                                        else None),
                                       individual_id_column,
                                       state_column, default_group_column,
                                       group_columns, default_metric,
                                       metric_columns, yscale,
                                       is_feat_vol_plot)

    # Order matters here - need to render the template *after* copying the
    # directory tree, otherwise we will overwrite the index.html
    metadata.save(os.path.join(output_dir, 'data.tsv'))
    copy_tree(os.path.join(TEMPLATES, 'volatility'), output_dir)
    index = os.path.join(TEMPLATES, 'volatility', 'index.html')
    q2templates.render(index, output_dir,
                       context={'vega_spec': vega_spec,
                                'is_feat_vol_plot': is_feat_vol_plot})
Esempio n. 13
0
 def _load_md(md_fp):
     # Resolve ``md_fp`` through ``self.get_data_path`` and load the
     # tab-separated file as qiime2.Metadata.
     # NOTE(review): ``self`` is not a parameter; presumably this helper is
     # nested inside a test method and closes over it - confirm at call site.
     md_fp = self.get_data_path(md_fp)
     # DataFrame.from_csv was removed from pandas; read_csv with
     # index_col=0 and parse_dates=True reproduces its defaults.
     md = pd.read_csv(md_fp, sep='\t', index_col=0, parse_dates=True)
     md = qiime2.Metadata(md)
     return md
Esempio n. 14
0
def _3(ff: AlphaDiversityFormat) -> qiime2.Metadata:
    """Read ``ff`` with ``_read_alpha_diversity`` and return Metadata.

    The index of the parsed frame is renamed to 'Sample ID' first.
    """
    with ff.open() as handle:
        alpha = _read_alpha_diversity(handle)
        alpha.index.name = 'Sample ID'
        return qiime2.Metadata(alpha)
Esempio n. 15
0
def _12(ff: ProbabilitiesFormat) -> (qiime2.Metadata):
    """Read ``ff`` into Metadata, requiring every column to be numeric.

    Conversion uses ``pd.to_numeric(..., errors='raise')``, so a column
    that cannot be parsed raises.
    """
    with ff.open() as handle:
        raw = _read_dataframe(handle)
        numeric = raw.apply(
            lambda column: pd.to_numeric(column, errors='raise'))
        return qiime2.Metadata(numeric)
Esempio n. 16
0
def _29(ff: TSVTaxonomyFormat) -> qiime2.Metadata:
    """Parse the file at ``str(ff)`` (with header) and return Metadata."""
    return qiime2.Metadata(
        _taxonomy_formats_to_dataframe(str(ff), has_header=True))
Esempio n. 17
0
def _3(ff: BooleanSeriesFormat) -> (qiime2.Metadata):
    """Read ``ff`` with ``_read_dataframe`` and wrap it in Metadata."""
    with ff.open() as handle:
        frame = _read_dataframe(handle)
        return qiime2.Metadata(frame)
Esempio n. 18
0
def _dnafastaformats_to_metadata(ff):
    """Convert ``ff`` to a one-column string Metadata table.

    The series from ``_dnafastaformats_to_series`` becomes a frame with
    index name 'Feature ID' and a single column 'Sequence'.
    """
    sequences = _dnafastaformats_to_series(ff).to_frame().astype(str)
    sequences.index.name = 'Feature ID'
    sequences.columns = ['Sequence']
    return qiime2.Metadata(sequences)
Esempio n. 19
0
def _9(ff: ImportanceFormat) -> (qiime2.Metadata):
    """Read ``ff`` into Metadata, requiring every column to be numeric.

    Conversion uses ``pd.to_numeric(..., errors='raise')``, so a column
    that cannot be parsed raises.
    """
    with ff.open() as handle:
        raw = _read_dataframe(handle)
        numeric = raw.apply(
            lambda column: pd.to_numeric(column, errors='raise'))
        return qiime2.Metadata(numeric)
Esempio n. 20
0
def _224(data: pd.DataFrame) -> DifferentialFormat:
    """Save ``data`` as Metadata into a new ``DifferentialFormat``."""
    result = DifferentialFormat()
    as_metadata = qiime2.Metadata(data)
    as_metadata.save(str(result))
    return result
Esempio n. 21
0
def _3(ff: TSVMolecules) -> qiime2.Metadata:
    """Convert ``ff`` with ``_tsvmolecules_to_df`` and return Metadata."""
    molecules = _tsvmolecules_to_df(ff)
    return qiime2.Metadata(molecules)
Esempio n. 22
0
def _3(ff: OrdinationFormat) -> qiime2.Metadata:
    """Convert ``ff`` to a data frame and wrap it in Metadata."""
    return qiime2.Metadata(_ordination_format_to_dataframe(ff))
Esempio n. 23
0
 def test_first_differences_baseline_invalid_baseline(self):
     # Calling first_differences with baseline=27 must raise a ValueError
     # matching "must be a valid state".
     with self.assertRaisesRegex(ValueError, "must be a valid state"):
         subject_md = qiime2.Metadata(md_one_subject_many_times)
         first_differences(
             baseline=27,
             replicate_handling='drop',
             metric='Value',
             individual_id_column='ind',
             state_column='Time',
             metadata=subject_md)
    def test_sample_metadata(self):
        # Exercises ID-based filtering in filter_samples: by default the
        # samples listed in the metadata are kept; with exclude_ids=True
        # they are dropped instead.

        # no filtering
        df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                           'SampleType': ['gut', 'tongue', 'gut']},
                          index=pd.Index(['S1', 'S2', 'S3'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, metadata=metadata)
        expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                         ['O1', 'O2'],
                         ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter one
        df = pd.DataFrame({'Subject': ['subject-1', 'subject-2'],
                           'SampleType': ['tongue', 'gut']},
                          index=pd.Index(['S2', 'S3'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, metadata=metadata)
        expected = Table(np.array([[1, 3], [1, 2]]),
                         ['O1', 'O2'],
                         ['S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter all
        # (the only metadata ID, 'foo', is not a sample in the table)
        df = pd.DataFrame({}, index=pd.Index(['foo'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, metadata=metadata)
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)

        # exclude none
        # ('S90' is not in the table, so the result must equal the input)
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S90'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, metadata=metadata, exclude_ids=True)
        self.assertEqual(actual, table)

        # exclude one
        df = pd.DataFrame({'Subject': ['subject-1'],
                           'SampleType': ['gut']},
                          index=pd.Index(['S1'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, metadata=metadata, exclude_ids=True)
        expected = Table(np.array([[1, 3], [1, 2]]),
                         ['O1', 'O2'],
                         ['S2', 'S3'])
        self.assertEqual(actual, expected)

        # exclude two
        df = pd.DataFrame({'Subject': ['subject-1', 'subject-1'],
                           'SampleType': ['gut', 'tongue']},
                          index=pd.Index(['S1', 'S2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, metadata=metadata, exclude_ids=True)
        expected = Table(np.array([[3], [2]]),
                         ['O1', 'O2'],
                         ['S3'])
        self.assertEqual(actual, expected)

        # exclude all
        # (reuses `table` from the previous scenario)
        df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                           'SampleType': ['gut', 'tongue', 'gut']},
                          index=pd.Index(['S1', 'S2', 'S3'], name='id'))
        metadata = qiime2.Metadata(df)
        actual = filter_samples(table, metadata=metadata,
                                exclude_ids=True)
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)
def _3(ff: QualityFilterStatsFmt) -> qiime2.Metadata:
    """Wrap the frame ``_stats_to_df`` produces for *ff* in ``qiime2.Metadata``."""
    stats_frame = _stats_to_df(ff)
    return qiime2.Metadata(stats_frame)
    def test_where(self):
        """Exercise ``filter_samples`` with a metadata ``where`` clause.

        Every case runs against the same three-sample inputs, rebuilt from
        scratch each time. The first four cases pass only ``where``; the
        last four add ``exclude_ids=True`` and expect the complementary set
        of samples.
        """
        def fresh_metadata():
            # S1 and S2 belong to subject-1, S3 to subject-2.
            frame = pd.DataFrame(
                {'Subject': ['subject-1', 'subject-1', 'subject-2'],
                 'SampleType': ['gut', 'tongue', 'gut']},
                index=pd.Index(['S1', 'S2', 'S3'], name='#SampleID'))
            return qiime2.Metadata(frame)

        def fresh_table():
            return Table(np.array([[0, 1, 3], [1, 1, 2]]),
                         ['O1', 'O2'],
                         ['S1', 'S2', 'S3'])

        def run_filter(where, **options):
            # New table and metadata objects for every call, as each case
            # is meant to start from pristine inputs.
            return filter_samples(fresh_table(),
                                  metadata=fresh_metadata(),
                                  where=where,
                                  **options)

        # no filtering
        actual = run_filter("Subject='subject-1' OR Subject='subject-2'")
        self.assertEqual(actual, fresh_table())

        # filter one
        actual = run_filter("Subject='subject-1'")
        expected = Table(np.array([[0, 1], [1, 1]]),
                         ['O1', 'O2'],
                         ['S1', 'S2'])
        self.assertEqual(actual, expected)

        # filter two
        actual = run_filter("Subject='subject-1' AND SampleType='gut'")
        # Only O2 is expected to remain: S1's count for O1 is 0.
        expected = Table(np.array([[1]]),
                         ['O2'],
                         ['S1'])
        self.assertEqual(actual, expected)

        # filter all
        actual = run_filter("Subject='subject-1' AND Subject='subject-2'")
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)

        # filter none -> exclude none
        actual = run_filter("Subject='subject-1' AND SampleType='elbow'",
                            exclude_ids=True)
        # Compare against an independently built table: comparing with the
        # object that was passed in would pass trivially if filter_samples
        # returns its (possibly modified) input.
        self.assertEqual(actual, fresh_table())

        # filter one -> exclude one
        actual = run_filter("Subject='subject-1' AND SampleType='gut'",
                            exclude_ids=True)
        expected = Table(np.array([[1, 3], [1, 2]]),
                         ['O1', 'O2'],
                         ['S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter two -> exclude two
        actual = run_filter("Subject='subject-1'", exclude_ids=True)
        expected = Table(np.array([[3], [2]]),
                         ['O1', 'O2'],
                         ['S3'])
        self.assertEqual(actual, expected)

        # filter all -> exclude all
        actual = run_filter("Subject='subject-1' OR Subject='subject-2'",
                            exclude_ids=True)
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)
Esempio n. 27
0
def _6(ff: QuadTreeFormat) -> qiime2.Metadata:
    """Read *ff* through ``_read_dataframe`` and wrap the result as Metadata."""
    with ff.open() as handle:
        # Build the Metadata while the handle is still open, as before.
        frame = _read_dataframe(handle)
        return qiime2.Metadata(frame)
Esempio n. 28
0
def _8(ff: TaxonomyFormat) -> qiime2.Metadata:
    """Wrap what ``_read_taxonomy`` returns for ``str(ff)`` as Metadata."""
    return qiime2.Metadata(_read_taxonomy(str(ff)))