def setUp(self):
        super().setUp()
        self.table = biom.Table(np.array([[0, 1, 2],
                                          [2, 4, 6],
                                          [3, 0, 1]]),
                                ['feature-1', 'feature-2', 'feature-3'],
                                ['sample-1', 'sample-2', 'sample-3'])
        self.taxonomy_df = pd.DataFrame([['feature-1', 'a; b; c', 0.123],
                                         ['feature-2', 'a; b; c; d; e', 0.345],
                                         ['feature-3', 'a; f; g; h', 0.678]],
                                        columns=['Feature ID', 'Taxon',
                                                 'Confidence'])
        self.taxonomy_df.set_index('Feature ID', inplace=True)
        self.table2 = biom.Table([[0, 0.1, 0.2],
                                  [0.2, 0.4, 0.6],
                                  [0.3, 0, 0.1]],
                                 ['feature-1', 'feature-2', 'feature-3'],
                                 ['sample-1', 'sample-2', 'sample-3'])

        self.table_artifact = Artifact.import_data(
            "FeatureTable[Frequency]", self.table
        )
        self.taxonomy_artifact = Artifact.import_data(
            "FeatureData[Taxonomy]", self.taxonomy_df,
        )
        self.table2_artifact = Artifact.import_data(
            "FeatureTable[Frequency]", self.table2
        )
        self.table_qza = self.create_tempfile(suffix='.qza').name
        self.table_artifact.save(self.table_qza)
        self.taxonomy_qza = self.create_tempfile(suffix='.qza').name
        self.taxonomy_artifact.save(self.taxonomy_qza)
        self.table2_qza = self.create_tempfile(suffix='.qza').name
        self.table2_artifact.save(self.table2_qza)
        self.table_biom = self.create_tempfile(suffix='.biom').name
        with biom_open(self.table_biom, 'w') as f:
            self.table.to_hdf5(f, 'test-table')
        self.resources = ResourceManager()
        self.config = {'table_resources': {
            'simple-table': {
                'table': self.table_qza,
                'q2-type': FeatureTable[Frequency]
            },
            'second-simple-table': {
                'table': self.table2_qza,
            },
            'table-with-taxonomy': {
                'table': self.table_qza,
                'feature-data-taxonomy': self.taxonomy_qza,
            },
            'table-with-variance': {
                'table': self.table_qza,
                'variances': self.table2_qza,
            },
            'table-with-taxonomy-and-variance': {
                'table': self.table_qza,
                'feature-data-taxonomy': self.taxonomy_qza,
                'variances': self.table2_qza,
            },
            'table-from-biom': {
                'table': self.table_biom,
                'feature-data-taxonomy': self.taxonomy_qza,
                'variances': self.table_biom,
                'table-format': 'biom',
            },
            'table-with-cached-taxonomy': {
                'table': self.table_qza,
                'feature-data-taxonomy': self.taxonomy_qza,
                'cache-taxonomy': True,
            },
        }}
Example #2
def _dataframe_to_table(df):
    if df.index.inferred_type != 'string':
        raise TypeError("Please provide a DataFrame with a string-based Index")
    return biom.Table(df.T.values, observation_ids=df.columns,
                      sample_ids=df.index)
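
A minimal usage sketch for the helper above (toy data; the imports are assumptions, since the snippet shows none):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.array([[1, 2], [3, 4]]),
                  index=['sample-1', 'sample-2'],      # samples as rows
                  columns=['feature-1', 'feature-2'])  # features as columns
table = _dataframe_to_table(df)
print(table.shape)  # (2, 2): observations x samples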
Example #3
#  Main code
###############################################
matplotlib.rc('text', usetex=True)
# Setup
np.random.seed(0)
zheng = zhengr  # direct alias; the lambda wrapper added nothing
res_folder = '../results/correlations/'

params = getParams(sts=[11], interactions=['commensal'])

params = init_data(params, num_samps=100)
corr_mat = build_correlation_matrix(params)
table = build_contingency_table(params)

# save table
# use string IDs; biom's text serializers join IDs as strings
bT = biom.Table(table, list(map(str, range(table.shape[0]))),
                list(map(str, range(table.shape[1]))))
biomname = '../data/tables_6_3_2015/bioms/table_1.biom'
txtname = '../data/tables_6_3_2015/txts/table_1.txt'
with open(biomname, 'w') as f:
    f.write(bT.to_json('Jamie'))
with open(txtname, 'w') as f:
    f.write(bT.to_tsv())
#######################################################################
#                  Absolute Ecological Relations                      #
#######################################################################
pearson_corr_mat = abs(np.corrcoef(table.T))
spearman_corr_mat = abs(spearmanr(table)[0])
zheng_corr_mat = get_corr_matrix(table, zheng)
# Can insert sparcc_corr_mat right here.  Just need to
# 1. read the text file of correlations into a pandas DataFrame
# 2. extract the matrix via DataFrame.to_numpy() (as_matrix() has been
#    removed from pandas)
# 3. take the absolute value via DataFrame.abs()
# (a sketch follows below)
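
A sketch of those three steps, hedged: the correlations file path and separator below are assumptions about the SparCC output, not taken from the source.

import pandas as pd

sparcc_df = pd.read_csv('../results/correlations/sparcc_corr.txt',  # hypothetical path
                        sep='\t', index_col=0)   # 1. read correlations
sparcc_corr_mat = sparcc_df.abs().to_numpy()     # 2 + 3. matrix, absolute value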
Example #4
def synthetic_over_sampling(table: biom.Table,
                            metadata: NumericMetadataColumn,
                            concatenate_meta: Str,
                            method: Str = 'SMOTETomek',
                            k_neighbors: Int = 5,
                            m_neighbors: Int = 10,
                            n_jobs: Int = 1,
                            log_fp: Str = None,
                            sampling_strategy: Str = 'auto',
                            random_state: Int = 42,
                            output_log_fp: Str = None) -> biom.Table:
    if log_fp:
        logger_ins = LOG(
            log_fp=log_fp).get_logger('synthetic_sampling_combination')
        logger_ins.info("The parameters used for oversampling are")
        logger_ins.info('k_neighbors:', k_neighbors)
        logger_ins.info('m_neighbors:', m_neighbors)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)

    # Validate the method before indexing into the dispatcher; otherwise an
    # unknown method raises a bare KeyError and this check is never reached.
    if method not in dispatcher:
        raise ValueError("The supported over-sampling methods are %s; "
                         "received %r instead"
                         % (list(dispatcher.keys()), method))
    cls = dispatcher[method]
    if method != 'RandomOverSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for oversampling"
            )
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    # Densify: len() is ambiguous on scipy sparse matrices, and the sanity
    # check below relies on it.
    matrix_data = sorted_table.matrix_data.transpose().toarray()
    if method == 'ADASYN':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state,
                                n_neighbors=k_neighbors,
                                n_jobs=n_jobs)
    elif method == 'RandomOverSampler':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state)
    else:
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                m_neighbors=m_neighbors,
                                random_state=random_state,
                                n_jobs=n_jobs,
                                k_neighbors=k_neighbors)
    X_resampled, y_resampled = over_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    if np.sum(np.abs(X_resampled[:len(matrix_data), :] - matrix_data)) != 0 or \
            np.sum(y_resampled[:len(matrix_data)] == sorted_metadata) != \
            len(matrix_data):
        raise ValueError(
            "Over sampling method changed the data! Please double check your biom table"
        )
    else:
        if log_fp:
            logger_ins.info("The oversampling finished successfully!")
            logger_ins.info(
                "The first", len(matrix_data),
                "samples belong to the original training samples and the "
                "next",
                len(X_resampled) - len(matrix_data),
                "samples belong to the new ones")
            logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method != 'RandomOverSampler':
        dummy_samples = np.asarray(
            list(sorted_table.ids('sample')) + [
                "dummy_sample_" + str(i)
                for i in range(len(X_resampled) - len(matrix_data))
            ])
    else:
        dummy_samples = over_sampling_cls.sample_indices_

    oversampled_table = biom.Table(
        X_resampled.transpose(),  # biom expects observations x samples
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    oversampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    oversampled_metadata.index.names = ['#SampleID']
    oversampled_metadata.columns = ['label']
    oversampled_meta = qiime2.Metadata(oversampled_metadata)
    oversampled_meta.save(concatenate_meta)

    return oversampled_table
Example #5
def _drop_axis_metadata(table):
    return biom.Table(table.matrix_data,
                      observation_ids=table.ids(axis='observation'),
                      sample_ids=table.ids(axis='sample'))
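
A quick sketch of what gets stripped (toy table; the metadata dicts are illustrative):

import numpy as np
import biom

t = biom.Table(np.array([[1, 2], [3, 4]]),
               ['O1', 'O2'], ['S1', 'S2'],
               observation_metadata=[{'taxonomy': ['a']}, {'taxonomy': ['b']}])
bare = _drop_axis_metadata(t)
print(bare.metadata(axis='observation'))  # None: metadata gone, data and IDs kept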
    def test_basic(self):
        obs = self.cmd(self.uc_minimal)
        expected = biom.Table(np.array([[1.0]]),
                              observation_ids=['f2_1539'],
                              sample_ids=['f2'])
        self.assertEqual(obs, expected)
Example #7
    def test_receives_empty_table(self):
        empty_table = biom.Table(np.array([]), [], [])
        with self.assertRaisesRegex(ValueError, "empty"):
            faith_pd(table=empty_table, phylogeny=self.input_tree)
    def test_one_to_one_rename(self):
        sample_mc = qiime2.CategoricalMetadataColumn(
            pd.Series(['a_new', 'b_new', 'c_new'],
                      name='foo',
                      index=pd.Index(['a', 'b', 'c'], name='sampleid')))
        original_sample_ids = sample_mc.to_series().index
        new_sample_ids = list(sample_mc.to_series())

        feature_mc = qiime2.CategoricalMetadataColumn(
            pd.Series(['x_new', 'y_new'],
                      name='foo',
                      index=pd.Index(['x', 'y'], name='featureid')))
        original_feature_ids = feature_mc.to_series().index
        new_feature_ids = list(feature_mc.to_series())

        data = np.array([[1, 2, 3], [30, 20, 10]])
        table = biom.Table(data,
                           sample_ids=original_sample_ids,
                           observation_ids=original_feature_ids)

        # Sample renames
        expected = biom.Table(data,
                              sample_ids=new_sample_ids,
                              observation_ids=original_feature_ids)

        # Sample X Sum
        result = group(table, axis='sample', metadata=sample_mc, mode='sum')
        self.assertEqual(expected, result)

        # Sample X Mean
        result = group(table,
                       axis='sample',
                       metadata=sample_mc,
                       mode='mean-ceiling')
        self.assertEqual(expected, result)

        # Sample X Median
        result = group(table,
                       axis='sample',
                       metadata=sample_mc,
                       mode='median-ceiling')
        self.assertEqual(expected, result)

        # Feature renames
        expected = biom.Table(data,
                              sample_ids=original_sample_ids,
                              observation_ids=new_feature_ids)

        # Feature X Sum
        result = group(table, axis='feature', metadata=feature_mc, mode='sum')
        self.assertEqual(expected, result)

        # Feature X Mean
        result = group(table,
                       axis='feature',
                       metadata=feature_mc,
                       mode='mean-ceiling')
        self.assertEqual(expected, result)

        # Feature X Median
        result = group(table,
                       axis='feature',
                       metadata=feature_mc,
                       mode='median-ceiling')
        self.assertEqual(expected, result)
    def test_uc(self):
        obs = self.cmd(self.uc)
        expected = biom.Table(np.array([[1.0, 1.0], [0.0, 1.0]]),
                              observation_ids=['f2_1539', 'f3_1540'],
                              sample_ids=['f2', 'f3'])
        self.assertEqual(obs, expected)
Example #10
    def test_alpha_unknown_metric(self):
        t = biom.Table(np.array([[0, 1, 3], [1, 1, 2]]),
                       ['O1', 'O2'],
                       ['S1', 'S2', 'S3'])
        with self.assertRaises(ValueError):
            alpha(table=t, metric='not-a-metric')
Example #11
    def test_alpha_empty_table(self):
        t = biom.Table(np.array([]), [], [])

        with self.assertRaisesRegex(ValueError, "empty"):
            alpha(table=t, metric='observed_otus')
Example #12
    def test_alpha_phylo_metric(self):
        t = biom.Table(np.array([[0, 1, 3], [1, 1, 2]]),
                       ['O1', 'O2'],
                       ['S1', 'S2', 'S3'])
        with self.assertRaises(ValueError):
            alpha(table=t, metric='faith_pd')
def synthetic_under_sampling(table: biom.Table,
                             metadata: NumericMetadataColumn,
                             concatenate_meta_fp: Str,
                             method: Str = 'RandomUnderSampler',
                             voting: Str = 'auto',
                             n_jobs: Int = 1,
                             sampling_strategy: Str = 'auto',
                             random_state: Int = 42,
                             output_log_fp: Str = None) -> biom.Table:
    # tempfile.mktemp() is deprecated and racy; reserve the path safely instead
    with tempfile.NamedTemporaryFile(suffix='.log', delete=False) as tmp:
        log_fp = tmp.name
    print("The log file will be written into", log_fp)
    if log_fp:
        logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_under_sampling')
        logger_ins.info("The parameters used for under-sampling are")
        logger_ins.info('voting (will be used with ClusterCentroids only):',
                        voting)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)

    # Validate the method before indexing into the dispatcher; otherwise an
    # unknown method raises a bare KeyError and this check is never reached.
    if method not in dispatcher:
        raise ValueError("The supported under-sampling methods are %s; "
                         "received %r instead"
                         % (list(dispatcher.keys()), method))
    cls = dispatcher[method]
    if method != 'RandomUnderSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for "
                "under-sampling"
            )
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    matrix_data = sorted_table.matrix_data.transpose().todense()
    if method == 'RandomUnderSampler':
        under_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 replacement=False)
    else:
        raise NotImplementedError("Method %r is not implemented yet" % method)
    X_resampled, y_resampled = under_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    if log_fp:
        logger_ins.info("The under-sampling finished successfully!")
        logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method == 'RandomUnderSampler':
        dummy_samples_ids = under_sampling_cls.sample_indices_
        dummy_samples = []
        orig_samples = sorted_table.ids('sample')
        for sample_id in dummy_samples_ids:
            dummy_samples.append(orig_samples[sample_id])
    else:
        raise NotImplementedError("Method %r is not implemented yet" % method)
    under_sampling_dummy = sorted_table.filter(ids_to_keep=dummy_samples,
                                               inplace=False)
    under_sampling_dummy = under_sampling_dummy.sort_order(order=dummy_samples,
                                                           axis='sample')
    if method == "RandomUnderSampler" and np.sum(
            under_sampling_dummy.matrix_data.transpose() - X_resampled) != 0:
        raise ValueError("The undersampling changed the matrix data")

    undersampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    undersampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    undersampled_metadata.index.names = ['#SampleID']
    undersampled_metadata.columns = ['label']
    undersampled_meta = qiime2.Metadata(undersampled_metadata)
    undersampled_meta.save(concatenate_meta_fp)

    if log_fp and output_log_fp:
        shutil.copy(log_fp, output_log_fp)

    return undersampled_table
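
An illustrative call with toy data (LOG, dispatcher, and _read_inputs are module-level helpers from the snippet's package and are assumed importable; the output path is hypothetical):

import numpy as np
import pandas as pd
import biom
import qiime2

t = biom.Table(np.array([[0, 1, 2], [2, 4, 6]]),
               ['f1', 'f2'], ['s1', 's2', 's3'])
col = qiime2.NumericMetadataColumn(
    pd.Series([0.0, 0.0, 1.0], name='label',
              index=pd.Index(['s1', 's2', 's3'], name='id')))
balanced = synthetic_under_sampling(t, col, concatenate_meta_fp='meta.tsv')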

if __name__ == '__main__':
    deblur_table = Artifact.load(sys.argv[1]).view(Table)
    inv_otu_map = get_inv_otu_map(
        [l.strip().split('\t') for l in open(sys.argv[2])])
    output = sys.argv[3]

    new_table_rcv = defaultdict(int)
    ggid_to_index = {}
    sample_to_index = {}

    for deblur_seq, sample in deblur_table.nonzero():
        ggid = inv_otu_map.get(deblur_seq, None)

        # if the deblurred sequence isn't in the OTU map (i.e., didn't recruit)
        if ggid is None:
            continue

        ggid_idx = next_index(ggid_to_index, ggid)
        sample_idx = next_index(sample_to_index, sample)

        value = deblur_table.get_value_by_ids(deblur_seq, sample)
        new_table_rcv[(ggid_idx, sample_idx)] += value

    ggid_order = index_to_order(ggid_to_index)
    sample_order = index_to_order(sample_to_index)
    new_table = biom.Table(new_table_rcv, ggid_order, sample_order)

    Artifact.import_data("FeatureTable[Frequency]", new_table).save(output)
Example #15
    def __init__(self,
                 table: biom.Table,
                 features: pd.DataFrame,
                 variances: biom.Table = None,
                 formatter: Optional['Formatter'] = None,
                 rank_level: int = 1):
        """Establish the taxonomy data

        Parameters
        ----------
        table : biom.Table
            Relative abundance data per sample or collapsed into higher order
            entities (e.g., abx in the past year)
        features : pd.DataFrame
            DataFrame relating an observation to a Taxon
        variances : biom.Table, optional
            Variation information about a taxon within a label.
        formatter : Formatter, optional
            Formats taxon lineages for display; defaults to a
            GreengenesFormatter when not provided.
        rank_level : int
            The taxonomic level (depth) to compute ranks over. Level 0 is
            domain, level 1 is phylum, etc.
        """
        self._table = table.norm(inplace=False)
        self._group_id_lookup = set(self._table.ids())
        self._feature_id_lookup = set(self._table.ids(axis='observation'))
        self._feature_order = self._table.ids(axis='observation')
        self._features = features

        if variances is None:
            empty = ss.csr_matrix((len(
                self._table.ids(axis='observation')), len(self._table.ids())),
                                  dtype=float)
            self._variances = biom.Table(empty,
                                         self._table.ids(axis='observation'),
                                         self._table.ids())
        else:
            self._variances = variances

        if set(self._variances.ids()) != set(self._table.ids()):
            raise DisjointError("Table and variances are disjoint")

        if set(self._variances.ids(axis='observation')) != \
                set(self._table.ids(axis='observation')):
            raise DisjointError("Table and variances are disjoint")

        if not self._feature_id_lookup.issubset(set(self._features.index)):
            raise SubsetError("Table features are not a subset of the "
                              "taxonomy information")

        self._ranked, self._ranked_order = self._rankdata(rank_level)

        self._features = self._features.loc[self._feature_order]
        self._variances = self._variances.sort_order(self._feature_order,
                                                     axis='observation')

        if formatter is None:
            formatter: Formatter = GreengenesFormatter()
        self._formatter = formatter

        # initialize taxonomy tree
        tree_data = ((i, lineage.split('; '))
                     for i, lineage in self._features['Taxon'].items())
        self.taxonomy_tree = skbio.TreeNode.from_taxonomy(tree_data)
        for node in self.taxonomy_tree.traverse():
            node.length = 1
        self.bp_tree = parse_newick(str(self.taxonomy_tree))

        feature_taxons = self._features
        self._formatted_taxa_names = {
            i: self._formatter.dict_format(lineage)
            for i, lineage in feature_taxons['Taxon'].items()
        }
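
A hypothetical instantiation (the enclosing class name is not shown in the snippet; 'Taxonomy' is assumed, and module-level helpers such as GreengenesFormatter and parse_newick must already be importable):

import numpy as np
import pandas as pd
import biom

tbl = biom.Table(np.array([[1, 2], [3, 4]]),
                 ['feature-1', 'feature-2'], ['sample-1', 'sample-2'])
feats = pd.DataFrame(
    {'Taxon': ['a; b; c', 'a; b; d']},
    index=pd.Index(['feature-1', 'feature-2'], name='Feature ID'))
model = Taxonomy(tbl, feats)  # class name assumed; variances default to zeros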
    def test_uc_w_mapping(self):
        obs = self.cmd(self.uc, self.rep_set)
        expected = biom.Table(np.array([[1.0, 1.0], [0.0, 1.0]]),
                              observation_ids=['otu1', 'otu2'],
                              sample_ids=['f2', 'f3'])
        self.assertEqual(obs, expected)
Example #17
region2_align = \
    Artifact.import_data('FeatureData[KmerAlignment]', pd.DataFrame(
        data=np.array([['seq1', 'asv06', 15, 0, 2, 'Gotham'],
                       ['seq2', 'asv07', 15, 0, 2, 'Gotham'],
                       ['seq3', 'asv08', 15, 0, 2, 'Gotham'],
                       ['seq4', 'asv09', 15, 0, 2, 'Gotham'],
                       ['seq5', 'asv10', 15, 0, 2, 'Gotham'],
                       ['seq6', 'asv11', 15, 0, 2, 'Gotham']],
                       dtype=object),
        columns=['kmer', 'asv', 'length', 'mismatch', 'max-mismatch', 'region']
    ))
region1_counts = Artifact.import_data(
    'FeatureTable[Frequency]',
    biom.Table(
        np.array([[150, 0, 0, 50, 50], [125, 50, 50, 25, 25],
                  [100, 0, 100, 50, 50]]).T,
        sample_ids=['sample1', 'sample2', 'sample3'],
        observation_ids=['asv01', 'asv02', 'asv03', 'asv04', 'asv05'],
    ))
region1_alt_counts = Artifact.import_data(
    'FeatureTable[Frequency]',
    biom.Table(
        np.array([[150, 0, 0, 50, 50, 50], [125, 50, 50, 25, 25, 25],
                  [100, 0, 100, 50, 50, 0]]).T,
        sample_ids=['sample1', 'sample2', 'sample3'],
        observation_ids=['asv01', 'asv02', 'asv03', 'asv04', 'asv05', 'asv20'],
    ))
region2_counts = Artifact.import_data(
    'FeatureTable[Frequency]',
    biom.Table(
        data=np.array([[100, 50, 0, 50, 50, 50], [100, 25, 100, 25, 25, 25],
                       [0, 100, 100, 0, 50, 50]]).T,
        sample_ids=['sample1', 'sample2', 'sample3'],
        # NOTE: the closing arguments were truncated in the source; the
        # observation IDs here are assumed to mirror region2_align (asv06-asv11)
        observation_ids=['asv06', 'asv07', 'asv08', 'asv09', 'asv10', 'asv11'],
    ))
Example #18
def _biom_from_samples(context, samples, get=None, normalize_taxonomy=None):
    """Create a BIOM table from an iterable of samples

    Parameters
    ----------
    context : str
        The context to obtain sample data from.
    samples : iterable of str
        The samples to fetch.
    get : a make_get instance, optional
        A constructed get method.
    normalize_taxonomy : list, optional
        The ranks to normalize a lineage to (e.g., [k, p, c, o, f, g, s])

    Returns
    -------
    biom.Table
        A Table populated with the found samples.
    dict
        A map of {sample_id_in_table: original_id}. This map can be used to
        identify which samples are ambiguous based on their original IDs.

    Redis command summary
    ---------------------
    HMGET <context>:feature-index-inverted
    EVALSHA <fetch-sample-sha1> 0 context <redbiom-id>
    """
    from operator import itemgetter
    import scipy.sparse as ss
    import biom
    import redbiom.admin
    import redbiom._requests
    import redbiom.util
    import redbiom
    config = redbiom.get_config()

    if get is None:
        get = redbiom._requests.make_get(config)

    se = redbiom._requests.make_script_exec(config)

    redbiom._requests.valid(context, get)

    samples = list(samples)  # unroll iterator if necessary

    # resolve ambiguities
    stable_ids, unobserved, ambig_assoc, rimap = \
        redbiom.util.resolve_ambiguities(context, samples, get)

    table_data = []
    unique_indices = set()
    fetch_sample = redbiom.admin.ScriptManager.get('fetch-sample')
    for id_ in rimap:
        # 0 -> we're passing 0 keys, and instead using ARGV
        data = se(fetch_sample, 0, context, id_)
        table_data.append((id_, data))
        unique_indices.update(data)

    # construct a mapping of
    # {feature ID : index position in the BIOM table}
    unique_indices_map = {
        observed: index
        for index, observed in enumerate(unique_indices)
    }

    # pull out the feature and sample IDs in the desired ordering
    obs_ids = [
        id_ for id_, _ in sorted(unique_indices_map.items(), key=itemgetter(1))
    ]
    sample_ids = [id_ for id_, _ in table_data]

    # fill in the matrix
    mat = ss.lil_matrix((len(unique_indices), len(table_data)))
    for col, (sample, col_data) in enumerate(table_data):
        # since this isn't dense, hopefully roworder doesn't hose us
        for obs_id, value in col_data.items():
            mat[unique_indices_map[obs_id], col] = value

    lineages = taxon_ancestors(context,
                               obs_ids,
                               get,
                               normalize=normalize_taxonomy)

    if lineages is not None:
        obs_md = [{'taxonomy': lineage} for lineage in lineages]
    else:
        obs_md = None

    table = biom.Table(mat, obs_ids, sample_ids, obs_md)
    table.update_ids(rimap)

    return table, ambig_assoc
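
A hedged call sketch (requires a live redbiom/Redis deployment; the context name and sample IDs below are illustrative only):

table, ambiguities = _biom_from_samples(
    'Deblur-Illumina-16S-V4-150nt',          # hypothetical context
    ['sample-a', 'sample-b'],                # sample IDs to fetch
    normalize_taxonomy=['k', 'p', 'c', 'o', 'f', 'g', 's'])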
    def test_basic_w_mapping(self):
        obs = self.cmd(self.uc_minimal, self.rep_set)
        expected = biom.Table(np.array([[1.0]]),
                              observation_ids=['otu1'],
                              sample_ids=['f2'])
        self.assertEqual(obs, expected)
Example #20
    def setUp(self):
        super().setUp()
        self.empty_table = biom.Table(np.array([]), [], [])