Example no. 1
    def test_validate_prefix(self):
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/jobs/job-id/step/")
        httpretty.register_uri(
            httpretty.GET,
            "https://test_server.com/qiita_db/prep_template/1/data",
            body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
                 '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")

        self._clean_up_files.append(biom_fp)

        self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp

        obs_success, obs_ainfo, obs_error = validate(
            self.qclient, 'job-id', self.parameters, self.out_dir)
        exp_biom_fp = join(self.out_dir, basename(biom_fp))
        self._clean_up_files.append(exp_biom_fp)
        self.assertTrue(obs_success)
        self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
        self.assertEqual(obs_error, "")
        obs_t = load_table(exp_biom_fp)
        self.assertItemsEqual(obs_t.ids(), ["1.S1", "1.S2", "1.S3"])
Example no. 2
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int = 1) -> skbio.DistanceMatrix:
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if n_jobs != 1 and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        results = skbio.diversity.beta_diversity(
            metric=metric,
            counts=counts,
            ids=sample_ids,
            otu_ids=feature_ids,
            tree=phylogeny,
            pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs
        )
    except skbio.tree.MissingNodeError as e:
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(message)

    return results
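A minimal usage sketch (not part of the original snippet), assuming `phylogenetic_metrics()` includes 'unweighted_unifrac'; the tiny table and tree below are made up for illustration:

import numpy as np
import skbio
from io import StringIO
from biom import Table

toy_table = Table(np.array([[1, 3], [2, 0]]), ['O1', 'O2'], ['S1', 'S2'])
# A rooted tree with branch lengths whose tips cover both feature IDs
toy_tree = skbio.TreeNode.read(StringIO('(O1:0.5,O2:0.5)root;'))
dm = beta_phylogenetic(toy_table, toy_tree, metric='unweighted_unifrac')
print(dm.shape)  # (2, 2) distance matrix over S1 and S2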
Example no. 3
def beta(table: biom.Table, metric: str,
         pseudocount: int = 1, n_jobs: int = 1) -> skbio.DistanceMatrix:

    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    if metric == 'aitchison':
        counts += pseudocount
        metric = aitchison

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    sample_ids = table.ids(axis='sample')

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=sample_ids,
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs
    )
Example no. 4
def main(table_in, table_out, pathways, to_classic):
    # setup
    table = load_table(table_in)
    pathway_dict = get_pathway2kos()

    # get set of kos from pathways
    pathways_kos = set()
    for pathway in pathways:
        pathways_kos = pathways_kos | pathway_dict[pathway.strip()[-5:]]

    # get selected kos
    kos_to_keep = set(table.ids('observation')) & \
        pathways_kos
    if len(kos_to_keep) == 0:
        raise EmptySetERROR('Intersection created empty set')
    obs_ids = np.array(list(kos_to_keep))
    data = np.empty([len(obs_ids), len(table.ids('sample'))])
    for i, obs in enumerate(obs_ids):
        data[i] = table.data(obs, 'observation')

    # output
    new_table = Table(data, obs_ids, table.ids('sample'), type="OTU table")
    if to_classic:
        # write a tab-delimited (classic) BIOM table
        with open(table_out, 'w') as f:
            f.write(new_table.to_tsv())
    else:
        # write a JSON BIOM table
        with open(table_out, 'w') as f:
            new_table.to_json("filter_KOs_by_pathway.py", f)
Example no. 5
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    if axis == 'feature':
        biom_axis = 'observation'
    else:
        biom_axis = axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    grouped_table = table.collapse(
        lambda axis_id, _: metadata.get_value(axis_id),
        collapse_f=_mode_lookup[mode],
        axis=biom_axis,
        norm=False,
        include_collapsed_metadata=False)
    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    series = metadata.to_series()
    return grouped_table.sort_order(series.unique(), axis=biom_axis)
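For orientation, a minimal sketch of the underlying `biom.Table.collapse` call this wraps, using a plain dict in place of the QIIME 2 metadata column (all names and values below are made up):

import numpy as np
from biom import Table

toy = Table(np.array([[1, 2, 3], [4, 5, 6]]),
            ['O1', 'O2'], ['S1', 'S2', 'S3'])
groups = {'S1': 'gut', 'S2': 'gut', 'S3': 'skin'}
# norm=False keeps the summed counts within each group
collapsed = toy.collapse(lambda sid, _: groups[sid],
                         norm=False, axis='sample',
                         include_collapsed_metadata=False)
print(sorted(collapsed.ids()))  # ['gut', 'skin']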
Example no. 6
    def test_execute_job_error(self):
        # Create a prep template
        prep_info = {'SKB8.640193': {'col': 'val1'},
                     'SKD8.640184': {'col': 'val2'}}
        data = {'prep_info': dumps(prep_info),
                'study': 1,
                'data_type': '16S'}
        template = self.qclient.post(
            '/apitest/prep_template/', data=data)['prep']
        # Create a new validate job
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, 2))
        table = Table(data, ['O1', 'O2'], ['S1', 'S2'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        data = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                'parameters': dumps(
                    {'files': dumps({'biom': [biom_fp]}),
                     'template': template,
                     'artifact_type': 'BIOM'}),
                'artifact_type': 'BIOM',
                'status': 'queued'}
        job_id = self.qclient.post(
            '/apitest/processing_job/', data=data)['job']

        plugin("https://localhost:21174", job_id, self.out_dir)
        obs = self._wait_job(job_id)
        self.assertEqual(obs, 'error')
Example no. 7
def merge(table1: biom.Table, table2: biom.Table) -> biom.Table:
    table1_sids = set(table1.ids(axis='sample'))
    table2_sids = set(table2.ids(axis='sample'))
    if len(table1_sids & table2_sids) > 0:
        raise ValueError('Some samples are present in both tables: %s' %
                         ', '.join(table1_sids & table2_sids))
    return table1.merge(table2)
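A minimal usage sketch with made-up IDs, assuming biom and numpy are available:

import numpy as np
from biom import Table

t1 = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
t2 = Table(np.array([[5, 6], [7, 8]]), ['O1', 'O2'], ['S3', 'S4'])
merged = merge(t1, t2)
print(sorted(merged.ids()))  # ['S1', 'S2', 'S3', 'S4']
# merge(t1, t1) would raise ValueError because the sample IDs overlap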
Example no. 8
def filter_table(table: biom.Table, tree: skbio.TreeNode) -> biom.Table:
    """ Filter table to remove feature ids that are not tip ids in tree
    """
    tip_ids = set([t.name for t in tree.tips()])
    feature_ids = set(table.ids(axis='observation'))
    # ids_to_keep can only include ids that are in table
    ids_to_keep = tip_ids & feature_ids
    table.filter(ids_to_keep, axis='observation', inplace=True)
    return table
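A minimal usage sketch with a made-up two-tip tree; features absent from the tree ('O3' here) are removed:

import numpy as np
import skbio
from io import StringIO
from biom import Table

toy = Table(np.array([[1, 2], [3, 4], [5, 6]]),
            ['O1', 'O2', 'O3'], ['S1', 'S2'])
toy_tree = skbio.TreeNode.read(StringIO('(O1,O2)root;'))
filtered = filter_table(toy, toy_tree)
print(list(filtered.ids(axis='observation')))  # ['O1', 'O2']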
Example no. 9
def rarefy(table: biom.Table, sampling_depth: int) -> biom.Table:
    table = table.subsample(sampling_depth, axis='sample', by_id=False)

    if table.is_empty():
        raise ValueError('The rarefied table contains no samples or features. '
                         'Verify your table is valid and that you provided a '
                         'shallow enough sampling depth.')

    return table
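A minimal usage sketch with made-up counts; `subsample` drops samples whose total is below the requested depth:

import numpy as np
from biom import Table

toy = Table(np.array([[10, 0, 1], [20, 5, 1]]),
            ['O1', 'O2'], ['S1', 'S2', 'S3'])
rarefied = rarefy(toy, sampling_depth=5)
print(list(rarefied.ids()))         # ['S1', 'S2'] (S3 totals only 2)
print(rarefied.sum(axis='sample'))  # each remaining sample sums to 5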
Example no. 10
def alpha(table: biom.Table, metric: str) -> pd.Series:
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')

    result = skbio.diversity.alpha_diversity(metric=metric, counts=counts,
                                             ids=sample_ids)
    result.name = metric
    return result
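A minimal usage sketch with made-up counts, assuming `non_phylogenetic_metrics()` includes 'shannon' (a non-phylogenetic metric in scikit-bio):

import numpy as np
from biom import Table

toy = Table(np.array([[1, 0, 3], [2, 2, 0]]), ['O1', 'O2'], ['S1', 'S2', 'S3'])
shannon = alpha(toy, metric='shannon')
print(shannon)  # pandas Series indexed by sample ID, named 'shannon'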
Example no. 11
def _1(data: biom.Table) -> BIOMV100Format:
    data = _drop_axis_metadata(data)

    ff = BIOMV100Format()
    with ff.open() as fh:
        fh.write(data.to_json(generated_by=_get_generated_by()))
    return ff
Example no. 12
def beta(table: biom.Table, metric: str, n_jobs: int = 1) -> skbio.DistanceMatrix:
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=sample_ids,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs
    )
Example no. 13
def rename_deblur_biom(biom, name_stub='deblur', metadata_name='deblurred_seq'):
    seqs = biom.ids(axis='observation')

    seqnames = ['{0}{1}'.format(name_stub, x) for x in range(len(seqs))]

    seq_metadata = {seqname: {metadata_name: seq} for seq, seqname in zip(seqs, seqnames)}

    renamed_biom = Table(biom.matrix_data, 
                         seqnames,
                         biom.ids(axis='sample'),
                         biom.metadata(axis='observation'),
                         biom.metadata(axis='sample'),
                         table_id=biom.table_id + ' renamed')

    renamed_biom.add_metadata(seq_metadata, axis='observation')

    return renamed_biom
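A minimal usage sketch with made-up sequences; the input table needs a non-None `table_id` because the function appends ' renamed' to it:

import numpy as np
from biom import Table

seqs = ['ACGT', 'TTGA']
toy = Table(np.array([[1, 2], [3, 4]]), seqs, ['S1', 'S2'], table_id='demo')
renamed = rename_deblur_biom(toy)
print(list(renamed.ids(axis='observation')))    # ['deblur0', 'deblur1']
print(renamed.metadata(axis='observation')[0])  # includes {'deblurred_seq': 'ACGT'}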
Example no. 14
    def test_collapse_full(self):
        obs = collapse_full(table)
        exp = Table(array([[0.00769230769231], [0.0282051282051],
                           [0.0487179487179], [0.0692307692308],
                           [0.0897435897436], [0.110256410256],
                           [0.130769230769], [0.151282051282],
                           [0.171794871795], [0.192307692308]]),
                    observ_ids, ['average'],
                    observation_metadata=observ_metadata)
        for r in range(10):
            assert_almost_equal(obs[r, 0],  exp[r, 0])
        self.assertEqual(obs.ids(), exp.ids())
        self.assertItemsEqual(obs.ids('observation'), exp.ids('observation'))

        obs_meta = []
        for _, _, m in obs.iter(axis='observation'):
            obs_meta.append(m)
        self.assertItemsEqual(obs_meta, observ_metadata)
Example no. 15
    def test_biom_match(self):
        table = Table(
            np.array([[0, 0, 1, 1],
                      [2, 3, 4, 4],
                      [5, 5, 3, 3]]).T,
            ['a', 'b', 'c', 'd'],
            ['s2', 's3', 's4'])
        md = pd.DataFrame(
            {
                'x1': [1, 3, 2],
                'x2': [1, 1, 0]
            },
            columns=['s1', 's2', 's3']
        ).T

        exp_table = Table(
            np.array(
                [
                    [0, 0, 1, 1],
                    [2, 3, 4, 4]
                ]).T,
            ['a', 'b', 'c', 'd'],
            ['s2', 's3'])
        exp_md = pd.DataFrame(
            {
                'x1': [3, 2],
                'x2': [1, 0]
            },
            columns=['s2', 's3']
        ).T

        res_table, res_md = match(table, md)
        exp_df = pd.DataFrame(exp_table.to_dataframe())
        res_df = pd.DataFrame(res_table.to_dataframe())

        exp_df = exp_df.reindex(sorted(exp_df.columns), axis=1)
        res_df = res_df.reindex(sorted(res_df.columns), axis=1)

        pdt.assert_frame_equal(exp_df, res_df)

        exp_md = exp_md.reindex(sorted(exp_md.index), axis=0)
        res_md = res_md.reindex(sorted(res_md.index), axis=0)

        pdt.assert_frame_equal(res_md, exp_md)
Example no. 16
def beta(table: biom.Table, metric: str,
         pseudocount: int = 1, n_jobs: int = 1) -> skbio.DistanceMatrix:

    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        if (x < 0).any() or (y < 0).any():
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")

        nz = ((x > 0) | (y > 0))
        x_ = x[nz]
        y_ = y[nz]
        nnz = nz.sum()

        return (1. / nnz) * np.sum(np.abs(x_ - y_) / (x_ + y_))

    if metric == 'aitchison':
        counts += pseudocount
        metric = aitchison
    elif metric == 'canberra_adkins':
        metric = canberra_adkins

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    sample_ids = table.ids(axis='sample')

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=sample_ids,
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs
    )
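A minimal usage sketch with made-up counts, assuming `non_phylogenetic_metrics()` includes 'braycurtis' and 'aitchison':

import numpy as np
from biom import Table

toy = Table(np.array([[1, 4], [2, 5], [3, 6]]),
            ['O1', 'O2', 'O3'], ['S1', 'S2'])
dm_bc = beta(toy, metric='braycurtis')
dm_ait = beta(toy, metric='aitchison', pseudocount=1)
print(dm_bc.shape, dm_ait.shape)  # (2, 2) (2, 2)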
Example no. 17
def generate_per_sample_biom(biom_file, limit):
    """Generate per-sample BIOM files

    Parameters
    ----------
    biom_file : str
        A filepath to a BIOM table
    limit : int or None
        Limit the number of tables to load

    Returns
    -------
    str
        The sample ID
    str
        The table in BIOM Format v1.0
    str
        The table in the classic OTU table format
    """
    table = load_table(biom_file)
    obs_ids = table.ids(axis='observation')
    obs_md = table.metadata(axis='observation')

    if limit is None:
        limit = np.inf

    count = 0
    for v, sample, _ in table.iter():
        if count >= limit:
            break

        single_sample = Table(v[:, np.newaxis], obs_ids, [sample], obs_md)
        single_sample.filter(lambda v_, i, md: v_ > 0, axis='observation')
        biomv1 = single_sample.to_json('AG')
        biomtxt = single_sample.to_tsv(
            header_key='taxonomy',
            header_value='taxonomy',
            metadata_formatter=lambda x: '; '.join(x))
        yield (sample, biomv1, biomtxt)
        count += 1
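A minimal usage sketch; 'input.biom' is a hypothetical file path and the output file names are only illustrative:

# Write per-sample outputs for the first two samples of a hypothetical table.
for sample_id, biom_json, classic_txt in generate_per_sample_biom('input.biom',
                                                                  limit=2):
    with open('%s.biom' % sample_id, 'w') as out:
        out.write(biom_json)
    with open('%s.txt' % sample_id, 'w') as out:
        out.write(classic_txt)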
Example no. 18
def alpha_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                       metric: str) -> pd.Series:
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        result = skbio.diversity.alpha_diversity(metric=metric,
                                                 counts=counts,
                                                 ids=sample_ids,
                                                 otu_ids=feature_ids,
                                                 tree=phylogeny)
    except skbio.tree.MissingNodeError as e:
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(message)

    result.name = metric
    return result
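A minimal usage sketch, assuming `phylogenetic_metrics()` includes 'faith_pd'; the table and tree are made up:

import numpy as np
import skbio
from io import StringIO
from biom import Table

toy = Table(np.array([[1, 3], [2, 0]]), ['O1', 'O2'], ['S1', 'S2'])
toy_tree = skbio.TreeNode.read(StringIO('(O1:0.25,O2:0.75)root;'))
pd_values = alpha_phylogenetic(toy, toy_tree, metric='faith_pd')
print(pd_values)  # pandas Series of Faith's PD per sample, named 'faith_pd'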
Example no. 19
def collapse_full(_bt):
    """Collapses full biom table to median of each OTU

    Parameters
    ----------
    _bt : biom table
        Table to collapse

    Returns
    -------
    biom table
        Collapsed biom table, one sample containing median of each OTU,
        normalized.
    """
    num_obs = len(_bt.ids(axis='observation'))
    table = Table(np.array(
        [np.median(v) for v in _bt.iter_data(axis='observation')]).reshape(
        (num_obs, 1)),
        _bt.ids(axis='observation'), ['average'],
        observation_metadata=_bt.metadata(axis='observation'))
    table.norm(inplace=True)
    return table
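A minimal usage sketch with made-up counts; the collapsed table has a single 'average' sample holding the normalized per-OTU medians:

import numpy as np
from biom import Table

toy = Table(np.array([[0, 1, 2, 3],
                      [4, 5, 6, 7],
                      [8, 9, 10, 11]]),
            ['O1', 'O2', 'O3'], ['S1', 'S2', 'S3', 'S4'])
collapsed = collapse_full(toy)
print(list(collapsed.ids()))                    # ['average']
print(list(collapsed.ids(axis='observation')))  # ['O1', 'O2', 'O3']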
Example no. 20
    def setUp(self):
        # Register the URIs for the QiitaClient
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/authenticate/",
            body='{"access_token": "token", "token_type": "Bearer", '
                 '"expires_in": "3600"}')

        self.qclient = QiitaClient('https://test_server.com', 'client_id',
                                   'client_secret')
        # Create a biom table
        fd, self.biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['1.S1', '1.S2', '1.S3'])
        with biom_open(self.biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self.out_dir = mkdtemp()
        self.artifact_id = 4
        self.parameters = {'input_data': self.artifact_id}

        self._clean_up_files = [self.biom_fp, self.out_dir]
Example no. 21
    def _create_job_and_biom(self, sample_ids, template=None, analysis=None):
        # Create the BIOM table that needs to be validated
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, len(sample_ids)))
        table = Table(data, ['O1', 'O2'], sample_ids)
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self._clean_up_files.append(biom_fp)

        # Create a new job
        parameters = {'template': template,
                      'files': dumps({'biom': [biom_fp]}),
                      'artifact_type': 'BIOM',
                      'analysis': analysis}
        data = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                'parameters': dumps(parameters),
                'status': 'running'}
        res = self.qclient.post('/apitest/processing_job/', data=data)
        job_id = res['job']

        return biom_fp, job_id, parameters
Example no. 22
def filter_seqs(data: pd.Series, table: biom.Table = None,
                metadata: qiime2.Metadata = None, where: str = None,
                exclude_ids: bool = False) -> pd.Series:
    if table is not None and metadata is not None:
        raise ValueError('Filtering with metadata and filtering with a table '
                         'are mutually exclusive.')
    elif table is None and metadata is None:
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')
    elif table is not None:
        ids_to_keep = table.ids(axis='observation')
    else:
        # Note, no need to check for missing feature IDs in the metadata,
        # because that is basically the point of this method.
        ids_to_keep = metadata.get_ids(where=where)

    if exclude_ids is True:
        ids_to_keep = set(data.index) - set(ids_to_keep)
    filtered = data[data.index.isin(ids_to_keep)]
    if filtered.empty is True:
        raise ValueError('All features were filtered out of the data.')
    return filtered
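A minimal usage sketch using plain strings in place of sequence objects (made-up IDs), keeping only the features present in the table:

import numpy as np
import pandas as pd
from biom import Table

seqs = pd.Series({'O1': 'ACGT', 'O2': 'TTGA', 'O3': 'GGCC'})
toy = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O3'], ['S1', 'S2'])
kept = filter_seqs(seqs, table=toy)
print(sorted(kept.index))  # ['O1', 'O3']
# With exclude_ids=True the complement ('O2') would be kept instead.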
Example no. 23
class TestFilters(unittest.TestCase):

    def setUp(self):
        X = np.array(
            [[10, 1, 4, 1, 4, 0],
             [0, 0, 2, 0, 2, 8],
             [0, 1, 2, 1, 2, 4],
             [0, 1, 0, 1, 0, 0],
             [2, 0, 0, 0, 0, 0],
             [1, 0, 0, 0, 0, 0],
             [7, 1, 0, 1, 0, 0]]
        )
        oids = ['o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7']
        sids = ['s1', 's2', 's3', 's4', 's5', 's6']

        self.metadata = pd.DataFrame(
            np.vstack(
                (
                    np.ones(8),
                    np.array(['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a']),
                    np.arange(8).astype(np.float64),
                    np.array(['Test', 'Test', 'Train', 'Train',
                              'Train', 'Train', 'Test', 'Train'])
                )
            ).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8']
        )
        self.metadata['continuous'] = self.metadata[
            'continuous'].astype(np.float64)
        self.trimmed_metadata = self.metadata.loc[
            ['s1', 's2', 's3', 's4', 's5', 's6']
        ]

        self.table = Table(X, oids, sids)

    def test_match_and_filter_no_filter(self):
        formula = 'C(categorical) + continuous'
        res = match_and_filter(self.table, self.metadata, formula,
                               min_sample_count=0, min_feature_count=0)
        res_table, res_metadata, res_design = res

        pdt.assert_frame_equal(res_table.to_dataframe(),
                               self.table.to_dataframe())

        exp_metadata = pd.DataFrame(
            np.vstack(
                (
                    np.ones(6),
                    np.array(['a', 'a', 'b', 'b', 'a', 'a']),
                    np.arange(6).astype(np.float64),
                    np.array(['Test', 'Test', 'Train', 'Train',
                              'Train', 'Train'])
                )
            ).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6']
        )
        exp_metadata['continuous'] = exp_metadata[
            'continuous'].astype(np.float64)
        pdt.assert_frame_equal(res_metadata, exp_metadata)
        exp_design = pd.DataFrame(
            np.vstack(
                (
                    np.ones(6),
                    np.array([0, 0, 1, 1, 0, 0]),
                    np.arange(6).astype(np.float64)
                )
            ).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6']
        )

        pdt.assert_frame_equal(res_design, exp_design)

    def test_split_training_random(self):
        np.random.seed(0)
        design = pd.DataFrame(
            np.vstack(
                (
                    np.ones(6),
                    np.array([0, 0, 1, 1, 0, 0]),
                    np.arange(6)
                )
            ).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6']
        )
        res = split_training(self.table.to_dataframe().T,
                             self.trimmed_metadata, design,
                             training_column=None,
                             num_random_test_examples=2)

        trainX, testX, trainY, testY = res
        # print(trainX.shape, testX.shape, trainY.shape, testY.shape)
        npt.assert_allclose(trainX.shape, np.array([4, 3]))
        npt.assert_allclose(trainY.shape, np.array([4, 7]))

        npt.assert_allclose(testX.shape, np.array([2, 3]))
        npt.assert_allclose(testY.shape, np.array([2, 7]))

    def test_split_training_fixed(self):
        np.random.seed(0)
        design = pd.DataFrame(
            np.vstack(
                (
                    np.ones(6),
                    np.array([0, 0, 1, 1, 0, 0]),
                    np.arange(6)
                )
            ).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6']
        )
        t = self.table.to_dataframe().T
        res = split_training(t,
                             self.metadata, design,
                             training_column='train',
                             num_random_test_examples=2)

        exp_trainX = design.iloc[2:].values
        exp_testX = design.iloc[:2].values
        exp_trainY = t.iloc[2:].values
        exp_testY = t.iloc[:2].values

        res_trainX, res_testX, res_trainY, res_testY = res

        npt.assert_allclose(exp_trainX, res_trainX)
        npt.assert_allclose(exp_trainY, res_trainY)
        npt.assert_allclose(exp_testX, res_testX)
        npt.assert_allclose(exp_testY, res_testY)
Example no. 24
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example no. 25
def match_biom_tables(observed_table,
                      expected_table_keep,
                      verbose=False,
                      limit_to_expected_observations=False,
                      limit_to_observed_observations=False,
                      normalize=False,
                      shuffle_samples=False):

    expected_table = expected_table_keep.copy()

    overlapping_obs_ids = list(
        set(observed_table.ids(axis='observation'))
        & set(expected_table.ids(axis='observation')))

    if len(overlapping_obs_ids) < 1:
        print "obs ids:", observed_table.ids(axis='observation')[0:10]
        print "exp ids:", expected_table.ids(axis='observation')[0:10]

        raise ValueError,\
         "No observation ids are in common  between the observed and expected tables, so no evaluations can be performed."

    if limit_to_expected_observations:

        def f(data_vector, id_, metadata):
            return (id_ in overlapping_obs_ids)

        observed_table = observed_table.filter(f,
                                               axis='observation',
                                               inplace=False)

    if limit_to_observed_observations:

        def f(data_vector, id_, metadata):
            return (id_ in overlapping_obs_ids)

        expected_table = expected_table.filter(f,
                                               axis='observation',
                                               inplace=False)

    ### Make tables have the same set (and number) of ObservationIds, in the same order ###
    # 1) identify ObservationIds unique to each table
    unique_obs_in_expected = list(
        set(expected_table.ids(axis='observation')) -
        set(observed_table.ids(axis='observation')))
    unique_obs_in_observed = list(
        set(observed_table.ids(axis='observation')) -
        set(expected_table.ids(axis='observation')))

    # 2) Add each missing observation with all 0's

    if unique_obs_in_observed:
        empty_obs_data = [[0] * len(expected_table.ids())
                          ] * len(unique_obs_in_observed)
        empty_obs_table = Table(empty_obs_data, unique_obs_in_observed,
                                expected_table.ids())
        expected_table = expected_table.merge(empty_obs_table)

    if unique_obs_in_expected:
        empty_obs_data = [[0] * len(observed_table.ids())
                          ] * len(unique_obs_in_expected)
        empty_obs_table = Table(empty_obs_data, unique_obs_in_expected,
                                observed_table.ids())
        observed_table = observed_table.merge(empty_obs_table)

    # 3) sort the ObservationIds so they are in the same order between the tables

    if verbose:
        print "Sorting observations in expected table to match observed table..."
    expected_table = expected_table.sort_order(
        observed_table.ids(axis='observation'), axis='observation')

    overlapping_sample_ids = list(
        set(observed_table.ids()) & set(expected_table.ids()))

    if verbose:
        num_uniq_obs_sample_ids = len(
            observed_table.ids()) - len(overlapping_sample_ids)
        num_uniq_exp_sample_ids = len(
            expected_table.ids()) - len(overlapping_sample_ids)
        if num_uniq_obs_sample_ids:
            print "Num observed samples not in expected: {0}".format(
                num_uniq_obs_sample_ids)
        if num_uniq_exp_sample_ids:
            print "Num expected samples not in observed: {0}".format(
                num_uniq_exp_sample_ids)
        print "Num samples with same id: {0}".format(
            len(overlapping_sample_ids))

    if normalize:
        if verbose:
            print "Normalizing tables..."
        observed_table = observed_table.norm(axis='sample', inplace=False)
        expected_table = expected_table.norm(axis='sample', inplace=False)

    if verbose:
        print "Extracting data from biom objects..."
    # create lists to contain filtered data - we're going to need the data in
    # numpy arrays, so it makes sense to compute this way rather than filtering
    # the tables
    obs_data = {}
    exp_data = {}

    # build lists of filtered data
    for sample_id in overlapping_sample_ids:
        exp_data[sample_id] = expected_table.data(sample_id)

    if shuffle_samples:
        if verbose:
            print "Randomly shufflying sample ids..."
        sample_ids_to_shuffle = overlapping_sample_ids[:]
        shuffle(sample_ids_to_shuffle)

        for index in range(len(overlapping_sample_ids)):
            obs_data[overlapping_sample_ids[index]] = observed_table.data(
                sample_ids_to_shuffle[index])
    else:
        for sample_id in overlapping_sample_ids:
            obs_data[sample_id] = observed_table.data(sample_id)

    return obs_data, exp_data
Example no. 26
from biom import Table
from numpy import array

REGULAR_BIOM_TABLE = Table(data=array([
    [6.0, 0.0],
    [141.0, 67.0],
    [0.0, 6.0],
    [260.0, 601.0],
    [6128.0, 393.0],
    [35.0, 0.0],
    [0.0, 262.0],
    [0.0, 7.0],
    [19.0, 0.0]
]),
    observation_ids=['Dill cryptic virus 2', 'Enterobacteria phage T4', 'Hepatitis C virus',
                     'Human papillomavirus type 90', 'Lactobacillus phage Lv 1',
                     'Merkel cell polyomavirus', 'Mycobacterium phage Adler',
                     'Propionibacterium phge P105', 'Staphylococcus phage PH15'],
    sample_ids=['vag_intr_SRS014465.fasta', 'vag_intr_SRS015071.fasta'],
    sample_metadata=[{'name': 'vag_intr_SRS014465.fasta', 'file_id': '762b8657-bc6c-4c2f-8572-6be4df1adfc9',
                      'dataset_id': 'c1a84ab2-bca8-414b-9132-de4981426ba1', 'reads_total': 875954, 'label': 'label0',
                      'label_name': 'No Label'},
                     {'name': 'vag_intr_SRS015071.fasta', 'file_id': '8be2175c-6106-459f-859b-1e8f8bc6b0e8',
                      'dataset_id': '11d395b8-abfc-4571-aab6-e734b5d33885', 'reads_total': 507176, 'label': 'label0',
                      'label_name': 'No Label'}],
    table_id='05775205-8479-4346-9866-a45bdb449d70',
    type="OTU table")

REGULAR_BIOM_SAMPLE_META = {'vag_intr_SRS014465.fasta': {'id': 'vag_intr_SRS014465.fasta',
                                                         'metadata': {'name': 'vag_intr_SRS014465.fasta',
                                                                      'file_id': '762b8657-bc6c-4c2f-8572-6be4df1adfc9',
Example no. 27
def percentile_normalize(table: biom.Table,
                         metadata: qiime2.MetadataColumn,
                         batch: qiime2.MetadataColumn = None,
                         n_control_thresh: int = 10,
                         otu_thresh: float = 0.3) -> biom.Table:
    """
    Converts an input table with cases and controls into percentiles
    of control samples.

    Parameters
    ----------
    table : biom.Table
        Feature table with relative abundances. Samples are in columns,
        features (i.e. OTUs) are in rows.
    metadata : qiime2.CategoricalMetadataColumn
        metadata column with samples labeled as "case" or "control".
        All samples with either label are returned, normalized to the
        equivalent percentile in "control" samples.
    batch : qiime2.CategoricalMetadataColumn
        metadata column with the different batches labeled. Percentile
        normalization will be performed within each batch, and the output
        tables will be concatenated together. You can use this to normalize
        multiple studies at once by first merging the original feature table,
        adding a study ID column in the merged metadata, and then calling
        percentile normalization with this option.
    n_control_thresh : int [default=10]
        Minimum number of controls accepted to perform percentile
        normalization. Because the transformation converts abundances
        in controls to a uniform distribution, we *highly* discourage
        performing percentile normalization on datasets with fewer than
        30 controls, and certainly not fewer than 10 (the default value).
        If you have fewer controls than `n_control_thresh`, the
        normalization will return an error.
    otu_thresh : float [default=0.3]
        The OTU filtering threshold: OTUs must be present in at least
        otu_thresh fraction of cases OR controls, otherwise it gets thrown
        out and not percentile normalized. This method does not perform
        well with very sparse OTUs, so we do not recommend lowering
        this threshold below 0.3. otu_thresh should be in [0, 1].

    Returns
    -------
    norm_biom : biom.Table
        A biom table with the normalized data, only including the samples
        that were labeled as either "case" or "control", and the OTUs
        which passed the otu_thresh threshold.
    """
    # Filter metadata to only include IDs present in the table.
    # Also ensures every feature table sample ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))
    metadata = metadata.drop_missing_values()

    # filter the table to exclude samples that were dropped from
    # the metadata due to missing values
    table = table.filter(metadata.ids)

    metadata = metadata.to_series()

    ## Convert biom Table into dense pandas dataframe
    # Transpose so samples are in rows and OTUs/features in columns
    df = table.to_dataframe().to_dense().T

    # Set up a list of metadata series, one per batch
    batches_to_norm = []
    if batch is not None:
        batch = batch.filter_ids(table.ids(axis='sample'))
        batch = batch.drop_missing_values()
        batch = batch.to_series()
        for g, one_batch in batch.groupby(batch):
            batches_to_norm.append(metadata.loc[one_batch.index])
    else:
        batches_to_norm.append(metadata)

    norm_dfs = []
    for meta in batches_to_norm:
        # Get case and control samples from metadata
        control_samples = meta[meta == "control"].index.tolist()
        case_samples = meta[meta == "case"].index.tolist()

        # Check that there are cases and controls
        if len(control_samples) == 0:
            if len(case_samples) == 0:
                # Both cases and controls are zero
                raise ValueError(
                    'There are no case or control samples in your data. Check the metadata column for "case" and "control" labels.'
                )
            # Just controls are zero
            raise ValueError(
                'There are no control samples in your data. Check the metadata column for "control" labels.'
            )
        # Just cases are zero
        elif len(case_samples) == 0:
            raise ValueError(
                'There are no case samples in your data. Check the metadata column for "case" labels.'
            )

        # Make sure there are enough controls to perform normalization
        if len(control_samples) < n_control_thresh:
            if batch is not None:
                batch_err = (' in batch ' +
                             str(batch.loc[meta.index].unique()[0]))
            else:
                batch_err = ''
            raise ValueError(
                "There aren't enough controls in your data{} "
                "(n_control_thresh = {}).".format(batch_err, n_control_thresh))

        # Filter OTUs, replace zeros with random value, and
        # percentile normalize
        norm_df = _percentile_normalize_one_df(df, control_samples,
                                               case_samples, otu_thresh)
        norm_dfs.append(norm_df)

    # Merge all normalized data
    # Keep all samples and all OTUs - OTUs not present in one batch will be NaNs
    norm_df = pd.concat(norm_dfs, axis=1)

    # Put this dataframe into biom format
    norm_biom = biom.Table(data=norm_df.values,
                           observation_ids=norm_df.index,
                           sample_ids=norm_df.columns)

    return norm_biom
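For orientation, a toy numpy/scipy sketch of the core idea (each case abundance becomes its percentile within that OTU's control distribution); the plugin additionally filters sparse OTUs and replaces zeros with small random draws, which this sketch omits:

import numpy as np
from scipy.stats import percentileofscore

# Made-up relative abundances: rows are OTUs, columns are samples.
controls = np.array([[0.10, 0.20, 0.30, 0.40],
                     [0.00, 0.10, 0.10, 0.20]])
cases = np.array([[0.25, 0.35],
                  [0.05, 0.15]])

percentiles = np.array([
    [percentileofscore(ctrl_row, v, kind='mean') for v in case_row]
    for ctrl_row, case_row in zip(controls, cases)
])
print(percentiles)  # case abundances expressed as control percentiles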
Example no. 28
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # If the user has not passed a different reference collection for the pre-filter,
    # use the input refseqs_fp for all iterations. We want to pre-filter all data against
    # the input data, as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                                refseqs_fp=refseqs_fp,
                                                output_dir=iteration_output_dir,
                                                percent_subsample=percent_subsample,
                                                new_ref_set_id='.'.join(
                                                    [new_ref_set_id, str(i)]),
                                                command_handler=command_handler,
                                                params=params,
                                                qiime_config=qiime_config,
                                                run_assign_tax=False,
                                                run_align_and_tree=False,
                                                prefilter_refseqs_fp=prefilter_refseqs_fp,
                                                prefilter_percent_id=prefilter_percent_id,
                                                min_otu_size=min_otu_size,
                                                step1_otu_map_fp=step1_otu_map_fp,
                                                step1_failures_fasta_fp=step1_failures_fasta_fp,
                                                parallel=parallel,
                                                suppress_step4=suppress_step4,
                                                logger=logger,
                                                suppress_md5=suppress_md5,
                                                suppress_index_page=True,
                                                denovo_otu_picking_method=denovo_otu_picking_method,
                                                reference_otu_picking_method=reference_otu_picking_method,
                                                status_update_callback=status_update_callback)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' %
            (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
Example no. 29
def rarefy(table: biom.Table, sampling_depth: int) -> biom.Table:
    return table.subsample(sampling_depth, axis='sample', by_id=False)
Example no. 30
def deposit(output_dir, table1, table2, metadata, U, V, B, it, rep):
    """ Writes down tables, metadata and feature metadata into files.

    Parameters
    ----------
    output_dir : str
        output directory
    table1 : pd.DataFrame
        Microbe abundance table (samples x features)
    table2 : pd.DataFrame
        Metabolite abundance table (samples x features)
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Microbial latent variables
    V : np.array
        Metabolite latent variables
    B : np.array
        Coefficient matrix (written to file for ground truthing)
    it : int
        iteration number
    rep : int
        repetition number
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    output_microbes = "%s/table_microbes.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_md = "%s/metadata.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_U = "%s/U.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_V = "%s/V.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_B = "%s/B.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_ranks = "%s/ranks.%d_%s.txt" % (
        output_dir, it, choice[rep])

    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = clr(softmax(np.hstack(
        (np.zeros((U.shape[0], 1)), U @ V))))
    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(
        ranks, index=table1.ids(axis='observation'),
        columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    np.savetxt(output_B, B)
    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
Example no. 31
def relative_frequency(table: biom.Table) -> biom.Table:
    """ Convert feature table in-place from frequencies to relative frequencies
    """
    table.norm(axis='sample', inplace=True)
    return table
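A minimal usage sketch with made-up counts; after normalization each sample sums to 1:

import numpy as np
from biom import Table

toy = Table(np.array([[2, 0], [2, 10]]), ['O1', 'O2'], ['S1', 'S2'])
rel = relative_frequency(toy)
print(rel.sum(axis='sample'))  # [1. 1.]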
Example no. 32
def transpose(table: biom.Table) -> biom.Table:
    transposed_table = table.transpose()
    return transposed_table
Example no. 33
def presence_absence(table: biom.Table) -> biom.Table:
    """ Convert feature table in-place to presence/absence data
    """
    table.pa(inplace=True)
    return table
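A minimal usage sketch with made-up counts; non-zero values become 1:

import numpy as np
from biom import Table

toy = Table(np.array([[0, 5], [3, 0]]), ['O1', 'O2'], ['S1', 'S2'])
pa = presence_absence(toy)
print(pa.matrix_data.toarray())  # rows O1, O2 -> [[0. 1.], [1. 0.]]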
Example no. 34
def ctf_helper(
    table: biom.Table,
    sample_metadata: DataFrame,
    individual_id_column: str,
    state_columns: list,
    n_components: int = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations_als: int = DEFAULT_TENSALS_MAXITER,
    max_iterations_rptm: int = DEFAULT_TENSALS_MAXITER,
    n_initializations: int = DEFAULT_TENSALS_MAXITER,
    feature_metadata: DataFrame = DEFFM
) -> (dict, OrdinationResults, dict, tuple):
    """ Runs  Compositional Tensor Factorization CTF.
    """

    # validate the metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]
    # validate the metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()
    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)
    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError(("No more features left.  Check to make "
                              "sure that the sample names between "
                              "`feature-metadata` and `table` are "
                              "consistent"))
        feature_metadata = feature_metadata.reindex(fidx)
    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError(("No more features left.  Check to make sure that "
                          "the sample names between `sample-metadata` and"
                          " `table` are consistent"))
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(sidx)

    # filter and import table
    for axis, min_sum in zip(['sample', 'observation'],
                             [min_sample_count, min_feature_count]):
        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
                             axis=axis,
                             inplace=True)

    # table to dataframe
    table = DataFrame(table.matrix_data.toarray(), table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata, individual_id_column,
                     state_columns)

    # factorize
    TF = TensorFactorization(n_components=n_components,
                             max_als_iterations=max_iterations_als,
                             max_rtpm_iterations=max_iterations_rptm,
                             n_initializations=n_initializations).fit(
                                 tensor_rclr(tensor.counts))
    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)
    # save distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata
        ids = straj.index
        ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted([ind_dict[ind] for ind in inter])
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])
        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(short_method_name,
                                 long_method_name,
                                 TF.eigvals,
                                 samples=cond[keep_PC].dropna(axis=0),
                                 features=TF.features[keep_PC].dropna(axis=0),
                                 proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond
        # add the sample metadata before returning output;
        # additionally, only keep metadata with trajectory
        # output available.
        pre_merge_cols = list(straj.columns)
        straj = concat(
            [straj.reindex(all_sample_metadata.index), all_sample_metadata],
            axis=1,
            sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure index name for q2
        straj.index.name = "#SampleID"
        # save traj.
        keep_PC_traj = [col for col in straj.columns if 'PC' in col]
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj
    return (state_ordn, subj_ordin, distances, subject_trajectories,
            feature_trajectories)
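
A minimal usage sketch for the factorization routine above. The enclosing function's name is not visible in this excerpt, so `ctf` below is a hypothetical name, and the file paths and metadata column names are placeholders; it assumes a repeated-measures design with one individual-ID column and one state (e.g., time point) column.

# Hypothetical invocation of the CTF routine above; the function name,
# file paths and column names are assumptions, not part of the original code.
import biom
import pandas as pd

table = biom.load_table('feature-table.biom')
metadata = pd.read_csv('sample-metadata.tsv', sep='\t', index_col=0)

state_ordn, subj_ordin, distances, subj_traj, feat_traj = ctf(
    table,
    sample_metadata=metadata,
    individual_id_column='host_subject_id',   # assumed column name
    state_columns=['timepoint'],              # assumed column name
    n_components=3)

The five returned objects correspond to the per-state ordinations, the subject biplot, the per-state distance matrices, and the subject and feature trajectories, matching the return statement above.
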
Esempio n. 35
0
def cluster_features_closed_reference(sequences: DNAFASTAFormat,
                                      table: biom.Table,
                                      reference_sequences: DNAFASTAFormat,
                                      perc_identity: float,
                                      strand: str = 'plus',
                                      threads: int = 1
                                      ) -> (biom.Table, DNAFASTAFormat,
                                            DNAFASTAFormat):

    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {e.metadata['id'] for e in skbio.io.read(
                    str(sequences), constructor=skbio.DNA, format='fasta')}
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)
    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = ['vsearch',
               '--usearch_global', fasta_with_sizes.name,
               '--id', str(perc_identity),
               '--db', str(reference_sequences),
               '--uc', out_uc.name,
               '--strand', str(strand),
               '--qmask', 'none',  # ensures no lowercase DNA chars
               '--notmatched', tmp_unmatched_seqs.name,
               '--threads', str(threads)]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences; if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the unmatched sequences; this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be ideal if --usearch_global,
            # above, let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = ['vsearch',
                   '--sortbysize', tmp_unmatched_seqs.name,
                   '--xsize',
                   '--output', str(unmatched_seqs)]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError:
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.')

        unmatched_ids = [e.metadata['id']
                         for e in skbio.io.read(open(str(unmatched_seqs)),
                                                constructor=skbio.DNA,
                                                format='fasta')]
    table.filter(ids_to_keep=unmatched_ids, invert=True, axis='observation',
                 inplace=True)
    table = table.collapse(collapse_f, norm=False, min_group_size=1,
                           axis='observation',
                           include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
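
A hedged sketch of calling the closed-reference clustering function above outside of a QIIME 2 pipeline. The file paths are placeholders, and constructing `DNAFASTAFormat` objects directly from existing files with mode='r' is an assumption about the q2-types API rather than something shown in this example.

# Assumed direct usage; paths and the DNAFASTAFormat construction are illustrative.
import biom
from q2_types.feature_data import DNAFASTAFormat

seqs = DNAFASTAFormat('rep-seqs.fasta', mode='r')        # query feature sequences
ref = DNAFASTAFormat('reference-seqs.fasta', mode='r')   # reference sequences
table = biom.load_table('feature-table.biom')            # feature table to collapse

clustered_table, matched, unmatched = cluster_features_closed_reference(
    sequences=seqs,
    table=table,
    reference_sequences=ref,
    perc_identity=0.97,
    threads=4)
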
Esempio n. 36
0
def deposit_biofilms(output_dir,
                     abs_table1, abs_table2,
                     rel_table1, rel_table2,
                     edges, metadata, sample_id):
    """ Writes down tables and edges into files.

    Parameters
    ----------
    output_dir : str
        output directory
    abs_table1 : pd.DataFrame
        Samples-by-features table of absolute microbe abundances
    abs_table2 : pd.DataFrame
        Samples-by-features table of absolute metabolite abundances
    rel_table1 : pd.DataFrame
        Samples-by-features table of relative microbe abundances
    rel_table2 : pd.DataFrame
        Samples-by-features table of relative metabolite abundances
    edges : list
        Edge list for ground truthing.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    sample_id : str
        sample id
    """
    output_abs_microbes = "%s/table.abs.microbes.%s.biom" % (
        output_dir, sample_id)
    output_abs_metabolites = "%s/table.abs.metabolites.%s.biom" % (
        output_dir, sample_id)
    output_rel_microbes = "%s/table.rel.microbes.%s.biom" % (
        output_dir, sample_id)
    output_rel_metabolites = "%s/table.rel.metabolites.%s.biom" % (
        output_dir, sample_id)
    output_md = "%s/metadata.%s.txt" % (
        output_dir, sample_id)
    output_U = "%s/U.%s.txt" % (
        output_dir, sample_id)
    output_V = "%s/V.%s.txt" % (
        output_dir, sample_id)
    output_edges = "%s/edges.%s.txt" % (
        output_dir, sample_id)
    output_ranks = "%s/ranks.%s.txt" % (
        output_dir, sample_id)

    # idx1 = table1.sum(axis=0) > 0
    # idx2 = table2.sum(axis=0) > 0
    # table1 = table1.loc[:, idx1]
    # table2 = table2.loc[:, idx2]

    # relative abundances
    table1 = Table(rel_table1.values.T, rel_table1.columns, rel_table1.index)
    table2 = Table(rel_table2.values.T, rel_table2.columns, rel_table2.index)
    with biom_open(output_rel_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_rel_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    # absolute abundances
    table1 = Table(abs_table1.values.T, abs_table1.columns, abs_table1.index)
    table2 = Table(abs_table2.values.T, abs_table2.columns, abs_table2.index)
    with biom_open(output_abs_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_abs_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    pd.DataFrame(edges).to_csv(output_edges, sep='\t')
    metadata.to_csv(output_md, sep='\t')
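
Note that the tables passed to deposit_biofilms are pandas DataFrames indexed by sample with features as columns (they are transposed into biom Tables via `.values.T` above). A small, hedged sketch with toy data follows; all names, shapes and values are illustrative only, and it assumes `output_dir` already exists.

# Toy inputs for deposit_biofilms; everything here is made up for illustration.
import numpy as np
import pandas as pd

samples = ['s1', 's2', 's3']
abs_microbes = pd.DataFrame(np.random.poisson(10, (3, 4)), index=samples,
                            columns=['m1', 'm2', 'm3', 'm4'])
abs_metabolites = pd.DataFrame(np.random.poisson(10, (3, 5)), index=samples,
                               columns=['c1', 'c2', 'c3', 'c4', 'c5'])
rel_microbes = abs_microbes.div(abs_microbes.sum(axis=1), axis=0)
rel_metabolites = abs_metabolites.div(abs_metabolites.sum(axis=1), axis=0)
metadata = pd.DataFrame({'group': ['a', 'a', 'b']}, index=samples)
edges = [('m1', 'c1'), ('m2', 'c3')]   # ground-truth microbe-metabolite edges

deposit_biofilms('output_dir', abs_microbes, abs_metabolites,
                 rel_microbes, rel_metabolites, edges, metadata, sample_id=0)
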
Esempio n. 37
0
    def test_filter_biom(self):
        table = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'G1': 4,
                    'G2': 5,
                    'G3': 8
                },
                'S2': {
                    'G1': 2,
                    'G4': 3,
                    'G5': 7
                },
                'S3': {
                    'G2': 3,
                    'G5': 5
                }
            })))
        obs = filter_biom(table, th=3)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'G1': 4,
                    'G2': 5,
                    'G3': 8
                },
                'S2': {
                    'G4': 3,
                    'G5': 7
                },
                'S3': {
                    'G2': 3,
                    'G5': 5
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        obs = filter_biom(table, th=4)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'G1': 4,
                    'G2': 5,
                    'G3': 8
                },
                'S2': {
                    'G5': 7
                },
                'S3': {
                    'G5': 5
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        obs = filter_biom(table, th=6)
        exp = Table(*map(
            np.array, prep_table({
                'S1': {
                    'G3': 8
                },
                'S2': {
                    'G5': 7
                },
                'S3': {}
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        obs = filter_biom(table, th=0.25)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'G2': 5,
                    'G3': 8
                },
                'S2': {
                    'G4': 3,
                    'G5': 7
                },
                'S3': {
                    'G2': 3,
                    'G5': 5
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        obs = filter_biom(table, th=0.5)
        exp = Table(*map(
            np.array, prep_table({
                'S1': {},
                'S2': {
                    'G5': 7
                },
                'S3': {
                    'G5': 5
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        # empty BIOM table cannot be directly compared
        obs = filter_biom(table, th=10)
        self.assertTupleEqual(obs.to_dataframe(True).shape, (0, 3))
Esempio n. 38
0
def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep, output_dir):
    """ Writes down tables, metadata and feature metadata into files.

    Parameters
    ----------
    table : biom.Table
        Biom table
    metadata : pd.DataFrame
        Dataframe of sample metadata
    feature_metadata : pd.DataFrame
        Dataframe of features metadata
    it : int
        iteration number
    rep : int
        repetition number
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    output_microbes = "%s/table_microbes.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_md = "%s/metadata.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_U = "%s/U.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_V = "%s/V.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_B = "%s/edges.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_ranks = "%s/ranks.%d_%s.txt" % (
        output_dir, it, choice[rep])

    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)

    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks, index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    # restrict the ground-truth edge matrix to the retained microbes
    # (assumes `edges` is a 2-D array with microbes along its columns)
    B = edges[:, idx1]

    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    np.savetxt(output_B, B)
Esempio n. 39
0
 def test_biom_add_metacol(self):
     obs = Table(*map(
         np.array,
         prep_table({
             'S1': {
                 'G1': 4,
                 'G2': 5,
                 'G3': 8,
                 'G4': 0,
                 'G5': 3
             },
             'S2': {
                 'G1': 1,
                 'G2': 8,
                 'G3': 0,
                 'G4': 7,
                 'G5': 4
             },
             'S3': {
                 'G1': 0,
                 'G2': 2,
                 'G3': 3,
                 'G4': 5,
                 'G5': 0
             }
         })))
     self.assertIsNone(obs.metadata(axis='observation'))
     rankdic = {'G1': 'S', 'G2': 'S', 'G3': 'F', 'G4': 'O', 'G5': 'P'}
     biom_add_metacol(obs, rankdic, 'Rank')
     exp = [{
         'Rank': 'S'
     }, {
         'Rank': 'S'
     }, {
         'Rank': 'F'
     }, {
         'Rank': 'O'
     }, {
         'Rank': 'P'
     }]
     self.assertListEqual(list(map(dict, obs.metadata(axis='observation'))),
                          exp)
     namedic = {
         'G1': 'Proteo',
         'G3': 'Actino',
         'G2': 'Firmic',
         'G4': 'Bacter'
     }
     biom_add_metacol(obs, namedic, 'Name', missing='X')
     exp = [{
         'Rank': 'S',
         'Name': 'Proteo'
     }, {
         'Rank': 'S',
         'Name': 'Firmic'
     }, {
         'Rank': 'F',
         'Name': 'Actino'
     }, {
         'Rank': 'O',
         'Name': 'Bacter'
     }, {
         'Rank': 'P',
         'Name': 'X'
     }]
     self.assertListEqual(list(map(dict, obs.metadata(axis='observation'))),
                          exp)
Esempio n. 40
0
def alpha_rarefaction(output_dir: str,
                      table: biom.Table,
                      max_depth: int,
                      phylogeny: skbio.TreeNode = None,
                      metrics: set = None,
                      metadata: qiime2.Metadata = None,
                      min_depth: int = 1,
                      steps: int = 10,
                      iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples([
            (c, '') for c in metadata_df.columns
        ])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth, steps,
                                     iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = [
                'depth-%d_iter-%d' % (t[0], t[1]) for t in data.columns.values
            ]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'metrics': list(metrics),
                           'filenames': [quote(f) for f in filenames],
                           'columns': list(columns),
                           'steps': steps,
                           'filtered_columns': sorted(filtered_columns)
                       })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Esempio n. 41
0
    def test_collapse_biom(self):
        table = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'G1': 4,
                    'G2': 5,
                    'G3': 8,
                    'G4': 0,
                    'G5': 3,
                    'G6': 0
                },
                'S2': {
                    'G1': 1,
                    'G2': 8,
                    'G3': 0,
                    'G4': 7,
                    'G5': 4,
                    'G6': 2
                },
                'S3': {
                    'G1': 0,
                    'G2': 2,
                    'G3': 3,
                    'G4': 5,
                    'G5': 0,
                    'G6': 9
                }
            })))

        # one-to-one mapping (e.g., direct translation)
        mapping = {
            'G1': ['H1'],
            'G2': ['H2'],
            'G3': ['H3'],
            'G4': ['H4'],
            'G5': ['H5'],
            'G6': ['H6']
        }
        obs = collapse_biom(table.copy(), mapping)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'H1': 4,
                    'H2': 5,
                    'H3': 8,
                    'H4': 0,
                    'H5': 3,
                    'H6': 0
                },
                'S2': {
                    'H1': 1,
                    'H2': 8,
                    'H3': 0,
                    'H4': 7,
                    'H5': 4,
                    'H6': 2
                },
                'S3': {
                    'H1': 0,
                    'H2': 2,
                    'H3': 3,
                    'H4': 5,
                    'H5': 0,
                    'H6': 9
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        # some missing, some extra
        mapping = {'G1': ['H1'], 'G2': ['H2'], 'G3': ['H3'], 'G9': ['H9']}
        obs = collapse_biom(table.copy(), mapping)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'H1': 4,
                    'H2': 5,
                    'H3': 8
                },
                'S2': {
                    'H1': 1,
                    'H2': 8,
                    'H3': 0
                },
                'S3': {
                    'H1': 0,
                    'H2': 2,
                    'H3': 3
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        # wrong mapping (no match)
        mapping = {'H1': ['I1'], 'H2': ['I2'], 'H3': ['I3']}
        obs = collapse_biom(table.copy(), mapping)
        self.assertTrue(obs.is_empty())
        self.assertListEqual(list(obs.ids('sample')), ['S1', 'S2', 'S3'])
        self.assertListEqual(list(obs.ids('observation')), [])

        # many-to-one mapping (e.g., taxonomic rank up)
        mapping = {
            'G1': ['H1'],
            'G2': ['H1'],
            'G3': ['H2'],
            'G4': ['H2'],
            'G5': ['H2'],
            'G6': ['H3']
        }
        obs = collapse_biom(table.copy(), mapping)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'H1': 9,
                    'H2': 11,
                    'H3': 0
                },
                'S2': {
                    'H1': 9,
                    'H2': 11,
                    'H3': 2
                },
                'S3': {
                    'H1': 2,
                    'H2': 8,
                    'H3': 9
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        # many-to-many mapping (e.g., genes to pathways)
        mapping = {
            'G1': ['H1'],
            'G2': ['H1', 'H2'],
            'G3': ['H2', 'H3', 'H4'],
            'G4': ['H2', 'H5'],
            'G5': ['H4'],
            'G6': ['H3', 'H5']
        }
        obs = collapse_biom(table.copy(), mapping)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'H1': 9,
                    'H2': 13,
                    'H3': 8,
                    'H4': 11,
                    'H5': 0
                },
                'S2': {
                    'H1': 9,
                    'H2': 15,
                    'H3': 2,
                    'H4': 4,
                    'H5': 9
                },
                'S3': {
                    'H1': 2,
                    'H2': 10,
                    'H3': 12,
                    'H4': 3,
                    'H5': 14
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        # many-to-many mapping, with normalization
        obs = collapse_biom(table.copy(), mapping, normalize=True)
        exp = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'H1': 6,
                    'H2': 5,
                    'H3': 3,
                    'H4': 6,
                    'H5': 0
                },
                'S2': {
                    'H1': 5,
                    'H2': 8,
                    'H3': 1,
                    'H4': 4,
                    'H5': 4
                },
                'S3': {
                    'H1': 1,
                    'H2': 4,
                    'H3': 6,
                    'H4': 1,
                    'H5': 7
                }
            })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')

        # nothing left after normalization
        table = Table(*map(
            np.array,
            prep_table({
                'S1': {
                    'G1': 0
                },
                'S2': {
                    'G1': 1
                },
                'S3': {
                    'G1': 2
                }
            })))
        mapping = {'G1': ['H1', 'H2', 'H3', 'H4']}
        obs = collapse_biom(table.copy(), mapping, normalize=True)
        self.assertTrue(obs.is_empty())
        self.assertListEqual(list(obs.ids('sample')), ['S1', 'S2', 'S3'])
        self.assertListEqual(list(obs.ids('observation')), [])
Esempio n. 42
0
 def test_beta_rarefaction_empty_table(self):
     table = Table(np.array([[]]), [], [])
     with self.assertRaisesRegex(ValueError, 'feature table is empty'):
         beta_rarefaction(self.output_dir, table, 'braycurtis', 'upgma',
                          self.md, 1)
Esempio n. 43
0
    def test_import_shogun_biom(self):
        shogun_table = ('#OTU ID\t1450\t2563\n'
                        'k__Archaea\t26\t25\n'
                        'k__Archaea;p__Crenarchaeota\t3\t5\n'
                        'k__Archaea;p__Crenarchaeota;c__Thermoprotei\t1\t25\n')

        exp_biom = Table(np.array([[26, 25],
                                   [3, 5],
                                   [1, 25]]),
                         ['k__Archaea',
                          'k__Archaea;p__Crenarchaeota',
                          'k__Archaea;p__Crenarchaeota;c__Thermoprotei'],
                         ['1450',
                          '2563'])

        obs_biom = import_shogun_biom(StringIO(shogun_table))
        self.assertEqual(exp_biom, obs_biom)

        tax_metadata = {'k__Archaea': {
                            'taxonomy': ['k__Archaea']},
                        'k__Archaea;p__Crenarchaeota': {
                            'taxonomy': ['k__Archaea',
                                         'p__Crenarchaeota']},
                        'k__Archaea;p__Crenarchaeota;c__Thermoprotei': {
                            'taxonomy': ['k__Archaea',
                                         'p__Crenarchaeota',
                                         'c__Thermoprotei']}}
        exp_biom_tax = Table(np.array([[26, 25],
                                       [3, 5],
                                       [1, 25]]),
                             ['k__Archaea',
                              'k__Archaea;p__Crenarchaeota',
                              'k__Archaea;p__Crenarchaeota;c__Thermoprotei'],
                             ['1450',
                              '2563'])
        exp_biom_tax.add_metadata(tax_metadata, axis='observation')
        obs_biom_tax = import_shogun_biom(
            StringIO(shogun_table), names_to_taxonomy=True)

        self.assertEqual(exp_biom_tax, obs_biom_tax)

        # test modules
        module_table = ('#MODULE ID\t1450\t2563\n'
                        'M00017\t26\t25\n'
                        'M00018\t3\t5\n')

        exp_m_biom = Table(np.array([[26, 25],
                                     [3, 5]]),
                           ['M00017', 'M00018'],
                           ['1450', '2563'])
        exp_m_biom.add_metadata(self.mod_md, axis='observation')
        obs_m_biom = import_shogun_biom(
            StringIO(module_table), annotation_table=StringIO(self.modules),
            annotation_type='module')

        self.assertEqual(exp_m_biom, obs_m_biom)

        # test pathways
        path_table = ('#PATHWAY ID\t1450\t2563\n'
                      '1.4.1  With NAD+ or NADP+ as acceptor\t26\t25\n'
                      '1.4.3  With oxygen as acceptor\t3\t5\n')

        exp_p_biom = Table(np.array([[26, 25],
                                     [3, 5]]),
                           ['1.4.1  With NAD+ or NADP+ as acceptor',
                            '1.4.3  With oxygen as acceptor'],
                           ['1450', '2563'])

        exp_p_biom.add_metadata(self.path_md, axis='observation')
        obs_p_biom = import_shogun_biom(
            StringIO(path_table), annotation_table=StringIO(self.pathways),
            annotation_type='pathway')

        self.assertEqual(exp_p_biom, obs_p_biom)

        # test enzymes
        enzyme_table = ('#KEGG ID\t1450\t2563\n'
                        'K00001\t26\t25\n'
                        'K00002\t3\t5\n'
                        'K00003\t1\t25\n')
        exp_e_biom = Table(np.array([[26, 25],
                                     [3, 5],
                                     [1, 25]]),
                           ['K00001',
                            'K00002',
                            'K00003'],
                           ['1450', '2563'])
        exp_e_biom.add_metadata(self.enz_md, axis='observation')
        obs_e_biom = import_shogun_biom(
            StringIO(enzyme_table), annotation_table=StringIO(self.enzymes),
            annotation_type='enzyme')

        self.assertEqual(exp_e_biom, obs_e_biom)

        # test empty
        empty_table = ('#KEGG ID\t1450\t2563\n')
        exp_empty_biom = Table(np.zeros((0, 2)),
                               [],
                               ['1450', '2563'])
        obs_empty_biom = import_shogun_biom(
            StringIO(empty_table), annotation_table=StringIO(self.enzymes),
            annotation_type='enzyme')

        self.assertEqual(exp_empty_biom, obs_empty_biom)
Esempio n. 44
0
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
            ('Run summary data',
             log_fp,
             _index_headers['run_summary']))
    else:
        close_logger_on_success = False


    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))


            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')"\n\n' % (abspath(step1_failures_fasta_fp),
                                                    abspath(
                                                        step2_input_fasta_fp),
                                                    percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append(
        [('Pick representative set for subsampled failures', step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)

    commands.append([
        ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    index_links.append(
        ('Final map of OTU identifier to sequence identifiers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))


    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp, step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step4_rep_set_cmd)])

    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifiers excluding '
                        'OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(
                                                        otu_no_singletons_fp),
                                                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write('# Write non-singleton otus representative sequences from ' +
                 'step 2 and step 4 to the final representative set and the new reference' +
                 ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []


    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
Esempio n. 45
0
                    metavar = "filename",
                    help = "[REQUIRED] outfile name",
                    required = True)
options = parser.parse_args()

#############################
# Import json formatted OTU #
#############################

import json
jsondata = open(options.biominputfile)
biom = json.load(jsondata)
jsondata.close()

from biom import Table
table = Table.from_json(biom)

print("")
print("Original OTU Table (without taxonomy)")
print("-------------------------------------")
print("")
print(table)
print("")

min_samplesize = int(min(table.sum(axis='sample')))
print("Subsampling to the smallest sample size: " + str(min_samplesize))

# Subsample
table_ss = table.subsample(min_samplesize)

# Output
Esempio n. 46
0
    def test_filter_table(self):
        table = prep_table({
            'S1': {
                'G1': 4,
                'G2': 5,
                'G3': 8
            },
            'S2': {
                'G1': 2,
                'G4': 3,
                'G5': 7
            },
            'S3': {
                'G2': 3,
                'G5': 5
            }
        })

        # filter by count
        obs = filter_table(table, th=3)
        exp = ([[4, 0, 0], [5, 0, 3], [8, 0, 0], [0, 3, 0],
                [0, 7, 5]], ['G1', 'G2', 'G3', 'G4', 'G5'], ['S1', 'S2',
                                                             'S3'], [{}] * 5)
        self.assertTupleEqual(obs, exp)

        obs = filter_table(table, th=4)
        exp = ([[4, 0, 0], [5, 0, 0], [8, 0, 0],
                [0, 7, 5]], ['G1', 'G2', 'G3', 'G5'], ['S1', 'S2',
                                                       'S3'], [{}] * 4)
        self.assertTupleEqual(obs, exp)

        obs = filter_table(table, th=6)
        exp = ([[8, 0, 0], [0, 7, 0]], ['G3', 'G5'], ['S1', 'S2', 'S3'],
               [{}] * 2)
        self.assertTupleEqual(obs, exp)

        # filter by per-sample fraction
        obs = filter_table(table, th=0.25)
        exp = ([[5, 0, 3], [8, 0, 0], [0, 3, 0], [0, 7, 5]],
               ['G2', 'G3', 'G4', 'G5'], ['S1', 'S2', 'S3'], [{}] * 4)
        self.assertTupleEqual(obs, exp)

        obs = filter_table(table, th=0.5)
        exp = ([[0, 7, 5]], ['G5'], ['S1', 'S2', 'S3'], [{}])
        self.assertTupleEqual(obs, exp)

        # filter out everything
        obs = filter_table(table, th=10)
        exp = ([], [], ['S1', 'S2', 'S3'], [])
        self.assertTupleEqual(obs, exp)

        # filter an empty table
        obs = filter_table(exp, th=1)
        exp = ([], [], ['S1', 'S2', 'S3'], [])
        self.assertTupleEqual(obs, exp)

        # filter a BIOM table
        table = Table(*map(np.array, table))
        obs = filter_table(table, th=3)
        exp = Table(*map(np.array, prep_table({
            'S1': {'G1': 4, 'G2': 5, 'G3': 8},
            'S2': {'G4': 3, 'G5': 7},
            'S3': {'G2': 3, 'G5': 5}
        })))
        self.assertEqual(obs.descriptive_equality(exp), 'Tables appear equal')
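
The filter_table implementation exercised by these tests is not shown here; the sketch below reproduces the behaviour the assertions suggest (values below a per-sample cutoff are zeroed, features left with no counts are dropped, and th < 1 is read as a fraction of each sample's total):

import numpy as np
from biom import Table

def filter_table_sketch(table, th):
    """Minimal stand-in for the filter_table tested above (assumed semantics)."""
    data = table.matrix_data.toarray()          # observations x samples
    cutoff = th if th >= 1 else th * data.sum(axis=0)
    data = np.where(data >= cutoff, data, 0)
    keep = data.sum(axis=1) > 0                 # drop all-zero features
    return Table(data[keep],
                 list(np.asarray(table.ids(axis='observation'))[keep]),
                 list(table.ids(axis='sample')))

toy = Table(np.array([[4, 2, 0], [5, 0, 3], [8, 0, 0], [0, 3, 0], [0, 7, 5]]),
            ['G1', 'G2', 'G3', 'G4', 'G5'], ['S1', 'S2', 'S3'])
print(list(filter_table_sketch(toy, 6).ids(axis='observation')))  # ['G3', 'G5']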
Esempio n. 47
0
    @classmethod
    def setUpClass(cls):
        _table1 = [
            'a\ta\t1\t0.0\t0.5\t0.1', 'a\ta\t1\t1.0\t1.0\t0.2',
            'a\ta\t1\t2.0\t1.5\t0.2', 'a\tb\t1\t3.0\t2.0\t8.',
            'a\tb\t1\t4.0\t2.5\t9.', 'a\tb\t1\t5.0\t3.0\t10.',
            'b\ta\t1\t0.0\t2.0\t0.1', 'b\ta\t1\t1.0\t3.0\t0.3',
            'b\ta\t1\t2.0\t4.0\t0.1', 'b\tb\t1\t3.0\t5.0\t9.',
            'b\tb\t1\t4.0\t6.0\t11.', 'b\tb\t1\t5.0\t7.0\t10.'
        ]

        cls.table1 = pd.DataFrame(
            [(n.split('\t')) for n in _table1],
            columns=['group', 'dataset', 'level', 'x', 'y', 'c'],
            dtype=float)

        cls.table2 = """{"id": "None",
                          "format": "Biological Observation Matrix 1.0.0",
                          "format_url": "http:\/\/biom-format.org",
                          "type": "OTU table",
                          "generated_by": "greg",
                          "date": "2013-08-22T13:10:23.907145",
                          "matrix_type": "sparse",
                          "matrix_element_type": "float",
                          "shape": [3, 4],
                          "data": [[0, 0, 1], [0, 1, 2], [0, 2, 3], [0, 3, 4],
                                   [1, 0, 2], [1, 1, 0], [1, 2, 7], [1, 3, 8],
                                   [2, 0, 9], [2, 1, 10], [2, 2, 11],
                                   [2, 3, 12]],
                          "rows": [
                            {"id": "o1", "metadata": {"domain": "Archaea"}},
                            {"id": "o2", "metadata": {"domain": "Bacteria"}},
                            {"id": "o3", "metadata": {"domain": "Bacteria"}}
                          ],
                          "columns": [
                            {"id": "s1", "metadata": {"method": "A",
                             "Sample": "A", "parameters": "A"}},
                            {"id": "s2", "metadata": {"method": "A",
                             "Sample": "A", "parameters": "B"}},
                            {"id": "s3", "metadata": {"method": "A",
                             "Sample": "A", "parameters": "C"}},
                            {"id": "s4", "metadata": {"method": "B",
                             "Sample": "A", "parameters": "D"}}
                          ]
                        }"""
        # table 2
        # OTU ID  s1    s2    s3    s4
        # o1      1.0   2.0   3.0   4.0
        # o2      2.0   0.0   7.0   8.0
        # o3      9.0   10.0  11.0  12.0

        cls.tmpdir = mkdtemp()
        cls.table2 = Table.from_json(json.loads(cls.table2))
        write_biom_table(cls.table2, 'hdf5', join(cls.tmpdir, 'table2.biom'))
        cls.dm, cls.s_md = make_distance_matrix(join(cls.tmpdir,
                                                     'table2.biom'),
                                                method="braycurtis")
        cls.dist = per_method_distance(cls.dm,
                                       cls.s_md,
                                       group_by='method',
                                       standard='B',
                                       metric='distance',
                                       sample='Sample')
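
make_distance_matrix and per_method_distance are defined elsewhere; a rough sketch of the distance-matrix half, assuming it simply loads the BIOM file and runs Bray-Curtis over samples (the sample-metadata handling is omitted):

import skbio.diversity
from biom import load_table

def braycurtis_dm_sketch(biom_fp):
    # load the table and orient counts as samples x observations
    table = load_table(biom_fp)
    counts = table.matrix_data.toarray().T
    return skbio.diversity.beta_diversity(metric='braycurtis',
                                          counts=counts,
                                          ids=table.ids(axis='sample'))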
Esempio n. 48
0
File: 54.py Progetto: tkosciol/qiita
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the analysis.
    # This table does not currently exist anywhere, so we need to actually
    # create the BIOM file. To create this BIOM file we need: (1) the samples
    # and artifacts they come from and (2) whether the samples were
    # renamed or not. (1) is in the database, but we need to infer (2) from
    # the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # The biom filepath is guaranteed to exist for this artifact, so
            # there is no need to check whether biom_fp is still None
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
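
The merge-and-rename core of the function above, on toy tables (the artifact id 7 is made up):

import numpy as np
from biom import Table

t1 = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
t2 = Table(np.array([[5], [6]]), ['O1', 'O3'], ['S3'])

# merge takes the union of observation and sample IDs, summing overlaps
merged = t1.merge(t2)

# prefix each sample ID with its source artifact id, as the code above does
ids_map = {sid: "7.%s" % sid for sid in merged.ids()}
merged.update_ids(ids_map, axis='sample', strict=True, inplace=True)
print(sorted(merged.ids()))  # ['7.S1', '7.S2', '7.S3']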
Esempio n. 49
0
def ft1_factory():
    return Artifact.import_data(
        'FeatureTable[Frequency]',
        Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
              ['S1', 'S2', 'S3']))
Esempio n. 50
0
def beta_rarefaction(output_dir: str,
                     table: biom.Table,
                     metric: str,
                     clustering_method: str,
                     metadata: qiime2.Metadata,
                     sampling_depth: int,
                     iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    with qiime2.sdk.Context() as scope:
        if table.is_empty():
            raise ValueError("Input feature table is empty.")

        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        table = qiime2.Artifact.import_data('FeatureTable[Frequency]', table)

        if metric in METRICS['PHYLO']['IMPL'] | METRICS['PHYLO']['UNIMPL']:
            if phylogeny is None:
                raise ValueError("A phylogenetic metric (%s) was requested, "
                                 "but a phylogenetic tree was not provided. "
                                 "Phylogeny must be provided when using a "
                                 "phylogenetic diversity metric." % metric)

            phylogeny = qiime2.Artifact.import_data('Phylogeny[Rooted]',
                                                    phylogeny)
            api_method = scope.ctx.get_action('diversity', 'beta_phylogenetic')
            beta_func = functools.partial(api_method, phylogeny=phylogeny)
        else:
            beta_func = scope.ctx.get_action('diversity', 'beta')

        rare_func = scope.ctx.get_action('feature-table', 'rarefy')

        distance_matrices = _get_multiple_rarefaction(beta_func, rare_func,
                                                      metric, iterations,
                                                      table, sampling_depth)

    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(distance_matrices, metric,
                                               correlation_method,
                                               color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(os.path.join(output_dir,
                                      'rarefaction-iteration-correlation.tsv'),
                         sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(
        os.path.join(output_dir,
                     'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = list(
        map(
            lambda page: os.path.join(TEMPLATES, 'beta_rarefaction_assets',
                                      page),
            ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))

    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html', 'title': 'PCoA'},
                 {'url': 'heatmap.html', 'title': 'Heatmap'},
                 {'url': 'tree.html', 'title': 'Clustering'}]
    }

    q2templates.render(templates, output_dir, context=context)
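
Under the hood the visualizer rarefies the table several times and computes one distance matrix per iteration via QIIME 2 actions; the core idea, sketched directly with biom and scikit-bio (metric and depth are illustrative):

import numpy as np
import skbio.diversity
from biom import Table

def rarefied_distance_matrices_sketch(table, depth, iterations=3,
                                      metric='braycurtis'):
    # one rarefaction + beta-diversity run per iteration
    dms = []
    for _ in range(iterations):
        rarefied = table.subsample(depth)    # random subsample per sample
        counts = rarefied.matrix_data.toarray().T
        dms.append(skbio.diversity.beta_diversity(
            metric, counts, ids=rarefied.ids(axis='sample')))
    return dms

toy = Table(np.array([[10, 4, 7], [3, 9, 6]]), ['O1', 'O2'], ['S1', 'S2', 'S3'])
dms = rarefied_distance_matrices_sketch(toy, depth=8)
print(len(dms), dms[0].shape)  # 3 (3, 3)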
Esempio n. 51
0
def qarcoal(
    table: biom.Table,
    taxonomy: pd.DataFrame,
    num_string: str,
    denom_string: str,
    samples_to_use: Metadata = None,
    allow_shared_features: bool = False,
) -> pd.DataFrame:
    """Calculate sample-wise log-ratios of features based on taxonomy.

    Parameters:
    -----------
        table: biom file with which to calculate log ratios
        taxonomy: pd.DataFrame with taxonomy information (should have Taxon
            column in which features will be searched)
        num_string: numerator string to search for in taxonomy
        denom_string: denominator string to search for in taxonomy
        samples_to_use: Q2 Metadata file with samples to use.
            If provided, feature table will be filtered to only consider
            samples present in this file. (optional)
        allow_shared_features: bool denoting handling of shared features
            between numerator and denominator. If False, an error is raised
            if features are shared between numerator and denominator. If True,
            will allow shared features without throwing an error.
    Returns:
    --------
        comparison_df: pd DataFrame in the form:

            Sample-ID    Num_Sum    Denom_Sum   log_ratio
                   S1          7           15   -0.762140
    """

    # biom table is features x samples
    if samples_to_use is not None:
        filt_samples = set(samples_to_use.to_dataframe().index)
        feat_table = table.filter(filt_samples, axis="sample", inplace=False)
        feat_table = feat_table.to_dataframe()
    else:
        feat_table = table.to_dataframe()

    # raise error if there are any negative counts in the feature table
    if feat_table.lt(0).any().any():
        raise ValueError("Feature table has negative counts!")

    tax_num_df, tax_denom_df = filter_and_join_taxonomy(
        feat_table,
        taxonomy,
        num_string,
        denom_string,
    )

    # if shared features are disallowed, check to make sure they don't occur
    # if allowed, can skip this step at user's risk
    if not allow_shared_features:
        shared_features = set(tax_num_df.index) & set(tax_denom_df.index)
        if shared_features:
            raise ValueError("Shared features between num and denom!")

    tax_num_sample_sum = tax_num_df.sum(axis=0)
    tax_denom_sample_sum = tax_denom_df.sum(axis=0)

    comparison_df = pd.DataFrame.from_records(
        [tax_num_sample_sum, tax_denom_sample_sum],
        index=["Num_Sum", "Denom_Sum"],
    ).T
    comparison_df["log_ratio"] = comparison_df.apply(
        lambda x: np.log(x.Num_Sum / x.Denom_Sum), axis=1)
    comparison_df.index.name = "Sample-ID"

    return comparison_df
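
The numbers in the docstring example can be reproduced with plain pandas; here the split into numerator and denominator features is hard-coded instead of coming from a taxonomy search:

import numpy as np
import pandas as pd

# toy feature x sample frame; which rows are "num" vs "denom" is hard-coded
feat_table = pd.DataFrame({'S1': [3, 4, 5, 10], 'S2': [1, 2, 6, 9]},
                          index=['f1', 'f2', 'f3', 'f4'])
num_features = ['f1', 'f2']      # stand-in for features matching num_string
denom_features = ['f3', 'f4']    # stand-in for features matching denom_string

num_sum = feat_table.loc[num_features].sum(axis=0)
denom_sum = feat_table.loc[denom_features].sum(axis=0)

comparison_df = pd.DataFrame({'Num_Sum': num_sum, 'Denom_Sum': denom_sum})
comparison_df['log_ratio'] = np.log(comparison_df['Num_Sum'] /
                                    comparison_df['Denom_Sum'])
comparison_df.index.name = 'Sample-ID'
print(comparison_df)   # S1 row: Num_Sum 7, Denom_Sum 15, log_ratio ~ -0.762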
Esempio n. 52
0
def full_pipeline(
        table: biom.Table,
        seq: pd.Series,
        threads: int = 1,
        hsp_method: str = "mp",
        placement_tool: str = "epa-ng",
        min_align: float = 0.8,
        max_nsti: float = 2.0,
        edge_exponent: float = 0.5,
        skip_minpath: bool = False,
        no_gap_fill: bool = False,
        skip_norm: bool = False,
        highly_verbose: bool = False) -> (biom.Table, biom.Table, biom.Table):

    # Write out BIOM table and FASTA to be used in pipeline.
    with TemporaryDirectory() as temp_dir:

        # Write out BIOM table:
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME 2 Plugin")

        # Write out Pandas series as FASTA:
        seq_outfile = path.join(temp_dir, "seqs.fna")

        with open(seq_outfile, "w") as outfile_fh:
            for seqname, sequence in seq.items():
                print(">" + str(seqname) + "\n" + str(sequence),
                      file=outfile_fh)

        picrust2_out = path.join(temp_dir, "picrust2_out")

        func_outputs, pathway_outputs = picrust2.pipeline.full_pipeline(
            study_fasta=seq_outfile,
            input_table=biom_infile,
            output_folder=picrust2_out,
            processes=threads,
            placement_tool=placement_tool,
            ref_dir=default_ref_dir,
            in_traits="EC,KO",
            custom_trait_tables=None,
            marker_gene_table=default_tables["16S"],
            pathway_map=default_pathway_map,
            rxn_func="EC",
            no_pathways=False,
            regroup_map=default_regroup_map,
            no_regroup=False,
            stratified=False,
            max_nsti=max_nsti,
            min_reads=1,
            min_samples=1,
            hsp_method=hsp_method,
            edge_exponent=edge_exponent,
            min_align=min_align,
            skip_nsti=False,
            skip_minpath=skip_minpath,
            no_gap_fill=no_gap_fill,
            coverage=False,
            per_sequence_contrib=False,
            wide_table=False,
            skip_norm=skip_norm,
            remove_intermediate=False,
            verbose=highly_verbose)

        # Convert the returned unstratified tables to BIOM tables.
        # Note that the 0-index in the func table returned objects corresponds
        # to the path to the unstratified table.
        ko_biom = biom.load_table(func_outputs["KO"][0])
        ec_biom = biom.load_table(func_outputs["EC"][0])
        pathabun_biom = biom.load_table(pathway_outputs["unstrat_abun"])

        return ko_biom, ec_biom, pathabun_biom
Esempio n. 53
0
def ft3_factory():
    return Artifact.import_data(
        'FeatureTable[Frequency]',
        Table(np.array([[0, 4, 9], [4, 4, 8]]), ['O1', 'O4'],
              ['S7', 'S8', 'S9']))
Esempio n. 54
0
def convert_precalc_to_biom(precalc_in,
                            ids_to_load=None,
                            transpose=True,
                            md_prefix='metadata_'):
    """Loads PICRUSTs tab-delimited version of the precalc file and outputs a BIOM object"""

    #if given a string convert to a filehandle
    if type(precalc_in) == str or type(precalc_in) == unicode:
        fh = StringIO.StringIO(precalc_in)
    else:
        fh = precalc_in

    #first line has to be header
    header_ids = fh.readline().strip().split('\t')

    col_meta_locs = {}
    for idx, col_id in enumerate(header_ids):
        if col_id.startswith(md_prefix):
            col_meta_locs[col_id[len(md_prefix):]] = idx

    end_of_data = len(header_ids) - len(col_meta_locs)
    trait_ids = header_ids[1:end_of_data]

    col_meta = []
    row_meta = [{} for i in trait_ids]

    if ids_to_load is not None and len(ids_to_load) > 0:
        ids_to_load = set(ids_to_load)
        load_all_ids = False
    else:
        load_all_ids = True

    matching = []
    otu_ids = []
    for line in fh:
        fields = line.strip().split('\t')
        row_id = fields[0]
        if (row_id.startswith(md_prefix)):
            #handle metadata

            #determine type of metadata (this may not be perfect)
            metadata_type = determine_metadata_type(line)
            for idx, trait_name in enumerate(trait_ids):
                row_meta[idx][row_id[len(md_prefix):]] = parse_metadata_field(
                    fields[idx + 1], metadata_type)

        elif load_all_ids or (row_id in ids_to_load):
            otu_ids.append(row_id)
            matching.append(list(map(float, fields[1:end_of_data])))

            #add metadata
            col_meta_dict = {}
            for meta_name in col_meta_locs:
                col_meta_dict[meta_name] = fields[col_meta_locs[meta_name]]
            col_meta.append(col_meta_dict)

            if not load_all_ids:
                ids_to_load.remove(row_id)

    if not otu_ids:
        raise ValueError(
            "No OTUs match identifiers in precalculated file. PICRUSt "
            "requires an OTU table reference/closed picked against "
            "GreenGenes.\nExample of the first 5 OTU ids from your table: "
            "{0}".format(', '.join(list(ids_to_load)[:5])))

    if ids_to_load:
        raise ValueError(
            "One or more OTU ids were not found in the precalculated file!\n"
            "Are you using the correct --gg_version?\nExample of (the {0}) "
            "unknown OTU ids: {1}".format(len(ids_to_load),
                                          ', '.join(list(ids_to_load)[:5])))

    #note that we transpose the data before making biom obj
    matching = asarray(matching)
    if transpose:
        return Table(matching.T,
                     trait_ids,
                     otu_ids,
                     row_meta,
                     col_meta,
                     type='Gene table')
    else:
        return Table(matching,
                     otu_ids,
                     trait_ids,
                     col_meta,
                     row_meta,
                     type='Gene table')
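
The Table construction at the end of the function, isolated on toy data (IDs and metadata values below are illustrative only):

import numpy as np
from biom import Table

# traits x OTUs, i.e. the transposed orientation returned above
data = np.array([[1.0, 0.0], [2.0, 3.0]])
trait_ids = ['K00001', 'K00002']                 # illustrative trait IDs
otu_ids = ['OTU_1', 'OTU_2']                     # illustrative OTU IDs
trait_md = [{}, {}]                              # observation metadata
otu_md = [{'NSTI': '0.02'}, {'NSTI': '0.11'}]    # sample (OTU) metadata

gene_table = Table(data, trait_ids, otu_ids, trait_md, otu_md,
                   type='Gene table')
print(gene_table.shape)  # (2, 2)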
Esempio n. 55
0
def sequence_variants_from_samples(samples: biom.Table) -> DNAIterator:
    seqs = (DNA(s, metadata={'id': s})
            for s in samples.ids(axis='observation'))
    return DNAIterator(seqs)
Esempio n. 56
0
def relative_frequency(table: biom.Table, axis: str='sample') -> biom.Table:
    """ Convert feature table in-place from frequencies to relative frequencies
    """
    table.norm(axis=axis, inplace=True)
    return table
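
A quick check of what norm does to the counts (toy values):

import numpy as np
from biom import Table

table = Table(np.array([[2, 1], [8, 3]]), ['O1', 'O2'], ['S1', 'S2'])
table.norm(axis='sample', inplace=True)   # each sample column now sums to 1
print(table.matrix_data.toarray())        # [[0.2, 0.25], [0.8, 0.75]]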
Esempio n. 57
0
def _table_to_dataframe(table: biom.Table) -> pd.DataFrame:
    array = table.matrix_data.toarray().T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')
    return pd.DataFrame(array, index=sample_ids, columns=feature_ids)
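
The same conversion inlined on a toy table, to show the resulting orientation (samples as rows, features as columns):

import numpy as np
import pandas as pd
from biom import Table

table = Table(np.array([[0, 2], [5, 1]]), ['O1', 'O2'], ['S1', 'S2'])
df = pd.DataFrame(table.matrix_data.toarray().T,
                  index=table.ids(axis='sample'),
                  columns=table.ids(axis='observation'))
print(df)
#      O1   O2
# S1  0.0  5.0
# S2  2.0  1.0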
Esempio n. 58
0
File: 54.py Progetto: tanaes/qiita
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the analysis.
    # This table does not currently exist anywhere, so we need to actually
    # create the BIOM file. To create this BIOM file we need: (1) the samples
    # and artifacts they come from and (2) whether the samples were
    # renamed or not. (1) is in the database, but we need to infer (2) from
    # the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # The biom filepath is guaranteed to exist for this artifact, so
            # there is no need to check whether biom_fp is still None
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update(
                    {sid: "%d.%s" % (a_id, sid)
                     for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(
            sql,
            [analysis['timestamp'], biom_data['data_type_id'], 4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [
            basename(new_table_fp), 7,
            compute_checksum(new_table_fp), 1, dd_id
        ])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
Esempio n. 59
0
def ft2_factory():
    return Artifact.import_data(
        'FeatureTable[Frequency]',
        Table(np.array([[0, 2, 6], [2, 2, 4]]), ['O1', 'O3'],
              ['S4', 'S5', 'S6']))