Python Table.ids Examples, biom.Table.ids Python Examples

Example #1

0

Show file

File: _subsample.py Project: thermokarst-forks/q2-feature-table

def subsample(table: biom.Table, subsampling_depth: int,
              axis: str) -> biom.Table:
    """Keep a random subset of ``subsampling_depth`` ids along ``axis``.

    Parameters
    ----------
    table : biom.Table
        Table to subsample.
    subsampling_depth : int
        Number of ids to retain on the requested axis.
    axis : str
        ``'feature'`` subsamples features; any other value leaves the table
        untransposed, so the sample axis is subsampled.

    Returns
    -------
    biom.Table
        The subsampled table, with all-zero ids on the opposite axis removed.

    Raises
    ------
    ValueError
        If the depth exceeds the number of ids on the requested axis, or if
        the resulting table is empty.
    """
    # Work around biocore/biom-format#759 by always subsampling along the
    # 'sample' axis: feature requests are handled on a transposed table.
    by_feature = axis == 'feature'
    if by_feature:
        table = table.transpose()

    available = len(table.ids())
    if available < subsampling_depth:
        raise ValueError('The subsampling depth exceeds the number of '
                         'elements on the desired axis. The maximum depth '
                         'is: %d.' % available)

    table = table.subsample(subsampling_depth, axis='sample', by_id=True)

    def _has_counts(values, _id, _metadata):
        return values.sum() > 0

    # After the (optional) transpose the opposite axis is always
    # 'observation'; drop its entries that no longer carry any counts.
    table.filter(_has_counts, axis='observation')

    if by_feature:
        # Undo the transpose so callers get the original orientation back.
        table = table.transpose()

    if table.is_empty():
        raise ValueError('The subsampled table contains no samples or features'
                         ' (samples/features that sum to zero after filtering'
                         ' are automatically removed). It may be a good idea'
                         ' to double check that your table is valid/nonempty.')

    return table

Example #2

0

Show file

def cscs(features: biom.Table,
         css_edges: str,
         cosine_threshold: float = 0.6,
         normalization: bool = True,
         weighted: bool = True) -> skbio.DistanceMatrix:
    """Compute a distance matrix between samples from a feature-edge file.

    Parameters
    ----------
    features : biom.Table
        Feature table; its observation ids must match the ids used in the
        first two columns of ``css_edges``.
    css_edges : str
        Path to a tab-separated edge file. Lines containing ``CLUSTERID1``
        are treated as headers and skipped. Columns 0 and 1 hold the two
        feature ids and column 4 holds the cosine score.
    cosine_threshold : float
        Edges scoring below this value are stored as ``0.0``.
    normalization : bool
        When true, the table is normalized per sample before use.
    weighted : bool
        When false, the table is converted to presence/absence first.

    Returns
    -------
    skbio.DistanceMatrix
        ``1 - similarity`` as returned by ``parallel_make_distance_matrix``.
    """
    observationids = {
        x: index
        for index, x in enumerate(features.ids(axis='observation'))
    }
    n_features = features.shape[0]
    edgesdok = dok_matrix((n_features, n_features), dtype=np.float32)

    # Use a context manager so the edge file is closed even on parse errors.
    with open(css_edges, "r") as edges_fh:
        for line in edges_fh:
            if "CLUSTERID1" in line:
                continue
            linesplit = line.split("\t")
            cosine = float(linesplit[4])
            row = observationids[linesplit[0]]
            col = observationids[linesplit[1]]
            if cosine < cosine_threshold:
                edgesdok[row, col] = 0.0
            else:
                edgesdok[row, col] = cosine
                edgesdok[col, row] = cosine

    # Every feature is fully similar to itself. Setting the diagonal once
    # after parsing gives the same final matrix as resetting it per line
    # (and also covers an edge file with no data lines).
    edgesdok.setdiag(1)

    if normalization:
        features = features.norm(axis='sample', inplace=False)
    if not weighted:
        # `pa` must be *called*; assigning the bound method would hand a
        # function object to the distance computation below.
        # TODO: make new option in cscs()
        features = features.pa(inplace=False)

    sample_names = features.ids()
    similarity = parallel_make_distance_matrix(features, edgesdok,
                                               sample_names)
    distances = 1 - similarity
    print(distances)
    return skbio.DistanceMatrix(distances, ids=distances.index)

Example #3

0

Show file

def _read_inputs(biom_table: biom.Table, phylogeny_fp: NewickFormat,
                 meta_data: NumericMetadataColumn = None):
    """Align the table, labels and phylogeny used by the generator.

    Parameters
    ----------
    biom_table : biom.Table
        Input feature table.
    phylogeny_fp : NewickFormat
        Phylogeny to be pruned to the table's features.
    meta_data : NumericMetadataColumn, optional
        When provided, labels come from its first column and the strategy is
        ``"balancing"``; otherwise every sample gets label ``1.0`` and the
        strategy is ``"augmentation"``.

    Returns
    -------
    tuple
        ``(table, y, tree, generate_strategy, pruned_phylogeny_fp)``.

    Raises
    ------
    ValueError
        If the sample order of the processed table differs from ``samples``.
    """
    if meta_data:
        generate_strategy = "balancing"
        meta, biom_table = _sort_metada(meta_data, biom_table)
        y = meta.iloc[:, 0]
        samples = meta.index
    else:
        generate_strategy = "augmentation"
        y = pd.Series(data=np.ones((len(biom_table.ids('sample')),)),
                      index=biom_table.ids('sample'))
        samples = biom_table.ids('sample')

    _table_tmp = biom_table.sort_order(axis='sample', order=samples)
    _table = _map_observations(_table_tmp)
    pruned_phylogeny_fp = _prune_features_from_phylogeny(_table, phylogeny_fp)
    _tree = dendropy.Tree.get(path=str(pruned_phylogeny_fp),
                              preserve_underscores=False,
                              schema="newick", rooting='default-rooted')

    if sum(samples != _table.ids('sample')) > 0:
        # Build one readable message; passing several positional arguments
        # to ValueError would render the error as a tuple.
        difference = set(samples) - set(_table.ids('sample'))
        raise ValueError(
            "The samples IDs in meta data and biom table are not the same! "
            "The difference is: {%s}. Please double check."
            % ', '.join(map(str, difference)))

    return _table, y, _tree, generate_strategy, pruned_phylogeny_fp

Example #4

0

Show file

def alpha(table: biom.Table):
    """Run every metric in ``ALPHA_DIVERSITY_METHODS`` on ``table``.

    :param table: feature table to analyse; must not be empty
    :return: the output of ``_format_alpha_results_to_json`` for the
        aggregated per-metric results and the per-sample metadata
    :raises ValueError: if ``table`` is empty
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    table = get_biom_table(table)
    # Transpose so that each row of `counts` corresponds to one sample id.
    counts = table.matrix_data.toarray().astype(float).T
    sample_ids = table.ids(axis='sample')
    sample_metadata = dict(zip(table.ids(), table.metadata()))

    def _compute(metric):
        # Tag each result with the metric that produced it.
        series = alpha_diversity(metric=metric, counts=counts, ids=sample_ids)
        series.name = metric
        return series

    alpha_diversities = [_compute(metric)
                         for metric in ALPHA_DIVERSITY_METHODS]

    aggregated = aggregate_results(alpha_diversities, sample_ids)
    return _format_alpha_results_to_json(aggregated, sample_metadata)

Example #5

0

Show file

File: _method.py Project: gregcaporaso/diversity

def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int=1)-> skbio.DistanceMatrix:
    """Compute a phylogenetic beta-diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Non-empty feature table.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio as ``tree``.
    metric : str
        One of ``phylogenetic_metrics()``.
    n_jobs : int
        Forwarded to the pairwise distance function; must be 1 for
        ``'weighted_unifrac'``.

    Raises
    ------
    ValueError
        Unknown metric, empty table, or parallel weighted UniFrac.
    skbio.tree.MissingNodeError
        Re-raised with 'otu_ids'/'tree' reworded as
        'feature_ids'/'phylogeny'.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if metric == 'weighted_unifrac' and n_jobs != 1:
        raise ValueError("Weighted UniFrac is not parallelizable")

    # scikit-bio expects one row per sample, hence the transpose.
    diversity_kwargs = dict(
        metric=metric,
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )

    try:
        return skbio.diversity.beta_diversity(**diversity_kwargs)
    except skbio.tree.MissingNodeError as e:
        # Translate scikit-bio's parameter names into this function's.
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(message)

Example #6

0

Show file

File: _merge.py Project: nervous-laughter/q2-feature-table

def merge(table1: biom.Table, table2: biom.Table) -> biom.Table:
    """Merge two tables that must not share any sample id.

    Raises
    ------
    ValueError
        If at least one sample id occurs in both tables; the message lists
        the shared ids.
    """
    shared_sids = (set(table1.ids(axis='sample')) &
                   set(table2.ids(axis='sample')))
    if shared_sids:
        raise ValueError('Some samples are present in both tables: %s' %
                         ', '.join(shared_sids))
    return table1.merge(table2)

Example #7

0

Show file

File: _method.py Project: wasade/q2-diversity

def beta_phylogenetic(table: biom.Table,
                      phylogeny: skbio.TreeNode,
                      metric: str,
                      n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return the pairwise phylogenetic beta diversity of all samples.

    ``metric`` must be listed by ``phylogenetic_metrics()``, ``table`` must
    not be empty, and ``'weighted_unifrac'`` only supports ``n_jobs == 1``;
    violating any of these raises ``ValueError``. A
    ``skbio.tree.MissingNodeError`` from scikit-bio is re-raised with its
    message reworded to use 'feature_ids' and 'phylogeny'.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    parallel_requested = n_jobs != 1
    if parallel_requested and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    # Transposed so rows line up with the sample ids.
    per_sample_counts = table.matrix_data.toarray().astype(int).T
    sids = table.ids(axis='sample')
    fids = table.ids(axis='observation')

    try:
        distance_matrix = skbio.diversity.beta_diversity(
            metric=metric, counts=per_sample_counts, ids=sids, otu_ids=fids,
            tree=phylogeny, pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs)
    except skbio.tree.MissingNodeError as e:
        reworded = str(e).replace('otu_ids', 'feature_ids').replace(
            'tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)

    return distance_matrix

Example #8

0

Show file

def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep,
                    output_dir):
    """ Writes down tables, metadata, factors, edges and ranks into files.

    All-zero columns are dropped from both tables before writing, and the
    ranks / edges are subset to match.

    Parameters
    ----------
    table1 : pd.DataFrame
        First table (written as ``table_microbes``); columns become the
        observation ids and the index becomes the sample ids.
    table2 : pd.DataFrame
        Second table (written as ``table_metabolites``), laid out like
        ``table1``.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : array_like
        Left factor; ``U @ V`` is written as the ranks table.
    V : array_like
        Right factor.
    edges : array_like
        Edge matrix; its columns are subset with the non-empty columns of
        ``table1`` before being saved.
        NOTE(review): assumes columns of ``edges`` align with ``table1``
        columns - confirm against the caller.
    it : int
        iteration number
    rep : int
        repetition number (index into the lowercase alphabet, so 0-25)
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'

    def _path(stem, extension):
        # One output file per (iteration, repetition) pair.
        return "%s/%s.%d_%s.%s" % (output_dir, stem, it, choice[rep],
                                   extension)

    output_microbes = _path('table_microbes', 'biom')
    output_metabolites = _path('table_metabolites', 'biom')
    output_md = _path('metadata', 'txt')
    output_U = _path('U', 'txt')
    output_V = _path('V', 'txt')
    output_B = _path('edges', 'txt')
    output_ranks = _path('ranks', 'txt')

    # Keep only columns that have at least one non-zero entry.
    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)

    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks,
                         index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    # Subset the `edges` argument; reading the not-yet-assigned local `B`
    # here would always raise UnboundLocalError.
    B = edges[:, idx1]

    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    np.savetxt(output_B, B)

Example #9

0

Show file

def rclr_transformation(table: Table) -> Table:
    """
    Apply ``matrix_rclr`` to a biom table and return the result
    as a new biom table with the same observation and sample ids.
    """
    # matrix_rclr is given the transposed dense matrix; its output is
    # transposed back before being wrapped in a Table again.
    transposed_counts = table.matrix_data.toarray().T
    transformed = matrix_rclr(transposed_counts).T
    return Table(transformed, table.ids('observation'), table.ids('sample'))

Example #10

0

Show file

def add_pseudocount(table: biom.Table, pseudocount: int = 1) -> biom.Table:
    """Return a new table with ``pseudocount`` added to every cell.

    Both a sparse and a dense representation are held in memory at once.
    That is unfortunate but necessary: biom.Table.transform only visits
    non-zero values, whereas the pseudocount must reach zeros as well.
    """
    shifted_rows = []
    for row in table.iter_data(dense=True, axis='observation'):
        shifted_rows.append(row + pseudocount)
    return biom.Table(shifted_rows,
                      table.ids(axis='observation'),
                      table.ids())

Example #11

0

Show file

File: _countVectors.py Project: ucsd-cmi/q2-feature-engineering

def count_vectors(table: biom.Table,
                  phylogeny: skbio.TreeNode,
                  method: str = 'weighted_unifrac') -> biom.Table:
    """Build a table of per-sample vectors produced by ``_run_unifrac``.

    The observation ids are first rewritten by ``_map_observations``, the
    phylogeny is pruned to the table and its nodes renamed, and the table is
    sorted along the observation axis before the computation. The returned
    table keeps the input's sample ids; its observation ids come from
    ``rename_otus(tree_index)``.
    """
    table = _map_observations(table)
    pruned_phylo = rename_nodes(
        prune_features_from_phylogeny(table, phylogeny))

    table = table.sort(axis='observation')
    otu_ids = np.asarray(table.ids('observation'))
    # Dense copy, transposed so that rows correspond to samples.
    dense_matrix = table.matrix_data.todense()
    counts = np.asarray(dense_matrix.transpose())

    features, tree_index = _run_unifrac(counts, otu_ids, pruned_phylo, method)
    return biom.Table(data=features.transpose(),
                      observation_ids=rename_otus(tree_index),
                      sample_ids=table.ids())

Example #12

0

Show file

def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Collapse ids along ``axis`` that share the same metadata value.

    Parameters
    ----------
    table : biom.Table
        Non-empty table to group.
    axis : str
        ``'feature'`` is translated to biom's ``'observation'``; any other
        value is passed to biom unchanged.
    metadata : qiime2.CategoricalMetadataColumn
        Column supplying the group of every id on ``axis``.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    def _group_of(axis_id, _):
        return metadata.get_value(axis_id)

    grouped_table = table.collapse(_group_of,
                                   collapse_f=_mode_lookup[mode],
                                   axis=biom_axis,
                                   norm=False,
                                   include_collapsed_metadata=False)

    # Order groups by their first appearance in the metadata, which keeps
    # identity mappings stable and simplifies testing.
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = metadata.to_series().unique()
    return grouped_table.sort_order(group_order, axis=biom_axis)

Example #13

0

Show file

def tree_cluster(
        phylogeny_fp: NewickFormat,
        table: biom.Table,
        method: Str = 'max_clade',
        threshold: Float = 0.0,
        threshold_free: Str = None
) -> (skbio.TreeNode, pd.DataFrame, biom.Table):
    """Cluster the features of ``table`` on a cleaned phylogeny.

    Parameters
    ----------
    phylogeny_fp : NewickFormat
        Phylogeny to clean (``_clean_phylogeny``) and cluster.
    table : biom.Table
        Feature table whose features are mapped onto the clusters.
    method : Str
        Case-insensitive key into ``METHODS``.
    threshold : Float
        Threshold handed to the clustering method.
    threshold_free : Str, optional
        When given, key into ``THRESHOLDFREE`` selecting a wrapper that is
        called with the chosen clustering method.

    Returns
    -------
    tuple
        ``(new_tree, cluster_df, final_table_filtered)`` where
        ``cluster_df`` is indexed by ``'Feature ID'``.
    """
    phylogeny_cleaned_fp = _clean_phylogeny(phylogeny_fp, table)
    print("The number of features in the original table is",
          len(table.ids('observation')))

    print("The phylogeny loaded and randomly resolved!")
    tree = _read_phylogeny(str(phylogeny_cleaned_fp))

    # Both branches need the same clustering function; look it up once.
    cluster_method = METHODS[method.lower()]
    if threshold_free is None:
        clusters = cluster_method(tree, threshold, float('-inf'))
    else:
        clusters = THRESHOLDFREE[threshold_free](cluster_method, tree,
                                                 threshold, float('-inf'))

    cluster_df = _convert_result_to_data_frame(clusters)
    print(
        "TreeCluster finished successfully! The number of clusters using the threshold",
        threshold, "is", len(clusters))

    final_table_filtered, pivot_mapping_dct = map_features(cluster_df, table)
    print("The clustered table is generated!")
    new_tree = generate_new_phylogeny(phylogeny_cleaned_fp, pivot_mapping_dct)
    print("The clustered phylogeny is generated!")
    cluster_df = cluster_df.set_index('Feature ID')
    return new_tree, cluster_df, final_table_filtered

Example #14

0

Show file

File: beta.py Project: andrewsanchez/q2-diversity-lib

def beta_passthrough(table: biom.Table, metric: str, pseudocount: int = 1,
                     n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute beta diversity, supplying callables for three named metrics.

    ``'aitchison'``, ``'canberra_adkins'`` and ``'jensenshannon'`` are
    replaced by the local implementations below; any other ``metric`` is
    forwarded to scikit-bio as given. ``pseudocount`` is added to every
    count only for ``'aitchison'``.
    """
    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        # Only positions that are non-zero in at least one vector count.
        either_nonzero = ((x > 0) | (y > 0))
        x_ = x[either_nonzero]
        y_ = y[either_nonzero]
        n_nonzero = either_nonzero.sum()

        return (1. / n_nonzero) * np.sum(np.abs(x_ - y_) / (x_ + y_))

    def jensen_shannon(x, y, **kwds):
        return jensenshannon(x, y)

    local_metrics = {
        'aitchison': aitchison,
        'canberra_adkins': canberra_adkins,
        'jensenshannon': jensen_shannon,
    }

    counts = table.matrix_data.toarray().T
    sample_ids = table.ids(axis='sample')
    if metric == 'aitchison':
        counts += pseudocount
    # Names without a local implementation pass through untouched.
    metric = local_metrics.get(metric, metric)

    return skbio.diversity.beta_diversity(
            metric=metric, counts=counts, ids=sample_ids, validate=True,
            pairwise_func=sklearn.metrics.pairwise_distances, n_jobs=n_jobs)

Example #15

0

Show file

File: _data_preprocess.py Project: ucsd-cmi/q2-feature-engineering

def get_reference_seqs_from_ids(table: biom.Table, reference_seqs_pd: pd.Series) -> DNAFASTAFormat:
    """Collect the reference sequence of every observation id in ``table``.

    Each observation id is looked up in ``reference_seqs_pd``; the selected
    sequences (indexed by observation id) are converted with ``_16``.
    """
    # Give the empty Series an explicit dtype: relying on the default is
    # deprecated in pandas and its meaning differs between versions.
    output_references = pd.Series(dtype=object)
    for obs in table.ids('observation'):
        output_references[obs] = reference_seqs_pd[obs]
    return _16(output_references)

Example #16

0

Show file

File: _method.py Project: jakereps/q2-diversity

def beta(table: biom.Table, metric: str,
         pseudocount: int=1, n_jobs: int=1)-> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta-diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Non-empty feature table.
    metric : str
        One of ``non_phylogenetic_metrics()``.
    pseudocount : int
        Added to every count, but only when ``metric`` is ``'aitchison'``.
    n_jobs : int
        Forwarded to the pairwise distance function.

    Raises
    ------
    ValueError
        If ``metric`` is unknown or ``table`` is empty.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    # One row per sample.
    counts = table.matrix_data.toarray().T

    wants_aitchison = metric == 'aitchison'
    if wants_aitchison:
        counts += pseudocount
        metric = aitchison

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs
    )

Example #17

0

Show file

File: _method.py Project: wasade/q2-diversity

def beta(table: biom.Table,
         metric: str,
         pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return pairwise non-phylogenetic beta diversity between samples.

    ``metric`` must be listed by ``non_phylogenetic_metrics()`` and
    ``table`` must not be empty, otherwise ``ValueError`` is raised. For
    ``'aitchison'`` the counts are shifted by ``pseudocount`` and a local
    callable (Euclidean distance of clr-transformed vectors) is used.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)

    per_sample_counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    if metric == 'aitchison':
        per_sample_counts += pseudocount
        metric = aitchison

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    sids = table.ids(axis='sample')
    pairwise = sklearn.metrics.pairwise_distances
    return skbio.diversity.beta_diversity(metric=metric,
                                          counts=per_sample_counts,
                                          ids=sids,
                                          validate=True,
                                          pairwise_func=pairwise,
                                          n_jobs=n_jobs)

Example #18

0

Show file

File: _group.py Project: jakereps/q2-feature-table

def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Group the ids on ``axis`` by their value in ``metadata``.

    ``axis == 'feature'`` maps to biom's ``'observation'`` axis; other
    values are used as given. ``mode`` picks the collapse function from
    ``_mode_lookup``. Raises ``ValueError`` for an empty table.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = axis
    if axis == 'feature':
        biom_axis = 'observation'

    axis_ids = table.ids(axis=biom_axis)
    metadata = _munge_metadata_column(metadata, axis_ids, axis)

    collapse_options = dict(collapse_f=_mode_lookup[mode],
                            axis=biom_axis,
                            norm=False,
                            include_collapsed_metadata=False)
    grouped_table = table.collapse(
        lambda axis_id, _: metadata.get_value(axis_id), **collapse_options)

    # Sort groups by first unique appearance in the metadata so the result
    # is stable for identity mappings (and easier to test).
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    first_seen_order = metadata.to_series().unique()
    return grouped_table.sort_order(first_seen_order, axis=biom_axis)

Example #19

0

Show file

File: _insertion.py Project: qiime2/q2-fragment-insertion

def filter_features(table: biom.Table,
                    tree: NewickFormat) -> (biom.Table, biom.Table):
    """Split ``table`` into features present in / absent from ``tree``.

    Returns
    -------
    (biom.Table, biom.Table)
        First the table restricted to features that are named tips of the
        tree, then the table with the remaining features.

    Raises
    ------
    ValueError
        If no feature of the table is a tip of the tree.
    """
    # Load the insertion tree and gather the names of all named tips
    # (inserted fragments plus reference taxa).
    tree = skbio.TreeNode.read(str(tree))
    fragments_tree = set()
    for tip in tree.tips():
        if tip.name is not None:
            fragments_tree.add(str(tip.name))

    # All fragments/features of the table, as strings.
    fragments_table = {str(fid) for fid in table.ids(axis='observation')}

    in_tree = fragments_table & fragments_tree
    if not in_tree:
        raise ValueError(('Not a single fragment of your table is part of your'
                          ' tree. The resulting table would be empty.'))

    tbl_positive = table.filter(in_tree,
                                axis='observation', inplace=False)
    tbl_negative = table.filter(fragments_table - fragments_tree,
                                axis='observation', inplace=False)

    # Per-sample quality-control summary (kept/removed reads and ratio).
    # NOTE(review): the summary is built but never printed or returned here.
    kept = tbl_positive.sum(axis='sample')
    removed = tbl_negative.sum(axis='sample')
    results = pd.DataFrame(
        data={'kept_reads': kept, 'removed_reads': removed},
        index=tbl_positive.ids())
    total_reads = results['kept_reads'] + results['removed_reads']
    results['removed_ratio'] = results['removed_reads'] / total_reads

    return (tbl_positive, tbl_negative)

Example #20

0

Show file

File: _clawback.py Project: khemlalnirmalkar/q2-clawback

def generate_class_weights(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        samples: biom.Table, taxonomy_classification: DataFrame,
        unobserved_weight: float = 1e-6, normalise: bool = False,
        allow_weight_outside_reference: bool = False) \
        -> biom.Table:
    """Turn observed taxon abundances into a single-sample weight table.

    Every taxon of the reference sequences starts at zero; the per-feature
    totals of ``samples`` are added to the taxon each feature was classified
    as. The totals are scaled to sum to one, blended with a uniform
    distribution using ``unobserved_weight``, and rescaled.

    Raises
    ------
    ValueError
        If a feature has no entry in ``taxonomy_classification``, or if a
        classified taxon is missing from the reference while
        ``allow_weight_outside_reference`` is false.
    """
    taxon_totals = {}
    for seq in reference_sequences:
        taxon_totals[reference_taxonomy[seq.metadata['id']]] = 0.

    if normalise:
        # Return value is ignored, so this relies on `norm` acting in place.
        samples.norm()

    tax_map = taxonomy_classification['Taxon']
    try:
        taxa = [tax_map[s] for s in samples.ids(axis='observation')]
    except KeyError as s:
        raise ValueError(str(s) + ' not in taxonomy_classification')

    if not (allow_weight_outside_reference
            or set(taxa).issubset(taxon_totals)):
        raise ValueError(
            'taxonomy_classification does not match reference_taxonomy')

    for taxon, count in zip(taxa, samples.sum('observation')):
        # Taxa outside the reference are silently skipped.
        if taxon in taxon_totals:
            taxon_totals[taxon] += count

    taxa, weights = zip(*taxon_totals.items())
    weights = array(weights)
    weights /= weights.sum()
    uniform_share = unobserved_weight / len(weights)
    weights = (1. - unobserved_weight) * weights + uniform_share
    weights /= weights.sum()

    # Column vector: one row per taxon, one 'Weight' sample.
    return biom.Table(weights[None].T, taxa, sample_ids=['Weight'])

Example #21

0

Show file

File: utils.py Project: qiyunzhu/qp-shogun

def import_shogun_biom(f,
                       annotation_table=None,
                       annotation_type=None,
                       names_to_taxonomy=False):
    """Load a tab-separated table into a biom ``Table``.

    Parameters
    ----------
    f : path or file-like
        Tab-separated input whose first column holds the observation ids.
    annotation_table : optional
        When given, parsed by the function selected through
        ``annotation_type`` and attached as observation metadata.
    annotation_type : str, optional
        ``'module'``, ``'pathway'`` or ``'enzyme'``.
    names_to_taxonomy : bool
        When true, each observation id is split on ``';'`` and stored as
        its ``'taxonomy'`` metadata.
    """
    import_funcs = {
        'module': shogun_parse_module_table,
        'pathway': shogun_parse_pathway_table,
        'enzyme': shogun_parse_enzyme_table
    }

    frame = pd.read_csv(f, sep='\t', index_col=0)
    observation_ids = [str(name) for name in frame.index]
    sample_ids = [str(name) for name in frame.columns]
    bt = Table(frame.values,
               observation_ids=observation_ids,
               sample_ids=sample_ids)

    if names_to_taxonomy:
        taxonomy_md = {}
        for obs_id in bt.ids(axis='observation'):
            taxonomy_md[obs_id] = {'taxonomy': obs_id.split(';')}
        bt.add_metadata(taxonomy_md, axis='observation')

    if annotation_table is not None:
        parse_annotations = import_funcs[annotation_type]
        bt.add_metadata(parse_annotations(annotation_table),
                        axis='observation')

    return bt

Example #22

0

Show file

File: _countVectors.py Project: ucsd-cmi/q2-feature-engineering

def _map_observations(table: biom.Table) -> biom.Table:
    """Return a copy of ``table`` whose observation ids use spaces
    instead of underscores."""
    id_map = {obs_id: obs_id.replace('_', ' ')
              for obs_id in table.ids('observation')}
    return table.update_ids(id_map=id_map, axis='observation', inplace=False)

Example #23

0

Show file

File: _filter.py Project: qiime2/q2-phylogeny

def filter_table(table: biom.Table, tree: skbio.TreeNode) -> biom.Table:
    """ Drop every feature id of ``table`` that is not a tip name in ``tree``.

    The table is filtered in place and the same object is returned.
    """
    tip_ids = {tip.name for tip in tree.tips()}
    # Intersect with the table's own ids so only existing ids are requested.
    ids_to_keep = tip_ids & set(table.ids(axis='observation'))
    table.filter(ids_to_keep, axis='observation', inplace=True)
    return table

Example #24

0

Show file

File: test_util.py Project: biocore/American-Gut

    def test_collapse_full(self):
        """collapse_full(table) matches the expected one-sample table."""
        obs = collapse_full(table)
        expected_values = array([[0.00769230769231], [0.0282051282051],
                                 [0.0487179487179], [0.0692307692308],
                                 [0.0897435897436], [0.110256410256],
                                 [0.130769230769], [0.151282051282],
                                 [0.171794871795], [0.192307692308]])
        exp = Table(expected_values, observ_ids, ['average'],
                    observation_metadata=observ_metadata)

        # Values are compared approximately, row by row.
        for row in range(10):
            assert_almost_equal(obs[row, 0],  exp[row, 0])
        self.assertEqual(obs.ids(), exp.ids())
        self.assertItemsEqual(obs.ids('observation'), exp.ids('observation'))

        # Observation metadata must survive the collapse.
        obs_meta = [md for _, _, md in obs.iter(axis='observation')]
        self.assertItemsEqual(obs_meta, observ_metadata)

Example #25

0

Show file

File: _filter.py Project: andrewsanchez/q2-phylogeny

def filter_table(table: biom.Table, tree: skbio.TreeNode) -> biom.Table:
    """ Keep only the features of ``table`` that are tips of ``tree``.

    Note that ``table`` itself is modified (in-place filter) and returned.
    """
    feature_ids = set(table.ids(axis='observation'))
    # Restrict the tip names to ids that actually occur in the table.
    ids_to_keep = feature_ids.intersection(t.name for t in tree.tips())
    table.filter(ids_to_keep, axis='observation', inplace=True)
    return table

Example #26

0

Show file

    def test_collapse_full(self):
        """Check values, ids and metadata of ``collapse_full(table)``."""
        obs = collapse_full(table)
        exp = Table(
            array([[0.00769230769231], [0.0282051282051],
                   [0.0487179487179], [0.0692307692308],
                   [0.0897435897436], [0.110256410256],
                   [0.130769230769], [0.151282051282],
                   [0.171794871795], [0.192307692308]]),
            observ_ids,
            ['average'],
            observation_metadata=observ_metadata)

        n_rows = 10
        for idx in range(n_rows):
            assert_almost_equal(obs[idx, 0], exp[idx, 0])

        self.assertEqual(obs.ids(), exp.ids())
        self.assertItemsEqual(obs.ids('observation'), exp.ids('observation'))

        # Gather the metadata of each observation in iteration order.
        obs_meta = [entry[2] for entry in obs.iter(axis='observation')]
        self.assertItemsEqual(obs_meta, observ_metadata)

Example #27

0

Show file

def pad_features_in_test_data(train_table: biom.Table,
                              test_table: biom.Table) -> biom.Table:
    '''
    Align the features of the test table with those of the train table.

    Features that only exist in the test table are removed, and features
    that only exist in the train table are added to it as all-zero rows.

    Parameters
    ----------
    train_table: biom.Table
        A biom table with train data
    test_table: biom.Table
        A biom table with test data. It is filtered in place.

    Returns
    -------
    new_test_biom: biom.Table
        A biom table with the updated test data with identical set of
        features in the train table.

    Raises
    ------
    ValueError
        If the two tables do not share a single feature.
    '''
    train_feature_ids = train_table.ids(axis='observation')
    test_feature_set = set(test_table.ids(axis='observation'))
    sample_ids = test_table.ids(axis='sample')

    # biom tables are shaped (observations, samples), so the number of
    # samples is taken from the sample ids rather than from shape[0].
    n_samples = len(sample_ids)

    # Walk the train ids in order so the padding rows are deterministic.
    train_uniq_f = [fid for fid in train_feature_ids
                    if fid not in test_feature_set]
    shared_f = set(train_feature_ids) & test_feature_set

    # create a zero matrix for all features uniquely existed in the train table
    padding_table = biom.Table(np.zeros((len(train_uniq_f), n_samples)),
                               train_uniq_f, sample_ids)
    # filter out features that don't exist in the train table in the test table
    test_table.filter(shared_f, axis='observation')

    # Features live on axis 0 of the table's shape.
    n_filtered_features = test_table.shape[0]
    if n_filtered_features == 0:
        raise ValueError('No feature overlap between train and test table! '
                         'Check the feature-format consistency between '
                         'tables!')
    # merge the two tables
    new_test_table = test_table.merge(padding_table)

    return new_test_table

Example #28

0

Show file

def plot(output_dir,
         table: biom.Table,
         metadata: q2.Metadata,
         case_where: str,
         control_where: str,
         feature_tree: skbio.TreeNode = None):
    """Write the plot assets plus packed table/tree data to ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Destination directory; a ``data`` sub-directory is created in it.
    table : biom.Table
        Feature table. It is filtered to the case/control samples and
        emptied observations are removed (the results of ``filter`` and
        ``remove_empty`` are not reassigned, so this relies on them acting
        in place).
    metadata : q2.Metadata
        Sample metadata queried with the two ``where`` expressions.
    case_where, control_where : str
        Expressions selecting case and control samples.
    feature_tree : skbio.TreeNode, optional
        Tree sheared to the table's features; an empty ``TreeNode`` is used
        when omitted.
    """
    if feature_tree is not None:
        # The tree is optional, so only dump it when one was supplied;
        # calling .write on None would raise AttributeError.
        # NOTE(review): hard-coded debug path - consider removing.
        with open('/tmp/tree.nwk', 'w') as fh:
            feature_tree.write(fh)

    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    # Entries whose 'children' value is 0 are used as the tip set.
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))

    # Order observations like the tips, and samples cases-first.
    tip_order = np.asarray(tree_data['names'])[idx]
    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')

Example #29

0

Show file

File: _method.py Project: nervous-laughter/q2-diversity

def beta(table: biom.Table, metric: str) -> skbio.DistanceMatrix:
    """Compute non-phylogenetic beta diversity between all samples.

    Raises ``ValueError`` when ``metric`` is not listed by
    ``non_phylogenetic_metrics()``. Counts are cast to ``int`` and
    transposed before being handed to scikit-bio.
    """
    allowed = non_phylogenetic_metrics()
    if metric not in allowed:
        raise ValueError("Unknown metric: %s" % metric)

    dense = table.matrix_data.toarray()
    counts = dense.astype(int).T
    sample_ids = table.ids(axis='sample')
    return skbio.diversity.beta_diversity(
        metric=metric, counts=counts, ids=sample_ids)

Example #30

0

Show file

def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
    """Compute ``metric`` per sample with scikit-bio's ``alpha_diversity``.

    The returned series is named after ``metric``.
    """
    # biom.Table appears to default to float, while some metrics
    # (e.g. ace, lladser_pe, michaelis_menten_fit) require ints, so the
    # sparse matrix is cast before it is densified and transposed.
    int_matrix = table.matrix_data.astype(int)
    counts = int_matrix.toarray().T

    result = skbio.diversity.alpha_diversity(
        metric=metric, counts=counts, ids=table.ids(axis='sample'))
    result.name = metric
    return result

Example #31

0

Show file

File: beta.py Project: wasade/q2-diversity-lib

def jaccard(table: biom.Table, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute the pairwise Jaccard distance between all samples.

    ``n_jobs`` is forwarded to ``sklearn.metrics.pairwise_distances``
    through scikit-bio's ``beta_diversity``.
    """
    options = {
        'metric': 'jaccard',
        # Transposed so each row belongs to one sample id.
        'counts': table.matrix_data.toarray().T,
        'ids': table.ids(axis='sample'),
        'validate': True,
        'pairwise_func': sklearn.metrics.pairwise_distances,
        'n_jobs': n_jobs,
    }
    return skbio.diversity.beta_diversity(**options)

Example #32

0

Show file

def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):
    """Fit a ``MultRegression`` model and return its coefficients.

    Parameters
    ----------
    table : biom.Table
        Feature table; matched and filtered against ``metadata``.
    metadata : Metadata
        Sample metadata; converted to a dataframe first.
    formula : str
        Forwarded to ``match_and_filter`` to build the design matrix.
    training_column : str, optional
        Forwarded to ``match_and_filter`` and ``split_training``.
    num_random_test_examples : int
        Forwarded to ``match_and_filter`` and ``split_training``.
    epoch : int
        Passed to ``model.fit``.
    batch_size, beta_prior, learning_rate, clipnorm
        Model hyper-parameters (``beta_prior`` is passed as ``beta_mean``).
    min_sample_count, min_feature_count : int
        Filtering thresholds forwarded to ``match_and_filter``.
    summary_interval : int
        Passed to ``model.fit``.

    Returns
    -------
    pd.DataFrame
        Coefficients indexed by observation id, one column per design
        matrix column.
    """

    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(table, metadata, formula,
                                               training_column,
                                               num_random_test_examples,
                                               min_sample_count,
                                               min_feature_count)

    # convert to dense representation
    # NOTE(review): DataFrame.to_dense() was removed in newer pandas
    # releases - confirm the pinned pandas version.
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(dense_table, metadata,
                                                  design, training_column,
                                                  num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    # TensorFlow 1.x style graph/session; both are closed on exit.
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        model.fit(epoch=epoch,
                  summary_interval=summary_interval,
                  checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # A zero column is prepended to model.B before the clr_inv/clr round
    # trip. model.p and model.B are read after the session has closed;
    # presumably they are plain arrays by then - TODO confirm.
    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    beta_ = pd.DataFrame(
        beta_.T,
        columns=md_ids,
        index=obs_ids,
    )
    return beta_

Example #33

0

Show file

File: _data_preprocess.py Project: ucsd-cmi/q2-feature-engineering

def reorder_feature_table(query_table: biom.Table, reference_table: biom.Table) -> biom.Table:
    """Align the features of a query table with those of a reference table.

    The two tables are merged, the result is restricted to the features
    (observations) of ``reference_table`` and to the samples of
    ``query_table``, and the features are then put in the same order as in
    ``reference_table``.

    Parameters
    ----------
    query_table : biom.Table
        Table whose samples are kept in the result.
    reference_table : biom.Table
        Table that defines which features are kept and their order.

    Returns
    -------
    biom.Table
        The query samples described over the reference features, in
        reference feature order.

    Raises
    ------
    ValueError
        If a sample ID is present in both tables.
    """
    query_samples = query_table.ids()
    ref_samples = set(reference_table.ids())
    for sample in query_samples:
        if sample in ref_samples:
            # Build a single message string; passing several positional
            # arguments to ValueError renders the message as a tuple.
            raise ValueError(
                "The sample %s from your reference data found in your query "
                "one, while the two tables should be disjoint." % sample
            )

    ref_features = reference_table.ids('observation')
    merged_table = reference_table.merge(query_table)
    # The axis has to be named explicitly: biom's filter/sort_order default
    # to the sample axis, which would match feature IDs against sample IDs.
    merged_table.filter(ids_to_keep=ref_features, axis='observation')
    merged_table.filter(ids_to_keep=query_samples, axis='sample')
    # No second disjointness check is needed: only the query samples, which
    # were validated above, remain in the merged table.

    return merged_table.sort_order(ref_features, axis='observation')

Example #34

0

Show file

File: _method.py Project: jairideout/diversity

def alpha(table: biom.Table, metric: str) -> pd.Series:
    """Compute a non-phylogenetic alpha diversity metric for every sample.

    Raises ``ValueError`` when ``metric`` is not one of the values returned
    by ``non_phylogenetic_metrics()``. The returned series is named after
    the metric.
    """
    if metric in non_phylogenetic_metrics():
        # Transpose so that each row holds the integer counts of one sample.
        per_sample_counts = table.matrix_data.toarray().astype(int).T
        diversity = skbio.diversity.alpha_diversity(
            metric=metric,
            counts=per_sample_counts,
            ids=table.ids(axis='sample'))
        diversity.name = metric
        return diversity

    raise ValueError("Unknown metric: %s" % metric)

Example #35

0

Show file

def shannon_entropy(table: biom.Table,
                    drop_undefined_samples: bool = False) -> pd.Series:
    """Compute the Shannon metric for every sample of ``table``.

    When ``drop_undefined_samples`` is truthy, the counts and sample IDs are
    first passed through ``_drop_undefined_samples`` with
    ``minimum_nonzero_elements=1``. The returned series is named
    ``'shannon_entropy'``.
    """
    # Transposed matrix: one row per sample.
    matrix = table.matrix_data.toarray().T
    ids = table.ids(axis='sample')

    if drop_undefined_samples:
        matrix, ids = _drop_undefined_samples(
            matrix, ids, minimum_nonzero_elements=1)

    entropy = skbio.diversity.alpha_diversity(
        metric='shannon', counts=matrix, ids=ids)
    entropy.name = 'shannon_entropy'
    return entropy

Example #36

0

Show file

File: _method.py Project: nervous-laughter/q2-diversity

def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Raises ``ValueError`` for a metric not listed by
    ``phylogenetic_metrics()``. A ``MissingNodeError`` from scikit-bio is
    re-raised with its message reworded to use this function's terminology
    (``feature_ids`` / ``phylogeny``).
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)

    diversity_args = dict(
        metric=metric,
        # One row per sample, integer counts.
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny,
    )

    try:
        return skbio.diversity.beta_diversity(**diversity_args)
    except skbio.tree.MissingNodeError as err:
        reworded = str(err).replace('otu_ids', 'feature_ids')
        reworded = reworded.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)

Example #37

0

Show file

File: _method.py Project: jairideout/diversity

def alpha_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                       metric: str) -> pd.Series:
    """Compute a phylogenetic alpha diversity metric for every sample.

    Raises ``ValueError`` for a metric not listed by
    ``phylogenetic_metrics()``. A ``MissingNodeError`` from scikit-bio is
    re-raised with its message reworded to use this function's terminology
    (``feature_ids`` / ``phylogeny``). The returned series is named after
    the metric.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)

    diversity_args = dict(
        metric=metric,
        # One row per sample, integer counts.
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny,
    )

    try:
        diversity = skbio.diversity.alpha_diversity(**diversity_args)
    except skbio.tree.MissingNodeError as err:
        reworded = str(err).replace('otu_ids', 'feature_ids')
        reworded = reworded.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)

    diversity.name = metric
    return diversity

Example #38

0

Show file

File: _method.py Project: gregcaporaso/diversity

def beta(table: biom.Table, metric: str, n_jobs: int=1)-> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    Raises ``ValueError`` when ``metric`` is not listed by
    ``non_phylogenetic_metrics()`` or when ``table`` is empty. Pairwise
    distances are delegated to ``sklearn.metrics.pairwise_distances`` with
    ``n_jobs`` forwarded to it.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity_args = dict(
        metric=metric,
        # One row per sample, integer counts.
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
    return skbio.diversity.beta_diversity(**diversity_args)

Example #39

0

Show file

File: _method.py Project: qiime2/q2-diversity

def beta(table: biom.Table, metric: str,
         pseudocount: int = 1, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    ``'aitchison'`` and ``'canberra_adkins'`` are implemented locally as
    callables; every other metric name is handed to scikit-bio unchanged.
    For ``'aitchison'`` the ``pseudocount`` is added to all counts first.

    Raises ``ValueError`` when ``metric`` is not listed by
    ``non_phylogenetic_metrics()`` or when ``table`` is empty.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    # One row per sample.
    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        # Euclidean distance between the clr-transformed vectors.
        clr_x = clr(x)
        clr_y = clr(y)
        return euclidean(clr_x, clr_y)

    def canberra_adkins(x, y, **kwds):
        if (x < 0).any() or (y < 0).any():
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")

        # Only positions that are non-zero in at least one vector contribute.
        present = ((x > 0) | (y > 0))
        xs = x[present]
        ys = y[present]
        n_present = present.sum()

        return (1. / n_present) * np.sum(np.abs(xs - ys) / (xs + ys))

    distance = metric
    if metric == 'aitchison':
        counts += pseudocount
        distance = aitchison
    elif metric == 'canberra_adkins':
        distance = canberra_adkins

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=distance,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs
    )

Example #40

0

Show file

File: _filter.py Project: jakereps/q2-feature-table

def filter_seqs(data: pd.Series, table: biom.Table=None,
                metadata: qiime2.Metadata=None, where: str=None,
                exclude_ids: bool=False) -> pd.Series:
    """Filter ``data`` by the feature IDs of a table or of metadata.

    Exactly one of ``table`` and ``metadata`` must be given. The IDs come
    from the table's observation axis, or from ``metadata.get_ids`` with
    the optional ``where`` clause. With ``exclude_ids=True`` the selection
    is inverted, keeping the entries whose ID is *not* listed.

    Raises ``ValueError`` when both or neither source is given, or when
    nothing is left after filtering.
    """
    have_table = table is not None
    have_metadata = metadata is not None

    if have_table and have_metadata:
        raise ValueError('Filtering with metadata and filtering with a table '
                         'are mutually exclusive.')
    if not have_table and not have_metadata:
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')

    if have_table:
        selected_ids = table.ids(axis='observation')
    else:
        # Feature IDs missing from the metadata are not checked for:
        # dropping them is the purpose of this method.
        selected_ids = metadata.get_ids(where=where)

    if exclude_ids is True:
        selected_ids = set(data.index) - set(selected_ids)

    kept = data[data.index.isin(selected_ids)]
    if kept.empty is True:
        raise ValueError('All features were filtered out of the data.')
    return kept

Example #41

0

Show file

File: _visualizer.py Project: gregcaporaso/diversity

def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode=None, metrics: set=None,
                      metadata: qiime2.Metadata=None, min_depth: int=1,
                      steps: int=10, iterations: int=10) -> None:
    """Write the alpha rarefaction visualization into ``output_dir``.

    For each metric a ``<metric>.csv`` file and one or more ``.jsonp`` files
    are written, then the ``index.html`` template is rendered and the
    ``dist`` asset directory is copied next to it.

    Parameters
    ----------
    output_dir : str
        Directory that receives every generated file.
    table : biom.Table
        Feature table to rarefy; must not be empty.
    max_depth, min_depth : int
        Depth bounds; ``max_depth`` must be greater than ``min_depth`` and
        not exceed the largest per-sample total of ``table``.
    phylogeny : skbio.TreeNode, optional
        Needed whenever a phylogenetic metric is requested.
    metrics : set, optional
        Metric names. Defaults to ``{'observed_otus', 'shannon'}``, plus
        ``'faith_pd'`` when ``phylogeny`` is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; must cover every sample ID of ``table``.
    steps, iterations : int
        Forwarded to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        On an empty ``metrics`` set, a phylogenetic metric without a
        phylogeny, inconsistent depth/steps arguments, an empty table, or
        table samples missing from ``metadata``.
    """

    # Resolve the metric set and validate it against the phylogeny argument.
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    # Validate the depth range and the number of steps that fit into it.
    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    # Every sample of the table must be described by the metadata.
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    # NOTE: the loop variable rebinds `data`; inside the loop it refers to the
    # per-metric value, not to the mapping being iterated.
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            # Without metadata a single per-sample summary is written.
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            # Columns without any value are dropped and reported separately.
            # `empty_columns` is recomputed on every iteration, so the value
            # rendered below is the one from the last metric processed.
            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            # Two-level column labels (name, '') are built before the join
            # with the rarefaction data.
            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            # One jsonp file per remaining metadata category.
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        # Per-metric CSV: column labels are flattened to
        # 'depth-<d>_iter-<i>' and, when available, the metadata is joined in.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    # Render the landing page and copy its static assets.
    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))

Example #42

0

Show file

File: _beta_rarefaction.py Project: jakereps/q2-diversity

def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int=10,
                     phylogeny: skbio.TreeNode=None,
                     correlation_method: str='spearman',
                     color_scheme: str='BrBG') -> None:
    """Write the beta rarefaction visualization into ``output_dir``.

    Several rarefied distance matrices are computed; the first is treated as
    the primary result and the rest as support. A correlation heatmap, a
    sample-clustering tree and an Emperor plot are written, then the HTML
    templates are rendered.

    Raises ``ValueError`` when a phylogenetic metric is requested without a
    phylogeny, or when ``table`` is empty.
    """
    # Choose the distance function; phylogenetic metrics need the tree bound.
    beta_func = beta
    if metric in phylogenetic_metrics():
        if phylogeny is None:
            raise ValueError("A phylogenetic metric (%s) was requested, "
                             "but a phylogenetic tree was not provided. "
                             "Phylogeny must be provided when using a "
                             "phylogenetic diversity metric." % metric)
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Restrict the metadata to the samples of the feature table; this also
    # ensures that every sample of the table is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))

    matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary, support = matrices[0], matrices[1:]

    # Heatmap figure and the underlying correlation table.
    figure, correlations = _make_heatmap(
        matrices, metric, correlation_method, color_scheme)
    figure.savefig(os.path.join(output_dir, 'heatmap.svg'))
    correlations_fp = os.path.join(
        output_dir, 'rarefaction-iteration-correlation.tsv')
    correlations.to_csv(correlations_fp, sep='\t')

    # Sample clustering tree.
    tree_fp = os.path.join(
        output_dir, 'sample-clustering-%s.tre' % clustering_method)
    _cluster_samples(primary, support, clustering_method).write(tree_fp)

    # Standalone Emperor plot together with its support files.
    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    pages = ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']
    templates = [os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
                 for page in pages]

    tabs = [{'url': url, 'title': title}
            for url, title in (('emperor.html', 'PCoA'),
                               ('heatmap.html', 'Heatmap'),
                               ('tree.html', 'Clustering'))]
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': tabs,
    }

    q2templates.render(templates, output_dir, context=context)

Example #43

0

Show file

File: _visualizer.py Project: qiime2/q2-diversity

def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write the alpha rarefaction visualization into ``output_dir``.

    For each metric a ``<metric>.csv`` file and one or more ``.jsonp`` files
    are written, then the ``index.html`` template is rendered and the
    ``dist`` asset directory is copied next to it.

    Parameters
    ----------
    output_dir : str
        Directory that receives every generated file.
    table : biom.Table
        Feature table to rarefy; must not be empty.
    max_depth, min_depth : int
        Depth bounds; ``max_depth`` must be greater than ``min_depth`` and
        not exceed the largest per-sample total of ``table``.
    phylogeny : skbio.TreeNode, optional
        Needed whenever a phylogenetic metric is requested.
    metrics : set, optional
        Metric names. Defaults to ``{'observed_otus', 'shannon'}``, plus
        ``'faith_pd'`` when ``phylogeny`` is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; it is filtered to the table's sample IDs and to
        its categorical columns that are not entirely missing.
    steps, iterations : int
        Forwarded to ``_compute_rarefaction_data``; ``steps`` is also passed
        to the template context.

    Raises
    ------
    ValueError
        On an empty ``metrics`` set, a phylogenetic metric without a
        phylogeny, inconsistent depth/steps arguments, an empty table, or
        metadata that is empty after column filtering.
    """

    # Resolve the metric set and validate it against the phylogeny argument.
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    # Validate the depth range and the number of steps that fit into it.
    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        # Two-level column labels (name, '') are built before the join with
        # the rarefaction data further below.
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    # NOTE: the loop variable rebinds `data`; inside the loop it refers to the
    # per-metric value, not to the mapping being iterated.
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            # Without metadata a single per-sample summary is written.
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            # One jsonp file per retained metadata column.
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        # Per-metric CSV: column labels are flattened to
        # 'depth-<d>_iter-<i>' and, when available, the metadata is joined in.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    # Render the landing page (file names are quoted again for the template)
    # and copy its static assets.
    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))

Example #44

0

Show file

File: 54.py Project: tkosciol/qiita

def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need: (1) the
    # samples and the artifacts they come from and (2) whether the samples
    # were renamed or not. (1) is in the database, but we need to infer (2)
    # from the existing rarefied BIOM table.

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        # Maps each original sample id to '<artifact id>.<sample id>'
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # The biom table is assumed to always exist for the artifact, so
            # biom_fp is not checked for being undefined
            biom_table = load_table(biom_fp)
            # Keep only the analysis samples actually present in this table
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table: if the
        # rarefied table has sample ids that the new table lacks, the samples
        # are taken to have been renamed
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            # NOTE(review): the positional arguments are presumably axis,
            # strict and inplace -- confirm against biom's Table.update_ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file: the table is written to
        # <db files base dir>/<BIOM mountpoint>/<artifact id>/biom_table.biom
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        # Attach the newly registered filepath to the artifact
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id

Example #45

0

Show file

File: _transformer.py Project: BenKaehler/q2-types

def _table_to_dataframe(table: biom.Table) -> pd.DataFrame:
    """Convert a BIOM table into a dense samples-by-features data frame.

    Rows are labelled with the sample IDs and columns with the observation
    (feature) IDs of ``table``.
    """
    return pd.DataFrame(
        # Transposed dense matrix: one row per sample.
        table.matrix_data.toarray().T,
        index=table.ids(axis='sample'),
        columns=table.ids(axis='observation'),
    )