Example #1
# assumed imports; get_stats is the function under test in the source module
from numpy import array, log
from numpy.testing import assert_almost_equal
from biom import Table


def test_get_stats():
    def entropy(p):
        p = p[p != 0]
        return -(p * log(p)).sum()

    data = array([[0, 0, 1], [1, 3, 42]], dtype=float)
    table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    h, jsd = get_stats(table)

    p = data.sum(axis=1)
    p /= p.sum()
    assert_almost_equal(h, entropy(p))
    avg_h = 0
    weights = [col.sum() / data.sum() for col in data.T]
    for w, p in zip(weights, data.T):
        p /= p.sum()  # note: p is a view, so this normalizes data's columns in place
        avg_h += w * entropy(p)
    assert_almost_equal(jsd, h - avg_h)

    table.norm()
    h, jsd = get_stats(table)

    data /= data.sum(axis=0)  # effectively a no-op: the loop above already normalized the columns
    p = data.sum(axis=1) / 3.
    assert_almost_equal(h, entropy(p))
    avg_h = sum(entropy(p) / 3. for p in data.T)
    assert_almost_equal(jsd, h - avg_h)
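For context, a minimal sketch of what table.norm() does in the second half of this test: biom's default is per-sample (per-column) normalization, so each sample's counts are rescaled to sum to 1.

import numpy as np
from biom import Table

t = Table(np.array([[0., 0., 1.], [1., 3., 42.]]),
          ['O1', 'O2'], ['S1', 'S2', 'S3'])
t.norm()  # in place; axis='sample' by default
print(t.matrix_data.toarray())  # each column now sums to 1: S3 becomes [1/43, 42/43]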
Example #2
# assumed imports for this snippet
import biom
from numpy import array
from pandas import DataFrame, Series
from q2_types.feature_data import DNAIterator


def generate_class_weights(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        samples: biom.Table, taxonomy_classification: DataFrame,
        unobserved_weight: float = 1e-6, normalise: bool = False,
        allow_weight_outside_reference: bool = False) \
        -> biom.Table:
    weights = {
        reference_taxonomy[seq.metadata['id']]: 0.
        for seq in reference_sequences
    }
    if normalise:
        samples.norm()

    tax_map = taxonomy_classification['Taxon']
    try:
        taxa = [tax_map[s] for s in samples.ids(axis='observation')]
    except KeyError as s:
        raise ValueError(str(s) + ' not in taxonomy_classification')
    if not allow_weight_outside_reference and not set(taxa).issubset(weights):
        raise ValueError(
            'taxonomy_classification does not match reference_taxonomy')

    for taxon, count in zip(taxa, samples.sum('observation')):
        if taxon in weights:
            weights[taxon] += count
    taxa, weights = zip(*weights.items())
    weights = array(weights)
    weights /= weights.sum()
    weights = \
        (1. - unobserved_weight) * weights + unobserved_weight / len(weights)
    weights /= weights.sum()

    return biom.Table(weights[None].T, taxa, sample_ids=['Weight'])
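The closing lines above blend the observed taxon frequencies with a uniform floor so that taxa never seen in the samples still receive a small nonzero weight. A minimal numeric sketch (the counts are hypothetical):

import numpy as np

weights = np.array([30., 10., 0.])  # raw per-taxon counts (hypothetical)
unobserved_weight = 1e-6

weights /= weights.sum()
weights = (1. - unobserved_weight) * weights + unobserved_weight / len(weights)
weights /= weights.sum()
print(weights)  # the zero-count taxon now gets ~3.3e-7 instead of exactly 0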
Example #3
# assumed imports; values_to_colors is a color-mapping helper from the source
# module
import biom


def format_barplots(table: biom.Table, normalize: bool):
    barplots = []
    barplots.append('DATASET_MULTIBAR')
    barplots.append('SEPARATOR TAB')
    barplots.append('DATASET_LABEL\tRelative Abundance')
    if normalize:
        table = table.norm(axis='observation', inplace=False)
    table = table.to_dataframe(dense=True)

    field_labels = list(table.columns)
    field_colors = values_to_colors(field_labels, 'husl').values()

    barplots.append('FIELD_COLORS\t' + '\t'.join(field_colors))
    barplots.append('FIELD_LABELS\t' + '\t'.join(field_labels))

    barplots.append('LEGEND_TITLE\tRelative Abundance')
    barplots.append('LEGEND_SHAPES\t' + '\t'.join(['1'] * len(field_colors)))
    barplots.append('LEGEND_COLORS\t' + '\t'.join(field_colors))
    barplots.append('LEGEND_LABELS\t' + '\t'.join(field_labels))
    barplots.append('WIDTH\t100')

    barplots.append('DATA')
    table = table.reset_index()
    for idx in table.index:
        barplots.append('\t'.join(table.loc[idx].apply(str)))

    return '\n'.join(barplots)
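A hypothetical call, assuming format_barplots and the values_to_colors helper are importable from the source module; the table contents and output file name are illustrative:

import numpy as np
import biom

table = biom.Table(np.array([[1., 3.], [4., 2.]]),
                   ['taxonA', 'taxonB'], ['sampleA', 'sampleB'])
annotation = format_barplots(table, normalize=True)
with open('itol_barplots.txt', 'w') as fh:  # iTOL multi-bar annotation file
    fh.write(annotation)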
Example #4
# assumed imports; parallel_make_distance_matrix is a helper in the source
# module
import biom
import numpy as np
import skbio
from scipy.sparse import dok_matrix


def cscs(features: biom.Table,
         css_edges: str,
         cosine_threshold: float = 0.6,
         normalization: bool = True,
         weighted: bool = True) -> skbio.DistanceMatrix:
    observationids = {
        x: index
        for index, x in enumerate(features.ids(axis='observation'))
    }
    edgesdok = dok_matrix((features.shape[0], features.shape[0]),
                          dtype=np.float32)
    # build a symmetric similarity matrix from the edge file, keeping only
    # edges at or above the cosine threshold
    with open(css_edges, "r") as fh:
        for line in fh:
            if "CLUSTERID1" in line:  # skip the header row
                continue
            linesplit = line.split("\t")
            if float(linesplit[4]) < cosine_threshold:
                edgesdok[observationids[linesplit[0]],
                         observationids[linesplit[1]]] = 0.0
            else:
                edgesdok[observationids[linesplit[0]],
                         observationids[linesplit[1]]] = float(linesplit[4])
                edgesdok[observationids[linesplit[1]],
                         observationids[linesplit[0]]] = float(linesplit[4])
    edgesdok.setdiag(1)  # each feature is fully similar to itself

    if normalization:
        features = features.norm(axis='sample', inplace=False)
    if not weighted:
        # pa() converts to presence/absence; the original assigned the bound
        # method instead of calling it
        features = features.pa(inplace=False)  # TODO: make new option in cscs()

    sample_names = features.ids()
    cscs = parallel_make_distance_matrix(features, edgesdok, sample_names)
    cscs = 1 - cscs
    return skbio.DistanceMatrix(cscs, ids=cscs.index)
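From the parsing logic above, css_edges is expected to be a tab-separated file with a header row containing CLUSTERID1, feature IDs in the first two columns, and the cosine score in the fifth column. A hypothetical file (column names other than CLUSTERID1 are illustrative):

edges = "\n".join([
    "CLUSTERID1\tCLUSTERID2\tcol3\tcol4\tCosine",
    "feat_1\tfeat_2\tx\tx\t0.83",  # kept: 0.83 >= 0.6
    "feat_1\tfeat_3\tx\tx\t0.41",  # zeroed: below the threshold
])
with open("css_edges.tsv", "w") as fh:
    fh.write(edges + "\n")
# dm = cscs(features, "css_edges.tsv", cosine_threshold=0.6)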
Example #5
    def __init__(self,
                 table: biom.Table,
                 features: pd.DataFrame,
                 variances: biom.Table = None,
                 formatter: Optional['Formatter'] = None):
        """Establish the taxonomy data

        Parameters
        ----------
        table : biom.Table
            Relative abundance data per sample or collapsed into higher order
            entries (e.g., abx in the past year)
        features : pd.DataFrame
            DataFrame relating an observation to a Taxon
        variances : biom.Table, optional
            Variation information about a taxon within a label.
        """
        self._table = table.norm(inplace=False)
        self._group_id_lookup = set(self._table.ids())
        self._feature_id_lookup = set(self._table.ids(axis='observation'))
        self._feature_order = self._table.ids(axis='observation')
        self._features = features
        self._ranks = table.rankdata(inplace=False)

        if variances is None:
            self._variances = biom.Table(np.zeros(self._table.shape),
                                         self._table.ids(axis='observation'),
                                         self._table.ids())
        else:
            self._variances = variances

        if set(self._variances.ids()) != set(self._table.ids()):
            raise DisjointError("Table and variances are disjoint")

        if set(self._variances.ids(axis='observation')) != \
                set(self._table.ids(axis='observation')):
            raise DisjointError("Table and variances are disjoint")

        if set(self._table.ids(axis='observation')) != \
                set(self._features.index):
            raise DisjointError("Table and features are disjoint")

        self._features = self._features.loc[self._feature_order]
        self._variances = self._variances.sort_order(self._feature_order,
                                                     axis='observation')

        if formatter is None:
            formatter: Formatter = GreengenesFormatter()
        self._formatter = formatter

        feature_taxons = self._features
        self._formatted_taxa_names = {
            i: self._formatter.dict_format(lineage)
            for i, lineage in feature_taxons['Taxon'].items()
        }
Example #6
# assumed imports
import numpy as np
from biom import Table


def collapse_full(_bt):
    """Collapse a full biom table to the median of each OTU

    Parameters
    ----------
    _bt : biom.Table
        Table to collapse

    Returns
    -------
    biom.Table
        Collapsed biom table with a single sample containing the median of
        each OTU, normalized.
    """
    num_obs = len(_bt.ids(axis='observation'))
    # one column holding the per-observation medians; the single sample is
    # labeled 'average' in the source even though the values are medians
    table = Table(np.array(
        [np.median(v) for v in _bt.iter_data(axis='observation')]).reshape(
        (num_obs, 1)),
        _bt.ids(axis='observation'), ['average'],
        observation_metadata=_bt.metadata(axis='observation'))
    table.norm(inplace=True)
    return table
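A small usage sketch with hypothetical counts (np and Table as imported above):

bt = Table(np.array([[0., 2., 4.], [1., 1., 1.]]),
           ['O1', 'O2'], ['S1', 'S2', 'S3'])
collapsed = collapse_full(bt)
print(collapsed.ids())                  # ['average']
print(collapsed.matrix_data.toarray())  # medians 2. and 1., normalized to [2/3, 1/3]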
Example #7
# dispatcher, LOG and _read_inputs are module-level helpers in the source
# plugin; imports (biom, numpy as np, pandas as pd, qiime2, sklearn, tempfile,
# shutil, Counter from collections) are assumed
def synthetic_over_sampling(table: biom.Table,
                            metadata: NumericMetadataColumn,
                            concatenate_meta_fp: Str,
                            method: Str = 'SMOTE',
                            k_neighbors: Int = 5,
                            n_jobs: Int = 1,
                            sampling_strategy: Str = 'auto',
                            random_state: Int = 42,
                            output_log_fp: Str = None) -> biom.Table:
    log_fp = tempfile.mktemp()
    print("The log file will be written into", log_fp)

    if log_fp:
        logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_over_sampling')
        logger_ins.info("The parameters used for oversampling are")
        logger_ins.info('k_neighbors:', k_neighbors)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', output_log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)

    if method not in dispatcher:
        raise ValueError('The optional methods for over sampling are',
                         dispatcher.keys(), "instead it received", method)
    cls = dispatcher[method]
    if method != 'RandomOverSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for oversampling"
            )
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    matrix_data = sorted_table.matrix_data.transpose().todense()

    if method == 'ADASYN':
        neigh = sklearn.neighbors.NearestNeighbors(metric='braycurtis',
                                                   n_neighbors=k_neighbors + 1)
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state,
                                n_neighbors=neigh,
                                n_jobs=n_jobs)
    elif method == 'RandomOverSampler':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state)
    else:
        neigh = sklearn.neighbors.NearestNeighbors(metric='braycurtis',
                                                   n_neighbors=k_neighbors + 1)
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                k_neighbors=neigh,
                                random_state=random_state,
                                n_jobs=n_jobs)
    X_resampled, y_resampled = over_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    if np.sum(np.abs(X_resampled[:len(matrix_data), :] - matrix_data)) != 0 or \
            np.sum(y_resampled[:len(matrix_data)] == sorted_metadata) != len(matrix_data):
        raise ValueError(
            "Over sampling method changed the data! Please double check your biom table. The sum of differences "
            "between the generated and original samples is",
            np.sum(np.abs(X_resampled[:len(matrix_data), :] - matrix_data)),
            "(should be 0.0) and the number of "
            "retained labels is",
            np.sum(y_resampled[:len(matrix_data)] == sorted_metadata),
            "while should be", len(sorted_metadata))
    else:
        if log_fp:
            logger_ins.info("The oversampling finished successfully!")
            logger_ins.info(
                "The first", len(matrix_data),
                "samples belong to the original training samples and the "
                "next",
                len(X_resampled) - len(matrix_data),
                "samples belong to the new ones")
            logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method != 'RandomOverSampler':
        dummy_samples = np.asarray(
            list(sorted_table.ids('sample')) + [
                "dummy_sample_" + str(i)
                for i in range(len(X_resampled) - len(matrix_data))
            ])
    else:
        orig_samples = sorted_table.ids('sample')
        dummy_samples = over_sampling_cls.sample_indices_
        samples_counter = Counter(dummy_samples)
        dummy_samples_ = []
        tracking_dict = dict()
        for sample in dummy_samples:
            j = tracking_dict.get(sample, 0)
            if samples_counter[sample] > 1:
                tracking_dict[sample] = j + 1
                sample = str(orig_samples[sample]) + "_" + str(j + 1)
            else:
                sample = str(orig_samples[sample])
            dummy_samples_.append(sample)
        dummy_samples = dummy_samples_

    oversampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    oversampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    oversampled_metadata.index.names = ['#SampleID']
    oversampled_metadata.columns = ['label']
    oversampled_meta = qiime2.Metadata(oversampled_metadata)
    oversampled_meta.save(concatenate_meta_fp)

    if log_fp:
        shutil.copy(log_fp, output_log_fp)

    return oversampled_table
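The resampling itself is delegated to imbalanced-learn; a minimal sketch of the equivalent step outside the QIIME 2 wrapper, with hypothetical data:

import numpy as np
from imblearn.over_sampling import SMOTE

X = np.random.rand(20, 5)               # samples x features
y = np.array([0] * 15 + [1] * 5)        # imbalanced labels
X_res, y_res = SMOTE(k_neighbors=4, random_state=42).fit_resample(X, y)
print(X_res.shape, np.bincount(y_res))  # (30, 5) and [15 15]: classes balanced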
Example #8
import biom


def relative_frequency(table: biom.Table) -> biom.Table:
    """ Convert feature table in-place from frequencies to relative frequencies
    """
    table.norm(axis='sample', inplace=True)
    return table
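A minimal usage sketch; norm() mutates the input table, and the same object is returned for pipeline convenience:

import numpy as np

table = biom.Table(np.array([[1., 3.], [4., 2.]]), ['O1', 'O2'], ['S1', 'S2'])
rel = relative_frequency(table)
assert rel is table               # normalized in place
print(rel.matrix_data.toarray())  # columns sum to 1: [[0.2, 0.6], [0.8, 0.4]]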
Example #9
    def __init__(self,
                 table: biom.Table,
                 features: pd.DataFrame,
                 variances: biom.Table = None,
                 formatter: Optional['Formatter'] = None,
                 rank_level: int = 1):
        """Establish the taxonomy data

        Parameters
        ----------
        table : biom.Table
            Relative abundance data per sample or collapsed into higher order
            entries (e.g., abx in the past year)
        features : pd.DataFrame
            DataFrame relating an observation to a Taxon
        variances : biom.Table, optional
            Variation information about a taxon within a label.
        rank_level : int
            The taxonomic level (depth) to compute ranks over. Level 0 is
            domain, level 1 is phylum, etc.
        """
        self._table = table.norm(inplace=False)
        self._group_id_lookup = set(self._table.ids())
        self._feature_id_lookup = set(self._table.ids(axis='observation'))
        self._feature_order = self._table.ids(axis='observation')
        self._features = features

        if variances is None:
            empty = ss.csr_matrix(
                (len(self._table.ids(axis='observation')),
                 len(self._table.ids())),
                dtype=float)
            self._variances = biom.Table(empty,
                                         self._table.ids(axis='observation'),
                                         self._table.ids())
        else:
            self._variances = variances

        if set(self._variances.ids()) != set(self._table.ids()):
            raise DisjointError("Table and variances are disjoint")

        if set(self._variances.ids(axis='observation')) != \
                set(self._table.ids(axis='observation')):
            raise DisjointError("Table and variances are disjoint")

        if not self._feature_id_lookup.issubset(set(self._features.index)):
            raise SubsetError("Table features are not a subset of the "
                              "taxonomy information")

        self._ranked, self._ranked_order = self._rankdata(rank_level)

        self._features = self._features.loc[self._feature_order]
        self._variances = self._variances.sort_order(self._feature_order,
                                                     axis='observation')

        if formatter is None:
            formatter: Formatter = GreengenesFormatter()
        self._formatter = formatter

        # initialize taxonomy tree
        tree_data = ((i, lineage.split('; '))
                     for i, lineage in self._features['Taxon'].items())
        self.taxonomy_tree = skbio.TreeNode.from_taxonomy(tree_data)
        self._index_taxa_prevalence()
        for node in self.taxonomy_tree.traverse():
            node.length = 1
        self.bp_tree = parse_newick(str(self.taxonomy_tree))

        feature_taxons = self._features
        self._formatted_taxa_names = {
            i: self._formatter.dict_format(lineage)
            for i, lineage in feature_taxons['Taxon'].items()
        }
Example #10
import biom


def relative_frequency(table: biom.Table, axis: str = 'sample') -> biom.Table:
    """ Convert feature table in-place from frequencies to relative frequencies
    """
    table.norm(axis=axis, inplace=True)
    return table
Example #11
# dispatcher, LOG and _read_inputs are module-level helpers in the source
# plugin; imports (biom, numpy as np, pandas as pd, qiime2, sklearn, tempfile,
# shutil) are assumed
def synthetic_over_sampling(table: biom.Table,
                            metadata: NumericMetadataColumn,
                            concatenate_meta: Str,
                            method: Str = 'SMOTETomek',
                            k_neighbors: Int = 5,
                            m_neighbors: Int = 10,
                            n_jobs: Int = 1,
                            log_fp: Str = None,
                            sampling_strategy: Str = 'auto',
                            random_state: Int = 42,
                            output_log_fp: Str = None) -> biom.Table:
    if log_fp:
        logger_ins = LOG(
            log_fp=log_fp).get_logger('synthetic_sampling_combination')
        logger_ins.info("The parameters used for oversampling are")
        logger_ins.info('k_neighbors:', k_neighbors)
        logger_ins.info('m_neighbors:', m_neighbors)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)

    if method not in dispatcher:
        raise ValueError('The optional methods for over sampling are',
                         dispatcher.keys(), "instead it received", method)
    cls = dispatcher[method]
    if method != 'RandomOverSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for oversampling"
            )
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    # densified so len() and row slicing work in the checks below
    matrix_data = sorted_table.matrix_data.transpose().todense()
    if method == 'ADASYN':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state,
                                n_neighbors=k_neighbors,
                                n_jobs=n_jobs)
    elif method == 'RandomOverSampler':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state)
    else:
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                m_neighbors=m_neighbors,
                                random_state=random_state,
                                n_jobs=n_jobs,
                                k_neighbors=k_neighbors)
    X_resampled, y_resampled = over_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    if np.sum(np.abs(X_resampled[:len(matrix_data), :] - matrix_data)) != 0 or \
            np.sum(y_resampled[:len(matrix_data)] == sorted_metadata) != len(matrix_data):
        raise ValueError(
            "Over sampling method changed the data! Please double check your biom table"
        )
    else:
        if log_fp:
            logger_ins.info("The oversampling finished successfully!")
            logger_ins.info(
                "The first", len(matrix_data),
                "samples belong to the original training samples and the "
                "next",
                len(X_resampled) - len(matrix_data),
                "samples belong to the new ones")
            logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method != 'RandomOverSampler':
        dummy_samples = np.asarray(
            list(sorted_table.ids('sample')) + [
                "dummy_sample_" + str(i)
                for i in range(len(X_resampled) - len(matrix_data))
            ])
    else:
        # sample_indices_ holds integer indices into the original samples and
        # may contain duplicates; see Example #7 for mapping them back to
        # unique sample names
        dummy_samples = over_sampling_cls.sample_indices_

    oversampled_table = biom.Table(
        X_resampled.transpose(),  # biom expects observations x samples
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    oversampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    oversampled_metadata.index.names = ['#SampleID']
    oversampled_metadata.columns = ['label']
    oversampled_meta = qiime2.Metadata(oversampled_metadata)
    oversampled_meta.save(concatenate_meta)

    return oversampled_table
def synthetic_under_sampling(table: biom.Table,
                             metadata: NumericMetadataColumn,
                             concatenate_meta_fp: Str,
                             method: Str = 'RandomUnderSampler',
                             voting: Str = 'auto',
                             n_jobs: Int = 1,
                             sampling_strategy: Str = 'auto',
                             random_state: Int = 42,
                             output_log_fp: Str = None) -> biom.Table:
    log_fp = tempfile.mktemp()
    print("The log file will be written into", log_fp)
    if log_fp:
        logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_under_sampling')
        logger_ins.info("The parameters used for undersampling are")
        logger_ins.info('voting (will be used with ClusterCentroids only):',
                        voting)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)

    if method not in dispatcher:
        raise ValueError('The optional methods for under sampling are',
                         dispatcher.keys(), "instead it received", method)
    cls = dispatcher[method]
    if method != 'RandomUnderSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for "
                "undersampling")
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    matrix_data = sorted_table.matrix_data.transpose().todense()
    if method == 'RandomUnderSampler':
        under_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 replacement=False)
    else:
        raise NotImplementedError("Method", method, "is not implemented yet")
    X_resampled, y_resampled = under_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    if log_fp:
        logger_ins.info("The under-sampling finished successfully!")
        logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method == 'RandomUnderSampler':
        dummy_samples_ids = under_sampling_cls.sample_indices_
        dummy_samples = []
        orig_samples = sorted_table.ids('sample')
        for sample_id in dummy_samples_ids:
            dummy_samples.append(orig_samples[sample_id])
    else:
        raise NotImplementedError("Method", method, "is not implemented yet")
    under_sampling_dummy = sorted_table.filter(ids_to_keep=dummy_samples,
                                               inplace=False)
    under_sampling_dummy = under_sampling_dummy.sort_order(order=dummy_samples,
                                                           axis='sample')
    if method == "RandomUnderSampler" and np.sum(
            under_sampling_dummy.matrix_data.transpose() - X_resampled) != 0:
        raise ValueError("The undersampling changed the matrix data")

    undersampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    undersampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    undersampled_metadata.index.names = ['#SampleID']
    undersampled_metadata.columns = ['label']
    undersampled_meta = qiime2.Metadata(undersampled_metadata)
    undersampled_meta.save(concatenate_meta_fp)

    if log_fp:
        shutil.copy(log_fp, output_log_fp)

    return undersampled_table
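As with the oversamplers, the core resampling comes from imbalanced-learn; a minimal sketch of RandomUnderSampler outside the wrapper, with hypothetical data:

import numpy as np
from imblearn.under_sampling import RandomUnderSampler

X = np.random.rand(20, 5)   # samples x features
y = np.array([0] * 15 + [1] * 5)
rus = RandomUnderSampler(random_state=42, replacement=False)
X_res, y_res = rus.fit_resample(X, y)
print(X_res.shape)          # (10, 5): the majority class is downsampled to 5
print(rus.sample_indices_)  # indices of the retained original samples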