def test_get_stats():
    """Check get_stats() against hand-computed entropy and JSD values.

    Verifies both the raw-count path and the column-normalized path
    (after table.norm()).
    """

    # Shannon entropy of a probability vector; zero entries are dropped
    # so that 0*log(0) does not produce NaN.
    def entropy(p):
        p = p[p != 0]
        return -(p * log(p)).sum()

    data = array([[0, 0, 1], [1, 3, 42]], dtype=float)
    table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    h, jsd = get_stats(table)

    # Expected entropy of the observation marginal distribution.
    p = data.sum(axis=1)
    p /= p.sum()
    assert_almost_equal(h, entropy(p))

    # Expected JSD: marginal entropy minus the abundance-weighted average
    # of the per-sample entropies.
    avg_h = 0
    weights = [p.sum() / data.sum() for p in data.T]
    for w, p in zip(weights, data.T):
        # NOTE: `p` is a view into `data`, so this normalizes the columns
        # of `data` in place — the second half of the test relies on it.
        p /= p.sum()
        avg_h += w * entropy(p)
    assert_almost_equal(jsd, h - avg_h)

    # Repeat with a sample-normalized table; weights become uniform (1/3).
    table.norm()
    h, jsd = get_stats(table)
    # Columns already sum to 1 from the loop above, so this is a no-op
    # kept for clarity of intent.
    data /= data.sum(axis=0)
    p = data.sum(axis=1) / 3.
    assert_almost_equal(h, entropy(p))
    avg_h = sum(entropy(p) / 3. for p in data.T)
    assert_almost_equal(jsd, h - avg_h)
def generate_class_weights(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        samples: biom.Table, taxonomy_classification: DataFrame,
        unobserved_weight: float = 1e-6, normalise: bool = False,
        allow_weight_outside_reference: bool = False) \
        -> biom.Table:
    """Derive per-taxon class weights from observed sample abundances.

    Every taxon present in the reference starts at zero weight; observed
    abundances are accumulated onto matching taxa, the result is
    normalized, and a small uniform pseudo-weight is mixed in so that
    unobserved taxa keep a non-zero probability.

    Raises
    ------
    ValueError
        If an observation id is missing from ``taxonomy_classification``,
        or (unless ``allow_weight_outside_reference``) a classified taxon
        is absent from the reference taxonomy.
    """
    # Seed every reference taxon with zero weight.
    taxon_totals = {}
    for seq in reference_sequences:
        taxon_totals[reference_taxonomy[seq.metadata['id']]] = 0.

    if normalise:
        samples.norm()

    tax_map = taxonomy_classification['Taxon']
    try:
        taxa = [tax_map[obs_id] for obs_id in samples.ids(axis='observation')]
    except KeyError as missing:
        raise ValueError(str(missing) + ' not in taxonomy_classification')

    if not allow_weight_outside_reference:
        if not set(taxa).issubset(taxon_totals):
            raise ValueError(
                'taxonomy_classification does not match reference_taxonomy')

    # Accumulate total observed counts per taxon.
    for taxon, count in zip(taxa, samples.sum('observation')):
        if taxon in taxon_totals:
            taxon_totals[taxon] += count

    labels, totals = zip(*taxon_totals.items())
    totals = array(totals)
    totals /= totals.sum()
    # Blend with a uniform distribution so zero-count taxa stay reachable.
    smoothed = (1. - unobserved_weight) * totals \
        + unobserved_weight / len(totals)
    smoothed /= smoothed.sum()
    return biom.Table(smoothed[None].T, labels, sample_ids=['Weight'])
def format_barplots(table: biom.Table, normalize: bool):
    """Render a biom table as an iTOL DATASET_MULTIBAR annotation text.

    One bar field per table column; colors come from the 'husl' palette.
    When ``normalize`` is true, rows are scaled to relative abundances
    first (the input table itself is left untouched).
    """
    if normalize:
        table = table.norm(axis='observation', inplace=False)
    frame = table.to_dataframe(dense=True)

    field_labels = list(frame.columns)
    field_colors = values_to_colors(field_labels, 'husl').values()

    tab = '\t'.join
    lines = [
        'DATASET_MULTIBAR',
        'SEPARATOR TAB',
        'DATASET_LABEL\tRelative Abundance',
        'FIELD_COLORS\t' + tab(field_colors),
        'FIELD_LABELS\t' + tab(field_labels),
        'LEGEND_TITLE\tRelative Abundance',
        'LEGEND_SHAPES\t' + tab(['1'] * len(field_colors)),
        'LEGEND_COLORS\t' + tab(field_colors),
        'LEGEND_LABELS\t' + tab(field_labels),
        'WIDTH\t100',
        'DATA',
    ]

    # One data row per feature: index value followed by each field value.
    frame = frame.reset_index()
    for row in frame.index:
        lines.append(tab(frame.loc[row].apply(str)))
    return '\n'.join(lines)
def cscs(features: biom.Table, css_edges: str, cosine_threshold: float = 0.6,
         normalization: bool = True,
         weighted: bool = True) -> skbio.DistanceMatrix:
    """Compute a CSCS distance matrix from a feature table and a
    GNPS-style cosine edge list.

    Parameters
    ----------
    features : biom.Table
        Feature table (observations x samples).
    css_edges : str
        Path to a tab-separated edge file; column 5 holds the cosine
        score, rows containing "CLUSTERID1" are treated as headers.
    cosine_threshold : float
        Cosine scores below this are zeroed out.
    normalization : bool
        Normalize samples to relative abundance before computing.
    weighted : bool
        If False, convert the table to presence/absence first.

    Returns
    -------
    skbio.DistanceMatrix
        1 - similarity, labeled by sample id.
    """
    observationids = {x: index for index, x
                      in enumerate(features.ids(axis='observation'))}
    edgesdok = dok_matrix((features.shape[0], features.shape[0]),
                          dtype=np.float32)
    # `with` guarantees the edge file is closed (the original leaked the
    # handle by iterating a bare open()).
    with open(css_edges, "r") as edge_file:
        for line in edge_file:
            if "CLUSTERID1" in line:  # header row
                continue
            fields = line.split("\t")
            cosine = float(fields[4])
            i = observationids[fields[0]]
            j = observationids[fields[1]]
            if cosine < cosine_threshold:
                edgesdok[i, j] = 0.0
            else:
                # Store the edge symmetrically.
                edgesdok[i, j] = cosine
                edgesdok[j, i] = cosine
    edgesdok.setdiag(1)

    if normalization:
        features = features.norm(axis='sample', inplace=False)
    if not weighted:
        # BUG FIX: the original did `features = features.pa`, which binds
        # the method object without calling it; the transform must run.
        features = features.pa(inplace=False)  # TODO: make new option in cscs()

    sample_names = features.ids()
    cscs = parallel_make_distance_matrix(features, edgesdok, sample_names)
    cscs = 1 - cscs
    return skbio.DistanceMatrix(cscs, ids=cscs.index)
def __init__(self, table: biom.Table, features: pd.DataFrame,
             variances: biom.Table = None,
             formatter: Optional['Formatter'] = None):
    """Establish the taxonomy data

    Parameters
    ----------
    table : biom.Table
        Relative abundance data per sample or collapsed into higher
        order entries (e.g., abx in the past year)
    features : pd.DataFrame
        DataFrame relating an observation to a Taxon
    variances : biom.Table, optional
        Variation information about a taxon within a label.
    formatter : Formatter, optional
        Lineage formatter; defaults to GreengenesFormatter.

    Raises
    ------
    DisjointError
        If the variances or features do not cover exactly the same ids
        as the table.
    """
    # Work on a normalized copy; the caller's table is not modified here.
    self._table = table.norm(inplace=False)
    self._group_id_lookup = set(self._table.ids())
    self._feature_id_lookup = set(self._table.ids(axis='observation'))
    self._feature_order = self._table.ids(axis='observation')
    self._features = features
    # Ranks are computed on the ORIGINAL (unnormalized) table.
    self._ranks = table.rankdata(inplace=False)
    if variances is None:
        # No variance information supplied: assume zero variance everywhere.
        self._variances = biom.Table(np.zeros(self._table.shape),
                                     self._table.ids(axis='observation'),
                                     self._table.ids())
    else:
        self._variances = variances

    # Sample ids and observation ids must match exactly between the
    # variance table and the abundance table.
    if set(self._variances.ids()) != set(self._table.ids()):
        raise DisjointError("Table and variances are disjoint")

    if set(self._variances.ids(axis='observation')) != \
            set(self._table.ids(axis='observation')):
        raise DisjointError("Table and variances are disjoint")

    if set(self._table.ids(axis='observation')) != \
            set(self._features.index):
        raise DisjointError("Table and features are disjoint")

    # Align features and variances to the table's observation order.
    self._features = self._features.loc[self._feature_order]
    self._variances = self._variances.sort_order(self._feature_order,
                                                 axis='observation')

    if formatter is None:
        formatter: Formatter = GreengenesFormatter()
    self._formatter = formatter

    # Pre-format every taxon lineage once for fast lookup later.
    feature_taxons = self._features
    self._formatted_taxa_names = {i: self._formatter.dict_format(lineage)
                                  for i, lineage
                                  in feature_taxons['Taxon'].items()}
def collapse_full(_bt):
    """Collapse a full biom table into one sample of per-OTU medians.

    Parameters
    ----------
    _bt : biom table
        Table to collapse

    Returns
    -------
    biom table
        Single-sample table (named 'average') holding the median of each
        OTU, normalized so the sample sums to 1.
    """
    obs_ids = _bt.ids(axis='observation')
    # Median across samples for every observation, as a single column.
    medians = np.array([np.median(row)
                        for row in _bt.iter_data(axis='observation')])
    collapsed = Table(medians.reshape((len(obs_ids), 1)),
                      obs_ids,
                      ['average'],
                      observation_metadata=_bt.metadata(axis='observation'))
    collapsed.norm(inplace=True)
    return collapsed
def synthetic_over_sampling(table: biom.Table,
                            metadata: NumericMetadataColumn,
                            concatenate_meta_fp: Str,
                            method: Str = 'SMOTE',
                            k_neighbors: Int = 5,
                            n_jobs: Int = 1,
                            sampling_strategy: Str = 'auto',
                            random_state: Int = 42,
                            output_log_fp: Str = None) -> biom.Table:
    """Oversample a feature table with imbalanced-learn.

    Parameters
    ----------
    table : biom.Table
        Feature table; normalized in place unless method is
        'RandomOverSampler'.
    metadata : NumericMetadataColumn
        Per-sample class labels driving the resampling.
    concatenate_meta_fp : Str
        Path where the metadata of the oversampled data is written.
    method : Str
        A key of `dispatcher` (e.g. 'SMOTE', 'ADASYN', 'RandomOverSampler').
    k_neighbors : Int
        Neighborhood size for SMOTE/ADASYN.
    n_jobs : Int
        Parallelism passed to imbalanced-learn.
    sampling_strategy : Str
        imbalanced-learn sampling strategy.
    random_state : Int
        Seed for reproducibility.
    output_log_fp : Str
        If given, the temporary log file is copied here.

    Returns
    -------
    biom.Table
        Original plus synthetic samples.

    Raises
    ------
    ValueError
        If `method` is unknown, or the resampler altered the original
        samples or labels.
    """
    # BUG FIX: validate `method` BEFORE indexing `dispatcher`; previously
    # the check ran after `dispatcher[method]`, so an unknown method
    # raised KeyError instead of this ValueError.
    if method not in dispatcher:
        raise ValueError('The optional methods for over sampling are',
                         dispatcher.keys(), "instead it received", method)
    # NamedTemporaryFile replaces the insecure, deprecated tempfile.mktemp().
    with tempfile.NamedTemporaryFile(delete=False) as tmp_log:
        log_fp = tmp_log.name
    print("The log file will be written into", log_fp)
    if log_fp:
        logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_over_sampling')
        logger_ins.info("The parameters used for oversampling are")
        logger_ins.info('k_neighbors:', k_neighbors)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', output_log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)
    cls = dispatcher[method]
    if method != 'RandomOverSampler':
        # Synthetic methods interpolate between samples, so operate on
        # relative abundances rather than raw counts.
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for oversampling"
            )
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    # Dense samples-by-features matrix expected by imbalanced-learn.
    matrix_data = sorted_table.matrix_data.transpose().todense()
    if method == 'ADASYN':
        neigh = sklearn.neighbors.NearestNeighbors(
            metric='braycurtis', n_neighbors=k_neighbors + 1)
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state,
                                n_neighbors=neigh, n_jobs=n_jobs)
    elif method == 'RandomOverSampler':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state)
    else:
        neigh = sklearn.neighbors.NearestNeighbors(
            metric='braycurtis', n_neighbors=k_neighbors + 1)
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                k_neighbors=neigh,
                                random_state=random_state, n_jobs=n_jobs)
    X_resampled, y_resampled = over_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    # The resampler must leave the original samples (first rows) intact.
    if np.sum(np.abs(X_resampled[:len(matrix_data), :] - matrix_data)) != 0 or \
            np.sum(y_resampled[:len(matrix_data)] == sorted_metadata) != \
            len(matrix_data):
        raise ValueError(
            "Over sampling method changed the data! Please double check your biom table. The sum of differences "
            "between the generated and original samples is",
            np.sum(np.abs(X_resampled[:len(matrix_data), :] - matrix_data)),
            "(should be 0.0) and the number of "
            "retained labels is",
            np.sum(y_resampled[:len(matrix_data)] == sorted_metadata),
            "while should be", len(sorted_metadata))
    else:
        if log_fp:
            logger_ins.info("The oversampling finished successfully!")
            logger_ins.info(
                "The first", len(matrix_data),
                "samples belong to the original training samples and the "
                "next", len(X_resampled) - len(matrix_data),
                "samples belong to the new ones")
            logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method != 'RandomOverSampler':
        # Synthetic samples get placeholder ids appended after the originals.
        dummy_samples = np.asarray(
            list(sorted_table.ids('sample')) + [
                "dummy_sample_" + str(i)
                for i in range(len(X_resampled) - len(matrix_data))
            ])
    else:
        # RandomOverSampler duplicates existing samples; disambiguate
        # repeated ids with a numeric suffix so biom ids stay unique.
        orig_samples = sorted_table.ids('sample')
        dummy_samples = over_sampling_cls.sample_indices_
        samples_counter = Counter(dummy_samples)
        dummy_samples_ = []
        tracking_dict = dict()
        for sample in dummy_samples:
            j = tracking_dict.get(sample, 0)
            if samples_counter[sample] > 1:
                tracking_dict[sample] = j + 1
                sample = str(orig_samples[sample]) + "_" + str(j + 1)
            else:
                sample = str(orig_samples[sample])
            dummy_samples_.append(sample)
        dummy_samples = dummy_samples_
    oversampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    oversampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    oversampled_metadata.index.names = ['#SampleID']
    oversampled_metadata.columns = ['label']
    oversampled_meta = qiime2.Metadata(oversampled_metadata)
    oversampled_meta.save(concatenate_meta_fp)
    # BUG FIX: copy the log only when a destination was supplied; log_fp
    # is always truthy, so the old `if log_fp:` guard crashed with the
    # default output_log_fp=None.
    if output_log_fp:
        shutil.copy(log_fp, output_log_fp)
    return oversampled_table
def relative_frequency(table: biom.Table) -> biom.Table: """ Convert feature table in-place from frequencies to relative frequencies """ table.norm(axis='sample', inplace=True) return table
def __init__(self, table: biom.Table, features: pd.DataFrame,
             variances: biom.Table = None,
             formatter: Optional['Formatter'] = None,
             rank_level: int = 1):
    """Establish the taxonomy data

    Parameters
    ----------
    table : biom.Table
        Relative abundance data per sample or collapsed into higher
        order entries (e.g., abx in the past year)
    features : pd.DataFrame
        DataFrame relating an observation to a Taxon
    variances : biom.Table, optional
        Variation information about a taxon within a label.
    rank_level : int
        The taxonomic level (depth) to compute ranks over. Level 0 is
        domain, level 1 is phylum, etc.

    Raises
    ------
    DisjointError
        If the variance ids do not match the table ids.
    SubsetError
        If the table's features are not covered by the taxonomy.
    """
    # Work on a normalized copy; the caller's table is not modified here.
    self._table = table.norm(inplace=False)
    self._group_id_lookup = set(self._table.ids())
    self._feature_id_lookup = set(self._table.ids(axis='observation'))
    self._feature_order = self._table.ids(axis='observation')
    self._features = features
    if variances is None:
        # No variance information supplied: use an all-zero sparse table
        # of the same shape rather than allocating a dense matrix.
        empty = ss.csr_matrix((len(self._table.ids(axis='observation')),
                               len(self._table.ids())), dtype=float)
        self._variances = biom.Table(empty,
                                     self._table.ids(axis='observation'),
                                     self._table.ids())
    else:
        self._variances = variances

    # Sample ids and observation ids must match exactly between the
    # variance table and the abundance table.
    if set(self._variances.ids()) != set(self._table.ids()):
        raise DisjointError("Table and variances are disjoint")

    if set(self._variances.ids(axis='observation')) != \
            set(self._table.ids(axis='observation')):
        raise DisjointError("Table and variances are disjoint")

    # Unlike the stricter exact-match check used elsewhere, taxonomy may
    # describe more features than the table contains.
    if not self._feature_id_lookup.issubset(set(self._features.index)):
        raise SubsetError("Table features are not a subset of the "
                          "taxonomy information")

    self._ranked, self._ranked_order = self._rankdata(rank_level)

    # Align features and variances to the table's observation order.
    self._features = self._features.loc[self._feature_order]
    self._variances = self._variances.sort_order(self._feature_order,
                                                 axis='observation')

    if formatter is None:
        formatter: Formatter = GreengenesFormatter()
    self._formatter = formatter

    # initialize taxonomy tree
    tree_data = ((i, lineage.split('; '))
                 for i, lineage in self._features['Taxon'].items())
    self.taxonomy_tree = skbio.TreeNode.from_taxonomy(tree_data)
    self._index_taxa_prevalence()
    # Unit branch lengths: the tree encodes topology only.
    for node in self.taxonomy_tree.traverse():
        node.length = 1
    # Balanced-parentheses representation built from the newick string.
    self.bp_tree = parse_newick(str(self.taxonomy_tree))

    # Pre-format every taxon lineage once for fast lookup later.
    feature_taxons = self._features
    self._formatted_taxa_names = {i: self._formatter.dict_format(lineage)
                                  for i, lineage
                                  in feature_taxons['Taxon'].items()}
def relative_frequency(table: biom.Table, axis: str='sample') -> biom.Table: """ Convert feature table in-place from frequencies to relative frequencies """ table.norm(axis=axis, inplace=True) return table
def synthetic_over_sampling(table: biom.Table,
                            metadata: NumericMetadataColumn,
                            concatenate_meta: Str,
                            method: Str = 'SMOTETomek',
                            k_neighbors: Int = 5,
                            m_neighbors: Int = 10,
                            n_jobs: Int = 1,
                            log_fp: Str = None,
                            sampling_strategy: Str = 'auto',
                            random_state: Int = 42,
                            output_log_fp: Str = None) -> biom.Table:
    """Oversample a feature table with a combined imbalanced-learn method.

    Parameters
    ----------
    table : biom.Table
        Feature table; normalized in place unless method is
        'RandomOverSampler'.
    metadata : NumericMetadataColumn
        Per-sample class labels driving the resampling.
    concatenate_meta : Str
        Path where the metadata of the oversampled data is written.
    method : Str
        A key of `dispatcher` (e.g. 'SMOTETomek', 'ADASYN',
        'RandomOverSampler').
    k_neighbors, m_neighbors : Int
        Neighborhood sizes forwarded to the resampler.
    n_jobs : Int
        Parallelism passed to imbalanced-learn.
    log_fp : Str
        Optional path for logging.
    sampling_strategy : Str
        imbalanced-learn sampling strategy.
    random_state : Int
        Seed for reproducibility.
    output_log_fp : Str
        Unused here; kept for interface compatibility.

    Returns
    -------
    biom.Table
        Original plus synthetic samples.

    Raises
    ------
    ValueError
        If `method` is unknown, or the resampler altered the original
        samples or labels.
    """
    # BUG FIX: validate `method` BEFORE indexing `dispatcher`; previously
    # an unknown method raised KeyError instead of this ValueError.
    if method not in dispatcher:
        raise ValueError('The optional methods for over sampling are',
                         dispatcher.keys(), "instead it received", method)
    if log_fp:
        logger_ins = LOG(
            log_fp=log_fp).get_logger('synthetic_sampling_combination')
        logger_ins.info("The parameters used for oversampling are")
        logger_ins.info('k_neighbors:', k_neighbors)
        logger_ins.info('m_neighbors:', m_neighbors)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)
    cls = dispatcher[method]
    if method != 'RandomOverSampler':
        # Synthetic methods interpolate between samples, so operate on
        # relative abundances rather than raw counts.
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for oversampling"
            )
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    # BUG FIX: densify — scipy sparse matrices don't support len() and the
    # elementwise validation below; matches the sibling implementation.
    matrix_data = sorted_table.matrix_data.transpose().todense()
    if method == 'ADASYN':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state,
                                n_neighbors=k_neighbors, n_jobs=n_jobs)
    elif method == 'RandomOverSampler':
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                random_state=random_state)
    else:
        over_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                m_neighbors=m_neighbors,
                                random_state=random_state, n_jobs=n_jobs,
                                k_neighbors=k_neighbors)
    X_resampled, y_resampled = over_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    # BUG FIX: compare against sorted_metadata (aligned with matrix rows),
    # not the raw `metadata` argument.
    if np.sum(np.abs(X_resampled[:len(matrix_data), :] - matrix_data)) != 0 or \
            np.sum(y_resampled[:len(matrix_data)] == sorted_metadata) != \
            len(matrix_data):
        raise ValueError(
            "Over sampling method changed the data! "
            "Please double check your biom table")
    else:
        if log_fp:
            logger_ins.info("The oversampling finished successfully!")
            logger_ins.info(
                "The first", len(matrix_data),
                "samples belong to the original training samples and the "
                "next", len(X_resampled) - len(matrix_data),
                "samples belong to the new ones")
            logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method != 'RandomOverSampler':
        # Synthetic samples get placeholder ids appended after the originals.
        dummy_samples = np.asarray(
            list(sorted_table.ids('sample')) + [
                "dummy_sample_" + str(i)
                for i in range(len(X_resampled) - len(matrix_data))
            ])
    else:
        # NOTE(review): these are integer row indices, not sample ids —
        # the sibling implementation remaps them to original ids; confirm
        # intended behavior before changing.
        dummy_samples = over_sampling_cls.sample_indices_
    # BUG FIX: biom.Table expects observations x samples; X_resampled is
    # samples x features and must be transposed (as the sibling does).
    oversampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    oversampled_metadata = pd.DataFrame(index=dummy_samples, data=y_resampled)
    oversampled_metadata.index.names = ['#SampleID']
    oversampled_metadata.columns = ['label']
    oversampled_meta = qiime2.Metadata(oversampled_metadata)
    # BUG FIX: Metadata.save() requires a filepath; use the previously
    # unused `concatenate_meta` argument.
    oversampled_meta.save(concatenate_meta)
    return oversampled_table
def synthetic_under_sampling(table: biom.Table,
                             metadata: NumericMetadataColumn,
                             concatenate_meta_fp: Str,
                             method: Str = 'RandomUnderSampler',
                             voting: Str = 'auto',
                             n_jobs: Int = 1,
                             sampling_strategy: Str = 'auto',
                             random_state: Int = 42,
                             output_log_fp: Str = None) -> biom.Table:
    """Undersample a feature table with imbalanced-learn.

    Parameters
    ----------
    table : biom.Table
        Feature table; normalized in place unless method is
        'RandomUnderSampler'.
    metadata : NumericMetadataColumn
        Per-sample class labels driving the resampling.
    concatenate_meta_fp : Str
        Path where the metadata of the undersampled data is written.
    method : Str
        Only 'RandomUnderSampler' is currently implemented.
    voting : Str
        Reserved for ClusterCentroids; currently only logged.
    n_jobs : Int
        Parallelism (currently only logged).
    sampling_strategy : Str
        imbalanced-learn sampling strategy.
    random_state : Int
        Seed for reproducibility.
    output_log_fp : Str
        If given, the temporary log file is copied here.

    Returns
    -------
    biom.Table
        The retained subset of samples.

    Raises
    ------
    ValueError
        If `method` is unknown, or the undersampling altered the data.
    NotImplementedError
        For any method other than 'RandomUnderSampler'.
    """
    # BUG FIX: validate `method` BEFORE indexing `dispatcher`; previously
    # an unknown method raised KeyError instead of this ValueError.
    if method not in dispatcher:
        raise ValueError('The optional methods for over sampling are',
                         dispatcher.keys(), "instead it received", method)
    # NamedTemporaryFile replaces the insecure, deprecated tempfile.mktemp().
    with tempfile.NamedTemporaryFile(delete=False) as tmp_log:
        log_fp = tmp_log.name
    print("The log file will be written into", log_fp)
    if log_fp:
        logger_ins = LOG(log_fp=log_fp).get_logger('synthetic_over_sampling')
        logger_ins.info("The parameters used for oversampling are")
        logger_ins.info('voting (will be used with ClusterCentroids only):',
                        voting)
        logger_ins.info('Sampling method:', method)
        logger_ins.info('Output log file path:', log_fp)
        logger_ins.info('sampling_strategy:', sampling_strategy)
        logger_ins.info('n_jobs:', n_jobs)
        logger_ins.info('random_state:', random_state)
    cls = dispatcher[method]
    if method != 'RandomUnderSampler':
        table.norm(inplace=True)
        if log_fp:
            logger_ins.info(
                "The input table is normalized before using it for oversampling"
            )
    sorted_table, sorted_metadata = _read_inputs(table, meta_data=metadata)
    # Dense samples-by-features matrix expected by imbalanced-learn.
    matrix_data = sorted_table.matrix_data.transpose().todense()
    if method == 'RandomUnderSampler':
        under_sampling_cls = cls(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 replacement=False)
    else:
        raise NotImplementedError("Method", method, "is not implemented yet")
    X_resampled, y_resampled = under_sampling_cls.fit_resample(
        matrix_data, sorted_metadata)
    if log_fp:
        logger_ins.info("The under-sampling finished successfully!")
        logger_ins.info("Overall, the size of data is", len(X_resampled))
    if method == 'RandomUnderSampler':
        # Map retained row indices back to their original sample ids.
        dummy_samples_ids = under_sampling_cls.sample_indices_
        dummy_samples = []
        orig_samples = sorted_table.ids('sample')
        for sample_id in dummy_samples_ids:
            dummy_samples.append(orig_samples[sample_id])
    else:
        raise NotImplementedError("Method", method, "is not implemented yet")
    # Sanity check: the resampled matrix must equal the corresponding
    # subset of the input table.
    under_sampling_dummy = sorted_table.filter(ids_to_keep=dummy_samples,
                                               inplace=False)
    under_sampling_dummy = under_sampling_dummy.sort_order(
        order=dummy_samples, axis='sample')
    if method == "RandomUnderSampler" and np.sum(
            under_sampling_dummy.matrix_data.transpose() - X_resampled) != 0:
        raise ValueError("The undersampling changed the matrix data")
    undersampled_table = biom.Table(
        X_resampled.transpose(),
        observation_ids=sorted_table.ids('observation'),
        sample_ids=dummy_samples)
    undersampled_metadata = pd.DataFrame(index=dummy_samples,
                                         data=y_resampled)
    undersampled_metadata.index.names = ['#SampleID']
    undersampled_metadata.columns = ['label']
    undersampled_meta = qiime2.Metadata(undersampled_metadata)
    undersampled_meta.save(concatenate_meta_fp)
    # BUG FIX: copy the log only when a destination was supplied; log_fp
    # is always truthy, so the old `if log_fp:` guard crashed with the
    # default output_log_fp=None.
    if output_log_fp:
        shutil.copy(log_fp, output_log_fp)
    return undersampled_table