def test_biom_match(self):
    table = Table(
        np.array([[0, 0, 1, 1],
                  [2, 3, 4, 4],
                  [5, 5, 3, 3]]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3', 's4'])
    md = pd.DataFrame(
        {'x1': [1, 3, 2],
         'x2': [1, 1, 0]},
        columns=['s1', 's2', 's3']).T
    exp_table = Table(
        np.array([[0, 0, 1, 1],
                  [2, 3, 4, 4]]).T,
        ['a', 'b', 'c', 'd'],
        ['s2', 's3'])
    exp_md = pd.DataFrame(
        {'x1': [3, 2],
         'x2': [1, 0]},
        columns=['s2', 's3']).T
    res_table, res_md = match(table, md)
    exp_df = pd.DataFrame(exp_table.to_dataframe())
    res_df = pd.DataFrame(res_table.to_dataframe())
    # reindex_axis was removed in pandas 1.0; reindex is the replacement
    exp_df = exp_df.reindex(sorted(exp_df.columns), axis=1)
    res_df = res_df.reindex(sorted(res_df.columns), axis=1)
    pdt.assert_frame_equal(exp_df, res_df)

    exp_md = exp_md.reindex(sorted(exp_md.index), axis=0)
    res_md = res_md.reindex(sorted(res_md.index), axis=0)
    pdt.assert_frame_equal(res_md, exp_md)
def rpca(table: biom.Table,
         n_components: int = DEFAULT_RANK,
         min_sample_count: int = DEFAULT_MSC,
         min_feature_count: int = DEFAULT_MFC,
         max_iterations: int = DEFAULT_ITERATIONS
         ) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

    This code will be run by both the standalone
    and QIIME 2 versions of DEICODE.
    """
    # filter samples to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter and import table
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]

    # center the completed matrix around zero
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the centered data and keep only n_components
    u, s, v = svd(X)
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]

    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
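# Hedged usage sketch (not from the source): running rpca() on a BIOM table
# loaded from disk. The file name and parameter values are hypothetical
# placeholders.
def _example_rpca_usage():
    import biom
    table = biom.load_table('example.biom')  # hypothetical input file
    ordination, distance = rpca(table, n_components=3,
                                min_sample_count=500)
    # ordination is an skbio biplot; distance holds the sample-sample
    # robust Aitchison distances
    print(ordination.proportion_explained)
    print(distance.shape)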
def get_matched_tables(collated_fingerprints: pd.DataFrame,
                       smiles: pd.DataFrame,
                       feature_table: biom.Table):
    '''
    This function filters the feature table to retain only features with
    fingerprints. It also relabels each feature with the MD5 hash of its
    binary fingerprint vector.

    Parameters
    ----------
    collated_fingerprints : pd.DataFrame
        table containing mass-spec molecular substructures (columns) for
        each mass-spec feature (index)
    smiles : pd.DataFrame
        table containing SMILES for each mass-spec feature (index)
    feature_table : biom.Table
        feature table with mass-spec feature intensity per sample

    Raises
    ------
    ValueError
        If features in the collated fingerprint table are not a subset of
        features in ``feature_table``

    Returns
    -------
    pd.DataFrame
        fingerprint table with each feature relabeled with the MD5 hash of
        its binary fingerprint vector
    biom.Table
        feature table filtered to contain only the features with predicted
        fingerprints. Features are labeled by the MD5 hash of their binary
        fingerprint vectors
    pd.DataFrame
        table that maps the MD5 hash of a feature to the original feature
        ID in the input feature table
    '''
    fps = collated_fingerprints.copy()
    allfps = list(fps.index)
    if fps.empty:
        raise ValueError("Cannot have empty fingerprint table")
    table = feature_table.to_dataframe(dense=True)
    allfeatrs = set(table.index)
    if not set(allfps).issubset(allfeatrs):
        extra_tips = set(allfps) - set(allfps).intersection(allfeatrs)
        raise ValueError('The following tips were not '
                         'found in the feature table:\n' +
                         ', '.join([str(i) for i in extra_tips]))
    filtered_table = table.reindex(allfps)
    # relabel each feature with the MD5 hash of its fingerprint vector
    list_md5 = []
    for fid in allfps:
        md5 = str(hashlib.md5(fps.loc[fid].values.tobytes()).hexdigest())
        list_md5.append(md5)
    fps['label'] = list_md5
    filtered_table['label'] = list_md5
    feature_data = pd.DataFrame(columns=['label', '#featureID',
                                         'csi_smiles', 'ms2_smiles',
                                         'ms2_compound', 'ms2_adduct'])
    feature_data['label'] = list_md5
    feature_data['#featureID'] = allfps
    feature_data['csi_smiles'] = list(smiles.loc[allfps, 'csi_smiles'])
    feature_data['ms2_smiles'] = list(smiles.loc[allfps, 'ms2_smiles'])
    feature_data['ms2_compound'] = list(smiles.loc[allfps, 'ms2_compound'])
    feature_data['ms2_adduct'] = list(smiles.loc[allfps, 'ms2_adduct'])
    feature_data.set_index('label', inplace=True)
    # collapse features that share a fingerprint (i.e. share an MD5 hash)
    relabel_fps = fps.groupby('label').first()
    matched_table = filtered_table.groupby('label').sum()
    # biom requires that ids be strings
    npfeatures = matched_table.values
    matched_table = biom.table.Table(
        data=npfeatures,
        observation_ids=matched_table.index.astype(str),
        sample_ids=matched_table.columns.astype(str))

    return relabel_fps, matched_table, feature_data
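# Hedged illustration (not from the source) of the relabeling scheme used
# above: hashing the raw bytes of a binary fingerprint vector yields a
# stable, content-based label, so features with identical fingerprints
# collapse to the same row under groupby('label').
def _example_fingerprint_md5():
    import hashlib
    import numpy as np
    fingerprint = np.array([0, 1, 1, 0, 1])  # hypothetical binary vector
    return hashlib.md5(fingerprint.tobytes()).hexdigest()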
def process(infile1: biom.Table, sample_types: MetadataColumn, metric: Str,
            conditioning: Str, infile2: biom.Table = None,
            name: Str = "-name-", ab_comp: Bool = False, min_count: Int = 3,
            total_select: Str = "all", iteration_select: Set[Int] = None,
            pca_components: Int = 4, smooth_type: Str = "sliding_window",
            window_size: Int = 3, centrality: Str = None,
            keep_threshold: Float = 0.5, correlation: Str = None,
            weighted: Bool = False, correlation_prop: Str = "both",
            evaluation: Str = "kl_divergence", min_connected: Int = 0,
            detailed: Bool = False) -> list:
    """
    This function corresponds with the QIIME 2 action and takes care of
    file passing between all parts of the plugin.

    :param infile1: This is the biom file (qza) from which OTU info will be
        extracted and analyzed to generate an interaction table of taxa.
    :param sample_types: This is metadata representing the samples taken
        and whether they are invaded/natural.
    :param metric: This is the metric to use.
    :param conditioning: Conditioning type to use on the data.
    :param infile2: This is only used in the case of an A/B analysis and
        will not be used if ab_comp is False.
    :param name: This is attached to all detailed output as a means of
        identification.
    :param ab_comp: Boolean representing whether to perform an A/B
        comparison on the data.
    :param min_count: Features with counts below this number will be
        removed.
    :param total_select: Number of features to select in total,
        i.e. 1, 2, 3, ... or 'all'.
    :param iteration_select: Number of features to select each time the
        metric is called, i.e. 1, 2, 3, ...
    :param pca_components: Number of PCA components to find.
    :param smooth_type: Type of smoothing to be used to remove noise.
    :param window_size: If the smoothing type is a sliding window, this is
        the size of the window.
    :param centrality: If graph_centrality is the metric type, this is the
        type of centrality to use.
    :param keep_threshold: If graph_centrality is the metric type, this is
        the threshold used to remove weak edges.
    :param correlation: If graph_centrality is the metric, this specifies
        whether positive, negative, or both types of correlation should be
        used.
    :param weighted: If graph_centrality is the metric type, this specifies
        whether weighted edges should be used to create the graph.
    :param correlation_prop: Specifies whether the positive, negative, or
        both correlation proportions are used when building the graph.
    :param evaluation: This is the evaluation type to use.
    :param min_connected: The minimum percentage of connectedness of the
        graph that should be considered before the winnowing process is
        aborted.
    :param detailed: Notifies the plugin to output diagrams and csv files
        to each step's respective output folder throughout computation.
        If not enabled, these files will not be generated.
    :return: a list with a single artifact; see artifact generation for
        details on why this is done
    """
    print("\n############################# START "
          "#############################")

    if iteration_select is None:  # since a default parameter can't be mutable
        iteration_select = {1, 4, 16, 64, 128}

    # make sure proper file structure is present
    _verify_output_folders()
    _verbose(step=0)

    # This will be used as part of the PERMANOVA calculation
    if not isinstance(sample_types, pd.DataFrame):
        # allows for easier testing and input directly to python
        sample_types = sample_types.to_dataframe()

    # Make sure input is valid
    num_samples = len(infile1.ids(axis='observation'))
    # this accounts for abundances being the same size in later steps
    try:
        if "type" in sample_types.columns:
            num_sample_types = len(sample_types.loc[:, "type"])
        else:
            num_sample_types = len(sample_types.loc[:, "Type"])
    except KeyError:
        raise Exception(
            "Error: sample metadata must include a column titled Type.")
    if num_samples != num_sample_types:
        raise Exception(
            "Error: each provided sample must have a corresponding type "
            "( natural/invaded ).\n"
            f"Was given {num_samples} samples and "
            f"{num_sample_types} types.")

    # Verify parameters are all given
    _verify_input_is_provided(metric, conditioning, ab_comp, infile2,
                              centrality, correlation)

    # if ab_comp is used we assume each sample type corresponds with the
    # 1 - n sample of each dataframe
    if ab_comp:
        sample_types = pd.concat([sample_types, sample_types],
                                 ignore_index=True)

    metric_output = pd.DataFrame()  # dataframe to collect new metric results
    auc_output = pd.DataFrame()  # keep most accurate AUC
    permanova_output = pd.DataFrame()  # keep most accurate PERMANOVA value

    _verbose(step=0.5)
    for iteration_selected in sorted(iteration_select):
        # Convert input to dense dataframes
        # (DataFrame.to_dense() was removed in pandas 1.0)
        dataframe_1 = infile1.to_dataframe(dense=True)
        dataframe_1.name = f"{name}_1_{iteration_selected}_"

        dataframe_2 = None
        if ab_comp:
            dataframe_2 = infile2.to_dataframe(dense=True)
            dataframe_2.name = f"{name}_2_{iteration_selected}_"
            if len(dataframe_1) != len(dataframe_2):
                raise Exception(
                    "Error: Dataframes must be the same size in order to "
                    "correlate with sample metadata.\n"
                    f"dataframe1: {len(dataframe_1)} != "
                    f"dataframe2: {len(dataframe_2)}")

        # will allow for easier iteration selection
        name_new = f"{name}_{iteration_selected}_"

        # <><><> Pass data to steps 1 to 3 <><><>
        _verbose(step=1)
        metric_result, important_features, abundances = \
            _winnow_pipeline(dataframe_1=dataframe_1,
                             dataframe_2=dataframe_2,
                             ab_comp=ab_comp, metric_name=metric,
                             c_type=conditioning, min_count=min_count,
                             total_select=total_select,
                             iteration_select=iteration_selected,
                             pca_components=pca_components,
                             smooth_type=smooth_type,
                             window_size=window_size,
                             centrality_type=centrality,
                             keep_threshold=keep_threshold,
                             correlation=correlation, weighted=weighted,
                             corr_prop=correlation_prop,
                             evaluation_type=evaluation,
                             min_connected=min_connected,
                             detailed=detailed)
        # these are used in: Step7_9, Step4_5, Step6

        if metric_output.empty:
            # create a dataframe of important OTUs for the jaccard step
            metric_output = metric_result
        else:
            if len(metric_output.columns) < len(metric_result.columns):
                # the dataframe must be extended to hold the new data
                new_columns = [col for col in metric_result.columns
                               if col not in metric_output.columns]
                for col in new_columns:
                    metric_output[col] = ""  # default as empty
            # assign back since concat does not operate in place
            metric_output = pd.concat([metric_output, metric_result],
                                      sort=False, ignore_index=True)

        # check if a metric result was generated before attempting other
        # steps; there must be at least 2 OTUs
        if 1 in metric_result.columns and 2 in metric_result.columns:
            # <><><> Pass data to steps 4 to 5 <><><>
            _verbose(step=4)
            auc_results, auc_parameters = \
                _winnow_ordering(dataframe=important_features,
                                 name=name_new, detailed=detailed)
            # these are used in: Step6, None
            auc_output = auc_results
            # Note: sample types correspond with the abundances being passed

            # <><><> Pass data to step 6 <><><>
            _verbose(step=6)
            permanova_results = \
                _winnow_permanova(auc_ordering_df=auc_results,
                                  abundances_df=abundances,
                                  samples_df=sample_types,
                                  centrality_type=centrality,
                                  name=name_new, detailed=detailed)
            permanova_output = permanova_results
            _verbose(step=6.5)
        else:
            _verbose(step=1.5)

    # <><><> Pass data to steps 7 to 9 <><><>
    _verbose(step=7)
    jaccard_results = _winnow_sensativity(
        metric_output,
        name=f"{metric}_{correlation}_{str(keep_threshold)}_"
             f"{centrality}_{name}",
        detailed=detailed)

    # Notify user of output path
    _verbose(step=10)
    print(f"Please see:\n\t{os.path.dirname(os.path.realpath(__file__))}\n"
          "folder for detailed output.")
    print("############################# DONE "
          "#############################")

    # assemble output and return as artifact
    metric_output.replace(r'^\s*$', np.nan, regex=True,
                          inplace=True)  # replace blank with NaN
    artifact_directory = _assemble_artifact_output(
        metric_output, auc_output, permanova_output, jaccard_results)

    return artifact_directory
def multinomial(table: biom.Table, metadata: Metadata, formula: str,
                training_column: str = DEFAULTS["training-column"],
                num_random_test_examples: int = (
                    DEFAULTS["num-random-test-examples"]),
                epochs: int = DEFAULTS["epochs"],
                batch_size: int = DEFAULTS["batch-size"],
                differential_prior: float = DEFAULTS["differential-prior"],
                learning_rate: float = DEFAULTS["learning-rate"],
                clipnorm: float = DEFAULTS["clipnorm"],
                min_sample_count: int = DEFAULTS["min-sample-count"],
                min_feature_count: int = DEFAULTS["min-feature-count"],
                summary_interval: int = DEFAULTS["summary-interval"],
                random_seed: int = DEFAULTS["random-seed"],
                ) -> (pd.DataFrame, qiime2.Metadata,
                      skbio.OrdinationResults):
    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them
    table, metadata, design = match_and_filter(
        table, metadata, formula, min_sample_count, min_feature_count)

    # convert to dense representation
    # (DataFrame.to_dense() was removed in pandas 1.0)
    dense_table = table.to_dataframe(dense=True).T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples,
        seed=random_seed)

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        tf.set_random_seed(random_seed)
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epochs=epochs,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # pad an all-zero reference column, then center each feature's
    # coefficients around zero
    beta_ = np.hstack((np.zeros((model.p, 1)), model.B))
    beta_ = beta_ - beta_.mean(axis=1).reshape(-1, 1)

    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids)
    differentials.index.name = 'featureid'

    convergence_stats = pd.DataFrame(
        {'loss': loss, 'cross-validation': cv, 'iteration': its})
    convergence_stats.index.name = 'id'
    # np.str / np.float / np.int were removed from numpy;
    # the builtin types are equivalent here
    convergence_stats.index = convergence_stats.index.astype(str)

    c = convergence_stats['loss'].astype(float)
    convergence_stats['loss'] = c

    c = convergence_stats['cross-validation'].astype(float)
    convergence_stats['cross-validation'] = c

    c = convergence_stats['iteration'].astype(int)
    convergence_stats['iteration'] = c

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids, index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids,
                                index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this handles the edge case with only intercepts
        biplot = OrdinationResults('', '', pd.Series(), pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
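# Hedged usage sketch (not from the source): fitting the multinomial
# regression from Python. The file names and the 'group' metadata column in
# the formula are hypothetical placeholders.
def _example_multinomial_usage():
    import biom
    import qiime2
    table = biom.load_table('example.biom')
    metadata = qiime2.Metadata.load('metadata.tsv')
    differentials, stats, biplot = multinomial(
        table, metadata, formula='C(group)')
    # differentials: features x covariates table of centered coefficients
    return differentials.head()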
def multinomial(table: biom.Table, metadata: Metadata, formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame,
                                                qiime2.Metadata):
    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(
        table, metadata, formula,
        training_column, num_random_test_examples,
        min_sample_count, min_feature_count)

    # convert to dense representation
    dense_table = table.to_dataframe(dense=True).T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epoch=epoch,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # pad an all-zero reference column and center the coefficients;
    # clr(clr_inv(x)) is equivalent to subtracting the row mean of x
    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    beta_ = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids)

    convergence_stats = pd.DataFrame(
        {'loglikehood': loss, 'cross-validation': cv, 'iteration': its})
    convergence_stats.index.name = 'id'
    # np.str / np.float / np.int were removed from numpy;
    # the builtin types are equivalent here
    convergence_stats.index = convergence_stats.index.astype(str)

    c = convergence_stats['loglikehood'].astype(float)
    convergence_stats['loglikehood'] = c

    c = convergence_stats['cross-validation'].astype(float)
    convergence_stats['cross-validation'] = c

    c = convergence_stats['iteration'].astype(int)
    convergence_stats['iteration'] = c

    return beta_, qiime2.Metadata(convergence_stats)
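# Hedged numerical check (not from the source) of the identity noted above:
# for any vector x, clr(clr_inv(x)) == x - mean(x), because clr_inv is a
# softmax and clr re-centers the log of that softmax. This is why the newer
# version of this function replaces clr(clr_inv(...)) with a plain row-mean
# subtraction.
def _example_clr_identity():
    import numpy as np
    from skbio.stats.composition import clr, clr_inv
    x = np.array([0.5, -1.0, 2.0, 0.0])
    np.testing.assert_allclose(clr(clr_inv(x)), x - x.mean())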
def percentile_normalize(table: biom.Table,
                         metadata: qiime2.MetadataColumn,
                         batch: qiime2.MetadataColumn = None,
                         n_control_thresh: int = 10,
                         otu_thresh: float = 0.3) -> biom.Table:
    """
    Converts an input table with cases and controls into percentiles
    of control samples.

    Parameters
    ----------
    table : biom.Table
        Feature table with relative abundances. Samples are in columns,
        features (i.e. OTUs) are in rows.
    metadata : qiime2.CategoricalMetadataColumn
        metadata column with samples labeled as "case" or "control".
        All samples with either label are returned, normalized to the
        equivalent percentile in "control" samples.
    batch : qiime2.CategoricalMetadataColumn
        metadata column with the different batches labeled. Percentile
        normalization will be performed within each batch, and the output
        tables will be concatenated together. You can use this to normalize
        multiple studies at once by first merging the original feature
        tables, adding a study ID column to the merged metadata, and then
        calling percentile normalization with this option.
    n_control_thresh : int [default=10]
        Minimum number of controls accepted to perform percentile
        normalization. Because the transformation converts abundances in
        controls to a uniform distribution, we *highly* discourage
        performing percentile normalization on datasets with fewer than 30
        controls, and certainly not fewer than 10 (the default value).
        If you have fewer controls than `n_control_thresh`, the
        normalization will return an error.
    otu_thresh : float [default=0.3]
        The OTU filtering threshold: OTUs must be present in at least
        otu_thresh fraction of cases OR controls, otherwise they are thrown
        out and not percentile normalized. This method does not perform
        well with very sparse OTUs, so we do not recommend lowering this
        threshold below 0.3. otu_thresh should be in [0, 1].

    Returns
    -------
    norm_biom : biom.Table
        A biom table with the normalized data, only including the samples
        that were labeled as either "case" or "control", and the OTUs which
        passed the otu_thresh threshold.
    """
    # Filter metadata to only include IDs present in the table.
    # Also ensures every distance table ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))
    metadata = metadata.drop_missing_values()

    # filter the table to exclude samples that were dropped from
    # the metadata due to missing values
    table = table.filter(metadata.ids)

    metadata = metadata.to_series()

    # Convert biom Table into a dense pandas dataframe.
    # Transpose so samples are in rows and OTUs/features in columns
    # (DataFrame.to_dense() was removed in pandas 1.0)
    df = table.to_dataframe(dense=True).T

    # Set up a list of metadata series, one per batch
    batches_to_norm = []
    if batch is not None:
        batch = batch.filter_ids(table.ids(axis='sample'))
        batch = batch.drop_missing_values()
        batch = batch.to_series()
        for g, one_batch in batch.groupby(batch):
            batches_to_norm.append(metadata.loc[one_batch.index])
    else:
        batches_to_norm.append(metadata)

    norm_dfs = []
    for meta in batches_to_norm:
        # Get case and control samples from metadata
        control_samples = meta[meta == "control"].index.tolist()
        case_samples = meta[meta == "case"].index.tolist()

        # Make sure there are enough controls to perform normalization
        if len(control_samples) < n_control_thresh:
            if batch is not None:
                batch_err = (' in batch ' +
                             str(batch.loc[meta.index].unique()[0]))
            else:
                batch_err = ''
            raise ValueError(
                "There aren't enough controls in your data" + batch_err +
                " (n_control_thresh = {})".format(n_control_thresh))

        # Filter OTUs, replace zeros with a random value, and
        # percentile normalize
        norm_df = _percentile_normalize_one_df(df, control_samples,
                                               case_samples, otu_thresh)
        norm_dfs.append(norm_df)

    # Merge all normalized data; keep all samples and all OTUs.
    # OTUs not present in one batch will be NaNs.
    norm_df = pd.concat(norm_dfs, axis=1)

    # Put this dataframe into biom format
    norm_biom = biom.Table(data=norm_df.values,
                           observation_ids=norm_df.index,
                           sample_ids=norm_df.columns)

    return norm_biom
class TestFilters(unittest.TestCase):
    def setUp(self):
        X = np.array([[10, 1, 4, 1, 4, 0],
                      [0, 0, 2, 0, 2, 8],
                      [0, 1, 2, 1, 2, 4],
                      [0, 1, 0, 1, 0, 0],
                      [2, 0, 0, 0, 0, 0],
                      [1, 0, 0, 0, 0, 0],
                      [7, 1, 0, 1, 0, 0]])
        oids = ['o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7']
        sids = ['s1', 's2', 's3', 's4', 's5', 's6']
        bigX = np.array([[10, 1, 4, 1, 4, 1, 0],
                         [0, 0, 2, 0, 2, 1, 8],
                         [0, 1, 2, 1, 2, 1, 4],
                         [0, 1, 0, 1, 0, 1, 0],
                         [2, 0, 0, 0, 0, 1, 0],
                         [1, 0, 0, 0, 0, 1, 0],
                         [4, 0, 0, 0, 0, 1, 0]])
        self.big_table = Table(bigX, oids, sids + ['s9'])
        self.metadata = pd.DataFrame(
            np.vstack((np.ones(8),
                       np.array(['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a']),
                       np.arange(8).astype(np.float64),
                       np.array(['Test', 'Test', 'Train', 'Train',
                                 'Train', 'Train', 'Test', 'Train']))).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8'])
        self.metadata['continuous'] = self.metadata['continuous'].astype(
            np.float64)
        self.trimmed_metadata = self.metadata.loc[
            ['s1', 's2', 's3', 's4', 's5', 's6']]

        df = pd.DataFrame([{'intercept': 1, 'categorical': 'b',
                            'continuous': 1., 'train': 'Train'},
                           {'intercept': 1, 'categorical': 'b',
                            'continuous': 1., 'train': 'Train'}],
                          index=['s2', 's4'])
        df = df.reindex(columns=['intercept', 'categorical',
                                 'continuous', 'train'])
        # DataFrame.append was removed in pandas 2.0; use pd.concat
        self.metadata_dup = pd.concat([self.metadata, df])
        self.table = Table(X, oids, sids)

    def test_match_duplicate(self):
        formula = 'C(categorical) + continuous'
        res = match_and_filter(self.table, self.metadata_dup,
                               formula, min_sample_count=0,
                               min_feature_count=0)
        res_table, res_metadata, res_design = res
        pdt.assert_frame_equal(res_table.to_dataframe(),
                               self.table.to_dataframe())

        exp_metadata = pd.DataFrame(
            np.vstack((np.ones(6),
                       np.array(['a', 'a', 'b', 'b', 'a', 'a']),
                       np.arange(6).astype(np.float64),
                       np.array(['Test', 'Test', 'Train', 'Train',
                                 'Train', 'Train']))).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        exp_metadata['continuous'] = exp_metadata['continuous'].astype(
            np.float64)
        pdt.assert_frame_equal(res_metadata, exp_metadata)

        exp_design = pd.DataFrame(
            np.vstack((np.ones(6),
                       np.array([0, 0, 1, 1, 0, 0]),
                       np.arange(6).astype(np.float64))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        pdt.assert_frame_equal(res_design, exp_design)

    def test_match_and_filter_no_filter(self):
        formula = 'C(categorical) + continuous'
        res = match_and_filter(self.table, self.metadata,
                               formula, min_sample_count=0,
                               min_feature_count=0)
        res_table, res_metadata, res_design = res
        pdt.assert_frame_equal(res_table.to_dataframe(),
                               self.table.to_dataframe())

        exp_metadata = pd.DataFrame(
            np.vstack((np.ones(6),
                       np.array(['a', 'a', 'b', 'b', 'a', 'a']),
                       np.arange(6).astype(np.float64),
                       np.array(['Test', 'Test', 'Train', 'Train',
                                 'Train', 'Train']))).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        exp_metadata['continuous'] = exp_metadata['continuous'].astype(
            np.float64)
        pdt.assert_frame_equal(res_metadata, exp_metadata)

        exp_design = pd.DataFrame(
            np.vstack((np.ones(6),
                       np.array([0, 0, 1, 1, 0, 0]),
                       np.arange(6).astype(np.float64))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        pdt.assert_frame_equal(res_design, exp_design)

    def test_match_and_filter_big_table(self):
        formula = 'C(categorical) + continuous'
        res = match_and_filter(self.big_table, self.metadata,
                               formula, min_sample_count=0,
                               min_feature_count=0)
        res_metadata = res[1]
        drop_metadata = res_metadata.dropna()
        res_design = res[2]
        drop_design = res_design.dropna()
        self.assertEqual(res_design.shape[0], drop_design.shape[0])
        self.assertEqual(res_metadata.shape[0], drop_metadata.shape[0])

    def test_split_training_random(self):
        np.random.seed(0)
        design = pd.DataFrame(
            np.vstack((np.ones(6),
                       np.array([0, 0, 1, 1, 0, 0]),
                       np.arange(6))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        res = split_training(self.table.to_dataframe().T,
                             self.trimmed_metadata, design,
                             training_column=None,
                             num_random_test_examples=2)
        trainX, testX, trainY, testY = res
        npt.assert_allclose(trainX.shape, np.array([4, 3]))
        npt.assert_allclose(trainY.shape, np.array([4, 7]))
        npt.assert_allclose(testX.shape, np.array([2, 3]))
        npt.assert_allclose(testY.shape, np.array([2, 7]))

    def test_split_training_fixed(self):
        np.random.seed(0)
        design = pd.DataFrame(
            np.vstack((np.ones(6),
                       np.array([0, 0, 1, 1, 0, 0]),
                       np.arange(6))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        t = self.table.to_dataframe().T
        res = split_training(t, self.metadata, design,
                             training_column='train',
                             num_random_test_examples=2)
        exp_trainX = design.iloc[2:].values
        exp_testX = design.iloc[:2].values
        exp_trainY = t.iloc[2:].values
        exp_testY = t.iloc[:2].values
        res_trainX, res_testX, res_trainY, res_testY = res
        npt.assert_allclose(exp_trainX, res_trainX)
        npt.assert_allclose(exp_trainY, res_trainY)
        npt.assert_allclose(exp_testX, res_testX)
        npt.assert_allclose(exp_testY, res_testY)
def rpca(table: biom.Table,
         n_components: Union[int, str] = DEFAULT_RANK,
         min_sample_count: int = DEFAULT_MSC,
         min_feature_count: int = DEFAULT_MFC,
         min_feature_frequency: float = DEFAULT_MFF,
         max_iterations: int = DEFAULT_ITERATIONS
         ) -> (skbio.OrdinationResults,
               skbio.DistanceMatrix,
               pd.DataFrame):
    """Runs RPCA with an rclr preprocessing step.

    This code will be run by both the standalone
    and QIIME 2 versions of DEICODE.
    """
    # get shape of table
    n_features, n_samples = table.shape

    # filter samples to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by presence in N samples
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # apply each filter above and import the table
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(frequency_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # robust-clr (rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    # save the loadings
    robust_clr = pd.DataFrame(X, index=table.index, columns=table.columns)
    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res, robust_clr
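# Hedged illustration (not from the source) of the presence-frequency filter
# defined above: a feature is kept when it is observed in more than
# min_feature_frequency percent of samples. The example values are
# hypothetical.
def _example_frequency_filter(min_feature_frequency=10.0):
    import numpy as np
    vals = np.array([0, 3, 0, 0, 5, 0, 0, 0, 0, 1])  # one feature, 10 samples
    passes = (np.sum(vals > 0) / vals.size) > (min_feature_frequency / 100)
    return passes  # True here: present in 30% of samples, above 10%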
def qarcoal(
    table: biom.Table,
    taxonomy: pd.DataFrame,
    num_string: str,
    denom_string: str,
    samples_to_use: Metadata = None,
    allow_shared_features: bool = False,
) -> pd.DataFrame:
    """Calculate sample-wise log-ratios of features based on taxonomy.

    Parameters
    ----------
    table : biom.Table
        biom table with which to calculate log ratios
    taxonomy : pd.DataFrame
        DataFrame with taxonomy information (should have a Taxon column in
        which features will be searched)
    num_string : str
        numerator string to search for in taxonomy
    denom_string : str
        denominator string to search for in taxonomy
    samples_to_use : Metadata, optional
        Q2 Metadata file with samples to use. If provided, the feature
        table will be filtered to only consider samples present in this
        file.
    allow_shared_features : bool
        denotes handling of features shared between numerator and
        denominator. If False, an error is raised if features are shared
        between numerator and denominator. If True, shared features are
        allowed without raising an error.

    Returns
    -------
    comparison_df : pd.DataFrame
        DataFrame of the form:

            Sample-ID | Num_Sum | Denom_Sum | log_ratio
            S1        | 7       | 15        | -0.762140
    """
    # biom table is features x samples
    if samples_to_use is not None:
        filt_samples = set(samples_to_use.to_dataframe().index)
        feat_table = table.filter(filt_samples, axis="sample",
                                  inplace=False)
        feat_table = feat_table.to_dataframe()
    else:
        feat_table = table.to_dataframe()

    # raise error if there are any negative counts in the feature table
    if feat_table.lt(0).any().any():
        raise ValueError("Feature table has negative counts!")

    tax_num_df, tax_denom_df = filter_and_join_taxonomy(
        feat_table,
        taxonomy,
        num_string,
        denom_string,
    )

    # if shared features are disallowed, check to make sure they don't
    # occur; if allowed, this step can be skipped at the user's risk
    if not allow_shared_features:
        shared_features = set(tax_num_df.index) & set(tax_denom_df.index)
        if shared_features:
            raise ValueError("Shared features between num and denom!")

    # sum feature counts within numerator and denominator for each sample
    tax_num_sample_sum = tax_num_df.sum(axis=0)
    tax_denom_sample_sum = tax_denom_df.sum(axis=0)

    comparison_df = pd.DataFrame.from_records(
        [tax_num_sample_sum, tax_denom_sample_sum],
        index=["Num_Sum", "Denom_Sum"],
    ).T
    comparison_df["log_ratio"] = comparison_df.apply(
        lambda x: np.log(x.Num_Sum / x.Denom_Sum), axis=1
    )
    comparison_df.index.name = "Sample-ID"
    return comparison_df
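# Hedged usage sketch (not from the source): computing per-sample log-ratios
# with qarcoal(). The file names and the genus-level search strings are
# hypothetical placeholders.
def _example_qarcoal_usage():
    import biom
    import pandas as pd
    table = biom.load_table('example.biom')
    taxonomy = pd.read_csv('taxonomy.tsv', sep='\t', index_col=0)
    ratios = qarcoal(table, taxonomy,
                     num_string='g__Bacteroides',
                     denom_string='g__Prevotella')
    return ratios.head()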