# NOTE: these imports are reconstructed assumptions based on the names used
# below and the tybalt package layout; adjust the paths if the package differs.
import pandas as pd
from scipy.stats import zscore
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import euclidean_distances
from keras import backend as K
from keras.utils import to_categorical

from tybalt.models import Tybalt, cTybalt, Adage
from tybalt.utils.vae_utils import approx_keras_binary_cross_entropy


class DataModel:
    """
    Methods for loading and compressing input data

    Usage:
    from tybalt.data_models import DataModel

    data = DataModel(filename)
    """
    def __init__(self, filename=None, df=False, select_columns=False,
                 gene_modules=None, test_filename=None, test_df=None):
        """
        DataModel can be initialized with either a filename or a pandas
        dataframe and processes gene modules and sample labels if provided.

        Arguments:
        filename - if provided, load gene expression data into object
        df - dataframe of preloaded gene expression data
        select_columns - the columns of the dataframe to use
        gene_modules - a list of gene module assignments for each gene (for
                       use with the simulated data or when ground truth gene
                       modules are known)
        test_filename - if provided, loads testing dataset into object
        test_df - dataframe of preloaded gene expression testing set data
        """
        # Load gene expression data
        self.filename = filename
        if filename is None:
            self.df = df
        else:
            self.df = pd.read_table(self.filename, index_col=0)

        # Load test set gene expression data if applicable
        self.test_filename = test_filename
        self.test_df = test_df
        if test_filename is not None and test_df is None:
            self.test_df = pd.read_table(self.test_filename, index_col=0)

        if select_columns:
            subset_df = self.df.iloc[:, select_columns]
            other_columns = range(max(select_columns) + 1, self.df.shape[1])
            self.other_df = self.df.iloc[:, other_columns]
            self.df = subset_df
            if self.test_df is not None:
                self.test_df = self.test_df.iloc[:, select_columns]

        if gene_modules is not None:
            self.gene_modules = pd.DataFrame(gene_modules).T
            self.gene_modules.index = ['modules']

        self.num_samples, self.num_genes = self.df.shape

        # Only validate test set dimensions when a test set was provided
        if self.test_df is not None:
            self.num_test_samples, self.num_test_genes = self.test_df.shape
            assert_notice = 'train and test sets must have same number of genes'
            assert self.num_genes == self.num_test_genes, assert_notice

    def transform(self, how):
        self.transformation = how
        if how == 'zscore':
            self.transform_fit = StandardScaler().fit(self.df)
        elif how == 'zeroone':
            self.transform_fit = MinMaxScaler().fit(self.df)
        else:
            raise ValueError('how must be either "zscore" or "zeroone".')

        self.df = pd.DataFrame(self.transform_fit.transform(self.df),
                               index=self.df.index,
                               columns=self.df.columns)

        if self.test_df is not None:
            if how == 'zscore':
                self.transform_test_fit = StandardScaler().fit(self.test_df)
            elif how == 'zeroone':
                self.transform_test_fit = MinMaxScaler().fit(self.test_df)

            test_transform = self.transform_test_fit.transform(self.test_df)
            self.test_df = pd.DataFrame(test_transform,
                                        index=self.test_df.index,
                                        columns=self.test_df.columns)
    def pca(self, n_components, transform_df=False, transform_test_df=False):
        self.pca_fit = decomposition.PCA(n_components=n_components)
        self.pca_df = self.pca_fit.fit_transform(self.df)
        colnames = ['pca_{}'.format(x) for x in range(0, n_components)]
        self.pca_df = pd.DataFrame(self.pca_df, index=self.df.index,
                                   columns=colnames)
        self.pca_weights = pd.DataFrame(self.pca_fit.components_,
                                        columns=self.df.columns,
                                        index=colnames)
        if transform_df:
            out_df = self.pca_fit.transform(transform_df)
            return out_df

        if transform_test_df:
            self.pca_test_df = self.pca_fit.transform(self.test_df)

    def ica(self, n_components, transform_df=False, transform_test_df=False):
        self.ica_fit = decomposition.FastICA(n_components=n_components)
        self.ica_df = self.ica_fit.fit_transform(self.df)
        colnames = ['ica_{}'.format(x) for x in range(0, n_components)]
        self.ica_df = pd.DataFrame(self.ica_df, index=self.df.index,
                                   columns=colnames)
        self.ica_weights = pd.DataFrame(self.ica_fit.components_,
                                        columns=self.df.columns,
                                        index=colnames)
        if transform_df:
            out_df = self.ica_fit.transform(transform_df)
            return out_df

        if transform_test_df:
            self.ica_test_df = self.ica_fit.transform(self.test_df)

    def nmf(self, n_components, transform_df=False, transform_test_df=False,
            init='nndsvdar', tol=5e-3):
        self.nmf_fit = decomposition.NMF(n_components=n_components, init=init,
                                         tol=tol)
        self.nmf_df = self.nmf_fit.fit_transform(self.df)
        colnames = ['nmf_{}'.format(x) for x in range(n_components)]
        self.nmf_df = pd.DataFrame(self.nmf_df, index=self.df.index,
                                   columns=colnames)
        self.nmf_weights = pd.DataFrame(self.nmf_fit.components_,
                                        columns=self.df.columns,
                                        index=colnames)
        if transform_df:
            out_df = self.nmf_fit.transform(transform_df)
            return out_df

        if transform_test_df:
            self.nmf_test_df = self.nmf_fit.transform(self.test_df)
    def nn(self, n_components, model='tybalt', transform_df=False,
           transform_test_df=False, **kwargs):
        # unpack kwargs
        original_dim = kwargs.pop('original_dim', self.df.shape[1])
        latent_dim = kwargs.pop('latent_dim', n_components)
        batch_size = kwargs.pop('batch_size', 50)
        epochs = kwargs.pop('epochs', 50)
        learning_rate = kwargs.pop('learning_rate', 0.0005)
        noise = kwargs.pop('noise', 0)
        sparsity = kwargs.pop('sparsity', 0)
        kappa = kwargs.pop('kappa', 1)
        epsilon_std = kwargs.pop('epsilon_std', 1.0)
        beta = kwargs.pop('beta', 0)
        beta = K.variable(beta)
        loss = kwargs.pop('loss', 'binary_crossentropy')
        validation_ratio = kwargs.pop('validation_ratio', 0.1)
        tied_weights = kwargs.pop('tied_weights', True)
        if tied_weights and model == 'adage':
            use_decoder_weights = False
        else:
            use_decoder_weights = True
        verbose = kwargs.pop('verbose', True)
        tybalt_separate_loss = kwargs.pop('separate_loss', False)
        adage_comp_loss = kwargs.pop('multiply_adage_loss', False)
        adage_optimizer = kwargs.pop('adage_optimizer', 'adam')

        # Split the validation set first; the conditional VAE label
        # processing below indexes nn_train_df and nn_test_df
        self.nn_test_df = self.df.sample(frac=validation_ratio)
        self.nn_train_df = self.df.drop(self.nn_test_df.index)

        # Extra processing for conditional vae
        if hasattr(self, 'other_df') and model == 'ctybalt':
            y_df = kwargs.pop('y_df', self.other_df)
            y_var = kwargs.pop('y_var', 'groups')
            label_dim = kwargs.pop('label_dim', len(set(y_df[y_var])))

            self.nn_train_y = y_df.drop(self.nn_test_df.index)
            self.nn_test_y = y_df.drop(self.nn_train_df.index)
            self.nn_train_y = self.nn_train_y.loc[self.nn_train_df.index, ]
            self.nn_test_y = self.nn_test_y.loc[self.nn_test_df.index, ]

            label_encoder = LabelEncoder().fit(self.other_df[y_var])
            self.nn_train_y = label_encoder.transform(self.nn_train_y[y_var])
            self.nn_test_y = label_encoder.transform(self.nn_test_y[y_var])
            self.other_onehot = label_encoder.transform(self.other_df[y_var])

            self.nn_train_y = to_categorical(self.nn_train_y)
            self.nn_test_y = to_categorical(self.nn_test_y)
            self.other_onehot = to_categorical(self.other_onehot)

        if model == 'tybalt':
            self.tybalt_fit = Tybalt(original_dim=original_dim,
                                     latent_dim=latent_dim,
                                     batch_size=batch_size,
                                     epochs=epochs,
                                     learning_rate=learning_rate,
                                     kappa=kappa,
                                     epsilon_std=epsilon_std,
                                     beta=beta,
                                     loss=loss,
                                     verbose=verbose)
            self.tybalt_fit.initialize_model()
            self.tybalt_fit.train_vae(train_df=self.nn_train_df,
                                      test_df=self.nn_test_df,
                                      separate_loss=tybalt_separate_loss)
            features = ['vae_{}'.format(x) for x in range(0, latent_dim)]
            self.tybalt_weights = (self.tybalt_fit.get_weights(
                decoder=use_decoder_weights))
            self.tybalt_weights = pd.DataFrame(self.tybalt_weights[1][0],
                                               columns=self.df.columns,
                                               index=features)
            self.tybalt_df = self.tybalt_fit.compress(self.df)
            self.tybalt_df.columns = features
            if transform_df:
                out_df = self.tybalt_fit.compress(transform_df)
                return out_df

            if transform_test_df:
                self.tybalt_test_df = self.tybalt_fit.compress(self.test_df)

        if model == 'ctybalt':
            self.ctybalt_fit = cTybalt(original_dim=original_dim,
                                       latent_dim=latent_dim,
                                       label_dim=label_dim,
                                       batch_size=batch_size,
                                       epochs=epochs,
                                       learning_rate=learning_rate,
                                       kappa=kappa,
                                       epsilon_std=epsilon_std,
                                       beta=beta,
                                       loss=loss,
                                       verbose=verbose)
            self.ctybalt_fit.initialize_model()
            self.ctybalt_fit.train_cvae(train_df=self.nn_train_df,
                                        train_labels_df=self.nn_train_y,
                                        test_df=self.nn_test_df,
                                        test_labels_df=self.nn_test_y)
            self.ctybalt_decoder_w = (self.ctybalt_fit.get_weights(
                decoder=use_decoder_weights))
            features = ['cvae_{}'.format(x) for x in range(0, latent_dim)]
            features_with_groups = features + [
                'group_{}'.format(x)
                for x in range(latent_dim, latent_dim + label_dim)
            ]
            w = pd.DataFrame(self.ctybalt_decoder_w[1][0])
            self.ctybalt_group_w = pd.DataFrame(w.iloc[:, -label_dim:])
            gene_range = range(0, w.shape[1] - label_dim)
            self.ctybalt_weights = pd.DataFrame(w.iloc[:, gene_range])
            self.ctybalt_weights.columns = self.df.columns
            self.ctybalt_weights.index = features_with_groups
            self.ctybalt_df = self.ctybalt_fit.compress(
                [self.df, self.other_onehot])
            self.ctybalt_df.columns = features
            if transform_df:
                # Note: transform_df must be a list of two dfs [x_df, y_df]
                out_df = self.ctybalt_fit.compress(transform_df)
                return out_df

        if model == 'adage':
            self.adage_fit = Adage(original_dim=original_dim,
                                   latent_dim=latent_dim,
                                   noise=noise,
                                   batch_size=batch_size,
                                   epochs=epochs,
                                   sparsity=sparsity,
                                   learning_rate=learning_rate,
                                   loss=loss,
                                   verbose=verbose,
                                   tied_weights=tied_weights,
                                   optimizer=adage_optimizer)
            self.adage_fit.initialize_model()
            self.adage_fit.train_adage(train_df=self.nn_train_df,
                                       test_df=self.nn_test_df,
                                       adage_comparable_loss=adage_comp_loss)
            features = ['dae_{}'.format(x) for x in range(0, latent_dim)]
            self.adage_weights = (self.adage_fit.get_weights(
                decoder=use_decoder_weights))
            self.adage_weights = pd.DataFrame(self.adage_weights[1][0],
                                              columns=self.df.columns,
                                              index=features)
            self.adage_df = self.adage_fit.compress(self.df)
            self.adage_df.columns = features
            if transform_df:
                out_df = self.adage_fit.compress(transform_df)
                return out_df

            if transform_test_df:
                self.adage_test_df = self.adage_fit.compress(self.test_df)
    def combine_models(self, include_labels=False, include_raw=False,
                       test_set=False):
        """
        Merge z matrices together across algorithms

        Arguments:
        include_labels - if True, append the sample label dataframe
        include_raw - if True, append the input gene expression dataframe
        test_set - if True, output z matrix predictions for test set

        Output:
        pandas dataframe of all model z matrices
        """
        all_models = []
        if hasattr(self, 'pca_df'):
            if test_set:
                pca_df = pd.DataFrame(self.pca_test_df,
                                      index=self.test_df.index,
                                      columns=self.pca_df.columns)
            else:
                pca_df = self.pca_df
            all_models += [pca_df]
        if hasattr(self, 'ica_df'):
            if test_set:
                ica_df = pd.DataFrame(self.ica_test_df,
                                      index=self.test_df.index,
                                      columns=self.ica_df.columns)
            else:
                ica_df = self.ica_df
            all_models += [ica_df]
        if hasattr(self, 'nmf_df'):
            if test_set:
                nmf_df = pd.DataFrame(self.nmf_test_df,
                                      index=self.test_df.index,
                                      columns=self.nmf_df.columns)
            else:
                nmf_df = self.nmf_df
            all_models += [nmf_df]
        if hasattr(self, 'tybalt_df'):
            if test_set:
                tybalt_df = self.tybalt_test_df
                tybalt_df.columns = self.tybalt_df.columns
            else:
                tybalt_df = self.tybalt_df
            all_models += [tybalt_df]
        if hasattr(self, 'ctybalt_df'):
            if test_set:
                ctybalt_df = pd.DataFrame(self.ctybalt_test_df,
                                          index=self.test_df.index,
                                          columns=self.ctybalt_df.columns)
            else:
                ctybalt_df = self.ctybalt_df
            all_models += [ctybalt_df]
        if hasattr(self, 'adage_df'):
            if test_set:
                adage_df = self.adage_test_df
                adage_df.columns = self.adage_df.columns
            else:
                adage_df = self.adage_df
            all_models += [adage_df]
        if include_raw:
            all_models += [self.df]
        if include_labels:
            all_models += [self.other_df]
        all_df = pd.concat(all_models, axis=1)
        return all_df

    def combine_weight_matrix(self):
        all_weight = []
        if hasattr(self, 'pca_df'):
            all_weight += [self.pca_weights]
        if hasattr(self, 'ica_df'):
            all_weight += [self.ica_weights]
        if hasattr(self, 'nmf_df'):
            all_weight += [self.nmf_weights]
        if hasattr(self, 'tybalt_df'):
            all_weight += [self.tybalt_weights]
        if hasattr(self, 'ctybalt_df'):
            all_weight += [self.ctybalt_weights]
        if hasattr(self, 'adage_df'):
            all_weight += [self.adage_weights]
        all_weight_df = pd.concat(all_weight, axis=0).T
        all_weight_df = all_weight_df.rename({'Unnamed: 0': 'entrez_gene'},
                                             axis='columns')
        return all_weight_df
    def compile_reconstruction(self, test_set=False):
        """
        Compile reconstruction costs between input and algorithm
        reconstruction

        Arguments:
        test_set - if True, compile reconstruction for the test set data

        Output:
        A tuple storing 1) a dataframe of reconstruction costs and 2) a
        dictionary of the reconstructed matrix for each algorithm
        """
        # Set the dataframe for use to compute reconstruction cost
        if test_set:
            input_df = self.test_df
        else:
            input_df = self.df

        all_reconstruction = {}
        reconstruct_mat = {}
        if hasattr(self, 'pca_df'):
            # Set PCA dataframe
            if test_set:
                pca_df = self.pca_test_df
            else:
                pca_df = self.pca_df

            pca_reconstruct = self.pca_fit.inverse_transform(pca_df)
            pca_recon = approx_keras_binary_cross_entropy(
                pca_reconstruct, input_df, self.num_genes)
            all_reconstruction['pca'] = [pca_recon]
            reconstruct_mat['pca'] = pd.DataFrame(pca_reconstruct,
                                                  index=input_df.index,
                                                  columns=input_df.columns)
        if hasattr(self, 'ica_df'):
            # Set ICA dataframe
            if test_set:
                ica_df = self.ica_test_df
            else:
                ica_df = self.ica_df

            ica_reconstruct = self.ica_fit.inverse_transform(ica_df)
            ica_recon = approx_keras_binary_cross_entropy(
                ica_reconstruct, input_df, self.num_genes)
            all_reconstruction['ica'] = [ica_recon]
            reconstruct_mat['ica'] = pd.DataFrame(ica_reconstruct,
                                                  index=input_df.index,
                                                  columns=input_df.columns)
        if hasattr(self, 'nmf_df'):
            # Set NMF dataframe
            if test_set:
                nmf_df = self.nmf_test_df
            else:
                nmf_df = self.nmf_df

            nmf_reconstruct = self.nmf_fit.inverse_transform(nmf_df)
            nmf_recon = approx_keras_binary_cross_entropy(
                nmf_reconstruct, input_df, self.num_genes)
            all_reconstruction['nmf'] = [nmf_recon]
            reconstruct_mat['nmf'] = pd.DataFrame(nmf_reconstruct,
                                                  index=input_df.index,
                                                  columns=input_df.columns)
        if hasattr(self, 'tybalt_df'):
            vae_reconstruct = self.tybalt_fit.decoder.predict_on_batch(
                self.tybalt_fit.encoder.predict_on_batch(input_df))
            vae_recon = approx_keras_binary_cross_entropy(
                vae_reconstruct, input_df, self.num_genes)
            all_reconstruction['vae'] = [vae_recon]
            reconstruct_mat['vae'] = pd.DataFrame(vae_reconstruct,
                                                  index=input_df.index,
                                                  columns=input_df.columns)
        if hasattr(self, 'adage_df'):
            dae_reconstruct = self.adage_fit.decoder.predict_on_batch(
                self.adage_fit.encoder.predict_on_batch(input_df))
            dae_recon = approx_keras_binary_cross_entropy(
                dae_reconstruct, input_df, self.num_genes)
            all_reconstruction['dae'] = [dae_recon]
            reconstruct_mat['dae'] = pd.DataFrame(dae_reconstruct,
                                                  index=input_df.index,
                                                  columns=input_df.columns)

        return pd.DataFrame(all_reconstruction), reconstruct_mat

    def compile_reconstruction_testset(self):
        all_reconstruction = {}
        reconstruct_mat = {}
        if hasattr(self, 'pca_test_df'):
            key = 'pca_test'
            pca_reconstruct = self.pca_fit.inverse_transform(self.pca_test_df)
            pca_recon = approx_keras_binary_cross_entropy(
                pca_reconstruct, self.test_df, self.num_genes)
            all_reconstruction[key] = [pca_recon]
            reconstruct_mat[key] = pd.DataFrame(pca_reconstruct,
                                                index=self.test_df.index,
                                                columns=self.test_df.columns)
        if hasattr(self, 'ica_test_df'):
            key = 'ica_test'
            ica_reconstruct = self.ica_fit.inverse_transform(self.ica_test_df)
            ica_recon = approx_keras_binary_cross_entropy(
                ica_reconstruct, self.test_df, self.num_genes)
            all_reconstruction[key] = [ica_recon]
            reconstruct_mat[key] = pd.DataFrame(ica_reconstruct,
                                                index=self.test_df.index,
                                                columns=self.test_df.columns)
        if hasattr(self, 'nmf_test_df'):
            key = 'nmf_test'
            nmf_reconstruct = self.nmf_fit.inverse_transform(self.nmf_test_df)
            nmf_recon = approx_keras_binary_cross_entropy(
                nmf_reconstruct, self.test_df, self.num_genes)
            all_reconstruction[key] = [nmf_recon]
            reconstruct_mat[key] = pd.DataFrame(nmf_reconstruct,
                                                index=self.test_df.index,
                                                columns=self.test_df.columns)
        if hasattr(self, 'tybalt_test_df'):
            key = 'vae'
            vae_reconstruct = self.tybalt_fit.decoder.predict_on_batch(
                self.tybalt_fit.encoder.predict_on_batch(self.test_df))
            vae_recon = approx_keras_binary_cross_entropy(
                vae_reconstruct, self.test_df, self.num_genes)
            all_reconstruction[key] = [vae_recon]
            reconstruct_mat[key] = pd.DataFrame(vae_reconstruct,
                                                index=self.test_df.index,
                                                columns=self.test_df.columns)
        if hasattr(self, 'adage_test_df'):
            key = 'dae'
            dae_reconstruct = self.adage_fit.decoder.predict_on_batch(
                self.adage_fit.encoder.predict_on_batch(self.test_df))
            dae_recon = approx_keras_binary_cross_entropy(
                dae_reconstruct, self.test_df, self.num_genes)
            all_reconstruction[key] = [dae_recon]
            reconstruct_mat[key] = pd.DataFrame(dae_reconstruct,
                                                index=self.test_df.index,
                                                columns=self.test_df.columns)

        return pd.DataFrame(all_reconstruction), reconstruct_mat
    def get_modules_ranks(self, weight_df, num_components, noise_column=0):
        """
        Takes a compression algorithm's weight matrix (gene by latent
        feature), and reports two performance metrics:

        1) mean rank sum across modules: measures how well modules are
           separated into features
        2) min average rank across modules: measures how well modules of
           decreasing proportion are captured
        """
        # Rank absolute value compressed features for each gene
        weight_rank_df = weight_df.abs().rank(axis=0, ascending=False)

        # Add gene module membership to ranks
        module_w_df = pd.concat([weight_rank_df, self.gene_modules], axis=0)
        module_w_df = module_w_df.astype(int)

        # Get the total module by compressed feature mean rank
        module_meanrank_df = (module_w_df.T.groupby('modules').mean()).T

        # Drop the noise column and get the sum of the minimum mean rank.
        # This heuristic measures, on average, how well individual compressed
        # features capture ground truth gene modules. A lower number indicates
        # better separation performance for the algorithm of interest
        module_meanrank_minsum = (module_meanrank_df.drop(
            noise_column, axis=1).min(axis=0).sum())

        # Process output data
        # Divide this by the total number of features in the model. Subtract
        # by one to account for the dropped noise column, if applicable
        module_meanrank_minavg = module_meanrank_minsum / (num_components - 1)

        # We are interested if the features encapsulate gene modules
        # A lower number across modules indicates a stronger ability to
        # aggregate genes into features
        module_min_rank = pd.DataFrame(module_meanrank_df.min(),
                                       columns=['min_rank'])
        return module_meanrank_df, module_min_rank, module_meanrank_minavg

    def get_group_means(self, df):
        """
        Get the mean latent space vector representation of input groups
        """
        return df.assign(groups=self.other_df).groupby('groups').mean()

    def get_subtraction(self, group_means, group_list):
        """
        Subtract two group means given by group list
        """
        a, b = group_list
        a_df = group_means.loc[a, :]
        b_df = group_means.loc[b, :]
        subtraction = pd.DataFrame(a_df - b_df).T
        return subtraction

    def subtraction_essense(self, group_subtract, mean_rank, node):
        """
        Obtain the difference between the subtraction and node of interest
        """
        # Subset mean rank to the node of the "dropped" feature in the
        # simulation
        feature_essense = mean_rank.loc[:, node]

        # The node essence is the compressed feature with the lowest mean rank
        node_essense = feature_essense.idxmin()
        node_idx = int(node_essense.split('_')[1])

        # Ask how different this specific feature is from all others
        group_z = zscore(group_subtract.iloc[0, :].tolist())
        node_essense_zscore = group_z[node_idx]
        return node_essense_zscore

    def get_addition(self, group_means, subtraction, group):
        """
        Add node to subtraction
        """
        mean_feature = group_means.loc[group, :]
        return subtraction + mean_feature

    def reconstruct_group(self, lsa_result, algorithm=False):
        """
        Reconstruct the latent space arithmetic result back to input dim
        """
        if algorithm == 'tybalt':
            return self.tybalt_fit.decoder.predict_on_batch(lsa_result)
        elif algorithm == 'ctybalt':
            return self.ctybalt_fit.decoder.predict_on_batch(lsa_result)
        elif algorithm == 'adage':
            return self.adage_fit.decoder.predict_on_batch(lsa_result)
        elif algorithm == 'pca':
            return self.pca_fit.inverse_transform(lsa_result)
        elif algorithm == 'ica':
            return self.ica_fit.inverse_transform(lsa_result)
        elif algorithm == 'nmf':
            return self.nmf_fit.inverse_transform(lsa_result)
        else:
            raise ValueError('algorithm must be one of: "pca", "ica", "nmf",'
                             ' "adage", "tybalt", or "ctybalt"')

    def get_average_distance(self, transform_df, real_df):
        """
        Obtain the average euclidean distance between the transformed vector
        and all samples as part of the real dataframe
        """
        return euclidean_distances(transform_df, real_df).mean()
    def _wrap_sub_eval(self, weight_df, compress_df, num_components,
                       noise_column, subtraction_groups, addition_group, node,
                       real_df, algorithm):
        """
        Helper function that wraps all evals
        """
        # Get the module mean rank and the min rank sum
        mean_rank_mod, mean_rank_min, min_rank_avg = (self.get_modules_ranks(
            weight_df, num_components, noise_column))

        # Begin subtraction analysis - first, get group means
        group_means = self.get_group_means(compress_df)

        # Next, get the subtraction result
        sub_result = self.get_subtraction(group_means, subtraction_groups)

        # Then, get the relative minimum difference to determine whether the
        # subtraction isolates the feature we expect it should - and how much
        relative_min_diff = self.subtraction_essense(sub_result,
                                                     mean_rank_mod, node)

        # Now reconstruct the subtraction back into original space
        lsa_result = self.get_addition(group_means, sub_result,
                                       addition_group)
        recon_lsa = self.reconstruct_group(lsa_result, algorithm)
        avg_dist = self.get_average_distance(recon_lsa, real_df)

        out_results = mean_rank_min.T
        out_results.index = [algorithm]
        out_results = out_results.assign(minimum_rank_avg=min_rank_avg)
        out_results = out_results.assign(min_node_zscore=relative_min_diff)
        out_results = out_results.assign(avg_recon_dist=avg_dist)
        return out_results

    def subtraction_eval(self, num_components, noise_column, group_list,
                         add_groups, expect_node, real_df):
        tybalt_results = self._wrap_sub_eval(weight_df=self.tybalt_weights,
                                             compress_df=self.tybalt_df,
                                             num_components=num_components,
                                             noise_column=noise_column,
                                             subtraction_groups=group_list,
                                             addition_group=add_groups,
                                             node=expect_node,
                                             real_df=real_df,
                                             algorithm='tybalt')
        adage_results = self._wrap_sub_eval(weight_df=self.adage_weights,
                                            compress_df=self.adage_df,
                                            num_components=num_components,
                                            noise_column=noise_column,
                                            subtraction_groups=group_list,
                                            addition_group=add_groups,
                                            node=expect_node,
                                            real_df=real_df,
                                            algorithm='adage')
        pca_results = self._wrap_sub_eval(weight_df=self.pca_weights,
                                          compress_df=self.pca_df,
                                          num_components=num_components,
                                          noise_column=noise_column,
                                          subtraction_groups=group_list,
                                          addition_group=add_groups,
                                          node=expect_node,
                                          real_df=real_df,
                                          algorithm='pca')
        ica_results = self._wrap_sub_eval(weight_df=self.ica_weights,
                                          compress_df=self.ica_df,
                                          num_components=num_components,
                                          noise_column=noise_column,
                                          subtraction_groups=group_list,
                                          addition_group=add_groups,
                                          node=expect_node,
                                          real_df=real_df,
                                          algorithm='ica')
        nmf_results = self._wrap_sub_eval(weight_df=self.nmf_weights,
                                          compress_df=self.nmf_df,
                                          num_components=num_components,
                                          noise_column=noise_column,
                                          subtraction_groups=group_list,
                                          addition_group=add_groups,
                                          node=expect_node,
                                          real_df=real_df,
                                          algorithm='nmf')
        return pd.concat([
            tybalt_results, adage_results, pca_results, ica_results,
            nmf_results
        ])
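
# ---------------------------------------------------------------------------
# Illustrative usage sketch (assumes the imports at the top of this module
# resolve): exercises the scikit-learn-backed compression path on synthetic
# data. The dataframe, module assignments, and component count below are
# invented for demonstration; the neural-network models (nn()) additionally
# require the Keras-backed Tybalt/Adage classes and are not exercised here.
if __name__ == '__main__':
    import numpy as np

    np.random.seed(0)

    # 100 synthetic samples by 20 genes; integer gene ids keep the columns
    # aligned with the gene module labels used by get_modules_ranks()
    sim_df = pd.DataFrame(np.random.rand(100, 20))

    # Four invented ground truth modules of five genes each; module 0 plays
    # the role of the noise module
    modules = [0] * 5 + [1] * 5 + [2] * 5 + [3] * 5

    dm = DataModel(df=sim_df, gene_modules=modules)
    dm.transform(how='zeroone')  # scale each gene to [0, 1]

    # Fit the three matrix factorization models with 5 latent features each
    dm.pca(n_components=5)
    dm.ica(n_components=5)
    dm.nmf(n_components=5)

    # Samples by concatenated z matrices: (100, 15)
    print(dm.combine_models().shape)

    # Reconstruction cost per algorithm plus the reconstructed matrices
    recon_cost_df, recon_mats = dm.compile_reconstruction()
    print(recon_cost_df)

    # How well do the PCA features recover the simulated gene modules?
    meanrank_df, min_rank_df, min_rank_avg = dm.get_modules_ranks(
        weight_df=dm.pca_weights, num_components=5, noise_column=0)
    print(min_rank_df)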