def test_ilr_basis_one_dimension_error(self):
    table = np.array([[1., 10.],
                      [1.14141414, 9.90909091],
                      [1.28282828, 9.81818182],
                      [1.42424242, 9.72727273],
                      [1.56565657, 9.63636364]])
    basis = np.array([0.80442968, 0.19557032])
    with self.assertRaises(ValueError):
        ilr(table, basis=basis)
def _to_balances(table, tree):
    """ Converts a table of abundances to balances given a tree.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns
        contained in the table.

    Returns
    -------
    pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns.
    np.array
        Orthonormal basis in the Aitchison simplex generated from
        `tree`.
    """
    non_tips = [n.name for n in tree.levelorder() if not n.is_tip()]
    basis, _ = balance_basis(tree)

    mat = ilr(table.values, basis=basis)
    ilr_table = pd.DataFrame(mat, columns=non_tips, index=table.index)
    return ilr_table, basis
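# A minimal usage sketch for `_to_balances`. The tree and table below are
# made up, and `balance_basis` is assumed to be the gneiss-style helper used
# in the function body; internal nodes are named so they can label the
# balance columns.

def _example_to_balances():
    import pandas as pd
    from skbio import TreeNode

    tree = TreeNode.read(["((a,b)y1,c)y0;"])
    table = pd.DataFrame([[1., 2., 4.],
                          [2., 2., 2.]],
                         columns=['a', 'b', 'c'],
                         index=['s1', 's2'])
    balances, basis = _to_balances(table, tree)
    # `balances` has one column per internal node: 'y0' and 'y1'.
    return balances, basis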
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Performs an isometric log-ratio (ilr) transformation on a feature table.

    This creates a new table of balances (groups of features) that
    distinguish samples. Zeros must first be removed from the table
    (e.g. with add-pseudocount). For the documentation conventions used
    here, see: https://numpydoc.readthedocs.io/en/latest/

    Parameters
    ----------
    table : pd.DataFrame
        Dataframe of the feature table where rows correspond to samples
        and columns are features. The values within the table must be
        strictly positive (no zeros).
    tree : skbio.TreeNode
        A tree relating all of the features to balances or
        log-contrasts (hierarchy). This tree must be bifurcating
        (i.e. every internal node has exactly 2 children). The internal
        nodes of the tree will be renamed.

    Returns
    -------
    balances : pd.DataFrame
        Balances calculated from the feature table. Each balance
        represents the log-ratio between the two groups of features
        below the corresponding internal node.
    """
    _table, _tree = match_tips(table, tree)
    basis, nodes = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances, columns=in_nodes, index=table.index)
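# A hand-checkable property of the transform: a table with D features yields
# D - 1 balances, one per internal node. A minimal sketch with made-up data,
# assuming `match_tips` and `balance_basis` are in scope as in the body above:

def _example_ilr_transform():
    import pandas as pd
    import skbio

    tree = skbio.TreeNode.read(["((a,b)y1,c)y0;"])
    table = pd.DataFrame([[1., 2., 4.],
                          [2., 2., 2.]],
                         columns=['a', 'b', 'c'],
                         index=['s1', 's2'])
    balances = ilr_transform(table, tree)
    assert balances.shape == (2, 2)  # n_samples x (D - 1)
    return balances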
def normalize_transform(self, mode='clr'):
    """
    Some operations may require transformed data. This function performs
    sample-wise normalization followed by a clr or ilr transform on all
    OTU tables in a Batch object. It returns a deep copy of the original
    Batch object, so the original file is not modified.

    :param mode: transformation mode; clr (centered log-ratio)
                 or ilr (isometric log-ratio)
    :return: Transformed copy of Batch object.
    """
    batchcopy = copy.deepcopy(self)
    try:
        for x in list(self.otu):
            # normalizes the data by samples
            normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
            mat = csr_matrix.toarray(normbiom.matrix_data)
            # replaces all zeros with a small value
            # multiplicative replacement preserves ratios between values
            mat = multiplicative_replacement(mat)
            if mode == 'clr':
                mat = clr(mat)
            elif mode == 'ilr':
                # caution: ilr yields one fewer column than the input,
                # so the result no longer matches the original feature count
                mat = ilr(mat)
            else:
                raise ValueError("Only CLR and ILR transformations "
                                 "are currently supported.")
            normbiom._data = csc_matrix(mat)
            batchcopy.otu[x] = normbiom
    except Exception:
        logger.error("Failed to normalize data", exc_info=True)
    return batchcopy
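# The per-table pipeline above (normalize -> replace zeros -> log-ratio
# transform) can be exercised on a plain array without a Batch object; a
# minimal sketch using skbio.stats.composition directly (counts made up):

def _example_normalize_transform():
    import numpy as np
    from skbio.stats.composition import multiplicative_replacement, clr, ilr

    counts = np.array([[5., 0., 5.],
                       [2., 4., 4.]])
    props = counts / counts.sum(axis=1, keepdims=True)  # sample-wise norm
    props = multiplicative_replacement(props)           # no zeros allowed
    # clr keeps all D columns; ilr returns D - 1 coordinates
    return clr(props), ilr(props)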
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    # Prune the table and tree down to their shared features.
    _table, _tree = match_tips(table, tree)
    # Orthonormal basis derived from the tree's internal nodes.
    basis, _ = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    # One balance per internal node, in level order.
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances, columns=in_nodes, index=table.index)
def _regression(y, X, basis=None):
    """ Performs a simplicial ordinary least squares on a set of
    compositions and a response variable.

    Parameters
    ----------
    y : numpy.ndarray, float
        a matrix of proportions where rows correspond to samples
        and columns correspond to features.
    X : numpy.ndarray, float
        independent variable
    basis : numpy.ndarray, float, optional
        orthonormal basis used for the ilr transform. If None,
        the default basis is used.

    Returns
    -------
    predict : pd.DataFrame, float
        a predicted matrix of proportions where rows correspond to
        samples and columns correspond to features.
    b : pd.DataFrame, float
        a matrix of estimated coefficient compositions
    resid : pd.DataFrame, float
        a matrix of compositional residuals
    r2 : float
        coefficient of determination
    """
    y = np.atleast_2d(y)
    X = np.atleast_2d(X)

    # Need to add constant for intercept
    r, c = X.shape

    y_ = ilr(y, basis=basis)

    # Now perform least squares to calculate unknown coefficients
    inv = np.linalg.pinv(np.dot(X.T, X))
    cross = np.dot(inv, X.T)
    b_ = np.dot(cross, y_)
    predict_ = np.dot(X, b_)
    resid = (y_ - predict_)
    sst = (y_ - y_.mean(axis=0))
    r2 = 1 - ((resid**2).sum() / (sst**2).sum())

    if len(b_.shape) == 1:
        b_ = np.atleast_2d(b_).T
    b = ilr_inv(b_)
    if len(predict_.shape) == 1:
        predict_ = np.atleast_2d(predict_).T
    predict = ilr_inv(predict_)
    if len(resid.shape) == 1:
        resid = np.atleast_2d(resid).T
    resid = ilr_inv(resid)
    return predict, b, resid, r2
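# A minimal sketch exercising `_regression` on made-up data. The intercept
# column is included in X by hand, matching the comment in the body:

def _example_regression():
    import numpy as np

    y = np.array([[0.2, 0.3, 0.5],
                  [0.3, 0.3, 0.4],
                  [0.4, 0.3, 0.3],
                  [0.5, 0.3, 0.2]])
    g = np.array([1., 2., 3., 4.])
    X = np.hstack((np.ones((4, 1)), g.reshape(-1, 1)))  # intercept + gradient
    predict, b, resid, r2 = _regression(y, X)
    return r2  # close to 1 for this nearly log-linear gradient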
def test_ilr_basis_isomorphism(self):
    # tests to make sure that the isomorphism holds
    # with the introduction of the basis.
    basis = np.array([[0.80442968, 0.19557032]])
    table = np.array([[
        np.log(1 / 10) * np.sqrt(1 / 2),
        np.log(1.14141414 / 9.90909091) * np.sqrt(1 / 2),
        np.log(1.28282828 / 9.81818182) * np.sqrt(1 / 2),
        np.log(1.42424242 / 9.72727273) * np.sqrt(1 / 2),
        np.log(1.56565657 / 9.63636364) * np.sqrt(1 / 2)
    ]]).T
    res = ilr(ilr_inv(table, basis=basis), basis=basis)
    npt.assert_allclose(res, table.squeeze())

    table = np.array([[1., 10.],
                      [1.14141414, 9.90909091],
                      [1.28282828, 9.81818182],
                      [1.42424242, 9.72727273],
                      [1.56565657, 9.63636364]])
    res = ilr_inv(np.atleast_2d(ilr(table, basis=basis)).T, basis=basis)
    npt.assert_allclose(res, closure(table.squeeze()))
def test_ilr_inv(self):
    mat = closure(self.cdata7)
    npt.assert_array_almost_equal(ilr_inv(ilr(mat)), mat)

    npt.assert_allclose(ilr_inv(np.identity(3)), self.ortho1,
                        rtol=1e-04, atol=1e-06)

    with self.assertRaises(ValueError):
        ilr_inv(self.cdata1, basis=self.cdata1)

    # make sure that inplace modification is not occurring
    ilr_inv(self.cdata1)
    npt.assert_allclose(self.cdata1,
                        np.array([[2, 2, 6],
                                  [4, 4, 2]]))
def test_ilr_basis(self):
    table = np.array([[1., 10.],
                      [1.14141414, 9.90909091],
                      [1.28282828, 9.81818182],
                      [1.42424242, 9.72727273],
                      [1.56565657, 9.63636364]])
    basis = np.array([[0.80442968, 0.19557032]])
    res = ilr(table, basis=basis)
    exp = np.array([
        np.log(1 / 10) * np.sqrt(1 / 2),
        np.log(1.14141414 / 9.90909091) * np.sqrt(1 / 2),
        np.log(1.28282828 / 9.81818182) * np.sqrt(1 / 2),
        np.log(1.42424242 / 9.72727273) * np.sqrt(1 / 2),
        np.log(1.56565657 / 9.63636364) * np.sqrt(1 / 2)
    ])
    npt.assert_allclose(res, exp)
def train_compositional_parameters(data):
    """ Given noisy compositional data, try to learn the compositional
    noise parameters. It is assumed that the noise follows a Gaussian
    distribution in the ilr space.

    Parameters
    ----------
    data : array_like
        A matrix of counts with `n` rows and `m` columns, where `n`
        corresponds to the number of samples and `m` corresponds to the
        number of species.

    Returns
    -------
    mu : np.array
        Mean of the ilr normal in the default Gram-Schmidt space
    cov : np.array
        Covariance matrix of the ilr normal in the default Gram-Schmidt
        space
    """
    X = ilr(data)
    mu = np.mean(X, axis=0)
    cov = np.cov(X.T)
    return mu, cov
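# A small sketch fitting the noise parameters on simulated draws. The data
# here are made up (Dirichlet samples, so strictly positive as ilr requires):

def _example_train_compositional_parameters():
    import numpy as np

    rng = np.random.default_rng(0)
    data = rng.dirichlet([5., 3., 2.], size=100)  # 100 samples, 3 parts
    mu, cov = train_compositional_parameters(data)
    # mu has m - 1 = 2 entries; cov is 2 x 2
    return mu, cov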
def test_ilr(self):
    mat = closure(self.cdata7)
    npt.assert_array_almost_equal(ilr(mat),
                                  np.array([0.70710678, 0.40824829]))

    # Should give same result as inner
    npt.assert_allclose(ilr(self.ortho1), np.identity(3),
                        rtol=1e-04, atol=1e-06)

    with self.assertRaises(ValueError):
        ilr(self.cdata1, basis=self.cdata1)

    # make sure that inplace modification is not occurring
    ilr(self.cdata1)
    npt.assert_allclose(self.cdata1,
                        np.array([[2, 2, 6],
                                  [4, 4, 2]]))
def balancetest(table, grouping, tree,
                significance_test=None,
                layout=None,
                normalize=True,
                mode='c'):
    """ Performs a statistical test on ilr balances and plots them on
    the tree.

    Parameters
    ----------
    table : pd.DataFrame
        A 2D matrix of strictly positive values (i.e. counts or
        proportions) where the rows correspond to samples and the
        columns correspond to features.
    grouping : pd.Series
        Vector indicating the assignment of samples to groups. For
        example, these could be strings or integers denoting which
        group a sample belongs to. It must be the same length as the
        samples in `table`. The index must be the same on `table` and
        `grouping` but need not be in the same order.
    tree : skbio.TreeNode
        A strictly bifurcating tree defining a hierarchical
        relationship between all of the features within `table`.
    significance_test : function, optional
        A statistical significance function to test for significance
        between classes. This function must be able to accept at least
        two 1D array_like arguments of floats and return a test
        statistic and a p-value, or a single statistic. By default
        ``scipy.stats.f_oneway`` is used.
    layout : function, optional
        A layout for formatting the tree visualization. Must take an
        `ete.tree` as a parameter.
    mode : str
        Type of display to show the tree. ('c': circular,
        'r': rectangular).

    Returns
    -------
    ete_tree : ete.Tree
        ETE tree converted from the `skbio.TreeNode` object
    ts : ete.TreeStyle
        ETE tree style used for formatting the visualized tree, with
        the test statistic plotted on each of the internal nodes.

    Notes
    -----
    The `skbio.TreeNode` is assumed to be strictly bifurcating and its
    tips are assumed to match `table`. Also, it is assumed that none of
    the values in `table` are zero. Replace them with a pseudocount if
    necessary.

    See Also
    --------
    skbio.TreeNode.bifurcate
    skbio.stats.composition.ilr
    skbio.stats.multiplicative_replacement
    scipy.stats.f_oneway
    """
    if np.any(table <= 0):
        raise ValueError('Cannot handle zeros or negative values in '
                         '`table`. Use pseudo counts or '
                         '``multiplicative_replacement``.')

    if significance_test is None:
        significance_test = scipy.stats.f_oneway

    sorted_features = [n.name for n in tree.tips()][::-1]
    if len(sorted_features) != len(table.columns):
        raise ValueError('The number of tips (%d) in the tree must be '
                         'equal to the number of features in the table '
                         '(%d).' % (len(sorted_features),
                                    len(table.columns)))
    table = table.reindex(columns=sorted_features)
    mat, cats = check_table_grouping(table, grouping)

    basis, nodes = phylogenetic_basis(tree)
    ilr_coords = ilr(mat, basis=basis)

    ete_tree = Tree(str(tree))

    _cats = set(cats)
    i = 0
    for n in ete_tree.traverse():
        if not n.is_leaf():
            diffs = [ilr_coords[(cats == x).values, i] for x in _cats]

            stat = significance_test(*diffs)
            if len(stat) == 2:
                n.add_features(weight=-np.log(stat[1]))
            elif len(stat) == 1:
                n.add_features(weight=stat)
            else:
                raise ValueError(
                    "Too many arguments returned by %s" %
                    significance_test.__name__)
            i += 1

    # Create an empty TreeStyle
    ts = TreeStyle()

    # Set our custom layout function
    if layout is None:
        ts.layout_fn = default_layout
    else:
        ts.layout_fn = layout

    # Draw a tree
    ts.mode = mode

    # We will add node names manually
    ts.show_leaf_name = False

    # Show branch data
    ts.show_branch_length = True
    ts.show_branch_support = True

    return ete_tree, ts
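# A usage sketch with made-up data. It assumes the module-level helpers used
# in the body (`check_table_grouping`, `phylogenetic_basis`) are in scope,
# and that ete3 is installed with its Qt drawing dependencies:

def _example_balancetest():
    import numpy as np
    import pandas as pd
    import skbio

    tree = skbio.TreeNode.read(["((a,b)n1,c)n0;"])
    table = pd.DataFrame(np.array([[2., 1., 5.],
                                   [3., 1., 4.],
                                   [1., 4., 4.],
                                   [1., 5., 3.]]),
                         columns=['a', 'b', 'c'],
                         index=['s1', 's2', 's3', 's4'])
    grouping = pd.Series(['g1', 'g1', 'g2', 'g2'], index=table.index)
    ete_tree, ts = balancetest(table, grouping, tree)
    # ete_tree.show(tree_style=ts)  # opens an interactive ete3 window
    return ete_tree, ts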
def _ilr_without_tree(X):
    # ilr with the default basis; the resulting columns are unnamed ilr
    # coordinates rather than tree balances.
    return pd.DataFrame(ilr(X), index=X.index)
def run_preprocess(RNA_count_file, ATAC_count_file, ATAC_barcode_file,
                   ATAC_peak_file, GTF_file, max_threads=2,
                   velocity_file=None, outname=None, peak_per=None,
                   cell_per=None, normalization='default', skip=False):
    # Find common barcodes and write to a list
    skip_atac = skip
    assert normalization in ['default', 'ilr', 'none']
    s = open(RNA_count_file).readline().replace(' ', '\t')
    RNA_barcodes = np.loadtxt(StringIO(s), dtype=str)[1:]
    ATAC_barcodes = np.loadtxt(ATAC_barcode_file, dtype=str)
    ATAC_peaks = np.loadtxt(ATAC_peak_file, dtype=str)

    spinner = Halo(text='Organizing Cell Barcodes', spinner='dots',
                   color='white', placement='right')
    spinner.start()

    # Barcode correction: reconcile the ATAC barcode format with the
    # RNA barcodes before intersecting.
    for j in range(0, np.size(ATAC_barcodes)):
        ATAC_barcodes[j] = ATAC_barcodes[j].replace('.R', ',R').replace('.P', ',P')
    # OR
    # for j in range(0, np.size(ATAC_barcodes)):
    #     ATAC_barcodes[j] = ATAC_barcodes[j][:-5]
    # for j in range(0, np.size(RNA_barcodes)):
    #     RNA_barcodes[j] = RNA_barcodes[j][:-5]

    intersecting, rna_idx, atac_idx = np.intersect1d(RNA_barcodes,
                                                     ATAC_barcodes,
                                                     return_indices=True)
    common_idxs = np.hstack((rna_idx.reshape(-1, 1),
                             atac_idx.reshape(-1, 1)))
    spinner.stop()
    print("Found Intersection of Barcodes")

    if skip_atac:
        # Now let's get the ATAC-seq counts
        n_threads = max_threads
        spinner = Halo(text='Loading ATAC-seq counts into memory',
                       spinner='dots', color='white', placement='right')
        spinner.start()
        txt_array = np.asarray(np.loadtxt(ATAC_count_file, skiprows=1,
                                          dtype=str)[1:, :], dtype=int)
        spinner.stop()
        number_peaks = np.max(txt_array[:, 0])
        number_cells = np.max(txt_array[:, 1])

        # Split file across cores and write to a sparse matrix
        spinner = Halo(text='Normalizing ATAC-seq', spinner='dots',
                       color='white', placement='right')
        spinner.start()
        split_list = np.array_split(txt_array, n_threads)
        del txt_array
        gc.collect()
        ATAC_counts = scipy.sparse.lil_matrix((number_cells, number_peaks))

        # Looks like two cores is the fastest on my computer
        if __name__ != "__main__":
            with Manager() as manager:
                chunks = manager.list()
                processes = []
                for k in range(0, n_threads):
                    p = Process(target=make_sparse_chunk,
                                args=(number_cells, number_peaks,
                                      split_list[k], chunks))
                    p.start()
                    processes.append(p)
                for p in processes:
                    p.join()
                for chunk in chunks:
                    nonzero = chunk.nonzero()
                    ATAC_counts[nonzero] = chunk[nonzero]
                    del chunk
                    gc.collect()

        # Filter on depths and peak coverage if applicable
        # CELLS x PEAKS
        non_zero = ATAC_counts.nonzero()
        depths = np.sum(ATAC_counts, axis=1)
        if cell_per is not None:
            bad_depths = np.argwhere(
                depths <= np.percentile(depths, cell_per))
        else:
            bad_depths = np.argwhere(depths == 0)
        bad_idx_in_common = np.intersect1d(bad_depths, common_idxs)
        common_idxs = np.delete(common_idxs, (bad_idx_in_common), axis=0)

        if peak_per is not None:
            peak_coverage = np.count_nonzero(ATAC_counts, axis=0)
            good_peaks = np.argwhere(
                peak_coverage >= np.percentile(peak_coverage, peak_per))
            bad_peaks = np.argwhere(
                peak_coverage < np.percentile(peak_coverage, peak_per))
            rem = np.sum(ATAC_counts[:, bad_peaks], axis=1)

            # COUNTS, BARCODES, PEAKS, DEPTHS
            if normalization == 'default':
                pickle.dump([
                    normalize(scipy.sparse.hstack(
                        [ATAC_counts[common_idxs[:, 1], good_peaks], rem]),
                        norm='l1', axis=1),
                    ATAC_barcodes[common_idxs[:, 1]],
                    np.concatenate((ATAC_peaks[good_peaks],
                                    np.asarray(['rem']))),
                    depths[common_idxs[:, 1]]
                ], open("ATAC_data.p", "wb"))
                print("Wrote Preprocessed ATAC-seq Data")
            if normalization == 'ilr':
                # ilr requires a dense, strictly positive array, hence
                # the densification and the +1 pseudocount
                pickle.dump([
                    ilr(scipy.sparse.hstack(
                        [ATAC_counts[common_idxs[:, 1], good_peaks],
                         rem]).toarray() + 1),
                    ATAC_barcodes[common_idxs[:, 1]],
                    np.concatenate((ATAC_peaks[good_peaks],
                                    np.asarray(['rem']))),
                    depths[common_idxs[:, 1]]
                ], open("ATAC_data.p", "wb"))
                print("Wrote Preprocessed ATAC-seq Data")
            if normalization == 'none':
                pickle.dump([
                    scipy.sparse.hstack(
                        [ATAC_counts[common_idxs[:, 1], good_peaks], rem]),
                    ATAC_barcodes[common_idxs[:, 1]],
                    np.concatenate((ATAC_peaks[good_peaks],
                                    np.asarray(['rem']))),
                    depths[common_idxs[:, 1]]
                ], open("ATAC_data.p", "wb"))
                print("Wrote Preprocessed ATAC-seq Data")
            del ATAC_counts
            gc.collect()
        else:
            # COUNTS, BARCODES, PEAKS, DEPTHS
            if normalization == 'default':
                pickle.dump([
                    normalize(ATAC_counts[common_idxs[:, 1], :],
                              norm='l1', axis=1),
                    ATAC_barcodes[common_idxs[:, 1]],
                    ATAC_peaks,
                    depths[common_idxs[:, 1]]
                ], open("ATAC_data.p", "wb"))
                print("Wrote Preprocessed ATAC-seq Data")
            if normalization == 'ilr':
                pickle.dump([
                    ilr(ATAC_counts[common_idxs[:, 1], :].toarray() + 1),
                    ATAC_barcodes[common_idxs[:, 1]],
                    ATAC_peaks,
                    depths[common_idxs[:, 1]]
                ], open("ATAC_data.p", "wb"))
                print("Wrote Preprocessed ATAC-seq Data")
            if normalization == 'none':
                pickle.dump([
                    ATAC_counts[common_idxs[:, 1], :],
                    ATAC_barcodes[common_idxs[:, 1]],
                    ATAC_peaks,
                    depths[common_idxs[:, 1]]
                ], open("ATAC_data.p", "wb"))
                print("Wrote Preprocessed ATAC-seq Data")
            del ATAC_counts
            gc.collect()
        spinner.stop()

    spinner = Halo(text='Normalizing RNA-seq', spinner='dots',
                   color='white', placement='right')
    spinner.start()

    # Load in the RNA-seq
    RNA_counts = scipy.sparse.lil_matrix(
        np.asarray(np.loadtxt(RNA_count_file, skiprows=1,
                              dtype=str)[:, 1:], dtype=int).transpose())
    gene_names = np.loadtxt(RNA_count_file, usecols=0, skiprows=1,
                            dtype=str)
    GTF_info = np.loadtxt(GTF_file, dtype=str, skiprows=5, delimiter='\t')
    rna_depths = np.sum(RNA_counts, axis=1)

    # COUNTS, BARCODES, GENE NAMES, DEPTHS, GTF INFO
    if normalization == 'default':
        pickle.dump([
            normalize(RNA_counts[common_idxs[:, 0], :], norm='l1', axis=1),
            RNA_barcodes[common_idxs[:, 0]],
            gene_names,
            rna_depths[common_idxs[:, 0]],
            GTF_info
        ], open("RNA_data.p", "wb"))
    if normalization == 'ilr':
        pickle.dump([
            ilr(RNA_counts[common_idxs[:, 0], :].toarray() + 1),
            RNA_barcodes[common_idxs[:, 0]],
            gene_names,
            rna_depths[common_idxs[:, 0]],
            GTF_info
        ], open("RNA_data.p", "wb"))
    if normalization == 'none':
        pickle.dump([
            RNA_counts[common_idxs[:, 0], :],
            RNA_barcodes[common_idxs[:, 0]],
            gene_names,
            rna_depths[common_idxs[:, 0]],
            GTF_info
        ], open("RNA_data.p", "wb"))
    del RNA_counts
    gc.collect()
    spinner.stop()
    print("Wrote Preprocessed RNA-seq Data")
    print("Preprocessing Completed")
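# The '+ 1' pseudocount in the ilr branches above exists because the ilr
# transform is undefined at zero. A minimal standalone sketch of that step,
# with made-up counts:

def _example_ilr_pseudocount():
    import numpy as np
    from skbio.stats.composition import ilr

    counts = np.array([[0, 3, 7],
                       [5, 0, 5]])
    return ilr(counts + 1)  # dense and strictly positive after pseudocount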
labels = [re.sub(regex, "", e) for e in labs]
# Remove first element "x"
labels.pop(0)

# Ensure that this is not the rarefied ASV table
sample_counts = unscaled_tab.sum(axis=1)

# Perform total sum scaling normalization (TSS)
scaled = unscaled_tab.div(unscaled_tab.sum(axis=1), axis=0)
# scaled.sum(axis=1)  # check: each row should now sum to 1

# Substitute zeros with small pseudocounts, since log-ratio transforms
# are undefined at zero
zeros_scaled = comp.multiplicative_replacement(scaled)  # numpy.ndarray

# Isometric log-ratio (ilr) transform, since compositional data must be
# mapped out of the simplex before standard ML methods apply
ilr_transformed = comp.ilr(zeros_scaled)

# Convert the ndarray back to a dataframe because the skbio functions
# drop the pandas labels. Note that ilr returns D - 1 coordinates, so
# the original D feature names no longer apply as column labels.
df_ilr_transformed = pd.DataFrame(ilr_transformed, index=scaled.index)

########################################################################
# Decision tree methods tended to perform well.
# The HFE OTU feature-reduction method brought a substantial performance
# improvement for nearly all methods.
# After feature reduction, most methods performed similarly, so feature
# reduction is the step worth prioritizing.
########################################################################

# Split data into test and training sets.
# Do this before feature selection so that features are selected from
# the training set, not from the whole dataset.
train, test, y_train, y_test = mod.train_test_split(df_ilr_transformed,
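# Shape sanity check for the pipeline above (a hypothetical 2 x 4 table),
# illustrating why the D original column names cannot label the ilr output:

def _example_ilr_shapes():
    import pandas as pd
    from skbio.stats import composition as comp

    tab = pd.DataFrame([[10, 0, 5, 5],
                        [4, 4, 4, 8]],
                       index=['s1', 's2'])
    scaled = tab.div(tab.sum(axis=1), axis=0)           # TSS
    filled = comp.multiplicative_replacement(scaled)    # remove zeros
    coords = comp.ilr(filled)
    assert coords.shape == (2, 3)  # D features -> D - 1 ilr coordinates
    return coords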
def band_table(num_samples, num_features, tree=None, low=2, high=10,
               sigma=2, alpha=6, seed=0):
    """ Generates a simulated table of counts.

    Each organism is modeled as a Gaussian distribution. Then counts
    are simulated using a Poisson distribution.

    Parameters
    ----------
    num_samples : int
        Number of samples to simulate
    num_features : int
        Number of features to simulate
    tree : skbio.TreeNode
        Tree used as a scaffold for the ilr transform.
        If None, then the gram_schmidt_basis will be used.
    low : float
        Smallest gradient value.
    high : float
        Largest gradient value.
    sigma : float
        Variance of each species distribution
    alpha : int
        Global count bias. This bias is added to every cell in the
        matrix.
    seed : int or np.random.RandomState
        Random seed

    Returns
    -------
    biom.Table
        Biom representation of the count table.
    pd.DataFrame
        DataFrame containing relevant metadata.
    beta : np.array
        Regression parameter estimates.
    theta : np.array
        Bias per sample.
    gamma : np.array
        Intercept estimates.
    """
    state = np.random.RandomState(seed)

    # measured gradient values for each sample
    gradient = np.linspace(low, high, num_samples)
    # optima for features (i.e. optimal ph for species)
    mu = np.linspace(low, high, num_features)
    sigma = np.array([sigma] * num_features)
    # construct species distributions
    table = chain_interactions(gradient, mu, sigma)
    samp_ids = ['S%d' % i for i in range(num_samples)]

    # obtain basis required to convert from balances to proportions.
    if tree is None:
        basis = _gram_schmidt_basis(num_features)
        feat_ids = ['F%d' % i for i in range(num_features)]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
    else:
        feat_ids = [n.name for n in tree.tips()]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
        basis = sparse_balance_basis(tree)[0].todense()

    # construct balances from gaussian distribution.
    # this will be necessary when refitting parameters later.
    Y = ilr(table, basis=clr_inv(basis))
    X = gradient.reshape(-1, 1)
    X = np.hstack((np.ones(len(X)).reshape(-1, 1), X.reshape(-1, 1)))
    pY, resid, B = ols(Y, X)
    gamma = B[0]
    beta = B[1].reshape(1, -1)
    # parameter estimates
    r = beta.shape[1]

    # Normal distribution to simulate linear regression
    M = np.eye(r)
    # Generate covariance matrix from inverse wishart
    Sigma = invwishart.rvs(df=r + 2, scale=M.dot(M.T), random_state=state)
    w, v = eigsh(Sigma, k=2)
    # Low rank covariance matrix
    sim_L = (v @ np.diag(w)).T

    # sample
    y = X.dot(B)
    Ys = np.vstack([state.multivariate_normal(y[i, :], Sigma)
                    for i in range(y.shape[0])])
    Yp = Ys @ basis

    # calculate bias terms
    theta = -np.log(np.exp(Yp).sum(axis=1)) + alpha

    # multinomial sample the entries
    # table = np.vstack([multinomial(nd, Yp[i, :])
    #                    for i in range(y.shape[0])])

    # poisson sample the entries
    table = np.vstack([state.poisson(np.exp(Yp[i, :] + theta[i]))
                       for i in range(y.shape[0])]).T
    table = Table(table, feat_ids, samp_ids)
    metadata = pd.DataFrame({'G': gradient}, index=samp_ids)
    return table, metadata, beta, theta, gamma
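# A quick usage sketch with the default Gram-Schmidt scaffold (no tree).
# It assumes the module-level helpers that `band_table` relies on
# (`chain_interactions`, `ols`, `_gram_schmidt_basis`, biom's `Table`) are
# importable:

def _example_band_table():
    table, metadata, beta, theta, gamma = band_table(num_samples=20,
                                                     num_features=10)
    # table:    biom.Table with 10 features x 20 samples
    # metadata: 20-row DataFrame with the simulated gradient 'G'
    return table, metadata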
def train(iterations, sample_size, reduce, positive_train, negative_train,
          test_data, test_label):
    f1_original_clr = []
    f1_original = []
    f1_dca = []
    f1_clr = []
    f1_ilr = []

    roc_original_clr = []
    roc_original = []
    roc_dca = []
    roc_clr = []
    roc_ilr = []

    for _ in range(iterations):
        # Select a random subset of the training data
        train_sample_data, train_sample_label = split_train_test(
            positive_train, negative_train, sample_size)

        # Baseline: SVM on the untransformed data
        f1_original_data, roc_original_data = train_svm(
            train_sample_data, train_sample_label, test_data, test_label)
        f1_original.append(f1_original_data)
        roc_original.append(roc_original_data)

        # Replace zeros with tiny values so log-ratio transforms are defined
        train_sample_data[train_sample_data == 0] = 0.1e-32
        test_data[test_data == 0] = 0.1e-32

        clr_original_train = clr(train_sample_data)
        clr_original_test = clr(test_data)
        scaler = StandardScaler()
        clr_original_train = np.nan_to_num(
            scaler.fit_transform(clr_original_train))
        clr_original_test = np.nan_to_num(
            scaler.fit_transform(clr_original_test))
        f1_original_data_clr, roc_original_data_clr = train_svm(
            clr_original_train, train_sample_label,
            clr_original_test, test_label)
        f1_original_clr.append(f1_original_data_clr)
        roc_original_clr.append(roc_original_data_clr)

        # DCA: keep the best-scoring projection found by the GA
        matrices = genetic_algorithm(train_sample_data, reduce)
        roc_dca_iterations = []
        for br_matrix in matrices:
            reduced_data = np.matmul(
                br_matrix, train_sample_data.transpose()).transpose()
            reduced_test = np.matmul(
                br_matrix, test_data.transpose()).transpose()
            f1_dca_data, roc_dca_data = train_svm(
                reduced_data, train_sample_label, reduced_test, test_label)
            roc_dca_iterations.append(roc_dca_data)
        roc_dca.append(max(roc_dca_iterations))

        # CLR and ILR transformations followed by PCA
        clr_data_train = clr(train_sample_data)
        clr_test = clr(test_data)
        ilr_data_train = ilr(train_sample_data)
        ilr_test = ilr(test_data)
        np.savetxt("ilr_data.csv", ilr_data_train, delimiter=",")

        # PCA to reduce dimensions
        pca_clr = PCA(n_components=reduce)
        pca_ilr = PCA(n_components=reduce)
        fit_train_clr = np.ascontiguousarray(
            pca_clr.fit_transform(clr_data_train))
        fit_test_clr = np.ascontiguousarray(pca_clr.transform(clr_test))
        fit_train_ilr = np.ascontiguousarray(
            pca_ilr.fit_transform(ilr_data_train))
        fit_test_ilr = np.ascontiguousarray(pca_ilr.transform(ilr_test))
        np.savetxt("ilr_data_pca.csv", fit_train_ilr, delimiter=",")

        pca_clr_reduced_train = np.nan_to_num(fit_train_clr)
        pca_ilr_reduced_train = np.nan_to_num(fit_train_ilr)
        fit_test_clr = np.nan_to_num(fit_test_clr)
        fit_test_ilr = np.nan_to_num(fit_test_ilr)

        f1_pca_clr_data, roc_pca_clr_data = train_svm(
            pca_clr_reduced_train, train_sample_label,
            fit_test_clr, test_label)
        f1_pca_ilr_data, roc_pca_ilr_data = train_svm(
            pca_ilr_reduced_train, train_sample_label,
            fit_test_ilr, test_label)

        f1_clr.append(f1_pca_clr_data)
        roc_clr.append(roc_pca_clr_data)
        f1_ilr.append(f1_pca_ilr_data)
        roc_ilr.append(roc_pca_ilr_data)

    return (sum(roc_original) / iterations,
            sum(roc_original_clr) / iterations,
            sum(roc_dca) / iterations,
            sum(roc_clr) / iterations,
            sum(roc_ilr) / iterations)