def GSLR(pathway_id_and_filepath_and_nodes_and_edges_and_costs):
    """Run 10-fold CV graph-sparse logistic regression for one pathway.

    The single tuple argument (so the function is ``Pool.map``-friendly)
    unpacks to ``(pathway_id, filepath, nodes, edges, costs)``:

    - pathway_id: identifier returned unchanged to label the result
    - filepath:   CSV written via ``dataset.to_csv(..., index=True,
                  header=True)``; the index holds the class labels
    - nodes:      ordered node/gene names; columns are aligned to this order
    - edges:      graph edges passed to ``gslr.GraphOptions``
    - costs:      per-edge costs passed to ``gslr.gslr``

    Returns ``(pathway_id, accuracies, features)`` where ``accuracies`` has
    one test-fold accuracy per CV fold and ``features`` lists the columns
    whose class-0 weight was nonzero in at least one fold.
    """
    pathway_id, filepath, nodes, edges, costs = \
        pathway_id_and_filepath_and_nodes_and_edges_and_costs
    # The CSV index column carries the sample class labels; encode to 0/1.
    dataset = pd.read_csv(filepath, index_col=0)
    y = LabelEncoder().fit_transform(dataset.index.tolist())
    # Align columns to the pathway's node order. reindex(columns=...) is the
    # direct form of the original transpose/reindex(index=...)/transpose
    # round trip and behaves identically (missing nodes become NaN columns).
    dataset = dataset.reindex(columns=nodes)
    X = dataset.values
    d = len(nodes)
    c = 2  # binary classification -> two weight rows in W
    graph_opts = gslr.GraphOptions(edges=edges, root=-1, num_clusters=1,
                                   pruning='strong')
    sparsity_low = 50
    sparsity_high = 100
    verbosity_level = 0
    num_steps = 35
    possible_steps = np.array([0.03, 0.1, 0.3])
    steps = np.tile(possible_steps, (num_steps, 1))
    # Distinct names for the per-fold weight rows vs. the final column list
    # (the original reused one `features` name for both, plus a DataFrame).
    fold_weights = []
    accuracies = []
    for train, test in StratifiedKFold(n_splits=10).split(X, y):
        W0 = np.zeros((c, d))
        W_hat, losses = gslr.gslr(X[train], y[train], W0, sparsity_low,
                                  sparsity_high, graph_opts, steps,
                                  verbosity_level, edge_costs=costs,
                                  edge_costs_multiplier=2)
        yhat = gslr.predict(X[test], W_hat)
        num_cor = gslr.num_correct(y[test], yhat)
        accuracies.append(num_cor / float(len(test)))
        # Keep only the first weight row (class 0), as in the original.
        fold_weights.append(W_hat[0])
    weights = pd.DataFrame(fold_weights, columns=dataset.columns)
    # Columns with any nonzero weight across the 10 folds.
    features = weights.columns[(weights != 0).any()].tolist()
    return pathway_id, accuracies, features
def GSLR(pathway_id_and_filepath_and_nodes_and_edges_and_costs_and_low_and_high):
    """Evaluate graph-sparse logistic regression on one pathway at one
    sparsity setting, with 10-fold stratified cross-validation.

    The single tuple argument (kept packed for ``Pool.map``) unpacks to
    ``(pathway_id, filepath, nodes, edges, costs, sparsity_low,
    sparsity_high)``.  The CSV at ``filepath`` was written with
    ``dataset.to_csv(..., index=True, header=True)``, so its index holds
    the class labels.

    Returns ``(pathway_id, (sparsity_low, sparsity_high), accuracies,
    featuresets)`` — one accuracy and one selected-feature list per fold.
    """
    (pathway_id, filepath, nodes, edges, costs,
     sparsity_low, sparsity_high) = \
        pathway_id_and_filepath_and_nodes_and_edges_and_costs_and_low_and_high
    print()
    print('-----------------')
    print(pathway_id)
    print(str(sparsity_low) + '-' + str(sparsity_high))
    print()
    expr = pd.read_csv(filepath, index_col=0)
    y = LabelEncoder().fit_transform(expr.index.tolist())
    # Align columns to the pathway node order (equivalent to the
    # transpose/reindex/transpose round trip).
    expr = expr.reindex(columns=nodes)
    X = expr.values
    n_nodes = len(nodes)
    n_classes = 2
    graph_opts = gslr.GraphOptions(edges=edges, root=-1, num_clusters=1,
                                   pruning='strong')
    verbosity_level = 1
    # 35 optimization steps, each allowed the step sizes 0.03 / 0.1 / 0.3.
    steps = np.tile(np.array([0.03, 0.1, 0.3]), (35, 1))
    featuresets = []
    accuracies = []
    for train, test in StratifiedKFold(n_splits=10).split(X, y):
        print()
        print('fold')
        print()
        W_init = np.zeros((n_classes, n_nodes))
        W_hat, _losses = gslr.gslr(X[train], y[train], W_init, sparsity_low,
                                   sparsity_high, graph_opts, steps,
                                   verbosity_level, edge_costs=costs,
                                   edge_costs_multiplier=2)
        predictions = gslr.predict(X[test], W_hat)
        accuracies.append(gslr.num_correct(y[test], predictions) / float(len(test)))
        # Columns with a nonzero weight in any class row for this fold.
        weights = pd.DataFrame(W_hat, columns=expr.columns)
        featuresets.append(weights.columns[(weights != 0).any()].tolist())
    return pathway_id, (sparsity_low, sparsity_high), accuracies, featuresets
def GSLR(filepath_and_pathway_ids_and_nodes_and_edges):
    """Run 10-fold CV graph-sparse logistic regression for a pathway pair.

    The single tuple argument (``Pool.map``-friendly) unpacks to
    ``(filepath, pathway_id_1, pathway_id_2, nodes, edges)``:

    - filepath:     CSV written via ``dataset.to_csv(..., index=True,
                    header=True)``; the index holds the class labels
    - pathway_id_1, pathway_id_2: identifiers returned unchanged
    - nodes:        ordered node/gene names; columns are aligned to this order
    - edges:        graph edges passed to ``gslr.GraphOptions``

    Unlike the edge-cost variant, no ``edge_costs`` are supplied here.

    Returns ``(pathway_id_1, pathway_id_2, accuracies, features)`` where
    ``accuracies`` has one test-fold accuracy per CV fold and ``features``
    lists the columns whose class-0 weight was nonzero in at least one fold.
    """
    filepath, pathway_id_1, pathway_id_2, nodes, edges = \
        filepath_and_pathway_ids_and_nodes_and_edges
    # The CSV index column carries the sample class labels; encode to 0/1.
    dataset = pd.read_csv(filepath, index_col=0)
    y = LabelEncoder().fit_transform(dataset.index.tolist())
    # Align columns to the node order. reindex(columns=...) replaces the
    # original transpose/reindex(index=...)/transpose round trip and
    # behaves identically (missing nodes become NaN columns).
    dataset = dataset.reindex(columns=nodes)
    X = dataset.values
    d = len(nodes)
    c = 2  # binary classification -> two weight rows in W
    graph_opts = gslr.GraphOptions(edges=edges, root=-1, num_clusters=1,
                                   pruning='strong')
    sparsity_low = 30
    sparsity_high = 70
    verbosity_level = 0
    num_steps = 50
    possible_steps = np.array([0.1, 0.2])
    steps = np.tile(possible_steps, (num_steps, 1))
    # Distinct names for per-fold weight rows vs. the final column list
    # (the original reused one `features` name for both, plus a DataFrame).
    fold_weights = []
    accuracies = []
    for train, test in StratifiedKFold(n_splits=10).split(X, y):
        W0 = np.zeros((c, d))
        W_hat, losses = gslr.gslr(X[train], y[train], W0, sparsity_low,
                                  sparsity_high, graph_opts, steps,
                                  verbosity_level)
        yhat = gslr.predict(X[test], W_hat)
        num_cor = gslr.num_correct(y[test], yhat)
        accuracies.append(num_cor / float(len(test)))
        # Keep only the first weight row (class 0), as in the original.
        fold_weights.append(W_hat[0])
    weights = pd.DataFrame(fold_weights, columns=dataset.columns)
    # Columns with any nonzero weight across the 10 folds.
    features = weights.columns[(weights != 0).any()].tolist()
    return pathway_id_1, pathway_id_2, accuracies, features
c = 2 graph_opts = gslr.GraphOptions(edges=edges, root=-1, num_clusters=1, pruning='strong') sparsity_low = 150 sparsity_high = 400 verbosity_level = 1 num_steps = 100 possible_steps = np.array([0.03, 0.1, 0.3]) steps = np.tile(possible_steps, (num_steps, 1)) W0 = np.zeros((c, d)) W_hat, losses = gslr.gslr(X, y, W0, sparsity_low, sparsity_high, graph_opts, steps, verbosity_level, edge_costs=inbiomap_experimentally.cost.values, edge_costs_multiplier=6) yhat = gslr.predict(X, W_hat) num_cor = gslr.num_correct(y, yhat) return num_cor, W_hat, losses if __name__ == "__main__": ### I. Load Ovarian Cancer Proteomics Dataset # medullo = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/medullo_inbiomap_exp.tsv', index_col=0) dataset = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/ovarian_inbiomap_exp.tsv', index_col=0) # brca = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/brca_inbiomap_exp.tsv', index_col=0)