def _nodes_features(self) -> None:
    """
    Create self.num_feat, self.nonzero_feat, self.feat.

    Notes
    -----
    One-hot encoding as genes features.
    Binary vectors with presence of different side effects as drugs
    features.

    self.num_feat : Dict[int, int]
        Number of elements in feature vector for 0: -genes, for 1: -drugs.
    self.nonzero_feat : Dict[int, int]
        Number of all features for 0: -gene and 1: -drug nodes.
        e.g., it is in format 0: num of genes in graph, 1: num of drugs.
    self.feat : Dict[int, sp.csr_matrix]
        From edge type (0 = gene, 1 = drug) to feature matrix.
        Row in feature matrix = embedding of one node.
    """
    # FIX: annotation changed from NoReturn to None — the method returns
    # normally; typing.NoReturn is reserved for functions that never return.

    # One-hot for genes: identity matrix, one row per gene.
    n_genes = self.gene_net.number_of_nodes()
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    # Sparse matrix with rows = drug feature vectors.
    # Drug feature -- binary vector with length = num of mono se.
    # feature[i] = 1 <=> drug has ith mono se
    drug_feat = create_adj_matrix(
        a_item2b_item=self.stitch2se,
        ordered_list_a_item=self.ordered_list_of_drugs,
        ordered_list_b_item=self.ordered_list_of_se_mono)

    # Check if some drug has zero embedding (i.e. it has no frequent se).
    # Warn loudly instead of asserting so the run can proceed.
    drugs_zero_features = np.array(
        self.ordered_list_of_drugs)[drug_feat.getnnz(axis=1) == 0]
    print(f'Length of drugs features vectors: {drug_feat.shape[1]}')
    print(f'Number of unique vectors: '
          f'{np.unique(drug_feat.toarray(), axis=0).shape[0]}')
    if len(drugs_zero_features) > 0:
        # BUG FIX: the previous messages said "genes" and "Where are",
        # but this check is about drugs with empty feature vectors.
        print('Warning! All drugs should have nonzero embeddings!')
        print(f'There are {len(drugs_zero_features)} zero embeddings')
        print(f'Bad drugs: {drugs_zero_features}')

    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    self.num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    self.nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    self.feat = {
        0: gene_feat,
        1: drug_feat,
    }
def network_edge_threshold(network_adj, threshold):
    """Keep only the edges whose weight is strictly above ``threshold``.

    Parameters
    ----------
    network_adj : scipy sparse matrix
        Weighted adjacency matrix.
    threshold : float
        Edges with weight <= threshold are discarded.

    Returns
    -------
    sp.csr_matrix
        Adjacency matrix of the same shape containing only the
        above-threshold edges.
    """
    coords, values, shape = preprocessing.sparse_to_tuple(network_adj)
    # Indices of the entries that survive the cut.
    keep = np.where(values > threshold)[0]
    rows = coords[keep, 0]
    cols = coords[keep, 1]
    return sp.csr_matrix((values[keep], (rows, cols)), shape=shape)
def _nodes_features(self) -> None:
    """
    Create self.num_feat, self.nonzero_feat, self.feat.

    Notes
    -----
    One-hot encoding as genes and drugs features (separately one-hot for
    different nodes types).

    self.num_feat : Dict[int, int]
        Number of elements in feature vector for 0: -genes, for 1: -drugs.
    self.nonzero_feat : Dict[int, int]
        Number of all features for 0: -gene and 1: -drug nodes.
    self.feat : Dict[int, sp.csr_matrix]
        From edge type (0 = gene, 1 = drug) to feature matrix.
        Row in feature matrix = embedding of one node.
    """
    # FIX: annotation changed from NoReturn to None — the method returns
    # normally; typing.NoReturn is reserved for functions that never return.

    # featureless (genes): identity matrix == one-hot row per gene
    gene_feat = sp.identity(self.n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    # features (drugs): identity matrix == one-hot row per drug
    drug_feat = sp.identity(self.n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation keyed by node type (0 = gene, 1 = drug)
    self.num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    self.nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    self.feat = {
        0: gene_feat,
        1: drug_feat,
    }
# data representation adj_mats_orig = { (0,0): [gene_adj], (0,1): [gene_drug_adj], (1,0): [drug_gene_adj], (1,1): drug_drug_adj_list, } degrees = { 0: [gene_degrees], 1: drug_degrees_list, } # featureless (genes) gene_feat = sp.identity(n_genes) gene_nonzero_feat, gene_num_feat = gene_feat.shape gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo()) # features (drugs) drug_feat = sp.identity(n_drugs) drug_nonzero_feat, drug_num_feat = drug_feat.shape drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo()) # data representation num_feat = { 0: gene_num_feat, 1: drug_num_feat, } nonzero_feat = { 0: gene_nonzero_feat, 1: drug_nonzero_feat, }
def main_execution():
    """Build the drug-drug graph from the DrugBank combo file, assemble the
    Decagon data structures, then train and evaluate the model.

    Relies on module-level helpers (load_drug_bank_combo_side_effect_file,
    construct_placeholders, EdgeMinibatchIterator, DecagonModel,
    DecagonOptimizer, get_accuracy_scores) and on FLAGS / val_test_size /
    PRINT_PROGRESS_EVERY defined elsewhere in the module.
    """
    combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
        fichier='polypharmacy/drugbank/drugbank-combo.csv')
    # All distinct drug ids appearing in any combination.
    nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(nodes)
    # One relation type per distinct side effect.
    relation_types = set([r for r in combo_to_side_effects.values()])
    n_drugdrug_rel_types = len(relation_types)
    # Map each drug id to its row/column index in the adjacency matrices.
    drugs_to_positions_in_matrices_dict = {
        node: i
        for i, node in enumerate(nodes)
    }
    drug_drug_adj_list = []  # adjacency matrix for each drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if combo_to_side_effects[temp_cle] == el:
                    # each time there is an actual side effect between the
                    # two drugs, mark it symmetrically in the matrix
                    mat[drugs_to_positions_in_matrices_dict[d1],
                        drugs_to_positions_in_matrices_dict[d2]] = \
                        mat[drugs_to_positions_in_matrices_dict[d2],
                            drugs_to_positions_in_matrices_dict[d1]] = 1.
                    # record an interaction
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]
    # Single node type (0 = drug); each relation also gets its transpose.
    adj_mats_orig = {
        (0, 0):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: drug_degrees_list + drug_degrees_list,
    }
    # features (drugs): identity matrix == one-hot feature per drug
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
    # data representation
    num_feat = {
        0: drug_num_feat,
    }
    nonzero_feat = {
        0: drug_nonzero_feat,
    }
    feat = {
        0: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'dedicom',
    }
    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)
    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)
    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################
    print("Create minibatch iterator")
    minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                      feat=feat,
                                      edge_types=edge_types,
                                      batch_size=FLAGS.batch_size,
                                      val_test_size=val_test_size)
    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )
    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)
    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}
    ###########################################################
    #
    # Train model
    #
    ###########################################################
    print("Train model")
    for epoch in range(FLAGS.epochs):
        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)
            t = time.time()
            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]
            if itr % PRINT_PROGRESS_EVERY == 0:
                # Periodic validation metrics on the edge type just trained.
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])
                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost), "val_roc=",
                      "{:.5f}".format(val_auc), "val_auprc=",
                      "{:.5f}".format(val_auprc), "val_apk=",
                      "{:.5f}".format(val_apk), "time=",
                      "{:.5f}".format(time.time() - t))
            itr += 1
    print("Optimization finished!")
    # Final test-set evaluation, once per edge type.
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
# NOTE(review): fragment — the enclosing dict literal and scope start
# before this view and the trailing dict is cut off. Node types appear to
# be 0 = protein, 1 = drug (see feat/num_feat keys below) — confirm.
    (1, 0): [drug_proten_interactions],  # NOTE(review): name typo "proten" — defined elsewhere
    (1, 1): [Drug_Drug_sim_adj, Drug_Drug_sim_adj],  #type3
}
# Node degrees per type, duplicated once per relation direction.
protein_degrees = np.array(Protein_Protein_sim_adj.sum(axis=0)).squeeze()
drug_degrees = np.array(Drug_Drug_sim_adj.sum(axis=0)).squeeze()
degrees = {
    0: [protein_degrees, protein_degrees],
    1: [drug_degrees, drug_degrees],
}
# # featureless
# Identity matrices == one-hot node features for both node types.
protein_feat = sp.identity(Protein_Protein_sim_adj.shape[0])
protein_nonzero_feat, protein_num_feat = protein_feat.shape
protein_feat = preprocessing.sparse_to_tuple(protein_feat.tocoo())
drug_feat = sp.identity(Drug_Drug_sim_adj.shape[0])
drug_nonzero_feat, drug_num_feat = drug_feat.shape
drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
num_feat = {
    0: protein_num_feat,
    1: drug_num_feat,
}
nonzero_feat = {
    0: protein_nonzero_feat,
    1: drug_nonzero_feat,
}
feat = {
    0: protein_feat,
def main_execution(combo_file='./polypharmacy/bio-decagon-combo.csv',
                   targets_file='./polypharmacy/bio-decagon-targets.csv',
                   genes_genes_file='./polypharmacy/bio-decagon-ppi.csv',
                   new_train_test_split=False):
    """Load Decagon (or DrugBank) csv files, build every adjacency matrix
    and feature structure, then train and evaluate the Decagon model.

    Parameters
    ----------
    combo_file : str
        Drug-drug combination/side-effect csv; a path containing 'decagon'
        switches to the Decagon loader, otherwise the DrugBank loader.
    targets_file : str
        Drug-to-target (gene) csv.
    genes_genes_file : str
        Gene-gene (PPI) interaction csv.
    new_train_test_split : bool
        When True, additionally builds per-split (train/test/valid)
        adjacency matrices and uses EdgeMinibatchIteratorNewSplit.
    """
    print('Load Combo to Side Effects')
    if combo_file.find('decagon') != -1:
        combo_to_drugs_ids, combo_to_side_effects, combo_to_side_effects_names, side_effects_ids_to_names = \
            load_decagon_combo_side_effect_file(fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id = load_decagon_file_targets_id(
            fichier=targets_file)
    else:
        combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
            fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id, drugs_id_to_drugs_name = load_file_targets_id(
            fichier=targets_file)
    print('Load genes to genes (targets) interactions net')
    genes_genes_net, genes_node_to_idx = load_genes_genes_interactions(
        fichier=genes_genes_file)
    print('Build genes-genes adjacency matrix')
    genes_adj = nx.adjacency_matrix(genes_genes_net)
    genes_degrees = np.array(genes_adj.sum(axis=0)).squeeze()
    if new_train_test_split:
        print('Load the new train test validation split')
        combo_to_drugs_ids_train, combo_to_drugs_ids_test, combo_to_drugs_ids_valid = train_test_valid_split_3(
        )
        drug_nodes_train = set(
            [u for e in combo_to_drugs_ids_train.values() for u in e])
        drug_nodes_test = set(
            [u for e in combo_to_drugs_ids_test.values() for u in e])
        drug_nodes_valid = set(
            [u for e in combo_to_drugs_ids_valid.values() for u in e])
    print('Build drugs-drugs matrix representation')
    drug_nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(drug_nodes)
    # One relation type per distinct side effect (values are collections).
    relation_types = set(
        [r for se in combo_to_side_effects.values() for r in se])
    drugs_nodes_to_idx = {node: i for i, node in enumerate(drug_nodes)}
    print('Build general drugs-drugs matrix representation')
    drug_drug_adj_list = []  # adjacency matrix for each drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(drug_nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                # list of values: check whether the side effect appears at
                # least once for this drug pair
                if el in combo_to_side_effects[temp_cle]:
                    # mark the interaction symmetrically
                    mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                        mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
                    # record an interaction
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]
    if new_train_test_split:
        print('Build train drugs-drugs matrix representation')
        drug_drug_adj_list_train = [
        ]  # adjacency matrix for each drug-drug relation (train split)
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_train), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        mat[drugs_nodes_to_idx[d1],
                            drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2],
                                drugs_nodes_to_idx[d1]] = 1.
                        # record an interaction
            drug_drug_adj_list_train.append(sp.csr_matrix(mat))
        drug_degrees_list_train = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_train
        ]
        print('Build test drugs-drugs matrix representation')
        drug_drug_adj_list_test = []  # adjacency per relation (test split)
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_test), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        mat[drugs_nodes_to_idx[d1],
                            drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2],
                                drugs_nodes_to_idx[d1]] = 1.
                        # record an interaction
            drug_drug_adj_list_test.append(sp.csr_matrix(mat))
        drug_degrees_list_test = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_test
        ]
        print('Build valid drugs-drugs matrix representation')
        drug_drug_adj_list_valid = [
        ]  # adjacency per relation (validation split)
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_valid), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        mat[drugs_nodes_to_idx[d1],
                            drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2],
                                drugs_nodes_to_idx[d1]] = 1.
                        # record an interaction
            drug_drug_adj_list_valid.append(sp.csr_matrix(mat))
        drug_degrees_list_valid = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_valid
        ]
    print('Build general genes-drugs matrix representation')
    genes_nodes = set([gene_node for gene_node in genes_node_to_idx.keys()])
    n_genes = len(genes_nodes)
    mat = np.zeros((n_genes, n_drugs))
    for drug in drug_nodes:
        if drug in drugs_id_to_targets_id.keys():
            for target in drugs_id_to_targets_id[drug]:
                if target in genes_node_to_idx.keys():
                    mat[genes_node_to_idx[target],
                        drugs_nodes_to_idx[drug]] = 1.
    genes_drugs_adj = sp.csr_matrix(mat)
    drugs_genes_adj = genes_drugs_adj.transpose(copy=True)
    if new_train_test_split:
        print('Build train genes-drugs matrix representation')
        # NOTE(review): `mat` is NOT re-zeroed before building the train,
        # test and valid genes-drugs matrices below, so each split matrix
        # accumulates all previously set entries (general + earlier
        # splits). This looks like a bug — each split likely needs
        # mat = np.zeros((n_genes, n_drugs)) first. Verify intent.
        for drug in drug_nodes_train:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_train = sp.csr_matrix(mat)
        drugs_genes_adj_train = genes_drugs_adj_train.transpose(copy=True)
        print('Build test genes-drugs matrix representation')
        for drug in drug_nodes_test:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_test = sp.csr_matrix(mat)
        drugs_genes_adj_test = genes_drugs_adj_test.transpose(copy=True)
        print('Build valid genes-drugs matrix representation')
        for drug in drug_nodes_valid:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_valid = sp.csr_matrix(mat)
        drugs_genes_adj_valid = genes_drugs_adj_valid.transpose(copy=True)
    print('Build general Adjacency matrix data representation')
    # Keys are (source_type, target_type) with 0 = gene, 1 = drug.
    adj_mats_orig = {
        (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
        (0, 1): [genes_drugs_adj],
        (1, 0): [drugs_genes_adj],
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    if new_train_test_split:
        print('Build train Adjacency matrix data representation')
        adj_mats_orig_train = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_train],
            (1, 0): [drugs_genes_adj_train],
            (1, 1):
            drug_drug_adj_list_train +
            [x.transpose(copy=True) for x in drug_drug_adj_list_train],
        }
        print('Build test Adjacency matrix data representation')
        adj_mats_orig_test = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_test],
            (1, 0): [drugs_genes_adj_test],
            (1, 1):
            drug_drug_adj_list_test +
            [x.transpose(copy=True) for x in drug_drug_adj_list_test],
        }
        print('Build valid Adjacency matrix data representation')
        adj_mats_orig_valid = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_valid],
            (1, 0): [drugs_genes_adj_valid],
            (1, 1):
            drug_drug_adj_list_valid +
            [x.transpose(copy=True) for x in drug_drug_adj_list_valid],
        }
    degrees = {
        0: [genes_degrees, genes_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }
    print('featureless (genes)')
    # Identity == one-hot feature per gene.
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())
    print('features (drugs)')
    # Identity == one-hot feature per drug.
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
    print('Features data representation')
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }
    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)
    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)
    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################
    if new_train_test_split:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIteratorNewSplit(
            adj_mats=adj_mats_orig,
            adj_mats_train=adj_mats_orig_train,
            adj_mats_test=adj_mats_orig_test,
            adj_mats_valid=adj_mats_orig_valid,
            feat=feat,
            edge_types=edge_types,
            batch_size=FLAGS.batch_size,
            val_test_size=val_test_size)
    else:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                          feat=feat,
                                          edge_types=edge_types,
                                          batch_size=FLAGS.batch_size,
                                          val_test_size=val_test_size)
    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )
    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)
    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}
    ###########################################################
    #
    # Train model
    #
    ###########################################################
    print("Train model")
    for epoch in range(FLAGS.epochs):
        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)
            t = time.time()
            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]
            if itr % PRINT_PROGRESS_EVERY == 0:
                # Periodic validation metrics on the edge type just trained.
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])
                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost), "val_roc=",
                      "{:.5f}".format(val_auc), "val_auprc=",
                      "{:.5f}".format(val_auprc), "val_apk=",
                      "{:.5f}".format(val_apk), "time=",
                      "{:.5f}".format(time.time() - t))
            itr += 1
    print("Optimization finished!")
    # Final test-set evaluation, once per edge type.
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
# NOTE(review): fragment — the enclosing dict and scope start before this
# view and the trailing dict is cut off. Feature keys below suggest node
# types 0 = person, 1 = company, 2 = bankruptcy — confirm upstream.
}
degrees = {
    0: pers_comp_degrees_list,
    1: comp_degrees_list + comp_degrees_list,
    2: [
        np.array([np.sum(comp_bankr_adj[0].T)]),
        np.array([np.sum(comp_bankr_adj[0].T)])
    ]
}
# features (Person): identity == one-hot feature per person
pers_feat = sp.identity(n_persons)
pers_nonzero_feat, pers_num_feat = pers_feat.shape
pers_feat = preprocessing.sparse_to_tuple(pers_feat.tocoo())
# features (Companies): identity == one-hot feature per company
comp_feat = sp.identity(n_companies)
comp_nonzero_feat, comp_num_feat = comp_feat.shape
comp_feat = preprocessing.sparse_to_tuple(comp_feat.tocoo())
# features (Bankruptcy): a single node of this type
n_bankruptcy = 1
banrp_feat = sp.identity(n_bankruptcy)
banrp_nonzero_feat, banrp_num_feat = banrp_feat.shape
banrp_feat = preprocessing.sparse_to_tuple(banrp_feat.tocoo())
# data representation
num_feat = {0: pers_num_feat, 1: comp_num_feat, 2: banrp_num_feat}
nonzero_feat = {
ppi_mat = ppi_adj.todense() # High memory requirement for big matrices # Calculate algorithmic complexity bdm = BDM(ndim=2, partition=PartitionRecursive) ppi_per = PerturbationExperiment(bdm, metric='bdm', bipartite_network=False) ppi_per.set_data(np.array(ppi_mat)) edge_complexity = ppi_per.run() # Reshape to the adj matrix shape complexity_mat = edge_complexity.reshape(np.shape(ppi_adj)) #============================= PRELIMINARY SAVING OF BDM ================================ # out_file_bdm = 'data_structures/BDM/EDGES_PPI_' + sim_type + '_genes_' + str( old_genes) print('Output BDM file: ', out_file_bdm, '\n') with open(out_file_bdm, 'wb') as f: pickle.dump(edge_complexity, f) # =============================== REMOVING EDGES ======================================== # coords, _, _ = sparse_to_tuple(ppi_adj) # Take the upper triangular coordinates upper_coords = coords[(coords[:, 1] - coords[:, 0] > 0).nonzero()] # Select abs of the complexity of selected entries true_cmplx = np.abs(complexity_mat[upper_coords[:, 0], upper_coords[:, 1]]).squeeze() # Give an index to the edge pair = np.array(list(enumerate(true_cmplx))) # Sort from greatest to lowest complexity sorted_pair = pair[pair[:, 1].argsort()][::-1] # Select sorted indices idx = sorted_pair[:, 0].astype(int) # Select a threshold entry according to the cut fraction threshold = np.floor(len(idx) * (1 - cut_frac)).astype(int) # Select indices above threshold idx = idx[:threshold]
if BDM: prot_feat = np.hstack([to_add_bdm_genes_dti, to_add_bdm_ppi]) # Drug features if DSE: drug_feat = np.asarray( np.hstack( [drug_feat.todense(), to_add_bdm_drugs_dti, to_add_bdm_ddi])) else: drug_feat = np.hstack([to_add_bdm_drugs_dti, to_add_bdm_ddi]) print('Drug feature matrix shape: ', np.shape(drug_feat)) print('Protein feature matrix shape: ', np.shape(prot_feat)) # Drug features drug_num_feat = drug_feat.shape[1] drug_nonzero_feat = len(np.nonzero(drug_feat)[0]) drug_feat = sparse_to_tuple(sp.coo_matrix(drug_feat)) # Protein features gene_num_feat = prot_feat.shape[1] gene_nonzero_feat = len(np.nonzero(prot_feat)[0]) gene_feat = sparse_to_tuple(sp.coo_matrix(prot_feat)) # ============================================================================================= # # CREATION OF DECAGON DICTIONARIES adj_mats_orig = { (0, 0): [ppi_adj], (0, 1): [dti_adj], (1, 0): [dti_adj.transpose(copy=True)], (1, 1): ddi_adj_list, } degrees = {0: [ppi_degrees], 1: ddi_degrees_list} edge_type2dim = { k: [adj.shape for adj in adjs]
def __init__(self, et):
    """Load the pre-built Decagon graph tensors from ``data_decagon/``.

    :param et: ordered collection of drug-drug edge-type ids; only the
        first ``len(et) // 2`` are loaded from disk — the remaining half
        is produced below as transposed matrices. (The original docstring
        referred to a nonexistent ``num`` parameter.)
    """
    # load data
    print("loading...")
    # temp = '/home/acq18hx/decagon/'
    temp = './'
    # Graph size metadata pickled by the preprocessing step.
    with open(temp + 'data_decagon/graph_num_info.pkl', 'rb') as f:
        [num_gene, num_drug, num_edge_type,
         num_drug_additional_feature] = pickle.load(f)
    # gene-gene
    gene_adj = sp.load_npz(temp + "data_decagon/gene-sparse-adj.npz")
    print("load gene_gene finished!")
    # gene-drug
    gene_drug_adj = sp.load_npz(temp +
                                "data_decagon/gene-drug-sparse-adj.npz")
    drug_gene_adj = sp.load_npz(temp +
                                "data_decagon/drug-gene-sparse-adj.npz")
    print("load gene_drug finished!")
    # drug-drug: one sparse adjacency file per requested edge type.
    drug_drug_adj_list = []
    l_et = int(len(et) / 2)
    for i in et[:l_et]:
        drug_drug_adj_list.append(
            sp.load_npz("".join([
                temp + "data_decagon/drug-sparse-adj/type_",
                str(i), ".npz"
            ])))
    print("load drug_drug finished!")
    drug_feat_sparse = sp.load_npz(temp +
                                   "data_decagon/drug-feature-sparse.npz")
    print("load drug_feature finished!")
    # -------------------------- gene feature --------------------------
    # featureless (genes): identity == one-hot feature per gene
    gene_feat = sp.identity(num_gene)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())
    # drug vectors with additional features (single side effect)
    # NOTE(review): this assigns nonzero_feat = shape[1] and
    # num_feat = count_nonzero(column sums) — the reverse of what the
    # names suggest (compare the gene case above, where shape unpacks
    # directly). Verify against the consumers of num_feat /
    # num_nonzero_feat before changing.
    drug_nonzero_feat, drug_num_feat = drug_feat_sparse.shape[
        1], np.count_nonzero(drug_feat_sparse.sum(axis=0))
    drug_feat = preprocessing.sparse_to_tuple(drug_feat_sparse.tocoo())
    # data representation: keys are (source_type, target_type),
    # 0 = gene, 1 = drug; each relation also gets its transpose.
    self.adj_mats_orig = {
        (0, 0): [gene_adj, gene_adj.transpose(copy=True)],
        (0, 1): [gene_drug_adj],
        (1, 0): [drug_gene_adj],
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]
    self.degrees = {
        0: [gene_degrees, gene_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }
    # data representation
    self.num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    self.num_nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    self.feat = {
        0: gene_feat,
        1: drug_feat,
    }
    self.edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in self.adj_mats_orig.items()
    }
    self.edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }
    self.edge_types = {k: len(v) for k, v in self.adj_mats_orig.items()}
    self.num_edge_types = sum(self.edge_types.values())
    print("Edge types:", "%d" % self.num_edge_types)
    print("======================================================")
def build_original(self):
    """Parse the raw Decagon csv files and build all graph structures.

    Reads the PPI, drug-drug combo, drug-target and mono-side-effect csv
    files, remaps the original sparse ids to dense 0-based indices, and
    fills self.adj_mats_orig, self.degrees, self.num_feat,
    self.num_nonzero_feat, self.feat, self.edge_type2dim,
    self.edge_type2decoder, self.edge_types and self.num_edge_types.
    """
    pp_f = "data_decagon/PP-Decagon_ppi.csv"
    dd_f = "data_decagon/bio-decagon-combo.csv"
    dp_f = "data_decagon/bio-decagon-targets.csv"
    ds_f = "data_decagon/bio-decagon-mono.csv"
    p_set, d_set, combo_set, mono_set = set(), set(), set(), set()
    pp_list, ddt_list, dp_list, ds_list = [], [], [], []
    a, b, c = 0, 0, 0  # temp variables
    # 1. Protein-Protein Association Network
    with open(pp_f, 'r') as f:
        ppi = csv.reader(f)
        next(ppi)  # skip the header row
        for [g1, g2] in ppi:
            a, b = int(g1), int(g2)
            p_set.add(a)
            p_set.add(b)
            pp_list.append((a, b))
    # 2. Drug-Drug Association Network
    with open(dd_f, "r") as f:
        ppi = csv.reader(f)
        next(ppi)
        for [d1, d2, t, n] in ppi:
            # keep only the numeric tail of the CID... / C... ids
            a, b, c = int(t.split('C')[-1]), int(d1.split('D')[-1]), int(
                d2.split('D')[-1])
            combo_set.add(a)
            d_set.add(b)
            d_set.add(c)
            ddt_list.append((b, c, a))
    # 3. Drug-Protein Association Network
    with open(dp_f, "r") as f:
        ppi = csv.reader(f)
        next(ppi)
        for [d, p] in ppi:
            a, b = int(d.split('D')[-1]), int(p)
            d_set.add(a)
            p_set.add(b)
            dp_list.append((a, b))
    # 4. Drug-SideEffect Association Network
    with open(ds_f, "r") as f:
        ppi = csv.reader(f)
        next(ppi)
        for [d, e, n] in ppi:
            a, b = int(e.split('C')[-1]), int(d.split('D')[-1])
            mono_set.add(a)
            d_set.add(b)
            ds_list.append((b, a))
    num_gene = p_set.__len__()
    num_drug = d_set.__len__()
    num_edge_type = combo_set.__len__()
    num_drug_additional_feature = mono_set.__len__()
    # -------------------------- gene adj --------------------------
    # 1-row sparse lookup tables mapping an original (sparse) id to its
    # dense 0-based index: new_id = X_to_new[0, old_id].
    gene_to_old = list(p_set)
    gene_to_new = sp.csr_matrix(
        (range(num_gene), ([0] * num_gene, gene_to_old)))
    drug_to_old = list(d_set)
    drug_to_new = sp.csr_matrix(
        (range(num_drug), ([0] * num_drug, drug_to_old)))
    edge_type_to_old = list(combo_set)
    edge_type_to_new = sp.csr_matrix(
        (range(num_edge_type), ([0] * num_edge_type, edge_type_to_old)))
    side_effect_to_old = list(mono_set)
    side_effect_to_new = sp.csr_matrix(
        (range(num_drug_additional_feature),
         ([0] * num_drug_additional_feature, side_effect_to_old)))
    r, c = [], []
    array_length = len(pp_list)
    # -------------------------- gene-gene adj --------------------------
    for i in range(array_length):
        r.append(gene_to_new[0, pp_list[i][0]])
        c.append(gene_to_new[0, pp_list[i][1]])
    gene_adj = sp.csr_matrix(([1] * array_length, (r, c)),
                             shape=(num_gene, num_gene))
    gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()
    r, c = [], []
    array_length = len(dp_list)
    # -------------------------- drug(row)-gene(col) adj --------------------------
    for i in range(array_length):
        r.append(drug_to_new[0, dp_list[i][0]])
        c.append(gene_to_new[0, dp_list[i][1]])
    drug_gene_adj = sp.csr_matrix(([1] * array_length, (r, c)),
                                  shape=(num_drug, num_gene))
    gene_drug_adj = drug_gene_adj.transpose(copy=True)
    r = {}
    array_length = len(ddt_list)
    # -------------------------- drug-drug adj list --------------------------
    # r maps edge-type index -> ([row indices], [col indices]).
    for i in range(array_length):
        c = edge_type_to_new[0, ddt_list[i][2]]
        if c not in r:
            r[c] = [drug_to_new[0, ddt_list[i][0]]
                    ], [drug_to_new[0, ddt_list[i][1]]]
        else:
            r[c][0].append(drug_to_new[0, ddt_list[i][0]])
            r[c][1].append(drug_to_new[0, ddt_list[i][1]])
    drug_drug_adj_list = []
    for i in range(num_edge_type):
        drug_drug_adj_list.append(
            sp.csr_matrix(([1] * len(r[i][0]), (r[i][0], r[i][1])),
                          shape=(num_drug, num_drug)))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]
    # -------------------------- gene feature --------------------------
    # featureless (genes): identity == one-hot feature per gene
    gene_feat = sp.identity(num_gene)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())
    # drug vectors with additional features (single side effect):
    # first num_drug columns form a drug one-hot block, the remaining
    # columns carry one bit per mono side effect the drug has.
    r, c = list(range(num_drug)), list(range(num_drug))
    for (a, b) in ds_list:
        r.append(drug_to_new[0, a])
        c.append(side_effect_to_new[0, b] + num_drug)
    array_length = num_drug + len(ds_list)
    drug_feat = sp.csr_matrix(
        ([1] * array_length, (r, c)),
        shape=(num_drug, num_drug + num_drug_additional_feature))
    # NOTE(review): assigns nonzero_feat = shape[1] and
    # num_feat = count_nonzero(column sums) — the reverse of what the
    # names suggest; verify against consumers of num_feat /
    # num_nonzero_feat before changing.
    drug_nonzero_feat, drug_num_feat = drug_feat.shape[
        1], np.count_nonzero(drug_feat.sum(axis=0))
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
    # data representation: keys are (source_type, target_type),
    # 0 = gene, 1 = drug; each relation also gets its transpose.
    self.adj_mats_orig = {
        (0, 0): [gene_adj, gene_adj.transpose(copy=True)],
        (0, 1): [gene_drug_adj],
        (1, 0): [drug_gene_adj],
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    self.degrees = {
        0: [gene_degrees, gene_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }
    # data representation
    self.num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    self.num_nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    self.feat = {
        0: gene_feat,
        1: drug_feat,
    }
    self.edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in self.adj_mats_orig.items()
    }
    self.edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }
    self.edge_types = {k: len(v) for k, v in self.adj_mats_orig.items()}
    self.num_edge_types = sum(self.edge_types.values())
    print("Edge types:", "%d" % self.num_edge_types)
def main(args):
    """Build the Decagon polypharmacy graph, then train and evaluate the model.

    Pipeline:
      1. Parse CLI args and pre-filter the combo file down to side effects
         with >= 500 occurrences.
      2. Load the protein-protein, drug-protein and drug-drug networks.
      3. Build (or load from cache) one drug-drug adjacency matrix per
         side-effect type.
      4. Create the minibatch iterator, model and optimizer; train unless a
         saved checkpoint already exists.
      5. Print test-set predictions for every edge type.

    Parameters
    ----------
    args : list of str
        Command-line arguments, e.g. ``sys.argv[1:]``.

    Side effects
    ------------
    Writes a filtered CSV, cached adjacency matrices, model checkpoints and a
    pickled minibatch iterator to disk; sets the module-level globals
    ``adj_mats_orig``, ``num_edge_types``, ``minibatch_iterator``,
    ``optimizer`` and ``sess``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--decagon_data_file_directory",
        type=str,
        help=
        "path to directory where bio-decagon-*.csv files are located, with trailing slash. "
        "Default is current directory",
        default='./')
    parser.add_argument(
        "--saved_files_directory",
        type=str,
        help=
        "path to directory where saved files files are located, with trailing slash. "
        "Default is current directory. If a decagon_model.ckpt* exists in this directory, it will "
        "be loaded and evaluated, and no training will be done.",
        default='./')
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true",
                        default=False)
    args = parser.parse_args(args)
    decagon_data_file_directory = args.decagon_data_file_directory
    verbose = args.verbose

    script_start_time = datetime.now()

    # Create a pre-processed file that only has side effects with >= 500
    # occurrences (the "common side effects" subset used in the paper).
    all_combos_df = pd.read_csv('%sbio-decagon-combo.csv' %
                                decagon_data_file_directory)
    side_effects_500 = all_combos_df["Polypharmacy Side Effect"].value_counts()
    side_effects_500 = side_effects_500[side_effects_500 >= 500].index.tolist()
    all_combos_df = all_combos_df[
        all_combos_df["Polypharmacy Side Effect"].isin(side_effects_500)]
    all_combos_df.to_csv('%sbio-decagon-combo-over500only.csv' %
                         decagon_data_file_directory,
                         index=False)

    # Use the pre-processed file that only contains the most common side
    # effects (those with >= 500 drug pairs).
    drug_drug_net, combo2stitch, combo2se, se2name = load_combo_se(
        fname=('%sbio-decagon-combo-over500only.csv' %
               decagon_data_file_directory))
    # gene_net is a networkx graph with genes (proteins) as nodes and
    # protein-protein interactions as edges; node2idx maps node id -> index.
    gene_net, node2idx = load_ppi(fname=('%sbio-decagon-ppi.csv' %
                                         decagon_data_file_directory))
    # stitch2se maps (individual) stitch ids to a list of side effect ids.
    # se2name_mono maps mono-file side effect ids to names (shorter than se2name).
    stitch2se, se2name_mono = load_mono_se(fname=('%sbio-decagon-mono.csv' %
                                                  decagon_data_file_directory))
    # stitch2proteins maps stitch ids (drugs) to protein (gene) ids.
    drug_gene_net, stitch2proteins = load_targets(
        fname=('%sbio-decagon-targets-all.csv' % decagon_data_file_directory))

    # This was 0.05 in the original code, but the paper says that 10% each
    # are used for testing and validation.
    val_test_size = 0.1

    n_genes = gene_net.number_of_nodes()
    gene_adj = nx.adjacency_matrix(gene_net)
    gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()

    ordered_list_of_drugs = list(drug_drug_net.nodes.keys())
    ordered_list_of_side_effects = list(se2name.keys())
    ordered_list_of_proteins = list(gene_net.nodes.keys())
    n_drugs = len(ordered_list_of_drugs)

    # FIX: precompute O(1) lookup tables once, instead of calling
    # list.index() (an O(n) scan) inside the nested loops below.
    drug_idx_of = {drug: i for i, drug in enumerate(ordered_list_of_drugs)}
    protein_idx_of = {p: i for i, p in enumerate(ordered_list_of_proteins)}
    side_effect_idx_of = {
        se: i
        for i, se in enumerate(ordered_list_of_side_effects)
    }

    # FIX: sp.lil_matrix((rows, cols)) is already all-zero; wrapping a dense
    # np.zeros array needlessly allocated ~n_drugs*n_genes floats.
    drug_gene_adj = sp.lil_matrix((n_drugs, n_genes))
    for drug in stitch2proteins:
        for protein in stitch2proteins[drug]:
            # There are quite a few drugs in here that aren't in our list of
            # 645, and proteins that aren't in our list of 19081.
            if drug in drug_idx_of and protein in protein_idx_of:
                drug_gene_adj[drug_idx_of[drug], protein_idx_of[protein]] = 1
    drug_gene_adj = drug_gene_adj.tocsr()
    # Needs to be drug vs. gene matrix (645x19081); transpose for gene-drug.
    gene_drug_adj = drug_gene_adj.transpose(copy=True)

    drug_drug_adj_list = []
    if not os.path.isfile("adjacency_matrices/sparse_matrix0000.npz"):
        # Pre-initialize all the matrices (one per side-effect type).
        print("Initializing drug-drug adjacency matrix list")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))
        n = len(ordered_list_of_side_effects)
        for i in range(n):
            # FIX: sparse all-zero matrix without a dense intermediate.
            drug_drug_adj_list.append(sp.lil_matrix((n_drugs, n_drugs)))
            if verbose:
                print("%s percent done" % str(100.0 * i / n))
        print("Done initializing at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        combo_finish_time = start_time
        print("Creating adjacency matrices for side effects")
        print("Starting at %s" % str(start_time))
        combo_count = len(combo2se)
        combo_counter = 0
        for combo in combo2se.keys():
            side_effect_list = combo2se[combo]
            for present_side_effect in side_effect_list:
                # Find the matrix we need to update.
                side_effect_number = side_effect_idx_of[present_side_effect]
                # Find the drugs for which we need to make the update.
                drug_tuple = combo2stitch[combo]
                drug1_index = drug_idx_of[drug_tuple[0]]
                drug2_index = drug_idx_of[drug_tuple[1]]
                # Update.
                drug_drug_adj_list[side_effect_number][drug1_index,
                                                       drug2_index] = 1
            if verbose and combo_counter % 1000 == 0:
                print("Finished combo %s after %s . %d percent of combos done"
                      % (combo_counter, str(combo_finish_time - start_time),
                         (100.0 * combo_counter / combo_count)))
            combo_finish_time = datetime.now()
            combo_counter += 1
        print("Done creating adjacency matrices at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        print("Saving matrices to file")
        print("Starting at %s" % str(start_time))
        # Save matrices to file so later runs can skip the expensive build.
        if not os.path.isdir("adjacency_matrices"):
            os.mkdir("adjacency_matrices")
        for i in range(len(drug_drug_adj_list)):
            sp.save_npz('adjacency_matrices/sparse_matrix%04d.npz' % (i, ),
                        drug_drug_adj_list[i].tocoo())
        print("Done saving matrices to file at %s after %s" %
              (datetime.now(), datetime.now() - start_time))
    else:
        print("Loading adjacency matrices from file.")
        for i in range(len(ordered_list_of_side_effects)):
            drug_drug_adj_list.append(
                sp.load_npz('adjacency_matrices/sparse_matrix%04d.npz' % i))

    # CSR is the efficient format for the arithmetic done during training.
    for i in range(len(drug_drug_adj_list)):
        drug_drug_adj_list[i] = drug_drug_adj_list[i].tocsr()

    start_time = datetime.now()
    print("Setting up for training")
    print("Starting at %s" % str(start_time))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    # Data representation: (node_type, node_type) -> list of adjacency mats.
    global adj_mats_orig
    adj_mats_orig = {
        (0, 0): [gene_adj,
                 gene_adj.transpose(copy=True)
                 ],  # protein-protein interactions (and inverses)
        (0, 1): [gene_drug_adj],  # protein-drug (inverse of targets)
        (1, 0): [drug_gene_adj],  # drug-protein (targets)
        # This creates an "inverse" relationship for every polypharmacy side
        # effect, using the transpose of the relationship's adjacency matrix,
        # resulting in 2x the number of side effects (and adjacency matrices).
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: [gene_degrees, gene_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    # Featureless (genes): one-hot identity features.
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    # Features (drugs): also one-hot identity features here.
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # Data representation: node type (0 = gene, 1 = drug) -> feature info.
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }

    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }
    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    global num_edge_types
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    ###########################################################
    #
    # Settings and placeholders
    #
    ###########################################################

    # Important -- do not evaluate/print validation performance every
    # iteration as it can take a substantial amount of time.
    PRINT_PROGRESS_EVERY = 10000
    print("Defining placeholders")
    construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    global minibatch_iterator
    iterator_pickle_file_name = (args.saved_files_directory +
                                 "minibatch_iterator.pickle")
    if os.path.isfile(iterator_pickle_file_name):
        print("Load minibatch iterator pickle")
        with open(iterator_pickle_file_name, 'rb') as pickle_file:
            minibatch_iterator = pickle.load(pickle_file)
    else:
        print("Create minibatch iterator")
        minibatch_iterator = EdgeMinibatchIterator(
            adj_mats=adj_mats_orig,
            feat=feat,
            edge_types=edge_types,
            batch_size=FLAGS.batch_size,
            val_test_size=val_test_size)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    global optimizer
    with tf.name_scope('optimizer'):
        optimizer = DecagonOptimizer(embeddings=model.embeddings,
                                     latent_inters=model.latent_inters,
                                     latent_varies=model.latent_varies,
                                     degrees=degrees,
                                     edge_types=edge_types,
                                     edge_type2dim=edge_type2dim,
                                     placeholders=placeholders,
                                     batch_size=FLAGS.batch_size,
                                     margin=FLAGS.max_margin)
    print("Done setting up at %s after %s" %
          (datetime.now(), datetime.now() - start_time))

    print("Initialize session")
    global sess
    sess = tf.Session()
    decagon_model_file_name = args.saved_files_directory + "decagon_model.ckpt"
    saved_model_available = os.path.isfile(decagon_model_file_name + ".index")
    if saved_model_available:
        # A checkpoint exists: restore it and skip training entirely.
        saver = tf.train.Saver()
        saver.restore(sess, decagon_model_file_name)
        print("Model restored.")
    if not saved_model_available:
        print("Training model")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))
        sess.run(tf.global_variables_initializer())
        feed_dict = {}

        ###########################################################
        #
        # Train model
        #
        ###########################################################

        saver = tf.train.Saver()
        print("Train model")
        epoch_losses = []
        for epoch in range(FLAGS.epochs):
            minibatch_iterator.shuffle()
            itr = 0
            while not minibatch_iterator.end():
                # Construct feed dictionary.
                feed_dict = minibatch_iterator.next_minibatch_feed_dict(
                    placeholders=placeholders)
                feed_dict = minibatch_iterator.update_feed_dict(
                    feed_dict=feed_dict,
                    dropout=FLAGS.dropout,
                    placeholders=placeholders)
                t = time.time()
                # Training step: run single weight update.
                outs = sess.run([
                    optimizer.opt_op, optimizer.cost,
                    optimizer.batch_edge_type_idx
                ],
                                feed_dict=feed_dict)
                train_cost = outs[1]
                batch_edge_type = outs[2]
                if itr % PRINT_PROGRESS_EVERY == 0:
                    val_auc, val_auprc, val_apk = get_accuracy_scores(
                        minibatch_iterator.val_edges,
                        minibatch_iterator.val_edges_false,
                        minibatch_iterator.idx2edge_type[
                            minibatch_iterator.current_edge_type_idx],
                        feed_dict)
                    print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                          "%04d" % (itr + 1), "Edge:",
                          "%04d" % batch_edge_type, "train_loss=",
                          "{:.5f}".format(train_cost), "val_roc=",
                          "{:.5f}".format(val_auc), "val_auprc=",
                          "{:.5f}".format(val_auprc), "val_apk=",
                          "{:.5f}".format(val_apk), "time=",
                          "{:.5f}".format(time.time() - t))
                itr += 1
            validation_loss = get_validation_loss(
                edges_pos=minibatch_iterator.val_edges,
                edges_neg=minibatch_iterator.val_edges_false,
                feed_dict=feed_dict)
            print(
                "Epoch:", "%04d" % (epoch + 1),
                "Validation loss (average cross entropy): {}".format(
                    validation_loss))
            epoch_losses.append(validation_loss)
            # Early stopping: quit once the rounded validation loss has not
            # improved for two consecutive epochs.
            if len(epoch_losses) >= 3:
                if round(epoch_losses[-1], 3) >= round(
                        epoch_losses[-2], 3) >= round(epoch_losses[-3], 3):
                    break
            print("Saving model after epoch:", epoch)
            save_path = saver.save(
                sess, args.saved_files_directory + "decagon_model" +
                str(epoch) + ".ckpt")
            print("Model saved in path: %s" % save_path)
        print("Optimization finished!")
        print("Done training model %s after %s" %
              (datetime.now(), datetime.now() - start_time))
        print("Saving model")
        save_path = saver.save(sess, decagon_model_file_name)
        print("Model saved in path: %s" % save_path)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    start_time = datetime.now()
    print("Evaluating model")
    print("Starting at %s" % str(start_time))
    # FIX: the count of forward (non-inverse) side-effect relations was
    # hard-coded as 963; derive it from the data so the offset stays correct
    # for any side-effect subset.
    n_side_effect_relations = len(drug_drug_adj_list)
    for edge_type in range(num_edge_types):
        # Get all edges in the test set with this type.
        feed_dict = minibatch_iterator.test_feed_dict(
            edge_type, placeholders=placeholders)
        feed_dict = minibatch_iterator.update_feed_dict(
            feed_dict, FLAGS.dropout, placeholders)
        edge_tuple = minibatch_iterator.idx2edge_type[edge_type]
        _, _, all_scores, all_labels, subjects, predicates, objects = get_predictions(
            edges_pos=minibatch_iterator.test_edges,
            edges_neg=minibatch_iterator.test_edges_false,
            edge_type=edge_tuple,
            feed_dict=feed_dict)
        print("subject\tpredicate\tobject\tpredicted\tactual")
        for i in range(len(all_scores)):
            # Map node indices back to drug/protein identifiers.
            subject = subjects[i]
            if edge_tuple[0] == 1:
                subject = ordered_list_of_drugs[subject]
            else:
                subject = ordered_list_of_proteins[subject]
            # FIX: renamed from 'object', which shadowed the builtin.
            obj = objects[i]
            if edge_tuple[1] == 1:
                obj = ordered_list_of_drugs[obj]
            else:
                obj = ordered_list_of_proteins[obj]
            predicate = predicates[i]
            if edge_tuple[:2] == (1, 1):
                side_effect_index = edge_tuple[2]
                is_inverse = False
                if side_effect_index >= n_side_effect_relations:
                    # Indices past the forward relations are the transposed
                    # ("inverse") copies appended in adj_mats_orig[(1, 1)].
                    side_effect_index = side_effect_index - n_side_effect_relations
                    is_inverse = True
                predicate = ordered_list_of_side_effects[side_effect_index]
                if is_inverse:
                    predicate = predicate + "_2"
            print("{}\t{}\t{}\t{}\t{}".format(subject, predicate, obj,
                                              all_scores[i], all_labels[i]))
        print()
    print("Done evaluating at %s after %s" %
          (datetime.now(), datetime.now() - start_time))
    print("Script running time: %s" % (datetime.now() - script_start_time))
# Per-node-type degree vectors: column sums of each adjacency matrix;
# .squeeze() flattens the (1, n) matrix result into a 1-D array.
protein_degrees = np.array(protein_protein_adj.sum(axis=0)).squeeze()
drug_degrees = np.array(drug_drug_adj.sum(axis=0)).squeeze()
disease_degrees = np.array(disease_drug_adj.sum(axis=0)).squeeze()
side_effect_degrees = np.array(side_effect_drug_adj.sum(axis=0)).squeeze()
# Node type -> list of degree vectors; types 0 (protein) and 1 (drug) carry
# two entries, presumably one per direction of the relation — TODO confirm
# against how `degrees` is consumed downstream.
degrees = {
    0: [protein_degrees, protein_degrees],
    1: [drug_degrees, drug_degrees],
    2: [disease_degrees],
    3: [side_effect_degrees]
}
# Featureless (genes): one-hot identity features.
# NOTE(review): 1512 is presumably the protein count of this dataset —
# confirm against protein_protein_adj.shape.
gene_feat = sp.identity(1512)
protein_nonzero_feat, protein_num_feat = gene_feat.shape
gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())
# Features (drugs): one-hot identity features.
# NOTE(review): 708 is presumably the drug count — confirm against
# drug_drug_adj.shape.
drug_feat = sp.identity(708)
# drug_feat = Drug_Drug_adj
drug_nonzero_feat, drug_num_feat = drug_feat.shape
drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
# Features (diseases): one-hot identity features.
# NOTE(review): "diease" is a typo for "disease" (left unrenamed here);
# 5603 is presumably the disease count — confirm against disease_drug_adj.
diease_feat = sp.identity(5603)
diease_nonzero_feat, diease_num_feat = diease_feat.shape
diease_feat = preprocessing.sparse_to_tuple(diease_feat.tocoo())
# NOTICE