def _minibatch_iterator_init(self, path_to_split: str, batch_size: int,
                             val_test_size: float) -> None:
    """
    Create minibatch iterator (self.minibatch).

    Parameters
    ----------
    path_to_split : str
        Path to save train, test and validate edges. If it already contains
        the needed edges, they will be loaded. Else they will be calculated
        and saved.
    batch_size : int
        Minibatch size.
    val_test_size : float
        Proportion to split edges into train, test and validate.
    """
    print('Create minibatch iterator')
    # Only re-sample the edge split when the split directory does not
    # already hold the expected 6 files.
    need_sample_edges = not (os.path.isdir(path_to_split)
                             and len(os.listdir(path_to_split)) == 6)
    self.minibatch = EdgeMinibatchIterator(
        adj_mats=self.adj_mats,
        feat=self.feat,
        edge_types=self.edge_types,
        symmetry_types_groups=self.symmetry_types_groups,
        batch_size=batch_size,
        val_test_size=val_test_size,
        path_to_split=path_to_split,
        need_sample_edges=need_sample_edges
    )
flags.DEFINE_boolean('bias', True, 'Bias term.') print 'Defining placeholders' placeholders = construct_placeholders(edge_types) ########################################################### # # Create minibatch iterator, model and optimizer # ########################################################### print 'Create minibatch iterator' minibatch = EdgeMinibatchIterator( adj_mats=adj_mats_orig, feat=feat, edge_types=edge_types, directed=edge_type2directed, batch_size=FLAGS.batch_size ) print 'Create model' model = DecagonModel( placeholders=placeholders, num_feat=num_feat, nonzero_feat=nonzero_feat, edge_types=edge_types, decoders=edge_type2decoder, ) print 'Create optimizer' with tf.name_scope('optimizer'):
# Model hyper-parameters, registered as TF flags.
flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_float('weight_decay', 0.001,
                   'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0.1, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('max_margin', 0.1, 'Max margin parameter in hinge loss')
flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
flags.DEFINE_boolean('bias', True, 'Bias term.')

print("Defining placeholders")
placeholders = construct_placeholders(edge_types)

print("Create minibatch iterator")
minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                  feat=feat,
                                  edge_types=edge_types,
                                  batch_size=FLAGS.batch_size,
                                  val_test_size=val_test_size)

print("Create model")
model = DecagonModel(
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

print("Create optimizer")
# NOTE(review): the statement below is truncated at the chunk boundary.
with tf.name_scope('optimizer'):
    opt = DecagonOptimizer(embeddings=model.embeddings,
def main_execution():
    """
    End-to-end Decagon run on the DrugBank drug-drug combination data:
    build one drug-drug adjacency matrix per side effect, create the
    minibatch iterator, model and optimizer, then train and evaluate.
    """
    combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
        fichier='polypharmacy/drugbank/drugbank-combo.csv')
    nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(nodes)
    relation_types = set([r for r in combo_to_side_effects.values()])
    n_drugdrug_rel_types = len(relation_types)
    # Stable drug-id -> row/column index mapping shared by all matrices.
    drugs_to_positions_in_matrices_dict = {
        node: i for i, node in enumerate(nodes)
    }
    drug_drug_adj_list = []  # one drug-drug adjacency matrix per side effect
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if combo_to_side_effects[temp_cle] == el:
                    # Mark a real side effect between the two drugs,
                    # symmetrically, in the matrix.
                    mat[drugs_to_positions_in_matrices_dict[d1],
                        drugs_to_positions_in_matrices_dict[d2]] = \
                        mat[drugs_to_positions_in_matrices_dict[d2],
                            drugs_to_positions_in_matrices_dict[d1]] = 1.
        # Record the interactions for this side effect.
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]
    # Single node type (0 = drugs); matrices plus their transposes.
    adj_mats_orig = {
        (0, 0):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: drug_degrees_list + drug_degrees_list,
    }
    # features (drugs): one-hot identity features.
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
    # data representation
    num_feat = {
        0: drug_num_feat,
    }
    nonzero_feat = {
        0: drug_nonzero_feat,
    }
    feat = {
        0: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'dedicom',
    }
    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)
    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)
    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################
    print("Create minibatch iterator")
    minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                      feat=feat,
                                      edge_types=edge_types,
                                      batch_size=FLAGS.batch_size,
                                      val_test_size=val_test_size)
    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )
    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)
    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}
    ###########################################################
    #
    # Train model
    #
    ###########################################################
    print("Train model")
    for epoch in range(FLAGS.epochs):
        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)
            t = time.time()
            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]
            # Periodic validation; evaluating every iteration is slow.
            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])
                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost),
                      "val_roc=", "{:.5f}".format(val_auc),
                      "val_auprc=", "{:.5f}".format(val_auprc),
                      "val_apk=", "{:.5f}".format(val_apk),
                      "time=", "{:.5f}".format(time.time() - t))
            itr += 1
    print("Optimization finished!")
    # Final evaluation on held-out test edges, one edge type at a time.
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
# Script fragment: count edge types, then build the minibatch iterator
# and the Decagon model for this data set.
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

# Important -- Do not evaluate/print validation performance every iteration
# as it can take substantial amount of time
PRINT_PROGRESS_EVERY = 20

print("Defining placeholders")
placeholders = construct_placeholders(edge_types)

print("Create minibatch iterator")
minibatch = EdgeMinibatchIterator(
    adj_mats=adj_mats_orig,
    seed=seed,
    feat=feat,
    edge_types=edge_types,
    data_set=data_set,
    batch_size=FLAGS.batch_size,
    val_test_size=val_test_size,
)

print("Create model")
model = DecagonModel(
    data_set=data_set,
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)
def main_execution(combo_file='./polypharmacy/bio-decagon-combo.csv',
                   targets_file='./polypharmacy/bio-decagon-targets.csv',
                   genes_genes_file='./polypharmacy/bio-decagon-ppi.csv',
                   new_train_test_split=False):
    """
    End-to-end Decagon run on a two-node-type graph (0 = genes, 1 = drugs):
    load combos/targets/PPI files, build all adjacency matrices (optionally
    also separate train/test/valid ones), then create the minibatch
    iterator, model and optimizer, train and evaluate.

    Parameters
    ----------
    combo_file : str
        Drug-drug combination/side-effect CSV. If the file name contains
        'decagon', the BIO-Decagon loaders are used; otherwise DrugBank.
    targets_file : str
        Drug-to-target CSV.
    genes_genes_file : str
        Protein-protein interaction CSV.
    new_train_test_split : bool
        If True, load a precomputed 3-way drug split and build separate
        train/test/valid adjacency matrices.
    """
    print('Load Combo to Side Effects')
    if combo_file.find('decagon') != -1:
        combo_to_drugs_ids, combo_to_side_effects, combo_to_side_effects_names, side_effects_ids_to_names = \
            load_decagon_combo_side_effect_file(fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id = load_decagon_file_targets_id(
            fichier=targets_file)
    else:
        combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
            fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id, drugs_id_to_drugs_name = load_file_targets_id(
            fichier=targets_file)
    print('Load genes to genes (targets) interactions net')
    genes_genes_net, genes_node_to_idx = load_genes_genes_interactions(
        fichier=genes_genes_file)
    print('Build genes-genes adjacency matrix')
    genes_adj = nx.adjacency_matrix(genes_genes_net)
    genes_degrees = np.array(genes_adj.sum(axis=0)).squeeze()
    if new_train_test_split:
        print('Load the new train test validation split')
        combo_to_drugs_ids_train, combo_to_drugs_ids_test, combo_to_drugs_ids_valid = train_test_valid_split_3(
        )
        drug_nodes_train = set(
            [u for e in combo_to_drugs_ids_train.values() for u in e])
        drug_nodes_test = set(
            [u for e in combo_to_drugs_ids_test.values() for u in e])
        drug_nodes_valid = set(
            [u for e in combo_to_drugs_ids_valid.values() for u in e])
    print('Build drugs-drugs matrix representation')
    drug_nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(drug_nodes)
    relation_types = set(
        [r for se in combo_to_side_effects.values() for r in se])
    # Stable drug-id -> row/column index mapping shared by all matrices.
    drugs_nodes_to_idx = {node: i for i, node in enumerate(drug_nodes)}
    print('Build general drugs-drugs matrix representation')
    drug_drug_adj_list = []  # one drug-drug adjacency matrix per side effect
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(drug_nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if el in combo_to_side_effects[temp_cle]:
                    # Values are lists: check whether this side effect
                    # appears at least once for the pair; mark symmetrically.
                    mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                        mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
        # Record the interactions for this side effect.
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]
    if new_train_test_split:
        print('Build train drugs-drugs matrix representation')
        drug_drug_adj_list_train = [
        ]  # one adjacency matrix per side effect (train drugs only)
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_train), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_train.append(sp.csr_matrix(mat))
        drug_degrees_list_train = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_train
        ]
        print('Build test drugs-drugs matrix representation')
        drug_drug_adj_list_test = []  # same, restricted to test drugs
        for i, el in enumerate(relation_types):
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_test), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_test.append(sp.csr_matrix(mat))
        drug_degrees_list_test = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_test
        ]
        print('Build valid drugs-drugs matrix representation')
        drug_drug_adj_list_valid = [
        ]  # same, restricted to validation drugs
        for i, el in enumerate(relation_types):
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_valid), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_valid.append(sp.csr_matrix(mat))
        drug_degrees_list_valid = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_valid
        ]
    print('Build general genes-drugs matrix representation')
    genes_nodes = set([gene_node for gene_node in genes_node_to_idx.keys()])
    n_genes = len(genes_nodes)
    mat = np.zeros((n_genes, n_drugs))
    for drug in drug_nodes:
        if drug in drugs_id_to_targets_id.keys():
            for target in drugs_id_to_targets_id[drug]:
                if target in genes_node_to_idx.keys():
                    mat[genes_node_to_idx[target],
                        drugs_nodes_to_idx[drug]] = 1.
    genes_drugs_adj = sp.csr_matrix(mat)
    drugs_genes_adj = genes_drugs_adj.transpose(copy=True)
    if new_train_test_split:
        # NOTE(review): `mat` is NOT reset to zeros before each of the
        # train/test/valid builds below, so each split's gene-drug matrix
        # accumulates the entries of the previous ones — confirm intended.
        print('Build train genes-drugs matrix representation')
        for drug in drug_nodes_train:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_train = sp.csr_matrix(mat)
        drugs_genes_adj_train = genes_drugs_adj_train.transpose(copy=True)
        print('Build test genes-drugs matrix representation')
        for drug in drug_nodes_test:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_test = sp.csr_matrix(mat)
        drugs_genes_adj_test = genes_drugs_adj_test.transpose(copy=True)
        print('Build valid genes-drugs matrix representation')
        for drug in drug_nodes_valid:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_valid = sp.csr_matrix(mat)
        drugs_genes_adj_valid = genes_drugs_adj_valid.transpose(copy=True)
    print('Build general Adjacency matrix data representation')
    # Edge-type keys: (0, 0) gene-gene, (0, 1)/(1, 0) gene-drug and
    # transpose, (1, 1) drug-drug (one matrix per side effect + transposes).
    adj_mats_orig = {
        (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
        (0, 1): [genes_drugs_adj],
        (1, 0): [drugs_genes_adj],
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    if new_train_test_split:
        print('Build train Adjacency matrix data representation')
        adj_mats_orig_train = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_train],
            (1, 0): [drugs_genes_adj_train],
            (1, 1):
            drug_drug_adj_list_train +
            [x.transpose(copy=True) for x in drug_drug_adj_list_train],
        }
        print('Build test Adjacency matrix data representation')
        adj_mats_orig_test = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_test],
            (1, 0): [drugs_genes_adj_test],
            (1, 1):
            drug_drug_adj_list_test +
            [x.transpose(copy=True) for x in drug_drug_adj_list_test],
        }
        print('Build valid Adjacency matrix data representation')
        adj_mats_orig_valid = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_valid],
            (1, 0): [drugs_genes_adj_valid],
            (1, 1):
            drug_drug_adj_list_valid +
            [x.transpose(copy=True) for x in drug_drug_adj_list_valid],
        }
    degrees = {
        0: [genes_degrees, genes_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }
    print('featureless (genes)')
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())
    print('features (drugs)')
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
    print('Features data representation')
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }
    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)
    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)
    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################
    if new_train_test_split:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIteratorNewSplit(
            adj_mats=adj_mats_orig,
            adj_mats_train=adj_mats_orig_train,
            adj_mats_test=adj_mats_orig_test,
            adj_mats_valid=adj_mats_orig_valid,
            feat=feat,
            edge_types=edge_types,
            batch_size=FLAGS.batch_size,
            val_test_size=val_test_size)
    else:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                          feat=feat,
                                          edge_types=edge_types,
                                          batch_size=FLAGS.batch_size,
                                          val_test_size=val_test_size)
    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )
    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)
    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}
    ###########################################################
    #
    # Train model
    #
    ###########################################################
    print("Train model")
    for epoch in range(FLAGS.epochs):
        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)
            t = time.time()
            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]
            # Periodic validation; evaluating every iteration is slow.
            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])
                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost),
                      "val_roc=", "{:.5f}".format(val_auc),
                      "val_auprc=", "{:.5f}".format(val_auprc),
                      "val_apk=", "{:.5f}".format(val_apk),
                      "time=", "{:.5f}".format(time.time() - t))
            itr += 1
    print("Optimization finished!")
    # Final evaluation on held-out test edges, one edge type at a time.
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
class RunDecagon(metaclass=ABCMeta):
    """
    Abstract class of Decagon runner. Different subclasses define specific
    behavior (e.g. run on synthetic data or real).

    Attributes
    ----------
    adj_mats : Dict[Tuple[int, int], List[sp.csr_matrix]]
        From edge type to list of adjacency matrices for each edge class
        (e.g. (1, 1): list of drug-drug adjacency matrices for each se
        class). In our case all matrix in adj_mats are symmetric.
    degrees : Dict[int, List[int]]
        Number of connections for each node (0: genes, 1: drugs).
    edge_type2dim : Dict[Tuple[int, int], List[int]]
        From edge type to list of shapes all its adjacency matrices.
    edge_type2decoder : Dict[Tuple[int, int], str]
        From edge type to decoder type (we use different decompositions for
        different edges types).
    edge_types : Dict[Tuple[int, int], int]
        From edge type to number of classes of these edge type
        (e. g. (1, 1): number of se).
    num_edge_types : int
        Number of all edge types (considering all classes).
    symmetry_types_groups : List[List]
        Should contains lists with len in {1, 2}. All types of edges splits
        into groups of symmetry. E. g. symmetry_types_groups =
        [[(0, 0)], [(0, 1), (1, 0)], [(1, 1)]]. Two types from one group of
        symmetry have same edges, differing only in direction (e.g (0, 1)
        has protein -> drug edges and (1, 0) has drug -> protein edges).
    num_feat : Dict[int, int]
        Number of elements in feature vector for 0: -genes, for 1: -drugs.
    nonzero_feat : Dict[int, int]
        Number of all features for 0: -gene and 1: -drug nodes.
    feat : Dict[int, sp.csr_matrix]
        From edge type (0 = gene, 1 = drug) to feature matrix. Row in
        feature matrix = embedding of one node.
    minibatch : EdgeMinibatchIterator
        Minibatch iterator.
    placeholders : Dict[str, tf.compat.v1.placeholder]
        Variables for input data in decagon model.
    model : DecagonModel
        Decagon model (encoder + decoder).
    opt : DecagonOptimizer
        Optimizer of decagon weights.
    """

    def __init__(self):
        # All attributes are filled in later by the _-prefixed setup
        # methods invoked from run(); see the class docstring for meaning.
        self.adj_mats = None
        self.degrees = None
        self.num_feat = None
        self.nonzero_feat = None
        self.feat = None
        self.edge_type2dim = None
        self.edge_type2decoder = None
        self.edge_types = None
        self.num_edge_types = None
        self.minibatch = None
        self.opt = None
        self.placeholders = None
        self.model = None
        self.feed_dict = None
        # NOTE(review): symmetry_types_groups is documented as an attribute
        # but only created in _edge_types_info, not here.
        pass

    def _adjacency(self, adj_path: str) -> None:
        """
        Create self.adj_mats, self.degrees.

        Parameters
        ----------
        adj_path : str
            path for saving/loading adjacency matrices.

        Notes
        -----
        self.adj_mats: Dict[Tuple[int, int], List[sp.csr_matrix]]
            From edge type to list of adjacency matrices for each edge class
            (e.g. (1, 1): list of drug-drug adjacency matrices for each se
            class). In our case all matrix in adj_mats are symmetric.
        self.degrees: Dict[int, List[int]]
            Number of connections for each node (0: genes, 1: drugs).
        """
        raise NotImplementedError()

    def _nodes_features(self) -> None:
        """
        Create self.num_feat, self.nonzero_feat, self.feat.

        Notes
        -----
        self.num_feat : Dict[int, int]
            Number of elements in feature vector for 0: -genes,
            for 1: -drugs.
        self.nonzero_feat : Dict[int, int]
            Number of all features for 0: -gene and 1: -drug nodes.
            All features should be nonzero!??
            TODO: What to do with zero features??
            E.g., it is in format 0: num of genes in graph, 1: num of drugs.
        self.feat : Dict[int, sp.csr_matrix]
            From edge type (0 = gene, 1 = drug) to feature matrix. Row in
            feature matrix = embedding of one node.
        """
        raise NotImplementedError()

    def _edge_types_info(self) -> None:
        """
        Create self.edge_type2dim, self.edge_type2decoder, self.edge_types,
        self.num_edge_types.

        Notes
        -----
        self.edge_types : Dict[Tuple[int, int], int]
            From edge type to number of classes of these edge type
            (e. g. (1, 1): number of se).
        self.num_edge_types : int
            Number of all edge types (considering all classes).
        """
        self.edge_type2dim = {k: [adj.shape for adj in adjs]
                              for k, adjs in self.adj_mats.items()}
        # Drug-drug edges get the richer DEDICOM decomposition; all other
        # edge types use a plain bilinear decoder.
        self.edge_type2decoder = {
            (0, 0): 'bilinear',
            (0, 1): 'bilinear',
            (1, 0): 'bilinear',
            (1, 1): 'dedicom',
        }
        # (0, 1) and (1, 0) contain the same edges in opposite directions.
        self.symmetry_types_groups = [
            [(0, 0)],
            [(0, 1), (1, 0)],
            [(1, 1)]
        ]
        self.edge_types = {k: len(v) for k, v in self.adj_mats.items()}
        self.num_edge_types = sum(self.edge_types.values())
        print(f'Edge types {self.num_edge_types}')

    def _minibatch_iterator_init(self, path_to_split: str, batch_size: int,
                                 val_test_size: float) -> None:
        """
        Create minibatch iterator (self.minibatch).

        Parameters
        ----------
        path_to_split : str
            Path to save train, test and validate edges. If it already
            contains the needed edges, they will be loaded. Else they will
            be calculated and saved.
        batch_size : int
            Minibatch size.
        val_test_size : float
            Proportion to split edges into train, test and validate.
        """
        print('Create minibatch iterator')
        # Only re-sample the edge split when the split directory does not
        # already hold the expected 6 files.
        need_sample_edges = not (os.path.isdir(path_to_split)
                                 and len(os.listdir(path_to_split)) == 6)
        self.minibatch = EdgeMinibatchIterator(
            adj_mats=self.adj_mats,
            feat=self.feat,
            edge_types=self.edge_types,
            symmetry_types_groups=self.symmetry_types_groups,
            batch_size=batch_size,
            val_test_size=val_test_size,
            path_to_split=path_to_split,
            need_sample_edges=need_sample_edges
        )

    def _construct_placeholders(self) -> None:
        """
        Create self.placeholders.

        Notes
        -----
        Placeholders - input data in tf1.
        """
        print("Defining placeholders")
        self.placeholders = {
            'batch': tf.compat.v1.placeholder(tf.int32, name='batch'),
            'batch_edge_type_idx':
                tf.compat.v1.placeholder(tf.int32, shape=(),
                                         name='batch_edge_type_idx'),
            'batch_row_edge_type':
                tf.compat.v1.placeholder(tf.int32, shape=(),
                                         name='batch_row_edge_type'),
            'batch_col_edge_type':
                tf.compat.v1.placeholder(tf.int32, shape=(),
                                         name='batch_col_edge_type'),
            'degrees': tf.compat.v1.placeholder(tf.int32),
            'dropout': tf.compat.v1.placeholder_with_default(0., shape=()),
        }
        # One sparse placeholder per adjacency matrix (edge type x class).
        adj_placeholders = {
            'adj_mats_%d,%d,%d' % (i, j, k):
                tf.compat.v1.sparse_placeholder(tf.float32)
            for i, j in self.edge_types
            for k in range(self.edge_types[i, j])}
        self.placeholders.update(adj_placeholders)
        # One sparse placeholder per node type's feature matrix.
        features_placeholders = {
            'feat_%d' % i: tf.compat.v1.sparse_placeholder(tf.float32)
            for i, _ in self.edge_types}
        self.placeholders.update(features_placeholders)

    def _model_init(self) -> None:
        """
        Create self.model.
        """
        print("Create model")
        self.model = DecagonModel(
            placeholders=self.placeholders,
            num_feat=self.num_feat,
            nonzero_feat=self.nonzero_feat,
            edge_types=self.edge_types,
            decoders=self.edge_type2decoder,
        )

    def _optimizer_init(self, batch_size: int, max_margin: float) -> None:
        """
        Create self.opt.

        Parameters
        ----------
        batch_size : int
            Minibatch size.
        max_margin : float
            Max margin parameter in hinge loss.
        """
        print("Create optimizer")
        with tf.compat.v1.name_scope('optimizer'):
            self.opt = DecagonOptimizer(
                embeddings=self.model.embeddings,
                latent_inters=self.model.latent_inters,
                latent_varies=self.model.latent_varies,
                degrees=self.degrees,
                edge_types=self.edge_types,
                edge_type2dim=self.edge_type2dim,
                placeholders=self.placeholders,
                batch_size=batch_size,
                margin=max_margin
            )

    def _get_accuracy_scores(self, sess: tf.compat.v1.Session,
                             edges_pos: Dict[Tuple[int, int],
                                             List[np.array]],
                             edges_neg: Dict[Tuple[int, int],
                                             List[np.array]],
                             edge_type: Tuple[int, int, int]):
        """
        Calculate metrics (AUROC, AUPRC, AP@50)

        Parameters
        ----------
        sess : tf.compat.v1.Session
            Initialized tf session.
        edges_pos : Dict[Tuple[int, int], List[np.array]]
            From edge type to np.arrays of real edges for every edge class
            in this type.
        edges_neg : Dict[Tuple[int, int], List[np.array]]
            From edge type to np.arrays of fake edges for every edge class
            in this type.
        edge_type : Tuple[int, int, int]
            Edge type with class. Two first elements --- edge type, last
            element --- class in this type.

        Returns
        -------
        (roc_sc, aupr_sc, apk_sc) : Tuple[float, float, float]
            AUROC, AUPRC and AP@50 for the given edge type and class.
        """
        # Evaluate with dropout off and the requested edge type selected.
        self.feed_dict.update({self.placeholders['dropout']: 0})
        self.feed_dict.update({self.placeholders['batch_edge_type_idx']:
                               self.minibatch.edge_type2idx[edge_type]})
        self.feed_dict.update({self.placeholders['batch_row_edge_type']:
                               edge_type[0]})
        self.feed_dict.update({self.placeholders['batch_col_edge_type']:
                               edge_type[1]})
        rec = sess.run(self.opt.predictions, feed_dict=self.feed_dict)
        uv = edges_pos[edge_type[:2]][edge_type[2]]
        u = uv[:, 0]
        v = uv[:, 1]
        # expit = logistic sigmoid: raw scores -> probabilities.
        preds = expit(rec[u, v])
        assert np.all(self.adj_mats[edge_type[:2]][edge_type[2]][u, v] == 1), \
            'Positive examples (real edges) are not exist'
        uv = edges_neg[edge_type[:2]][edge_type[2]]
        u = uv[:, 0]
        v = uv[:, 1]
        preds_neg = expit(rec[u, v])
        assert np.all(self.adj_mats[edge_type[:2]][edge_type[2]][u, v] == 0), \
            'Negative examples (fake edges) are real'
        # Predicted probs
        preds_all = np.hstack([preds, preds_neg])
        # preds_all = np.nan_to_num(preds_all)
        # Real probs: 1 for pos, 0 for neg
        labels_all = np.hstack([np.ones(len(preds)),
                                np.zeros(len(preds_neg))])
        roc_sc = metrics.roc_auc_score(labels_all, preds_all)
        aupr_sc = metrics.average_precision_score(labels_all, preds_all)
        # Real existing edges (local indexes)
        actual = range(len(preds))
        # All local indexes with probability (sorted)
        predicted = sorted(range(len(preds_all)), reverse=True,
                           key=lambda i: preds_all[i])
        apk_sc = rank_metrics.apk(actual, predicted, k=50)
        return roc_sc, aupr_sc, apk_sc

    def _run_epoch(self, sess: tf.compat.v1.Session, dropout: float,
                   print_progress_every: int, epoch: int,
                   log: bool) -> None:
        """
        Run one epoch.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            Initialized tf session.
        dropout : float
            Dropout rate (1 - keep probability).
        print_progress_every : int
            Print statistic every print_progress_every iterations.
        epoch : int
            Number of current epoch (for printing statistic).
        log : bool
            Whether to log or not.
        """
        self.minibatch.shuffle()
        for batch_edges, current_edge_type, current_edge_type_idx in self.minibatch:
            # Construct feed dictionary
            self.feed_dict = self.minibatch.batch_feed_dict(
                batch_edges=batch_edges,
                batch_edge_type=current_edge_type_idx,
                dropout=dropout,
                placeholders=self.placeholders)
            t = time.time()
            # Training step: run single weight update
            outs = sess.run([self.opt.opt_op, self.opt.cost,
                             self.opt.batch_edge_type_idx],
                            feed_dict=self.feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]
            # Periodic validation; evaluating every iteration is slow.
            if self.minibatch.iter % print_progress_every == 0:
                val_auc, val_auprc, val_apk = self._get_accuracy_scores(
                    sess, self.minibatch.val_edges,
                    self.minibatch.val_edges_false, current_edge_type)
                print("Epoch:", "%04d" % (epoch + 1),
                      "Iter:", "%04d" % (self.minibatch.iter + 1),
                      "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost),
                      "val_roc=", "{:.5f}".format(val_auc),
                      "val_auprc=", "{:.5f}".format(val_auprc),
                      "val_apk=", "{:.5f}".format(val_apk),
                      "time=", "{:.5f}".format(time.time() - t))
                if log:
                    import neptune
                    neptune.log_metric("val_roc", val_auc,
                                       timestamp=time.time())
                    neptune.log_metric("val_apk", val_apk,
                                       timestamp=time.time())
                    neptune.log_metric("val_auprc", val_auprc,
                                       timestamp=time.time())
                    neptune.log_metric("train_loss", train_cost,
                                       timestamp=time.time())

    def run(self, adj_path: str, path_to_split: str, val_test_size: float,
            batch_size: int, num_epochs: int, dropout: float,
            max_margin: float, print_progress_every: int, log: bool,
            on_cpu: bool, seed: int = 123,
            upload_saved: bool = False) -> None:
        """
        Run Decagon.

        Parameters
        ----------
        upload_saved : bool
            Default = False
            Whether to restore a previously saved model before training.
        adj_path : str
            path for saving/loading adjacency matrices.
        path_to_split : str
            path to save train, test and validate edges. If it already
            contains the needed edges, they will be loaded. Else they will
            be calculated and saved.
        batch_size : int
            Minibatch size.
        val_test_size : float
            proportion to split edges into train, test and validate.
        num_epochs : int
            number of training epochs.
        dropout : float
            Dropout rate (1 - keep probability).
        print_progress_every : int
            Print statistic every print_progress_every iterations.
        log : bool
            Whether to log or not.
        on_cpu : bool
            Run on cpu instead of gpu.
        max_margin : float
            Max margin parameter in hinge loss.
        seed : int
            Random seed.
        """
        np.random.seed(seed)
        # check if all path exists
        if adj_path and not os.path.exists(adj_path):
            os.makedirs(adj_path)
        if not os.path.exists(path_to_split):
            os.makedirs(path_to_split)
        if not os.path.exists(os.path.dirname(MODEL_SAVE_PATH)):
            os.makedirs(os.path.dirname(MODEL_SAVE_PATH))
        if on_cpu:
            # Hide all GPUs from TF so it falls back to CPU.
            os.environ['CUDA_VISIBLE_DEVICES'] = ""
        # Build everything in dependency order.
        self._adjacency(adj_path)
        self._nodes_features()
        self._edge_types_info()
        self._construct_placeholders()
        self._minibatch_iterator_init(path_to_split, batch_size,
                                      val_test_size)
        self._model_init()
        self._optimizer_init(batch_size, max_margin)
        print("Initialize session")
        saver = tf.compat.v1.train.Saver()
        sess = tf.compat.v1.Session()
        sess.run(tf.compat.v1.global_variables_initializer())
        self.feed_dict = {}
        if upload_saved:
            # NOTE(review): global_variables_initializer() is re-run right
            # after restore, which re-initializes (clobbers) the restored
            # weights before the second restore below — confirm intended.
            saver.restore(sess, MODEL_TO_UPLOAD)
            sess.run(tf.compat.v1.global_variables_initializer())
            self.minibatch.shuffle()
            for batch_edges, current_edge_type, current_edge_type_idx in self.minibatch:
                # Construct feed dictionary
                self.feed_dict = self.minibatch.batch_feed_dict(
                    batch_edges=batch_edges,
                    batch_edge_type=current_edge_type_idx,
                    dropout=dropout,
                    placeholders=self.placeholders)
            saver.restore(sess, MODEL_SAVE_PATH)
        # Checkpoints for this run go into a timestamped subdirectory.
        dir_to_save_model = f"{MODEL_SAVE_PATH}/model_{datetime.now().isoformat()[:-7]}"
        os.makedirs(dir_to_save_model, exist_ok=True)
        for epoch in range(num_epochs):
            self._run_epoch(sess, dropout, print_progress_every, epoch, log)
            saver.save(sess, f"{dir_to_save_model}/epoch_{epoch}.ckpt")
        print("Optimization finished!")
        # Final evaluation on held-out test edges, one edge type at a time.
        for et in range(self.num_edge_types):
            roc_score, auprc_score, apk_score = self._get_accuracy_scores(
                sess, self.minibatch.test_edges,
                self.minibatch.test_edges_false,
                self.minibatch.idx2edge_type[et])
            print("Edge type=",
                  "[%02d, %02d, %02d]" % self.minibatch.idx2edge_type[et])
            print("Edge type:", "%04d" % et, "Test AUROC score",
                  "{:.5f}".format(roc_score))
            print("Edge type:", "%04d" % et, "Test AUPRC score",
                  "{:.5f}".format(auprc_score))
            print("Edge type:", "%04d" % et, "Test AP@k score",
                  "{:.5f}".format(apk_score))
            print()
            if log:
                import neptune
                neptune.log_metric("ROC-AUC", roc_score)
                neptune.log_metric("AUPRC", auprc_score)
                neptune.log_metric("AP@k score", apk_score)
# Script fragment: load pickled data structures, build the minibatch
# iterator, then export it (with memory/time stats) to a new pickle.
print('\n==== IMPORTED VARIABLES ====')
# NOTE(review): pickle.load on an arbitrary file is unsafe for untrusted
# input, and injecting every key into globals() hides the module's real
# names — consider an explicit namespace dict instead.
with open(in_file, 'rb') as f:
    DS = pickle.load(f)
for key in DS.keys():
    globals()[key] = DS[key]
    print(key, "Imported successfully")
print('\n')
n_genes = len(gene2idx)
n_drugs = len(drug2idx)
n_se_combo = len(se_combo_name2idx)
# ============================================================================================= #
# CREATE MINIBATCH
print("Create minibatch iterator\n")
minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                  feat=feat,
                                  edge_types=edge_types,
                                  batch_size=args.batch_size,
                                  val_test_size=args.val_test_size)
# ============================================================================================= #
# EXPORT DATA
# Output file name encodes the data-set sizes and split parameters.
out_file = 'data/data_structures/MINIBATCH/MINIBATCH_' + words[2]+\
    '_genes_' + str(n_genes) + '_drugs_'+ str(n_drugs) + '_se_' + str(n_se_combo)+\
    '_batchsize_'+str(args.batch_size)+'_valsize_'+str(args.val_test_size)
print('Output file: ', out_file, '\n')
memUse = ps.memory_info()
data = {}
data['minibatch'] = minibatch
data['mb_vms'] = memUse.vms
data['mb_rss'] = memUse.rss
data['mb_time'] = time.time() - start
# NOTE(review): the statement below is truncated at the chunk boundary.
with open(out_file, 'wb') as f:
def main(args):
    """Run the end-to-end Decagon polypharmacy pipeline.

    Stages: parse CLI arguments; pre-filter the drug-combination CSV down to
    side effects with >= 500 occurrences; load the protein-protein,
    drug-protein and drug-drug graphs; build (or load from a cache directory)
    the per-side-effect drug-drug adjacency matrices; assemble the Decagon
    data structures; create/restore the minibatch iterator, model and
    optimizer; train unless a saved model checkpoint exists; finally print
    test-set predictions for every edge type.

    Parameters
    ----------
    args : list of str
        Raw command-line arguments, as accepted by
        ``argparse.ArgumentParser.parse_args``. Note that the name ``args``
        is rebound to the parsed namespace below.

    Side effects: writes CSV/npz/pickle/checkpoint files in the working and
    saved-files directories, and publishes ``adj_mats_orig``,
    ``num_edge_types``, ``minibatch_iterator``, ``optimizer`` and ``sess`` as
    module globals — presumably read by helper functions such as
    ``get_accuracy_scores``/``get_predictions`` (confirm in the full file).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--decagon_data_file_directory",
        type=str,
        help=
        "path to directory where bio-decagon-*.csv files are located, with trailing slash. "
        "Default is current directory",
        default='./')
    parser.add_argument(
        "--saved_files_directory",
        type=str,
        help=
        "path to directory where saved files files are located, with trailing slash. "
        "Default is current directory. If a decagon_model.ckpt* exists in this directory, it will "
        "be loaded and evaluated, and no training will be done.",
        default='./')
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true",
                        default=False)
    args = parser.parse_args(args)
    decagon_data_file_directory = args.decagon_data_file_directory
    verbose = args.verbose
    script_start_time = datetime.now()

    # create pre-processed file that only has side effect with >=500 occurrences
    all_combos_df = pd.read_csv('%sbio-decagon-combo.csv' %
                                decagon_data_file_directory)
    side_effects_500 = all_combos_df["Polypharmacy Side Effect"].value_counts()
    side_effects_500 = side_effects_500[side_effects_500 >= 500].index.tolist()
    all_combos_df = all_combos_df[
        all_combos_df["Polypharmacy Side Effect"].isin(side_effects_500)]
    all_combos_df.to_csv('%sbio-decagon-combo-over500only.csv' %
                         decagon_data_file_directory,
                         index=False)

    # use pre=processed file that only contains the most common side effects (those with >= 500 drug pairs)
    drug_drug_net, combo2stitch, combo2se, se2name = load_combo_se(
        fname=('%sbio-decagon-combo-over500only.csv' %
               decagon_data_file_directory))
    # net is a networkx graph with genes(proteins) as nodes and protein-protein-interactions as edges
    # node2idx maps node id to node index
    gene_net, node2idx = load_ppi(fname=('%sbio-decagon-ppi.csv' %
                                         decagon_data_file_directory))
    # stitch2se maps (individual) stitch ids to a list of side effect ids
    # se2name_mono maps side effect ids that occur in the mono file to side effect names (shorter than se2name)
    stitch2se, se2name_mono = load_mono_se(fname=('%sbio-decagon-mono.csv' %
                                                  decagon_data_file_directory))
    # stitch2proteins maps stitch ids (drug) to protein (gene) ids
    drug_gene_net, stitch2proteins = load_targets(
        fname=('%sbio-decagon-targets-all.csv' % decagon_data_file_directory))
    # se2class maps side effect id to class name

    # this was 0.05 in the original code, but the paper says that 10% each are used for testing and validation
    val_test_size = 0.1
    n_genes = gene_net.number_of_nodes()
    gene_adj = nx.adjacency_matrix(gene_net)
    gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()

    # Node orderings fix the index spaces used by every adjacency matrix below.
    ordered_list_of_drugs = list(drug_drug_net.nodes.keys())
    ordered_list_of_side_effects = list(se2name.keys())
    ordered_list_of_proteins = list(gene_net.nodes.keys())
    n_drugs = len(ordered_list_of_drugs)

    # Build the drug x gene "targets" matrix from the stitch->proteins map.
    drug_gene_adj = sp.lil_matrix(np.zeros((n_drugs, n_genes)))
    for drug in stitch2proteins:
        for protein in stitch2proteins[drug]:
            # there are quite a few drugs in here that aren't in our list of 645,
            # and proteins that aren't in our list of 19081
            if drug in ordered_list_of_drugs and protein in ordered_list_of_proteins:
                # NOTE(review): list.index inside a double loop is O(n) per call;
                # a precomputed id->index dict would be linear overall.
                drug_index = ordered_list_of_drugs.index(drug)
                gene_index = ordered_list_of_proteins.index(protein)
                drug_gene_adj[drug_index, gene_index] = 1
    drug_gene_adj = drug_gene_adj.tocsr()

    # needs to be drug vs. gene matrix (645x19081)
    gene_drug_adj = drug_gene_adj.transpose(copy=True)

    # One drug-drug adjacency matrix per side effect; cached on disk as npz
    # files so subsequent runs skip the expensive construction.
    drug_drug_adj_list = []
    if not os.path.isfile("adjacency_matrices/sparse_matrix0000.npz"):
        # pre-initialize all the matrices
        print("Initializing drug-drug adjacency matrix list")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))
        n = len(ordered_list_of_side_effects)
        for i in range(n):
            drug_drug_adj_list.append(
                sp.lil_matrix(np.zeros((n_drugs, n_drugs))))
            if verbose:
                print("%s percent done" % str(100.0 * i / n))
        print("Done initializing at %s after %s" %
              (datetime.now(), datetime.now() - start_time))
        start_time = datetime.now()
        combo_finish_time = start_time
        print("Creating adjacency matrices for side effects")
        print("Starting at %s" % str(start_time))
        combo_count = len(combo2se)
        combo_counter = 0
        # for side_effect_type in ordered_list_of_side_effects:
        #     for drug1, drug2 in combinations(list(range(n_drugs)), 2):
        for combo in combo2se.keys():
            side_effect_list = combo2se[combo]
            for present_side_effect in side_effect_list:
                # find the matrix we need to update
                side_effect_number = ordered_list_of_side_effects.index(
                    present_side_effect)
                # find the drugs for which we need to make the update
                drug_tuple = combo2stitch[combo]
                drug1_index = ordered_list_of_drugs.index(drug_tuple[0])
                drug2_index = ordered_list_of_drugs.index(drug_tuple[1])
                # update
                drug_drug_adj_list[side_effect_number][drug1_index,
                                                       drug2_index] = 1
            # Progress report once per 1000 combos.
            if verbose and combo_counter % 1000 == 0:
                print(
                    "Finished combo %s after %s . %d percent of combos done" %
                    (combo_counter, str(combo_finish_time - start_time),
                     (100.0 * combo_counter / combo_count)))
                combo_finish_time = datetime.now()
            combo_counter = combo_counter + 1
        print("Done creating adjacency matrices at %s after %s" %
              (datetime.now(), datetime.now() - start_time))
        start_time = datetime.now()
        print("Saving matrices to file")
        print("Starting at %s" % str(start_time))
        # save matrices to file
        if not os.path.isdir("adjacency_matrices"):
            os.mkdir("adjacency_matrices")
        for i in range(len(drug_drug_adj_list)):
            sp.save_npz('adjacency_matrices/sparse_matrix%04d.npz' % (i, ),
                        drug_drug_adj_list[i].tocoo())
        print("Done saving matrices to file at %s after %s" %
              (datetime.now(), datetime.now() - start_time))
    else:
        print("Loading adjacency matrices from file.")
        for i in range(len(ordered_list_of_side_effects)):
            drug_drug_adj_list.append(
                sp.load_npz('adjacency_matrices/sparse_matrix%04d.npz' % i))
    # CSR format for fast arithmetic/slicing during training.
    for i in range(len(drug_drug_adj_list)):
        drug_drug_adj_list[i] = drug_drug_adj_list[i].tocsr()

    start_time = datetime.now()
    print("Setting up for training")
    print("Starting at %s" % str(start_time))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    # data representation
    global adj_mats_orig
    adj_mats_orig = {
        (0, 0): [gene_adj, gene_adj.transpose(copy=True)
                 ],  # protein-protein interactions (and inverses)
        (0, 1): [gene_drug_adj],  # protein-drug relationships (inverse of targets)
        (1, 0): [drug_gene_adj],  # drug-protein relationships (targets)
        # This creates an "inverse" relationship for every polypharmacy side effect, using the transpose of the
        # relationship's adjacency matrix, resulting in 2x the number of side effects (and adjacency matrices).
        (1, 1):
        drug_drug_adj_list + [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: [gene_degrees, gene_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    # featureless (genes)
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    # features (drugs)
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    global num_edge_types
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    ###########################################################
    #
    # Settings and placeholders
    #
    ###########################################################

    # Important -- Do not evaluate/print validation performance every iteration as it can take
    # substantial amount of time
    PRINT_PROGRESS_EVERY = 10000

    print("Defining placeholders")
    # NOTE(review): the return value is discarded and `placeholders` (used
    # below) is not assigned in this function — presumably a module global set
    # inside construct_placeholders; confirm against the full file.
    construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    # Reuse a pickled iterator when available: its train/val/test edge split
    # must stay consistent with any restored model checkpoint.
    global minibatch_iterator
    iterator_pickle_file_name = args.saved_files_directory + "minibatch_iterator.pickle"
    if os.path.isfile(iterator_pickle_file_name):
        print("Load minibatch iterator pickle")
        with open(iterator_pickle_file_name, 'rb') as pickle_file:
            minibatch_iterator = pickle.load(pickle_file)
    else:
        print("Create minibatch iterator")
        minibatch_iterator = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                                   feat=feat,
                                                   edge_types=edge_types,
                                                   batch_size=FLAGS.batch_size,
                                                   val_test_size=val_test_size)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    global optimizer
    with tf.name_scope('optimizer'):
        optimizer = DecagonOptimizer(embeddings=model.embeddings,
                                     latent_inters=model.latent_inters,
                                     latent_varies=model.latent_varies,
                                     degrees=degrees,
                                     edge_types=edge_types,
                                     edge_type2dim=edge_type2dim,
                                     placeholders=placeholders,
                                     batch_size=FLAGS.batch_size,
                                     margin=FLAGS.max_margin)
    print("Done setting up at %s after %s" %
          (datetime.now(), datetime.now() - start_time))

    print("Initialize session")
    global sess
    sess = tf.Session()

    # If a final checkpoint exists, restore it and skip training entirely.
    decagon_model_file_name = args.saved_files_directory + "decagon_model.ckpt"
    saved_model_available = os.path.isfile(decagon_model_file_name + ".index")
    if saved_model_available:
        saver = tf.train.Saver()
        saver.restore(sess, decagon_model_file_name)
        print("Model restored.")
    if not saved_model_available:
        print("Training model")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))
        sess.run(tf.global_variables_initializer())
        feed_dict = {}

        ###########################################################
        #
        # Train model
        #
        ###########################################################
        saver = tf.train.Saver()
        print("Train model")
        epoch_losses = []
        for epoch in range(FLAGS.epochs):
            minibatch_iterator.shuffle()
            itr = 0
            while not minibatch_iterator.end():
                # Construct feed dictionary
                feed_dict = minibatch_iterator.next_minibatch_feed_dict(
                    placeholders=placeholders)
                feed_dict = minibatch_iterator.update_feed_dict(
                    feed_dict=feed_dict,
                    dropout=FLAGS.dropout,
                    placeholders=placeholders)

                t = time.time()

                # Training step: run single weight update
                outs = sess.run([
                    optimizer.opt_op, optimizer.cost,
                    optimizer.batch_edge_type_idx
                ],
                                feed_dict=feed_dict)
                train_cost = outs[1]
                batch_edge_type = outs[2]

                # Periodic (expensive) validation-metric report.
                if itr % PRINT_PROGRESS_EVERY == 0:
                    val_auc, val_auprc, val_apk = get_accuracy_scores(
                        minibatch_iterator.val_edges,
                        minibatch_iterator.val_edges_false,
                        minibatch_iterator.idx2edge_type[
                            minibatch_iterator.current_edge_type_idx],
                        feed_dict)

                    print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                          "%04d" % (itr + 1), "Edge:",
                          "%04d" % batch_edge_type, "train_loss=",
                          "{:.5f}".format(train_cost), "val_roc=",
                          "{:.5f}".format(val_auc), "val_auprc=",
                          "{:.5f}".format(val_auprc), "val_apk=",
                          "{:.5f}".format(val_apk), "time=",
                          "{:.5f}".format(time.time() - t))

                itr += 1
            # End-of-epoch validation loss drives early stopping.
            validation_loss = get_validation_loss(
                edges_pos=minibatch_iterator.val_edges,
                edges_neg=minibatch_iterator.val_edges_false,
                feed_dict=feed_dict)
            print(
                "Epoch:", "%04d" % (epoch + 1),
                "Validation loss (average cross entropy): {}".format(
                    validation_loss))
            epoch_losses.append(validation_loss)
            # Early stop when the (rounded) loss has not improved for two
            # consecutive epochs.
            if len(epoch_losses) >= 3:
                if round(epoch_losses[-1], 3) >= round(
                        epoch_losses[-2], 3) >= round(epoch_losses[-3], 3):
                    break
            print("Saving model after epoch:", epoch)
            save_path = saver.save(
                sess, args.saved_files_directory + "decagon_model" +
                str(epoch) + ".ckpt")
            print("Model saved in path: %s" % save_path)
        print("Optimization finished!")
        print("Done training model %s after %s" %
              (datetime.now(), datetime.now() - start_time))
        print("Saving model")
        save_path = saver.save(sess, decagon_model_file_name)
        print("Model saved in path: %s" % save_path)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    start_time = datetime.now()
    print("Evaluating model")
    print("Starting at %s" % str(start_time))
    for edge_type in range(num_edge_types):
        # get all edges in test set with this type
        feed_dict = minibatch_iterator.test_feed_dict(
            edge_type, placeholders=placeholders)
        feed_dict = minibatch_iterator.update_feed_dict(
            feed_dict, FLAGS.dropout, placeholders)
        edge_tuple = minibatch_iterator.idx2edge_type[edge_type]
        _, _, all_scores, all_labels, subjects, predicates, objects = get_predictions(
            edges_pos=minibatch_iterator.test_edges,
            edges_neg=minibatch_iterator.test_edges_false,
            edge_type=edge_tuple,
            feed_dict=feed_dict)
        print("subject\tpredicate\tobject\tpredicted\tactual")
        for i in range(len(all_scores)):
            # Map node indices back to drug/protein identifiers; node class 1
            # is drugs, class 0 is proteins (see adj_mats_orig keys).
            subject = subjects[i]
            if edge_tuple[0] == 1:
                subject = ordered_list_of_drugs[subject]
            else:
                subject = ordered_list_of_proteins[subject]
            # NOTE(review): `object` shadows the builtin of the same name
            # (harmless here, but worth renaming).
            object = objects[i]
            if edge_tuple[1] == 1:
                object = ordered_list_of_drugs[object]
            else:
                object = ordered_list_of_proteins[object]
            predicate = predicates[i]
            if edge_tuple[:2] == (1, 1):
                side_effect_index = edge_tuple[2]
                is_inverse = False
                # NOTE(review): 963 hard-codes len(drug_drug_adj_list) for the
                # >=500-occurrence dataset; indices >= 963 are the transposed
                # "inverse" side-effect relations appended above.
                if side_effect_index >= 963:
                    side_effect_index = side_effect_index - 963
                    is_inverse = True
                predicate = ordered_list_of_side_effects[side_effect_index]
                if is_inverse:
                    predicate = predicate + "_2"
            print("{}\t{}\t{}\t{}\t{}".format(subject, predicate, object,
                                              all_scores[i], all_labels[i]))
        print()
    print("Done evaluating at %s after %s" %
          (datetime.now(), datetime.now() - start_time))
    print("Script running time: %s" % (datetime.now() - script_start_time))
###########################################################
#
# Create minibatch iterator, model and optimizer
#
###########################################################

print("Create minibatch iterator")
path_to_split = f'data/split/{val_test_size}'
# Only resample train/val/test edges when the cached split directory is
# missing or incomplete (it appears to hold exactly 6 split files — see
# EdgeMinibatchIterator for the expected contents).
split_is_cached = (os.path.isdir(path_to_split)
                   and len(os.listdir(path_to_split)) == 6)
need_sample_edges = not split_is_cached
minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                  feat=feat,
                                  edge_types=edge_types,
                                  batch_size=PARAMS['batch_size'],
                                  val_test_size=val_test_size,
                                  path_to_split=path_to_split,
                                  need_sample_edges=need_sample_edges)

print("Create model")
model = DecagonModel(placeholders=placeholders,
                     num_feat=num_feat,
                     nonzero_feat=nonzero_feat,
                     edge_types=edge_types,
                     decoders=edge_type2decoder)

print("Create optimizer")