Example #1
0
    def _nodes_features(self) -> None:
        """
        Create self.num_feat, self.nonzero_feat, self.feat.

        Notes
        -----
        One-hot encoding as genes features.
        Binary vectors with presence of different side effects as drugs
        features.
        self.num_feat : Dict[int, int]
            Number of elements in feature vector for 0: -genes, for 1: -drugs.
        self.nonzero_feat : Dict[int, int]
            Number of all features for 0: -gene and 1: -drug nodes.
            e.g., it is in format 0: num of genes in graph, 1: num of drugs.
        self.feat : Dict[int, sp.csr_matrix]
            From edge type (0 = gene, 1 = drug) to feature matrix.
            Row in feature matrix = embedding of one node.

        """
        # One-hot for genes: identity matrix, one row per gene node.
        n_genes = self.gene_net.number_of_nodes()
        gene_feat = sp.identity(n_genes)
        gene_nonzero_feat, gene_num_feat = gene_feat.shape
        # Convert to the (coords, values, shape) tuple format used downstream.
        gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

        # Create sparse matrix with rows -- drugs features.
        # Drug feature -- binary vector with length = num of mono se.
        # feature[i] = 1 <=> drug has ith mono se
        drug_feat = create_adj_matrix(
            a_item2b_item=self.stitch2se,
            ordered_list_a_item=self.ordered_list_of_drugs,
            ordered_list_b_item=self.ordered_list_of_se_mono)
        # Check if some drug has zero embedding (i.e. it has no frequent se).
        drugs_zero_features = np.array(
            self.ordered_list_of_drugs)[drug_feat.getnnz(axis=1) == 0]
        print(f'Length of drugs features vectors: {drug_feat.shape[1]}')
        print(f'Number of unique vectors: '
              f'{np.unique(drug_feat.toarray(), axis=0).shape[0]}')
        if len(drugs_zero_features) > 0:
            # Warn instead of asserting so preprocessing can still proceed.
            print('Warning! All drugs should have nonzero embeddings! ')
            print(f'There are {len(drugs_zero_features)} zero embeddings')
            print(f'Bad drugs: {drugs_zero_features}')
        drug_nonzero_feat, drug_num_feat = drug_feat.shape
        drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
        self.num_feat = {
            0: gene_num_feat,
            1: drug_num_feat,
        }
        self.nonzero_feat = {
            0: gene_nonzero_feat,
            1: drug_nonzero_feat,
        }
        self.feat = {
            0: gene_feat,
            1: drug_feat,
        }
Example #2
0
def network_edge_threshold(network_adj, threshold):
    """Keep only the edges whose weight is strictly above ``threshold``.

    Parameters
    ----------
    network_adj : sparse matrix
        Weighted adjacency matrix of the network.
    threshold : float
        Edges with value <= threshold are dropped.

    Returns
    -------
    sp.csr_matrix
        New adjacency matrix of the same shape containing only the
        preserved edges.
    """
    coords, values, shape = preprocessing.sparse_to_tuple(network_adj)
    # Boolean mask over the non-zero entries: True where the edge survives.
    keep = values > threshold
    rows = coords[keep, 0]
    cols = coords[keep, 1]
    return sp.csr_matrix((values[keep], (rows, cols)), shape=shape)
Example #3
0
    def _nodes_features(self) -> None:
        """
        Create self.num_feat, self.nonzero_feat, self.feat.

        Notes
        -----
        One-hot encoding as genes and drugs features
        (separately one-hot for different nodes types).
        self.num_feat : Dict[int, int]
            Number of elements in feature vector for 0: -genes, for 1: -drugs.
        self.nonzero_feat : Dict[int, int]
            Number of all features for 0: -gene and 1: -drug nodes.
        self.feat : Dict[int, sp.csr_matrix]
            From edge type (0 = gene, 1 = drug) to feature matrix.
            Row in feature matrix = embedding of one node.

        """
        # featureless (genes): identity matrix => one one-hot row per gene
        gene_feat = sp.identity(self.n_genes)
        gene_nonzero_feat, gene_num_feat = gene_feat.shape
        # Convert to the (coords, values, shape) tuple format used downstream.
        gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

        # features (drugs): identity matrix => one one-hot row per drug
        drug_feat = sp.identity(self.n_drugs)
        drug_nonzero_feat, drug_num_feat = drug_feat.shape
        drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

        # data representation: feature sizes and matrices keyed by node type
        # (0 = gene, 1 = drug)
        self.num_feat = {
            0: gene_num_feat,
            1: drug_num_feat,
        }
        self.nonzero_feat = {
            0: gene_nonzero_feat,
            1: drug_nonzero_feat,
        }
        self.feat = {
            0: gene_feat,
            1: drug_feat,
        }
Example #4
0
# data representation: adjacency matrices keyed by (source type, target type),
# where 0 = gene and 1 = drug.  (1,1) carries one matrix per side-effect type.
adj_mats_orig = {
    (0,0): [gene_adj],
    (0,1): [gene_drug_adj],
    (1,0): [drug_gene_adj],
    (1,1): drug_drug_adj_list,
}
# Node-degree vectors per node type, one vector per adjacency matrix above.
degrees = {
    0: [gene_degrees],
    1: drug_degrees_list,
}

# featureless (genes): one-hot identity features, one row per gene
gene_feat = sp.identity(n_genes)
gene_nonzero_feat, gene_num_feat = gene_feat.shape
# Convert to the (coords, values, shape) tuple format used downstream.
gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

# features (drugs): one-hot identity features, one row per drug
drug_feat = sp.identity(n_drugs)
drug_nonzero_feat, drug_num_feat = drug_feat.shape
drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

# data representation: feature-vector length per node type
num_feat = {
    0: gene_num_feat,
    1: drug_num_feat,
}
# Number of non-zero feature entries per node type (identity => one per node).
nonzero_feat = {
    0: gene_nonzero_feat,
    1: drug_nonzero_feat,
}
Example #5
0
def main_execution():
    """Build a drug-drug graph from DrugBank combos and train Decagon.

    Loads the drug-combination/side-effect file, builds one symmetric
    drug-drug adjacency matrix per side-effect type plus one-hot drug
    features, then constructs the minibatch iterator, model and optimizer,
    runs the training loop (printing validation metrics periodically) and
    finally prints test metrics for every edge type.
    """
    combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
        fichier='polypharmacy/drugbank/drugbank-combo.csv')
    # All distinct drug ids appearing in any combination.
    nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(nodes)
    # One relation type per distinct side effect.
    relation_types = set([r for r in combo_to_side_effects.values()])
    n_drugdrug_rel_types = len(relation_types)
    # Map each drug id to its row/column index in the adjacency matrices.
    drugs_to_positions_in_matrices_dict = {
        node: i
        for i, node in enumerate(nodes)
    }

    drug_drug_adj_list = []  # adjacency matrix of each drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            # NOTE(review): only the '{d1}_{d2}' key ordering is looked up;
            # if the loader can also store '{d2}_{d1}' some pairs may be
            # missed -- confirm against load_drug_bank_combo_side_effect_file.
            if temp_cle in combo_to_side_effects.keys():
                if combo_to_side_effects[temp_cle] == el:
                    # The pair really shows this side effect: record the
                    # interaction symmetrically in the matrix.
                    mat[drugs_to_positions_in_matrices_dict[d1], drugs_to_positions_in_matrices_dict[d2]] = \
                        mat[drugs_to_positions_in_matrices_dict[d2], drugs_to_positions_in_matrices_dict[d1]] = 1.
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    # Node-degree vector for each drug-drug adjacency matrix.
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    # Single node type (0 = drug); each relation is listed together with its
    # transpose so both edge directions are represented.
    adj_mats_orig = {
        (0, 0):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: drug_degrees_list + drug_degrees_list,
    }

    # features (drugs): one-hot identity features, one row per drug
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation (single node type 0 = drug)
    num_feat = {
        0: drug_num_feat,
    }
    nonzero_feat = {
        0: drug_nonzero_feat,
    }
    feat = {
        0: drug_feat,
    }

    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'dedicom',
    }

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)
    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    print("Create minibatch iterator")
    minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                      feat=feat,
                                      edge_types=edge_types,
                                      batch_size=FLAGS.batch_size,
                                      val_test_size=val_test_size)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)

    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}

    ###########################################################
    #
    # Train model
    #
    ###########################################################

    print("Train model")
    for epoch in range(FLAGS.epochs):

        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)

            t = time.time()

            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]

            # Periodically report validation metrics for the edge type of
            # the current minibatch.
            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])

                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost), "val_roc=",
                      "{:.5f}".format(val_auc), "val_auprc=",
                      "{:.5f}".format(val_auprc), "val_apk=",
                      "{:.5f}".format(val_apk), "time=",
                      "{:.5f}".format(time.time() - t))

            itr += 1

    print("Optimization finished!")

    # Final evaluation on held-out test edges, one pass per edge type.
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
Example #6
0
        (1, 0): [drug_proten_interactions],
        (1, 1): [Drug_Drug_sim_adj, Drug_Drug_sim_adj],  #type3
    }

    protein_degrees = np.array(Protein_Protein_sim_adj.sum(axis=0)).squeeze()
    drug_degrees = np.array(Drug_Drug_sim_adj.sum(axis=0)).squeeze()

    degrees = {
        0: [protein_degrees, protein_degrees],
        1: [drug_degrees, drug_degrees],
    }

    # # featureless
    protein_feat = sp.identity(Protein_Protein_sim_adj.shape[0])
    protein_nonzero_feat, protein_num_feat = protein_feat.shape
    protein_feat = preprocessing.sparse_to_tuple(protein_feat.tocoo())

    drug_feat = sp.identity(Drug_Drug_sim_adj.shape[0])
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    num_feat = {
        0: protein_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: protein_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: protein_feat,
def main_execution(combo_file='./polypharmacy/bio-decagon-combo.csv',
                   targets_file='./polypharmacy/bio-decagon-targets.csv',
                   genes_genes_file='./polypharmacy/bio-decagon-ppi.csv',
                   new_train_test_split=False):
    """Build the gene/drug heterogeneous graph and train a Decagon model.

    Loads the combo (drug pair -> side effects), drug -> targets and
    gene-gene interaction files, builds the adjacency matrices and one-hot
    node features, then creates the minibatch iterator, model and optimizer
    and runs the training loop, printing validation metrics during training
    and per-edge-type test metrics at the end.

    Parameters
    ----------
    combo_file : str
        Drug-combination / side-effect file.  Decagon vs DrugBank format is
        detected from the presence of 'decagon' in the file name.
    targets_file : str
        Drug -> target-genes file.
    genes_genes_file : str
        Gene-gene (PPI) interactions file.
    new_train_test_split : bool
        When True, also build separate train/test/valid adjacency matrices
        from a precomputed combination split and use the split-aware
        minibatch iterator.
    """
    print('Load Combo to Side Effects')
    if combo_file.find('decagon') != -1:
        combo_to_drugs_ids, combo_to_side_effects, combo_to_side_effects_names, side_effects_ids_to_names = \
            load_decagon_combo_side_effect_file(fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id = load_decagon_file_targets_id(
            fichier=targets_file)
    else:
        combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
            fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id, drugs_id_to_drugs_name = load_file_targets_id(
            fichier=targets_file)

    print('Load genes to genes (targets) interactions net')
    genes_genes_net, genes_node_to_idx = load_genes_genes_interactions(
        fichier=genes_genes_file)

    print('Build genes-genes adjacency matrix')
    genes_adj = nx.adjacency_matrix(genes_genes_net)
    genes_degrees = np.array(genes_adj.sum(axis=0)).squeeze()

    if new_train_test_split:
        print('Load the new train test validation split')
        combo_to_drugs_ids_train, combo_to_drugs_ids_test, combo_to_drugs_ids_valid = train_test_valid_split_3(
        )
        drug_nodes_train = set(
            [u for e in combo_to_drugs_ids_train.values() for u in e])
        drug_nodes_test = set(
            [u for e in combo_to_drugs_ids_test.values() for u in e])
        drug_nodes_valid = set(
            [u for e in combo_to_drugs_ids_valid.values() for u in e])

    print('Build drugs-drugs matrix representation')
    drug_nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(drug_nodes)
    relation_types = set(
        [r for se in combo_to_side_effects.values() for r in se])
    drugs_nodes_to_idx = {node: i for i, node in enumerate(drug_nodes)}

    print('Build general drugs-drugs matrix representation')
    drug_drug_adj_list = []  # adjacency matrix of each drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(drug_nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if el in combo_to_side_effects[temp_cle]:
                    # The side effect appears at least once in the pair's
                    # list: record the interaction symmetrically.
                    mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                        mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    if new_train_test_split:
        print('Build train drugs-drugs matrix representation')
        drug_drug_adj_list_train = [
        ]  # adjacency matrix of each drug-drug relation (train split)
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_train), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # Record the interaction symmetrically.
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_train.append(sp.csr_matrix(mat))
        drug_degrees_list_train = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_train
        ]

        print('Build test drugs-drugs matrix representation')
        drug_drug_adj_list_test = []  # adjacency per relation (test split)
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_test), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # Record the interaction symmetrically.
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_test.append(sp.csr_matrix(mat))
        drug_degrees_list_test = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_test
        ]

        print('Build valid drugs-drugs matrix representation')
        drug_drug_adj_list_valid = [
        ]  # adjacency per relation (valid split)
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_valid), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # Record the interaction symmetrically.
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_valid.append(sp.csr_matrix(mat))
        drug_degrees_list_valid = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_valid
        ]

    print('Build general genes-drugs matrix representation')
    genes_nodes = set([gene_node for gene_node in genes_node_to_idx.keys()])
    n_genes = len(genes_nodes)
    mat = np.zeros((n_genes, n_drugs))
    for drug in drug_nodes:
        if drug in drugs_id_to_targets_id.keys():
            for target in drugs_id_to_targets_id[drug]:
                if target in genes_node_to_idx.keys():
                    mat[genes_node_to_idx[target],
                        drugs_nodes_to_idx[drug]] = 1.
    genes_drugs_adj = sp.csr_matrix(mat)
    drugs_genes_adj = genes_drugs_adj.transpose(copy=True)

    if new_train_test_split:
        print('Build train genes-drugs matrix representation')
        # BUG FIX: start each split from a fresh matrix.  Previously `mat`
        # was reused from the general build (and then from split to split),
        # so every split matrix silently accumulated the entries of the
        # previous builds.
        mat = np.zeros((n_genes, n_drugs))
        for drug in drug_nodes_train:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_train = sp.csr_matrix(mat)
        drugs_genes_adj_train = genes_drugs_adj_train.transpose(copy=True)

        print('Build test genes-drugs matrix representation')
        # Fresh matrix for this split (see BUG FIX note above).
        mat = np.zeros((n_genes, n_drugs))
        for drug in drug_nodes_test:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_test = sp.csr_matrix(mat)
        drugs_genes_adj_test = genes_drugs_adj_test.transpose(copy=True)

        print('Build valid genes-drugs matrix representation')
        # Fresh matrix for this split (see BUG FIX note above).
        mat = np.zeros((n_genes, n_drugs))
        for drug in drug_nodes_valid:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_valid = sp.csr_matrix(mat)
        drugs_genes_adj_valid = genes_drugs_adj_valid.transpose(copy=True)

    print('Build general Adjacency matrix data representation')
    # Keys are (source type, target type) with 0 = gene, 1 = drug; each
    # drug-drug relation appears together with its transpose.
    adj_mats_orig = {
        (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
        (0, 1): [genes_drugs_adj],
        (1, 0): [drugs_genes_adj],
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }

    if new_train_test_split:
        print('Build train Adjacency matrix data representation')
        adj_mats_orig_train = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_train],
            (1, 0): [drugs_genes_adj_train],
            (1, 1):
            drug_drug_adj_list_train +
            [x.transpose(copy=True) for x in drug_drug_adj_list_train],
        }

        print('Build test Adjacency matrix data representation')
        adj_mats_orig_test = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_test],
            (1, 0): [drugs_genes_adj_test],
            (1, 1):
            drug_drug_adj_list_test +
            [x.transpose(copy=True) for x in drug_drug_adj_list_test],
        }

        print('Build valid Adjacency matrix data representation')
        adj_mats_orig_valid = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_valid],
            (1, 0): [drugs_genes_adj_valid],
            (1, 1):
            drug_drug_adj_list_valid +
            [x.transpose(copy=True) for x in drug_drug_adj_list_valid],
        }

    # Node-degree vectors per node type, one per adjacency matrix above.
    degrees = {
        0: [genes_degrees, genes_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    print('featureless (genes)')
    # One-hot identity features, one row per gene.
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    print('features (drugs)')
    # One-hot identity features, one row per drug.
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    print('Features data representation')
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }

    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    if new_train_test_split:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIteratorNewSplit(
            adj_mats=adj_mats_orig,
            adj_mats_train=adj_mats_orig_train,
            adj_mats_test=adj_mats_orig_test,
            adj_mats_valid=adj_mats_orig_valid,
            feat=feat,
            edge_types=edge_types,
            batch_size=FLAGS.batch_size,
            val_test_size=val_test_size)
    else:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                          feat=feat,
                                          edge_types=edge_types,
                                          batch_size=FLAGS.batch_size,
                                          val_test_size=val_test_size)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)

    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}

    ###########################################################
    #
    # Train model
    #
    ###########################################################

    print("Train model")
    for epoch in range(FLAGS.epochs):

        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)

            t = time.time()

            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]

            # Periodically report validation metrics for the edge type of
            # the current minibatch.
            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])

                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost), "val_roc=",
                      "{:.5f}".format(val_auc), "val_auprc=",
                      "{:.5f}".format(val_auprc), "val_apk=",
                      "{:.5f}".format(val_apk), "time=",
                      "{:.5f}".format(time.time() - t))

            itr += 1

    print("Optimization finished!")

    # Final evaluation on held-out test edges, one pass per edge type.
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
Example #8
0
}
# Node-degree vectors per node type (0: persons, 1: companies,
# 2: the single bankruptcy node).
# NOTE(review): list 1 is doubled and list 2 repeats the same value --
# presumably one entry per relation direction; confirm against the
# adjacency construction (not visible here).
degrees = {
    0:
    pers_comp_degrees_list,
    1:
    comp_degrees_list + comp_degrees_list,
    2: [
        np.array([np.sum(comp_bankr_adj[0].T)]),
        np.array([np.sum(comp_bankr_adj[0].T)])
    ]
}

# features (Person): one-hot identity features, one row per person
pers_feat = sp.identity(n_persons)
pers_nonzero_feat, pers_num_feat = pers_feat.shape
# Convert to the (coords, values, shape) tuple format used downstream.
pers_feat = preprocessing.sparse_to_tuple(pers_feat.tocoo())

# features (Companies): one-hot identity features, one row per company
comp_feat = sp.identity(n_companies)
comp_nonzero_feat, comp_num_feat = comp_feat.shape
comp_feat = preprocessing.sparse_to_tuple(comp_feat.tocoo())

# features (Bankruptcy): a single artificial node, 1x1 identity feature
n_bankruptcy = 1
banrp_feat = sp.identity(n_bankruptcy)
banrp_nonzero_feat, banrp_num_feat = banrp_feat.shape
banrp_feat = preprocessing.sparse_to_tuple(banrp_feat.tocoo())

# data representation: feature-vector length per node type
num_feat = {0: pers_num_feat, 1: comp_num_feat, 2: banrp_num_feat}
nonzero_feat = {
Example #9
0
ppi_mat = ppi_adj.todense()  # High memory requirement for big matrices
# Calculate algorithmic complexity with the Block Decomposition Method
# (2-D recursive partition).  PerturbationExperiment presumably scores each
# matrix entry by the change in BDM when it is perturbed -- verify against
# the pybdm documentation.
bdm = BDM(ndim=2, partition=PartitionRecursive)
ppi_per = PerturbationExperiment(bdm, metric='bdm', bipartite_network=False)
ppi_per.set_data(np.array(ppi_mat))
edge_complexity = ppi_per.run()
# Reshape the flat per-entry scores to the adj matrix shape
complexity_mat = edge_complexity.reshape(np.shape(ppi_adj))
#============================= PRELIMINARY SAVING OF BDM ================================ #
# Persist the raw complexity scores so the expensive BDM run can be reused.
out_file_bdm = 'data_structures/BDM/EDGES_PPI_' + sim_type + '_genes_' + str(
    old_genes)
print('Output BDM file: ', out_file_bdm, '\n')
with open(out_file_bdm, 'wb') as f:
    pickle.dump(edge_complexity, f)
# =============================== REMOVING EDGES ======================================== #
coords, _, _ = sparse_to_tuple(ppi_adj)
# Take the upper triangular coordinates (assumes a symmetric adjacency so
# each undirected edge is considered once -- TODO confirm)
upper_coords = coords[(coords[:, 1] - coords[:, 0] > 0).nonzero()]
# Select abs of the complexity of selected entries
true_cmplx = np.abs(complexity_mat[upper_coords[:, 0],
                                   upper_coords[:, 1]]).squeeze()
# Give an index to the edge
pair = np.array(list(enumerate(true_cmplx)))
# Sort from greatest to lowest complexity
sorted_pair = pair[pair[:, 1].argsort()][::-1]
# Select sorted indices
idx = sorted_pair[:, 0].astype(int)
# Select a threshold entry according to the cut fraction
threshold = np.floor(len(idx) * (1 - cut_frac)).astype(int)
# Select indices above threshold, i.e. keep the (1 - cut_frac) fraction of
# most complex edges
idx = idx[:threshold]
Example #10
0
if BDM:
    # Protein features: concatenate the BDM complexity columns computed for
    # the DTI and PPI networks.
    prot_feat = np.hstack([to_add_bdm_genes_dti, to_add_bdm_ppi])
    # Drug features
    if DSE:
        # Keep the existing (sparse) side-effect features and append the
        # BDM columns.
        drug_feat = np.asarray(
            np.hstack(
                [drug_feat.todense(), to_add_bdm_drugs_dti, to_add_bdm_ddi]))
    else:
        # BDM columns only.
        drug_feat = np.hstack([to_add_bdm_drugs_dti, to_add_bdm_ddi])
print('Drug feature matrix shape: ', np.shape(drug_feat))
print('Protein feature matrix shape: ', np.shape(prot_feat))

# Drug features: vector length, non-zero count, and sparse tuple form.
drug_num_feat = drug_feat.shape[1]
drug_nonzero_feat = len(np.nonzero(drug_feat)[0])
drug_feat = sparse_to_tuple(sp.coo_matrix(drug_feat))
# Protein features: same representation.
gene_num_feat = prot_feat.shape[1]
gene_nonzero_feat = len(np.nonzero(prot_feat)[0])
gene_feat = sparse_to_tuple(sp.coo_matrix(prot_feat))
# ============================================================================================= #
# CREATION OF DECAGON DICTIONARIES
# Keys are (source type, target type) with 0 = gene/protein, 1 = drug.
adj_mats_orig = {
    (0, 0): [ppi_adj],
    (0, 1): [dti_adj],
    (1, 0): [dti_adj.transpose(copy=True)],
    (1, 1): ddi_adj_list,
}
degrees = {0: [ppi_degrees], 1: ddi_degrees_list}
edge_type2dim = {
    k: [adj.shape for adj in adjs]
Example #11
0
    def __init__(self, et):
        """Load pre-built Decagon graph data from ``data_decagon/``.

        Loads the gene-gene, gene-drug and per-side-effect drug-drug
        adjacency matrices plus the sparse drug feature matrix, then builds
        the dictionaries (adjacency matrices, degrees, features, edge-type
        bookkeeping) consumed by the model.

        :param et: ordered collection of drug-drug edge-type ids; only the
            first half (``et[:len(et) // 2]``) is loaded from disk -- each
            loaded matrix is later paired with its transpose to represent
            the inverse relation.
        """
        # load data
        print("loading...")

        # temp = '/home/acq18hx/decagon/'
        temp = './'
        with open(temp + 'data_decagon/graph_num_info.pkl', 'rb') as f:
            [num_gene, num_drug, num_edge_type,
             num_drug_additional_feature] = pickle.load(f)

        # gene-gene
        gene_adj = sp.load_npz(temp + "data_decagon/gene-sparse-adj.npz")
        print("load gene_gene finished!")

        # gene-drug
        gene_drug_adj = sp.load_npz(temp +
                                    "data_decagon/gene-drug-sparse-adj.npz")
        drug_gene_adj = sp.load_npz(temp +
                                    "data_decagon/drug-gene-sparse-adj.npz")
        print("load gene_drug finished!")

        # drug-drug: one sparse adjacency matrix per side-effect edge type
        drug_drug_adj_list = []
        l_et = int(len(et) / 2)
        for i in et[:l_et]:
            drug_drug_adj_list.append(
                sp.load_npz("".join([
                    temp + "data_decagon/drug-sparse-adj/type_",
                    str(i), ".npz"
                ])))

        print("load drug_drug finished!")

        drug_feat_sparse = sp.load_npz(temp +
                                       "data_decagon/drug-feature-sparse.npz")
        print("load drug_feature finished!")

        # -------------------------- gene feature --------------------------
        # featureless (genes): one-hot identity features, so the number of
        # nonzero entries equals the number of features (= num_gene).
        gene_feat = sp.identity(num_gene)
        gene_nonzero_feat, gene_num_feat = gene_feat.shape
        gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

        # drug vectors with additional features (single side effect).
        # num_feat must be the feature-vector length (number of columns) and
        # nonzero_feat the number of nonzero entries, mirroring the gene
        # case above.  The previous code assigned shape[1] to nonzero_feat
        # and the count of non-empty columns to num_feat, i.e. it had the
        # two quantities swapped.
        drug_num_feat = drug_feat_sparse.shape[1]
        drug_nonzero_feat = drug_feat_sparse.count_nonzero()
        drug_feat = preprocessing.sparse_to_tuple(drug_feat_sparse.tocoo())

        # data representation: adjacency matrices per (node type, node type)
        # pair; every relation also carries its transpose as the inverse edge.
        self.adj_mats_orig = {
            (0, 0): [gene_adj, gene_adj.transpose(copy=True)],
            (0, 1): [gene_drug_adj],
            (1, 0): [drug_gene_adj],
            (1, 1):
            drug_drug_adj_list +
            [x.transpose(copy=True) for x in drug_drug_adj_list],
        }

        gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()
        drug_degrees_list = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list
        ]
        # Degree vectors, parallel to the adj_mats_orig lists above.
        self.degrees = {
            0: [gene_degrees, gene_degrees],
            1: drug_degrees_list + drug_degrees_list,
        }

        # feature bookkeeping: node type 0 = genes, 1 = drugs
        self.num_feat = {
            0: gene_num_feat,
            1: drug_num_feat,
        }
        self.num_nonzero_feat = {
            0: gene_nonzero_feat,
            1: drug_nonzero_feat,
        }
        self.feat = {
            0: gene_feat,
            1: drug_feat,
        }

        self.edge_type2dim = {
            k: [adj.shape for adj in adjs]
            for k, adjs in self.adj_mats_orig.items()
        }
        self.edge_type2decoder = {
            (0, 0): 'bilinear',
            (0, 1): 'bilinear',
            (1, 0): 'bilinear',
            (1, 1): 'dedicom',
        }

        self.edge_types = {k: len(v) for k, v in self.adj_mats_orig.items()}
        self.num_edge_types = sum(self.edge_types.values())
        print("Edge types:", "%d" % self.num_edge_types)
        print("======================================================")
Example #12
0
    def build_original(self):
        """Build all Decagon data structures from the raw bio-decagon CSVs.

        Reads the protein-protein, drug-drug(-side-effect), drug-protein
        and drug-mono-side-effect association files, remaps every external
        id to a dense 0-based index, then fills the adjacency matrices,
        degree vectors, node feature matrices and edge-type bookkeeping
        consumed by the model.
        """
        pp_f = "data_decagon/PP-Decagon_ppi.csv"
        dd_f = "data_decagon/bio-decagon-combo.csv"
        dp_f = "data_decagon/bio-decagon-targets.csv"
        ds_f = "data_decagon/bio-decagon-mono.csv"
        p_set, d_set, combo_set, mono_set = set(), set(), set(), set()
        pp_list, ddt_list, dp_list, ds_list = [], [], [], []

        a, b, c = 0, 0, 0  # temp variables

        # 1. Protein-Protein Association Network
        with open(pp_f, 'r') as f:
            ppi = csv.reader(f)
            next(ppi)  # skip header row
            for [g1, g2] in ppi:
                a, b = int(g1), int(g2)
                p_set.add(a)
                p_set.add(b)
                pp_list.append((a, b))
        # 2. Drug-Drug Association Network
        # 'CID...'/'STITCH...' style ids are reduced to their numeric tail.
        with open(dd_f, "r") as f:
            ppi = csv.reader(f)
            next(ppi)
            for [d1, d2, t, n] in ppi:
                a, b, c = int(t.split('C')[-1]), int(d1.split('D')[-1]), int(
                    d2.split('D')[-1])
                combo_set.add(a)
                d_set.add(b)
                d_set.add(c)
                ddt_list.append((b, c, a))
        # 3. Drug-Protein Association Network
        with open(dp_f, "r") as f:
            ppi = csv.reader(f)
            next(ppi)
            for [d, p] in ppi:
                a, b = int(d.split('D')[-1]), int(p)
                d_set.add(a)
                p_set.add(b)
                dp_list.append((a, b))
        # 4. Drug-SideEffect Association Network
        with open(ds_f, "r") as f:
            ppi = csv.reader(f)
            next(ppi)
            for [d, e, n] in ppi:
                a, b = int(e.split('C')[-1]), int(d.split('D')[-1])
                mono_set.add(a)
                d_set.add(b)
                ds_list.append((b, a))

        num_gene = len(p_set)
        num_drug = len(d_set)
        num_edge_type = len(combo_set)
        num_drug_additional_feature = len(mono_set)

        # -------------------------- gene adj --------------------------
        # 1 x max_id sparse row vectors mapping an old external id (column
        # position) to its new dense index (stored value).
        gene_to_old = list(p_set)
        gene_to_new = sp.csr_matrix(
            (range(num_gene), ([0] * num_gene, gene_to_old)))

        drug_to_old = list(d_set)
        drug_to_new = sp.csr_matrix(
            (range(num_drug), ([0] * num_drug, drug_to_old)))

        edge_type_to_old = list(combo_set)
        edge_type_to_new = sp.csr_matrix(
            (range(num_edge_type), ([0] * num_edge_type, edge_type_to_old)))

        side_effect_to_old = list(mono_set)
        side_effect_to_new = sp.csr_matrix(
            (range(num_drug_additional_feature),
             ([0] * num_drug_additional_feature, side_effect_to_old)))

        r, c = [], []
        array_length = len(pp_list)
        # -------------------------- gene-gene adj --------------------------
        for i in range(array_length):
            r.append(gene_to_new[0, pp_list[i][0]])
            c.append(gene_to_new[0, pp_list[i][1]])
        gene_adj = sp.csr_matrix(([1] * array_length, (r, c)),
                                 shape=(num_gene, num_gene))
        gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()

        r, c = [], []
        array_length = len(dp_list)
        # -------------------------- drug(row)-gene(col) adj --------------------------
        for i in range(array_length):
            r.append(drug_to_new[0, dp_list[i][0]])
            c.append(gene_to_new[0, dp_list[i][1]])
        drug_gene_adj = sp.csr_matrix(([1] * array_length, (r, c)),
                                      shape=(num_drug, num_gene))
        gene_drug_adj = drug_gene_adj.transpose(copy=True)

        # -------------------------- drug-drug adj list --------------------------
        # r maps each remapped edge type to a ([rows], [cols]) pair.
        r = {}
        array_length = len(ddt_list)
        for i in range(array_length):
            c = edge_type_to_new[0, ddt_list[i][2]]
            if c not in r:
                r[c] = [drug_to_new[0, ddt_list[i][0]]
                        ], [drug_to_new[0, ddt_list[i][1]]]
            else:
                r[c][0].append(drug_to_new[0, ddt_list[i][0]])
                r[c][1].append(drug_to_new[0, ddt_list[i][1]])
        drug_drug_adj_list = []
        for i in range(num_edge_type):
            drug_drug_adj_list.append(
                sp.csr_matrix(([1] * len(r[i][0]), (r[i][0], r[i][1])),
                              shape=(num_drug, num_drug)))
        drug_degrees_list = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list
        ]

        # -------------------------- gene feature --------------------------
        # featureless (genes): one-hot identity features, so the number of
        # nonzero entries equals the number of features (= num_gene).
        gene_feat = sp.identity(num_gene)
        gene_nonzero_feat, gene_num_feat = gene_feat.shape
        gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

        # drug vectors with additional features (single side effect):
        # an identity block (columns 0..num_drug-1) followed by one binary
        # column per mono side effect.
        r, c = list(range(num_drug)), list(range(num_drug))
        for (a, b) in ds_list:
            r.append(drug_to_new[0, a])
            c.append(side_effect_to_new[0, b] + num_drug)
        array_length = num_drug + len(ds_list)
        drug_feat = sp.csr_matrix(
            ([1] * array_length, (r, c)),
            shape=(num_drug, num_drug + num_drug_additional_feature))

        # num_feat must be the feature-vector length (number of columns) and
        # nonzero_feat the number of nonzero entries, mirroring the gene
        # case above.  The previous code assigned shape[1] to nonzero_feat
        # and the count of non-empty columns to num_feat, i.e. it had the
        # two quantities swapped.
        drug_num_feat = drug_feat.shape[1]
        drug_nonzero_feat = drug_feat.count_nonzero()
        drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

        # data representation: adjacency matrices per (node type, node type)
        # pair; every relation also carries its transpose as the inverse edge.
        self.adj_mats_orig = {
            (0, 0): [gene_adj, gene_adj.transpose(copy=True)],
            (0, 1): [gene_drug_adj],
            (1, 0): [drug_gene_adj],
            (1, 1):
            drug_drug_adj_list +
            [x.transpose(copy=True) for x in drug_drug_adj_list],
        }
        self.degrees = {
            0: [gene_degrees, gene_degrees],
            1: drug_degrees_list + drug_degrees_list,
        }

        # feature bookkeeping: node type 0 = genes, 1 = drugs
        self.num_feat = {
            0: gene_num_feat,
            1: drug_num_feat,
        }
        self.num_nonzero_feat = {
            0: gene_nonzero_feat,
            1: drug_nonzero_feat,
        }
        self.feat = {
            0: gene_feat,
            1: drug_feat,
        }

        self.edge_type2dim = {
            k: [adj.shape for adj in adjs]
            for k, adjs in self.adj_mats_orig.items()
        }
        self.edge_type2decoder = {
            (0, 0): 'bilinear',
            (0, 1): 'bilinear',
            (1, 0): 'bilinear',
            (1, 1): 'dedicom',
        }

        self.edge_types = {k: len(v) for k, v in self.adj_mats_orig.items()}
        self.num_edge_types = sum(self.edge_types.values())
        print("Edge types:", "%d" % self.num_edge_types)
Example #13
0
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--decagon_data_file_directory",
        type=str,
        help=
        "path to directory where bio-decagon-*.csv files are located, with trailing slash. "
        "Default is current directory",
        default='./')
    parser.add_argument(
        "--saved_files_directory",
        type=str,
        help=
        "path to directory where saved files files are located, with trailing slash. "
        "Default is current directory. If a decagon_model.ckpt* exists in this directory, it will "
        "be loaded and evaluated, and no training will be done.",
        default='./')
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true",
                        default=False)
    args = parser.parse_args(args)

    decagon_data_file_directory = args.decagon_data_file_directory
    verbose = args.verbose
    script_start_time = datetime.now()

    # create pre-processed file that only has side effect with >=500 occurrences
    all_combos_df = pd.read_csv('%sbio-decagon-combo.csv' %
                                decagon_data_file_directory)
    side_effects_500 = all_combos_df["Polypharmacy Side Effect"].value_counts()
    side_effects_500 = side_effects_500[side_effects_500 >= 500].index.tolist()
    all_combos_df = all_combos_df[
        all_combos_df["Polypharmacy Side Effect"].isin(side_effects_500)]
    all_combos_df.to_csv('%sbio-decagon-combo-over500only.csv' %
                         decagon_data_file_directory,
                         index=False)

    # use pre=processed file that only contains the most common side effects (those with >= 500 drug pairs)
    drug_drug_net, combo2stitch, combo2se, se2name = load_combo_se(
        fname=('%sbio-decagon-combo-over500only.csv' %
               decagon_data_file_directory))
    # net is a networkx graph with genes(proteins) as nodes and protein-protein-interactions as edges
    # node2idx maps node id to node index
    gene_net, node2idx = load_ppi(fname=('%sbio-decagon-ppi.csv' %
                                         decagon_data_file_directory))
    # stitch2se maps (individual) stitch ids to a list of side effect ids
    # se2name_mono maps side effect ids that occur in the mono file to side effect names (shorter than se2name)
    stitch2se, se2name_mono = load_mono_se(fname=('%sbio-decagon-mono.csv' %
                                                  decagon_data_file_directory))
    # stitch2proteins maps stitch ids (drug) to protein (gene) ids
    drug_gene_net, stitch2proteins = load_targets(
        fname=('%sbio-decagon-targets-all.csv' % decagon_data_file_directory))
    # se2class maps side effect id to class name

    # this was 0.05 in the original code, but the paper says that 10% each are used for testing and validation
    val_test_size = 0.1
    n_genes = gene_net.number_of_nodes()
    gene_adj = nx.adjacency_matrix(gene_net)
    gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()

    ordered_list_of_drugs = list(drug_drug_net.nodes.keys())
    ordered_list_of_side_effects = list(se2name.keys())
    ordered_list_of_proteins = list(gene_net.nodes.keys())

    n_drugs = len(ordered_list_of_drugs)

    drug_gene_adj = sp.lil_matrix(np.zeros((n_drugs, n_genes)))
    for drug in stitch2proteins:
        for protein in stitch2proteins[drug]:
            # there are quite a few drugs in here that aren't in our list of 645,
            # and proteins that aren't in our list of 19081
            if drug in ordered_list_of_drugs and protein in ordered_list_of_proteins:
                drug_index = ordered_list_of_drugs.index(drug)
                gene_index = ordered_list_of_proteins.index(protein)
                drug_gene_adj[drug_index, gene_index] = 1

    drug_gene_adj = drug_gene_adj.tocsr()

    # needs to be drug vs. gene matrix (645x19081)
    gene_drug_adj = drug_gene_adj.transpose(copy=True)

    drug_drug_adj_list = []
    if not os.path.isfile("adjacency_matrices/sparse_matrix0000.npz"):
        # pre-initialize all the matrices
        print("Initializing drug-drug adjacency matrix list")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))

        n = len(ordered_list_of_side_effects)
        for i in range(n):
            drug_drug_adj_list.append(
                sp.lil_matrix(np.zeros((n_drugs, n_drugs))))
            if verbose:
                print("%s percent done" % str(100.0 * i / n))
        print("Done initializing at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        combo_finish_time = start_time
        print("Creating adjacency matrices for side effects")
        print("Starting at %s" % str(start_time))
        combo_count = len(combo2se)
        combo_counter = 0

        # for side_effect_type in ordered_list_of_side_effects:
        # for drug1, drug2 in combinations(list(range(n_drugs)), 2):

        for combo in combo2se.keys():
            side_effect_list = combo2se[combo]
            for present_side_effect in side_effect_list:
                # find the matrix we need to update
                side_effect_number = ordered_list_of_side_effects.index(
                    present_side_effect)
                # find the drugs for which we need to make the update
                drug_tuple = combo2stitch[combo]
                drug1_index = ordered_list_of_drugs.index(drug_tuple[0])
                drug2_index = ordered_list_of_drugs.index(drug_tuple[1])
                # update
                drug_drug_adj_list[side_effect_number][drug1_index,
                                                       drug2_index] = 1

            if verbose and combo_counter % 1000 == 0:
                print(
                    "Finished combo %s after %s . %d percent of combos done" %
                    (combo_counter, str(combo_finish_time - start_time),
                     (100.0 * combo_counter / combo_count)))
            combo_finish_time = datetime.now()
            combo_counter = combo_counter + 1

        print("Done creating adjacency matrices at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        print("Saving matrices to file")
        print("Starting at %s" % str(start_time))

        # save matrices to file
        if not os.path.isdir("adjacency_matrices"):
            os.mkdir("adjacency_matrices")
        for i in range(len(drug_drug_adj_list)):
            sp.save_npz('adjacency_matrices/sparse_matrix%04d.npz' % (i, ),
                        drug_drug_adj_list[i].tocoo())
        print("Done saving matrices to file at %s after %s" %
              (datetime.now(), datetime.now() - start_time))
    else:
        print("Loading adjacency matrices from file.")
        for i in range(len(ordered_list_of_side_effects)):
            drug_drug_adj_list.append(
                sp.load_npz('adjacency_matrices/sparse_matrix%04d.npz' % i))

    for i in range(len(drug_drug_adj_list)):
        drug_drug_adj_list[i] = drug_drug_adj_list[i].tocsr()

    start_time = datetime.now()
    print("Setting up for training")
    print("Starting at %s" % str(start_time))

    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    # data representation
    global adj_mats_orig
    adj_mats_orig = {
        (0, 0): [gene_adj, gene_adj.transpose(copy=True)
                 ],  # protein-protein interactions (and inverses)
        (0, 1):
        [gene_drug_adj],  # protein-drug relationships (inverse of targets)
        (1, 0): [drug_gene_adj],  # drug-protein relationships (targets)
        # This creates an "inverse" relationship for every polypharmacy side effect, using the transpose of the
        # relationship's adjacency matrix, resulting in 2x the number of side effects (and adjacency matrices).
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: [gene_degrees, gene_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    # featureless (genes)
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    # features (drugs)
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }

    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    global num_edge_types
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    ###########################################################
    #
    # Settings and placeholders
    #
    ###########################################################

    # Important -- Do not evaluate/print validation performance every iteration as it can take
    # substantial amount of time
    PRINT_PROGRESS_EVERY = 10000

    print("Defining placeholders")
    construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    global minibatch_iterator
    iterator_pickle_file_name = args.saved_files_directory + "minibatch_iterator.pickle"
    if os.path.isfile(iterator_pickle_file_name):
        print("Load minibatch iterator pickle")
        with open(iterator_pickle_file_name, 'rb') as pickle_file:
            minibatch_iterator = pickle.load(pickle_file)
    else:
        print("Create minibatch iterator")
        minibatch_iterator = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                                   feat=feat,
                                                   edge_types=edge_types,
                                                   batch_size=FLAGS.batch_size,
                                                   val_test_size=val_test_size)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    global optimizer
    with tf.name_scope('optimizer'):
        optimizer = DecagonOptimizer(embeddings=model.embeddings,
                                     latent_inters=model.latent_inters,
                                     latent_varies=model.latent_varies,
                                     degrees=degrees,
                                     edge_types=edge_types,
                                     edge_type2dim=edge_type2dim,
                                     placeholders=placeholders,
                                     batch_size=FLAGS.batch_size,
                                     margin=FLAGS.max_margin)

    print("Done setting up at %s after %s" %
          (datetime.now(), datetime.now() - start_time))

    print("Initialize session")
    global sess
    sess = tf.Session()

    decagon_model_file_name = args.saved_files_directory + "decagon_model.ckpt"
    saved_model_available = os.path.isfile(decagon_model_file_name + ".index")
    if saved_model_available:
        saver = tf.train.Saver()
        saver.restore(sess, decagon_model_file_name)
        print("Model restored.")
    if not saved_model_available:
        print("Training model")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))

        sess.run(tf.global_variables_initializer())
        feed_dict = {}

        ###########################################################
        #
        # Train model
        #
        ###########################################################

        saver = tf.train.Saver()

        print("Train model")
        epoch_losses = []
        for epoch in range(FLAGS.epochs):

            minibatch_iterator.shuffle()
            itr = 0
            while not minibatch_iterator.end():
                # Construct feed dictionary
                feed_dict = minibatch_iterator.next_minibatch_feed_dict(
                    placeholders=placeholders)
                feed_dict = minibatch_iterator.update_feed_dict(
                    feed_dict=feed_dict,
                    dropout=FLAGS.dropout,
                    placeholders=placeholders)

                t = time.time()

                # Training step: run single weight update
                outs = sess.run([
                    optimizer.opt_op, optimizer.cost,
                    optimizer.batch_edge_type_idx
                ],
                                feed_dict=feed_dict)
                train_cost = outs[1]
                batch_edge_type = outs[2]

                if itr % PRINT_PROGRESS_EVERY == 0:
                    val_auc, val_auprc, val_apk = get_accuracy_scores(
                        minibatch_iterator.val_edges,
                        minibatch_iterator.val_edges_false,
                        minibatch_iterator.idx2edge_type[
                            minibatch_iterator.current_edge_type_idx],
                        feed_dict)

                    print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                          "%04d" % (itr + 1), "Edge:",
                          "%04d" % batch_edge_type, "train_loss=",
                          "{:.5f}".format(train_cost), "val_roc=",
                          "{:.5f}".format(val_auc), "val_auprc=",
                          "{:.5f}".format(val_auprc), "val_apk=",
                          "{:.5f}".format(val_apk), "time=",
                          "{:.5f}".format(time.time() - t))

                itr += 1
            validation_loss = get_validation_loss(
                edges_pos=minibatch_iterator.val_edges,
                edges_neg=minibatch_iterator.val_edges_false,
                feed_dict=feed_dict)
            print(
                "Epoch:", "%04d" % (epoch + 1),
                "Validation loss (average cross entropy): {}".format(
                    validation_loss))

            epoch_losses.append(validation_loss)
            if len(epoch_losses) >= 3:
                if round(epoch_losses[-1], 3) >= round(
                        epoch_losses[-2], 3) >= round(epoch_losses[-3], 3):
                    break

            print("Saving model after epoch:", epoch)
            save_path = saver.save(
                sess, args.saved_files_directory + "decagon_model" +
                str(epoch) + ".ckpt")
            print("Model saved in path: %s" % save_path)

        print("Optimization finished!")
        print("Done training model %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        print("Saving model")
        save_path = saver.save(sess, decagon_model_file_name)
        print("Model saved in path: %s" % save_path)

        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    start_time = datetime.now()
    print("Evaluating model")
    print("Starting at %s" % str(start_time))

    for edge_type in range(num_edge_types):
        # get all edges in test set with this type
        feed_dict = minibatch_iterator.test_feed_dict(
            edge_type, placeholders=placeholders)
        feed_dict = minibatch_iterator.update_feed_dict(
            feed_dict, FLAGS.dropout, placeholders)
        edge_tuple = minibatch_iterator.idx2edge_type[edge_type]

        _, _, all_scores, all_labels, subjects, predicates, objects = get_predictions(
            edges_pos=minibatch_iterator.test_edges,
            edges_neg=minibatch_iterator.test_edges_false,
            edge_type=edge_tuple,
            feed_dict=feed_dict)

        print("subject\tpredicate\tobject\tpredicted\tactual")
        for i in range(len(all_scores)):
            subject = subjects[i]
            if edge_tuple[0] == 1:
                subject = ordered_list_of_drugs[subject]
            else:
                subject = ordered_list_of_proteins[subject]

            object = objects[i]
            if edge_tuple[1] == 1:
                object = ordered_list_of_drugs[object]
            else:
                object = ordered_list_of_proteins[object]

            predicate = predicates[i]
            if edge_tuple[:2] == (1, 1):
                side_effect_index = edge_tuple[2]
                is_inverse = False
                if side_effect_index >= 963:
                    side_effect_index = side_effect_index - 963
                    is_inverse = True
                predicate = ordered_list_of_side_effects[side_effect_index]
                if is_inverse:
                    predicate = predicate + "_2"

            print("{}\t{}\t{}\t{}\t{}".format(subject, predicate, object,
                                              all_scores[i], all_labels[i]))

    print()

    print("Done evaluating at %s after %s" %
          (datetime.now(), datetime.now() - start_time))

    print("Script running time: %s" % (datetime.now() - script_start_time))
    protein_degrees = np.array(protein_protein_adj.sum(axis=0)).squeeze()
    drug_degrees = np.array(drug_drug_adj.sum(axis=0)).squeeze()
    disease_degrees = np.array(disease_drug_adj.sum(axis=0)).squeeze()
    side_effect_degrees = np.array(side_effect_drug_adj.sum(axis=0)).squeeze()

    degrees = {
        0: [protein_degrees, protein_degrees],
        1: [drug_degrees, drug_degrees],
        2: [disease_degrees],
        3: [side_effect_degrees]
    }

    # # featureless (genes)
    gene_feat = sp.identity(1512)
    protein_nonzero_feat, protein_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    #
    # # features (drugs)
    drug_feat = sp.identity(708)
    # drug_feat = Drug_Drug_adj
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation

    diease_feat = sp.identity(5603)
    diease_nonzero_feat, diease_num_feat = diease_feat.shape
    diease_feat = preprocessing.sparse_to_tuple(diease_feat.tocoo())
    # NOTICE