# Chemical-effect-protein ("cpf") link prediction: train a ComplEx model and
# evaluate every relation with AUC-ROC / AUC-PR at 1:1, 1:10 and 1:50
# positive-to-negative ratios.

import os

import numpy as np
from sklearn.pipeline import Pipeline

# NOTE: the KGE toolkit import paths below are assumed (a libkge-style layout)
# and may need to be adjusted to the local package.
from libkge import KgDataset
from libkge.embedding import ComplEx
from libkge.io import load_kg_file
from libkge.metrics.classification import auc_pr, auc_roc


def main():
    seed = 1234
    data_name = "cpf"
    kg_dp_path = "../data/"

    print("Importing dataset files ... ")
    train_data_raw = load_kg_file(os.path.join(kg_dp_path, "train.tsv"))
    test_data_raw = load_kg_file(os.path.join(kg_dp_path, "test.tsv"))

    all_data = np.array([[s, p, o] for s, p, o in np.concatenate([train_data_raw, test_data_raw])])
    chemicals_list = set(list(all_data[:, 0]))
    effects_list = set(list(all_data[:, 1]))
    proteins_list = set(list(all_data[:, 2]))

    dataset = KgDataset(name=data_name)
    dataset.load_triples(train_data_raw, tag="train")
    dataset.load_triples(test_data_raw, tag="test")

    nb_rels = dataset.get_rels_count()
    nb_ents = dataset.get_ents_count()
    train_data = dataset.data["train"]
    test_data = dataset.data["test"]
    all_data_indices = np.concatenate([train_data, test_data])

    # index all known (subject, object) pairs per relation so that sampled
    # negatives can be filtered against known true facts
    fn_known_facts = {k: set() for k in range(nb_rels)}
    for s, p, o in all_data_indices:
        fn_known_facts[p].add((s, o))

    # group test facts by relation
    fn_test_dict = {k: [] for k in np.unique(test_data[:, 1])}
    for s, p, o in test_data:
        fn_test_dict[p].append([s, p, o])

    print("Initializing the knowledge graph embedding model... ")
    # model pipeline definition
    model = ComplEx(seed=seed, verbose=2)
    pipe_model = Pipeline([('kge_model', model)])

    # set model parameters
    model_params = {
        'kge_model__em_size': 100,
        'kge_model__lr': 0.01,
        'kge_model__optimiser': "AMSgrad",
        'kge_model__log_interval': 10,
        'kge_model__nb_epochs': 100,
        'kge_model__batch_size': 5000,
        'kge_model__initialiser': 'xavier_uniform',
        'kge_model__nb_ents': nb_ents,
        'kge_model__nb_rels': nb_rels
    }

    # add parameters to the model then call fit method
    pipe_model.set_params(**model_params)

    print("Training ... ")
    pipe_model.fit(X=train_data, y=None)

    def generate_fn_negatives(fn_idx, neg_data_size):
        """ Generate negative triples for a given relation.

        :param fn_idx: index of the relation to generate negatives for
        :param neg_data_size: number of negative triples to return
        :return: array of negative triples of shape [neg_data_size, 3]
        """
        # over-generate candidates so enough survive the known-facts filter
        candidate_neg_size = int(neg_data_size * 1.2)
        candidate_subs = np.random.randint(0, nb_ents, [candidate_neg_size, 1])
        candidate_rel = np.ones([candidate_neg_size, 1]) * fn_idx
        candidate_objs = np.random.randint(0, nb_ents, [candidate_neg_size, 1])
        candidate_negs = np.concatenate([candidate_subs, candidate_rel, candidate_objs], axis=1)

        # keep only candidates that are not known true facts for this relation
        true_negs = []
        for s, p, o in candidate_negs:
            if (s, o) not in fn_known_facts[fn_idx]:
                true_negs.append([s, p, o])
        true_negs = np.array(true_negs)
        return true_negs[:neg_data_size, :]

    rel_results = []
    for idx, rel_idx in enumerate(fn_test_dict):
        rel_name = dataset.get_rel_labels([rel_idx])[0]
        rel_test_data_pos = np.array(fn_test_dict[rel_idx])
        rel_test_size = len(rel_test_data_pos)
        rel_pos_scores = pipe_model.predict(rel_test_data_pos)

        res = {1: {"auroc": 0.0, "aupr": 0.0},
               10: {"auroc": 0.0, "aupr": 0.0},
               50: {"auroc": 0.0, "aupr": 0.0}}

        for neg_ratio in [1, 10, 50]:
            rel_test_data_neg = generate_fn_negatives(rel_idx, rel_test_size * neg_ratio)

            # score the negatives in batches to limit memory usage
            neg_scores = []
            for rel_test_data_neg_batch in generate_batches(rel_test_data_neg, batch_size=10000):
                batch_scores = pipe_model.predict(rel_test_data_neg_batch)
                neg_scores.extend(batch_scores)
            rel_neg_scores = np.array(neg_scores)

            rel_all_scores = np.concatenate([rel_pos_scores, rel_neg_scores])
            rel_all_labels = np.concatenate([np.ones([len(rel_pos_scores), ]),
                                             np.zeros([len(rel_neg_scores), ])])

            rel_aupr = auc_pr(rel_all_labels, rel_all_scores)
            rel_auroc = auc_roc(rel_all_labels, rel_all_scores)
            res[neg_ratio]["aupr"] = rel_aupr
            res[neg_ratio]["auroc"] = rel_auroc

        print("[%d] N1:AUROC %1.4f - N1:AUPR %1.4f\tN10:AUROC %1.4f - N10:AUPR %1.4f\tN50:AUROC %1.4f - N50:AUPR %1.4f"
              "\t[Count: %d]\tREL:%s"
              % (idx + 1, res[1]["auroc"], res[1]["aupr"], res[10]["auroc"], res[10]["aupr"],
                 res[50]["auroc"], res[50]["aupr"], rel_test_size, rel_name))

        rel_results.append([res[1]["auroc"], res[1]["aupr"],
                            res[10]["auroc"], res[10]["aupr"],
                            res[50]["auroc"], res[50]["aupr"]])

    rel_results = np.array(rel_results)
    n1_au_roc = np.mean(rel_results[:, 0])
    n1_au_pr = np.mean(rel_results[:, 1])
    n10_au_roc = np.mean(rel_results[:, 2])
    n10_au_pr = np.mean(rel_results[:, 3])
    n50_au_roc = np.mean(rel_results[:, 4])
    n50_au_pr = np.mean(rel_results[:, 5])

    print("-----------------------------------------------------------------------------------------------------------")
    print("N1:AUROC %1.4f - N1:AUPR %1.4f\tN10:AUROC %1.4f - N10:AUPR %1.4f\tN50:AUROC %1.4f - N50:AUPR %1.4f = [AVERAGE]"
          % (n1_au_roc, n1_au_pr, n10_au_roc, n10_au_pr, n50_au_roc, n50_au_pr))
    print("-----------------------------------------------------------------------------------------------------------")
# Polypharmacy side-effect ("pse") prediction: train a TransE model on the
# polypharmacy benchmark and evaluate each side effect with AP, AUC-ROC,
# AUC-PR and P@50 at a 1:1 positive-to-negative ratio.

import gzip
import itertools
import os

import numpy as np
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# NOTE: the KGE toolkit import paths below are assumed (a libkge-style layout)
# and may need to be adjusted to the local package.
from libkge import KgDataset
from libkge.embedding import TransE
from libkge.metrics.classification import auc_pr, auc_roc, average_precision
from libkge.metrics.ranking import precision_at_k


def main():
    seed = 1234
    nb_epochs_then_check = None
    data_name = "pse"
    kg_dp_path = "../data/"

    # map side-effect codes to their human-readable names
    se_map_raw = [l.strip().split("\t")
                  for l in open(os.path.join(kg_dp_path, "se_maps.txt")).readlines()]
    se_mapping = {k: v for k, v in se_map_raw}

    print("Importing dataset files ... ")
    benchmark_train_fd = gzip.open(os.path.join(kg_dp_path, "ploypharmacy_facts_train.txt.gz"), "rt")
    benchmark_valid_fd = gzip.open(os.path.join(kg_dp_path, "ploypharmacy_facts_valid.txt.gz"), "rt")
    benchmark_test_fd = gzip.open(os.path.join(kg_dp_path, "ploypharmacy_facts_test.txt.gz"), "rt")

    benchmark_train = np.array([l.strip().split() for l in benchmark_train_fd.readlines()])
    benchmark_valid = np.array([l.strip().split() for l in benchmark_valid_fd.readlines()])
    benchmark_test = np.array([l.strip().split() for l in benchmark_test_fd.readlines()])

    benchmark_triples = np.array([[d1, se, d2] for d1, se, d2 in
                                  np.concatenate([benchmark_train, benchmark_valid, benchmark_test])])

    pse_drugs = list(set(list(np.concatenate([benchmark_triples[:, 0], benchmark_triples[:, 2]]))))
    pse_list = set(list(benchmark_triples[:, 1]))

    # count facts per side-effect relation
    rel_dict = dict()
    for s, p, o in benchmark_triples:
        if p not in rel_dict:
            rel_dict[p] = 1
        else:
            rel_dict[p] += 1

    # count facts per (unordered) drug pair
    pair_dict = dict()
    for s, p, o in benchmark_triples:
        if s > o:
            pair = (s, o)
        else:
            pair = (o, s)
        if pair not in pair_dict:
            pair_dict[pair] = 1
        else:
            pair_dict[pair] += 1

    # all ordered drug pairs, excluding self-pairs
    drug_combinations = np.array([[d1, d2] for d1, d2 in list(itertools.product(pse_drugs, pse_drugs))
                                  if d1 != d2])

    print("Processing dataset files to generate a knowledge graph ... ")
    # delete raw polypharmacy data
    del benchmark_triples

    dataset = KgDataset(name=data_name)
    dataset.load_triples(benchmark_train, tag="bench_train")
    dataset.load_triples(benchmark_valid, tag="bench_valid")
    dataset.load_triples(benchmark_test, tag="bench_test")

    del benchmark_train
    del benchmark_valid
    del benchmark_test

    nb_entities = dataset.get_ents_count()
    nb_relations = dataset.get_rels_count()
    pse_indices = dataset.get_rel_indices(list(pse_list))

    d1 = np.array(dataset.get_ent_indices(list(drug_combinations[:, 0]))).reshape([-1, 1])
    d2 = np.array(dataset.get_ent_indices(list(drug_combinations[:, 1]))).reshape([-1, 1])
    drug_combinations = np.concatenate([d1, d2], axis=1)
    del d1
    del d2

    # grouping side effect information by the side effect type
    train_data = dataset.data["bench_train"]
    valid_data = dataset.data["bench_valid"]
    test_data = dataset.data["bench_test"]

    bench_idx_data = np.concatenate([train_data, valid_data, test_data])
    se_facts_full_dict = {se: set() for se in pse_indices}
    for s, p, o in bench_idx_data:
        se_facts_full_dict[p].add((s, p, o))

    print("Initializing the knowledge graph embedding model... ")
    # model pipeline definition
    model = TransE(seed=seed, verbose=2)
    pipe_model = Pipeline([('kge_model', model)])

    # set model parameters
    model_params = {
        'kge_model__em_size': 100,
        'kge_model__lr': 0.01,
        'kge_model__optimiser': "AMSgrad",
        'kge_model__log_interval': 10,
        'kge_model__nb_epochs': 100,
        'kge_model__nb_negs': 6,
        'kge_model__batch_size': 5000,
        'kge_model__initialiser': 'xavier_uniform',
        'kge_model__nb_ents': nb_entities,
        'kge_model__nb_rels': nb_relations
    }

    # add parameters to the model then call fit method
    pipe_model.set_params(**model_params)

    print("Training ... ")
    pipe_model.fit(X=train_data, y=None)

    metrics_per_se = {se_idx: {"ap": .0, "auc-roc": .0, "auc-pr": .0, "p@50": .0}
                      for se_idx in pse_indices}

    se_ap_list = []
    se_auc_roc_list = []
    se_auc_pr_list = []
    se_p50_list = []

    print("================================================================================")
    for se in tqdm(pse_indices, desc="Evaluating test data for each side-effect"):
        se_name = dataset.get_rel_labels([se])[0]
        se_all_facts_set = se_facts_full_dict[se]
        se_test_facts_pos = np.array([[s, p, o] for s, p, o in test_data if p == se])
        se_test_facts_pos_size = len(se_test_facts_pos)

        se_test_facts_neg = np.array([[d1, se, d2] for d1, d2 in drug_combinations
                                      if (d1, se, d2) not in se_all_facts_set
                                      and (d2, se, d1) not in se_all_facts_set])

        # shuffle and keep negatives with size equal to positive instances so positive to negative ratio is 1:1
        np.random.shuffle(se_test_facts_neg)
        se_test_facts_neg = se_test_facts_neg[:se_test_facts_pos_size, :]

        se_test_facts_all = np.concatenate([se_test_facts_pos, se_test_facts_neg])
        se_test_facts_labels = np.concatenate([np.ones([len(se_test_facts_pos)]),
                                               np.zeros([len(se_test_facts_neg)])])
        se_test_facts_scores = model.predict(se_test_facts_all)

        se_ap = average_precision(se_test_facts_labels, se_test_facts_scores)
        se_p50 = precision_at_k(se_test_facts_labels, se_test_facts_scores, k=50)
        se_auc_pr = auc_pr(se_test_facts_labels, se_test_facts_scores)
        se_auc_roc = auc_roc(se_test_facts_labels, se_test_facts_scores)

        se_ap_list.append(se_ap)
        se_auc_roc_list.append(se_auc_roc)
        se_auc_pr_list.append(se_auc_pr)
        se_p50_list.append(se_p50)

        se_code = se_name.replace("SE:", "")
        metrics_per_se[se] = {"ap": se_ap, "auc-roc": se_auc_roc, "auc-pr": se_auc_pr, "p@50": se_p50}

        print("AP: %1.4f - AUC-ROC: %1.4f - AUC-PR: %1.4f - P@50: %1.4f > %s: %s"
              % (se_ap, se_auc_roc, se_auc_pr, se_p50, se_code, se_mapping[se_code]), flush=True)

    se_ap_list_avg = np.average(se_ap_list)
    se_auc_roc_list_avg = np.average(se_auc_roc_list)
    se_auc_pr_list_avg = np.average(se_auc_pr_list)
    se_p50_list_avg = np.average(se_p50_list)

    print("================================================================================")
    print("[AVERAGE] AP: %1.4f - AUC-ROC: %1.4f - AUC-PR: %1.4f - P@50: %1.4f"
          % (se_ap_list_avg, se_auc_roc_list_avg, se_auc_pr_list_avg, se_p50_list_avg), flush=True)
    print("================================================================================")
# Tissue-specific protein function ("TS-PROTEIN-GO") prediction: train a
# TriModel and evaluate per-tissue AUC-ROC and AUC-PR using labelled positive
# and negative test facts.

import gzip
import os

import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.pipeline import Pipeline

# NOTE: the KGE toolkit import paths below are assumed (a libkge-style layout)
# and may need to be adjusted to the local package.
from libkge import KgDataset
from libkge.embedding import TriModel


def main():
    seed = 1234
    nb_epochs_then_check = None
    data_name = "TS-PROTEIN-GO"
    dataset_dir = "../data/dataset/"

    # loading dataset
    train_fp = os.path.join(dataset_dir, "train.txt.gz")
    train_facts_labeled = [l.strip().split("\t") for l in gzip.open(train_fp, "rt").readlines()]
    train_facts = np.array([[s, p, o] for s, p, o, f in train_facts_labeled if f == "1"])
    # train_facts_neg = np.array([[s, p, o] for s, p, o, f in train_facts_labeled if f == "0"])

    test_fp = os.path.join(dataset_dir, "test.txt.gz")
    test_facts_labeled = [l.strip().split("\t") for l in gzip.open(test_fp, "rt").readlines()]
    test_facts = np.array([[s, p, o] for s, p, o, f in test_facts_labeled if f == "1"])
    test_facts_neg = np.array([[s, p, o] for s, p, o, f in test_facts_labeled if f == "0"])

    tissue_list = list(set(list(test_facts[:, 1])))

    dataset = KgDataset(name=data_name)
    dataset.load_triples(train_facts, "train")
    dataset.load_triples(test_facts, "test")
    dataset.load_triples(test_facts_neg, "test_neg")

    del train_facts
    del test_facts
    del test_facts_neg

    train_data = dataset.data["train"]
    test_data = dataset.data["test"]
    test_data_neg = dataset.data["test_neg"]
    tissue_list = dataset.get_rel_indices(tissue_list)

    # model pipeline definition
    model = TriModel(seed=seed, loss="pt_log", verbose=2)
    pipe_model = Pipeline([('kge_model', model)])

    # set model parameters
    model_params = {
        'kge_model__em_size': 30,
        'kge_model__lr': 0.01,
        'kge_model__nb_negs': 2,
        'kge_model__nb_epochs': 200,
        'kge_model__batch_size': 4000,
        'kge_model__nb_ents': dataset.get_ents_count(),
        'kge_model__nb_rels': dataset.get_rels_count()
    }

    # add parameters to the model then call fit method
    pipe_model.set_params(**model_params)
    pipe_model.fit(X=train_data)

    ts_auc_roc_list = []
    ts_auc_pr_list = []

    print("============================================================")
    print("= Tissue-specific evaluation =")
    print("============================================================")
    for tissue_idx in tissue_list:
        tissue_name = dataset.get_rel_labels([tissue_idx])[0]
        ts_test_facts_pos = np.array([[s, p, o] for s, p, o in test_data if p == tissue_idx])
        ts_test_facts_neg = np.array([[s, p, o] for s, p, o in test_data_neg if p == tissue_idx])

        ts_test_facts_all = np.concatenate([ts_test_facts_pos, ts_test_facts_neg])
        ts_test_facts_labels = np.concatenate([np.ones([len(ts_test_facts_pos)]),
                                               np.zeros([len(ts_test_facts_neg)])])
        ts_test_facts_scores = model.predict(ts_test_facts_all)

        ts_auc_pr = average_precision_score(ts_test_facts_labels, ts_test_facts_scores)
        ts_auc_roc = roc_auc_score(ts_test_facts_labels, ts_test_facts_scores)

        ts_auc_roc_list.append(ts_auc_roc)
        ts_auc_pr_list.append(ts_auc_pr)
        print("= AUC-ROC: %1.4f - AUC-PR: %1.4f > %s" % (ts_auc_roc, ts_auc_pr, tissue_name), flush=True)

    ts_auc_roc_list_avg = np.average(ts_auc_roc_list)
    ts_auc_pr_list_avg = np.average(ts_auc_pr_list)

    print("============================================================")
    print("= AUC-ROC: %1.4f - AUC-PR: %1.4f > [AVERAGE]" % (ts_auc_roc_list_avg, ts_auc_pr_list_avg), flush=True)
    print("============================================================")
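
# --- Added entry point (assumption) ----------------------------------------
# Minimal guard so the script can be executed directly; the file is assumed
# to be run as a standalone script.
if __name__ == "__main__":
    main()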