Example No. 1
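All three examples assume the imports below at module level. The standard-library, NumPy, scikit-learn, and tqdm imports are exact; the knowledge-graph-embedding components (KgDataset, load_kg_file, ComplEx, TransE, TriModel, and the helpers auc_roc, auc_pr, average_precision, precision_at_k, generate_batches) come from the KGE library these snippets are written against, and their exact module paths are not shown here.

import os
import gzip
import itertools

import numpy as np
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score

# KgDataset, load_kg_file, ComplEx, TransE, TriModel, auc_roc, auc_pr,
# average_precision, precision_at_k and generate_batches are provided by the
# knowledge-graph-embedding library these examples target (import paths omitted).
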
def main():
    seed = 1234
    data_name = "cpf"
    kg_dp_path = "../data/"

    print("Importing dataset files ... ")
    train_data_raw = load_kg_file(os.path.join(kg_dp_path, "train.tsv"))
    test_data_raw = load_kg_file(os.path.join(kg_dp_path, "test.tsv"))

    all_data = np.array(
        [[s, p, o]
         for s, p, o in np.concatenate([train_data_raw, test_data_raw])])

    chemicals_list = set(all_data[:, 0])
    effects_list = set(all_data[:, 1])
    proteins_list = set(all_data[:, 2])

    dataset = KgDataset(name=data_name)
    dataset.load_triples(train_data_raw, tag="train")
    dataset.load_triples(test_data_raw, tag="test")

    nb_rels = dataset.get_rels_count()
    nb_ents = dataset.get_ents_count()

    train_data = dataset.data["train"]
    test_data = dataset.data["test"]
    all_data_indices = np.concatenate([train_data, test_data])

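    # index all known (subject, object) pairs per relation so sampled negatives can exclude them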
    fn_known_facts = {k: set() for k in range(nb_rels)}
    for s, p, o in all_data_indices:
        fn_known_facts[p].add((s, o))

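    # group positive test triples by their relation index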
    fn_test_dict = {k: [] for k in np.unique(test_data[:, 1])}
    for s, p, o in test_data:
        fn_test_dict[p].append([s, p, o])

    print("Initializing the knowledge graph embedding model... ")
    # model pipeline definition
    model = ComplEx(seed=seed, verbose=2)
    pipe_model = Pipeline([('kge_model', model)])

    # set model parameters
    model_params = {
        'kge_model__em_size': 100,
        'kge_model__lr': 0.01,
        'kge_model__optimiser': "AMSgrad",
        'kge_model__log_interval': 10,
        'kge_model__nb_epochs': 100,
        'kge_model__batch_size': 5000,
        'kge_model__initialiser': 'xavier_uniform',
        'kge_model__nb_ents': nb_ents,
        'kge_model__nb_rels': nb_rels
    }

    # add parameters to the model then call fit method
    pipe_model.set_params(**model_params)

    print("Training ... ")
    pipe_model.fit(X=train_data, y=None)

    def generate_fn_negatives(fn_idx, neg_data_size):
        """ Generate negative triples for a given relation by corrupting subjects and objects.

        :param fn_idx: index of the relation to generate negatives for
        :param neg_data_size: number of negative triples to return
        :return: array of negative triples with shape [neg_data_size, 3]
        """
        # oversample candidate corruptions, then drop any that are known facts
        candidate_neg_size = int(neg_data_size * 1.2)
        candidate_subs = np.random.randint(0, nb_ents, [candidate_neg_size, 1])
        candidate_rel = np.ones([candidate_neg_size, 1], dtype=np.int64) * fn_idx
        candidate_objs = np.random.randint(0, nb_ents, [candidate_neg_size, 1])
        candidate_negs = np.concatenate(
            [candidate_subs, candidate_rel, candidate_objs], axis=1)
        true_negs = []
        for s, p, o in candidate_negs:
            if (s, o) not in fn_known_facts[fn_idx]:
                true_negs.append([s, p, o])
        true_negs = np.array(true_negs)
        return true_negs[:neg_data_size, :]

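    # evaluate each test relation against sampled negatives at 1:1, 1:10 and 1:50 positive-to-negative ratios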
    rel_results = []
    for idx, rel_idx in enumerate(fn_test_dict):
        rel_name = dataset.get_rel_labels([rel_idx])[0]
        rel_test_data_pos = np.array(fn_test_dict[rel_idx])
        rel_test_size = len(rel_test_data_pos)

        rel_pos_scores = pipe_model.predict(rel_test_data_pos)

        res = {
            1: {
                "auroc": 0.0,
                "aupr": 0.0
            },
            10: {
                "auroc": 0.0,
                "aupr": 0.0
            },
            50: {
                "auroc": 0.0,
                "aupr": 0.0
            }
        }

        for neg_ratio in [1, 10, 50]:
            rel_test_data_neg = generate_fn_negatives(
                rel_idx, rel_test_size * neg_ratio)

            neg_scores = []
            for rel_test_data_neg_batch in generate_batches(rel_test_data_neg,
                                                            batch_size=10000):
                batch_scores = pipe_model.predict(rel_test_data_neg_batch)
                neg_scores.extend(batch_scores)
            rel_neg_scores = np.array(neg_scores)

            rel_all_scores = np.concatenate([rel_pos_scores, rel_neg_scores])
            rel_all_labels = np.concatenate([
                np.ones([
                    len(rel_pos_scores),
                ]),
                np.zeros([
                    len(rel_neg_scores),
                ])
            ])
            rel_aupr = auc_pr(rel_all_labels, rel_all_scores)
            rel_auroc = auc_roc(rel_all_labels, rel_all_scores)
            res[neg_ratio]["aupr"] = rel_aupr
            res[neg_ratio]["auroc"] = rel_auroc

        print(
            "[%d] N1:AUROC %1.4f - N1:AUPR %1.4f\tN10:AUROC %1.4f - N10:AUPR %1.4f\tN50:AUROC %1.4f - N50:AUPR %1.4f"
            "\t[Count: %d]\tREL:%s" %
            (idx + 1, res[1]["auroc"], res[1]["aupr"], res[10]["auroc"],
             res[10]["aupr"], res[50]["auroc"], res[50]["aupr"], rel_test_size,
             rel_name))
        rel_results.append([
            res[1]["auroc"], res[1]["aupr"], res[10]["auroc"], res[10]["aupr"],
            res[50]["auroc"], res[50]["aupr"]
        ])
    rel_results = np.array(rel_results)
    n1_au_roc = np.mean(rel_results[:, 0])
    n1_au_pr = np.mean(rel_results[:, 1])
    n10_au_roc = np.mean(rel_results[:, 2])
    n10_au_pr = np.mean(rel_results[:, 3])
    n50_au_roc = np.mean(rel_results[:, 4])
    n50_au_pr = np.mean(rel_results[:, 5])
    print(
        "-----------------------------------------------------------------------------------------------------------"
    )
    print(
        "N1:AUROC %1.4f - N1:AUPR %1.4f\tN10:AUROC %1.4f - N10:AUPR %1.4f\tN50:AUROC %1.4f - N50:AUPR %1.4f = [AVERAGE]"
        % (n1_au_roc, n1_au_pr, n10_au_roc, n10_au_pr, n50_au_roc, n50_au_pr))
    print(
        "-----------------------------------------------------------------------------------------------------------"
    )
Example No. 2
def main():
    seed = 1234
    nb_epochs_then_check = None
    data_name = "pse"
    kg_dp_path = "../data/"

    se_map_raw = [
        l.strip().split("\t")
        for l in open(os.path.join(kg_dp_path, "se_maps.txt")).readlines()
    ]
    se_mapping = {k: v for k, v in se_map_raw}

    print("Importing dataset files ... ")
    benchmark_train_fd = gzip.open(
        os.path.join(kg_dp_path, "ploypharmacy_facts_train.txt.gz"), "rt")
    benchmark_valid_fd = gzip.open(
        os.path.join(kg_dp_path, "ploypharmacy_facts_valid.txt.gz"), "rt")
    benchmark_test_fd = gzip.open(
        os.path.join(kg_dp_path, "ploypharmacy_facts_test.txt.gz"), "rt")

    benchmark_train = np.array(
        [l.strip().split() for l in benchmark_train_fd.readlines()])
    benchmark_valid = np.array(
        [l.strip().split() for l in benchmark_valid_fd.readlines()])
    benchmark_test = np.array(
        [l.strip().split() for l in benchmark_test_fd.readlines()])

    benchmark_triples = np.array(
        [[d1, se, d2] for d1, se, d2 in np.concatenate(
            [benchmark_train, benchmark_valid, benchmark_test])])

    pse_drugs = list(set(np.concatenate(
        [benchmark_triples[:, 0], benchmark_triples[:, 2]])))
    pse_list = set(benchmark_triples[:, 1])

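    # count facts per side-effect relation and occurrences of each unordered drug pair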
    rel_dict = dict()
    for s, p, o in benchmark_triples:
        if p not in rel_dict:
            rel_dict[p] = 1
        else:
            rel_dict[p] += 1

    pair_dict = dict()
    for s, p, o in benchmark_triples:
        if s > o:
            pair = (s, o)
        else:
            pair = (o, s)
        if pair not in pair_dict:
            pair_dict[pair] = 1
        else:
            pair_dict[pair] += 1

    drug_combinations = np.array(
        [[d1, d2] for d1, d2 in list(itertools.product(pse_drugs, pse_drugs))
         if d1 != d2])

    print("Processing dataset files to generate a knowledge graph ... ")
    # delete raw polypharmacy data
    del benchmark_triples
    dataset = KgDataset(name=data_name)
    dataset.load_triples(benchmark_train, tag="bench_train")
    dataset.load_triples(benchmark_valid, tag="bench_valid")
    dataset.load_triples(benchmark_test, tag="bench_test")

    del benchmark_train
    del benchmark_valid
    del benchmark_test

    nb_entities = dataset.get_ents_count()
    nb_relations = dataset.get_rels_count()
    pse_indices = dataset.get_rel_indices(list(pse_list))

    d1 = np.array(dataset.get_ent_indices(list(
        drug_combinations[:, 0]))).reshape([-1, 1])
    d2 = np.array(dataset.get_ent_indices(list(
        drug_combinations[:, 1]))).reshape([-1, 1])
    drug_combinations = np.concatenate([d1, d2], axis=1)
    del d1
    del d2

    # grouping side effect information by the side effect type
    train_data = dataset.data["bench_train"]
    valid_data = dataset.data["bench_valid"]
    test_data = dataset.data["bench_test"]

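    # record all known facts per side effect so the negative sampling below can avoid them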
    bench_idx_data = np.concatenate([train_data, valid_data, test_data])
    se_facts_full_dict = {se: set() for se in pse_indices}

    for s, p, o in bench_idx_data:
        se_facts_full_dict[p].add((s, p, o))

    print("Initializing the knowledge graph embedding model... ")
    # model pipeline definition
    model = TransE(seed=seed, verbose=2)
    pipe_model = Pipeline([('kge_model', model)])

    # set model parameters
    model_params = {
        'kge_model__em_size': 100,
        'kge_model__lr': 0.01,
        'kge_model__optimiser': "AMSgrad",
        'kge_model__log_interval': 10,
        'kge_model__nb_epochs': 100,
        'kge_model__nb_negs': 6,
        'kge_model__batch_size': 5000,
        'kge_model__initialiser': 'xavier_uniform',
        'kge_model__nb_ents': nb_entities,
        'kge_model__nb_rels': nb_relations
    }

    # add parameters to the model then call fit method
    pipe_model.set_params(**model_params)

    print("Training ... ")
    pipe_model.fit(X=train_data, y=None)

    metrics_per_se = {
        se_idx: {
            "ap": .0,
            "auc-roc": .0,
            "auc-pr": .0,
            "p@50": .0
        }
        for se_idx in pse_indices
    }

    se_ap_list = []
    se_auc_roc_list = []
    se_auc_pr_list = []
    se_p50_list = []

    print(
        "================================================================================"
    )
    for se in tqdm(pse_indices,
                   desc="Evaluating test data for each side-effect"):
        se_name = dataset.get_rel_labels([se])[0]
        se_all_facts_set = se_facts_full_dict[se]
        se_test_facts_pos = np.array([[s, p, o] for s, p, o in test_data
                                      if p == se])
        se_test_facts_pos_size = len(se_test_facts_pos)

        se_test_facts_neg = np.array(
            [[d1, se, d2] for d1, d2 in drug_combinations
             if (d1, se, d2) not in se_all_facts_set
             and (d2, se, d1) not in se_all_facts_set])

        # shuffle the negatives and keep as many as there are positives, so the positive-to-negative ratio is 1:1
        np.random.shuffle(se_test_facts_neg)
        se_test_facts_neg = se_test_facts_neg[:se_test_facts_pos_size, :]

        se_test_facts_all = np.concatenate(
            [se_test_facts_pos, se_test_facts_neg])
        se_test_facts_labels = np.concatenate([
            np.ones([len(se_test_facts_pos)]),
            np.zeros([len(se_test_facts_neg)])
        ])
        se_test_facts_scores = model.predict(se_test_facts_all)

        se_ap = average_precision(se_test_facts_labels, se_test_facts_scores)
        se_p50 = precision_at_k(se_test_facts_labels,
                                se_test_facts_scores,
                                k=50)
        se_auc_pr = auc_pr(se_test_facts_labels, se_test_facts_scores)
        se_auc_roc = auc_roc(se_test_facts_labels, se_test_facts_scores)

        se_ap_list.append(se_ap)
        se_auc_roc_list.append(se_auc_roc)
        se_auc_pr_list.append(se_auc_pr)
        se_p50_list.append(se_p50)

        se_code = se_name.replace("SE:", "")
        metrics_per_se[se] = {
            "ap": se_ap,
            "auc-roc": se_auc_roc,
            "auc-pr": se_auc_pr,
            "p@50": se_p50
        }
        print(
            "AP: %1.4f - AUC-ROC: %1.4f - AUC-PR: %1.4f - P@50: %1.4f > %s: %s"
            % (se_ap, se_auc_roc, se_auc_pr, se_p50, se_code,
               se_mapping[se_code]),
            flush=True)

    se_ap_list_avg = np.average(se_ap_list)
    se_auc_roc_list_avg = np.average(se_auc_roc_list)
    se_auc_pr_list_avg = np.average(se_auc_pr_list)
    se_p50_list_avg = np.average(se_p50_list)

    print(
        "================================================================================"
    )
    print(
        "[AVERAGE] AP: %1.4f - AUC-ROC: %1.4f - AUC-PR: %1.4f - P@50: %1.4f" %
        (se_ap_list_avg, se_auc_roc_list_avg, se_auc_pr_list_avg,
         se_p50_list_avg),
        flush=True)
    print(
        "================================================================================"
    )
Example No. 3
def main():
    seed = 1234
    nb_epochs_then_check = None
    data_name = "TS-PROTEIN-GO"
    dataset_dir = "../data/dataset/"

    # loading dataset
    train_fp = os.path.join(dataset_dir, "train.txt.gz")
    train_facts_labeled = [
        l.strip().split("\t") for l in gzip.open(train_fp, "rt").readlines()
    ]
    train_facts = np.array([[s, p, o] for s, p, o, f in train_facts_labeled
                            if f == "1"])
    # train_facts_neg = np.array([[s, p, o] for s, p, o, f in train_facts_labeled if f == "0"])

    test_fp = os.path.join(dataset_dir, "test.txt.gz")
    test_facts_labeled = [
        l.strip().split("\t") for l in gzip.open(test_fp, "rt").readlines()
    ]
    test_facts = np.array([[s, p, o] for s, p, o, f in test_facts_labeled
                           if f == "1"])
    test_facts_neg = np.array([[s, p, o] for s, p, o, f in test_facts_labeled
                               if f == "0"])

    tissue_list = list(set(test_facts[:, 1]))
    dataset = KgDataset(name=data_name)
    dataset.load_triples(train_facts, "train")
    dataset.load_triples(test_facts, "test")
    dataset.load_triples(test_facts_neg, "test_neg")

    del train_facts
    del test_facts
    del test_facts_neg

    train_data = dataset.data["train"]
    test_data = dataset.data["test"]
    test_data_neg = dataset.data["test_neg"]
    tissue_list = dataset.get_rel_indices(tissue_list)

    # model pipeline definition
    model = TriModel(seed=seed, loss="pt_log", verbose=2)
    pipe_model = Pipeline([('kge_model', model)])

    # set model parameters
    model_params = {
        'kge_model__em_size': 30,
        'kge_model__lr': 0.01,
        'kge_model__nb_negs': 2,
        'kge_model__nb_epochs': 200,
        'kge_model__batch_size': 4000,
        'kge_model__nb_ents': dataset.get_ents_count(),
        'kge_model__nb_rels': dataset.get_rels_count()
    }

    # add parameters to the model then call fit method
    pipe_model.set_params(**model_params)
    pipe_model.fit(X=train_data)

    ts_auc_roc_list = []
    ts_auc_pr_list = []
    print("============================================================")
    print("= Tissue-specific evaluation                               =")
    print("============================================================")
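    # score positives against labelled negatives separately for each tissue relation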
    for tissue_idx in tissue_list:
        tissue_name = dataset.get_rel_labels([tissue_idx])[0]

        ts_test_facts_pos = np.array([[s, p, o] for s, p, o in test_data
                                      if p == tissue_idx])
        ts_test_facts_neg = np.array([[s, p, o] for s, p, o in test_data_neg
                                      if p == tissue_idx])
        ts_test_facts_all = np.concatenate(
            [ts_test_facts_pos, ts_test_facts_neg])
        se_test_facts_labels = np.concatenate([
            np.ones([len(ts_test_facts_pos)]),
            np.zeros([len(ts_test_facts_neg)])
        ])
        se_test_facts_scores = model.predict(ts_test_facts_all)

        se_auc_pr = average_precision_score(se_test_facts_labels,
                                            se_test_facts_scores)
        se_auc_roc = roc_auc_score(se_test_facts_labels, se_test_facts_scores)

        ts_auc_roc_list.append(se_auc_roc)
        ts_auc_pr_list.append(se_auc_pr)

        print("= AUC-ROC: %1.4f - AUC-PR: %1.4f > %s" %
              (se_auc_roc, se_auc_pr, tissue_name),
              flush=True)

    se_auc_roc_list_avg = np.average(ts_auc_roc_list)
    se_auc_pr_list_avg = np.average(ts_auc_pr_list)

    print("============================================================")
    print("= AUC-ROC: %1.4f - AUC-PR: %1.4f > [AVERAGE]" %
          (se_auc_roc_list_avg, se_auc_pr_list_avg),
          flush=True)
    print("============================================================")