Example #1
def run_two_stage(args):
    DTI_network = utils.load_any_obj_pkl(args.dti_path)
    drug_similarity = utils.load_any_obj_pkl(args.drug_sim_path)
    target_similarity = utils.load_any_obj_pkl(args.target_sim_path)

    csn_network = network_construction.construct_signifcant_edge_network(
        drug_similarity, top_ratio=float(args.sparsity))
    tsn_network = network_construction.construct_signifcant_edge_network(
        target_similarity, top_ratio=float(args.sparsity))

    implicit_compounds = network_construction.create_implicit_networks(
        DTI_network, list(csn_network.nodes()))
    implicit_targets = network_construction.create_implicit_networks(
        DTI_network, list(tsn_network.nodes()))

    learner = seperate_learner.two_stage_learning(
        DTI_network=DTI_network,
        compound_list=list(csn_network.nodes()),
        target_list=list(tsn_network.nodes()),
        tsn_network=tsn_network,
        csn_network=csn_network,
        implicit_t_network=implicit_targets,
        implicit_c_network=implicit_compounds,
        wl=int(args.walk_length),
        nn=int(args.negative_number),
        wn=int(args.walk_num),
        worker=int(args.worker),
        load_emb=False)
    learner.learn_all_network_embedding()
    learner.build_node_representation()

    training_samples, training_labels = learner.construct_training_samples(
        negative_ratio=10)

    test_pairs = new_pairs_to_evaludate(list(csn_network.nodes()),
                                        list(tsn_network.nodes()), DTI_network)
    test_samples = learner.concatenate_pair_embeddings(test_pairs)

    training_samples = normalise_sample_representation.standardscaler_transform(
        training_samples)
    test_samples = normalise_sample_representation.standardscaler_transform(
        test_samples)

    clf = learner.train_DTI_prediction_svm(training_samples,
                                           training_labels,
                                           kernal=2)
    probs = clf.predict_proba(test_samples)
    new_probs = [row[1] for row in probs]
    all_evaluation = []
    #from tqdm import tqdm
    for i in range(len(test_pairs)):
        current_one = [test_pairs[i][0], test_pairs[i][1], new_probs[i]]
        all_evaluation.append(current_one)
    output_name = 'output/' + args.output_name + '.pkl'
    utils.save_any_obj_pkl(all_evaluation, output_name)
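The helper new_pairs_to_evaludate is called above but not included in this excerpt. A minimal sketch of the assumed behaviour, taking DTI_network to be a networkx graph: enumerate every (compound, target) pair that is not already a known interaction, so the trained SVM can score it.

def new_pairs_to_evaludate(compounds, targets, DTI_network):
    # Hypothetical reconstruction; the real helper is not part of this excerpt.
    pairs = []
    for compound in compounds:
        for target in targets:
            if not DTI_network.has_edge(compound, target):  # skip known interactions
                pairs.append((compound, target))
    return pairs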
Example #2
def evaluation():
    G_dynamic_ori = load_any_obj_pkl(
        "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
    G_dynamic = load_any_obj_pkl(
        "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")
    method = "DANRL"
    filepath = "parameter_sensitivity/collaborate_network(2G)/output"
    files = os.listdir(filepath)

    filepath0 = "parameter_sensitivity/collaborate_network(2G)/recommendation"
    files0 = os.listdir(filepath0)

    i = 0
    for file in files:
        print(i, len(files))
        i += 1
        file_1 = file[:-4] + "_G_2ori.txt"
        if file_1 not in files0:
            emb_dicts = load_any_obj_pkl(os.path.join(filepath, file))
            # print(len(emb_dicts))
            avg_score = dict()
            for top_k in range(1, 11, 1):
                score = []
                for t in range(len(emb_dicts) - 2):  # iterate over the embeddings at every time step
                    model = recommendation(
                        emb_dicts[t],
                        G0=G_dynamic[t],
                        G1=G_dynamic_ori[t + 2],
                        G2=G_dynamic_ori[t + 3],
                        # G3=G_dynamic_ori[t+4]
                    )
                    score.append(model.evaluate_precision_k(top_k))
                avg_score["top_" + str(top_k)] = np.mean(score)

            # "parameter_sensitivity/collaborate_network(2G)/output"
            output_filepath = "parameter_sensitivity/collaborate_network(2G)/recommendation"
            if not os.path.exists(output_filepath):
                os.makedirs(output_filepath)
            output = open(
                os.path.join(output_filepath, file[:-4] + "_G_2ori.txt"), "w")
            output.write(json.dumps(avg_score) + "\n")
            output.close()
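The recommendation class and its evaluate_precision_k method are not shown in this excerpt. A rough, self-contained sketch of the assumed precision@k logic (compare the tp/(tp+fp) fragment in Example #9 below): recommend the top_k most similar nodes by embedding cosine similarity and count a recommendation as correct if that edge exists in the future snapshot.

import numpy as np

def precision_at_k(emb_dict, future_graph, top_k):
    # Assumed logic only; the actual recommendation class may differ.
    nodes = [n for n in emb_dict if n in future_graph]
    vecs = np.array([emb_dict[n] for n in nodes], dtype=float)
    vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)
    sims = vecs @ vecs.T
    np.fill_diagonal(sims, -np.inf)  # never recommend a node to itself
    tp = fp = 0
    for i, node in enumerate(nodes):
        for j in np.argsort(-sims[i])[:top_k]:  # top_k most similar nodes
            if future_graph.has_edge(node, nodes[j]):
                tp += 1
            else:
                fp += 1
    return tp / (tp + fp)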
def loadData2PD(filepath):
    data = load_any_obj_pkl(filepath)[-1]
    X = None
    car_ids = []
    for key, value in data.items():
        car_ids.append(key)
        if X is None:
            X = np.array(value).reshape(1, -1)
        else:
            X = np.vstack((X, value.reshape(1, -1)))
    X = 1.0 * (X - X.mean()) / X.std()
    return pd.DataFrame(X, index=car_ids)

def load_DynWalks_Embedding(method):
    data = load_any_obj_pkl("DynWalks/output/hangzhou_20140301_MCC_" + method +
                            "_embs.pkl")[-1]
    X = None
    car_ids = []
    for key, value in data.items():
        car_ids.append(key)
        if X is None:
            X = np.array(value).reshape(1, -1)
        else:
            X = np.vstack((X, value.reshape(1, -1)))
    X = 1.0 * (X - X.mean()) / X.std()
    return pd.DataFrame(X, index=car_ids)
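Both loaders above grow the matrix one row at a time with np.vstack, which copies the whole array on every iteration. A sketch of an equivalent single-allocation version, assuming every embedding in the dictionary has the same length:

import numpy as np
import pandas as pd

def emb_dict_to_df(data):
    # Equivalent in spirit to loadData2PD / load_DynWalks_Embedding above.
    car_ids = list(data.keys())
    X = np.array([np.ravel(data[k]) for k in car_ids], dtype=float)
    X = (X - X.mean()) / X.std()  # same global z-scoring as the loops above
    return pd.DataFrame(X, index=car_ids)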
Example #5
def draw_graph():
    graphs = load_any_obj_pkl(
        "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")
    year = 2007
    G_s = []
    for g in graphs:
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        G = nx.Graph()
        G.add_nodes_from(g.nodes())
        G.add_edges_from(g.edges())
        # nx.write_gexf(G, "graph_data/collaborate_network(2G)/" + str(year) + ".gexf")
        # year += 1
        G_s.append(G)
    nx.draw(G_s[0], node_size=30, node_color="black", edge_color="gray")
    plt.show()
Example #6
def construct_combined_graph():
    graphs = load_any_obj_pkl(
        "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
    for i in range(2, len(graphs)):
        g0 = graphs[i - 2]
        g1 = graphs[i - 1]
        g = graphs[i]
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        l = []
        for edge in g0.edges():
            if edge not in g.edges():
                n1, n2 = edge[0], edge[1]
                l.append((n1, n2, g0.get_edge_data(n1, n2)['weight']))
                if n1 not in g.nodes():
                    g.add_node(n1, attribute=g0.nodes[n1]["attribute"])
                if n2 not in g.nodes():
                    g.add_node(n2, attribute=g0.nodes[n2]["attribute"])

        for edge in g1.edges():
            if edge not in g.edges():
                n1, n2 = edge[0], edge[1]
                l.append((n1, n2, g1.get_edge_data(n1, n2)['weight']))
                if n1 not in g.nodes():
                    g.add_node(n1, attribute=g1.nodes[n1]["attribute"])
                if n2 not in g.nodes():
                    g.add_node(n2, attribute=g1.nodes[n2]["attribute"])
        g.add_weighted_edges_from(l)
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        # nx.draw(g, node_size=20)
        # plt.show()
        g = g.subgraph(max(nx.connected_components(g), key=len)).copy()
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        filename = "graph_data/collaborate_network_" + str(
            i) + "_edgelist_new.txt"
        nx.write_edgelist(g, filename, data=False)

        save_any_obj_pkl(
            g,
            "graph_data/collaborate_network(3G)" + str(i + 2006) + "_new.pkl")

        graphs.append(g)

    save_any_obj_pkl(graphs,
                     "graph_data/collaborate_network_2008_2016_new.pkl")
def sw_similarity():
    csd = utils.load_any_obj_pkl('data/chem_seq_dict.pkl')
    chem_id = list(csd.keys())
    #for item in chem_id:
    #    if
    #if len(csd[item]) < 5 :
    #    print('get it')
    print(chem_id[0])
    print(chem_id[1])
    match = 2
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(
        scoring)  # you can also choose gap penalties, etc...
    alignment = sw.align(csd[chem_id[0]], csd[chem_id[1]])
    #score = alignment.dump()
    print(alignment.score)
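The comment in sw_similarity notes that gap penalties can also be chosen. A small self-contained variant, assuming swalign's LocalAlignment constructor accepts gap_penalty and gap_extension_penalty keyword arguments (the values below are illustrative only):

import swalign

scoring = swalign.NucleotideScoringMatrix(2, -1)  # match=2, mismatch=-1, as above
sw = swalign.LocalAlignment(scoring, gap_penalty=-8, gap_extension_penalty=-2)
print(sw.align('ACACACTA', 'AGCACACA').score)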
def create_compound_similarity_network_mp(compounds_smiles_path,
                                          species_name='_DB',
                                          worker=4,
                                          top_ratio=0.04):
    compounds_smiles = utils.load_any_obj_pkl(compounds_smiles_path)
    all_compounds = list(compounds_smiles.keys())
    #print(Chem.SanitizeMol('CN(CCO[P@](O)(=O)O[P@@](O)(=O)O[Be-](F)(F)F)C1=CC=CC=C1[N+]([O-])=O'))
    #for item in all_compounds:
    #    m2 = Chem.MolFromSmiles(compounds_smiles[item])
    #    if m2 == None:
    #        print(item)
    #        print(compounds_smiles[item])
    #raise Exception('stop')
    ccd = calculate_molecular_similarity(compounds_smiles, worker=worker)
    all_corr = ccd.parallel_calculate_all_correlation()
    #all_corr = [[str(j) for j in i] for i in all_corr]
    final_corr = []
    for item in all_corr:
        for a_corr in item:
            #print(a_corr)
            final_corr.append(a_corr)
    save_name = 'data/' + 'compound_similarity' + species_name + '.pkl'
    utils.save_any_obj_pkl(final_corr, save_name)
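The calculate_molecular_similarity class (and its parallel_calculate_all_correlation method) is not part of this excerpt. A minimal single-process sketch of what it presumably computes, assuming RDKit fingerprints and the default Tanimoto coefficient of DataStructs.FingerprintSimilarity:

from itertools import combinations
from rdkit import Chem, DataStructs

def pairwise_tanimoto(compounds_smiles):
    # Hypothetical stand-in for the parallel similarity calculation above.
    fps = {}
    for cid, smiles in compounds_smiles.items():
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:  # skip SMILES that RDKit cannot parse
            fps[cid] = Chem.RDKFingerprint(mol)
    similarities = []
    for c1, c2 in combinations(fps, 2):
        similarities.append(
            [c1, c2, DataStructs.FingerprintSimilarity(fps[c1], fps[c2])])
    return similarities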
Example #9
        elif scholars[0] in G2.nodes() and scholars[1] in G2.nodes():
            if scholars in G2.edges():
                tp += 1
            else:
                fp += 1
        # elif scholars[0] in G3.nodes() and scholars[1] in G3.nodes():
        #     if scholars in G2.edges():
        #         tp += 1
        #     else:
        #         fp += 1
    # print(tp, fp)
    # print("recommend_precision_score=", "{:.9f}".format(tp/(tp+fp)))
    return tp / (tp + fp)


G_dynamic0 = load_any_obj_pkl(
    "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
G_dynamic = load_any_obj_pkl(
    "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")

print("计算共同邻居")
cn_list = []
index2node_list = []
for g in G_dynamic:
    nodes = list(g.nodes())
    cn_matrix = np.zeros([len(nodes), len(nodes)])
    index2node = dict()
    for i in range(len(nodes)):
        index2node[i] = nodes[i]
        for j in range(i, len(nodes)):
            cn_matrix[i, j] = len(
                list(nx.common_neighbors(g, nodes[i], nodes[j])))
Example #10
def load_OpenNE_Embedding(method, year):
    sid_emb = dict()
    with open(
            r"output/collaborate_network(2G)/" + method +
            "/collaborate_network_" + str(year) + "_embs.txt",
            "r") as embeddings:
        embeddings.readline()  # skip the first line (file header)
        for embedding in embeddings:
            l = embedding.split()  # each line: node id followed by its embedding values
            sid_emb[l[0]] = [float(n) for n in l[1:]]
    return sid_emb


G_dynamic_ori = load_any_obj_pkl(
    "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
G_dynamic = load_any_obj_pkl(
    "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")

method = "DeepWalk"
if method == "DANRL":
    emb_dicts = load_any_obj_pkl("output/collaborate_network(2G)/" + method +
                                 "/collaborate_network_2007_2016_embs.pkl")
else:
    emb_dicts = []
    for year in range(2007, 2017):
        emb_dicts.append(load_OpenNE_Embedding(method, year))

print(len(emb_dicts))
avg_score = dict()
for top_k in range(1, 11, 1):
                fps1, fps2)
            similarity_info.append([
                self.compounds[compound_index], self.compounds[i],
                simialrity_coefficient
            ])
        return similarity_info


if __name__ == "__main__":
    #create_compound_similarity_network_mp('data/drugbank_drugs.pkl', species_name = '_DB')#60
    #sw_similarity()#cp sa mm
    #Chem.SanitizeMol

    #all_keys = list(target_seqs.keys())
    #a = target_seqs[all_keys[0]]
    #print(type(a))
    #target_IDs = list(target_seqs.keys())
    #print(target_seqs[target_IDs[0]])
    #t1 = target_IDs[0]
    #seqs1 = target_seqs[t1].split('\n')
    #for item in target_IDs:
    #    seqs1 = target_seqs[item]
    #    print(len(seqs1.split('\n')[-3]))

    #create_compound_similarity_network_mp('data/drugbank_drugs.pkl', species_name = '_DB')
    target_seqs = utils.load_any_obj_pkl('data/drugbank_targets.pkl')
    create_target_similarity_network(target_seqs, 'DB_N_')

    #alignment, score, start_end_positions = skbio.alignment.local_pairwise_align_protein(Protein(p1_s), Protein(p2_s))
    #print(score)
Example #12
def main(args):
    # -------- Step 1: prepare data --------
    print(f'Summary of all settings: {args}')
    print('\nStep 1: start loading data ...')
    t1 = time.time()
    G_dynamic = load_any_obj_pkl(args.graph)
    emb_dicts = load_any_obj_pkl(args.emb_file)
    t2 = time.time()
    print(f'Step 1: end loading data; time cost: {(t2-t1):.2f}s')

    # -------- Step 3: downstream task --------
    print('\n\nStep 3: start evaluating embeddings ...')
    t1 = time.time()

    print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))
    if args.task == 'lp' or args.task == 'all':
        from downstream import lpClassifier, gen_test_edge_wrt_changes
        # the size of LP testing data depends on the changes between two consecutive snapshots
        test_edges = []
        test_labels = []
        for t in range(len(G_dynamic) -
                       1):  # changed edges from t to t+1 as testing edges
            pos_edges_with_label, neg_edges_with_label = gen_test_edge_wrt_changes(
                G_dynamic[t], G_dynamic[t + 1], seed=args.seed)
            test_edges.append([e[:2] for e in pos_edges_with_label] +
                              [e[:2] for e in neg_edges_with_label])
            test_labels.append([e[2] for e in pos_edges_with_label] +
                               [e[2] for e in neg_edges_with_label])
        # ====== Changed Link Prediction task (via cos sim) by AUC score ======
        print(
            '--- Start changed link prediction task --> use current emb @t to predict **future** changed links @t+1: '
        )
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print(f'Changed Link Prediction task (via cos sim) by AUC score')
            ds_task = lpClassifier(
                emb_dict=emb_dicts[t]
            )  # emb at t; did not use **future** changed edges
            ds_task.evaluate_auc(
                test_edges[t], test_labels[t]
            )  # evaluate prediction of changed edges from t to t+1
        # ====== Changed Link Prediction task (Weighted-L1 edge_feat --> LR clf) by AUC score ======
        print(
            f'--- start changed link prediction task 1 --> use current emb @t to predict **future** changed links @t+1: '
        )
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print(
                f'Changed Link Prediction task (Weighted-L1 edge_feat --> LR clf) by AUC score'
            )
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init1(G_dynamic[t])
            LR_prev = ds_task.update_LR_auc1(
                test_edges[t], test_labels[t], LR_prev=LR_prev
            )  # incremental learning for Changed LP task; LogisticRegression(random_state=2021, penalty='l2', max_iter=1000)
        # ====== Changed Link Prediction task (Weighted-L2 edge_feat --> LR clf) by AUC score ======
        print(
            f'--- start changed link prediction task 2 --> use current emb @t to predict **future** changed links @t+1: '
        )
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print(
                f'Changed Link Prediction task (Weighted-L2 edge_feat --> LR clf) by AUC score'
            )
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init2(G_dynamic[t])
            LR_prev = ds_task.update_LR_auc2(
                test_edges[t], test_labels[t], LR_prev=LR_prev
            )  # incremental learning for Changed LP task; LogisticRegression(random_state=2021, penalty='l2', max_iter=1000)
        # ====== Changed Link Prediction task (Hadamard edge_feat --> LR clf) by AUC score ======
        print(
            f'--- start changed link prediction task 3 --> use current emb @t to predict **future** changed links @t+1: '
        )
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print(
                f'Changed Link Prediction task (Hadamard edge_feat --> LR clf) by AUC score'
            )
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init3(G_dynamic[t])
            LR_prev = ds_task.update_LR_auc3(
                test_edges[t], test_labels[t], LR_prev=LR_prev
            )  # incremental learning for Changed LP task; LogisticRegression(random_state=2021, penalty='l2', max_iter=1000)
        # ====== Changed Link Prediction task (Average edge_feat --> LR clf) by AUC score ======
        print(
            f'--- start changed link prediction task 4 --> use current emb @t to predict **future** changed links @t+1: '
        )
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print(
                f'Changed Link Prediction task (Average edge_feat --> LR clf) by AUC score'
            )
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init4(G_dynamic[t])
            LR_prev = ds_task.update_LR_auc4(
                test_edges[t], test_labels[t], LR_prev=LR_prev
            )  # incremental learning for Changed LP task; LogisticRegression(random_state=2021, penalty='l2', max_iter=1000)

    print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))
    if args.task == 'nr' or args.task == 'all':
        print(
            f'--- start changed node recommendation task --> use current emb @t to recommend nodes for **future** changed node in graph @t+1: '
        )
        from downstream import nrClassifier, gen_test_node_wrt_changes, align_nodes
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            node_list = gen_test_node_wrt_changes(
                G_dynamic[t], G_dynamic[t + 1]
            )  # generate the testing nodes affected by changes and present in both graphs
            print(
                '# of testing nodes affected by changes and present in both graphs: ',
                len(node_list))
            rc_next_graph_aligned = align_nodes(
                G_dynamic[t], G_dynamic[t + 1]
            )  # remove newly added nodes from G_dynamic[t+1], and add newly removed nodes to G_dynamic[t+1]
            ds_task = nrClassifier(emb_dict=emb_dicts[t],
                                   rc_graph=rc_next_graph_aligned)
            top_k_list = [5, 10, 50, 100]
            ds_task.evaluate_pk_and_apk(top_k_list, node_list)
            # If OOM, try grClassifier_batch (see downstream.py), which is slower but needs much less memory

    print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))
    if args.task == 'gr' or args.task == 'all':
        print(
            f'--- start graph/link reconstruction task --> use current emb @t to reconstruct **current** graph @t: '
        )
        from downstream import grClassifier
        for t in range(
                len(G_dynamic) - 1
        ):  # ignore the last one, so that the length is consistent with Changed LP
            print(f'Current time step @t: {t}')
            all_nodes = list(G_dynamic[t].nodes())
            if len(all_nodes) <= 10000:
                node_list = None  # testing all nodes
                print('# testing all nodes in the current graph')
            else:
                node_list = list(
                    np.random.choice(all_nodes, 10000, replace=False))
                print(
                    '# current graph is too large -> randomly sample 10000 testing nodes: ',
                    len(node_list))
            ds_task = grClassifier(emb_dict=emb_dicts[t],
                                   rc_graph=G_dynamic[t])
            top_k_list = [5, 10, 50, 100]
            ds_task.evaluate_pk_and_apk(top_k_list, node_list)
            # If OOM, try grClassifier_batch (see downstream.py), which is slower but needs much less memory

    t2 = time.time()
    print(f'Step 3: end evaluating; time cost: {(t2-t1):.2f}s')
    print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))
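The four changed-link-prediction blocks above differ only in how a pair of node embeddings is turned into an edge feature for the logistic-regression classifier. The lpClassifier internals are not shown in this excerpt, but the four operators named in the log messages have standard definitions, sketched here for reference:

import numpy as np

# Standard edge-feature operators applied to two node embeddings z_u, z_v.
def weighted_l1(z_u, z_v):
    return np.abs(z_u - z_v)

def weighted_l2(z_u, z_v):
    return (z_u - z_v) ** 2

def hadamard(z_u, z_v):
    return z_u * z_v

def average(z_u, z_v):
    return (z_u + z_v) / 2.0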