Beispiel #1
0
 def setUp(self):
     """
     Load ``data/small_het_graph.txt`` (relative to this module) into a
     CSFGraph and cache the values the tests rely on: the graph itself,
     the result of its ``nodes()`` call, and the values returned by the
     private ``__get_index`` helper for ``'g1'`` and ``'d1'``.
     """
     module_dir = os.path.dirname(__file__)
     graph_path = os.path.join(module_dir, 'data', 'small_het_graph.txt')
     het_graph = CSFGraph(graph_path)
     self.graph = het_graph
     self.nodes = het_graph.nodes()
     # The helper is called only after graph and nodes are set on self.
     self.g1index = self.__get_index('g1')
     self.d1index = self.__get_index('d1')
Beispiel #2
0
def disease_link_prediction(positive_training_file, positive_test_file,
                            negative_training_file, negative_test_file,
                            embedded_graph, edge_embedding_method):
    """
    Run link prediction on four edge files and output the results.

    Each edge file is loaded into a CSFGraph (positive training, positive
    test, negative training, negative test, in that order). The graphs are
    passed to LinkPrediction together with the embedded graph, links are
    predicted and the logistic regression results are output.

    :param positive_training_file: file with positive training edges
    :param positive_test_file: file with positive test edges
    :param negative_training_file: file with negative training edges
    :param negative_test_file: file with negative test edges
    :param embedded_graph: passed unchanged to LinkPrediction
    :param edge_embedding_method: passed to LinkPrediction as the keyword
        argument of the same name
    """
    edge_files = (positive_training_file, positive_test_file,
                  negative_training_file, negative_test_file)
    # Graphs are built in the same order as the files are listed above.
    pos_train, pos_test, neg_train, neg_test = [
        CSFGraph(edge_file) for edge_file in edge_files
    ]

    predictor = LinkPrediction(pos_train, pos_test, neg_train, neg_test,
                               embedded_graph,
                               edge_embedding_method=edge_embedding_method)
    predictor.predict_links()
    predictor.output_Logistic_Reg_results()
Beispiel #3
0
def make_negative_edge_file(filename: str, num_edges_to_make: int,
                            pos_train_graph: CSFGraph,
                            pos_test_graph: CSFGraph) -> bool:
    """
    Write randomly sampled negative edges to a tab-separated file.

    Two node indices are drawn with ``random_node`` from ``pos_train_graph``
    and translated to node names through its ``index_to_node_map``. The pair
    is written as ``node1<TAB>node2<TAB>1`` only if neither
    ``pos_train_graph`` nor ``pos_test_graph`` reports an edge between the two
    nodes. Sampling repeats until ``num_edges_to_make`` lines were written.

    Pairs are not de-duplicated and nothing here prevents a node from being
    paired with itself.

    :param filename: path of the file to (over)write
    :param num_edges_to_make: number of negative edges to write
    :param pos_train_graph: graph the nodes are sampled from; its edges are
        excluded
    :param pos_test_graph: graph whose edges are excluded as well
    :return: True once all edges have been written
    """
    edges_written = 0
    # The bar tracks edges actually written, so its total is the requested
    # number of edges and it advances by one per accepted pair.
    with open(filename, 'w') as neg_edge_fh, \
            tqdm(total=num_edges_to_make) as pbar:
        while edges_written < num_edges_to_make:
            node1_name = pos_train_graph.index_to_node_map[random_node(
                pos_train_graph)]
            node2_name = pos_train_graph.index_to_node_map[random_node(
                pos_train_graph)]

            # Reject pairs that are a known positive edge in either graph.
            if pos_train_graph.has_edge(node1_name, node2_name) or \
               pos_test_graph.has_edge(node1_name, node2_name):
                continue

            neg_edge_fh.write("\t".join([node1_name, node2_name, "1"]) + "\n")
            edges_written += 1
            pbar.update(1)
    return True
Beispiel #4
0
def main(args):
    """
    Run the pipeline: optional edge-file creation, embedding, link prediction.

    The positive/negative train/test edge files live at fixed paths under
    ``data/``. When ``args.make_edge_files`` is set they are regenerated with
    make_train_test_files, otherwise the existing files are used. The four
    files are loaded as CSFGraphs, walks are simulated on the positive
    training graph, embeddings are learned from the walks and link prediction
    is run on the four graphs.

    :param args: options object; the attributes read here are p, q, gamma,
        useGamma, classifier, w2v_model, num_walks, walk_length,
        make_edge_files and, when edge files are remade, upheno_graph,
        equivalent_phenotypes and weight_multiplier
    :return: None
    """
    banner = "[INFO]: p={}, q={}, classifier= {}, useGamma={}, word2vec_model={}"
    print(banner.format(args.p, args.q, args.classifier, args.useGamma,
                        args.w2v_model))

    pos_train, pos_test, neg_train, neg_test = (
        "data/pos_train.edges",
        "data/pos_test.edges",
        "data/neg_train.edges",
        "data/neg_test.edges",
    )

    if not args.make_edge_files:
        logging.info("Using existing edge files")
    else:
        logging.info("Remaking edge files")
        make_train_test_files(upheno_graph=args.upheno_graph,
                              equiv_phenotypes=args.equivalent_phenotypes,
                              weight_multiplier=args.weight_multiplier,
                              pos_train=pos_train,
                              pos_test=pos_test,
                              neg_train=neg_train,
                              neg_test=neg_test)

    # Load the graphs in the order pos_train, pos_test, neg_train, neg_test.
    pos_train_graph, pos_test_graph, neg_train_graph, neg_test_graph = [
        CSFGraph(edge_file)
        for edge_file in (pos_train, pos_test, neg_train, neg_test)
    ]

    walk_graph = xn2v.hetnode2vec.N2vGraph(pos_train_graph, args.p, args.q,
                                           args.gamma, args.useGamma)
    simulated_walks = walk_graph.simulate_walks(args.num_walks,
                                                args.walk_length)
    learn_embeddings(simulated_walks, pos_train_graph, args.w2v_model)
    linkpred(pos_train_graph, pos_test_graph, neg_train_graph, neg_test_graph)
Beispiel #5
0
def disease_gene_embeddings(training_file, output_file, p, q, gamma, use_gamma,
                            walk_length, num_walks, dimensions, window_size,
                            workers, num_steps, display_step):
    """
    Train skip-gram embeddings on walks over a training graph and save them.

    The training file is loaded as a CSFGraph (the graph and its edge type
    distribution are printed), walks are simulated with N2vGraph, every walk
    is translated to integer indices through the graph's node-to-index map,
    and a SkipGramWord2Vec model is trained on them. The embeddings are
    written to ``output_file``.

    ``dimensions``, ``window_size`` and ``workers`` are accepted but not used
    by this function.

    :param training_file: edge file to load as the training graph
    :param output_file: destination passed to ``write_embeddings``
    :param p: passed to N2vGraph
    :param q: passed to N2vGraph
    :param gamma: passed to N2vGraph
    :param use_gamma: passed to N2vGraph
    :param walk_length: second argument of ``simulate_walks``
    :param num_walks: first argument of ``simulate_walks``
    :param num_steps: passed to SkipGramWord2Vec
    :param display_step: passed to ``train``
    """
    logging.basicConfig(level=logging.INFO)
    print("Reading training file %s" % training_file)
    graph = CSFGraph(training_file)
    print(graph)
    graph.print_edge_type_distribution()

    walker = xn2v.hetnode2vec.N2vGraph(graph, p, q, gamma, use_gamma)
    walks = walker.simulate_walks(num_walks, walk_length)
    node_to_index = graph.get_node_to_index_map()
    index_to_node = graph.get_index_to_node_map()

    # Replace every node in every walk by its integer index.
    indexed_walks = [[node_to_index[node] for node in walk] for walk in walks]

    skipgram = SkipGramWord2Vec(indexed_walks,
                                worddictionary=node_to_index,
                                reverse_worddictionary=index_to_node,
                                num_steps=num_steps)
    skipgram.train(display_step=display_step)
    skipgram.write_embeddings(output_file)
Beispiel #6
0
def karate_test(training_file, test_file, output_file, p, q, gamma, use_gamma,
                walk_length, num_walks):
    """
    Embed a training graph and run link prediction against a test graph.

    Walks are simulated on the training graph, translated to integer indices
    and used to train a SkipGramWord2Vec model (1000 steps, progress shown
    every 100). The embeddings are written to ``karate.embedded`` and that
    file is handed to LinkPrediction together with the training and test
    graphs.

    :param training_file: edge file loaded as the training graph
    :param test_file: edge file loaded as the test graph
    :param output_file: not used; embeddings always go to ``karate.embedded``
    :param p: passed to N2vGraph
    :param q: passed to N2vGraph
    :param gamma: passed to N2vGraph
    :param use_gamma: passed to N2vGraph
    :param walk_length: second argument of ``simulate_walks``
    :param num_walks: first argument of ``simulate_walks``
    """
    train_graph = CSFGraph(training_file)
    walker = xn2v.hetnode2vec.N2vGraph(train_graph, p, q, gamma, use_gamma)
    walks = walker.simulate_walks(num_walks, walk_length)

    node_to_index = train_graph.get_node_to_index_map()
    index_to_node = train_graph.get_index_to_node_map()

    # Replace every node in every walk by its integer index.
    indexed_walks = [[node_to_index[node] for node in walk] for walk in walks]

    skipgram = SkipGramWord2Vec(indexed_walks,
                                worddictionary=node_to_index,
                                reverse_worddictionary=index_to_node,
                                num_steps=1000)
    skipgram.train(display_step=100)
    # NOTE(review): the output_file argument is ignored in favour of this
    # fixed file name - confirm whether that is intended.
    embedding_path = 'karate.embedded'
    skipgram.write_embeddings(embedding_path)

    held_out_graph = CSFGraph(test_file)
    link_params = dict(edge_embedding_method="hadamard",
                       portion_false_edges=1)

    # TODO: modify this part to work with new link prediction
    predictor = LinkPrediction(train_graph, held_out_graph, embedding_path,
                               params=link_params)

    predictor.predict_links()
    predictor.output_Logistic_Reg_results()
    def test_embedding(self):
        """
        Train embeddings on ``data/karate.train`` and write them to
        ``data/disease.embedded`` (both relative to this module).

        Walks (25 walks of length 80, with p = q = gamma = 1 and gamma
        disabled) are simulated on the training graph, translated to integer
        indices and fed to a SkipGramWord2Vec model trained for 100 steps.
        """
        data_dir = os.path.join(os.path.dirname(__file__), 'data')
        training_file = os.path.join(data_dir, 'karate.train')
        output_file = os.path.join(data_dir, 'disease.embedded')

        graph = CSFGraph(training_file)
        graph.print_edge_type_distribution()

        p, q, gamma = 1, 1, 1
        useGamma = False
        walker = xn2v.hetnode2vec.N2vGraph(graph, p, q, gamma, useGamma)

        walk_length, num_walks = 80, 25
        walks = walker.simulate_walks(num_walks, walk_length)

        node_to_index = graph.get_node_to_index_map()
        index_to_node = graph.get_index_to_node_map()

        # Replace every node in every walk by its integer index.
        indexed_walks = [[node_to_index[node] for node in walk]
                         for walk in walks]

        skipgram = SkipGramWord2Vec(indexed_walks,
                                    worddictionary=node_to_index,
                                    reverse_worddictionary=index_to_node,
                                    num_steps=100)
        skipgram.train(display_step=10)
        skipgram.write_embeddings(output_file)
    os.environ.get("LOGFILE", "link_prediction.log"))
# Log record layout: timestamp, level, source file and line number, message.
formatter = logging.Formatter(
    '%(asctime)s - %(levelname)s -%(filename)s:%(lineno)d - %(message)s')
# NOTE(review): `handler` is created before this point (not visible here).
handler.setFormatter(formatter)
# Configure the root logger; the LOGLEVEL environment variable overrides the
# DEBUG default.
log = logging.getLogger()
log.setLevel(os.environ.get("LOGLEVEL", "DEBUG"))
log.addHandler(handler)

# Edge files are looked up in the directory that contains this script.
pos_training_file = os.path.join(os.path.dirname(__file__), 'pos_train_edges')
pos_test_file = os.path.join(os.path.dirname(__file__), 'pos_test_edges')

# NOTE(review): the negative training file is named 'neg_training_edges'
# while the positive one is 'pos_train_edges' - confirm the names on disk.
neg_test_file = os.path.join(os.path.dirname(__file__), 'neg_test_edges')
neg_training_file = os.path.join(os.path.dirname(__file__),
                                 'neg_training_edges')

# Load each edge file into a CSFGraph.
pos_train_graph = CSFGraph(pos_training_file)
pos_test_graph = CSFGraph(pos_test_file)
neg_train_graph = CSFGraph(neg_training_file)
neg_test_graph = CSFGraph(neg_test_file)

# Build the walk graph from the positive training graph with
# p = q = gamma = 1 and useGamma switched off.
p = 1
q = 1
gamma = 1
useGamma = False
hetgraph = xn2v.hetnode2vec.N2vGraph(pos_train_graph, p, q, gamma, useGamma)

# Settings that are not used within the lines shown here; presumably consumed
# by the walk simulation / embedding code that follows - TODO confirm.
walk_length = 80
num_walks = 100
dimensions = 128
window_size = 10
workers = 8
Beispiel #9
0
 def setUp(self):
     """
     Load ``data/unweighted_small_graph.txt`` (relative to this module) into
     a CSFGraph and store it as ``self.g``.
     """
     module_dir = os.path.dirname(__file__)
     graph_path = os.path.join(module_dir, 'data',
                               'unweighted_small_graph.txt')
     loaded_graph = CSFGraph(graph_path)
     self.g = loaded_graph
     # The resulting string is discarded; this only invokes str() on the graph.
     str(loaded_graph)
Beispiel #10
0
def random_node(graph: CSFGraph) -> int:
    """
    Pick a random node index for ``graph``.

    One float is drawn uniformly from ``[0, graph.node_count())`` and
    truncated to an integer.

    :param graph: graph whose ``node_count()`` gives the (exclusive) upper
        bound
    :return: the truncated sample as a plain Python int
    """
    # Without a ``size`` argument uniform() returns a scalar. The previous
    # ``size=1`` form produced a one-element array, and calling int() on such
    # an array is deprecated since NumPy 1.25.
    return int(numpy.random.uniform(0, graph.node_count()))
Beispiel #11
0
def make_train_test_files(upheno_graph,
                          equiv_phenotypes,
                          weight_multiplier: int,
                          pos_train: str,
                          pos_test: str,
                          neg_train: str,
                          neg_test: str,
                          test_fraction: float = 0.2):
    """
    Read in equivalent phenotypes, split them into train/test (using
    test_fraction), then write out:
    pos_train (train equivalent phenotypes followed by the upheno_graph edges)
    pos_test (test equivalent phenotypes)
    neg_train (random edges from make_negative_edge_file, as many as there are
    edges in the pos_train graph)
    neg_test (random edges from make_negative_edge_file, as many as there are
    edges in the pos_test graph)

    Blank lines in either input file are skipped.

    :param upheno_graph: object whose ``name`` attribute is the path of the
        file containing all edges from upheno (except equivalent phenotypes),
        one space-separated node pair per line
    :param equiv_phenotypes: object whose ``name`` attribute is the path of
        the tab-separated file containing equivalent phenotype edges, with an
        optional weight in the third column (defaults to 1)
    :param weight_multiplier: factor to multiply weight of phenotype links
    :param pos_train: filename to write out pos train edges
    :param pos_test: filename to write out pos test edges
    :param neg_train: filename to write neg train edges
    :param neg_test: filename to write neg test edges
    :param test_fraction: what fraction of equiv_phenotypes should be used
        for testing (default 0.2)

    :return: True once all four edge files have been written
    """

    curie_map = make_iri_to_curie_map()

    # First split the equivalent phenotype edges into train/test and write
    # them out as positive edges.
    logging.info("Making positive train and positive test files")
    with open(equiv_phenotypes.name, 'r') as equiv_fh, \
            open(pos_train, 'w') as pos_train_fh, \
            open(pos_test, 'w') as pos_test_fh:
        for line in equiv_fh:
            stripped = line.rstrip()
            if not stripped:
                # Nothing to parse on a blank line.
                continue

            r = random.random()
            items = stripped.split("\t")
            items[0] = curieize(items[0], curie_map)
            items[1] = curieize(items[1], curie_map)

            # default edge weight for known equivalent phenotypes
            if len(items) < 3:
                items.append("1")
            items[2] = str(float(items[2]) * weight_multiplier)

            outline = "\t".join(items) + "\n"
            if r > test_fraction:
                pos_train_fh.write(outline)
            else:
                pos_test_fh.write(outline)

    # The with-block above has closed (and flushed) pos_train, so it can now
    # be reopened in append mode to add the upheno graph edges.
    with open(pos_train, 'a') as pos_train_append_fh, \
            open(upheno_graph.name, 'r') as upheno_graph_fh:
        for line in upheno_graph_fh:
            stripped = line.strip()
            if not stripped:
                continue

            # turn <IRI:1234> into CURIE:1234
            item1, item2 = stripped.split(" ")
            item1 = curieize(item1, curie_map)
            item2 = curieize(item2, curie_map)

            pos_train_append_fh.write("\t".join([item1, item2, "1"]) + "\n")

    logging.info(
        "Loading CSFGraphs from positive train and positive test edge files")
    pos_train_graph = CSFGraph(pos_train)
    pos_test_graph = CSFGraph(pos_test)

    # make negative edges
    logging.info("Making negative training edges file")
    make_negative_edge_file(neg_train, pos_train_graph.edge_count(),
                            pos_train_graph, pos_test_graph)

    logging.info("Making negative test edges file")
    make_negative_edge_file(neg_test, pos_test_graph.edge_count(),
                            pos_train_graph, pos_test_graph)

    logging.info("Loading CSFGraphs from negative train and test edge file")
    return True
Beispiel #12
0
 def setUp(self):
     """
     Load ``data/small_graph.txt`` (relative to this module) into a CSFGraph
     and store it as ``self.graph``.
     """
     module_dir = os.path.dirname(__file__)
     graph_path = os.path.join(module_dir, 'data', 'small_graph.txt')
     self.graph = CSFGraph(graph_path)
import xn2v
from xn2v import CSFGraph
from xn2v.word2vec import SkipGramWord2Vec
import os

# The training edge file is looked up in the directory containing this script.
training_file = os.path.join(os.path.dirname(__file__), 'pos_train_edges')

# Load the edge file into a CSFGraph.
g = CSFGraph(training_file)

# Build the walk graph with p = q = gamma = 1 and useGamma switched off.
p = 1
q = 1
gamma = 1
useGamma = False
graph = xn2v.hetnode2vec.N2vGraph(g, p, q, gamma, useGamma)

# Simulate walks: num_walks and walk_length are passed to simulate_walks.
walk_length = 80
num_walks = 100
walks = graph.simulate_walks(num_walks, walk_length)
# Not used within the lines shown here; presumably settings for the
# embedding step that follows - TODO confirm.
dimensions = 128
window_size = 10
workers = 8

# Lookup tables between nodes and integer indices, taken from the CSFGraph.
worddictionary = g.get_node_to_index_map()
reverse_worddictionary = g.get_index_to_node_map()

walks_integer_nodes = []
for w in walks:
    nwalk = []
    for node in w:
        i = worddictionary[node]
        nwalk.append(i)