def setUp(self):
    infile = os.path.join(os.path.dirname(__file__), 'data', 'small_het_graph.txt')
    g = CSFGraph(infile)
    self.graph = g
    self.nodes = g.nodes()
    self.g1index = self.__get_index('g1')
    self.d1index = self.__get_index('d1')
def disease_link_prediction(positive_training_file, positive_test_file,
                            negative_training_file, negative_test_file,
                            embedded_graph, edge_embedding_method):
    """
    Predict disease links.
    """
    training_graph = CSFGraph(positive_training_file)
    test_graph = CSFGraph(positive_test_file)
    negative_training_graph = CSFGraph(negative_training_file)
    negative_test_graph = CSFGraph(negative_test_file)
    lp = LinkPrediction(training_graph, test_graph,
                        negative_training_graph, negative_test_graph,
                        embedded_graph,
                        edge_embedding_method=edge_embedding_method)
    lp.predict_links()
    lp.output_Logistic_Reg_results()
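# Usage sketch for disease_link_prediction. The file names below are placeholders,
# not paths shipped with this code; "hadamard" simply echoes the edge_embedding_method
# used in the karate example further down.
disease_link_prediction("pos_train.edges", "pos_test.edges",
                        "neg_train.edges", "neg_test.edges",
                        "disease.embedded",
                        edge_embedding_method="hadamard")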
def make_negative_edge_file(filename: str, num_edges_to_make: int,
                            pos_train_graph: CSFGraph,
                            pos_test_graph: CSFGraph) -> bool:
    edge_count = 0
    with open(filename, 'w') as neg_train_fh, \
            tqdm(total=num_edges_to_make) as pbar:
        while edge_count < num_edges_to_make:
            node1_name = pos_train_graph.index_to_node_map[random_node(pos_train_graph)]
            node2_name = pos_train_graph.index_to_node_map[random_node(pos_train_graph)]
            # keep the pair only if it is not an edge in either positive graph
            if not pos_train_graph.has_edge(node1_name, node2_name) and \
                    not pos_test_graph.has_edge(node1_name, node2_name):
                neg_train_fh.write("\t".join([node1_name, node2_name, "1"]) + "\n")
                edge_count = edge_count + 1
                pbar.update(1)  # advance the progress bar once per accepted negative edge
    return True
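# Usage sketch for make_negative_edge_file (paths and the edge count are
# illustrative): write 1000 random node pairs that are edges in neither the
# positive training graph nor the positive test graph.
pos_train_graph = CSFGraph("data/pos_train.edges")
pos_test_graph = CSFGraph("data/pos_test.edges")
make_negative_edge_file("data/neg_train.edges", 1000,
                        pos_train_graph, pos_test_graph)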
def main(args):
    """
    The input files are positive training, positive test, negative training and
    negative test edges. The code reads the files and creates graphs in CSFGraph
    format. Then, the positive training graph is embedded. Finally, link
    prediction is performed.
    :param args: parameters of node2vec and link prediction
    :return: Result of link prediction
    """
    print("[INFO]: p={}, q={}, classifier={}, useGamma={}, word2vec_model={}".format(
        args.p, args.q, args.classifier, args.useGamma, args.w2v_model))
    pos_train = os.path.join("data", "pos_train.edges")
    pos_test = os.path.join("data", "pos_test.edges")
    neg_train = os.path.join("data", "neg_train.edges")
    neg_test = os.path.join("data", "neg_test.edges")
    if args.make_edge_files:
        logging.info("Remaking edge files")
        make_train_test_files(upheno_graph=args.upheno_graph,
                              equiv_phenotypes=args.equivalent_phenotypes,
                              weight_multiplier=args.weight_multiplier,
                              pos_train=pos_train, pos_test=pos_test,
                              neg_train=neg_train, neg_test=neg_test)
    else:
        logging.info("Using existing edge files")
    pos_train_graph = CSFGraph(pos_train)
    pos_test_graph = CSFGraph(pos_test)
    neg_train_graph = CSFGraph(neg_train)
    neg_test_graph = CSFGraph(neg_test)
    pos_train_g = xn2v.hetnode2vec.N2vGraph(pos_train_graph, args.p, args.q,
                                            args.gamma, args.useGamma)
    walks = pos_train_g.simulate_walks(args.num_walks, args.walk_length)
    learn_embeddings(walks, pos_train_graph, args.w2v_model)
    linkpred(pos_train_graph, pos_test_graph, neg_train_graph, neg_test_graph)
def disease_gene_embeddings(training_file, output_file, p, q, gamma, use_gamma,
                            walk_length, num_walks, dimensions, window_size,
                            workers, num_steps, display_step):
    """
    Generate disease gene embeddings.
    """
    logging.basicConfig(level=logging.INFO)
    print("Reading training file %s" % training_file)
    training_graph = CSFGraph(training_file)
    print(training_graph)
    training_graph.print_edge_type_distribution()
    hetgraph = xn2v.hetnode2vec.N2vGraph(training_graph, p, q, gamma, use_gamma)
    walks = hetgraph.simulate_walks(num_walks, walk_length)
    worddictionary = training_graph.get_node_to_index_map()
    reverse_worddictionary = training_graph.get_index_to_node_map()
    # map node names in each walk to their integer node indices
    numberwalks = []
    for w in walks:
        nwalk = []
        for node in w:
            i = worddictionary[node]
            nwalk.append(i)
        numberwalks.append(nwalk)
    model = SkipGramWord2Vec(numberwalks,
                             worddictionary=worddictionary,
                             reverse_worddictionary=reverse_worddictionary,
                             num_steps=num_steps)
    model.train(display_step=display_step)
    model.write_embeddings(output_file)
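# Usage sketch for disease_gene_embeddings. The file names are placeholders and
# the hyperparameters simply echo values used elsewhere in these scripts
# (p=q=gamma=1, walk_length=80, dimensions=128, window_size=10, workers=8);
# they are not prescribed defaults.
disease_gene_embeddings(training_file="disease_gene.edges",
                        output_file="disease.embedded",
                        p=1, q=1, gamma=1, use_gamma=False,
                        walk_length=80, num_walks=25,
                        dimensions=128, window_size=10, workers=8,
                        num_steps=1000, display_step=100)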
def karate_test(training_file, test_file, output_file, p, q, gamma, use_gamma,
                walk_length, num_walks):
    training_graph = CSFGraph(training_file)
    hetgraph = xn2v.hetnode2vec.N2vGraph(training_graph, p, q, gamma, use_gamma)
    walks = hetgraph.simulate_walks(num_walks, walk_length)
    worddictionary = training_graph.get_node_to_index_map()
    reverse_worddictionary = training_graph.get_index_to_node_map()
    numberwalks = []
    for w in walks:
        nwalk = []
        for node in w:
            i = worddictionary[node]
            nwalk.append(i)
        numberwalks.append(nwalk)
    model = SkipGramWord2Vec(numberwalks, worddictionary=worddictionary,
                             reverse_worddictionary=reverse_worddictionary,
                             num_steps=1000)
    model.train(display_step=100)
    output_filename = 'karate.embedded'
    model.write_embeddings(output_filename)

    test_graph = CSFGraph(test_file)
    path_to_embedded_graph = output_filename
    parameters = {'edge_embedding_method': "hadamard",
                  'portion_false_edges': 1}
    lp = LinkPrediction(training_graph, test_graph, path_to_embedded_graph,
                        params=parameters)
    # TODO: modify this part to work with new link prediction
    lp.predict_links()
    lp.output_Logistic_Reg_results()
def test_embedding(self):
    training_file = os.path.join(os.path.dirname(__file__), 'data', 'karate.train')
    output_file = os.path.join(os.path.dirname(__file__), 'data', 'disease.embedded')
    training_graph = CSFGraph(training_file)
    training_graph.print_edge_type_distribution()
    p = 1
    q = 1
    gamma = 1
    useGamma = False
    hetgraph = xn2v.hetnode2vec.N2vGraph(training_graph, p, q, gamma, useGamma)
    walk_length = 80
    num_walks = 25
    walks = hetgraph.simulate_walks(num_walks, walk_length)
    worddictionary = training_graph.get_node_to_index_map()
    reverse_worddictionary = training_graph.get_index_to_node_map()
    numberwalks = []
    for w in walks:
        nwalk = []
        for node in w:
            i = worddictionary[node]
            nwalk.append(i)
        numberwalks.append(nwalk)
    model = SkipGramWord2Vec(numberwalks, worddictionary=worddictionary,
                             reverse_worddictionary=reverse_worddictionary,
                             num_steps=100)
    model.train(display_step=10)
    model.write_embeddings(output_file)
os.environ.get("LOGFILE", "link_prediction.log")) formatter = logging.Formatter( '%(asctime)s - %(levelname)s -%(filename)s:%(lineno)d - %(message)s') handler.setFormatter(formatter) log = logging.getLogger() log.setLevel(os.environ.get("LOGLEVEL", "DEBUG")) log.addHandler(handler) pos_training_file = os.path.join(os.path.dirname(__file__), 'pos_train_edges') pos_test_file = os.path.join(os.path.dirname(__file__), 'pos_test_edges') neg_test_file = os.path.join(os.path.dirname(__file__), 'neg_test_edges') neg_training_file = os.path.join(os.path.dirname(__file__), 'neg_training_edges') pos_train_graph = CSFGraph(pos_training_file) pos_test_graph = CSFGraph(pos_test_file) neg_train_graph = CSFGraph(neg_training_file) neg_test_graph = CSFGraph(neg_test_file) p = 1 q = 1 gamma = 1 useGamma = False hetgraph = xn2v.hetnode2vec.N2vGraph(pos_train_graph, p, q, gamma, useGamma) walk_length = 80 num_walks = 100 dimensions = 128 window_size = 10 workers = 8
def setUp(self):
    inputfile = os.path.join(os.path.dirname(__file__), 'data',
                             'unweighted_small_graph.txt')
    g = CSFGraph(inputfile)
    self.g = g
    str(g)  # exercise __str__ on the graph
def random_node(graph: CSFGraph) -> int:
    # draw a node index uniformly from [0, node_count)
    return int(numpy.random.uniform(0, graph.node_count()))
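# Illustrative use of random_node (the edge file path is a placeholder): draw a
# random node index from a graph and map it back to a node name via
# index_to_node_map, mirroring what make_negative_edge_file does above.
demo_graph = CSFGraph("data/pos_train.edges")
demo_index = random_node(demo_graph)
demo_name = demo_graph.index_to_node_map[demo_index]
print(demo_index, demo_name)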
def make_train_test_files(upheno_graph, equiv_phenotypes, weight_multiplier: int,
                          pos_train: str, pos_test: str,
                          neg_train: str, neg_test: str,
                          test_fraction: float = 0.2):
    """
    Read in equivalent phenotypes, split them into train/test (using
    test_fraction), then write out:
        pos_train (upheno_graph + train equivalent phenotypes)
        pos_test (test equivalent phenotypes)
        neg_train (random edges connecting nodes not connected in upheno_graph)
        neg_test (random edges connecting nodes not connected in equiv_phenotypes)
    :param upheno_graph: file containing all edges from upheno (except equivalent phenotypes)
    :param equiv_phenotypes: file containing equivalent phenotype edges, with weights
    :param weight_multiplier: factor to multiply weight of phenotype links
    :param pos_train: filename to write out pos train edges
    :param pos_test: filename to write out pos test edges
    :param neg_train: filename to write out neg train edges
    :param neg_test: filename to write out neg test edges
    :param test_fraction: fraction of equiv_phenotypes to use for testing (default 0.2)
    :return: True on success
    """
    curie_map = make_iri_to_curie_map()

    # write out pos_train and pos_test:
    # first split equiv phenotype edges into train/test and write out positive edges
    logging.info("Making positive train and positive test files")
    with open(equiv_phenotypes.name, 'r') as equiv_fh, \
            open(pos_train, 'w') as pos_train_fh, \
            open(pos_test, 'w') as pos_test_fh:
        for line in equiv_fh:
            r = random.random()
            items = line.rstrip().split("\t")
            items[0] = curieize(items[0], curie_map)
            items[1] = curieize(items[1], curie_map)
            # default edge weight for known equivalent phenotypes
            if len(items) < 3:
                items.append("1")
            items[2] = str(float(items[2]) * weight_multiplier)
            outline = "\t".join(items) + "\n"
            if r > test_fraction:
                pos_train_fh.write(outline)
            else:
                pos_test_fh.write(outline)

    # append upheno graph to pos_train edges:
    with open(pos_train, 'a') as pos_train_append_fh, \
            open(upheno_graph.name, 'r') as upheno_graph_fh:
        for line in upheno_graph_fh:
            # turn <IRI:1234> into CURIE:1234
            (item1, item2) = line.strip().split(" ")
            item1 = curieize(item1, curie_map)
            item2 = curieize(item2, curie_map)
            pos_train_append_fh.write("\t".join([item1, item2, "1"]) + "\n")

    logging.info("Loading CSFGraphs from positive train and positive test edge files")
    pos_train_graph = CSFGraph(pos_train)
    pos_test_graph = CSFGraph(pos_test)

    # make negative edges
    logging.info("Making negative training edges file")
    make_negative_edge_file(neg_train, pos_train_graph.edge_count(),
                            pos_train_graph, pos_test_graph)
    logging.info("Making negative test edges file")
    make_negative_edge_file(neg_test, pos_test_graph.edge_count(),
                            pos_train_graph, pos_test_graph)
    logging.info("Loading CSFGraphs from negative train and test edge file")
    return True
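# Usage sketch for make_train_test_files. upheno_graph and equiv_phenotypes are
# read via their .name attribute above, so open file handles (for example, the
# objects produced by argparse.FileType) are assumed; the paths and the
# weight_multiplier value are placeholders.
with open("upheno_edges.txt") as upheno_fh, \
        open("equiv_phenotypes.txt") as equiv_fh:
    make_train_test_files(upheno_graph=upheno_fh,
                          equiv_phenotypes=equiv_fh,
                          weight_multiplier=10,
                          pos_train="data/pos_train.edges",
                          pos_test="data/pos_test.edges",
                          neg_train="data/neg_train.edges",
                          neg_test="data/neg_test.edges",
                          test_fraction=0.2)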
def setUp(self):
    inputfile = os.path.join(os.path.dirname(__file__), 'data', 'small_graph.txt')
    g = CSFGraph(inputfile)
    self.graph = g
import os

import xn2v
from xn2v import CSFGraph
from xn2v.word2vec import SkipGramWord2Vec

training_file = os.path.join(os.path.dirname(__file__), 'pos_train_edges')
g = CSFGraph(training_file)

p = 1
q = 1
gamma = 1
useGamma = False
graph = xn2v.hetnode2vec.N2vGraph(g, p, q, gamma, useGamma)

walk_length = 80
num_walks = 100
walks = graph.simulate_walks(num_walks, walk_length)

dimensions = 128
window_size = 10
workers = 8

worddictionary = g.get_node_to_index_map()
reverse_worddictionary = g.get_index_to_node_map()

# map node names in each walk to their integer node indices
walks_integer_nodes = []
for w in walks:
    nwalk = []
    for node in w:
        i = worddictionary[node]
        nwalk.append(i)
    walks_integer_nodes.append(nwalk)