Ejemplo n.º 1
0
    def setUp(self):
        """Build a CBOW model from the ppismall training edges and make a temp dir.

        Sets ``self.data_dir``, ``self.reverse_worddictionary``, ``self.model``
        and ``self.temp_dir_loc``. The model is only constructed here; this
        fixture does not call ``train()`` on it.
        """
        # Sample data lives in the 'data' directory next to this test module.
        self.data_dir = os.path.join(os.path.dirname(__file__), 'data')
        pos_train = os.path.abspath(
            os.path.join(self.data_dir, 'ppismall', 'pos_train_edges'))

        # Read the edges into a graph and fetch its node <-> index maps.
        training_graph = CSFGraph(pos_train)
        worddictionary = training_graph.get_node_to_index_map()
        self.reverse_worddictionary = training_graph.get_index_to_node_map()

        # Generate random walks.
        # NOTE(review): positional arguments are presumably (p, q) = (1, 1) and
        # (num_walks, walk_length) = (5, 10) -- confirm against N2vGraph.
        n2v_graph = N2vGraph(training_graph, 1, 1)
        walks = n2v_graph.simulate_walks(5, 10)

        # Set up the embedding model over the walks.
        self.model = ContinuousBagOfWordsWord2Vec(
            walks,
            worddictionary=worddictionary,
            reverse_worddictionary=self.reverse_worddictionary,
            num_epochs=2)

        # Scratch directory for files written by the tests. exist_ok=True keeps
        # a directory left behind by an interrupted run from failing every
        # subsequent setUp with FileExistsError.
        self.temp_dir_loc = os.path.abspath(
            os.path.join(self.data_dir, 'temp'))
        os.makedirs(self.temp_dir_loc, exist_ok=True)
Ejemplo n.º 2
0
    def setUp(self):
        """Train a CBOW model on random walks over the ppismall training graph.

        Stores the node count, the N2vGraph, the walk parameters, the walks and
        the trained model on ``self`` for use by the tests.
        """
        edges_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         'data/ppismall/pos_train_edges'))
        graph = CSFGraph(edges_path)

        # Lookup tables the model needs: node -> index and index -> node.
        node_to_index = graph.get_node_to_index_map()
        index_to_node = graph.get_index_to_node_map()

        # node2vec graph with p = q = 1.
        self.number_of_nodes_in_training = graph.node_count()
        self.n2v_graph = N2vGraph(csf_graph=graph, p=1, q=1)

        # Random walks (the original author notes these are lists of ints).
        self.walk_length = 10
        self.num_walks = 5
        self.walks = self.n2v_graph.simulate_walks(
            num_walks=self.num_walks,
            walk_length=self.walk_length)

        # Build the CBOW model over the walks and train it.
        self.cbow = ContinuousBagOfWordsWord2Vec(
            self.walks,
            worddictionary=node_to_index,
            reverse_worddictionary=index_to_node,
            num_epochs=2)
        self.cbow.train()
Ejemplo n.º 3
0
def karate_test(pos_train_file, pos_valid_file, pos_test_file, neg_train_file,
                neg_valid_file, neg_test_file, embed_graph, p, q, walk_length,
                num_walks, num_epochs, classifier, edge_embed_method,
                skipValidation, output):
    """Embed the positive training graph with SkipGram, then run link prediction.

    The six ``*_file`` arguments are each loaded into a CSFGraph. Random walks
    over the positive training graph (controlled by ``p``, ``q``,
    ``num_walks`` and ``walk_length``) train a SkipGramWord2Vec model for
    ``num_epochs`` epochs. The embeddings are written via ``write_embeddings``
    using ``embed_graph``, and the remaining arguments are forwarded to
    LinkPrediction.
    """
    # Load every edge file into its own graph.
    train_pos = CSFGraph(pos_train_file)
    valid_pos = CSFGraph(pos_valid_file)
    test_pos = CSFGraph(pos_test_file)
    train_neg = CSFGraph(neg_train_file)
    valid_neg = CSFGraph(neg_valid_file)
    test_neg = CSFGraph(neg_test_file)

    # Node embedding: random walks on the positive training graph feed a
    # SkipGram word2vec model.
    walker = embiggen.random_walk_generator.N2vGraph(train_pos, p, q)
    walks = walker.simulate_walks(num_walks, walk_length)
    node_to_index = train_pos.get_node_to_index_map()
    index_to_node = train_pos.get_index_to_node_map()
    skipgram = SkipGramWord2Vec(
        walks,
        worddictionary=node_to_index,
        reverse_worddictionary=index_to_node,
        num_epochs=num_epochs)
    skipgram.train()
    write_embeddings(embed_graph, skipgram.embedding, index_to_node)

    # Link prediction over the pos/neg train/valid/test graphs with the
    # requested classifier and edge-embedding method.
    predictor = LinkPrediction(
        train_pos, valid_pos, test_pos,
        train_neg, valid_neg, test_neg,
        embed_graph, edge_embed_method, classifier,
        skipValidation, output)
    predictor.prepare_edge_and_node_labels()
    predictor.predict_links()
    predictor.output_classifier_results()
Ejemplo n.º 4
0
    def setUp(self):
        """Load the small heterogeneous graph and look up the indices of 'g1' and 'd1'."""
        here = os.path.dirname(__file__)
        edges_path = os.path.join(here, 'data', 'small_het_graph_edges.tsv')
        nodes_path = os.path.join(here, 'data', 'small_het_graph_nodes.tsv')

        self.graph = CSFGraph(edge_file=edges_path, node_file=nodes_path)
        self.nodes = self.graph.nodes()

        # Indices of two specific nodes, resolved through the class's private helper.
        self.g1index = self.__get_index('g1')
        self.d1index = self.__get_index('d1')
Ejemplo n.º 5
0
    def setUp(self):
        """Load the unweighted small graph into ``self.g``.

        :return: the string form of the graph (building it also exercises the
            graph's string conversion)
        """
        graph_path = os.path.join(
            os.path.dirname(__file__), 'data', 'unweighted_small_graph.txt')
        self.g = CSFGraph(graph_path)

        return str(self.g)
Ejemplo n.º 6
0
    def setUp(self):
        """Load the small test graph from its edge and node TSV files into ``self.graph``."""
        fixtures = os.path.join(os.path.dirname(__file__), 'data')

        # Node/edge file pair known to pass the tests.
        nodes_path = os.path.join(fixtures, 'small_graph_nodes.tsv')
        edges_path = os.path.join(fixtures, 'small_graph_edges.tsv')

        self.graph = CSFGraph(edge_file=edges_path, node_file=nodes_path)
Ejemplo n.º 7
0
    def setUp(self):
        """Record the fixture file paths and load the canonical test graph into ``self.g``."""
        data_dir = os.path.join(os.path.dirname(__file__), 'data')

        def fixture(name):
            # Path of a fixture file inside the test data directory.
            return os.path.join(data_dir, name)

        # Canonical test graph.
        self.edge_file = fixture('small_graph_edges.tsv')
        self.node_file = fixture('small_graph_nodes.tsv')

        # Legacy and non-standard variants used by individual tests.
        self.legacy_edge_file = fixture('small_graph_LEGACY.txt')
        self.tsv_no_subject = fixture('small_graph_edges_NO_SUBJECT.tsv')
        self.tsv_no_object = fixture('small_graph_edges_NO_OBJECT.tsv')
        self.node_file_missing_nodes = fixture(
            'small_graph_nodes_MISSING_NODES.tsv')

        self.g = CSFGraph(edge_file=self.edge_file)
        # Exercise the graph's string conversion; the result is discarded.
        str(self.g)
Ejemplo n.º 8
0
 def setUp(self) -> None:
     """Load the six ppismall_with_validation graphs and record the embeddings path.

     The data directory is a relative path, so it is resolved against the
     current working directory.
     """
     self.file_dir = 'tests/data/ppismall_with_validation/'

     def load(name):
         # Build a CSFGraph from a file inside the data directory.
         return CSFGraph(os.path.join(self.file_dir, name))

     self.pos_train_graph = load('pos_train_edges_max_comp_graph')
     self.pos_valid_graph = load('pos_validation_edges_max_comp_graph')
     self.pos_test_graph = load('pos_test_edges_max_comp_graph')
     self.neg_train_graph = load('neg_train_edges_max_comp_graph')
     self.neg_valid_graph = load('neg_validation_edges_max_comp_graph')
     self.neg_test_graph = load('neg_test_edges_max_comp_graph')
     self.test_embeddings = os.path.join(self.file_dir, 'test.embeddings')
Ejemplo n.º 9
0
def read_graphs():
    """
    Reads the pos_train, pos_valid, pos_test, neg_train, neg_valid and neg_test edges with CSFGraph
    and logs how long the reading took. The file paths come from the module-level ``args``.
    :return: pos_train, pos_valid, pos_test, neg_train, neg_valid and neg_test graphs in CSFGraph format
    """
    started = time.time()

    # Built in the same order in which they are returned.
    graphs = (
        CSFGraph(args.pos_train),
        CSFGraph(args.pos_valid),
        CSFGraph(args.pos_test),
        CSFGraph(args.neg_train),
        CSFGraph(args.neg_valid),
        CSFGraph(args.neg_test),
    )

    elapsed = time.time() - started
    logging.info("reading input edge lists files: {} seconds".format(elapsed))

    return graphs
Ejemplo n.º 10
0
 def test_nodetype2count_dictionary(self):
     """nodetype2count_dictionary is a dict holding the expected per-type node counts."""
     typed_graph = CSFGraph(edge_file=self.edge_file,
                            node_file=self.node_file)

     # Graph from setUp (built without a node file).
     self.assertIsInstance(self.g.nodetype2count_dictionary, dict)
     self.assertEqual(
         self.g.nodetype2count_dictionary['biolink:NamedThing'],
         11)

     # Graph built with the node file.
     self.assertEqual(
         typed_graph.nodetype2count_dictionary['biolink:Disease'],
         3)
Ejemplo n.º 11
0
 def test_count_edges_legacy_edge_file(self):
     """A graph built from the legacy edge file reports six edges."""
     legacy_graph = CSFGraph(edge_file=self.legacy_edge_file)
     edge_total = legacy_graph.edge_count()
     self.assertEqual(6, edge_total)
Ejemplo n.º 12
0
 def test_csfgraph_constructor_accepts_node_file(self):
     """Constructing a CSFGraph with both an edge file and a node file must not raise."""
     CSFGraph(edge_file=self.edge_file, node_file=self.node_file)
Ejemplo n.º 13
0
 def test_csfgraph_accepts_edge_file(self):
     """Constructing a CSFGraph from an edge file alone must not raise."""
     CSFGraph(edge_file=self.edge_file)
Ejemplo n.º 14
0
 def test_csfgraph_checks_for_object_column(self):
     """An edge file without an object column raises CSFGraphNoObjectColumnError."""
     with self.assertRaises(CSFGraphNoObjectColumnError):
         # This fixture file doesn't have an object column.
         CSFGraph(edge_file=self.tsv_no_object)
Ejemplo n.º 15
0
 def test_csfgraph_populates_nodetype_to_index_map(self):
     """nodetype_to_index_map maps 'biolink:Disease' to indices 0, 1 and 2."""
     typed_graph = CSFGraph(edge_file=self.edge_file,
                            node_file=self.node_file)
     disease_indices = typed_graph.nodetype_to_index_map['biolink:Disease']
     self.assertEqual(disease_indices, [0, 1, 2])
Ejemplo n.º 16
0
 def test_csfgraph_populates_index_to_nodetype_map(self):
     """index_to_nodetype_map has 11 entries and index 0 is a 'biolink:Disease'."""
     typed_graph = CSFGraph(edge_file=self.edge_file,
                            node_file=self.node_file)
     entry_count = len(typed_graph.index_to_nodetype_map)
     self.assertEqual(11, entry_count)
     self.assertEqual(typed_graph.index_to_nodetype_map[0],
                      'biolink:Disease')
Ejemplo n.º 17
0
 def setUp(self):
     """Load data/small_graph.txt (next to this module) into ``self.g``."""
     graph_path = os.path.join(
         os.path.dirname(__file__), 'data', 'small_graph.txt')
     self.g = CSFGraph(graph_path)
     # Exercise the graph's string conversion; the result is discarded.
     str(self.g)
Ejemplo n.º 18
0
 def test_csfgraph_tolerates_missing_node_info(self):
     """With the missing-nodes node file, index 2 gets the graph's default node type."""
     sparse_graph = CSFGraph(
         edge_file=self.edge_file,
         node_file=self.node_file_missing_nodes)
     type_at_two = sparse_graph.index_to_nodetype_map[2]
     self.assertEqual(type_at_two, sparse_graph.default_node_type)
Ejemplo n.º 19
0
 def test_csfgraph_requires_arg(self):
     """CSFGraph() with no arguments raises, and the error message mentions 'missing'."""
     with self.assertRaises(Exception) as context:
         CSFGraph()  # missing edge arg
     # The message check must come after the `with` block: a statement placed
     # after the raising call inside the block is never executed.
     self.assertIn('missing', str(context.exception))