Exemple #1
0
 def test_df_to_tsv(self):
     """Round-trip the edges fixture through ``df_to_tsv`` and ``tsv_to_df``.

     The edges file is loaded, written to a scratch TSV, and loaded again;
     the file must exist and the reloaded frame must keep the same shape.
     """
     df = tsv_to_df(self.edges_file)
     # TemporaryDirectory deletes the scratch directory on exit, whereas
     # tempfile.mkdtemp() leaves one behind after every test run.
     with tempfile.TemporaryDirectory() as tmp_dir:
         path = os.path.join(tmp_dir, 'some.tsv')
         df_to_tsv(df, path)
         self.assertTrue(os.path.isfile(path))
         df_roundtrip = tsv_to_df(path)
         self.assertEqual(df.shape, df_roundtrip.shape)
Exemple #2
0
 def test_has_disconnected_nodes(self):
     """Exercise ``has_disconnected_nodes`` with three node tables.

     Expectations encoded below:
       * the class-level node table -> falsy result;
       * the MISSING_IDS table      -> a ``Warning`` is emitted and the
         result is still falsy;
       * the EXTRA_IDS table        -> truthy result.
     """
     extra_ids_df = tsv_to_df(
         'tests/resources/edges/bigger_graph_nodes_EXTRA_IDS.tsv')
     missing_ids_df = tsv_to_df(
         'tests/resources/edges/bigger_graph_nodes_MISSING_IDS.tsv')

     # Baseline: the fixture nodes paired with the fixture edges.
     baseline = has_disconnected_nodes(edges_df=self.edges,
                                       nodes_df=self.nodes)
     self.assertFalse(baseline)

     # Node table lacking some ids: must warn, but not report disconnection.
     with self.assertWarns(Warning):
         with_missing = has_disconnected_nodes(edges_df=self.edges,
                                               nodes_df=missing_ids_df)
         self.assertFalse(with_missing)

     # Node table with additional ids: must report disconnection.
     with_extra = has_disconnected_nodes(edges_df=self.edges,
                                         nodes_df=extra_ids_df)
     self.assertTrue(with_extra)
Exemple #3
0
    def setUpClass(cls) -> None:
        """Load the shared graph fixtures and derive edge sets for the class.

        Sets ``nodes_file``/``edges_file`` (fixture paths), ``nodes``/``edges``
        (the loaded frames), ``ne`` (output of ``make_negative_edges``),
        ``train_fraction`` and the ``train_edges``/``test_edges`` pair
        returned by ``make_positive_edges``.
        """
        nodes_path = 'tests/resources/edges/bigger_graph_nodes.tsv'
        edges_path = 'tests/resources/edges/bigger_graph_edges.tsv'
        cls.nodes_file, cls.edges_file = nodes_path, edges_path

        # Edges are loaded before nodes, as in the fixture's usual order.
        cls.edges = tsv_to_df(edges_path)
        cls.nodes = tsv_to_df(nodes_path)

        # Negative edges built from the fixture graph.
        cls.ne = make_negative_edges(nodes_df=cls.nodes, edges_df=cls.edges)

        # Positive edges, split into train and test portions.
        cls.train_fraction = 0.8
        positive_split = make_positive_edges(
            nodes_df=cls.nodes,
            edges_df=cls.edges,
            train_fraction=cls.train_fraction,
            min_degree=0,
        )
        cls.train_edges, cls.test_edges = positive_split
Exemple #4
0
 def test_make_edges_check_node_output_file(self):
     """Check that ``make_edges`` writes a usable ``pos_train_nodes.tsv``.

     The output file must exist, contain as many rows as the input nodes
     file, and carry the ``id`` and ``category`` columns.
     """
     input_nodes = tsv_to_df(self.nodes_file)
     # TemporaryDirectory cleans up after itself; mkdtemp() would leave the
     # generated files on disk after every run.
     with tempfile.TemporaryDirectory() as output_dir:
         output_file_with_path = os.path.join(output_dir,
                                              'pos_train_nodes.tsv')
         make_edges(nodes=self.nodes_file, edges=self.edges_file,
                    output_dir=output_dir, train_fraction=0.8,
                    validation=False, min_degree=1)
         self.assertTrue(os.path.isfile(output_file_with_path))
         new_nodes_df = tsv_to_df(output_file_with_path)

     # Row counts are integers, so compare them exactly.
     self.assertEqual(new_nodes_df.shape[0], input_nodes.shape[0])
     # The node table should keep its id and category columns.
     self.assertIn('id', new_nodes_df.columns)
     self.assertIn('category', new_nodes_df.columns)
Exemple #5
0
 def test_make_edges_check_edge_output_files(self, output_file: str,
                                             make_validation: bool,
                                             file_should_exist: bool,
                                             expected_fract: float):
     """Check one edge file produced (or not produced) by ``make_edges``.

     Args:
         output_file: File name expected inside the output directory.
         make_validation: Passed to ``make_edges`` as ``validation``.
         file_should_exist: Whether ``output_file`` must be present after
             ``make_edges`` runs.
         expected_fract: Expected row count of ``output_file`` as a fraction
             of the number of input edges (only used when the file should
             exist).
     """
     num_input_edges = tsv_to_df(self.edges_file).shape[0]
     # TemporaryDirectory removes the outputs on exit; mkdtemp() would leak
     # one directory per parameter set.
     with tempfile.TemporaryDirectory() as me_output_dir:
         output_file_with_path = os.path.join(me_output_dir, output_file)
         make_edges(nodes=self.nodes_file, edges=self.edges_file,
                    output_dir=me_output_dir, train_fraction=0.8,
                    validation=make_validation, min_degree=1)
         if not file_should_exist:
             self.assertFalse(os.path.isfile(output_file_with_path))
             return
         self.assertTrue(os.path.isfile(output_file_with_path))
         new_edges_df = tsv_to_df(output_file_with_path)

     # The third positional argument of assertAlmostEqual is ``places``:
     # the two values must agree once their difference is rounded to one
     # decimal place.
     self.assertAlmostEqual(new_edges_df.shape[0],
                            num_input_edges * expected_fract, places=1)
     # The edge table should keep its subject and object columns.
     self.assertIn('subject', new_edges_df.columns)
     self.assertIn('object', new_edges_df.columns)
Exemple #6
0
    def test_make_edges_pos_train_test_valid_edges_distinct(self, train, test, valid):
        """Check the train/test/valid edge files are disjoint input subsets.

        Args:
            train: File name of the training edges inside the output dir.
            test: File name of the test edges inside the output dir.
            valid: File name of the validation edges inside the output dir.

        Edges are compared as ``(subject, object)`` tuples.
        NOTE(review): the subset checks assume the three files hold positive
        edges drawn from the input edge file -- confirm against the
        parameter list feeding this test.
        """
        def edge_pairs(path):
            # Iterating a DataFrame yields its column labels, so set(df)
            # would only ever be {'subject', 'object'}; build the set from
            # the rows instead.
            pairs = tsv_to_df(path)[['subject', 'object']]
            return set(pairs.itertuples(index=False, name=None))

        # TemporaryDirectory removes the generated files once they are read.
        with tempfile.TemporaryDirectory() as output_dir:
            make_edges(nodes=self.nodes_file, edges=self.edges_file,
                       output_dir=output_dir, train_fraction=0.8,
                       validation=True, min_degree=1)
            train_edges = edge_pairs(os.path.join(output_dir, train))
            test_edges = edge_pairs(os.path.join(output_dir, test))
            valid_edges = edge_pairs(os.path.join(output_dir, valid))
        input_edges = edge_pairs(self.edges_file)

        # train should not share any members with test
        self.assertTrue(train_edges.isdisjoint(test_edges),
                        "train and test edges overlap")
        # train should not share any members with valid
        self.assertTrue(train_edges.isdisjoint(valid_edges),
                        "train and valid edges overlap")
        # test should not share any members with valid
        self.assertTrue(test_edges.isdisjoint(valid_edges),
                        "test and valid edges overlap")

        # train should be a subset of input_edges
        self.assertTrue(train_edges <= input_edges)
        # test should be a subset of input_edges
        self.assertTrue(test_edges <= input_edges)
        # valid should be a subset of input_edges
        self.assertTrue(valid_edges <= input_edges)
Exemple #7
0
 def test_make_positive_edges_test_min_degree_gt_zero(self):
     """Check ``min_degree`` restricts which nodes appear in test edges.

     With ``min_degree=2`` on the HIGHER_DEGREE_NODES fixture, every node
     touched by a test edge must be one of the ids listed in ``hd_nodes``.
     The split is repeated ten times (presumably because it is randomised
     -- confirm against ``make_positive_edges``).
     """
     train_fraction = 0.90
     degree = 2
     hd_edges_file = \
         'tests/resources/edges/bigger_graph_edges_HIGHER_DEGREE_NODES.tsv'
     hd_edges = tsv_to_df(hd_edges_file)
     hd_nodes = ['p1', 'd1',
                 'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'g9', 'g10',
                 'g11', 'g12', 'g13', 'g14', 'g15', 'g16', 'g17', 'g18', 'g19',
                 'g20', 'g21', 'g22', 'g23', 'g24', 'g25']
     allowed_nodes = set(hd_nodes)
     for _ in range(10):
         (_train_edges, test_edges) = make_positive_edges(
             nodes_df=self.nodes, edges_df=hd_edges,
             train_fraction=train_fraction, min_degree=degree)
         these_nodes = set(test_edges.subject) | set(test_edges.object)
         # Plain set difference gives exactly the offending nodes for the
         # failure message.
         unexpected = these_nodes - allowed_nodes
         # Non-strict subset: touching every allowed node is still valid.
         self.assertTrue(these_nodes <= allowed_nodes,
                         "Got some nodes with degree < 2: %s" %
                         " ".join(sorted(map(str, unexpected))))
Exemple #8
0
 def test_tsv_to_df(self):
     """Load the edges fixture and check its type, shape and first subject."""
     edges_df = tsv_to_df(self.edges_file)
     self.assertIsInstance(edges_df, pd.DataFrame)

     expected_shape = (150, 5)
     self.assertEqual(expected_shape, edges_df.shape)

     first_subject = edges_df['subject'][0]
     self.assertEqual(first_subject, 'g1')