def test_df_to_tsv(self):
    """Round-trip the edges DataFrame through ``df_to_tsv`` / ``tsv_to_df``.

    Verifies that ``df_to_tsv`` creates the target file and that reading
    the file back yields a DataFrame with the same shape as the one written.
    """
    df = tsv_to_df(self.edges_file)
    # TemporaryDirectory deletes the scratch directory on exit, so the test
    # does not leave files behind (mkdtemp would require manual cleanup).
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'some.tsv')
        df_to_tsv(df, path)
        self.assertTrue(os.path.isfile(path))
        df_roundtrip = tsv_to_df(path)
    self.assertEqual(df.shape, df_roundtrip.shape)
def test_has_disconnected_nodes(self):
    """Check ``has_disconnected_nodes`` against three node tables.

    * the class-level node table: expected to report no disconnected nodes;
    * a table loaded from the ``MISSING_IDS`` resource: expected to emit a
      ``Warning`` and report no disconnected nodes;
    * a table loaded from the ``EXTRA_IDS`` resource: expected to report
      disconnected nodes.
    """
    extra_ids_df = tsv_to_df(
        'tests/resources/edges/bigger_graph_nodes_EXTRA_IDS.tsv')
    missing_ids_df = tsv_to_df(
        'tests/resources/edges/bigger_graph_nodes_MISSING_IDS.tsv')

    baseline = has_disconnected_nodes(edges_df=self.edges,
                                      nodes_df=self.nodes)
    self.assertFalse(baseline)

    with self.assertWarns(Warning):
        with_missing = has_disconnected_nodes(edges_df=self.edges,
                                              nodes_df=missing_ids_df)
        self.assertFalse(with_missing)

    with_extra = has_disconnected_nodes(edges_df=self.edges,
                                        nodes_df=extra_ids_df)
    self.assertTrue(with_extra)
@classmethod
def setUpClass(cls) -> None:
    """Load the shared graph fixtures once for the whole test class.

    Sets the following class attributes:

    * ``nodes_file`` / ``edges_file`` -- paths of the TSV resources;
    * ``nodes`` / ``edges`` -- the DataFrames read from those files;
    * ``ne`` -- result of ``make_negative_edges`` on the fixtures;
    * ``train_fraction`` -- fraction passed to ``make_positive_edges``;
    * ``train_edges`` / ``test_edges`` -- the pair returned by
      ``make_positive_edges`` with ``min_degree=0``.
    """
    # unittest invokes setUpClass on the class itself (no instance), so the
    # hook has to be a classmethod to receive ``cls``.
    cls.nodes_file = 'tests/resources/edges/bigger_graph_nodes.tsv'
    cls.edges_file = 'tests/resources/edges/bigger_graph_edges.tsv'
    cls.edges = tsv_to_df(cls.edges_file)
    cls.nodes = tsv_to_df(cls.nodes_file)

    # make negative edges for the fixture graph
    cls.ne = make_negative_edges(nodes_df=cls.nodes, edges_df=cls.edges)

    # make positive (train/test) edges for the fixture graph
    cls.train_fraction = 0.8
    cls.train_edges, cls.test_edges = make_positive_edges(
        nodes_df=cls.nodes,
        edges_df=cls.edges,
        train_fraction=cls.train_fraction,
        min_degree=0)
def test_make_edges_check_node_output_file(self):
    """Check the ``pos_train_nodes.tsv`` file written by ``make_edges``.

    Runs ``make_edges`` without validation output and verifies that the
    node file exists, has as many rows as the input node file, and carries
    the ``id`` and ``category`` columns.
    """
    input_nodes = tsv_to_df(self.nodes_file)
    # TemporaryDirectory removes the output directory once the files have
    # been read back, so repeated runs do not accumulate temp dirs.
    with tempfile.TemporaryDirectory() as output_dir:
        output_file_with_path = os.path.join(output_dir,
                                             'pos_train_nodes.tsv')
        make_edges(nodes=self.nodes_file, edges=self.edges_file,
                   output_dir=output_dir, train_fraction=0.8,
                   validation=False, min_degree=1)
        self.assertTrue(os.path.isfile(output_file_with_path))
        new_nodes_df = tsv_to_df(output_file_with_path)

    # row counts are integers, so compare them exactly
    self.assertEqual(new_nodes_df.shape[0], input_nodes.shape[0])
    # node output should have the id and category columns
    self.assertIn('id', new_nodes_df)
    self.assertIn('category', new_nodes_df)
def test_make_edges_check_edge_output_files(self, output_file: str, make_validation: bool, file_should_exist: bool, expected_fract: float):
    """Check one edge file produced (or not produced) by ``make_edges``.

    The extra arguments are presumably supplied by a parameterizing
    decorator that is not visible here -- confirm against the class.

    Args:
        output_file: name of the file to look for in the output directory.
        make_validation: value passed to ``make_edges`` as ``validation``.
        file_should_exist: whether ``output_file`` is expected to be written.
        expected_fract: expected row count of ``output_file`` as a fraction
            of the number of input edges.
    """
    num_input_edges = tsv_to_df(self.edges_file).shape[0]

    # TemporaryDirectory cleans up the output directory on exit, including
    # on the early return below.
    with tempfile.TemporaryDirectory() as me_output_dir:
        output_file_with_path = os.path.join(me_output_dir, output_file)
        make_edges(nodes=self.nodes_file, edges=self.edges_file,
                   output_dir=me_output_dir, train_fraction=0.8,
                   validation=make_validation, min_degree=1)

        if not file_should_exist:
            self.assertFalse(os.path.isfile(output_file_with_path))
            return

        self.assertTrue(os.path.isfile(output_file_with_path))
        new_edges_df = tsv_to_df(output_file_with_path)

    # row count should match the expected share of the input edges
    # (compared after rounding the difference to one decimal place)
    self.assertAlmostEqual(new_edges_df.shape[0],
                           num_input_edges * expected_fract,
                           places=1)
    # edge output should have the subject and object columns
    self.assertIn('subject', new_edges_df)
    self.assertIn('object', new_edges_df)
def test_make_edges_pos_train_test_valid_edges_distinct(self, train, test, valid):
    """Check that the train/test/validation edge files do not overlap.

    Runs ``make_edges`` with validation enabled, then verifies that the
    three output files share no ``(subject, object)`` pair with each other
    and that each of them only contains pairs present in the input edges.

    Args:
        train: file name of the training edges in the output directory.
        test: file name of the test edges in the output directory.
        valid: file name of the validation edges in the output directory.
    """
    def edge_pairs(tsv_path):
        # Iterating a DataFrame yields its column labels, not its rows, so
        # the edges are compared as explicit (subject, object) tuples.
        frame = tsv_to_df(tsv_path)
        return set(zip(frame['subject'], frame['object']))

    # TemporaryDirectory removes the output files once they are loaded.
    with tempfile.TemporaryDirectory() as output_dir:
        make_edges(nodes=self.nodes_file, edges=self.edges_file,
                   output_dir=output_dir, train_fraction=0.8,
                   validation=True, min_degree=1)
        train_edges = edge_pairs(os.path.join(output_dir, train))
        test_edges = edge_pairs(os.path.join(output_dir, test))
        valid_edges = edge_pairs(os.path.join(output_dir, valid))

    input_edges = edge_pairs(self.edges_file)

    # train should not share any members with test
    self.assertTrue(train_edges.isdisjoint(test_edges))
    # train should not share any members with valid
    self.assertTrue(train_edges.isdisjoint(valid_edges))
    # test should not share any members with valid
    self.assertTrue(test_edges.isdisjoint(valid_edges))

    # train should be a subset of input_edges
    self.assertTrue(train_edges.issubset(input_edges))
    # test should be a subset of input_edges
    self.assertTrue(test_edges.issubset(input_edges))
    # valid should be a subset of input_edges
    self.assertTrue(valid_edges.issubset(input_edges))
def test_make_positive_edges_test_min_degree_gt_zero(self):
    """Check that ``min_degree`` restricts which nodes reach the test edges.

    Calls ``make_positive_edges`` ten times on the ``HIGHER_DEGREE_NODES``
    edge resource with ``min_degree=2`` and asserts each time that the
    nodes appearing in the returned test edges all come from the listed
    ``hd_nodes``.
    """
    train_fraction = 0.90
    degree = 2
    hd_edges_file = \
        'tests/resources/edges/bigger_graph_edges_HIGHER_DEGREE_NODES.tsv'
    hd_edges = tsv_to_df(hd_edges_file)
    # built once: the allowed node set does not change between iterations
    hd_nodes = {'p1', 'd1',
                'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'g9', 'g10',
                'g11', 'g12', 'g13', 'g14', 'g15', 'g16', 'g17', 'g18',
                'g19', 'g20', 'g21', 'g22', 'g23', 'g24', 'g25'}

    for _ in range(10):
        (train_edges, test_edges) = make_positive_edges(
            nodes_df=self.nodes, edges_df=hd_edges,
            train_fraction=train_fraction, min_degree=degree)
        these_nodes = set(test_edges.subject) | set(test_edges.object)
        # A plain set difference gives the offending nodes and is safe to
        # evaluate even when the difference is empty (the passing case).
        unexpected = sorted(str(node) for node in these_nodes - hd_nodes)
        # NOTE(review): the strict-subset comparison is kept as-is; `<=`
        # may be what is intended -- confirm.
        self.assertTrue(these_nodes < hd_nodes,
                        "Got some nodes with degree < 2: %s" %
                        " ".join(unexpected))
def test_tsv_to_df(self):
    """Load the edges TSV and check the type, shape and first subject."""
    loaded = tsv_to_df(self.edges_file)
    self.assertIsInstance(loaded, pd.DataFrame)

    expected_shape = (150, 5)
    self.assertEqual(expected_shape, loaded.shape)

    first_subject = loaded['subject'][0]
    self.assertEqual(first_subject, 'g1')