def test_kgtk_cat_two_files(self):
    """Concatenate two sample edge files with ``kgtk cat`` and expect 6 data rows."""
    input_paths = (
        'data/sample_kgtk_edge_Q47158.tsv',
        'data/sample_kgtk_edge_file_with_id.tsv',
    )
    merged_path = f'{self.temp_dir}/cat.tsv'

    cli_entry("kgtk", "cat", "-i", *input_paths, "-o", merged_path)

    merged = pd.read_csv(merged_path, sep='\t')
    self.assertEqual(len(merged), 6)
def test_kgtk_add_id_overwrite_style_n1_l_num(self):
    """Overwrite ids using the ``node1-label-num`` style.

    Every generated id is expected to be ``<node1>-<label>-0000``.
    """
    out_path = f'{self.temp_dir}/id.tsv'
    cli_entry(
        "kgtk", "add-id",
        "-i", self.file_path2,
        "-o", out_path,
        "--overwrite-id",
        "--id-style", "node1-label-num",
    )

    result = pd.read_csv(out_path, sep='\t')
    for _, edge in result.iterrows():
        self.assertEqual(edge['id'], f'{edge["node1"]}-{edge["label"]}-0000')
def test_kgtk_normalize_nodes_alias_only(self):
    """Normalize only the ``alias`` column.

    Expects 24 output rows, with ``alias`` as the first distinct label.
    """
    out_path = f'{self.temp_dir}/normalize.tsv'
    cli_entry("kgtk", "normalize-nodes", "-i", self.file_path, "-o", out_path, "-c", "alias")

    normalized = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(normalized), 24)

    distinct_labels = list(normalized['label'].unique())
    self.assertEqual(distinct_labels[0], 'alias')
def test_kgtk_add_id_overwrite_style_prefix(self):
    """Overwrite ids using the ``prefix###`` style with prefix ``THIS``.

    Row *n* (1-based) is expected to carry the id ``THIS<n>``.
    """
    out_path = f'{self.temp_dir}/id.tsv'
    cli_entry(
        "kgtk", "add-id",
        "-i", self.file_path2,
        "-o", out_path,
        "--overwrite-id",
        "--id-style", "prefix###",
        "--id-prefix", "THIS",
    )

    result = pd.read_csv(out_path, sep='\t').fillna("")
    # read_csv yields a 0-based default index, so 1-based ordinals match `i + 1`.
    for ordinal, edge_id in enumerate(result['id'], start=1):
        self.assertEqual(edge_id, f'THIS{ordinal}')
def test_kgtk_add_id_new_id_column_specify_old_id_column(self):
    """Write new ids into ``id_new`` while naming ``id`` as the old id column.

    Row *n* (1-based) of ``id_new`` is expected to be ``E<n>``.
    """
    source_path = 'data/sample_kgtk_edge_file_with_id.tsv'
    out_path = f'{self.temp_dir}/id.tsv'
    cli_entry(
        "kgtk", "add-id",
        "-i", source_path,
        "-o", out_path,
        "--new-id-column-name", "id_new",
        "--old-id-column-name", "id",
    )

    result = pd.read_csv(out_path, sep='\t')
    # read_csv yields a 0-based default index, so 1-based ordinals match `i + 1`.
    for ordinal, new_id in enumerate(result['id_new'], start=1):
        self.assertEqual(new_id, f'E{ordinal}')
def test_kgtk_split_by_qnode(self):
    """Split the input by qnode.

    Expects one ``<qnode>.tsv`` file per distinct ``node1`` value, and
    8 rows in ``Q1000133.tsv``.
    """
    source = pd.read_csv(self.file_path, sep='\t')
    expected_qnodes = list(source['node1'].unique())

    cli_entry("kgtk", "split", "-i", self.file_path, "--output-path", self.temp_dir, "--split-by-qnode")

    for qnode in expected_qnodes:
        qnode_file = f'{self.temp_dir}/{qnode}.tsv'
        self.assertTrue(os.path.exists(qnode_file))

    q1000133 = pd.read_csv(f'{self.temp_dir}/Q1000133.tsv', sep='\t')
    self.assertEqual(len(q1000133), 8)
def test_kgtk_filter_one_row(self):
    """Filter on a fully specified node1;label;node2 pattern and expect one row."""
    kept_path = f'{self.temp_dir}/one_row.tsv'
    rejected_path = f'{self.temp_dir}/reject.tsv'
    pattern = "Q65695069;P577;^2019-07-19T00:00:00Z/11"

    cli_entry(
        "kgtk", "filter",
        "-i", self.file_path,
        "-o", kept_path,
        "-p", pattern,
        "-v",
        "--reject-file", rejected_path,
    )

    kept = pd.read_csv(kept_path, sep='\t')
    self.assertEqual(len(kept), 1)
def test_graph_embeddings_format_glove(self):
    """Generate graph embeddings in ``glove`` format.

    Every output line is expected to split into 101 tab-separated fields.
    """
    out_path = f'{self.temp_dir}/out.tsv'
    cli_entry("kgtk", "graph-embeddings", "-i", self.file_path, "-o", out_path,
              '-e', '1', '-ot', 'glove')

    with open(out_path) as f:
        data = f.readlines()

    for entity_emb in data:
        value = entity_emb.split('\t')
        # assertEqual reports the actual field count when the check fails.
        self.assertEqual(len(value), 101)
def test_kgtk_split_by_lines_gzipped(self):
    """Split into gzipped chunks of 10000 lines.

    Expects ``split_0.tsv.gz`` to exist with 9993 rows and 1718 distinct
    ``node1`` values.
    """
    cli_entry(
        "kgtk", "split",
        "-i", self.file_path,
        "--output-path", self.temp_dir,
        "--lines", '10000',
        "--gzipped-output",
    )

    first_chunk_path = f'{self.temp_dir}/split_0.tsv.gz'
    self.assertTrue(os.path.exists(first_chunk_path))

    first_chunk = pd.read_csv(first_chunk_path, sep='\t')
    self.assertEqual(len(first_chunk), 9993)

    distinct_node1 = list(first_chunk['node1'].unique())
    self.assertEqual(len(distinct_node1), 1718)
def test_export_gt(self):
    """Export the sample edge file as a directed graph file and reload it.

    The reloaded graph is expected to report 287 edges and 287 vertices.
    """
    graph_path = f'{self.temp_dir}/graph.gt'
    log_path = f'{self.temp_dir}/log.txt'

    cli_entry(
        "kgtk", "export-gt",
        "-i", "data/sample_kgtk_edge_file.tsv",
        "--directed",
        "--log", log_path,
        "-o", graph_path,
    )

    graph = load_graph(graph_path)
    self.assertEqual(graph.num_edges(), 287)
    self.assertEqual(graph.num_vertices(), 287)
def test_kgtk_filter_reject_file(self):
    """Run an inverted filter on ``obj == Q11365`` and check the reject file.

    The reject file is expected to hold as many rows as ``self.df2`` has
    with ``obj`` equal to ``Q11365``.
    """
    expected = self.df2.loc[self.df2['obj'] == 'Q11365']
    rejected_path = f'{self.temp_dir}/reject.tsv'

    cli_entry(
        "kgtk", "filter",
        "-i", self.file_path2,
        "-o", f'{self.temp_dir}/Q11365.tsv',
        "-p", ";;Q11365",
        "--subj", "sub",
        "--pred", "pred",
        "--obj", "obj",
        "-v",
        "--invert",
        "--reject-file", rejected_path,
    )

    rejected = pd.read_csv(rejected_path, sep='\t')
    self.assertEqual(len(rejected), len(expected))
def test_kgtk_filter_single_pred_inverted(self):
    """Run an inverted filter on predicate ``P577`` and check the main output.

    The output is expected to hold as many rows as ``self.df2`` has with
    ``pred`` different from ``P577``.
    """
    expected = self.df2.loc[self.df2['pred'] != 'P577']
    kept_path = f'{self.temp_dir}/P577.tsv'

    cli_entry(
        "kgtk", "filter",
        "-i", self.file_path2,
        "-o", kept_path,
        "-p", ";P577;",
        "--subj", "sub",
        "--pred", "pred",
        "--obj", "obj",
        "-v",
        "--invert",
        "--reject-file", f'{self.temp_dir}/reject.tsv',
    )

    kept = pd.read_csv(kept_path, sep='\t')
    self.assertEqual(len(kept), len(expected))
def test_kgtk_cat_output_json_line(self):
    """Write ``kgtk cat`` output in ``jsonl`` format.

    Expects 288 lines, each of which decodes to a JSON value of length 5.
    """
    out_path = f'{self.temp_dir}/cat.jl'
    cli_entry("kgtk", "cat", "-i", self.file_path, "-o", out_path, "--output-format", "jsonl")

    # Context manager closes the handle even if reading raises.
    with open(out_path) as f:
        lines = f.readlines()

    self.assertEqual(len(lines), 288)
    for line in lines:
        self.assertEqual(len(json.loads(line)), 5)
def test_kgtk_ifexists(self):
    """Keep input rows whose ``node1`` appears in the Q47158 filter file.

    Expects 118 rows in the output.
    """
    filter_path = 'data/sample_kgtk_edge_Q47158.tsv'
    out_path = f'{self.temp_dir}/Q47158.tsv'

    cli_entry(
        "kgtk", "ifexists",
        "-i", self.file_path,
        "--filter-on", filter_path,
        "-o", out_path,
        "--input-keys", "node1",
        "--filter-keys", "node1",
        "--show-option",
        "--verbose",
    )

    matched = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(matched), 118)
def test_kgtk_split_by_lines(self):
    """Split into chunks of 5000 lines and check the chunks share no ``node1``.

    Expects ``split_0.tsv`` and ``split_1.tsv`` to exist, with no ``node1``
    value appearing in both.
    """
    cli_entry("kgtk", "split", "-i", self.file_path, "--output-path", self.temp_dir,
              "--lines", '5000')

    chunk_0_path = f'{self.temp_dir}/split_0.tsv'
    chunk_1_path = f'{self.temp_dir}/split_1.tsv'
    self.assertTrue(os.path.exists(chunk_0_path))
    self.assertTrue(os.path.exists(chunk_1_path))

    qnodes_0 = set(pd.read_csv(chunk_0_path, sep='\t')['node1'].unique())
    qnodes_1 = set(pd.read_csv(chunk_1_path, sep='\t')['node1'].unique())

    # A set intersection replaces the quadratic list scan, and on failure the
    # assertion message lists exactly which nodes leaked across chunks.
    self.assertEqual(qnodes_0 & qnodes_1, set())
def test_kgtk_ifnotexists(self):
    """Keep input rows whose ``node1`` is absent from the filter file's ``heading``.

    The filter file is read with ``--mode NONE``; 169 rows are expected.
    """
    filter_path = 'data/Q47158_non_edge.tsv'
    out_path = f'{self.temp_dir}/Q47158.tsv'

    cli_entry(
        "kgtk", "ifnotexists",
        "-i", self.file_path,
        "--filter-on", filter_path,
        "-o", out_path,
        "--input-keys", "node1",
        "--filter-keys", "heading",
        "--mode", "NONE",
        "--verbose",
    )

    unmatched = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(unmatched), 169)
def test_import(self):
    """Import the Visual Genome sample.

    Expects 9 columns (including ``relation``) and 580 rows.
    """
    # NOTE(review): another `test_import` (wordnet) is visible nearby; if both
    # live in the same class the later definition shadows this one - confirm.
    out_path = f'{self.temp_dir}/vg.tsv'
    cli_entry("kgtk", "import-visualgenome", "-i", "data/vg10.tsv",
              "--attr-synsets", "data/attribute_synsets.json", "-o", out_path)

    df = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(df.columns), 9)
    # The original computed an unused `relations` list, which only implicitly
    # required the column to exist; assert that requirement explicitly.
    self.assertIn('relation', df.columns)
    self.assertEqual(len(df), 580)
def test_kgtk_import_atomic(self):
    """Import the ATOMIC sample.

    Expects 9 columns and ``at:xAttr`` among the distinct relations.
    """
    out_path = f'{self.temp_dir}/atomic.tsv'
    cli_entry("kgtk", "import-atomic", "-i", "data/atomic.csv", "-o", out_path)

    df = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(df.columns), 9)

    relations = df['relation'].unique()
    # assertIn shows the searched values on failure, unlike assertTrue(x in y).
    self.assertIn('at:xAttr', relations)
def test_graph_embeddings_format_w2v(self):
    """Generate graph embeddings in ``w2v`` format.

    The first line is expected to hold 2 space-separated fields and every
    following line 101.
    """
    out_path = f'{self.temp_dir}/out.tsv'
    cli_entry("kgtk", "graph-embeddings", "-i", self.file_path, "-o", out_path,
              '-e', '1', '-ot', 'w2v')

    with open(out_path) as f:
        data = f.readlines()

    for index, entity_emb in enumerate(data):
        expected_fields = 2 if index == 0 else 101
        # assertEqual reports the actual field count when the check fails.
        self.assertEqual(len(entity_emb.split(' ')), expected_fields)
def test_import(self):
    """Import WordNet.

    Expects 9 columns and the relations ``/r/IsA``, ``/r/PartOf`` and
    ``/r/MadeOf`` to be present.
    """
    # NOTE(review): another `test_import` (visualgenome) is visible nearby; if
    # both live in the same class only the later definition runs - confirm.
    out_path = f'{self.temp_dir}/wordnet.tsv'
    cli_entry("kgtk", "import_wordnet", "-o", out_path)

    df = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(df.columns), 9)

    relations = list(df['relation'].unique())
    for r in ['/r/IsA', '/r/PartOf', '/r/MadeOf']:
        # assertIn names the missing relation on failure.
        self.assertIn(r, relations)
def test_kgtk_import_concept_pairs(self):
    """Import concept pairs from ``data/synonyms.txt`` as ``/r/Synonym`` edges.

    Expects 9 columns and every row's ``relation`` to be ``/r/Synonym``.
    """
    out_path = f'{self.temp_dir}/roget_syn.tsv'
    cli_entry("kgtk", "import-concept-pairs", "-i", "data/synonyms.txt",
              "--source", "RG", "--relation", "/r/Synonym", "-o", out_path)

    df = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(df.columns), 9)

    for _, row in df.iterrows():
        # assertEqual shows the unexpected relation value on failure.
        self.assertEqual(row['relation'], '/r/Synonym')
    # The leftover debug print of the whole dataframe was dropped; it only
    # added noise to the test output.
def test_kgtk_filter_p31(self):
    """Filter on label ``P31``.

    Every ``node1`` in the output must be a ``node1`` of a ``P31`` row in
    ``self.df``, and the output is expected to hold 10 rows.
    """
    # Build the expected node1 values from the fixture dataframe using pandas.
    p31_qnodes = list(self.df.loc[self.df['label'] == 'P31']['node1'].unique())

    out_path = f'{self.temp_dir}/p31.tsv'
    cli_entry("kgtk", "filter", "-i", self.file_path, "-o", out_path,
              "-p", ";P31;", "-v", "--reject-file", f'{self.temp_dir}/reject.tsv')

    df = pd.read_csv(out_path, sep='\t')
    r_qnodes = list(df['node1'].unique())

    for q in r_qnodes:
        # assertIn names the unexpected node on failure.
        self.assertIn(q, p31_qnodes)
    self.assertEqual(len(df), 10)
def test_kgtk_add_id_default(self):
    """Add ids with default settings plus ``--verify-id-unique``.

    Row *n* (1-based) is expected to carry the id ``E<n>``.
    """
    out_path = f'{self.temp_dir}/id.tsv'
    cli_entry("kgtk", "add-id", "-i", self.file_path, "-o", out_path, "--verify-id-unique")

    result = pd.read_csv(out_path, sep='\t')
    # read_csv yields a 0-based default index, so 1-based ordinals match `i + 1`.
    for ordinal, edge_id in enumerate(result['id'], start=1):
        self.assertEqual(edge_id, f'E{ordinal}')
def test_kgtk_filter_Q2447774(self):
    """Filter on ``node1 == Q2447774``.

    Every ``node2`` in the output must be a ``node2`` of a ``Q2447774`` row
    in ``self.df``, and the output is expected to hold 27 rows.
    """
    # Build the expected node2 values from the fixture dataframe using pandas.
    node2s = list(self.df.loc[self.df['node1'] == 'Q2447774']['node2'])

    out_path = f'{self.temp_dir}/Q2447774.tsv'
    cli_entry("kgtk", "filter", "-i", self.file_path, "-o", out_path,
              "-p", "Q2447774;;", "--reject-file", f'{self.temp_dir}/reject.tsv')

    df = pd.read_csv(out_path, sep='\t')
    r_node2s = list(df['node2'])

    for q in r_node2s:
        # assertIn names the unexpected value on failure.
        self.assertIn(q, node2s)
    self.assertEqual(len(df), 27)
def test_kgtk_cat_output_json_line_map(self):
    """Write ``kgtk cat`` output in ``jsonl-map`` format.

    Expects 287 lines, each decoding to an object that contains the keys
    ``id``, ``node1``, ``label``, ``node2`` and ``rank``.
    """
    out_path = f'{self.temp_dir}/cat.jl'
    cli_entry("kgtk", "cat", "-i", self.file_path, "-o", out_path,
              "--output-format", "jsonl-map")

    # Context manager closes the handle even if reading raises.
    with open(out_path) as f:
        lines = f.readlines()

    self.assertEqual(len(lines), 287)
    for line in lines:
        x = json.loads(line)
        for key in ('id', 'node1', 'label', 'node2', 'rank'):
            # assertIn names the missing key on failure.
            self.assertIn(key, x)
def test_kgtk_cat_output_csv(self):
    """Write ``kgtk cat`` output in ``csv`` format and compare it with the input.

    The CSV must have the same row count and column names as the input TSV,
    and each row's ``id`` must equal the first tab-separated field of the
    corresponding input line.
    """
    out_path = f'{self.temp_dir}/cat.csv'
    cli_entry("kgtk", "cat", "-i", self.file_path, "-o", out_path, "--output-format", "csv")

    df_r = pd.read_csv(out_path)
    df = pd.read_csv(self.file_path, sep='\t')

    self.assertEqual(len(df), len(df_r))
    self.assertEqual(list(df.columns), list(df_r.columns))

    # Context manager closes the handle even if reading raises.
    with open(self.file_path) as f:
        lines = f.readlines()

    for i, row in df_r.iterrows():
        # lines[0] is the header, so data row i lives at lines[i + 1].
        self.assertEqual(row["id"], lines[i + 1].split('\t')[0])
def test_kgtk_cat_output_json_map(self):
    """Write ``kgtk cat`` output in ``json-map`` format.

    Expects a JSON document of 287 entries, each containing the keys
    ``id``, ``node1``, ``label``, ``node2`` and ``rank``.
    """
    out_path = f'{self.temp_dir}/cat.json'
    cli_entry("kgtk", "cat", "-i", self.file_path, "-o", out_path,
              "--output-format", "json-map")

    # Context manager closes the handle even if parsing raises.
    with open(out_path) as f:
        obj = json.load(f)

    self.assertEqual(len(obj), 287)
    for x in obj:
        for key in ('id', 'node1', 'label', 'node2', 'rank'):
            # assertIn names the missing key on failure.
            self.assertIn(key, x)
def test_kgtk_import_framenet(self):
    """Import FrameNet and check the shape of the result: 9 columns, 29873 rows."""
    out_path = f'{self.temp_dir}/framenet.tsv'
    cli_entry("kgtk", "import_framenet", "-o", out_path)

    imported = pd.read_csv(out_path, sep='\t', na_filter=False)

    # Column count must match the expected layout.
    self.assertEqual(len(imported.columns), 9)
    # Row count must match the expected number of imported edges.
    self.assertEqual(len(imported), 29873)
def test_graph_embeddings_format_kgtk(self):
    """Generate graph embeddings in ``kgtk`` format.

    Expects a non-empty file whose header is ``node1``, ``label``, ``node2``
    and whose data rows have 3 tab-separated fields with ``graph_embeddings``
    as the label.
    """
    out_path = f'{self.temp_dir}/out.tsv'
    cli_entry("kgtk", "graph-embeddings", "-i", self.file_path, "-o", out_path,
              '-e', '1', '-ot', 'kgtk')

    with open(out_path) as f:
        data = f.readlines()

    self.assertGreater(len(data), 0)

    header = data.pop(0).rstrip('\r\n').split('\t')
    # One list comparison covers both the length and each column name, and
    # shows the actual header on failure.
    self.assertEqual(header, ['node1', 'label', 'node2'])

    for entity_emb in data:
        value = entity_emb.rstrip('\r\n').split('\t')
        self.assertEqual(len(value), 3)
        self.assertEqual(value[1], 'graph_embeddings')
def test_kgtk_normalize_nodes_default(self):
    """Normalize nodes with default settings.

    Expects 52 output rows, of which exactly 3 are ``label`` rows for
    ``Q183``, carrying the English, Russian and Ukrainian labels.
    """
    out_path = f'{self.temp_dir}/normalize.tsv'
    cli_entry("kgtk", "normalize-nodes", "-i", self.file_path, "-o", out_path,
              "--verbose", "--show-option")

    df = pd.read_csv(out_path, sep='\t')
    self.assertEqual(len(df), 52)

    df = df.loc[(df['node1'] == 'Q183') & (df['label'] == 'label')]
    # The original `assertTrue(len(df), 3)` passed `3` as the failure message
    # and so accepted any non-zero count; compare against 3 for real.
    self.assertEqual(len(df), 3)

    labels = list(df['node2'].unique())
    for expected_label in ("'Germany'@en", "'Германия'@ru", "'Німеччина'@uk"):
        # assertIn names the missing label on failure.
        self.assertIn(expected_label, labels)