Exemple #1
0
 def test_kgtk_cat_two_files(self):
     """Run ``kgtk cat`` on two edge files and check the combined row count."""
     merged_path = f'{self.temp_dir}/cat.tsv'
     input_paths = (
         'data/sample_kgtk_edge_Q47158.tsv',
         'data/sample_kgtk_edge_file_with_id.tsv',
     )
     cli_entry("kgtk", "cat", "-i", *input_paths, "-o", merged_path)
     merged = pd.read_csv(merged_path, sep='\t')
     # The concatenated output is expected to hold 6 data rows in total.
     self.assertEqual(merged.shape[0], 6)
Exemple #2
0
 def test_kgtk_add_id_overwrite_style_n1_l_num(self):
     """Overwrite ids with the ``node1-label-num`` style and verify every id.

     Each row's id is expected to be ``<node1>-<label>-0000``.
     """
     out_path = f'{self.temp_dir}/id.tsv'
     cli_entry("kgtk", "add-id", "-i", self.file_path2, "-o", out_path,
               "--overwrite-id", "--id-style", "node1-label-num")
     edges = pd.read_csv(out_path, sep='\t')
     for _, edge in edges.iterrows():
         expected_id = f'{edge["node1"]}-{edge["label"]}-0000'
         self.assertEqual(edge['id'], expected_id)
Exemple #3
0
    def test_kgtk_normalize_nodes_alias_only(self):
        """Run ``normalize-nodes`` restricted to ``alias`` and check the output."""
        out_path = f'{self.temp_dir}/normalize.tsv'
        cli_entry("kgtk", "normalize-nodes", "-i", self.file_path,
                  "-o", out_path, "-c", "alias")

        normalized = pd.read_csv(out_path, sep='\t')
        self.assertEqual(normalized.shape[0], 24)
        # The first distinct label in the output must be 'alias'.
        distinct_labels = list(normalized['label'].unique())
        self.assertEqual(distinct_labels[0], 'alias')
Exemple #4
0
 def test_kgtk_add_id_overwrite_style_prefix(self):
     """Overwrite ids with the ``prefix###`` style and prefix ``THIS``.

     Row ``i`` (0-based dataframe index) is expected to get the id
     ``THIS<i + 1>``.
     """
     out_path = f'{self.temp_dir}/id.tsv'
     cli_entry("kgtk", "add-id", "-i", self.file_path2, "-o", out_path,
               "--overwrite-id", "--id-style", "prefix###",
               "--id-prefix", "THIS")
     edges = pd.read_csv(out_path, sep='\t').fillna("")
     for position, edge in edges.iterrows():
         expected_id = f'THIS{position + 1}'
         self.assertEqual(edge['id'], expected_id)
Exemple #5
0
 def test_kgtk_add_id_new_id_column_specify_old_id_column(self):
     """Write new ids to ``id_new`` while naming ``id`` as the old id column.

     Row ``i`` (0-based dataframe index) is expected to carry ``E<i + 1>``
     in the ``id_new`` column.
     """
     source_path = 'data/sample_kgtk_edge_file_with_id.tsv'
     out_path = f'{self.temp_dir}/id.tsv'
     cli_entry("kgtk", "add-id", "-i", source_path, "-o", out_path,
               "--new-id-column-name", "id_new",
               "--old-id-column-name", "id")
     edges = pd.read_csv(out_path, sep='\t')
     for position, edge in edges.iterrows():
         expected_id = f'E{position + 1}'
         self.assertEqual(edge['id_new'], expected_id)
    def test_kgtk_split_by_qnode(self):
        """Split the input by qnode and check that one file exists per ``node1``."""
        source = pd.read_csv(self.file_path, sep='\t')
        expected_qnodes = list(source['node1'].unique())

        cli_entry("kgtk", "split", "-i", self.file_path,
                  "--output-path", self.temp_dir, "--split-by-qnode")

        # Every distinct node1 value must have produced its own TSV file.
        for qnode in expected_qnodes:
            qnode_file = f'{self.temp_dir}/{qnode}.tsv'
            self.assertTrue(os.path.exists(qnode_file))

        # Spot check: the file for Q1000133 is expected to hold 8 rows.
        sample = pd.read_csv(f'{self.temp_dir}/Q1000133.tsv', sep='\t')
        self.assertEqual(len(sample), 8)
Exemple #7
0
    def test_kgtk_filter_one_row(self):
        """Filter with a fully specified three-part pattern and expect one row."""
        kept_path = f'{self.temp_dir}/one_row.tsv'
        rejected_path = f'{self.temp_dir}/reject.tsv'
        pattern = "Q65695069;P577;^2019-07-19T00:00:00Z/11"
        cli_entry("kgtk", "filter", "-i", self.file_path, "-o", kept_path,
                  "-p", pattern, "-v", "--reject-file", rejected_path)

        matches = pd.read_csv(kept_path, sep='\t')

        self.assertEqual(matches.shape[0], 1)
Exemple #8
0
 def test_graph_embeddings_format_glove(self):
     """Generate graph embeddings in ``glove`` format and check line widths.

     Every line of the output file, split on tabs, must yield 101 fields.
     """
     out_path = f'{self.temp_dir}/out.tsv'
     cli_entry("kgtk", "graph-embeddings", "-i", self.file_path, "-o",
               out_path, '-e', '1', '-ot', 'glove')
     with open(out_path) as f:
         lines = f.readlines()
     for line in lines:
         fields = line.split('\t')
         # assertEqual reports the actual field count when the check fails,
         # which assertTrue(len(...) == 101) would not.
         self.assertEqual(len(fields), 101)
    def test_kgtk_split_by_lines_gzipped(self):
        """Split by line count with gzipped output and inspect the first chunk."""
        first_chunk_path = f'{self.temp_dir}/split_0.tsv.gz'

        cli_entry("kgtk", "split", "-i", self.file_path,
                  "--output-path", self.temp_dir,
                  "--lines", '10000', "--gzipped-output")

        self.assertTrue(os.path.exists(first_chunk_path))
        chunk = pd.read_csv(first_chunk_path, sep='\t')
        self.assertEqual(chunk.shape[0], 9993)
        # Number of distinct node1 values in the first chunk.
        distinct_node1 = list(chunk['node1'].unique())
        self.assertEqual(len(distinct_node1), 1718)
Exemple #10
0
    def test_export_gt(self):
        """Export the sample edge file to a ``.gt`` graph and check its size."""
        graph_path = f'{self.temp_dir}/graph.gt'
        log_path = f'{self.temp_dir}/log.txt'
        cli_entry("kgtk", "export-gt",
                  "-i", "data/sample_kgtk_edge_file.tsv",
                  "--directed",
                  "--log", log_path,
                  "-o", graph_path)

        graph = load_graph(graph_path)

        # The loaded graph is expected to have 287 edges and 287 vertices.
        edge_count = graph.num_edges()
        self.assertEqual(edge_count, 287)
        vertex_count = graph.num_vertices()
        self.assertEqual(vertex_count, 287)
Exemple #11
0
    def test_kgtk_filter_reject_file(self):
        """Run an inverted ``;;Q11365`` filter and check the reject file's size.

        The reject file must contain as many rows as ``self.df2`` has rows
        whose ``obj`` equals ``Q11365``.
        """
        expected_rows = self.df2.loc[self.df2['obj'] == 'Q11365']
        kept_path = f'{self.temp_dir}/Q11365.tsv'
        rejected_path = f'{self.temp_dir}/reject.tsv'
        cli_entry("kgtk", "filter", "-i", self.file_path2, "-o", kept_path,
                  "-p", ";;Q11365",
                  "--subj", "sub", "--pred", "pred", "--obj", "obj",
                  "-v", "--invert", "--reject-file", rejected_path)

        rejected = pd.read_csv(rejected_path, sep='\t')

        self.assertEqual(len(rejected), len(expected_rows))
Exemple #12
0
    def test_kgtk_filter_single_pred_inverted(self):
        """Run an inverted ``;P577;`` filter and check the main output's size.

        The output must contain as many rows as ``self.df2`` has rows whose
        ``pred`` differs from ``P577``.
        """
        expected_rows = self.df2.loc[self.df2['pred'] != 'P577']
        kept_path = f'{self.temp_dir}/P577.tsv'
        rejected_path = f'{self.temp_dir}/reject.tsv'
        cli_entry("kgtk", "filter", "-i", self.file_path2, "-o", kept_path,
                  "-p", ";P577;",
                  "--subj", "sub", "--pred", "pred", "--obj", "obj",
                  "-v", "--invert", "--reject-file", rejected_path)

        kept = pd.read_csv(kept_path, sep='\t')

        self.assertEqual(len(kept), len(expected_rows))
Exemple #13
0
    def test_kgtk_cat_output_json_line(self):
        """Run ``kgtk cat`` with ``jsonl`` output and validate every line.

        The output must contain 288 lines, and each line must parse as a
        JSON value of length 5.
        """
        out_path = f'{self.temp_dir}/cat.jl'
        cli_entry("kgtk", "cat", "-i", self.file_path, "-o", out_path,
                  "--output-format", "jsonl")

        # The context manager closes the file even if reading raises.
        with open(out_path) as f:
            lines = f.readlines()
        self.assertEqual(len(lines), 288)
        for line in lines:
            self.assertEqual(len(json.loads(line)), 5)
Exemple #14
0
    def test_kgtk_ifexists(self):
        """Run ``ifexists`` keyed on ``node1`` against the Q47158 sample file."""
        filter_path = 'data/sample_kgtk_edge_Q47158.tsv'
        out_path = f'{self.temp_dir}/Q47158.tsv'
        cli_entry("kgtk", "ifexists",
                  "-i", self.file_path,
                  "--filter-on", filter_path,
                  "-o", out_path,
                  "--input-keys", "node1",
                  "--filter-keys", "node1",
                  "--show-option", "--verbose")

        surviving = pd.read_csv(out_path, sep='\t')

        self.assertEqual(surviving.shape[0], 118)
Exemple #15
0
    def test_kgtk_split_by_lines(self):
        """Split by line count and check the first two chunks share no ``node1``.

        Both ``split_0.tsv`` and ``split_1.tsv`` must exist, and no ``node1``
        value found in the first chunk may also appear in the second.
        """
        first_path = f'{self.temp_dir}/split_0.tsv'
        second_path = f'{self.temp_dir}/split_1.tsv'

        cli_entry("kgtk", "split", "-i", self.file_path, "--output-path", self.temp_dir, "--lines", '5000')

        self.assertTrue(os.path.exists(first_path))
        self.assertTrue(os.path.exists(second_path))
        qnodes_0 = list(pd.read_csv(first_path, sep='\t')['node1'].unique())
        # Build the lookup set once so each membership test is O(1).
        qnodes_1 = set(pd.read_csv(second_path, sep='\t')['node1'].unique())
        for qnode in qnodes_0:
            # assertNotIn names the offending qnode if the check fails.
            self.assertNotIn(qnode, qnodes_1)
Exemple #16
0
    def test_kgtk_ifnotexists(self):
        """Run ``ifnotexists`` matching ``node1`` against the ``heading`` column."""
        filter_path = 'data/Q47158_non_edge.tsv'
        out_path = f'{self.temp_dir}/Q47158.tsv'
        cli_entry("kgtk", "ifnotexists",
                  "-i", self.file_path,
                  "--filter-on", filter_path,
                  "-o", out_path,
                  "--input-keys", "node1",
                  "--filter-keys", "heading",
                  "--mode", "NONE", "--verbose")

        surviving = pd.read_csv(out_path, sep='\t')

        self.assertEqual(surviving.shape[0], 169)
Exemple #17
0
    def test_import(self):
        """Import the Visual Genome sample and check the output table's shape.

        The output must have 9 columns, include a ``relation`` column, and
        contain 580 rows.
        """
        out_path = f'{self.temp_dir}/vg.tsv'
        cli_entry("kgtk", "import-visualgenome", "-i", "data/vg10.tsv",
                  "--attr-synsets", "data/attribute_synsets.json",
                  "-o", out_path)

        df = pd.read_csv(out_path, sep='\t')

        self.assertEqual(len(df.columns), 9)

        # Explicit check that the 'relation' column exists, instead of an
        # unused lookup that could only fail with a KeyError.
        self.assertIn('relation', df.columns)

        self.assertEqual(len(df), 580)
Exemple #18
0
    def test_kgtk_import_atomic(self):
        """Import the ATOMIC sample and check columns and a known relation.

        The output must have 9 columns and its ``relation`` column must
        contain the value ``at:xAttr``.
        """
        out_path = f'{self.temp_dir}/atomic.tsv'
        cli_entry("kgtk", "import-atomic", "-i", "data/atomic.csv", "-o",
                  out_path)

        df = pd.read_csv(out_path, sep='\t')

        self.assertEqual(len(df.columns), 9)

        relations = df['relation'].unique()

        # assertIn shows the available relations when the check fails.
        self.assertIn('at:xAttr', relations)
Exemple #19
0
 def test_graph_embeddings_format_w2v(self):
     """Generate graph embeddings in ``w2v`` format and check line widths.

     Lines are split on single spaces: the first line must yield 2 fields
     and every following line 101 fields.
     """
     out_path = f'{self.temp_dir}/out.tsv'
     cli_entry("kgtk", "graph-embeddings", "-i", self.file_path, "-o",
               out_path, '-e', '1', '-ot', 'w2v')
     with open(out_path) as f:
         lines = f.readlines()
     for index, line in enumerate(lines):
         fields = line.split(' ')
         # The first line has a different expected width from the rest.
         expected_width = 2 if index == 0 else 101
         # assertEqual reports the actual width when the check fails.
         self.assertEqual(len(fields), expected_width)
Exemple #20
0
    def test_import(self):
        """Run ``import_wordnet`` and check columns and expected relations.

        The output must have 9 columns and its ``relation`` column must
        include ``/r/IsA``, ``/r/PartOf`` and ``/r/MadeOf``.
        """
        out_path = f'{self.temp_dir}/wordnet.tsv'
        cli_entry("kgtk", "import_wordnet", "-o", out_path)

        df = pd.read_csv(out_path, sep='\t')

        self.assertEqual(len(df.columns), 9)

        relations = list(df['relation'].unique())

        for r in ['/r/IsA', '/r/PartOf', '/r/MadeOf']:
            # assertIn names the missing relation when the check fails.
            self.assertIn(r, relations)
Exemple #21
0
    def test_kgtk_import_concept_pairs(self):
        """Import synonym pairs and check that every row is a ``/r/Synonym`` edge.

        The output must have 9 columns, and the ``relation`` value of each
        row must equal ``/r/Synonym``.
        """
        out_path = f'{self.temp_dir}/roget_syn.tsv'
        cli_entry("kgtk", "import-concept-pairs", "-i", "data/synonyms.txt",
                  "--source", "RG", "--relation", "/r/Synonym", "-o",
                  out_path)

        df = pd.read_csv(out_path, sep='\t')

        self.assertEqual(len(df.columns), 9)

        for _, row in df.iterrows():
            # assertEqual shows the unexpected relation when the check fails.
            self.assertEqual(row['relation'], '/r/Synonym')
Exemple #22
0
    def test_kgtk_filter_p31(self):
        """Filter on the ``;P31;`` pattern and compare with a pandas-derived answer.

        Every ``node1`` in the filtered output must be among the ``node1``
        values of ``self.df`` rows whose ``label`` is ``P31``, and the output
        must contain 10 rows.
        """
        # Expected answer computed directly from the input dataframe.
        p31_qnodes = list(self.df.loc[self.df['label'] == 'P31']['node1'].unique())

        cli_entry("kgtk", "filter", "-i", self.file_path, "-o", f'{self.temp_dir}/p31.tsv', "-p", ";P31;", "-v",
                  "--reject-file", f'{self.temp_dir}/reject.tsv')

        df = pd.read_csv(f'{self.temp_dir}/p31.tsv', sep='\t')
        r_qnodes = list(df['node1'].unique())

        for q in r_qnodes:
            # assertIn names the unexpected qnode when the check fails.
            self.assertIn(q, p31_qnodes)
        self.assertEqual(len(df), 10)
Exemple #23
0
 def test_kgtk_add_id_default(self):
     """Run ``add-id`` with ``--verify-id-unique`` and check the generated ids.

     Row ``i`` (0-based dataframe index) is expected to get the id
     ``E<i + 1>``.
     """
     out_path = f'{self.temp_dir}/id.tsv'
     cli_entry("kgtk", "add-id", "-i", self.file_path, "-o", out_path,
               "--verify-id-unique")
     edges = pd.read_csv(out_path, sep='\t')
     for position, edge in edges.iterrows():
         expected_id = f'E{position + 1}'
         self.assertEqual(edge['id'], expected_id)
Exemple #24
0
    def test_kgtk_filter_Q2447774(self):
        """Filter on the ``Q2447774;;`` pattern and compare with a pandas-derived answer.

        Every ``node2`` in the filtered output must be among the ``node2``
        values of ``self.df`` rows whose ``node1`` is ``Q2447774``, and the
        output must contain 27 rows.
        """
        # Expected answer computed directly from the input dataframe.
        node2s = list(self.df.loc[self.df['node1'] == 'Q2447774']['node2'])

        cli_entry("kgtk", "filter", "-i", self.file_path, "-o", f'{self.temp_dir}/Q2447774.tsv', "-p", "Q2447774;;",
                  "--reject-file", f'{self.temp_dir}/reject.tsv')

        df = pd.read_csv(f'{self.temp_dir}/Q2447774.tsv', sep='\t')
        r_node2s = list(df['node2'])

        for q in r_node2s:
            # assertIn names the unexpected node2 when the check fails.
            self.assertIn(q, node2s)
        self.assertEqual(len(df), 27)
Exemple #25
0
    def test_kgtk_cat_output_json_line_map(self):
        """Run ``kgtk cat`` with ``jsonl-map`` output and validate every record.

        The output must contain 287 lines, and each parsed line must contain
        the keys ``id``, ``node1``, ``label``, ``node2`` and ``rank``.
        """
        out_path = f'{self.temp_dir}/cat.jl'
        cli_entry("kgtk", "cat", "-i", self.file_path, "-o", out_path, "--output-format", "jsonl-map")

        # The context manager closes the file even if reading raises.
        with open(out_path) as f:
            lines = f.readlines()
        self.assertEqual(len(lines), 287)
        for line in lines:
            x = json.loads(line)
            for key in ('id', 'node1', 'label', 'node2', 'rank'):
                # assertIn names the missing key when the check fails.
                self.assertIn(key, x)
Exemple #26
0
    def test_kgtk_cat_output_csv(self):
        """Run ``kgtk cat`` with ``csv`` output and compare it with the TSV input.

        The CSV must have the same row count and column names as the input,
        and each row's ``id`` must equal the first tab-separated field of the
        corresponding input line.
        """
        out_path = f'{self.temp_dir}/cat.csv'
        cli_entry("kgtk", "cat", "-i", self.file_path, "-o", out_path, "--output-format", "csv")

        df_r = pd.read_csv(out_path)

        df = pd.read_csv(self.file_path, sep='\t')
        self.assertEqual(len(df), len(df_r))
        self.assertEqual(list(df.columns), list(df_r.columns))

        # The context manager closes the file even if reading raises.
        with open(self.file_path) as f:
            lines = f.readlines()
        for i, row in df_r.iterrows():
            # lines[0] is the header, so data row i is at lines[i + 1].
            # NOTE(review): this assumes 'id' is the first input column - confirm.
            self.assertEqual(row["id"], lines[i + 1].split('\t')[0])
Exemple #27
0
    def test_kgtk_cat_output_json_map(self):
        """Run ``kgtk cat`` with ``json-map`` output and validate every record.

        The parsed JSON document must contain 287 entries, each containing
        the keys ``id``, ``node1``, ``label``, ``node2`` and ``rank``.
        """
        out_path = f'{self.temp_dir}/cat.json'
        cli_entry("kgtk", "cat", "-i", self.file_path, "-o",
                  out_path, "--output-format", "json-map")

        # The context manager closes the file even if parsing raises.
        with open(out_path) as f:
            obj = json.load(f)
        self.assertEqual(len(obj), 287)
        for x in obj:
            for key in ('id', 'node1', 'label', 'node2', 'rank'):
                # assertIn names the missing key when the check fails.
                self.assertIn(key, x)
Exemple #28
0
    def test_kgtk_import_framenet(self):
        """Run ``import_framenet`` and check the size of the produced table."""
        out_path = f'{self.temp_dir}/framenet.tsv'
        cli_entry("kgtk", "import_framenet", "-o", out_path)

        # na_filter=False stops pandas from converting values to NaN.
        framenet = pd.read_csv(out_path, sep='\t', na_filter=False)

        # The column count must match the expected value.
        column_count = len(framenet.columns)
        self.assertEqual(column_count, 9)

        # The row count must match the expected value.
        row_count = len(framenet)
        self.assertEqual(row_count, 29873)
Exemple #29
0
 def test_graph_embeddings_format_kgtk(self):
     """Generate graph embeddings in ``kgtk`` format and validate the table.

     The output must be non-empty, start with the tab-separated header
     ``node1``, ``label``, ``node2``, and every following line must have
     three fields with ``graph_embeddings`` as the second one.
     """
     out_path = f'{self.temp_dir}/out.tsv'
     cli_entry("kgtk", "graph-embeddings", "-i", self.file_path, "-o",
               out_path, '-e', '1', '-ot', 'kgtk')
     with open(out_path) as f:
         data = f.readlines()
     self.assertGreater(len(data), 0)
     header = data[0].rstrip('\r\n').split('\t')
     # A single list comparison checks both the column count and the names,
     # and shows the actual header when the check fails.
     self.assertEqual(header, ['node1', 'label', 'node2'])
     for entity_emb in data[1:]:
         value = entity_emb.rstrip('\r\n').split('\t')
         self.assertEqual(len(value), 3)
         self.assertEqual(value[1], 'graph_embeddings')
Exemple #30
0
    def test_kgtk_normalize_nodes_default(self):
        """Run ``normalize-nodes`` with default columns and check Q183's labels.

        The output must contain 52 rows. Among them, exactly 3 rows must have
        ``node1 == 'Q183'`` and ``label == 'label'``, and their ``node2``
        values must include the English, Russian and Ukrainian labels.
        """
        out_path = f'{self.temp_dir}/normalize.tsv'
        cli_entry("kgtk", "normalize-nodes", "-i", self.file_path, "-o",
                  out_path, "--verbose",
                  "--show-option")

        df = pd.read_csv(out_path, sep='\t')

        self.assertEqual(len(df), 52)
        # One combined mask, rather than a chained .loc that reuses a mask
        # built from the unfiltered frame.
        df = df.loc[(df['node1'] == 'Q183') & (df['label'] == 'label')]
        # assertTrue(len(df), 3) passed for any non-empty frame, because 3 was
        # taken as the failure message; assertEqual checks the count itself.
        self.assertEqual(len(df), 3)
        labels = list(df['node2'].unique())

        self.assertIn("'Germany'@en", labels)
        self.assertIn("'Германия'@ru", labels)
        self.assertIn("'Німеччина'@uk", labels)