Ejemplo n.º 1
0
    def test_truthy_dates_generation(self):
        """Generate truthy triples from ``data/dates.tsv`` and check the results.

        The generated Turtle is written to a temporary file and compared with
        the stored ``data/dates_truthy.ttl``; the warning log is expected to be
        non-empty. Both temporary artefacts are removed even when an assertion
        or the generator itself fails, so a failing run does not leave stale
        files (or open handles) behind.
        """
        # to reproduce standard file
        # kgtk generate_wikidata_triples -pf wikidata_properties.tsv -w yes --log-path date_warning.log -n 100 --use-id yes -gt yes < dates.tsv > dates_truthy.ttl
        dates_tsv_file = 'data/dates.tsv'
        wikidata_property_file = 'data/wikidata_properties.tsv'
        tmp_log = Path("data/date_warning.log")
        tmp_ttl = Path('data/dates_truthy_tmp.ttl')
        try:
            with open('data/dates_truthy_tmp.ttl', 'w') as o:
                generator = TripleGenerator(prop_file = wikidata_property_file, label_set='label', alias_set='aliases',
                                            description_set='descriptions', warning=True, n=100, truthy=True, use_id=True,
                                            dest_fp=o,log_path="data/date_warning.log",prop_declaration=False, prefix_path="NONE")
                with open(dates_tsv_file) as fp:
                    for line_num, edge in enumerate(fp):
                        # Comment lines are not edges; skip them but keep the
                        # original 1-based line numbering for the rest.
                        if edge.startswith("#"):
                            continue
                        generator.entry_point(line_num + 1, edge)
                    generator.finalize()

            with open('data/dates_truthy.ttl') as f1, open('data/dates_truthy_tmp.ttl') as f2:
                # TODO until date validation published: switch this back to
                # assertEqual once the reference file matches again.
                self.assertNotEqual(f1.readlines(), f2.readlines())
            self.assertNotEqual(os.stat("data/date_warning.log").st_size, 0)
        finally:
            # Always clean up the temporary artefacts, in the original order.
            for leftover in (tmp_log, tmp_ttl):
                if leftover.exists():
                    leftover.unlink()
Ejemplo n.º 2
0
 def test_truthy_qnode_triple_generation(self):
     """Generate truthy triples for ``data/Q57160439.tsv`` and compare with the reference.

     The generated Turtle must equal ``data/Q57160439_truthy.ttl`` line for
     line and the warning log must be empty. Temporary files are removed and
     all file handles are closed even when the test fails part-way.
     """
     qnode_tsv_file = 'data/Q57160439.tsv'
     wikidata_property_file = 'data/wikidata_properties.tsv'
     tmp_log = Path("data/warning.log")
     tmp_ttl = Path('data/Q57160439_truthy_tmp.ttl')
     try:
         with open('data/Q57160439_truthy_tmp.ttl', 'w') as o:
             generator = TripleGenerator(prop_file = wikidata_property_file, label_set='label', alias_set='aliases',
                                         description_set='descriptions', warning=True, n=100, truthy=True, use_id=True,
                                         dest_fp=o,log_path="data/warning.log",prop_declaration=False,prefix_path="NONE")
             with open(qnode_tsv_file) as fp:
                 for line_num, edge in enumerate(fp):
                     # Comment lines are skipped; line numbers stay 1-based.
                     if edge.startswith("#"):
                         continue
                     generator.entry_point(line_num + 1, edge)
                 generator.finalize()

         with open('data/Q57160439_truthy.ttl') as f1, open('data/Q57160439_truthy_tmp.ttl') as f2:
             self.assertEqual(f1.readlines(), f2.readlines())
         # A clean input is expected to produce no warnings at all.
         self.assertEqual(os.stat("data/warning.log").st_size, 0)
     finally:
         # Always clean up the temporary artefacts, in the original order.
         for leftover in (tmp_log, tmp_ttl):
             if leftover.exists():
                 leftover.unlink()
Ejemplo n.º 3
0
    def test_triple_corrupted_edges(self):
        """Feed ``data/corrupted_kgtk.tsv`` to the generator and compare output and log.

        Both the generated Turtle and the warning log are compared line for
        line with their stored references (``data/corrupted.ttl`` and
        ``data/corrupted_warning.log``). Temporary files are removed and all
        file handles are closed even when the test fails part-way.
        """
        corrupted_kgtk_file = 'data/corrupted_kgtk.tsv'
        wikidata_property_file = 'data/wikidata_properties.tsv'
        tmp_log = Path("data/corrupted_warning_tmp.log")
        tmp_ttl = Path('data/corrupted_tmp.ttl')
        try:
            with open('data/corrupted_tmp.ttl', 'w') as o:
                generator = TripleGenerator(prop_file = wikidata_property_file, label_set='label', alias_set='aliases',
                                            description_set='descriptions', warning=True, n=100, truthy=True, use_id=True,
                                            dest_fp=o,log_path="data/corrupted_warning_tmp.log",prop_declaration=False,prefix_path="NONE")
                with open(corrupted_kgtk_file) as fp:
                    for line_num, edge in enumerate(fp):
                        # Skip comment lines and lines that are empty once the
                        # newline is removed; line numbers stay 1-based.
                        if edge.startswith("#") or len(edge.strip("\n")) == 0:
                            continue
                        generator.entry_point(line_num + 1, edge)
                    generator.finalize()

            with open('data/corrupted.ttl') as f1, open('data/corrupted_tmp.ttl') as f2:
                self.assertEqual(f1.readlines(), f2.readlines())
            with open("data/corrupted_warning.log") as f1, open("data/corrupted_warning_tmp.log") as f2:
                self.assertEqual(f1.readlines(), f2.readlines())
        finally:
            # Always clean up the temporary artefacts, in the original order.
            for leftover in (tmp_log, tmp_ttl):
                if leftover.exists():
                    leftover.unlink()
Ejemplo n.º 4
0
def run(labels: str, aliases: str, descriptions: str, prop_file: str, n: int,
        truthy: bool, warning: bool, use_gz: bool, use_id: bool, log_path: str,
        prop_declaration: bool, prefix_path: str):
    """Stream edges from standard input through a ``TripleGenerator`` to stdout.

    All arguments except ``use_gz`` are forwarded unchanged to the
    ``TripleGenerator`` constructor (``labels``/``aliases``/``descriptions``
    as ``label_set``/``alias_set``/``description_set``).

    ``use_gz`` selects whether standard input is read through ``gzip`` in text
    mode or used directly.

    When ``prop_declaration`` is false, every line that is neither a ``#``
    comment nor empty is handed to ``entry_point`` with its 1-based line
    number. When it is true, the stream is handled in two phases: lines up to
    the first repetition of the very first line go to
    ``read_prop_declaration``; lines after that repetition go to
    ``entry_point``, numbered relative to the repetition.
    NOTE(review): this presumably expects the same file concatenated twice on
    stdin -- confirm against the caller.
    """
    # import modules locally
    import gzip
    import sys
    from kgtk.generator import TripleGenerator

    generator = TripleGenerator(
        prop_file=prop_file,
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        n=n,
        warning=warning,
        truthy=truthy,
        use_id=use_id,
        dest_fp=sys.stdout,
        log_path=log_path,
        prop_declaration=prop_declaration,
        prefix_path=prefix_path,
    )

    # Pick the input stream: gzip-decoded text or plain stdin.
    source = gzip.open(sys.stdin.buffer, 'rt') if use_gz else sys.stdin

    if not prop_declaration:
        # Single pass: skip comments and empty lines, feed everything else.
        for index, row in enumerate(source):
            if row.startswith("#") or len(row.strip("\n")) == 0:
                continue
            generator.entry_point(index + 1, row)
        generator.finalize()
        return

    # Two-phase pass over the stream.
    first_row = None        # the very first line; its repetition marks phase two
    first_pass_rows = 0     # number of lines consumed before the repetition
    generating = False      # True once the repetition of first_row was seen
    for index, row in enumerate(source):
        if index == 0:
            first_row = row
            generator.entry_point(index + 1, row)
            first_pass_rows += 1
            continue

        if generating:
            # Renumber relative to the start of the second copy.
            relative_index = index - first_pass_rows
            generator.entry_point(relative_index + 1, row)
            continue

        if row == first_row:
            # Reached the first line again: switch to triple generation.
            generating = True
            continue

        first_pass_rows += 1
        generator.read_prop_declaration(index + 1, row)

    generator.finalize()