Ejemplo n.º 1
0
    def test_triple_corrupted_edges(self):
        """Run the generator on a corrupted KGTK file and compare with reference files.

        ``TripleGenerator.process()`` is run over ``data/corrupted_kgtk.tsv``
        with ``error_action='log'``.  Both the generated triples and the
        generated warning log must match the stored reference files line for
        line.  The temporary outputs are removed only after both comparisons
        pass, so they remain on disk when the test fails.
        """
        corrupted_kgtk_file = Path('data/corrupted_kgtk.tsv')
        wikidata_property_file = 'data/wikidata_properties.tsv'
        o = 'data/corrupted_tmp.ttl'
        tmp_log = "data/corrupted_warning_tmp.log"
        generator = TripleGenerator(prop_file=wikidata_property_file,
                                    label_set='label',
                                    alias_set='aliases',
                                    description_set='descriptions',
                                    warning=True,
                                    n=100,
                                    truthy=True,
                                    use_id=True,
                                    dest_fp=o,
                                    log_path=tmp_log,
                                    prop_declaration=False,
                                    prefix_path="NONE",
                                    input_file=corrupted_kgtk_file,
                                    error_action='log')
        generator.process()

        # Context managers close the handles even when an assertion fails.
        with open('data/corrupted.ttl') as f1, open(o) as f2:
            self.assertEqual(f1.readlines(), f2.readlines())
        with open("data/corrupted_warning.log") as f1, open(tmp_log) as f2:
            self.assertEqual(f1.readlines(), f2.readlines())

        Path(tmp_log).unlink()
        Path(o).unlink()
Ejemplo n.º 2
0
    def test_property_triple_generation(self):
        """Generate non-truthy triples for ``data/P10.tsv`` and compare with the reference file.

        The generated file must equal ``data/P10_not_truthy.ttl`` line for
        line and the warning log must be empty.  The temporary outputs are
        removed only after all assertions pass.
        """
        property_tsv_file = Path('data/P10.tsv')
        wikidata_property_file = 'data/wikidata_properties.tsv'
        o = 'data/P10_not_truthy_tmp.ttl'
        log_path = "data/warning.log"
        generator = TripleGenerator(prop_file=wikidata_property_file,
                                    label_set='label',
                                    alias_set='aliases',
                                    description_set='descriptions',
                                    warning=True,
                                    n=100,
                                    truthy=False,
                                    use_id=True,
                                    dest_fp=o,
                                    log_path=log_path,
                                    prop_declaration=False,
                                    prefix_path="NONE",
                                    input_file=property_tsv_file,
                                    error_action='log')
        generator.process()

        # Context managers close the handles even when the assertion fails.
        with open('data/P10_not_truthy.ttl') as f1, open(o) as f2:
            self.assertEqual(f1.readlines(), f2.readlines())

        # No warnings are expected for this input.
        self.assertEqual(os.stat(log_path).st_size, 0)
        Path(log_path).unlink()
        Path(o).unlink()
def run(labels: str, aliases: str, descriptions: str, property_file: str,
        n: int, truthy: bool, warning: bool, use_id: bool, log_path: str,
        prop_declaration: bool, prefix_path: str, input_file: KGTKFiles,
        output_file: str, error_action: str):
    """Build a ``TripleGenerator`` from the given options and run it.

    Every argument is forwarded unchanged to the ``TripleGenerator``
    constructor.  The keyword names that differ from the parameter names are
    ``labels`` -> ``label_set``, ``aliases`` -> ``alias_set``,
    ``descriptions`` -> ``description_set``, ``property_file`` ->
    ``prop_file`` and ``output_file`` -> ``dest_fp``.

    Raises:
        KGTKException: wraps any ``Exception`` raised by
            ``generator.process()``; the original exception is attached as
            the explicit cause.
    """
    # import modules locally
    from kgtk.generator import TripleGenerator
    from kgtk.exceptions import KGTKException

    generator = TripleGenerator(prop_file=property_file,
                                label_set=labels,
                                alias_set=aliases,
                                description_set=descriptions,
                                n=n,
                                warning=warning,
                                truthy=truthy,
                                use_id=use_id,
                                dest_fp=output_file,
                                log_path=log_path,
                                prop_declaration=prop_declaration,
                                prefix_path=prefix_path,
                                input_file=input_file,
                                error_action=error_action)

    try:
        generator.process()
    except Exception as e:
        # Chain explicitly so the traceback shows the original failure as
        # the direct cause of the KGTKException.
        raise KGTKException(e) from e
Ejemplo n.º 4
0
    def test_triple_small_values(self):
        """Generate triples for ``data/small_values.tsv`` and compare with two gold files.

        The generated output is accepted when it equals either the gold file
        with plain decimal values or the gold file with exponent notation.
        The warning log must be empty.  Temporary outputs are removed at the
        end of a passing run.
        """
        generated_path = 'data/small_values_tmp.ttl'
        warning_log = "data/warning.log"

        def read_lines(path):
            # Return every line of ``path`` as a list, closing the file afterwards.
            with open(path) as handle:
                return handle.readlines()

        generator = TripleGenerator(prop_file='data/wikidata_properties.tsv',
                                    label_set='label',
                                    alias_set='aliases',
                                    description_set='descriptions',
                                    warning=True,
                                    n=100,
                                    truthy=True,
                                    use_id=True,
                                    dest_fp=generated_path,
                                    log_path=warning_log,
                                    prop_declaration=False,
                                    prefix_path="NONE",
                                    input_file=Path('data/small_values.tsv'),
                                    error_action='log')
        generator.process()

        # Gold file with the expected values, 0.00000019860001065575846:
        plain_gold = read_lines('data/small_values.ttl')
        # Gold file with values containing exponents, 1.9860001065575846E-7:
        exponent_gold = read_lines('data/small_values_with_exponent.ttl')
        # The file produced by this run:
        produced = read_lines(generated_path)

        # Matching either gold file is a pass; otherwise report the diff
        # against the plain-decimal gold file.
        if not (plain_gold == produced or exponent_gold == produced):
            self.assertEqual(plain_gold, produced)

        self.assertEqual(os.stat(warning_log).st_size, 0)
        Path(warning_log).unlink()
        Path(generated_path).unlink()
Ejemplo n.º 5
0
 def test_truthy_qnode_triple_generation(self):
     """Stream ``data/Q57160439.tsv`` through the generator and compare with the reference file.

     Lines starting with ``#`` are skipped; every other line is passed to
     ``entry_point`` with its 1-based line number.  The generated file must
     equal ``data/Q57160439_truthy.ttl`` line for line and the warning log
     must be empty.  Temporary outputs are removed only after all
     assertions pass.
     """
     qnode_tsv_file = 'data/Q57160439.tsv'
     wikidata_property_file = 'data/wikidata_properties.tsv'
     tmp_output = 'data/Q57160439_truthy_tmp.ttl'
     log_path = "data/warning.log"

     # Context managers close both streams even if the generator raises.
     with open(tmp_output, 'w') as o:
         generator = TripleGenerator(prop_file=wikidata_property_file, label_set='label', alias_set='aliases',
                                     description_set='descriptions', warning=True, n=100, truthy=True, use_id=True,
                                     dest_fp=o, log_path=log_path, prop_declaration=False, prefix_path="NONE")
         with open(qnode_tsv_file) as fp:
             for line_num, edge in enumerate(fp, start=1):
                 if edge.startswith("#"):
                     continue
                 generator.entry_point(line_num, edge)
         generator.finalize()

     # Handles are closed here even when the assertion fails.
     with open('data/Q57160439_truthy.ttl') as f1, open(tmp_output) as f2:
         self.assertEqual(f1.readlines(), f2.readlines())

     # No warnings are expected for this input.
     self.assertEqual(os.stat(log_path).st_size, 0)
     Path(log_path).unlink()
     Path(tmp_output).unlink()
Ejemplo n.º 6
0
    def test_truthy_dates_generation(self):
        """Stream ``data/dates.tsv`` through the generator and check output and log.

        Lines starting with ``#`` are skipped; every other line is passed to
        ``entry_point`` with its 1-based line number.  The test currently
        asserts that the generated file DIFFERS from ``data/dates_truthy.ttl``
        and that the warning log is NOT empty (see the TODO below).
        Temporary outputs are removed only after all assertions pass.
        """
        # to reproduce standard file
        # kgtk generate_wikidata_triples -pf wikidata_properties.tsv -w yes --log-path date_warning.log -n 100 --use-id yes -gt yes < dates.tsv > dates_truthy.ttl
        dates_tsv_file = 'data/dates.tsv'
        wikidata_property_file = 'data/wikidata_properties.tsv'
        tmp_output = 'data/dates_truthy_tmp.ttl'
        log_path = "data/date_warning.log"

        # Context managers close both streams even if the generator raises.
        with open(tmp_output, 'w') as o:
            generator = TripleGenerator(prop_file=wikidata_property_file, label_set='label', alias_set='aliases',
                                        description_set='descriptions', warning=True, n=100, truthy=True, use_id=True,
                                        dest_fp=o, log_path=log_path, prop_declaration=False, prefix_path="NONE")
            with open(dates_tsv_file) as fp:
                for line_num, edge in enumerate(fp, start=1):
                    if edge.startswith("#"):
                        continue
                    generator.entry_point(line_num, edge)
            generator.finalize()

        # TODO: until date validation is published the output is expected to
        # differ from the reference file; switch this to assertEqual afterwards.
        with open('data/dates_truthy.ttl') as f1, open(tmp_output) as f2:
            self.assertNotEqual(f1.readlines(), f2.readlines())

        # TODO: revisit together with the comparison above.
        self.assertNotEqual(os.stat(log_path).st_size, 0)
        Path(log_path).unlink()
        Path(tmp_output).unlink()
Ejemplo n.º 7
0
    def test_triple_corrupted_edges(self):
        """Stream a corrupted KGTK file through the generator and compare with reference files.

        Lines that start with ``#`` or that are empty after removing newline
        characters are skipped; every other line is passed to ``entry_point``
        with its 1-based line number.  Both the generated triples and the
        generated warning log must match the stored reference files line for
        line.  Temporary outputs are removed only after both comparisons pass.
        """
        corrupted_kgtk_file = 'data/corrupted_kgtk.tsv'
        wikidata_property_file = 'data/wikidata_properties.tsv'
        tmp_output = 'data/corrupted_tmp.ttl'
        tmp_log = "data/corrupted_warning_tmp.log"

        # Context managers close both streams even if the generator raises.
        with open(tmp_output, 'w') as o:
            generator = TripleGenerator(prop_file=wikidata_property_file, label_set='label', alias_set='aliases',
                                        description_set='descriptions', warning=True, n=100, truthy=True, use_id=True,
                                        dest_fp=o, log_path=tmp_log, prop_declaration=False, prefix_path="NONE")
            with open(corrupted_kgtk_file) as fp:
                for line_num, edge in enumerate(fp, start=1):
                    if edge.startswith("#") or not edge.strip("\n"):
                        continue
                    generator.entry_point(line_num, edge)
            generator.finalize()

        # Handles are closed here even when an assertion fails.
        with open('data/corrupted.ttl') as f1, open(tmp_output) as f2:
            self.assertEqual(f1.readlines(), f2.readlines())
        with open("data/corrupted_warning.log") as f1, open(tmp_log) as f2:
            self.assertEqual(f1.readlines(), f2.readlines())

        Path(tmp_log).unlink()
        Path(tmp_output).unlink()
Ejemplo n.º 8
0
def run(labels: str, aliases: str, descriptions: str, prop_file: str, n: int,
        truthy: bool, warning: bool, use_gz: bool, use_id: bool, log_path: str,
        prop_declaration: bool, prefix_path: str):
    """Read edges from standard input and write generated triples to standard output.

    Input is read from ``sys.stdin``, wrapped with ``gzip.open`` in text mode
    when ``use_gz`` is true.  The remaining arguments are forwarded to the
    ``TripleGenerator`` constructor.

    Without ``prop_declaration``, every line that is neither a ``#`` comment
    nor empty is handed to ``generator.entry_point`` with its 1-based line
    number.

    With ``prop_declaration``, the first line is remembered and handed to
    ``entry_point``.  Following lines go to ``read_prop_declaration`` until a
    line identical to the first one appears; that line is consumed, and all
    later lines go to ``entry_point`` numbered relative to the repeated line.
    """
    # import modules locally
    import gzip
    from kgtk.generator import TripleGenerator
    import sys

    generator = TripleGenerator(
        prop_file=prop_file,
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        n=n,
        warning=warning,
        truthy=truthy,
        use_id=use_id,
        dest_fp=sys.stdout,
        log_path=log_path,
        prop_declaration=prop_declaration,
        prefix_path=prefix_path,
    )

    source = gzip.open(sys.stdin.buffer, 'rt') if use_gz else sys.stdin

    if not prop_declaration:
        for index, row in enumerate(source):
            if row.startswith("#") or len(row.strip("\n")) == 0:
                continue
            generator.entry_point(index + 1, row)
        generator.finalize()
        return

    declaration_lines = 0   # lines seen before the first line repeats
    first_row = None        # the very first input line
    generating = False      # becomes True once the first line has repeated
    for index, row in enumerate(source):
        if index == 0:
            first_row = row
            generator.entry_point(index + 1, row)
            declaration_lines += 1
        elif generating:
            # Number the line relative to the repeated first line.
            relative_index = index - declaration_lines
            generator.entry_point(relative_index + 1, row)
        elif row == first_row:
            # The first line showed up again: triple generation starts
            # with the next line.
            generating = True
        else:
            declaration_lines += 1
            generator.read_prop_declaration(index + 1, row)

    generator.finalize()