Exemple #1
0
    def test_ranked_kgtk_generation(self):
        """Check JSON generation for the ranked example against the stored output.

        Runs ``JsonGenerator`` over ``data/ranked_example.tsv`` and asserts
        that the generated ``data/ranked_tmp0.jsonl`` matches
        ``data/ranked0.jsonl`` line for line. The generated file is removed
        afterwards, even when the comparison fails.
        """
        ranked_tsv_file = Path('data/ranked_example.tsv')
        wikidata_property_file = 'data/wikidata_properties.tsv'
        generated_file = Path('data/ranked_tmp0.jsonl')
        generator = JsonGenerator(prop_file=Path(wikidata_property_file),
                                  label_set='label',
                                  alias_set='alias',
                                  description_set='description',
                                  warning=True,
                                  n=1000,
                                  log_path="data/ranked_warning.log",
                                  prop_declaration=False,
                                  has_rank=False,
                                  output_prefix="data/ranked_tmp",
                                  property_declaration_label="property_type",
                                  input_file=Path(ranked_tsv_file),
                                  error_action='log')
        try:
            generator.process()

            # Context managers close both handles even if the assertion fails.
            with open('data/ranked0.jsonl') as f1, \
                    open('data/ranked_tmp0.jsonl') as f2:
                self.assertEqual(f1.readlines(), f2.readlines())
        finally:
            # Always clean up the generated artifact so a failing run does
            # not leave stale output behind for the next run.
            if generated_file.exists():
                generated_file.unlink()
Exemple #2
0
    def test_dates_generation(self):
        """Check JSON generation for the dates example against the stored output.

        Runs ``JsonGenerator`` over ``data/dates.tsv``, asserts that the
        generated ``data/dates_tmp0.jsonl`` matches ``data/dates0.jsonl`` line
        for line, and asserts that the warning log is empty. Both generated
        files are removed afterwards, even when an assertion fails.
        """
        dates_tsv_file = Path('data/dates.tsv')
        wikidata_property_file = 'data/wikidata_properties.tsv'
        log_file = Path("data/date_warning.log")
        generated_file = Path('data/dates_tmp0.jsonl')
        # NOTE(review): alias_set='aliase' differs from the 'alias'/'aliases'
        # values used by sibling tests and looks like a typo; it is kept as-is
        # because the stored expected output may depend on it -- confirm.
        generator = JsonGenerator(prop_file=Path(wikidata_property_file),
                                  label_set='label',
                                  alias_set='aliase',
                                  description_set='description',
                                  warning=True,
                                  n=100,
                                  log_path="data/date_warning.log",
                                  has_rank=False,
                                  prop_declaration=False,
                                  output_prefix="data/dates_tmp",
                                  input_file=Path(dates_tsv_file),
                                  property_declaration_label="property_type",
                                  error_action='log')
        try:
            generator.process()

            # Context managers close both handles even if the assertion fails.
            with open('data/dates0.jsonl') as f1, \
                    open('data/dates_tmp0.jsonl') as f2:
                self.assertEqual(f1.readlines(), f2.readlines())

            # An empty log means the generator emitted no warnings.
            self.assertEqual(os.stat("data/date_warning.log").st_size, 0)
        finally:
            # Always clean up generated artifacts so a failing run does not
            # leave stale files behind for the next run.
            for leftover in (log_file, generated_file):
                if leftover.exists():
                    leftover.unlink()
def run(
    input_file: KGTKFiles,
    prop_file: KGTKFiles,
    labels: str,
    aliases: str,
    descriptions: str,
    prop_declaration: bool,
    output_prefix: str,
    n: int,
    log_path: str,
    warning: bool,
    has_rank: bool,
    error_action: str,
    property_declaration_label: str,
    ignore_property_declarations_in_file: bool,
    filter_prop_file: bool,
    verbose: bool,
):
    """Build a ``JsonGenerator`` from the command arguments and run it.

    ``input_file`` and ``prop_file`` are resolved to paths through
    ``KGTKArgumentParser`` (the property file is optional). ``labels``,
    ``aliases`` and ``descriptions`` are passed to the generator as
    ``label_set``, ``alias_set`` and ``description_set``; every other
    argument is forwarded under its own name. The work itself is done by
    ``JsonGenerator.process()``.
    """
    # import modules locally
    from pathlib import Path
    from kgtk.generator import JsonGenerator

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    prop_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_input_file(
            prop_file, who="KGTK prop file")

    generator = JsonGenerator(
        input_file=input_kgtk_file,
        prop_file=prop_kgtk_file,
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        output_prefix=output_prefix,
        n=n,
        log_path=log_path,
        warning=warning,
        prop_declaration=prop_declaration,
        has_rank=has_rank,
        error_action=error_action,
        property_declaration_label=property_declaration_label,
        ignore_property_declarations_in_file=
        ignore_property_declarations_in_file,
        filter_prop_file=filter_prop_file,
        verbose=verbose,
    )
    generator.process()
Exemple #4
0
    def test_qnode_json_generation(self):
        """Check JSON generation for the Q57160439 example against the stored output.

        Runs ``JsonGenerator`` over ``data/Q57160439.tsv``, asserts that the
        generated ``data/Q57160439_tmp0.jsonl`` matches
        ``data/Q571604390.jsonl`` line for line, and asserts that the warning
        log is empty. Both generated files are removed afterwards, even when
        an assertion fails.
        """
        qnode_tsv_file = Path('data/Q57160439.tsv')
        wikidata_property_file = 'data/wikidata_properties.tsv'
        log_file = Path("data/Q57160439_warning.log")
        generated_file = Path('data/Q57160439_tmp0.jsonl')
        generator = JsonGenerator(prop_file=wikidata_property_file, label_set='label', alias_set='aliases',
                                  description_set='descriptions', warning=True, n=1000,
                                  log_path="data/Q57160439_warning.log",
                                  prop_declaration=False,
                                  has_rank=False,
                                  output_prefix="data/Q57160439_tmp",
                                  input_file=qnode_tsv_file, error_action='log')
        try:
            generator.process()

            # Context managers close both handles even if the assertion fails.
            with open('data/Q571604390.jsonl') as f1, \
                    open('data/Q57160439_tmp0.jsonl') as f2:
                self.assertEqual(f1.readlines(), f2.readlines())

            # An empty log means the generator emitted no warnings.
            self.assertEqual(os.stat("data/Q57160439_warning.log").st_size, 0)
        finally:
            # Always clean up generated artifacts so a failing run does not
            # leave stale files behind for the next run.
            for leftover in (log_file, generated_file):
                if leftover.exists():
                    leftover.unlink()
Exemple #5
0
def run(
    labels: str,
    aliases: str,
    descriptions: str,
    prop_file: str,
    prop_declaration: bool,
    use_gz: bool,
    output_prefix: str,
    n: int,
    log_path: str,
    warning: bool,
    input_file: str,
    has_rank: bool,
):
    """Feed edges from a file or stdin, line by line, into a ``JsonGenerator``.

    ``labels``, ``aliases`` and ``descriptions`` are passed to the generator
    as ``label_set``, ``alias_set`` and ``description_set``; ``prop_file``,
    ``output_prefix``, ``n``, ``log_path``, ``warning``, ``prop_declaration``
    and ``has_rank`` are forwarded under their own names.

    Input selection:
      * ``input_file`` set: that file is read (gzip-decompressed as text
        when ``use_gz`` is true) and closed before returning.
      * ``input_file`` empty: stdin is read (gzip-decompressed when
        ``use_gz`` is true) and left open.

    Processing:
      * ``prop_declaration`` false: every line that is neither blank nor
        starts with ``#`` goes to ``generator.entry_point``.
      * ``prop_declaration`` true with a file: the file is read twice, first
        through ``generator.read_prop_declaration`` and then, after
        rewinding, through ``generator.entry_point``.
      * ``prop_declaration`` true on stdin: the stream cannot be rewound, so
        the first line is remembered and lines go to
        ``read_prop_declaration`` until that first line appears again
        (presumably the same file piped twice); later lines go to
        ``entry_point`` with line numbers restarted.

    ``generator.finalize()`` is called once all lines have been consumed.

    Raises:
        KGTKException: if ``input_file`` cannot be opened.
    """
    # import modules locally
    from kgtk.generator import JsonGenerator
    import sys
    import gzip
    from kgtk.exceptions import KGTKException

    generator = JsonGenerator(label_set=labels,
                              alias_set=aliases,
                              description_set=descriptions,
                              prop_file=prop_file,
                              output_prefix=output_prefix,
                              n=n,
                              log_path=log_path,
                              warning=warning,
                              prop_declaration=prop_declaration,
                              has_rank=has_rank)
    # loop first round
    if use_gz:
        if input_file:
            try:
                # Decompress as text so the generator receives str lines, as
                # on every other input path (a plain open(..., "rb") would
                # hand it raw compressed bytes).
                fp = gzip.open(input_file, "rt")
            except Exception as err:
                raise KGTKException(
                    "Fail to read from compressed file {}. Exiting.".format(
                        input_file)) from err
        else:
            fp = gzip.open(sys.stdin.buffer, 'rt')
    else:
        if input_file:
            try:
                fp = open(input_file, "r")
            except Exception as err:
                raise KGTKException(
                    "Fail to read from file {}. Exiting.".format(
                        input_file)) from err
        else:
            fp = sys.stdin

    try:
        if prop_declaration:
            if input_file:
                # A real file can be rewound: first pass collects property
                # declarations, second pass generates the output.
                for line_num, edge in enumerate(fp):
                    generator.read_prop_declaration(line_num + 1, edge)
                fp.seek(0)
                for line_num, edge in enumerate(fp):
                    generator.entry_point(line_num + 1, edge)
            else:
                file_lines = 0
                begining_edge = None
                start_generation = False
                for line_num, edge in enumerate(fp):
                    if line_num == 0:
                        begining_edge = edge
                        generator.entry_point(line_num + 1, edge)
                        file_lines += 1
                    elif start_generation:
                        # start generation because reached the starting
                        # position of the second `cat`; renumber lines
                        # relative to the start of that second copy
                        line_number = line_num - file_lines
                        generator.entry_point(line_number + 1, edge)
                    elif edge == begining_edge:
                        # The first line showed up again: the second copy of
                        # the input begins here.
                        start_generation = True
                    else:
                        file_lines += 1
                        generator.read_prop_declaration(line_num + 1, edge)
        else:
            for line_num, edge in enumerate(fp):
                # Skip comment lines and empty lines.
                if edge.startswith("#") or len(edge.strip("\n")) == 0:
                    continue
                generator.entry_point(line_num + 1, edge)

        generator.finalize()
    finally:
        # Only close handles this function opened on a named file, and do so
        # even when processing fails part-way through.
        if input_file:
            fp.close()
Exemple #6
0
def run(labels: str, aliases: str, descriptions: str, prop_file: str,
        prop_declaration: bool, use_gz: bool, output_prefix: str, n: int,
        log_path: str, warning: bool):
    """Stream edges from stdin, line by line, into a ``JsonGenerator``.

    stdin is gzip-decompressed as text when ``use_gz`` is true. Without
    ``prop_declaration`` every line that is neither blank nor starts with
    ``#`` goes to ``generator.entry_point``. With ``prop_declaration`` the
    first line is remembered; following lines go to
    ``generator.read_prop_declaration`` until that first line appears again,
    after which lines go to ``entry_point`` with line numbers restarted.
    ``generator.finalize()`` is called once the stream is exhausted.
    """
    # import modules locally
    from kgtk.generator import JsonGenerator
    import sys
    import gzip

    generator = JsonGenerator(
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        prop_file=prop_file,
        output_prefix=output_prefix,
        n=n,
        log_path=log_path,
        warning=warning,
        prop_declaration=prop_declaration,
    )

    # process stdin
    stream = gzip.open(sys.stdin.buffer, 'rt') if use_gz else sys.stdin

    if not prop_declaration:
        # Plain mode: forward everything except comments and empty lines.
        for index, row in enumerate(stream):
            if not row.startswith("#") and len(row.strip("\n")) != 0:
                generator.entry_point(index + 1, row)
        generator.finalize()
        return

    declaration_lines = 0
    first_row = None
    generating = False
    for index, row in enumerate(stream):
        if index == 0:
            first_row = row
            generator.entry_point(index + 1, row)
            declaration_lines += 1
        elif generating:
            # Past the start of the second `cat`: renumber relative to it.
            generator.entry_point(index - declaration_lines + 1, row)
        elif row == first_row:
            # The first line showed up again; switch to generation.
            generating = True
        else:
            declaration_lines += 1
            generator.read_prop_declaration(index + 1, row)

    generator.finalize()