def test_ranked_kgtk_generation(self):
    """Regression test: generating JSON from a ranked-edge TSV must reproduce
    the checked-in expected output `data/ranked0.jsonl`.
    """
    ranked_tsv_file = Path('data/ranked_example.tsv')
    wikidata_property_file = 'data/wikidata_properties.tsv'
    generator = JsonGenerator(prop_file=Path(wikidata_property_file),
                              label_set='label',
                              alias_set='alias',
                              description_set='description',
                              warning=True,
                              n=1000,
                              log_path="data/ranked_warning.log",
                              prop_declaration=False,
                              has_rank=False,
                              output_prefix="data/ranked_tmp",
                              property_declaration_label="property_type",
                              input_file=Path(ranked_tsv_file),
                              error_action='log')
    generator.process()
    # Context managers so both handles are closed even when the
    # assertion fails (the original leaked them on failure).
    with open('data/ranked0.jsonl') as expected, \
            open('data/ranked_tmp0.jsonl') as actual:
        self.assertEqual(expected.readlines(), actual.readlines())
    # Clean up the generated output file.
    Path('data/ranked_tmp0.jsonl').unlink()
def test_dates_generation(self):
    """Regression test: date-valued edges must serialize to JSON identical to
    the checked-in expected output `data/dates0.jsonl`, with no warnings.
    """
    dates_tsv_file = Path('data/dates.tsv')
    wikidata_property_file = 'data/wikidata_properties.tsv'
    # NOTE(review): alias_set='aliase' looks like a typo for 'alias'/'aliases'
    # (siblings use those) — confirm against the fixture's column names.
    generator = JsonGenerator(prop_file=Path(wikidata_property_file),
                              label_set='label',
                              alias_set='aliase',
                              description_set='description',
                              warning=True,
                              n=100,
                              log_path="data/date_warning.log",
                              has_rank=False,
                              prop_declaration=False,
                              output_prefix="data/dates_tmp",
                              input_file=Path(dates_tsv_file),
                              property_declaration_label="property_type",
                              error_action='log')
    generator.process()
    # Context managers so both handles are closed even when the
    # assertion fails (the original leaked them on failure).
    with open('data/dates0.jsonl') as expected, \
            open('data/dates_tmp0.jsonl') as actual:
        self.assertEqual(expected.readlines(), actual.readlines())
    # An empty warning log means no edge was rejected or flagged.
    self.assertEqual(os.stat("data/date_warning.log").st_size, 0)
    # Clean up the log and the generated output file.
    Path("data/date_warning.log").unlink()
    Path('data/dates_tmp0.jsonl').unlink()
def run(
    input_file: KGTKFiles,
    prop_file: KGTKFiles,
    labels: str,
    aliases: str,
    descriptions: str,
    prop_declaration: bool,
    output_prefix: str,
    n: int,
    log_path: str,
    warning: bool,
    has_rank: bool,
    error_action: str,
    property_declaration_label: str,
    ignore_property_declarations_in_file: bool,
    filter_prop_file: bool,
    verbose: bool,
):
    """Resolve the KGTK file arguments, build a JsonGenerator with the given
    options, and run it over the input.
    """
    # import modules locally
    from pathlib import Path
    from kgtk.generator import JsonGenerator
    import sys
    import gzip
    from kgtk.exceptions import KGTKException

    # Resolve CLI file arguments to concrete paths; the property file
    # is optional.
    resolved_input: Path = KGTKArgumentParser.get_input_file(input_file)
    resolved_props: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_input_file(prop_file,
                                                   who="KGTK prop file")

    JsonGenerator(
        input_file=resolved_input,
        prop_file=resolved_props,
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        output_prefix=output_prefix,
        n=n,
        log_path=log_path,
        warning=warning,
        prop_declaration=prop_declaration,
        has_rank=has_rank,
        error_action=error_action,
        property_declaration_label=property_declaration_label,
        ignore_property_declarations_in_file=
        ignore_property_declarations_in_file,
        filter_prop_file=filter_prop_file,
        verbose=verbose,
    ).process()
def test_qnode_json_generation(self):
    """Regression test: a single-Qnode TSV (Q57160439) must serialize to JSON
    identical to the checked-in expected output, with no warnings.
    """
    qnode_tsv_file = Path('data/Q57160439.tsv')
    # NOTE(review): sibling tests wrap prop_file in Path(); this one passes a
    # plain str — presumably JsonGenerator accepts both, confirm.
    wikidata_property_file = 'data/wikidata_properties.tsv'
    generator = JsonGenerator(prop_file=wikidata_property_file,
                              label_set='label',
                              alias_set='aliases',
                              description_set='descriptions',
                              warning=True,
                              n=1000,
                              log_path="data/Q57160439_warning.log",
                              prop_declaration=False,
                              has_rank=False,
                              output_prefix="data/Q57160439_tmp",
                              input_file=qnode_tsv_file,
                              error_action='log')
    generator.process()
    # Context managers so both handles are closed even when the
    # assertion fails (the original leaked them on failure).
    with open('data/Q571604390.jsonl') as expected, \
            open('data/Q57160439_tmp0.jsonl') as actual:
        self.assertEqual(expected.readlines(), actual.readlines())
    # An empty warning log means no edge was rejected or flagged.
    self.assertEqual(os.stat("data/Q57160439_warning.log").st_size, 0)
    # Clean up the log and the generated output file.
    Path("data/Q57160439_warning.log").unlink()
    Path('data/Q57160439_tmp0.jsonl').unlink()
def run(
    labels: str,
    aliases: str,
    descriptions: str,
    prop_file: str,
    prop_declaration: bool,
    use_gz: bool,
    output_prefix: str,
    n: int,
    log_path: str,
    warning: bool,
    input_file: str,
    has_rank: bool,
):
    """Run the KGTK JSON generator over *input_file*, or over stdin when
    *input_file* is empty.

    When *prop_declaration* is set, property declarations are read in a first
    pass (a real seekable file is read twice; for stdin the caller is expected
    to pipe the file twice, `cat f f`, and the second copy — detected when the
    first line repeats — drives generation).
    """
    # import modules locally
    from kgtk.generator import JsonGenerator
    import sys
    import gzip
    from kgtk.exceptions import KGTKException

    generator = JsonGenerator(label_set=labels,
                              alias_set=aliases,
                              description_set=descriptions,
                              prop_file=prop_file,
                              output_prefix=output_prefix,
                              n=n,
                              log_path=log_path,
                              warning=warning,
                              prop_declaration=prop_declaration,
                              has_rank=has_rank)
    # Open the edge source.
    if use_gz:
        if input_file:
            try:
                # BUG FIX: the file itself is gzip-compressed, so it must be
                # opened with gzip.open in text mode; the old
                # open(input_file, "rb") fed raw compressed bytes to the
                # generator.
                fp = gzip.open(input_file, "rt")
            except OSError:
                raise KGTKException(
                    "Fail to read from compressed file {}. Exiting.".format(
                        input_file))
        else:
            fp = gzip.open(sys.stdin.buffer, 'rt')
    else:
        if input_file:
            try:
                fp = open(input_file, "r")
            except OSError:
                raise KGTKException(
                    "Fail to read from file {}. Exiting.".format(input_file))
        else:
            fp = sys.stdin

    try:
        if prop_declaration:
            if input_file:
                # Seekable file: first pass collects property declarations,
                # second pass generates.
                for line_num, edge in enumerate(fp):
                    generator.read_prop_declaration(line_num + 1, edge)
                fp.seek(0)
                for line_num, edge in enumerate(fp):
                    generator.entry_point(line_num + 1, edge)
            else:
                # stdin is not seekable: detect the start of the second copy
                # of the piped file by seeing the first line again.
                file_lines = 0
                begining_edge = None
                start_generation = False
                for line_num, edge in enumerate(fp):
                    if line_num == 0:
                        begining_edge = edge
                        generator.entry_point(line_num + 1, edge)
                        file_lines += 1
                    elif start_generation:
                        # Renumber relative to the start of the second copy.
                        generator.entry_point(line_num - file_lines + 1, edge)
                    elif edge == begining_edge:
                        start_generation = True
                    else:
                        file_lines += 1
                        generator.read_prop_declaration(line_num + 1, edge)
            generator.finalize()
        else:
            for line_num, edge in enumerate(fp):
                # Skip comment and blank lines.
                if edge.startswith("#") or len(edge.strip("\n")) == 0:
                    continue
                generator.entry_point(line_num + 1, edge)
            generator.finalize()
    finally:
        # Close only handles we opened ourselves; leave sys.stdin alone.
        if input_file:
            fp.close()
def run(labels: str, aliases: str, descriptions: str, prop_file: str,
        prop_declaration: bool, use_gz: bool, output_prefix: str, n: int,
        log_path: str, warning: bool):
    """Run the KGTK JSON generator over edges read from standard input
    (optionally gzip-compressed).

    When *prop_declaration* is set, the caller pipes the same file twice
    (`cat f f`): the first copy supplies property declarations and the second
    copy — detected when the first line repeats — drives generation.
    """
    # import modules locally
    from kgtk.generator import JsonGenerator
    import sys
    import gzip

    generator = JsonGenerator(
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        prop_file=prop_file,
        output_prefix=output_prefix,
        n=n,
        log_path=log_path,
        warning=warning,
        prop_declaration=prop_declaration,
    )

    # process stdin
    stream = gzip.open(sys.stdin.buffer, 'rt') if use_gz else sys.stdin

    if prop_declaration:
        first_edge = None
        copy_length = 0          # lines belonging to the first piped copy
        generating = False       # True once the second copy begins
        for idx, edge in enumerate(stream):
            if idx == 0:
                first_edge = edge
                generator.entry_point(idx + 1, edge)
                copy_length += 1
            elif generating:
                # Renumber relative to the start of the second copy.
                generator.entry_point(idx - copy_length + 1, edge)
            elif edge == first_edge:
                generating = True
            else:
                copy_length += 1
                generator.read_prop_declaration(idx + 1, edge)
        generator.finalize()
    else:
        for idx, edge in enumerate(stream):
            # Skip comment and blank lines.
            if edge.startswith("#") or not edge.strip("\n"):
                continue
            generator.entry_point(idx + 1, edge)
        generator.finalize()