def test_literal_multiline_quotes(self):
    """Split the ``literal_multiline_quotes`` fixture and check all three parts.

    The expected object keeps its surrounding double quotes and contains the
    backslash escape sequences as literal characters.
    """
    subject, predicate, obj = split_nt_line(self.literal_multiline_quotes)

    expected_obj = (
        "\"This is a multi-line\\nliteral with many quotes (\\\"\\\"\\\"\\\"\\\")"
        "\\nand two apostrophes ('').\""
    )

    self.assertEqual(subject, "http://example.org/#spiderman")
    self.assertEqual(predicate, "http://example.org/text")
    self.assertEqual(obj, expected_obj)
def test_literal_xmls(self):
    """Split the ``literal_xmls`` fixture and check all three parts.

    The expected object is a quoted literal followed by an ``xsd:string``
    datatype suffix.
    """
    subject, predicate, obj = split_nt_line(self.literal_xmls)

    expected_obj = (
        '"That Seventies Show"^^<http://www.w3.org/2001/XMLSchema#string>'
    )

    self.assertEqual(subject, "http://example.org/show/218")
    self.assertEqual(predicate, "http://www.w3.org/2000/01/rdf-schema#label")
    self.assertEqual(obj, expected_obj)
def lines_to_triples(self, triples: List[str]) -> List[Tuple[str, str, str]]:
    """
    Split every line into a triple, dropping the lines that cannot be split.

    Each line is passed to ``split_nt_line``. Lines for which that call raises
    ``ValueError`` are counted and skipped; the number of skipped lines is
    printed once all lines have been processed.

    :param triples: The lines to split, one triple per line.
    :return: The ``split_nt_line`` results of all lines that could be split,
        in input order.
    """
    errors = 0
    results = []
    for triple_line in tqdm(triples, desc='Converting triples', unit=' triples'):
        # Only the split itself is expected to raise, so keep the try minimal.
        try:
            triple = split_nt_line(triple_line)
        except ValueError:
            errors += 1
            continue
        results.append(triple)

    # The original message lacked the noun ("3 occurred during ...").
    print(f'{errors} errors occurred during triple conversion')
    return results
def test_literal_double(self):
    """Split the ``literal_double`` fixture and check all three parts.

    The expected object is a quoted literal followed by an ``xsd:double``
    datatype suffix.
    """
    subject, predicate, obj = split_nt_line(self.literal_double)

    expected_obj = '"1.663E-4"^^<http://www.w3.org/2001/XMLSchema#double>'

    self.assertEqual(subject, "http://en.wikipedia.org/wiki/Helium")
    self.assertEqual(predicate, "http://example.org/elements/specificGravity")
    self.assertEqual(obj, expected_obj)
def test_literal_integer(self):
    """Split the ``literal_integer`` fixture and check all three parts.

    The expected object is a quoted literal followed by an ``xsd:integer``
    datatype suffix.
    """
    subject, predicate, obj = split_nt_line(self.literal_integer)

    expected_obj = '"2"^^<http://www.w3.org/2001/XMLSchema#integer>'

    self.assertEqual(subject, "http://en.wikipedia.org/wiki/Helium")
    self.assertEqual(predicate, "http://example.org/elements/atomicNumber")
    self.assertEqual(obj, expected_obj)
def test_literal_region(self):
    """Split the ``literal_region`` fixture and check all three parts.

    The expected object is a quoted literal carrying the ``@fr-be`` language
    tag and non-ASCII characters.
    """
    subject, predicate, obj = split_nt_line(self.literal_region)

    expected_obj = '"Cette Série des Années Septante"@fr-be'

    self.assertEqual(subject, "http://example.org/show/218")
    self.assertEqual(predicate, "http://example.org/show/localName")
    self.assertEqual(obj, expected_obj)
def test_literal_language(self):
    """Split the ``literal_language`` fixture and check all three parts.

    The expected object is a quoted literal carrying the ``@en`` language tag.
    """
    subject, predicate, obj = split_nt_line(self.literal_language)

    expected_obj = '"That Seventies Show"@en'

    self.assertEqual(subject, "http://example.org/show/218")
    self.assertEqual(predicate, "http://example.org/show/localName")
    self.assertEqual(obj, expected_obj)
def test_literal_untyped(self):
    """Split the ``literal_untyped`` fixture and check all three parts.

    The expected object is a plain quoted literal without a datatype or
    language suffix.
    """
    subject, predicate, obj = split_nt_line(self.literal_untyped)

    expected_obj = '"That Seventies Show"'

    self.assertEqual(subject, "http://example.org/show/218")
    self.assertEqual(predicate, "http://www.w3.org/2000/01/rdf-schema#label")
    self.assertEqual(obj, expected_obj)
def test_split_simple_tabs(self):
    """Split the ``simple_line_tabs`` fixture and check all three parts.

    All three expected parts are plain IRIs without angle brackets.
    """
    subject, predicate, obj = split_nt_line(self.simple_line_tabs)

    expected_predicate = "http://www.perceive.net/schemas/relationship/enemyOf"

    self.assertEqual(subject, "http://example.org/#spiderman")
    self.assertEqual(predicate, expected_predicate)
    self.assertEqual(obj, "http://example.org/#green-goblin")
def map_triple_lines(self, triple_lines: List[str]):
    """
    Assigns each entity and each relation an id and creates a list of triples
    consisting of the ids.

    Ids are handed out consecutively, starting at 0, in order of first
    appearance. Subjects and objects share one id space (entities);
    predicates get their own (relations). Each line is split with
    ``split_nt_line``; if that raises ``ValueError`` the line is skipped and
    counted when ``self.fail_silently`` is truthy, otherwise the error
    propagates to the caller.

    A short summary (entity, relation, triple and skipped-line counts) is
    printed once all lines have been processed.

    :param triple_lines: List of Triples
    :return: Tuple with the mapping of entity -> ID, relation -> ID and a
        list of the triples with the integer ids. Every mapped triple is a
        list ``[subject_id, object_id, relation_id]``.
    :raises ValueError: If a line cannot be split and ``self.fail_silently``
        is falsy.
    """
    dict_ent = dict()
    dict_rel = dict()
    list_triples = []
    skipped = 0

    sys.stdout.write("Processing Triple lines ... ")
    sys.stdout.flush()

    for triple_line in tqdm(triple_lines, desc='Processing', unit=' triples'):
        # split it and check if it is a triple
        try:
            triple = split_nt_line(triple_line)
        except ValueError:
            if not self.fail_silently:
                # Bare raise re-raises the active exception unchanged.
                raise
            skipped += 1
            continue

        # setdefault evaluates len(...) before inserting, so an unseen key
        # receives the next free id while a known key keeps its existing one.
        # Ids are therefore never negative and need no further validation.
        idx_sub = dict_ent.setdefault(triple[0], len(dict_ent))
        idx_rel = dict_rel.setdefault(triple[1], len(dict_rel))
        idx_obj = dict_ent.setdefault(triple[2], len(dict_ent))

        # careful: OpenKE format is "subject object relation"
        list_triples.append([idx_sub, idx_obj, idx_rel])

    # output results
    print("")
    print(str(len(dict_ent)) + " Distinct Entities")
    print(str(len(dict_rel)) + " Distinct Relations")
    # NOTE: list_triples is not deduplicated, so repeated input lines are
    # counted once per occurrence despite the "Distinct" label.
    print(str(len(list_triples)) + " Distinct Triples")
    print("Skipped " + str(skipped) + " lines")

    return dict_ent, dict_rel, list_triples