""" def __init__(self, etk): ETKModule.__init__(self, etk) self.inferlink_extractor = InferlinkExtractor( InferlinkRuleSet( InferlinkRuleSet.load_rules_file( '../html_basic/sample_inferlink_rules.json'))) def process_document(self, doc): """ Add your code for processing the document """ raw = doc.select_segments("$.raw_content")[0] extractions = doc.extract(self.inferlink_extractor, raw) doc.store(extractions, "inferlink_extraction") return list() if __name__ == "__main__": sample_html = json.load(codecs.open('../html_basic/sample_html.json', 'r')) # read sample file from disk etk = ETK(modules=InferlinkETKModule) doc = etk.create_document(sample_html, mime_type="text/html", url="http://ex.com/123") docs = etk.process_ems(doc) print(json.dumps(docs[0].value, indent=2))
}, "matched_sentence": { "type": "string" }, "date": { "type": "string" } } } kg_schema = KGSchema(master_config) etk = ETK(kg_schema, ["./"]) # read the news news_file = open( '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl' ) # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl') news_stories = [ etk.create_document(json.loads(line), url=json.loads(line)['tld'], doc_id=json.loads(line)['doc_id']) for line in news_file ] results = list() for news_story in news_stories: results.extend(etk.process_ems(news_story)) o = open('ifp_news_similarity.jl', 'w') for result in results: o.write(json.dumps(result.value)) o.write('\n')
class TripleGenerator(Generator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        prop_declaration = kwargs.pop("prop_declaration")
        dest_fp = kwargs.pop("dest_fp")
        truthy = kwargs.pop("truthy")
        use_id = kwargs.pop("use_id")
        prefix_path = kwargs.pop("prefix_path")
        self.datatype_mapping = {
            # nomenclature from https://w.wiki/Tfn
            "item": Item,
            "WikibaseItem": Item,
            "time": TimeValue,
            "Time": TimeValue,
            "globe-coordinate": GlobeCoordinate,
            "GlobeCoordinate": GlobeCoordinate,
            "quantity": QuantityValue,
            "Quantity": QuantityValue,
            "monolingualtext": MonolingualText,
            "Monolingualtext": MonolingualText,
            "string": StringValue,
            "String": StringValue,
            "external-identifier": ExternalIdentifier,
            "ExternalId": ExternalIdentifier,
            "url": StringValue,  # TODO: potentially a bug in rdflib
            "Url": StringValue,
            "property": WDProperty,
            "WikibaseProperty": WDProperty
        }
        self.set_prefix(prefix_path)
        self.prop_declaration = prop_declaration
        self.set_properties(self.prop_file)
        self.fp = dest_fp
        self.truthy = truthy
        self.reset_etk_doc()
        self.serialize_prefix()
        self.use_id = use_id

    def set_prefix(self, prefix_path: str):
        self.prefix_dict = {}
        if prefix_path != "NONE":
            with open(prefix_path, "r") as fp:
                for line_num, edge in enumerate(fp):
                    edge_list = edge.strip("\r\n").split("\t")
                    if line_num == 0:
                        node1_index, node2_index = edge_list.index(
                            "node1"), edge_list.index("node2")
                    else:
                        prefix, expand = edge_list[node1_index], edge_list[
                            node2_index]
                        self.prefix_dict[prefix] = expand

    def read_prop_declaration(self, line_number: int, edge: str):
        node1, node2, prop, e_id = self.parse_edges(edge)
        if prop == "data_type":
            self.prop_types[node1] = self.datatype_mapping[node2.strip()]
        return

    def set_properties(self, prop_file: str):
        self.prop_types = {}
        if prop_file == "NONE":
            return
        with open(prop_file, "r") as fp:
            props = fp.readlines()
        for line in props[1:]:
            node1, _, node2 = line.split("\t")
            try:
                self.prop_types[node1] = self.datatype_mapping[node2.strip()]
            except KeyError:
                raise KGTKException(
                    "DataType {} of node {} is not supported.\n".format(
                        node2, node1))

    def _node_2_entity(self, node: str):
        '''
        A node can be Qxxx or Pxxx; return the proper entity.
        '''
        if node in self.prop_types:
            entity = WDProperty(node, self.prop_types[node])
        else:
            entity = WDItem(TripleGenerator.replace_illegal_string(node))
        return entity

    def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
        """
        Reset the doc object and return it. Called at initialization and after
        outputting triples.
        """
        kg_schema = KGSchema()
        kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
        self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = self.etk.create_document({}, doc_id=doc_id)
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                self.doc.kg.bind(k, self.prefix_dict[k])
            else:
                self.doc.kg.bind(k, v)

    def serialize(self):
        """
        Serialize the triples. Uses a hack to avoid serializing the prefix again.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write("\n\n".join(
            docs[0].kg.serialize("ttl").split("\n\n")[1:]))
        self.fp.flush()
        self.reset()

    def serialize_prefix(self):
        """
        This function should be called only once after the doc object is
        initialized. In order to serialize the prefix at the very beginning it
        has to be printed, per the change of rdflib 4.2.2 -> 5.0.0.
        Relevant issue: https://github.com/RDFLib/rdflib/issues/965
        """
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                line = "@prefix " + k + ": <" + self.prefix_dict[k] + "> .\n"
            else:
                line = "@prefix " + k + ": <" + v + "> .\n"
            self.fp.write(line)
        self.fp.write("\n")
        self.fp.flush()
        self.reset()

    def reset(self):
        self.to_append_statement_id = None
        self.to_append_statement = None
        self.read_num_of_lines = 0
        self.reset_etk_doc()

    def generate_label_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_label(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_description_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_description(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_alias_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_alias(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool:
        # update the known prop_types
        if node1 in self.prop_types:
            if not self.prop_declaration:
                raise KGTKException(
                    "Duplicated property definition of {} found!".format(
                        node1))
        else:
            self.prop_types[node1] = node2

        prop = WDProperty(node1, self.datatype_mapping[node2])
        self.doc.kg.add_subject(prop)
        return True

    def generate_normal_triple(self, node1: str, property: str, node2: str,
                               is_qualifier_edge: bool, e_id: str) -> bool:
        if self.use_id:
            e_id = TripleGenerator.replace_illegal_string(e_id)
        entity = self._node_2_entity(node1)
        edge_type = self.prop_types[property]
        if edge_type == Item:
            object = WDItem(TripleGenerator.replace_illegal_string(node2))
        elif edge_type == WDProperty:
            object = WDProperty(TripleGenerator.replace_illegal_string(node2),
                                self.prop_types[node2])
        elif edge_type == TimeValue:
            if self.yyyy_mm_dd_pattern.match(node2):
                try:
                    dateTimeString = node2
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            elif self.yyyy_pattern.match(node2):
                try:
                    dateTimeString = node2 + "-01-01"
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            else:
                try:
                    # TODO: in the future, the two cases above will be dropped
                    # in principle to comply with the ISO format;
                    # now it is ISO format
                    assert (node2[0] == "^")
                    node2 = node2[1:]  # remove ^
                    if node2.startswith("+"):
                        node2 = node2[1:]
                    dateTimeString, precision = node2.split("/")
                    dateTimeString = dateTimeString[:-1]  # remove Z
                    object = TimeValue(
                        value=dateTimeString,
                        calendar=Item("Q1985727"),
                        precision=precision,
                        time_zone=0,
                    )
                except:
                    return False
        elif edge_type == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            latitude = float(latitude)
            longitude = float(longitude)
            object = GlobeCoordinate(latitude, longitude, 0.0001,
                                     globe=Item("Q2"))  # earth
        elif edge_type == QuantityValue:
            # +70[+60,+80]Q743895
            res = self.quantity_pattern.match(node2).groups()
            amount, lower_bound, upper_bound, unit = res
            amount = TripleGenerator.clean_number_string(amount)
            num_type = self.xsd_number_type(amount)
            lower_bound = TripleGenerator.clean_number_string(lower_bound)
            upper_bound = TripleGenerator.clean_number_string(upper_bound)
            if unit is not None:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           type=num_type)
            else:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount, type=num_type)
        elif edge_type == MonolingualText:
            text_string, lang = TripleGenerator.process_text_string(node2)
            object = MonolingualText(text_string, lang)
        elif edge_type == ExternalIdentifier:
            object = ExternalIdentifier(node2)
        elif edge_type == URLValue:
            if TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
                object = URLValue(node2)
            else:
                return False
        else:
            # treat everything else as StringValue
            object = StringValue(node2)

        if type(object) == WDItem or type(object) == WDProperty:
            self.doc.kg.add_subject(object)

        if is_qualifier_edge:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the
            # updated STATEMENT
            self.to_append_statement.add_qualifier(property, object)
            self.doc.kg.add_subject(self.to_append_statement)
        else:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            if self.truthy:
                self.to_append_statement = entity.add_truthy_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_truthy_statement(
                    property, object)
            else:
                self.to_append_statement = entity.add_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_statement(property, object)
            self.doc.kg.add_subject(entity)
        return True

    def entry_point(self, line_number: int, edge: str):
        # print(line_number, edge)
        """
        Determines the edge type of the given line and whether it is a valid
        property edge or qualifier edge, then calls the corresponding
        downstream generate_* function.
        """
        if line_number == 1:
            # initialize the order_map
            self.initialize_order_map(edge)
            return

        # use the order_map to map the node
        node1, node2, prop, e_id = self.parse_edges(edge)
        if line_number == 2:
            # by default a statement edge
            is_qualifier_edge = False
        else:
            if node1 != self.to_append_statement_id and node1 != self.corrupted_statement_id:
                is_qualifier_edge = False
                # also a new statement edge
                if self.read_num_of_lines >= self.n:
                    self.serialize()
            else:
                # qualifier edge or property declaration edge
                is_qualifier_edge = True
                if node1 == self.corrupted_statement_id:
                    self.warn_log.write(
                        "QUALIFIER edge at line [{}] associated with corrupted statement edge of id [{}] dropped.\n"
                        .format(line_number, self.corrupted_statement_id))
                    return

        if prop in self.label_set:
            success = self.generate_label_triple(node1, node2)
        elif prop in self.description_set:
            success = self.generate_description_triple(node1, node2)
        elif prop in self.alias_set:
            success = self.generate_alias_triple(node1, node2)
        elif prop == "data_type":
            # special edge of prop declaration
            success = self.generate_prop_declaration_triple(node1, node2)
        else:
            if prop in self.prop_types:
                success = self.generate_normal_triple(
                    node1, prop, node2, is_qualifier_edge, e_id)
            else:
                raise KGTKException(
                    "property [{}]'s type is unknown at line [{}].\n".format(
                        prop, line_number))

        if (not success) and self.warning:
            if not is_qualifier_edge:
                self.warn_log.write(
                    "CORRUPTED_STATEMENT edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
                self.corrupted_statement_id = e_id
            else:
                self.warn_log.write(
                    "CORRUPTED_QUALIFIER edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
        else:
            self.read_num_of_lines += 1
            if not is_qualifier_edge:
                self.to_append_statement_id = e_id

    @staticmethod
    def xsd_number_type(num):
        if isinstance(num, float) and 'e' in str(num).lower():
            return LiteralType.double
        return LiteralType.decimal
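# A minimal driver sketch for TripleGenerator above, kept as comments because
# the remaining constructor keywords (e.g. prop_file, n, warning, log_path) and
# parse_edges()/initialize_order_map() live on the Generator base class and are
# not shown here; treat the exact argument list as an assumption:
#
#   with open("wikidata_edges.tsv") as edges, open("out.ttl", "w") as dest:
#       gen = TripleGenerator(prop_declaration=False, dest_fp=dest,
#                             truthy=False, use_id=True, prefix_path="NONE", ...)
#       for line_number, edge in enumerate(edges, start=1):
#           gen.entry_point(line_number, edge)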
        if extractions:
            path = '$."' + extractions[0].value + '"[?(@.country == "Italy")]'
            jsonpath_expr = jex.parse(path)
            city_match = jsonpath_expr.find(self.city_dataset)
            if city_match:
                # add corresponding values of city_dataset into knowledge graph of the doc
                for field in city_match[0].value:
                    doc.kg.add_value(field, value=city_match[0].value[field])
            new_docs.append(doc)
        return new_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "italy_team"


if __name__ == "__main__":
    # url = 'https://en.wikipedia.org/wiki/List_of_football_clubs_in_Italy'
    cdr = json.load(
        open('./resources/italy_teams.json', mode='r', encoding='utf-8'))
    kg_schema = KGSchema(json.load(open('./resources/master_config.json')))

    etk = ETK(modules=ItalyTeamsModule, kg_schema=kg_schema)
    etk.parser = jex.parse

    cdr_doc = Document(etk, cdr_document=cdr, mime_type='json', url=cdr['url'])

    results = etk.process_ems(cdr_doc)[1:]
    print('Total docs:', len(results))
    print("Sample result:\n")
    print(json.dumps(results[0].value, indent=2))
parser.add_option("-o", "--output_file", action="store", type="string", dest="output_file") (c_options, args) = parser.parse_args() input_file = c_options.input_file output_file = c_options.output_file f = open(input_file, mode='r', encoding='utf-8') o = open(output_file, mode='w', encoding='utf-8') l = open('{}.log'.format(output_file), mode='w', encoding='utf-8') print('Starting to process file: {}'.format(input_file)) count = 0 sum = 0 for line in f: if count == 10000: sum += count l.write('Processed {} lines'.format(str(sum))) l.write('\n') count = 0 json_x = json.loads(line) doc = etk.create_document(json_x) doc.doc_id = json_x['doc_id'] sentences = etk.process_ems(doc) for s in sentences: o.write(json.dumps(s.value)) o.write('\n') count += 1
class ETKWorker(object):
    def __init__(self, master_config, em_paths, logger, worker_id,
                 project_name, kafka_input_args=None, kafka_output_args=None):
        self.logger = logger
        self.worker_id = worker_id
        self.check_interval = 1000
        self.exit_sign = False

        try:
            kg_schema = KGSchema(master_config)
            self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
        except Exception as e:
            logger.exception('ETK initialization failed')
            raise e

        # kafka input (`config` is the module-level configuration dict,
        # defined elsewhere in the original file)
        self.kafka_input_server = config['input_server']
        self.kafka_input_session_timeout = config['input_session_timeout']
        self.kafka_input_group_id = config['input_group_id']
        self.kafka_input_topic = '{project_name}_in'.format(project_name=project_name)
        self.kafka_input_args = dict() if kafka_input_args is None else kafka_input_args
        self.kafka_consumer = KafkaConsumer(
            bootstrap_servers=self.kafka_input_server,
            group_id=self.kafka_input_group_id,
            consumer_timeout_ms=self.check_interval,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            **self.kafka_input_args
        )
        self.kafka_consumer.subscribe([self.kafka_input_topic])

        # kafka output
        self.kafka_output_server = config['output_server']
        self.kafka_output_topic = '{project_name}_out'.format(project_name=project_name)
        self.kafka_output_args = dict() if kafka_output_args is None else kafka_output_args
        self.kafka_producer = KafkaProducer(
            bootstrap_servers=self.kafka_output_server,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            **self.kafka_output_args
        )

        self.timeout_count = self.kafka_input_session_timeout / self.check_interval
        self.current_timeout_count = 0

    def process(self):
        # prev_doc_sent_time = None

        while not self.exit_sign:
            # high level api handles batching
            # will exit once timeout
            try:
                for msg in self.kafka_consumer:
                    # force to commit, block till getting response
                    self.kafka_consumer.commit()
                    # get message, clear timeout count
                    self.current_timeout_count = 0

                    cdr = msg.value
                    # TODO better way to add execution profile
                    # cdr['@execution_profile'] = {'@worker_id': self.worker_id}
                    # doc_arrived_time = time.time()
                    # cdr['@execution_profile']['@doc_arrived_time'] = \
                    #     datetime.utcfromtimestamp(doc_arrived_time).isoformat()
                    # cdr['@execution_profile']['@doc_wait_time'] = \
                    #     0.0 if not prev_doc_sent_time \
                    #     else float(doc_arrived_time - prev_doc_sent_time)
                    # cdr['@execution_profile']['@doc_length'] = len(json.dumps(cdr))

                    if 'doc_id' not in cdr or len(cdr['doc_id']) == 0:
                        self.logger.error('invalid cdr: unknown doc_id')
                        continue

                    self.logger.info('processing %s' % cdr['doc_id'])
                    try:
                        # start_run_core_time = time.time()
                        # run etk module
                        doc = self.etk_ins.create_document(
                            cdr, url=cdr['url'], doc_id=cdr['doc_id'])
                        # process_ems returns a list of Documents
                        results = self.etk_ins.process_ems(doc)
                        for result in results:
                            cdr_result = result.cdr_document

                            # indexing
                            # TODO
                            indexed_cdr = index_knowledge_graph_fields(cdr_result)
                            if not indexed_cdr:
                                self.logger.error('indexing in sandpaper failed')
                                continue
                            # cdr = indexed_cdr

                            # cdr['@execution_profile']['@run_core_time'] = \
                            #     float(time.time() - start_run_core_time)
                            # doc_sent_time = time.time()
                            # cdr['@execution_profile']['@doc_sent_time'] = \
                            #     datetime.utcfromtimestamp(doc_sent_time).isoformat()
                            # prev_doc_sent_time = doc_sent_time
                            # cdr['@execution_profile']['@doc_processed_time'] = \
                            #     float(doc_sent_time - doc_arrived_time)

                            # output result
                            r = self.kafka_producer.send(self.kafka_output_topic,
                                                         indexed_cdr)
                            r.get(timeout=60)  # wait till sent
                            self.logger.info('{} done'.format(indexed_cdr['doc_id']))
                    except Exception as e:
                        self.logger.exception('failed at %s' % cdr['doc_id'])

            except ValueError as e:
                # I/O operation on closed epoll fd
                self.logger.info('consumer closed')
                self.exit_sign = True
            except StopIteration as e:
                # timeout
                self.current_timeout_count += 1
                if self.current_timeout_count >= self.timeout_count:
                    self.exit_sign = True
            except CommitFailedError as e:
                self.exit_sign = True
                # https://github.com/dpkp/kafka-python/blob/535d8f6a85969c4e07de0bc81e14513c677995be/kafka/errors.py#L65
                # if this worker is dead, restart and reattach to the group
                g_restart_worker = True

    def __del__(self):
        self.logger.info('ETK worker {} is exiting...'.format(self.worker_id))
        try:
            self.kafka_consumer.close()
        except:
            pass
        try:
            self.kafka_producer.close()
        except:
            pass
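# Hypothetical wiring for the worker above (values are placeholders; the
# module-level `config` dict with the kafka settings must already be loaded):
#
#   logger = logging.getLogger('etk_worker')
#   worker = ETKWorker(master_config, em_paths=['./ems'], logger=logger,
#                      worker_id=0, project_name='my_project')
#   worker.process()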
        # Record the country of this actor
        doc.kg.add_value("country", json_path="$.Side")

        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return list()


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":
    # Tell ETK the schema of the fields in the KG; the DIG master_config can
    # be used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the
    # Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls",
                                        dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any
            # new documents created while processing each doc.
            for result in etk.process_ems(doc):
                print(result.cdr_document["knowledge_graph"])
                f.write(json.dumps(result.cdr_document) + "\n")
        # for segment in doc.select_segments(jsonpath='$.notes'):
        #     doc.kg.add_value("description", segment.value)
        doc.kg.add_value("description", json_path='$.notes')

    def document_selector(self, doc) -> bool:
        """
        Boolean function for selecting document

        Args:
            doc: Document

        Returns:
            bool: True if the document should be processed by this module
        """
        return DefaultDocumentSelector().select_document(doc)


if __name__ == "__main__":
    kg_schema = KGSchema(json.load(open('master_config.json')))
    etk = ETK(modules=AcledModule, kg_schema=kg_schema)
    cp = CsvProcessor(etk=etk, heading_row=1)

    data_set = 'test_data_set_csv'

    docs = cp.tabular_extractor(filename="acled_raw_data.csv",
                                dataset='acled',
                                doc_id_field="data_id")
    results = etk.process_ems(docs[0])
    print(json.dumps(results[0].value, indent=2))
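    # The call above runs only the first extracted row through ETK; a hedged
    # variant that processes every row (same names as above):
    all_results = []
    for d in docs:
        all_results.extend(etk.process_ems(d))
    print('Processed {} documents'.format(len(all_results)))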
"version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep, Anika and others." }, { "name": "rltk", "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students." }] } #provenance use example case for fetching origins kg_schema = KGSchema(json.load(open("master_config.json", "r"))) etk = ETK(kg_schema=kg_schema, modules=ProvenanceOriginExtractionETKModule) doc = etk.create_document(sample_input) doc_ = etk.process_ems(doc) print(json.dumps(doc.kg.value, indent=2)) provenanceAPI = ProvenanceAPI(doc) #print(json.dumps(doc.value, indent=2)) #Use case/Example 1 origins = provenanceAPI.get_origins("developer", 1) print("Use case/Example 1: ") for origin in origins: print("start char: " + str(origin.start_char)) print("end char: " + str(origin.end_char)) print("jsonPath char: " + str(origin.json_path)) #Use case/Example 2 origins = provenanceAPI.get_origins("developer")
class TripleGenerator:
    """
    A class to maintain the status of the generator
    """

    def __init__(
        self,
        propFile: str,
        labelSet: str,
        aliasSet: str,
        descriptionSet: str,
        n: int,
        destFp: TextIO = sys.stdout,
    ):
        self.propTypes = self.__setPropTypes(propFile)
        self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
            labelSet, aliasSet, descriptionSet)
        # TODO handle standard output
        self.fp = destFp
        self.n = int(n)
        self.read = 0
        # serialize prefix
        kg_schema = KGSchema()
        kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
        self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = self.__setDoc()
        self.__serialize_prefix()

    def __setPropTypes(self, propFile: str):
        dataTypeMappings = {
            "item": Item,
            "time": TimeValue,
            "globe-coordinate": GlobeCoordinate,
            "quantity": QuantityValue,
            "monolingualtext": MonolingualText,
            "string": StringValue,
        }
        with open(propFile, "r") as fp:
            props = fp.readlines()
        __propTypes = {}
        for line in props[1:]:
            node1, _, node2 = line.split("\t")
            try:
                __propTypes[node1] = dataTypeMappings[node2.strip()]
            except KeyError:
                raise KGTKException(
                    "DataType {} of node {} is not supported.\n".format(
                        node2, node1))
        return __propTypes

    def __setSets(self, labelSet: str, aliasSet: str, descriptionSet: str):
        return (
            set(labelSet.split(",")),
            set(aliasSet.split(",")),
            set(descriptionSet.split(",")),
        )

    def __setDoc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
        """
        Reset the doc object and return it. Called at initialization and after
        outputting triples.
        """
        doc = self.etk.create_document({}, doc_id=doc_id)
        # bind prefixes
        doc.kg.bind("wikibase", "http://wikiba.se/ontology#")
        doc.kg.bind("wd", "http://www.wikidata.org/entity/")
        doc.kg.bind("wdt", "http://www.wikidata.org/prop/direct/")
        doc.kg.bind("wdtn", "http://www.wikidata.org/prop/direct-normalized/")
        doc.kg.bind("wdno", "http://www.wikidata.org/prop/novalue/")
        doc.kg.bind("wds", "http://www.wikidata.org/entity/statement/")
        doc.kg.bind("wdv", "http://www.wikidata.org/value/")
        doc.kg.bind("wdref", "http://www.wikidata.org/reference/")
        doc.kg.bind("p", "http://www.wikidata.org/prop/")
        doc.kg.bind("pr", "http://www.wikidata.org/prop/reference/")
        doc.kg.bind("prv", "http://www.wikidata.org/prop/reference/value/")
        doc.kg.bind("prn", "http://www.wikidata.org/prop/reference/value-normalized/")
        doc.kg.bind("ps", "http://www.wikidata.org/prop/statement/")
        doc.kg.bind("psv", "http://www.wikidata.org/prop/statement/value/")
        doc.kg.bind("psn", "http://www.wikidata.org/prop/statement/value-normalized/")
        doc.kg.bind("pq", "http://www.wikidata.org/prop/qualifier/")
        doc.kg.bind("pqv", "http://www.wikidata.org/prop/qualifier/value/")
        doc.kg.bind("pqn", "http://www.wikidata.org/prop/qualifier/value-normalized/")
        doc.kg.bind("skos", "http://www.w3.org/2004/02/skos/core#")
        doc.kg.bind("prov", "http://www.w3.org/ns/prov#")
        doc.kg.bind("schema", "http://schema.org/")
        return doc

    def genLabelTriple(self, node1: str, label: str, node2: str) -> bool:
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        if "@" in node2:
            node2, lang = node2.split("@")
            entity.add_label(node2.replace('"', "").replace("'", ""),
                             lang=lang)
        else:
            entity.add_label(node2.replace('"', "").replace("'", ""),
                             lang="en")  # default
        self.doc.kg.add_subject(entity)
        return True

    def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool:
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        if "@" in node2:
            node2, lang = node2.split("@")
            entity.add_description(node2.replace('"', "").replace("'", ""),
                                   lang=lang)
        else:
            entity.add_description(node2.replace('"', "").replace("'", ""),
                                   lang="en")  # default
        self.doc.kg.add_subject(entity)
        return True

    def genAliasTriple(self, node1: str, label: str, node2: str) -> bool:
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        if "@" in node2:
            node2, lang = node2.split("@")
            entity.add_alias(node2.replace('"', "").replace("'", ""),
                             lang=lang)
        else:
            entity.add_alias(node2.replace('"', "").replace("'", ""),
                             lang="en")  # default
        self.doc.kg.add_subject(entity)
        return True

    def genPropDeclarationTriple(self, node1: str, label: str, node2: str) -> bool:
        prop = WDProperty(node1.upper(), self.propTypes[node1])
        self.doc.kg.add_subject(prop)
        return True

    def genNormalTriple(self, node1: str, label: str, node2: str,
                        isPropEdge: bool) -> bool:
        """
        The normal triple's type is determined by
        1. label's datatype in prop_types.tsv
        2. kgtk format convention of node2 field
        Updates self.STATEMENT.
        """
        # determine the node type [property|item]
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        # determine the edge type
        edgeType = self.propTypes[label]
        if edgeType == Item:
            OBJECT = Item(node2.upper())
        elif edgeType == TimeValue:
            # https://www.wikidata.org/wiki/Help:Dates
            # ^2013-01-01T00:00:00Z/11
            dateTimeString, precision = node2[1:].split("/")
            dateString, timeString = dateTimeString.split("T")
            OBJECT = TimeValue(
                value=dateString,
                calendar=Item("Q1985727"),
                precision=precision,
                time_zone=0,
            )
        elif edgeType == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            OBJECT = GlobeCoordinate(latitude, longitude, 0.0001,
                                     globe=StringValue("Earth"))
        elif edgeType == QuantityValue:
            amount, unit = (re.compile(r"([\+|\-]?[0-9]+\.?[0-9]*)U([0-9]+)")
                            .match(node2).groups())
            OBJECT = QuantityValue(amount=float(amount), unit=Item(unit))
        elif edgeType == MonolingualText:
            try:
                textString, lang = node2.split("@")
                OBJECT = MonolingualText(textString, lang)
            except ValueError:
                # fall back to English; `node2` is used here because
                # `textString` is unbound when the unpacking above fails
                OBJECT = MonolingualText(node2, "en")
        else:
            # treat everything else as StringValue
            OBJECT = StringValue(node2)
        if isPropEdge:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            self.STATEMENT = entity.add_statement(label.upper(), OBJECT)
        else:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the
            # updated STATEMENT
            self.STATEMENT.add_qualifier(label.upper(), OBJECT)
        self.doc.kg.add_subject(self.STATEMENT)
        return True

    def entryPoint(self, edge: str):
        """
        edge: "p8\tp1\t'hasFather'@en\te5\n", a line in the edges.tsv file.
        Determines the edge type and whether this is a valid property edge or
        qualifier edge, then calls the corresponding downstream functions.
        """
        edgeList = edge.strip().split("\t")
        l = len(edgeList)
        if l == 4:
            # property statement edge
            # try to serialize when a new property statement is encountered.
            if self.read >= self.n:
                self.serialize()
            isPropEdge = True
            [node1, label, node2, eID] = edgeList
            node1, label, node2, eID = (
                node1.strip(),
                label.strip(),
                node2.strip(),
                eID.strip(),
            )
            if eID == self.ID:
                raise KGTKException(
                    "id {} of edge {} duplicates latest property statement id {}.\n"
                    .format(eID, edge, self.ID))
            else:
                self.ID = eID
        elif l == 3:
            # qualifier edge or property declaration edge
            isPropEdge = False
            [node1, label, node2] = edgeList
            node1, label, node2 = node1.strip(), label.strip(), node2.strip()
            if label != "type" and node1 != self.ID:
                # 1. not a property declaration edge and
                # 2. the current qualifier's node1 is not the latest property
                #    edge id; throw errors.
                raise KGTKException(
                    "Node1 {} of qualifier edge {} doesn't agree with latest property edge id {}.\n"
                    .format(node1, edge, self.ID))
        else:
            raise KGTKException(
                "Length {} of edge {} is not valid.\n".format(l, edge))

        if label in self.labelSet:
            self.read += self.genLabelTriple(node1, label, node2)
        elif label in self.descriptionSet:
            self.read += self.genDescriptionTriple(node1, label, node2)
        elif label in self.aliasSet:
            self.read += self.genAliasTriple(node1, label, node2)
        elif label == "type":
            # special edge of prop declaration
            self.read += self.genPropDeclarationTriple(node1, label, node2)
        else:
            if label in self.propTypes:
                self.read += self.genNormalTriple(node1, label, node2,
                                                  isPropEdge)
            else:
                raise KGTKException(
                    "property {}'s type is unknown as in edge {}.\n".format(
                        label, edge))

    def serialize(self):
        """
        Serialize the triples. Uses a hack to avoid serializing the prefix again.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write("\n\n".join(
            docs[0].kg.serialize("ttl").split("\n\n")[1:]))
        self.__reset()

    def __serialize_prefix(self):
        """
        This function should be called only once after the doc object is
        initialized.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write(docs[0].kg.serialize("ttl").split("\n\n")[0] + "\n\n")
        self.__reset()

    def __reset(self):
        self.ID = None
        self.STATEMENT = None
        self.read = 0
        self.doc = self.__setDoc()

    def finalize(self):
        return
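# A minimal usage sketch for the legacy generator above. File names are
# placeholders; the edges file is assumed to carry a KGTK header row, skipped
# here because entryPoint treats every line as an edge.
if __name__ == "__main__":
    gen = TripleGenerator(propFile="prop_types.tsv", labelSet="label",
                          aliasSet="alias", descriptionSet="description",
                          n=1000)
    with open("edges.tsv") as fp:
        for edge in list(fp)[1:]:  # skip the header row
            gen.entryPoint(edge)
    gen.serialize()  # flush whatever remains in the current document
    gen.finalize()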