def _node_2_entity(self, node: str):
    """Build the entity object that represents ``node``.

    An id registered in ``self.prop_types`` becomes a ``WDProperty`` typed
    with the registered value. Any other id is passed through
    ``TripleGenerator.replace_illegal_string`` and wrapped in a ``WDItem``.

    :param node: node id (Qxxx or Pxxx)
    :return: the ``WDProperty`` or ``WDItem`` built for the node
    """
    if node not in self.prop_types:
        return WDItem(TripleGenerator.replace_illegal_string(node))
    return WDProperty(node, self.prop_types[node])
def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool:
    """Declare property ``node1`` with datatype name ``node2`` in the graph.

    A property that is not yet in ``self.prop_types`` is registered there.
    A property that is already known is only accepted when
    ``self.prop_declaration`` is truthy.

    :param node1: id of the property being declared
    :param node2: datatype name, used as a key into ``self.datatype_mapping``
    :return: always ``True``
    :raises KGTKException: if ``node1`` is already known and
        ``self.prop_declaration`` is falsy
    """
    already_known = node1 in self.prop_types
    if already_known and not self.prop_declaration:
        raise KGTKException(
            "Duplicated property definition of {} found!".format(node1))
    if not already_known:
        # First sighting of this property: remember its declared datatype.
        self.prop_types[node1] = node2

    declared = WDProperty(node1, self.datatype_mapping[node2])
    self.doc.kg.add_subject(declared)
    return True
def genLabelTriple(self, node1: str, label: str, node2: str) -> bool:
    """Attach a label to the entity ``node1`` and add the entity to the graph.

    ``node2`` holds the label text, optionally followed by ``@<lang>``.
    When no language tag is present the label is stored as English.
    Double and single quote characters are removed from the text.

    :param node1: id of the entity; upper-cased before use
    :param label: edge label (not used by this method)
    :param node2: label text, optionally suffixed with ``@<lang>``
    :return: always ``True``
    """
    if node1 in self.propTypes:
        entity = WDProperty(node1.upper(), self.propTypes[node1])
    else:
        entity = WDItem(node1.upper())

    if "@" in node2:
        # Split on the LAST "@" only, so label text that itself contains
        # "@" (e.g. an e-mail address) no longer raises ValueError.
        text, lang = node2.rsplit("@", 1)
    else:
        text, lang = node2, "en"  # default

    entity.add_label(text.replace('"', "").replace("'", ""), lang=lang)
    self.doc.kg.add_subject(entity)
    return True
def model_data() -> None:
    """Generate triples for user-defined properties and write them as Turtle.

    Reads ``Datasets/new-property-configuration.yaml`` (relative to the
    parent of the current working directory), builds one ``WDProperty`` per
    top-level key with its labels, descriptions and statements, and writes
    the serialized graph to ``new_properties/result.ttl``.

    :return: None
    :raises ValueError: if a statement's property resolves to a type this
        function has no value constructor for
    """
    config_path = Path.cwd().parent / "Datasets/new-property-configuration.yaml"
    # Use a context manager so the configuration file handle is always closed.
    with open(config_path, 'r', encoding='utf8') as stream:
        yaml_data = yaml.safe_load(stream)

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    prefixes = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, uri in prefixes:
        doc.kg.bind(prefix, uri)

    sparql_endpoint = "https://query.wikidata.org/sparql"
    # NOTE(review): 'url' maps to the URLValue class while 'quantity' maps to
    # a Datatype member -- confirm this is what WDProperty expects.
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    # Property types are looked up once per property id and then reused.
    property_type_cache = {}

    for k, v in yaml_data.items():
        p = WDProperty(k, type_map[v['type']], creator='http://www.isi.edu/t2wml')
        for lang, labels in v['label'].items():
            for val in labels:
                p.add_label(val, lang=lang)
        for lang, descriptions in v['description'].items():
            for val in descriptions:
                p.add_description(val, lang=lang)
        for pnode, items in v['statements'].items():
            for item in items:
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type

                if property_type == "WikibaseItem":
                    statement_value = Item(str(item['value']))
                elif property_type == "WikibaseProperty":
                    statement_value = Property(item['value'])
                elif property_type == "String":
                    statement_value = StringValue(item['value'])
                elif property_type == "Quantity":
                    statement_value = QuantityValue(item['value'])
                elif property_type == "Time":
                    statement_value = TimeValue(
                        str(item['value']),
                        Item(item["calendar"]),
                        translate_precision_to_integer(item["precision"]),
                        item["time_zone"])
                elif property_type == "Url":
                    statement_value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    statement_value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    statement_value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    statement_value = GlobeCoordinate(
                        item["latitude"], item["longitude"], item["precision"])
                else:
                    # Previously an unknown type fell through and attached a
                    # stale value left over from an earlier loop; fail loudly
                    # instead of emitting a wrong statement.
                    raise ValueError(
                        "Unsupported property type {!r} for property {}".format(
                            property_type, pnode))
                p.add_statement(pnode, statement_value)
        doc.kg.add_subject(p)

    # Write with the same explicit encoding the configuration is read with,
    # rather than the platform default.
    with open(Path.cwd().parent / "new_properties/result.ttl", "w", encoding='utf8') as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
def model_schema(self):
    """Build an ETK document describing the items/properties in the schema.

    The schema is read via ``self.read_data(self.data['schema'])``. Its
    ``'prefix'`` entry provides the namespace prefixes; every other key that
    contains ``':'`` is treated as ``<namespace>:<id>`` and turned into a
    ``WDItem`` (id contains ``'Q'``) or ``WDProperty`` (id contains ``'P'``)
    with its descriptions, labels and statements.

    :return: the populated ETK document
    :raises Exception: if a ``<namespace>:<id>`` key contains neither
        ``'Q'`` nor ``'P'`` in its id part
    :raises ValueError: if a statement's property resolves to a type this
        method has no value constructor for
    """
    # read data
    data = self.read_data(self.data['schema'])

    # initialize KGSchema
    custom_dict = {}
    ns_dict = {'wd': 'http://www.wikidata.org/entity/'}
    for each in data['prefix']:
        for prefix, uri in each.items():
            custom_dict[prefix] = uri
            if prefix != 'wd':
                ns_dict[prefix] = uri + '/entity'

    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict)

    type_map = {
        'quantity': Datatype.QuantityValue,
        'url': URLValue,
        'item': Datatype.Item,
        'time': Datatype.TimeValue,
        'string': Datatype.StringValue,
        'text': Datatype.MonolingualText
    }

    # model schema
    for k, v in data.items():
        if ':' not in k:
            continue

        k = k.split(':')
        if 'Q' in k[1]:
            p = WDItem(k[1], namespace=k[0], creator=':datamart')
        elif 'P' in k[1]:
            p = WDProperty(k[1], type_map[v['type']], namespace=k[0], creator=':datamart')
        else:
            raise Exception('There is no P/Q information.')

        for lang, descriptions in v['description'].items():
            for val in descriptions:
                p.add_description(val, lang=lang)

        for lang, labels in v['label'].items():
            for val in labels:
                p.add_label(val, lang=lang)

        for node, values in v['statements'].items():
            if not values:
                continue
            ns = node.split(':')[0] if ':' in node else 'wd'
            # The lookup arguments do not change per value, so resolve the
            # type once per property instead of once per statement value.
            prop_type = self.get_property_type(node, ns_dict[ns])
            for val in values:
                # A dedicated name is used here; the statement value used to
                # be stored in ``v`` and clobbered the schema entry.
                if prop_type == 'WikibaseItem':
                    statement_value = Item(str(val['value']))
                elif prop_type == 'WikibaseProperty':
                    statement_value = Property(val['value'])
                elif prop_type == 'String':
                    statement_value = StringValue(val['value'])
                elif prop_type == 'Quantity':
                    statement_value = QuantityValue(val['value'])
                elif prop_type == 'Url':
                    statement_value = URLValue(val['value'])
                elif prop_type == 'Monolingualtext':
                    statement_value = MonolingualText(val['value'], val['lang'])
                else:
                    # Previously an unknown type fell through and attached
                    # whatever was left in ``v``; fail loudly instead.
                    raise ValueError(
                        'Unsupported property type {!r} for property {}'.format(
                            prop_type, node))
                p.add_statement(node, statement_value)

        doc.kg.add_subject(p)

    return doc
def generate_normal_triple(self, node1: str, property: str, node2: str,
                           is_qualifier_edge: bool, e_id: str) -> bool:
    """Turn one edge into a statement, or into a qualifier on the last one.

    The value object built from ``node2`` is chosen by the datatype
    registered for ``property`` in ``self.prop_types``. For a qualifier edge
    the value is attached to ``self.to_append_statement``. Otherwise a new
    (truthy or full, depending on ``self.truthy``) statement is added to the
    entity for ``node1`` and stored in ``self.to_append_statement``.

    :param node1: subject node id
    :param property: property id; must be a key of ``self.prop_types``
    :param node2: raw value string of the edge
    :param is_qualifier_edge: whether this edge qualifies the previous statement
    :param e_id: edge id; used as the statement id when ``self.use_id`` is set
    :return: ``True`` when a triple was produced, ``False`` when ``node2``
        is a malformed time, quantity or URL value
    :raises KeyError: if ``property`` is not in ``self.prop_types``
    :raises ValueError: if a globe-coordinate value cannot be parsed
    """
    if self.use_id:
        e_id = TripleGenerator.replace_illegal_string(e_id)
    entity = self._node_2_entity(node1)
    edge_type = self.prop_types[property]

    if edge_type == Item:
        value = WDItem(TripleGenerator.replace_illegal_string(node2))
    elif edge_type == WDProperty:
        value = WDProperty(TripleGenerator.replace_illegal_string(node2),
                           self.prop_types[node2])
    elif edge_type == TimeValue:
        if self.yyyy_mm_dd_pattern.match(node2):
            try:
                value = TimeValue(
                    value=node2,  # TODO
                    calendar=Item("Q1985727"),
                    precision=Precision.year,
                    time_zone=0,
                )
            except Exception:
                return False
        elif self.yyyy_pattern.match(node2):
            try:
                value = TimeValue(
                    value=node2 + "-01-01",  # TODO
                    calendar=Item("Q1985727"),
                    precision=Precision.year,
                    time_zone=0,
                )
            except Exception:
                return False
        else:
            # TODO, in future, the two cases above will be dropped in principle
            # to comply with the iso format
            # now it is iso format
            # An explicit check replaces the former ``assert``, which is
            # stripped when Python runs with -O.
            if not node2.startswith("^"):
                return False
            stamp = node2[1:]  # remove ^
            if stamp.startswith("+"):
                stamp = stamp[1:]
            try:
                date_time_string, precision = stamp.split("/")
                date_time_string = date_time_string[:-1]  # remove Z
                value = TimeValue(
                    value=date_time_string,
                    calendar=Item("Q1985727"),
                    precision=precision,
                    time_zone=0,
                )
            except Exception:
                # Malformed time values are rejected, not fatal. Unlike the
                # former bare ``except``, this does not swallow
                # KeyboardInterrupt or SystemExit.
                return False
    elif edge_type == GlobeCoordinate:
        latitude, longitude = node2[1:].split("/")
        value = GlobeCoordinate(float(latitude), float(longitude), 0.0001,
                                globe=Item("Q2"))  # earth
    elif edge_type == QuantityValue:
        # +70[+60,+80]Q743895
        matched = self.quantity_pattern.match(node2)
        if matched is None:
            # Reject a malformed quantity like other malformed values
            # instead of failing with AttributeError on ``None.groups()``.
            return False
        amount, lower_bound, upper_bound, unit = matched.groups()
        amount = TripleGenerator.clean_number_string(amount)
        num_type = self.xsd_number_type(amount)
        lower_bound = TripleGenerator.clean_number_string(lower_bound)
        upper_bound = TripleGenerator.clean_number_string(upper_bound)

        quantity_kwargs = {"type": num_type}
        if unit is not None:
            quantity_kwargs["unit"] = Item(unit)
        # Bounds are only passed when both are present.
        if upper_bound is not None and lower_bound is not None:
            quantity_kwargs["upper_bound"] = upper_bound
            quantity_kwargs["lower_bound"] = lower_bound
        value = QuantityValue(amount, **quantity_kwargs)
    elif edge_type == MonolingualText:
        text_string, lang = TripleGenerator.process_text_string(node2)
        value = MonolingualText(text_string, lang)
    elif edge_type == ExternalIdentifier:
        value = ExternalIdentifier(node2)
    elif edge_type == URLValue:
        if not TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
            return False
        value = URLValue(node2)
    else:
        # treat everything else as stringValue
        value = StringValue(node2)

    # Exact type match is kept on purpose (no isinstance) so subclasses are
    # treated exactly as before.
    if type(value) in (WDItem, WDProperty):
        self.doc.kg.add_subject(value)

    if is_qualifier_edge:
        # edge: e8 p9 ^2013-01-01T00:00:00Z/11
        # create qualifier edge on previous STATEMENT and return the updated STATEMENT
        self.to_append_statement.add_qualifier(property, value)
        self.doc.kg.add_subject(self.to_append_statement)
    else:
        # edge: q1 p8 q2 e8
        # create brand new property edge and replace STATEMENT
        add_statement = (entity.add_truthy_statement if self.truthy
                         else entity.add_statement)
        if self.use_id:
            self.to_append_statement = add_statement(property, value,
                                                     statement_id=e_id)
        else:
            self.to_append_statement = add_statement(property, value)
        self.doc.kg.add_subject(entity)
    return True
def _init_etk():
    """Create the ETK document and pre-populate it with the custom properties.

    Builds a ``KGSchema``/``ETK`` pair, creates an empty document, binds the
    Wikidata-related namespace prefixes and declares the ``C20xx``
    properties (label, description and statements) on the document's graph.

    :return: the prepared ETK document
    """
    # initialize for etk
    schema = KGSchema()
    schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    engine = ETK(kg_schema=schema, modules=ETKModule)
    doc = engine.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    namespace_table = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, uri in namespace_table:
        doc.kg.bind(prefix, uri)

    def declare(pid, datatype, label, description, statements=()):
        # Each statement is (property id, value constructor, raw value); the
        # value is built only when the statement is attached.
        prop = WDProperty(pid, datatype)
        prop.add_label(label, lang='en')
        prop.add_description(description, lang='en')
        for predicate, make_value, raw in statements:
            prop.add_statement(predicate, make_value(raw))
        doc.kg.add_subject(prop)

    # definitions of the custom property nodes
    declare('C2001', Datatype.MonolingualText,
            'datamart identifier',
            'identifier of a dataset in the Datamart system',
            (('P31', Item, 'Q19847637'), ('P1629', Item, 'Q1172284')))
    declare('C2004', Datatype.StringValue,
            'keywords',
            'keywords associated with an item to facilitate finding the item using text search',
            (('P31', Item, 'Q18616576'),))
    declare('C2005', Datatype.StringValue,
            'variable measured',
            'the variables measured in a dataset',
            (('P31', Item, 'Q18616576'),
             ('P1628', URLValue, 'http://schema.org/variableMeasured')))
    declare('C2006', Datatype.StringValue,
            'values',
            'the values of a variable represented as a text document',
            (('P31', Item, 'Q18616576'),))
    declare('C2007', Datatype.Item,
            'data type',
            'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
            'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
            (('P31', Item, 'Q18616576'),))
    declare('C2008', Datatype.URLValue,
            'semantic type',
            'a URL that identifies the semantic type of a variable in a dataset',
            (('P31', Item, 'Q18616576'),))
    declare('C2010', Datatype.StringValue,
            'extra information',
            'some extra information that may needed for this dataset')
    declare('C2011', Datatype.TimeValue,
            'start date',
            'The earlist time exist in this dataset, only valid when there exists time format data in this dataset',
            (('P31', Item, 'Q18616576'),))
    declare('C2012', Datatype.TimeValue,
            'end date',
            'The latest time exist in this dataset, only valid when there exists time format data in this dataset',
            (('P31', Item, 'Q18616576'),))
    declare('C2013', Datatype.QuantityValue,
            'time granularity',
            'time granularity in a dataset',
            (('P31', Item, 'Q18616576'),))
    declare('C2014', Datatype.StringValue,
            'uploader information',
            'information about who uploaded and when uploaded')
    return doc
def __init__(self, query_server=None, update_server=None):
    """Set up the servers, the ETK document and the starting resource id.

    Creates ``self.doc`` with the Wikidata-related prefixes bound and the
    custom ``C20xx`` properties declared, then queries ``self.query_server``
    for the starting source id.

    :param query_server: SPARQL query endpoint. Used only when
        ``update_server`` is also given; otherwise both default to
        ``DATAMRT_SERVER``.
    :param update_server: SPARQL update endpoint (see ``query_server``)
    :raises ValueError: if the start-up query against the query server fails
    """
    self.punctuation_table = str.maketrans(
        dict.fromkeys(string.punctuation))
    if query_server and update_server:
        self.query_server = query_server
        self.update_server = update_server
    else:
        self.query_server = DATAMRT_SERVER
        self.update_server = DATAMRT_SERVER

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = etk.create_document(
        {}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    prefixes = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, uri in prefixes:
        self.doc.kg.bind(prefix, uri)

    # give definition of the nodes we defined
    # The labels of C2001 and C2004 were previously swapped: each now agrees
    # with the description stored on the same property.
    p = WDProperty('C2001', Datatype.MonolingualText)
    p.add_label('datamart identifier', lang='en')
    p.add_description('identifier of a dataset in the Datamart system', lang='en')
    p.add_statement('P31', Item('Q19847637'))
    p.add_statement('P1629', Item('Q1172284'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2004', Datatype.StringValue)
    p.add_label('keywords', lang='en')
    p.add_description(
        'keywords associated with an item to facilitate finding the item using text search',
        lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2005', Datatype.StringValue)
    p.add_label('variable measured', lang='en')
    p.add_description('the variables measured in a dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    p.add_statement('P1628', URLValue('http://schema.org/variableMeasured'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2006', Datatype.StringValue)
    p.add_label('values', lang='en')
    p.add_description(
        'the values of a variable represented as a text document', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2007', Datatype.Item)
    p.add_label('data type', lang='en')
    p.add_description(
        'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
        'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
        lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2008', Datatype.URLValue)
    p.add_label('semantic type', lang='en')
    p.add_description(
        'a URL that identifies the semantic type of a variable in a dataset',
        lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    # get the starting source id
    sparql_query = """
        prefix wdt: <http://www.wikidata.org/prop/direct/>
        prefix wd: <http://www.wikidata.org/entity/>
        prefix wikibase: <http://wikiba.se/ontology#>
        PREFIX p: <http://www.wikidata.org/prop/>
        PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/>
        PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
        PREFIX ps: <http://www.wikidata.org/prop/statement/>
        prefix bd: <http://www.bigdata.com/rdf#>
        prefix bds: <http://www.bigdata.com/rdf/search#>
        select ?x where {
            wd:Z00000 wdt:P1114 ?x .
        }
        """
    try:
        sparql = SPARQLWrapper(self.query_server)
        sparql.setQuery(sparql_query)
        sparql.setReturnFormat(JSON)
        sparql.setMethod(POST)
        sparql.setRequestMethod(URLENCODED)
        results = sparql.query().convert()['results']['bindings']
    except Exception as error:
        # ``Exception`` (not a bare except) so Ctrl-C / SystemExit are not
        # turned into a ValueError; the original cause is chained.
        print("Getting query of wiki data failed!")
        raise ValueError(
            "Unable to initialize the datamart query service") from error

    if not results:
        print(
            "[WARNING] No starting source id found! Will initialize the starting source with D1000001"
        )
    # NOTE(review): the id returned by the query is never used -- the
    # starting id is 1000001 whether or not a result was found. Confirm
    # whether it should be read from ``results`` instead.
    self.resource_id = 1000001
def genNormalTriple(self, node1: str, label: str, node2: str,
                    isPropEdge: bool) -> bool:
    """
    The normal triple's type is determined by
    1. label's datatype in prop_types.tsv
    2. kgtk format convention of node2 field

    Update the self.STATEMENT

    :param node1: subject node id; upper-cased before use
    :param label: property id; must be a key of ``self.propTypes``
    :param node2: raw value string of the edge
    :param isPropEdge: ``True`` to start a new statement on the entity,
        ``False`` to add a qualifier to the current ``self.STATEMENT``
    :return: always ``True``
    """
    # determine the node type [property|item]
    if node1 in self.propTypes:
        entity = WDProperty(node1.upper(), self.propTypes[node1])
    else:
        entity = WDItem(node1.upper())

    # determine the edge type
    edge_type = self.propTypes[label]
    if edge_type == Item:
        value = Item(node2.upper())
    elif edge_type == TimeValue:
        # https://www.wikidata.org/wiki/Help:Dates
        # ^201301-01T00:00:00Z/11
        date_time_string, precision = node2[1:].split("/")
        # Only the date part is kept; the unpacking still requires exactly
        # one "T" separator, as before.
        date_string, _time_string = date_time_string.split("T")
        value = TimeValue(
            value=date_string,
            calendar=Item("Q1985727"),
            precision=precision,
            time_zone=0,
        )
    elif edge_type == GlobeCoordinate:
        latitude, longitude = node2[1:].split("/")
        value = GlobeCoordinate(latitude, longitude, 0.0001,
                                globe=StringValue("Earth"))
    elif edge_type == QuantityValue:
        # Raw string: the same pattern, without invalid "\+" / "\." escapes
        # in a normal string literal.
        amount, unit = re.compile(
            r"([\+|\-]?[0-9]+\.?[0-9]*)U([0-9]+)").match(node2).groups()
        value = QuantityValue(amount=float(amount), unit=Item(unit))
    elif edge_type == MonolingualText:
        # Split on the last "@" only. With no language tag the whole string
        # is the text and the language defaults to English. The old fallback
        # referenced a variable that was unbound whenever the split failed.
        text, separator, lang = node2.rpartition("@")
        if not separator:
            text, lang = node2, "en"
        try:
            value = MonolingualText(text, lang)
        except Exception:
            # Kept from the original best-effort behaviour: retry as English
            # if the value cannot be built with the given language.
            value = MonolingualText(text, "en")
    else:
        # treat everything else as stringValue
        value = StringValue(node2)

    if isPropEdge:
        # edge: q1 p8 q2 e8
        # create brand new property edge and replace STATEMENT
        self.STATEMENT = entity.add_statement(label.upper(), value)
    else:
        # edge: e8 p9 ^2013-01-01T00:00:00Z/11
        # create qualifier edge on previous STATEMENT and return the updated STATEMENT
        self.STATEMENT.add_qualifier(label.upper(), value)
    self.doc.kg.add_subject(self.STATEMENT)
    return True
def genPropDeclarationTriple(self, node1: str, label: str, node2: str) -> bool:
    """Add the declaration of property ``node1`` to the graph.

    The property id is upper-cased and its datatype is read from
    ``self.propTypes``; ``label`` and ``node2`` are not used.

    :param node1: id of the property; must be a key of ``self.propTypes``
    :param label: edge label (unused)
    :param node2: edge value (unused)
    :return: always ``True``
    """
    property_id = node1.upper()
    declared_type = self.propTypes[node1]
    declaration = WDProperty(property_id, declared_type)
    self.doc.kg.add_subject(declaration)
    return True