def model_data() -> None:
    """
    Generate triples for user-defined properties so they can be uploaded to Wikidata.

    The property definitions are read from
    ``Datasets/new-property-configuration.yaml`` (resolved against the parent
    of the current working directory).  For every top-level key a
    ``WDProperty`` is created, its labels, descriptions and statements are
    attached, and the resulting knowledge graph is serialized as Turtle to
    ``new_properties/result.ttl`` (same base directory).

    :raises KeyError: if a property's ``type`` is missing from the local type
        map, or an expected key is absent from the configuration.
    :raises ValueError: if a statement refers to a property whose datatype is
        not handled here.
    :return: None
    """
    config_path = Path.cwd().parent / "Datasets/new-property-configuration.yaml"
    # Use a context manager so the configuration file handle is always closed.
    with open(config_path, 'r', encoding='utf8') as stream:
        yaml_data = yaml.safe_load(stream)

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    prefixes = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, namespace in prefixes:
        doc.kg.bind(prefix, namespace)

    sparql_endpoint = "https://query.wikidata.org/sparql"
    # NOTE(review): 'url' maps to the URLValue class while 'quantity' maps to a
    # Datatype member -- confirm WDProperty accepts both forms.
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    # Remember the datatype of each statement property so the endpoint is
    # queried at most once per property.
    property_type_cache = {}

    # An empty YAML document loads as None; treat it as "no properties".
    for k, v in (yaml_data or {}).items():
        p = WDProperty(k, type_map[v['type']], creator='http://www.isi.edu/t2wml')

        for lang, labels in v['label'].items():
            for text in labels:
                p.add_label(text, lang=lang)

        for lang, descriptions in v['description'].items():
            for text in descriptions:
                p.add_description(text, lang=lang)

        for pnode, items in v['statements'].items():
            for item in items:
                if pnode not in property_type_cache:
                    property_type_cache[pnode] = get_property_type(pnode, sparql_endpoint)
                property_type = property_type_cache[pnode]

                if property_type == "WikibaseItem":
                    value = Item(str(item['value']))
                elif property_type == "WikibaseProperty":
                    value = Property(item['value'])
                elif property_type == "String":
                    value = StringValue(item['value'])
                elif property_type == "Quantity":
                    value = QuantityValue(item['value'])
                elif property_type == "Time":
                    value = TimeValue(
                        str(item['value']),
                        Item(item["calendar"]),
                        translate_precision_to_integer(item["precision"]),
                        item["time_zone"])
                elif property_type == "Url":
                    value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(item["latitude"], item["longitude"], item["precision"])
                else:
                    # Without this guard a stale ``value`` from an earlier
                    # iteration would silently be attached to the statement.
                    raise ValueError(
                        "Unsupported property type {!r} for property {!r}".format(property_type, pnode))
                p.add_statement(pnode, value)

        doc.kg.add_subject(p)

    # Write UTF-8 explicitly, matching how the configuration was read, so
    # non-ASCII labels do not depend on the platform default encoding.
    with open(Path.cwd().parent / "new_properties/result.ttl", "w", encoding='utf8') as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
def __init__(self, query_server=None, update_server=None):
    """
    Set up the servers, the ETK document with its custom property
    definitions, and the starting resource id.

    :param query_server: SPARQL query endpoint. Only used when
        ``update_server`` is also given; otherwise both fall back to
        ``DATAMRT_SERVER``.
    :param update_server: SPARQL update endpoint (same fallback rule).
    :raises ValueError: if the initial SPARQL query against the query server
        fails for any reason.
    """
    self.punctuation_table = str.maketrans(
        dict.fromkeys(string.punctuation))
    if query_server and update_server:
        self.query_server = query_server
        self.update_server = update_server
    else:
        self.query_server = DATAMRT_SERVER
        self.update_server = DATAMRT_SERVER

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = etk.create_document(
        {}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    prefixes = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, namespace in prefixes:
        self.doc.kg.bind(prefix, namespace)

    # give definition of the nodes we defined
    # C2001/C2004: each label now matches its description (they were swapped).
    p = WDProperty('C2001', Datatype.MonolingualText)
    p.add_label('datamart identifier', lang='en')
    p.add_description('identifier of a dataset in the Datamart system', lang='en')
    p.add_statement('P31', Item('Q19847637'))
    p.add_statement('P1629', Item('Q1172284'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2004', Datatype.StringValue)
    p.add_label('keywords', lang='en')
    p.add_description(
        'keywords associated with an item to facilitate finding the item using text search',
        lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2005', Datatype.StringValue)
    p.add_label('variable measured', lang='en')
    p.add_description('the variables measured in a dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    p.add_statement('P1628', URLValue('http://schema.org/variableMeasured'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2006', Datatype.StringValue)
    p.add_label('values', lang='en')
    p.add_description(
        'the values of a variable represented as a text document', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2007', Datatype.Item)
    p.add_label('data type', lang='en')
    p.add_description(
        'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
        'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
        lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2008', Datatype.URLValue)
    p.add_label('semantic type', lang='en')
    p.add_description(
        'a URL that identifies the semantic type of a variable in a dataset',
        lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    # get the starting source id
    sparql_query = """
        prefix wdt: <http://www.wikidata.org/prop/direct/>
        prefix wd: <http://www.wikidata.org/entity/>
        prefix wikibase: <http://wikiba.se/ontology#>
        PREFIX p: <http://www.wikidata.org/prop/>
        PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/>
        PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
        PREFIX ps: <http://www.wikidata.org/prop/statement/>
        prefix bd: <http://www.bigdata.com/rdf#>
        prefix bds: <http://www.bigdata.com/rdf/search#>
        select ?x where {
            wd:Z00000 wdt:P1114 ?x .
        }
        """
    try:
        sparql = SPARQLWrapper(self.query_server)
        sparql.setQuery(sparql_query)
        sparql.setReturnFormat(JSON)
        sparql.setMethod(POST)
        sparql.setRequestMethod(URLENCODED)
        results = sparql.query().convert()['results']['bindings']
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate, and chain the cause so the real failure stays visible.
        print("Getting query of wiki data failed!")
        raise ValueError("Unable to initialize the datamart query service") from err

    if not results:
        print(
            "[WARNING] No starting source id found! Will initialize the starting source with D1000001"
        )
    # NOTE(review): the original assigned 1000001 on both branches, so the
    # query result is only used to decide whether to warn -- confirm whether
    # the non-empty case should derive the id from ``results`` instead.
    self.resource_id = 1000001
def _init_etk():
    """
    Create the ETK document used for triple generation.

    The document gets the fixed set of namespace prefixes bound on its
    knowledge graph and one ``WDProperty`` subject per custom ``C2xxx``
    property listed below (English label, English description and any
    statements).

    :return: the ETK document created by ``etk.create_document``
    """
    # initialize for etk
    schema = KGSchema()
    schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    engine = ETK(kg_schema=schema, modules=ETKModule)
    doc = engine.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    namespaces = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for short_name, uri in namespaces:
        doc.kg.bind(short_name, uri)

    # Statement shared by most of the properties below.  Statements are kept
    # as (property, value constructor, raw value) so each value object is
    # only built right before it is attached.
    shared_p31 = ('P31', Item, 'Q18616576')

    # definition of the nodes we defined:
    # (node id, datatype, English label, English description, statements)
    definitions = (
        ('C2001', Datatype.MonolingualText,
         'datamart identifier',
         'identifier of a dataset in the Datamart system',
         (('P31', Item, 'Q19847637'), ('P1629', Item, 'Q1172284'))),
        ('C2004', Datatype.StringValue,
         'keywords',
         'keywords associated with an item to facilitate finding the item using text search',
         (shared_p31,)),
        ('C2005', Datatype.StringValue,
         'variable measured',
         'the variables measured in a dataset',
         (shared_p31, ('P1628', URLValue, 'http://schema.org/variableMeasured'))),
        ('C2006', Datatype.StringValue,
         'values',
         'the values of a variable represented as a text document',
         (shared_p31,)),
        ('C2007', Datatype.Item,
         'data type',
         'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
         'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
         (shared_p31,)),
        ('C2008', Datatype.URLValue,
         'semantic type',
         'a URL that identifies the semantic type of a variable in a dataset',
         (shared_p31,)),
        ('C2010', Datatype.StringValue,
         'extra information',
         'some extra information that may needed for this dataset',
         ()),
        ('C2011', Datatype.TimeValue,
         'start date',
         'The earlist time exist in this dataset, only valid when there exists time format data in this dataset',
         (shared_p31,)),
        ('C2012', Datatype.TimeValue,
         'end date',
         'The latest time exist in this dataset, only valid when there exists time format data in this dataset',
         (shared_p31,)),
        ('C2013', Datatype.QuantityValue,
         'time granularity',
         'time granularity in a dataset',
         (shared_p31,)),
        ('C2014', Datatype.StringValue,
         'uploader information',
         'information about who uploaded and when uploaded',
         ()),
    )

    for node_id, datatype, label, description, statements in definitions:
        prop = WDProperty(node_id, datatype)
        prop.add_label(label, lang='en')
        prop.add_description(description, lang='en')
        for pnode, make_value, raw in statements:
            prop.add_statement(pnode, make_value(raw))
        doc.kg.add_subject(prop)

    return doc
def genNormalTriple(self, node1: str, label: str, node2: str, isPropEdge: bool) -> bool:
    """
    Build one statement or qualifier from an edge and add it to the document.

    The normal triple's type is determined by
    1. label's datatype in prop_types.tsv (looked up via ``self.propTypes``)
    2. kgtk format convention of node2 field

    Updates ``self.STATEMENT``: a property edge replaces it with a brand new
    statement on ``node1``; any other edge adds a qualifier to the current one.

    :param node1: subject id; treated as a property when present in
        ``self.propTypes``, otherwise as an item.
    :param label: property id of the edge; must be a key of ``self.propTypes``.
    :param node2: object value encoded according to the edge's datatype.
    :param isPropEdge: True for a statement edge, False for a qualifier edge.
    :raises KeyError: if ``label`` is not in ``self.propTypes``.
    :raises ValueError: if ``node2`` does not follow the expected encoding for
        a time, globe-coordinate or quantity value.
    :return: always True
    """
    # determine the node type [property|item]
    if node1 in self.propTypes:
        entity = WDProperty(node1.upper(), self.propTypes[node1])
    else:
        entity = WDItem(node1.upper())

    # determine the edge type
    edgeType = self.propTypes[label]
    if edgeType == Item:
        OBJECT = Item(node2.upper())
    elif edgeType == TimeValue:
        # https://www.wikidata.org/wiki/Help:Dates
        # ^2013-01-01T00:00:00Z/11
        dateTimeString, precision = node2[1:].split("/")
        # Only the date part is used; unpacking still enforces a single "T".
        dateString, _ = dateTimeString.split("T")
        OBJECT = TimeValue(
            value=dateString,
            calendar=Item("Q1985727"),
            precision=precision,
            time_zone=0,
        )
    elif edgeType == GlobeCoordinate:
        latitude, longitude = node2[1:].split("/")
        OBJECT = GlobeCoordinate(latitude,
                                 longitude,
                                 0.0001,
                                 globe=StringValue("Earth"))
    elif edgeType == QuantityValue:
        # <signed number>U<unit digits>; raw string avoids invalid-escape
        # warnings, and the sign class no longer accepts a stray "|".
        matched = re.match(r"([+-]?[0-9]+\.?[0-9]*)U([0-9]+)", node2)
        if matched is None:
            raise ValueError("Malformed quantity value: {!r}".format(node2))
        amount, unit = matched.groups()
        OBJECT = QuantityValue(amount=float(amount), unit=Item(unit))
    elif edgeType == MonolingualText:
        # Split on the LAST "@" so text containing "@" keeps working; with no
        # "@" at all the whole string is the text and the language is "en".
        textString, separator, lang = node2.rpartition("@")
        if not separator:
            textString, lang = node2, "en"
        try:
            OBJECT = MonolingualText(textString, lang)
        except Exception:
            # Keep the original fallback: retry with English if the language
            # tag is rejected.
            OBJECT = MonolingualText(textString, "en")
    else:
        # treat everything else as stringValue
        OBJECT = StringValue(node2)

    if isPropEdge:
        # edge: q1 p8 q2 e8
        # create brand new property edge and replace STATEMENT
        self.STATEMENT = entity.add_statement(label.upper(), OBJECT)
    else:
        # edge: e8 p9 ^2013-01-01T00:00:00Z/11
        # create qualifier edge on previous STATEMENT and return the updated STATEMENT
        self.STATEMENT.add_qualifier(label.upper(), OBJECT)
    # Register the new or updated statement with the document's graph.
    self.doc.kg.add_subject(self.STATEMENT)
    return True