def _create_claim_graph(self, subject, subject_label, object, object_label, predicate, type='Statement'):
    # Claim graph
    claim_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Claims'))
    claim_graph = self.dataset.graph(claim_graph_uri)

    # Statement
    statement_id = hash_statement_id([subject_label, predicate, object_label])

    statement = URIRef(to_iri(self.namespaces['LW'] + statement_id))
    statement_type1 = URIRef(to_iri(self.namespaces['GRASP'] + type))
    statement_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))
    statement_type3 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))

    # Create graph and add triple
    graph = self.dataset.graph(statement)
    graph.add((subject, self.namespaces['N2MU'][predicate], object))

    claim_graph.add((statement, RDF.type, statement_type1))
    claim_graph.add((statement, RDF.type, statement_type2))
    claim_graph.add((statement, RDF.type, statement_type3))

    return claim_graph, statement
def convert_asylum_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        country = URIRef(to_iri(dbr + 'Kingdom of the Netherlands'))
        csv_contents = csv_parser(filename)
        enum = 0
        graph_uri = URIRef('http://localhost:5820/test/resource/asylumGraph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            asylum_seeker = URIRef(to_iri(resource + ' Asylum_seekers ' + str(enum)))

            try:
                gender = row['Geslacht'].strip()
                if gender == 'Vrouwen':
                    gender = URIRef(to_iri(sdmx_code + 'sex-F'))
                else:
                    gender = URIRef(to_iri(sdmx_code + 'sex-M'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            try:
                nationality_value = Literal(row['Nationaliteit'].strip(), lang='nl')
                nationality = URIRef(to_iri(resource + nationality_value))
            except Exception as e:
                nationality = Literal('N/A', datatype=XSD['string'])

            # Preprocess dates
            temp_date = row['Perioden']
            date = temp_date.split()
            year = date[0].strip()
            month = date[1] if date[1] != '' else None
            test = dateparser.parse(row['Perioden'], languages=['nl', 'en'])
            if test.month / 5 >= 2:
                temp_date = str(test.year) + '-' + str(test.month)
            else:
                temp_date = str(test.year) + '-' + '0' + str(test.month)
            try:
                date = Literal(temp_date, datatype=XSD['gYearMonth'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                value = Literal(row['aantal'].strip(), datatype=XSD['integer'])
            except Exception as e:
                value = Literal('N/A', datatype=XSD['string'])

            graph.add((country, RDF.type, VOCAB['Country']))
            graph.add((country, VOCAB['asylum_seekers'], asylum_seeker))

            dataset.add((asylum_seeker, VOCAB['gender'], gender))
            dataset.add((asylum_seeker, VOCAB['nationality'], nationality))
            dataset.add((asylum_seeker, VOCAB['application_country'], country))
            dataset.add((asylum_seeker, VOCAB['application_period'], date))
            dataset.add((asylum_seeker, VOCAB['value'], value))

            enum += 1

    return dataset, graph
def makegraph(codebook, variable, vocab_name):
    base = 'http://data.socialhistory.org/resource/' + vocab_name + '/'
    vrb_iri = to_iri(base + variable + '/')
    VCB_NAMESPACE = Namespace(vrb_iri)
    SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')

    g = Graph()
    g.bind(vocab_name, VCB_NAMESPACE)
    g.bind('skos', SKOS)

    g.add((VCB_NAMESPACE[variable], RDF.type, SKOS['Scheme']))
    g.add((VCB_NAMESPACE[variable], SKOS['definition'], Literal(codebook['def'][0])))

    if len(codebook) == 1:
        return g

    for i in range(len(codebook['code'])):
        iri = to_iri(VCB_NAMESPACE[str(codebook['code'][i])])
        g.add((term.URIRef(iri), RDF.type, SKOS['Concept']))
        g.add((term.URIRef(iri), SKOS['inScheme'], VCB_NAMESPACE[variable]))
        g.add((term.URIRef(iri), SKOS['prefLabel'], Literal(codebook['label'][i])))
        if RepresentsInt(codebook['code'][i]):
            g.add((term.URIRef(iri), RDF.value, Literal(codebook['code'][i], datatype=XSD.int)))

    return g
def convert_to_rdf(input_file, output_file):
    rows = 0
    data = __load__(input_file)
    RES, WEA, STA = __setup_namespace__()
    graph = __setup_graph__(RES)
    graph.parse(voc_location + 'weather.ttl', format='turtle')
    filter2020 = data

    for index, weather_data in filter2020.iterrows():
        # Collision_id is primary key
        stationId = URIRef(to_iri(station + str(weather_data['station_id'])))
        Date = URIRef(to_iri(resource + str(weather_data['date'])))
        # graph.add((stationId, WEA['isOn'], Date))

        # data property
        station_id = Literal(weather_data['station_id'], datatype=XSD['string'])
        date = Literal(str(weather_data['date']), datatype=XSD['date'])
        # borough_data = str(accident_data['BOROUGH']).capitalize()

        instance = URIRef(
            to_iri(weatherVocab + ''.join(station_id) + '/' + ''.join(str(weather_data['date']))))
        graph.add((instance, RDF.type, instance))
        # graph.add((instance, STA['station_id'], stationId))
        graph.add((instance, WEA['stationID'], stationId))
        graph.add((instance, RDFS.label, station_id))
        graph.add((instance, RDFS.label, date))
        # graph.add((Weather, WEA['isinstance'], instance))

        if (pd.isnull(weather_data['AWND']) == False):
            graph.add((instance, WEA['hasAWND'], Literal(weather_data['AWND'], datatype=XSD['int'])))
        if (pd.isnull(weather_data['TMAX']) == False):
            graph.add((instance, WEA['hasTMAX'], Literal(weather_data['TMAX'], datatype=XSD['int'])))
        if (pd.isnull(weather_data['TMIN']) == False):
            graph.add((instance, WEA['hasTMIN'], Literal(weather_data['TMIN'], datatype=XSD['int'])))
        if (pd.isnull(weather_data['TAVG']) == False):
            graph.add((instance, WEA['hasTAVG'], Literal(weather_data['TAVG'], datatype=XSD['int'])))
        if (pd.isnull(weather_data['WESF']) == False):
            graph.add((instance, WEA['hasWESF'], Literal(weather_data['WESF'], datatype=XSD['int'])))

        # just for debugging purposes
        if ((index % 10000) == 0):
            print("done with " + str(rows) + "0,000 rows")
            rows += 1

    __save__(graph, output_file)
def _create_leolani_world(self, capsule, type='Statement'):
    # Instance graph
    instance_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Instances'))
    instance_graph = self.dataset.graph(instance_graph_uri)

    # Subject
    if type == 'Statement':
        subject, subject_label = self._generate_subject(capsule, instance_graph)
    elif type == 'Experience':
        subject = self._generate_leolani(instance_graph) if self.my_uri is None else self.my_uri
        subject_label = 'leolani'

    # Object
    if capsule['object']['type'] == '':  # We only get the label
        object_vocab = OWL
        object_type = 'Thing'
    else:
        object_vocab = self.namespaces['N2MU']
        object_type = capsule['object']['type']

    object_id = capsule['object']['label']
    object = URIRef(to_iri(self.namespaces['LW'] + object_id))
    object_label = Literal(object_id)
    object_type1 = URIRef(to_iri(object_vocab + object_type))
    object_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

    instance_graph.add((object, RDFS.label, object_label))
    instance_graph.add((object, RDF.type, object_type1))
    instance_graph.add((object, RDF.type, object_type2))

    if type == 'Statement':
        claim_graph, statement = self._create_claim_graph(
            subject, subject_label, object, object_label,
            capsule['predicate']['type'], type='Statement')
    elif type == 'Experience':
        claim_graph, statement = self._create_claim_graph(
            subject, subject_label, object, object_label,
            'sees', type='Experience')

    return instance_graph, claim_graph, subject, object, statement
def main():
    # reading dataset from csv
    fileName = "DBlist.csv"
    df = pd.read_csv(fileName)

    # A namespace for our resources
    data = 'http://dbpedia.org/ontology/resource/'
    DATA = Namespace(data)
    # A namespace for the schema (Classes)
    schema = 'http://dbpedia.org/ontology/'
    CLASS = Namespace(schema)

    # Creating a graph
    graph = Graph()
    graph.bind("owl", OWL)
    graph.bind("rdfs", RDFS)

    logging.info("Reading all data from " + fileName)
    for j in range(len(df)):
        # adding classes to the graph
        classN = URIRef(to_iri(schema + df.loc[j, 'Class_Name']))
        name = Literal(df.loc[j, 'Class_Name'], datatype=XSD['string'])  # the class name label
        graph.add((classN, RDF.type, OWL.Class))
        graph.add((classN, RDF.type, RDFS.Class))
        graph.add((classN, RDFS.label, name))

        # in case there are no instances (only DBpedia)
        if df.loc[j, 'Number_of_Instances'] == 0:
            pass
        else:
            MyList = df.loc[j, 'Instances_Names'].split('|')
            # adding instances of a class to the graph
            for c in range(len(MyList)):
                MyList[c] = MyList[c].strip(' " ').replace(" ' ", '')
                instance = URIRef(to_iri(data + MyList[c]))
                graph.add((instance, RDF.type, classN))
                instanceLabel = Literal(MyList[c], datatype=XSD['string'])  # creating the label
                graph.add((instance, RDFS.label, instanceLabel))

    outFile = 'TestCase/DBpedia.xml'
    logging.info("Writing the graph to " + outFile)
    with open(outFile, 'wb') as f:
        graph.serialize(f, format='xml')
def convert_info(self):
    """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""
    results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                           WHERE { ?s ?p ?o .
                                                   FILTER(?p = csvw:valueUrl ||
                                                          ?p = csvw:propertyUrl ||
                                                          ?p = csvw:aboutUrl)}""")

    for (s, p, o) in results:
        # Use iribaker
        try:  # Python 2
            escaped_object = URIRef(iribaker.to_iri(unicode(o)))
        except NameError:  # Python 3
            escaped_object = URIRef(iribaker.to_iri(str(o)))
        # print(escaped_object)

        # If the escaped IRI of the object is different from the original,
        # update the graph.
        if escaped_object != o:
            self.metadata_graph.set((s, p, escaped_object))
            # Add the provenance of this operation.
            try:  # Python 2
                self.np.pg.add((escaped_object, PROV.wasDerivedFrom, Literal(unicode(o), datatype=XSD.string)))
            except NameError:  # Python 3
                self.np.pg.add((escaped_object, PROV.wasDerivedFrom, Literal(str(o), datatype=XSD.string)))
            # print(str(o))

    # Walk through the metadata graph to remove illegal "Resource" blank nodes caused by the python3 transition.
    for s, p, o in self.metadata_graph.triples((None, None, None)):
        if s.startswith("Resource("):
            self.metadata_graph.remove((s, p, o))
            self.metadata_graph.add((BNode(str(s)[9:-1]), p, o))
            logger.debug("removed a triple because it was not formatted right. (started with \"Resource\")")

    # Add the information of the schema file to the provenance graph of the
    # nanopublication
    self.np.ingest(self.metadata_graph, self.np.pg.identifier)

    # for s, p, o in self.np.triples((None, None, None)):
    #     print(s.__repr__, p.__repr__, o.__repr__)
    return
def convert_parking_dataset(path, dataset, graph_uri):
    f = open(path, 'r')
    json_data = json.load(f)

    graph = dataset.graph(graph_uri)

    country = URIRef(to_iri(dbr + 'Kingdom of the Netherlands'))
    city = URIRef(to_iri(dbr + 'Amsterdam'))

    for data in json_data['gehandicaptenparkeerplaatsen']:
        slot_data = data['node']

        data_address = slot_data['Adres'].strip()
        if data_address == '':
            continue
        slot = URIRef(to_iri(resource + data_address))
        slot_loc = URIRef(to_iri(resource + 'Amsterdam/' + data_address))
        slot_loc_address = Literal(data_address)

        data_quantity = slot_data['Aantal'].strip()
        slot_quantity = Literal(int(data_quantity), datatype=XSD['unsignedInt']) if data_quantity != '' else None

        data_info = slot_data['Locatie-info']
        slot_info = Literal(data_info) if data_info != '' else None

        slot_loc_borough = URIRef(to_iri(resource + 'Amsterdam/' + slot_data['Stadsdeel'].strip()))

        slot_coordinates = json.loads(slot_data['locatie'].strip())
        slot_loc_lat = Literal(float(slot_coordinates['coordinates'][1]))
        slot_loc_long = Literal(float(slot_coordinates['coordinates'][0]))

        graph.add((slot, RDF.type, VOCAB['ParkingSlot']))
        graph.add((slot, RDFS.label, slot_loc_address))
        if slot_quantity:
            graph.add((slot, VOCAB['quantity'], slot_quantity))
        if slot_info:
            graph.add((slot, VOCAB['info'], slot_info))
        graph.add((slot, VOCAB['slotLocation'], slot_loc))

        graph.add((slot_loc, RDF.type, VOCAB['Location']))
        graph.add((slot_loc, RDFS.label, slot_loc_address))
        graph.add((slot_loc, DBO['address'], slot_loc_address))
        graph.add((slot_loc, DBO['city'], city))
        graph.add((slot_loc, DBO['country'], country))
        graph.add((slot_loc_borough, RDF.type, VOCAB['Borough']))
        graph.add((slot_loc, VOCAB['borough'], slot_loc_borough))
        graph.add((slot_loc, GEO['lat'], slot_loc_lat))
        graph.add((slot_loc, GEO['long'], slot_loc_long))

    return dataset, graph
def expandURL(self, url_pattern, row, datatype=False):
    """Takes a Jinja or Python formatted string, applies it to the row values,
    and returns it as a URIRef"""
    try:
        unicode_url_pattern = unicode(url_pattern)
    except NameError:
        unicode_url_pattern = str(url_pattern).split(')')[0].split('(')[-1]
    # print(unicode_url_pattern)

    url = self.render_pattern(unicode_url_pattern, row)

    # DEPRECATED
    # for ns, nsuri in namespaces.items():
    #     if url.startswith(ns):
    #         url = url.replace(ns + ':', nsuri)
    #         break

    try:
        iri = iribaker.to_iri(url)
        rfc3987.parse(iri, rule='IRI')
    except:
        raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

    # print(iri)
    return URIRef(iri)
def convert_info(self):
    """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""
    results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                           WHERE { ?s ?p ?o .
                                                   FILTER(?p = csvw:valueUrl ||
                                                          ?p = csvw:propertyUrl ||
                                                          ?p = csvw:aboutUrl)}""")

    for (s, p, o) in results:
        # Use iribaker
        escaped_object = URIRef(iribaker.to_iri(unicode(o)))

        # If the escaped IRI of the object is different from the original,
        # update the graph.
        if escaped_object != o:
            self.metadata_graph.set((s, p, escaped_object))
            # Add the provenance of this operation.
            self.np.pg.add((escaped_object, PROV.wasDerivedFrom, Literal(unicode(o), datatype=XSD.string)))

    # Add the information of the schema file to the provenance graph of the
    # nanopublication
    self.np.ingest(self.metadata_graph, self.np.pg.identifier)
    return
def fill_entity(self, label, types, namespace='LW', uri=None):
    # type: (str, list, str, str) -> Entity
    """
    Create an RDF entity given its label, types and its namespace
    Parameters
    ----------
    label: str
        Label of entity
    types: List[str]
        List of types for this entity
    uri: str
        URI of the entity, if available (i.e. when extracting concepts from wikidata)
    namespace: str
        Namespace where entity belongs to

    Returns
    -------
        Entity object with given label
    """
    if types in [None, ''] and label != '':
        # No type information available, fall back to a label-only entity
        self._log.warning('Unknown type: {}'.format(label))
        return self.fill_entity_from_label(label, namespace)
    else:
        entity_id = self.create_resource_uri(namespace, label) if not uri else URIRef(to_iri(uri))
        return Entity(entity_id, Literal(label), types)
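# A minimal usage sketch, not part of the original source: it assumes `brain` is an
# instance of the class that defines fill_entity(), and the label/type values below
# ('selene', 'person') are made-up examples.
#
#   entity = brain.fill_entity('selene', ['person'], namespace='LW')
#   untyped = brain.fill_entity('selene', None, namespace='LW')   # falls back to fill_entity_from_label()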
def resource(self, resource_type, resource_name):
    """Produce a resource-URI based on the ``_RESOURCE_URI_PATTERN`` constant"""
    raw_iri = self._RESOURCE_URI_PATTERN.format(resource_type, resource_name)
    iri = to_iri(raw_iri)

    return URIRef(iri)
def _generate_leolani(self, instance_graph):
    # Create Leolani
    leolani_id = 'leolani'
    leolani_label = 'leolani'

    leolani = URIRef(to_iri(self.namespaces['LW'] + leolani_id))
    leolani_label = Literal(leolani_label)
    leolani_type1 = URIRef(to_iri(self.namespaces['N2MU'] + 'robot'))
    leolani_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

    instance_graph.add((leolani, RDFS.label, leolani_label))
    instance_graph.add((leolani, RDF.type, leolani_type1))
    instance_graph.add((leolani, RDF.type, leolani_type2))

    self.my_uri = leolani

    return leolani
def safe_url(NS, local):
    """Generates a URIRef from the namespace + local part that is safe for
    use in RDF graphs

    Arguments:
    NS      -- a @Namespace object
    local   -- the local name of the resource
    """
    return URIRef(iribaker.to_iri(NS[local]))
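# A small, self-contained sketch of how safe_url() might be called; the namespace URI and
# local name below are made-up examples, not taken from the original source:
#
#   from rdflib import Namespace, URIRef
#   import iribaker
#
#   EX = Namespace('http://example.org/resource/')
#   safe_url(EX, 'Amsterdam Centraal')   # -> a URIRef with the space escaped
#
# The exact escaping (underscores vs. percent-encoding) depends on the iribaker version.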
def __init__(self, dataset, file_object=None):
    self.dataset = dataset

    if 'name' not in dataset:
        (head, dataset_local_name) = os.path.split(dataset['filename'])
        (dataset_name, extension) = os.path.splitext(dataset_local_name)
        self.dataset_name = dataset_name
    else:
        self.dataset_name = dataset['name']

    if 'version' in dataset:
        self.dataset_uri = iribaker.to_iri(
            config.QBR_BASE + dataset['version'] + '/' + self.dataset_name)
    else:
        self.dataset_uri = iribaker.to_iri(
            config.QBR_BASE + self.dataset_name)

    print "Initialized adapter"
    return
def convert_to_rdf(input_file, output_file):
    rows = 0
    data = __load__(input_file)
    RES, WEA, STA = __setup_namespace__()
    graph = __setup_graph__(RES)
    graph.parse(ontology + 'weather_type.ttl', format='turtle')
    filter2020 = data

    for index, data in filter2020.iterrows():
        station_id = URIRef(to_iri(station + str(data['STATION_ID'])))
        date = URIRef(to_iri(resource + str(data['DATE'])))

        STATION_ID = Literal(data['STATION_ID'], datatype=XSD['string'])
        date_raw = str(data['DATE'])
        dt = datetime(int(date_raw[0:4]), int(date_raw[4:6]), int(date_raw[6:8])).isoformat()
        DATE = Literal(dt, datatype=XSD['dateTime'])

        instance = URIRef(
            to_iri(weatherVocab + ''.join(STATION_ID) + '/' + ''.join(str(data['DATE']))))
        graph.add((instance, RDF.type, instance))
        graph.add((instance, STA['station_id'], station_id))
        # graph.add((instance, WEA['station_id'], STATION_ID))
        graph.add((instance, RDFS.label, STATION_ID))
        graph.add((instance, RDFS.label, DATE))
        graph.add((instance, WEA['hasWeatherID'], Literal(data['WEATHER_ID'], datatype=XSD['string'])))
        graph.add((instance, WEA['hasWeatherType'], Literal(data['WEATHER_TYPE'], datatype=XSD['string'])))
        graph.add((instance, WEA['onDate'], Literal(dt, datatype=XSD['dateTime'])))

    __save__(graph, output_file)


# input_file = '../data/csv/NY_weather_type_pivot.csv'
# output_file = '../data/rdf/NY_weather_type.rdf'
# convert_to_rdf(input_file, output_file)
def uri_to_iri(uri):
    result = urlparse(uri)
    if not result.scheme or not result.netloc or result.netloc == '-':
        raise ValueError("Provided URI does not have a valid schema or netloc")

    try:
        iri = iribaker.to_iri(uri)
        return iri
    except:
        raise ValueError("Provided URI can't be converted to IRI")
def standard_mode(table, metadata):
    FILE_URL = Namespace(table.url + '#')

    g = Graph()
    g.bind('csvw', CSVW)

    tg_bn = BNode()
    t_bn = BNode()
    g.add((tg_bn, RDF.type, CSVW.TableGroup))
    g.add((tg_bn, CSVW.table, t_bn))
    g.add((t_bn, CSVW.url, URIRef(table.url)))
    g.add((t_bn, RDF.type, CSVW.Table))

    for s, p, o in metadata.triples((None, URIRef('http://www.w3.org/ns/csvw#column'), None)):
        collection_resource = metadata.value(s, URIRef('http://www.w3.org/ns/csvw#column'))
        collection = Collection(metadata, collection_resource)

    for row in table.rows:
        r_bn = BNode()
        rd_bn = BNode()

        g.add((t_bn, CSVW.row, r_bn))
        g.add((r_bn, RDF.type, CSVW.Row))
        g.add((r_bn, CSVW.rownum, Literal(row.number, datatype=XSD.integer)))
        g.add((r_bn, CSVW.describes, rd_bn))
        g.add((r_bn, CSVW.url, FILE_URL['row=' + str(row.number + 1)]))

        for cell in row.cells:
            if cell.value != "":
                index = int(str(cell.column)[-1:]) - 1
                column_name = metadata.value(collection[index], URIRef('http://www.w3.org/ns/csvw#title'))
                iri = iribaker.to_iri(FILE_URL[column_name.replace(" ", "%20")])
                try:
                    g.add((rd_bn, iri, Literal(cell.value)))
                except Exception:
                    print "Exception!"
                    print column_name
                    print FILE_URL[column_name]
                    print iri
                    print Literal(cell.value)
                    traceback.print_exc(file=sys.stdout)
                    print

    return g
def iri():
    """
    Bake an IRI using iribaker

    Checks an IRI for compliance with RFC and converts invalid characters to underscores, if possible.

    **NB**: No roundtripping, this procedure may result in identity smushing: two input-IRI's may be mapped to the same output-IRI.
    ---
      tags:
        - Base
      consumes:
        - text/json
      parameters:
        - name: iri
          in: query
          description: The IRI to be checked for compliance
          required: true
          type: string
      responses:
        '200':
          description: IRI converted
          schema:
            description: A converted IRI result
            type: object
            properties:
              iri:
                description: The fully compliant IRI
                type: string
              source:
                description: The input IRI
                type: string
            required:
              - iri
              - source
        default:
          description: Unexpected error
          schema:
            id: Message
            type: object
            properties:
              code:
                type: integer
                format: int32
              message:
                type: string
    """
    unsafe_iri = request.args.get('iri', None)

    if unsafe_iri is not None:
        response = {'iri': iribaker.to_iri(unsafe_iri), 'source': unsafe_iri}
        return jsonify(response)
    else:
        raise (Exception(
            "The IRI {} could not be converted to a compliant IRI".format(unsafe_iri)))
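# A hedged usage sketch, not from the original source: the Flask route decorator for this
# view is not shown above, so the '/iri' path and port below are assumptions. Calling the
# endpoint with an unsafe IRI in the 'iri' query parameter should return a JSON object with
# the baked IRI and the original input:
#
#   import requests
#   r = requests.get('http://localhost:5000/iri',
#                    params={'iri': 'http://example.org/some unsafe value'})
#   r.json()   # -> {'iri': '<escaped IRI>', 'source': 'http://example.org/some unsafe value'}
#
# The exact escaping of the returned IRI depends on iribaker.to_iri().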
def _generate_subject(self, capsule, instance_graph):
    if capsule['subject']['type'] == '':  # We only get the label
        subject_vocab = OWL
        subject_type = 'Thing'
    else:
        subject_vocab = self.namespaces['N2MU']
        subject_type = capsule['subject']['type']

    subject_id = capsule['subject']['label']

    subject = URIRef(to_iri(self.namespaces['LW'] + subject_id))
    subject_label = Literal(subject_id)
    subject_type1 = URIRef(to_iri(subject_vocab + subject_type))
    subject_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

    instance_graph.add((subject, RDFS.label, subject_label))
    instance_graph.add((subject, RDF.type, subject_type1))
    instance_graph.add((subject, RDF.type, subject_type2))

    return subject, subject_label
def _create_perspective_graph(self, capsule, turn_label, type='Statement'):
    # Perspective graph
    perspective_graph_uri = URIRef(to_iri(self.namespaces['LTa'] + 'Perspectives'))
    perspective_graph = self.dataset.graph(perspective_graph_uri)

    # Mention
    if type == 'Statement':
        mention_id = turn_label + '_char%s' % capsule['position']
    elif type == 'Experience':
        mention_id = turn_label + '_pixel%s' % capsule['position']
    mention = URIRef(to_iri(self.namespaces['LTa'] + mention_id))
    mention_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Mention'))

    perspective_graph.add((mention, RDF.type, mention_type))

    # Attribution
    attribution_id = mention_id + '_CERTAIN'
    attribution = URIRef(to_iri(self.namespaces['LTa'] + attribution_id))
    attribution_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Attribution'))
    attribution_value = URIRef(to_iri(self.namespaces['GRASP'] + 'CERTAIN'))

    perspective_graph.add((attribution, RDF.type, attribution_type))
    perspective_graph.add((attribution, RDF.value, attribution_value))

    return perspective_graph, mention, attribution
def create_resource_uri(self, namespace, resource_name):
    """
    Create an URI for the given resource (entity, predicate, named graph, etc) in the given namespace
    Parameters
    ----------
    namespace: str
        Namespace where entity belongs to
    resource_name: str
        Label of resource

    Returns
    -------
    uri: str
        Representing the URI of the resource
    """
    if namespace in self.namespaces.keys():
        uri = URIRef(to_iri(self.namespaces[namespace] + resource_name))
    else:
        uri = URIRef(to_iri('{}:{}'.format(namespace, resource_name)))

    return uri
def convert_to_rdf(input_file, output_file):
    rows = 0
    data = __load__(input_file)
    RES, VOCAB, GEO, STA = __setup_namespace__()
    graph = __setup_graph__(RES, VOCAB)
    graph.parse(voc_location + 'NY_station_2.ttl', format='turtle')
    filter2020 = data
    # print(filter2020)

    for index, data in filter2020.iterrows():
        # station ID is primary key
        station = URIRef(to_iri(stationVocab + str(data['GHCND'])))
        station_id = Literal(data['GHCND'], datatype=XSD['string'])
        graph.add((station, STA['station_id'], station_id))
        # graph.add((station, RDF.lable, station))
        # print(graph)

        lat = Literal(data['LAT_DEC'] if pd.isnull(data['LAT_DEC']) == False else 0, datatype=XSD['double'])
        lon = Literal(data['LON_DEC'] if pd.isnull(data['LON_DEC']) == False else 0, datatype=XSD['double'])
        graph.add((station, GEO['lat'], lat))
        graph.add((station, GEO['long'], lon))

        name = Literal(data['STATION_NAME'], datatype=XSD['string'])
        graph.add((station, RDFS.label, name))

        # countrytag = URIRef(to_iri(stationVocab + '/' + str(data['CC'])))
        country = Literal(data['CC'], datatype=XSD['string'])
        # graph.add((countrytag, RDF.type, countrytag))
        graph.add((station, STA['country'], country))

        # statetag = URIRef(to_iri(stationVocab + '/' + str(data['ST'])))
        state = Literal(data['ST'], datatype=XSD['string'])
        graph.add((station, STA['state'], state))
        # graph.add((statetag, RDF.type, statetag))

        # countytag = URIRef(to_iri(stationVocab + '/' + str(data['COUNTY'])))
        county = Literal(data['COUNTY'], datatype=XSD['string'])
        graph.add((station, STA['county'], county))
        # graph.add((countytag, RDF.type, countytag))

    __save__(graph, output_file)
def uri_to_iri(uri):
    """
    convert URI to IRI (used for RDF)
    this function also validates the URI and throws a ValueError if the
    provided URI is invalid
    """
    result = urlparse(uri)
    if not result.scheme or not result.netloc or result.netloc == '-':
        raise ValueError("Provided URI does not have a valid schema or netloc")

    try:
        iri = iribaker.to_iri(uri)
        return iri
    except:
        raise ValueError("Provided URI can't be converted to IRI")
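# A small usage sketch added for illustration; the URIs below are made-up examples:
#
#   uri_to_iri('http://example.org/data/Amsterdam Centraal')   # returns an escaped IRI string
#   uri_to_iri('not-a-uri')                                     # raises ValueError (no scheme/netloc)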
def expandURL(self, url_pattern, row, datatype=False):
    """Takes a Jinja or Python formatted string, applies it to the row values,
    and returns it as a URIRef"""
    url = self.render_pattern(unicode(url_pattern), row)

    # DEPRECATED
    # for ns, nsuri in namespaces.items():
    #     if url.startswith(ns):
    #         url = url.replace(ns + ':', nsuri)
    #         break

    try:
        iri = iribaker.to_iri(url)
        rfc3987.parse(iri, rule='IRI')
    except:
        raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

    # print "Baked: ", iri
    return URIRef(iri)
def fill_entity_from_label(self, label, namespace='LW', uri=None):
    # type: (str, str, str) -> Entity
    """
    Create an RDF entity given its label and its namespace
    Parameters
    ----------
    label: str
        Label of entity
    uri: str
        URI of the entity, if available (i.e. when extracting concepts from wikidata)
    namespace: str
        Namespace where entity belongs to

    Returns
    -------
        Entity object with given label and no type information
    """
    entity_id = self.create_resource_uri(namespace, label) if not uri else URIRef(to_iri(uri))

    return Entity(entity_id, Literal(label), [''])
def fill_predicate(self, label, namespace='N2MU', uri=None):
    # type: (str, str, str) -> Predicate
    """
    Create an RDF predicate given its label and its namespace
    Parameters
    ----------
    label: str
        Label of predicate
    uri: str
        URI of the predicate, if available (i.e. when extracting concepts from wikidata)
    namespace: str
        Namespace where predicate belongs to

    Returns
    -------
        Predicate object with given label
    """
    predicate_id = self.create_resource_uri(namespace, label) if not uri else URIRef(to_iri(uri))

    return Predicate(predicate_id, Literal(label))
def validateTerm(term, headers):
    # IRIs have a URIRef type
    if type(term) == URIRef:
        iri = None
        template = Template(term)  # E.g. http://example.com/{{jinja_statement}} --> http://example.com/None
        rendered_template = None
        try:
            rendered_template = template.render(**headers)  # E.g. http://example.com/{csv_column_name} --> http://example.com/None
        except TypeError as e:
            # This could happen when LD concepts interact with Jinja concepts, e.g. {{ _row + 'some_string' }}
            # In that case we take the {{ }} out, and assume the template is fine
            # In the rare cases it isn't, the conversion will fail
            rendered_template = re.sub(r'/{{.+}}', '', str(term))
        try:
            potentially_valid_iri = rendered_template.format(**headers)
            iri = iribaker.to_iri(potentially_valid_iri)
            rfc3987.parse(iri, rule='IRI')
        except ValueError as e:
            logger.error(f"Found an invalid IRI: {iri}")
            raise e
def convert_to_rdf(input_file, output_file):
    rows = 0
    data = __load__(input_file)
    GEO, ACT = __setup_namespace__()
    graph = __setup_graph__()
    # add accident ontology created via protege
    graph.parse(ontology_location + 'accident.ttl', format='turtle')

    # filter for a specific year
    filter2020 = data.loc[data['CRASH DATE'].str.split('/', expand=True)[2] == '2020']
    # filter2020.to_csv('./data/csv/accident-NY-2020.csv', index=False)

    for index, accident_data in filter2020.iterrows():
        # Collision_id is primary key
        accident = URIRef(to_iri(accidentVocab + str(accident_data['COLLISION_ID'])))
        vehicleAccident = URIRef(to_iri(accidentVocab + 'VehicleAccident'))
        collision_id = Literal(accident_data['COLLISION_ID'], datatype=XSD['integer'])

        # add accident to graph
        graph.add((accident, RDFS.label, collision_id))
        graph.add((accident, RDF.type, vehicleAccident))

        # setup and add crash date to graph as resource
        crash_date_raw = accident_data['CRASH DATE'].split('/')
        crash_date_formatted = crash_date_raw[2] + "-" + crash_date_raw[0] + "-" + crash_date_raw[1]
        dt = datetime(int(crash_date_raw[2]), int(crash_date_raw[0]), int(crash_date_raw[1])).isoformat()
        crash_date = Literal(dt, datatype=XSD['dateTime'])
        graph.add((accident, ACT['hasDate'], crash_date))

        borough_raw = str(accident_data['BOROUGH']).split(" ")
        borough_data = [b.capitalize() for b in borough_raw]

        # setup and add borough data as resource, only if its defined in current instance
        # borough_data = str(accident_data['BOROUGH']).capitalize()
        if (''.join(borough_data) != 'Nan'):
            borough = URIRef(to_iri(accidentVocab + ''.join(borough_data)))
            graph.add((borough, RDF.type, borough))
            graph.add((borough, RDFS.label, Literal(''.join(borough_data))))
            graph.add((accident, ACT['hasBorough'], borough))

        # setup and add zipcode data as resource, only if its defined in current instance
        if (pd.isnull(accident_data['ZIP CODE']) == False):
            zip = URIRef(to_iri(accidentVocab + str(int(accident_data['ZIP CODE']))))
            zipCode_type = URIRef(to_iri(accidentVocab + 'ZipCode'))
            graph.add((zip, RDFS.label, Literal(int(accident_data['ZIP CODE']))))
            graph.add((zip, RDF.type, zipCode_type))
            graph.add((accident, ACT['inZipCode'], zip))
            if (''.join(borough_data) != 'Nan'):
                graph.add((borough, ACT['containsZipCode'], zip))
                graph.add((zip, ACT['belongsToBorough'], borough))

        # setup and add geo coordinates to graph as literals
        lat = Literal(accident_data['LATITUDE'] if pd.isnull(accident_data['LATITUDE']) == False else 0,
                      datatype=XSD['double'])
        lon = Literal(accident_data['LONGITUDE'] if pd.isnull(accident_data['LONGITUDE']) == False else 0,
                      datatype=XSD['double'])
        graph.add((accident, GEO['lat'], lat))
        graph.add((accident, GEO['long'], lon))

        # setup and add location to graph as resource (used to map to borough if only location is available)
        # 3 decimal values will ensure a precision of 111m
        location_data = '%.3f' % (accident_data['LATITUDE']) + ',' + '%.3f' % (accident_data['LONGITUDE'])
        if (location_data != "nan,nan" and location_data != '0.000,0.000'):
            location = URIRef(to_iri(accidentVocab + location_data))
            location_type = URIRef(to_iri(accidentVocab + 'Location'))
            graph.add((location, RDF.type, location_type))
            graph.add((location, RDFS.label, Literal(location_data)))
            if ''.join(borough_data) != 'Nan':
                graph.add((borough, ACT['containsLocation'], location))
                graph.add((location, ACT['inBorough'], borough))
            graph.add((accident, ACT['inLocation'], location))

        # setup and add street name to graph as Literal
        if (pd.isnull(accident_data['ON STREET NAME']) == False):
            streets = accident_data['ON STREET NAME'].rstrip().split(" ")
            street = [s.capitalize() for s in streets]
            street_data = Literal(''.join(street), datatype=XSD['string'])
            # street_data = URIRef(to_iri(accidentVocab + ''.join(street)))
            graph.add((accident, ACT['hasStreetName'], street_data))

        # setup and add person and pedestrian data to graph as literals
        persons_injured = Literal(int(accident_data['NUMBER OF PERSONS INJURED']), datatype=XSD['integer'])
        graph.add((accident, ACT['hasPersonsInjured'], persons_injured))

        persons_killed = Literal(int(accident_data['NUMBER OF PERSONS KILLED']), datatype=XSD['integer'])
        graph.add((accident, ACT['hasPersonsKilled'], persons_killed))

        pedestrians_injured = Literal(int(accident_data['NUMBER OF PEDESTRIANS INJURED']), datatype=XSD['integer'])
        graph.add((accident, ACT['hasPedestriansInjured'], pedestrians_injured))

        pedestrians_killed = Literal(int(accident_data['NUMBER OF PEDESTRIANS KILLED']), datatype=XSD['integer'])
        graph.add((accident, ACT['hasPedestriansKilled'], pedestrians_killed))

        # setup and add vehicle types involved in the accident to graph as resource
        vehicleType1_split = str(accident_data['VEHICLE TYPE CODE 1']).split(' ')
        vehicleType1_data = ''.join([v.capitalize() for v in vehicleType1_split])
        if (len(vehicleType1_data.split('/')) > 1):
            vehicleType1_data = vehicleType1_data.split('/')[0]
        if (vehicleType1_data != 'Nan' and isValidVehicle(vehicleType1_data)):
            vehicleType1 = URIRef(to_iri(accidentVocab + vehicleType1_data))
            graph.add((vehicleType1, RDF.type, vehicleType1))
            graph.add((vehicleType1, RDFS.label, Literal(vehicleType1_data)))
            graph.add((accident, ACT['hasVehicleType'], vehicleType1))

        if (pd.isnull(accident_data['VEHICLE TYPE CODE 2']) == False):
            vehicleType2_split = str(accident_data['VEHICLE TYPE CODE 2']).split(' ')
            vehicleType2_data = ''.join([v.capitalize() for v in vehicleType2_split])
            if (len(vehicleType2_data.split('/')) > 1):
                vehicleType2_data = vehicleType2_data.split('/')[0]
            if (vehicleType2_data != 'Nan' and isValidVehicle(vehicleType2_data)):
                vehicleType2 = URIRef(to_iri(accidentVocab + vehicleType2_data))
                graph.add((vehicleType2, RDF.type, vehicleType2))
                graph.add((vehicleType2, RDFS.label, Literal(vehicleType2_data)))
                graph.add((accident, ACT['hasVehicleType'], vehicleType2))

        if (pd.isnull(accident_data['VEHICLE TYPE CODE 3']) == False):
            vehicleType3_split = str(accident_data['VEHICLE TYPE CODE 3']).split(' ')
            vehicleType3_data = ''.join([v.capitalize() for v in vehicleType3_split])
            if (len(vehicleType3_data.split('/')) > 1):
                vehicleType3_data = vehicleType3_data.split('/')[0].replace(" ", "").capitalize()
            if (vehicleType3_data != 'Nan' and isValidVehicle(vehicleType3_data)):
                vehicleType3 = URIRef(to_iri(accidentVocab + vehicleType3_data))
                graph.add((vehicleType3, RDF.type, vehicleType3))
                graph.add((vehicleType3, RDFS.label, Literal(vehicleType3_data)))
                graph.add((accident, ACT['hasVehicleType'], vehicleType3))

        # setup and add contributing factors to graph as resource
        if __check_if_Invalid__(accident_data, 'CONTRIBUTING FACTOR VEHICLE 1'):
            contributing_factor_1 = Literal(accident_data['CONTRIBUTING FACTOR VEHICLE 1'], datatype=XSD['string'])
            graph.add((accident, ACT['hasContributingFactor'], contributing_factor_1))

        if __check_if_Invalid__(accident_data, 'CONTRIBUTING FACTOR VEHICLE 2'):
            contributing_factor_2 = Literal(accident_data['CONTRIBUTING FACTOR VEHICLE 2'], datatype=XSD['string'])
            graph.add((accident, ACT['hasContributingFactor'], contributing_factor_2))

        if __check_if_Invalid__(accident_data, 'CONTRIBUTING FACTOR VEHICLE 3'):
            contributing_factor_3 = Literal(accident_data['CONTRIBUTING FACTOR VEHICLE 3'], datatype=XSD['string'])
            graph.add((accident, ACT['hasContributingFactor'], contributing_factor_3))

        # just for debugging purposes
        if ((index % 10000) == 0):
            print("------> Done with " + str(rows) + "0,000 rows...")
            rows += 1

        # only processing 50,000 rows so it can be loaded into protege within reasonable time
        # if (rows == 5):
        #     break

    __save__(graph, output_file)
def process(self, count, rows, chunksize):
    """Process the rows fed to the converter. Count and chunksize are used to
    determine the current row number (needed for default observation identifiers)"""
    obs_count = count * chunksize

    # logger.info("Row: {}".format(obs_count)) #removed for readability

    # We iterate row by row, and then column by column, as given by the CSVW mapping file.
    mult_proc_counter = 0
    iter_error_counter = 0
    for row in rows:
        # This fixes issue:10
        if row is None:
            mult_proc_counter += 1
            # logger.debug( #removed for readability
            #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
            continue

        # set the '_row' value in case we need to generate 'default' URIs for each observation ()
        # logger.debug("row: {}".format(obs_count)) #removed for readability
        row[u'_row'] = obs_count
        count += 1

        # The self.columns dictionary gives the mapping definition per column in the 'columns'
        # array of the CSVW tableSchema definition.
        for c in self.columns:
            c = Item(self.metadata_graph, c)
            # default about URL
            s = self.expandURL(self.aboutURLSchema, row)

            try:
                # Can also be used to prevent the triggering of virtual
                # columns!

                # Get the raw value from the cell in the CSV file
                value = row[unicode(c.csvw_name)]

                # This checks whether we should continue parsing this cell, or skip it.
                if self.isValueNull(value, c):
                    continue
                # If the null values are specified in an array, we need to parse it as a collection (list)
                elif isinstance(c.csvw_null, Item):
                    nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                    if self.equal_to_null(nulls, row):
                        # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                        continue
            except:
                # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                # logger.debug(traceback.format_exc()) #removed for readability
                iter_error_counter += 1
                if isinstance(c.csvw_null, Item):
                    nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                    if self.equal_to_null(nulls, row):
                        # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                        continue

            try:
                # This overrides the subject resource 's' that has been created earlier based on the
                # schema wide aboutURLSchema specification.
                if unicode(c.csvw_virtual) == u'true' and c.csvw_aboutUrl is not None:
                    s = self.expandURL(c.csvw_aboutUrl, row)

                if c.csvw_valueUrl is not None:
                    # This is an object property, because the value needs to be cast to a URL
                    p = self.expandURL(c.csvw_propertyUrl, row)
                    o = self.expandURL(c.csvw_valueUrl, row)
                    if self.isValueNull(os.path.basename(unicode(o)), c):
                        logger.debug("skipping empty value")
                        continue

                    if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                        # Special case: this is a virtual column with object values that are URIs
                        # For now using a test special property
                        value = row[unicode(c.csvw_name)].encode('utf-8')
                        o = URIRef(iribaker.to_iri(value))

                    if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                        about_url = str(c.csvw_aboutUrl)
                        about_url = about_url[about_url.find("{"):about_url.find("}") + 1]
                        s = self.expandURL(about_url, row)
                        # logger.debug("s: {}".format(s))

                        value_url = str(c.csvw_valueUrl)
                        value_url = value_url[value_url.find("{"):value_url.find("}") + 1]
                        o = self.expandURL(value_url, row)
                        # logger.debug("o: {}".format(o))

                    # For coded properties, the collectionUrl can be used to indicate that the
                    # value URL is a concept and a member of a SKOS Collection with that URL.
                    if c.csvw_collectionUrl is not None:
                        collection = self.expandURL(c.csvw_collectionUrl, row)
                        self.g.add((collection, RDF.type, SKOS['Collection']))
                        self.g.add((o, RDF.type, SKOS['Concept']))
                        self.g.add((collection, SKOS['member'], o))

                    # For coded properties, the schemeUrl can be used to indicate that the
                    # value URL is a concept and a member of a SKOS Scheme with that URL.
                    if c.csvw_schemeUrl is not None:
                        scheme = self.expandURL(c.csvw_schemeUrl, row)
                        self.g.add((scheme, RDF.type, SKOS['Scheme']))
                        self.g.add((o, RDF.type, SKOS['Concept']))
                        self.g.add((o, SKOS['inScheme'], scheme))
                else:
                    # This is a datatype property
                    if c.csvw_value is not None:
                        value = self.render_pattern(unicode(c.csvw_value), row)
                    elif c.csvw_name is not None:
                        # print s
                        # print c.csvw_name, self.encoding
                        # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)])
                        # print row[unicode(c.csvw_name)].encode('utf-8')
                        # print '...'
                        value = row[unicode(c.csvw_name)].encode('utf-8')
                    else:
                        raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                    # If propertyUrl is specified, use it, otherwise use
                    # the column name
                    if c.csvw_propertyUrl is not None:
                        p = self.expandURL(c.csvw_propertyUrl, row)
                    else:
                        if "" in self.metadata_graph.namespaces():
                            propertyUrl = self.metadata_graph.namespaces()[""][unicode(c.csvw_name)]
                        else:
                            propertyUrl = "{}{}".format(get_namespaces()['sdv'], unicode(c.csvw_name))
                        p = self.expandURL(propertyUrl, row)

                    if c.csvw_datatype is not None:
                        if URIRef(c.csvw_datatype) == XSD.anyURI:
                            # The xsd:anyURI datatype will be cast to a proper IRI resource.
                            o = URIRef(iribaker.to_iri(value))
                        elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                            # If it is a string datatype that has a language, we turn it into a
                            # language tagged literal
                            # We also render the lang value in case it is a
                            # pattern.
                            o = Literal(value, lang=self.render_pattern(c.csvw_lang, row))
                        else:
                            o = Literal(value, datatype=c.csvw_datatype, normalize=False)
                    else:
                        # It's just a plain literal without datatype.
                        o = Literal(value)

                # Add the triple to the assertion graph
                self.g.add((s, p, o))

                # Add provenance relating the propertyUrl to the column id
                if '@id' in c:
                    self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))
            except:
                # print row[0], value
                traceback.print_exc()

        # We increment the observation (row number) with one
        obs_count += 1

    logger.debug(
        "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
    logger.debug(
        "{} errors encountered while trying to iterate over a NoneType...".format(iter_error_counter))
    logger.info("... done")
    return self.ds.serialize(format=self.output_format)
def convert_unemployment_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)

        enum = 0
        graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/trump/resource/unemployment_eu_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['GEO'].strip()))
            country_name = Literal(row['GEO'].strip(), datatype=XSD['string'])
            unemployment_rate = URIRef(to_iri(resource + 'Unemployment_rate' + str(enum)))

            try:
                gender = row['SEX'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            # Preprocess dates
            temp_date = row['TIME'].strip()
            try:
                date = Literal(temp_date, datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                unemployment_value = Literal(row['Value'].strip(), datatype=XSD['float'])
            except Exception as e:
                unemployment_value = Literal('N/A', datatype=XSD['string'])

            try:
                unit = row['UNIT']
                if 'total' in unit:
                    unit_value = Literal('Total population', datatype=XSD['string'])
                else:
                    unit_value = Literal('Active population', datatype=XSD['string'])
            except Exception as e:
                unit_value = Literal('N/A', datatype=XSD['string'])

            try:
                age_group = Literal(row['AGE'].strip(), datatype=XSD['string'])
            except Exception as e:
                age_group = Literal('N/A', datatype=XSD['string'])

            print 'Country : ' + country_name + ', in year ' + date + ', had unemployment rate : ' \
                + unemployment_value + ', for age group : ' + age_group + ', unit : ' + unit_value

            graph.add((country, RDF.type, VOCAB['Country']))
            graph.add((country, VOCAB['unemployment_rate'], unemployment_rate))

            dataset.add((unemployment_rate, VOCAB['gender'], gender))
            dataset.add((unemployment_rate, VOCAB['indicator_value'], unemployment_value))
            dataset.add((unemployment_rate, VOCAB['time_period'], date))
            dataset.add((unemployment_rate, VOCAB['country'], country))
            dataset.add((unemployment_rate, VOCAB['unit'], unit_value))

            enum += 1

    return dataset, graph
def convert_inflow_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)

        enum = 0
        graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/the_migration_portal/resource/inflow_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            from_country_code = URIRef(to_iri(geo_country_code + row['Code'].strip() + "/"))
            temp_from_country_name = row['Country of birth/nationality'].strip().replace(",", "")
            from_country = URIRef(to_iri(dbr + temp_from_country_name))
            from_country_name = Literal(temp_from_country_name, datatype=XSD['string'])

            to_country_code = URIRef(to_iri(geo_country_code + row['COU'].strip() + "/"))
            temp_to_country_name = row['Country'].strip().replace(",", "")
            to_country = URIRef(to_iri(dbr + temp_to_country_name))
            to_country_name = Literal(temp_to_country_name, datatype=XSD['string'])

            inflow = URIRef(to_iri(resource + 'Inflow' + str(enum)))

            try:
                gender = row['Gender'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            # Preprocess dates
            try:
                date = Literal(row['Year'].strip(), datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                inflow_value = int(row['Value'].strip())
                # print type(inflow_value)
                if isinstance(inflow_value, int):
                    # print "This number is an int"
                    inflow_value = Literal(row['Value'].strip(), datatype=XSD['int'])
                else:
                    # print "This number is a float"
                    inflow_value = Literal(inflow_value, datatype=XSD['float'])
            except Exception as e:
                inflow_value = Literal('N/A', datatype=XSD['string'])

            # print 'From Country : ' + from_country_name + ' to country ' + to_country_name + ', in year ' + date + ', inflow value : ' \
            #     + inflow_value
            print 'Converting row' + str(enum)

            dataset.add((from_country, RDF.type, DBO['Country']))
            dataset.add((from_country, RDFS.label, from_country_name))
            dataset.add((from_country, GCC['country_code'], from_country_code))

            graph.add((inflow, RDF.type, VOCAB['Inflow_of_population']))
            graph.add((inflow, VOCAB['to_country'], to_country))
            graph.add((inflow, VOCAB['from_country'], from_country))
            graph.add((inflow, VOCAB['movement_time_period'], date))
            graph.add((inflow, VOCAB['movement_value'], inflow_value))
            graph.add((inflow, VOCAB['gender'], gender))

            enum += 1

    return dataset, graph
def build_schema(infile, outfile, delimiter=None, quotechar='\"', encoding=None, dataset_name=None,
                 base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the resulting JSON CSVW
    schema to ``outfile``.

    Takes various optional parameters for instructing the CSV reader, but is also quite good
    at guessing the right values.
    """
    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        detector = UniversalDetector()
        with open(infile, 'r') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(detector.result['encoding'],
                                                                   detector.result['confidence']))

    if delimiter is None:
        with open(infile, 'rb') as csvfile:
            # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
            # read only the header instead of the entire file to determine delimiter
            dialect = csv.Sniffer().sniff(csvfile.readline())
            csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(dialect, dialect.delimiter))
        delimiter = dialect.delimiter

    logger.info("Delimiter is: {}".format(delimiter))

    if base.endswith('/'):
        base = base[:-1]

    metadata = {
        "@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        "@context": ["https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
                     {"@language": "en",
                      "@base": "{}/".format(base)},
                     get_namespaces(base)],
        "url": url,
        "dialect": {"delimiter": delimiter,
                    "encoding": encoding,
                    "quoteChar": quotechar},
        "dc:title": dataset_name,
        "dcat:keyword": [],
        "dc:publisher": {
            "schema:name": "CLARIAH Structured Data Hub - Datalegend",
            "schema:url": {"@id": "http://datalegend.net"}
        },
        "dc:license": {"@id": "http://opendefinition.org/licenses/cc-by/"},
        "dc:modified": {"@value": today, "@type": "xsd:date"},
        "tableSchema": {
            "columns": [],
            "primaryKey": None,
            "aboutUrl": "{_row}"
        }
    }

    with io.open(infile, 'r', encoding=encoding) as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)
        header = r.next()

        logger.info("Found headers: {}".format(header))

        if u'' in header:
            logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse")
        if len(set(header)) < len(header):
            logger.warning("WARNING: You have two or more column headers that are syntactically the same. Conversion might produce incorrect results because of conflated URIs or worse")

        # First column is primary key
        metadata['tableSchema']['primaryKey'] = header[0]

        for head in header:
            col = {
                "@id": iribaker.to_iri("{}/{}/column/{}".format(base, url, head)),
                "name": head,
                "titles": [head],
                "dc:description": head,
                "datatype": "string"
            }
            metadata['tableSchema']['columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
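# A hedged example of calling build_schema(); the file and dataset names below are hypothetical,
# not from the original source. It reads the CSV header and writes a CSVW JSON schema:
#
#   build_schema('data/observations.csv', 'data/observations.csv-metadata.json',
#                dataset_name='observations', base='https://iisg.amsterdam/')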
def build_schema(infile, outfile, delimiter=',', quotechar='\"', dataset_name=None):
    """Builds a basic QBer-style schema (probably deprecated)"""
    if dataset_name is None:
        dataset_name = os.path.basename(infile)
    dataset_uri = to_iri(SDR[dataset_name])

    metadata = {
        "dataset": {
            "file": infile,
            "name": dataset_name,
            "uri": dataset_uri,
            "variables": {}
        }
    }

    with open(infile, 'r') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)
        header = r.next()
        logger.debug(header)

        for variable in header:
            variable = variable.decode('utf-8')
            variable_iri = to_iri(_RESOURCE_URI_PATTERN.format('variable', variable))
            col = {
                "category": "identifier",
                "category_comment": "`category` can be one of identifier, coded or other",
                "description": "The variable '{}' as taken from the '{}' dataset.".format(variable, dataset_name),
                "label": variable,
                "uri": variable_iri,
                "original": {
                    "label": variable,
                    "uri": variable_iri
                },
                "type": "http://purl.org/linked-data/cube#DimensionProperty",
                "valueUrl": "{}/{{{}}}".format(variable_iri, variable),
                "datatype_REMOVEME": "Any XML Schema datatype, only applicable for variables of type `other`",
                "transform_REMOVEME": "Any body of a JavaScript function, that returns some value based on an input `v`, the actual value of a variable",
                "values_REMOVEME": [
                    {
                        "comment": "`values` is a list of variable values that has the form specified here",
                        "count": "The frequency of this value for this variable",
                        "label": "The value itself, used as Literal value or as label in case of `identifier` or `coded`",
                        "original": {
                            "label": "The original value, in case of a mapped/modified value",
                            "uri": "The original URI of the value (typically follows the `valueUrl` template)"
                        },
                        "uri": "The URI for the value, ignored in case of `other`"
                    }
                ]
            }
            metadata['dataset']['variables'][variable] = col

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
def convert_population_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)

        enum = 0
        # graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/trump/resource/population_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['GEO'].strip()))
            country_name = Literal(row['GEO'].strip(), datatype=XSD['string'])
            population = URIRef(to_iri(resource + 'Population' + str(enum)))
            pop_type = Literal(row['CITIZEN'].strip(), datatype=XSD['string'])

            try:
                gender = row['SEX'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            # Preprocess dates
            temp_date = row['TIME'].strip()
            try:
                date = Literal(temp_date, datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                temp = row['Value'].strip().replace(',', '')
                if temp == ':':
                    # ':' marks a missing value in the source data
                    population_value = Literal('N/A', datatype=XSD['string'])
                else:
                    population_value = Literal(temp, datatype=XSD['int'])
            except Exception as e:
                population_value = Literal('N/A', datatype=XSD['string'])

            try:
                age_group = Literal(row['AGE'].strip(), datatype=XSD['string'])
            except Exception as e:
                age_group = Literal('N/A', datatype=XSD['string'])

            print 'Country : ' + country_name + ', in year ' + date + ', had population : ' \
                + population_value + ', for age group : ' + age_group

            population_label = Literal('Population_' + country_name + '_' + date, datatype=XSD['string'])

            dataset.add((country, RDF.type, DBO['Country']))
            dataset.add((country, RDFS.label, country_name))
            dataset.add((country, VOCAB['population'], population))

            graph.add((population, RDF.type, VOCAB['Population']))
            graph.add((population, RDFS.label, population_label))
            graph.add((population, VOCAB['country'], country))
            graph.add((population, VOCAB['gender'], gender))
            graph.add((population, VOCAB['population_type'], pop_type))
            graph.add((population, VOCAB['population_value'], population_value))
            graph.add((population, VOCAB['time_period'], date))

            enum += 1

    return dataset, graph
def get_values(self):
    """
    Return all unique values, and convert them to samples for each column.
    """
    # Get all unique values for each column
    stats = {}
    for col in self.data.columns:
        istats = []
        counts = self.data[col].value_counts()
        # print self.data[col][0]
        for i in counts.index:
            print col, i
            # The URI for the variable value
            i_uri = iribaker.to_iri(u"{}/value/{}/{}".format(self.dataset_uri, col, i))

            # Capture the counts and label in a dictionary for the value
            stat = {
                'original': {
                    'uri': i_uri,
                    'label': i
                },
                'label': i,
                'uri': i_uri,
                'count': counts[i]
            }
            # And append it to the list of variable values
            istats.append(stat)

        # The URI for the variable
        variable_uri = iribaker.to_iri("{}/variable/{}".format(self.dataset_uri, col))
        # The URI for a (potential) codelist for the variable
        codelist_uri = iribaker.to_iri("{}/codelist/{}".format(self.dataset_uri, col))
        codelist_label = "Codelist generated from the values for '{}'".format(col)

        codelist = {
            'original': {
                'uri': codelist_uri,
                'label': codelist_label
            },
            'uri': codelist_uri,
            'label': codelist_label
        }

        stats[col] = {
            'original': {
                'uri': variable_uri,
                'label': col
            },
            'uri': variable_uri,
            'label': col,
            'description': "The variable '{}' as taken from the '{}' dataset.".format(col, self.dataset_name),
            'category': 'identifier',
            'type': 'http://purl.org/linked-data/cube#DimensionProperty',  # This is the default
            'values': istats,
            'codelist': codelist
        }

    return stats
def get_value_uri(dataset, variable, value):
    """Generates a variable value IRI for a given combination of dataset, variable and value"""
    BASE = get_base_uri(dataset)

    return iribaker.to_iri(BASE['code/' + variable + '/' + value])
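# Illustrative only (the dataset/variable/value names are made up): get_value_uri() appends a
# 'code/<variable>/<value>' path to the dataset's base namespace and passes it through iribaker:
#
#   get_value_uri('census_1889', 'sex', 'female')
#   # -> an IRI like '<base>/code/sex/female', depending on what get_base_uri() returns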
def convert_unemployment_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)

        enum = 0
        # graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/trump/resource/unemployment_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['GEO'].strip()))
            country_name = Literal(row['GEO'].strip(), datatype=XSD['string'])  # Fix Germany
            unemployment_rate = URIRef(to_iri(resource + 'Unemployment_rate' + str(enum)))

            try:
                gender = row['SEX'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            # Preprocess dates
            temp_date = row['TIME'].strip()
            try:
                date = Literal(temp_date, datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                unemployment_value = Literal(row['Value'].strip(), datatype=XSD['float'])
            except Exception as e:
                unemployment_value = Literal('N/A', datatype=XSD['string'])

            try:
                unit = row['UNIT']
                if 'total' in unit:
                    unit_value = Literal('Total population', datatype=XSD['string'])
                else:
                    unit_value = Literal('Active population', datatype=XSD['string'])
            except Exception as e:
                unit_value = Literal('N/A', datatype=XSD['string'])

            try:
                age_group = Literal(row['AGE'].strip(), datatype=XSD['string'])
            except Exception as e:
                age_group = Literal('N/A', datatype=XSD['string'])

            print 'Country : ' + country_name + ', in year ' + date + ', had unemployment rate : ' \
                + unemployment_value + ', for age group : ' + age_group + ', unit : ' + unit_value

            unemployment_rate_label = Literal('Unemployment_rate_' + country_name + '_' + date, datatype=XSD['string'])

            dataset.add((country, RDF.type, DBO['Country']))
            dataset.add((country, RDFS.label, country_name))
            dataset.add((country, VOCAB['unemployment_rate'], unemployment_rate))

            graph.add((unemployment_rate, RDF.type, VOCAB['Unemployment_rate']))
            graph.add((unemployment_rate, RDFS.label, unemployment_rate_label))
            graph.add((unemployment_rate, VOCAB['gender'], gender))
            graph.add((unemployment_rate, VOCAB['indicator_value'], unemployment_value))
            graph.add((unemployment_rate, VOCAB['time_period'], date))
            graph.add((unemployment_rate, VOCAB['country'], country))
            graph.add((unemployment_rate, VOCAB['unit'], unit_value))

            enum += 1

    return dataset, graph
def convert_inflow_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)
        enum = 0
        graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/the_migration_portal/resource/inflow_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset
        for row in csv_contents[1:]:
            # Pre-processing of the data + creation of triples
            from_country_code = URIRef(to_iri(geo_country_code + row['Code'].strip() + "/"))
            temp_from_country_name = row['Country of birth/nationality'].strip().replace(",", "")
            from_country = URIRef(to_iri(dbr + temp_from_country_name))
            from_country_name = Literal(temp_from_country_name, datatype=XSD['string'])
            to_country_code = URIRef(to_iri(geo_country_code + row['COU'].strip() + "/"))
            temp_to_country_name = row['Country'].strip().replace(",", "")
            to_country = URIRef(to_iri(dbr + temp_to_country_name))
            to_country_name = Literal(temp_to_country_name, datatype=XSD['string'])
            inflow = URIRef(to_iri(resource + 'Inflow' + str(enum)))
            try:
                gender = row['Gender'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])
            # Preprocess dates
            try:
                date = Literal(row['Year'].strip(), datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])
            # Store whole numbers as xsd:int and anything else as xsd:float
            try:
                raw_value = row['Value'].strip()
                try:
                    inflow_value = Literal(int(raw_value), datatype=XSD['int'])
                except ValueError:
                    inflow_value = Literal(float(raw_value), datatype=XSD['float'])
            except Exception as e:
                inflow_value = Literal('N/A', datatype=XSD['string'])
            print 'Converting row ' + str(enum)
            dataset.add((from_country, RDF.type, DBO['Country']))
            dataset.add((from_country, RDFS.label, from_country_name))
            dataset.add((from_country, GCC['country_code'], from_country_code))
            graph.add((inflow, RDF.type, VOCAB['Inflow_of_population']))
            graph.add((inflow, VOCAB['to_country'], to_country))
            graph.add((inflow, VOCAB['from_country'], from_country))
            graph.add((inflow, VOCAB['movement_time_period'], date))
            graph.add((inflow, VOCAB['movement_value'], inflow_value))
            graph.add((inflow, VOCAB['gender'], gender))
            enum += 1
    return dataset, graph
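All of these converters rely on module-level names (dbr, resource, sdmx_code, geo_country_code, VOCAB, DBO, GCC, GEO, prov) that are bound elsewhere in the project. A sketch of plausible bindings, plus an example invocation, is shown below; every base IRI marked as a placeholder is an assumption, not the project's actual configuration, and several imports are presumably already present at the top of the module.

from rdflib import Dataset, Namespace, URIRef, Literal, RDF, RDFS, XSD
from iribaker import to_iri

dbr = 'http://dbpedia.org/resource/'
sdmx_code = 'http://purl.org/linked-data/sdmx/2009/code#'
prov = 'http://www.w3.org/ns/prov#'
resource = 'http://example.org/migration/resource/'          # placeholder base
geo_country_code = 'http://example.org/migration/country/'   # placeholder base

VOCAB = Namespace('http://example.org/migration/vocab/')     # placeholder vocabulary
GCC = Namespace('http://example.org/migration/vocab/')       # placeholder vocabulary
DBO = Namespace('http://dbpedia.org/ontology/')
GEO = Namespace('http://www.w3.org/2003/01/geo/wgs84_pos#')

dataset = Dataset()
inflow_graph_uri = URIRef(resource + 'inflow_graph')
dataset, inflow_graph = convert_inflow_csv('inflow.csv', dataset, inflow_graph_uri)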
def convert_dataset(path, dataset, graph_uri, museums=True):
    with open(path, 'r') as f:
        json_data = json.load(f)
    graph = dataset.graph(graph_uri)
    country = URIRef(to_iri(dbr + 'Kingdom of the Netherlands'))
    for event_data in json_data:
        event = URIRef(to_iri(resource + event_data['title'].strip()))
        title = Literal(event_data['title'].strip(), datatype=XSD['string'])
        dates = event_data['dates']
        if dates != []:
            single_dates = [Literal(datetime.strptime(d, '%d-%m-%Y').date())
                            for d in dates['singles']] if 'singles' in dates else []
            start_date = Literal(datetime.strptime(dates['startdate'], '%d-%m-%Y').date()) \
                if 'startdate' in dates else None
            end_date = Literal(datetime.strptime(dates['enddate'], '%d-%m-%Y').date()) \
                if 'enddate' in dates and dates['enddate'] != '' else None
        location_dict = event_data['location']
        location_d_name = location_dict['name'].strip()
        if location_d_name != '':
            place = URIRef(to_iri(resource + location_d_name))
            place_name = Literal(location_d_name, datatype=XSD['string'])
            location_city_str = location_dict['city'].strip().capitalize()
            location = URIRef(to_iri(resource + location_city_str + '/' + location_dict['adress'].strip()))
            location_city = URIRef(to_iri(dbr + location_city_str))
            location_address = Literal(location_dict['adress'].strip())
            location_zip = Literal(location_dict['zipcode'].strip())
            location_lat = Literal(float(location_dict['latitude'].replace(',', '.')))
            location_lon = Literal(float(location_dict['longitude'].replace(',', '.')))
        medias = []
        if event_data['media']:
            medias = [(Literal(m['url'].strip(), datatype=XSD['anyURI']), m['main'].strip() == 'true')
                      for m in event_data['media']]
        urls = [Literal(url.strip(), datatype=XSD['anyURI']) for url in event_data['urls']]
        details_dict = event_data['details']
        details = []
        for lang in details_dict:
            detail = {}
            if details_dict[lang]['calendarsummary'].strip() != '':
                detail['calendar_summary'] = Literal(details_dict[lang]['calendarsummary'].strip(), lang=lang)
            if details_dict[lang]['longdescription'].strip() != '':
                detail['long_description'] = Literal(details_dict[lang]['longdescription'].strip(), lang=lang)
            if details_dict[lang]['shortdescription'].strip() != '':
                detail['short_description'] = Literal(details_dict[lang]['shortdescription'].strip(), lang=lang)
            details.append(detail)
        graph.add((event, RDF.type, VOCAB['Event']))
        graph.add((event, RDFS.label, title))
        if dates != []:
            for single_date in single_dates:
                graph.add((event, VOCAB['single_date'], single_date))
            if start_date:
                graph.add((event, VOCAB['start_date'], start_date))
            if end_date:
                graph.add((event, VOCAB['end_date'], end_date))
        if location_d_name != '':
            if museums:
                graph.add((event, VOCAB['exhibitionVenue'], place))
            else:
                graph.add((event, VOCAB['playVenue'], place))
            graph.add((place, RDF.type, VOCAB['Venue']))
            graph.add((place, RDFS.label, place_name))
            graph.add((place, VOCAB['venueLocation'], location))
            graph.add((location, RDF.type, VOCAB['Location']))
            graph.add((location, RDFS.label, location_address))
            graph.add((location, DBO['address'], location_address))
            graph.add((location, DBO['city'], location_city))
            graph.add((location, DBO['postalCode'], location_zip))
            graph.add((location, DBO['country'], country))
            graph.add((location, GEO['lat'], location_lat))
            graph.add((location, GEO['long'], location_lon))
        if medias:
            for m in medias:
                graph.add((event, VOCAB['main_media'] if m[1] else VOCAB['media'], m[0]))
        for url in urls:
            graph.add((event, VOCAB['url'], url))
        for detail in details:
            if 'calendar_summary' in detail:
                graph.add((event, VOCAB['calendar_summary'], detail['calendar_summary']))
            if 'long_description' in detail:
                graph.add((event, VOCAB['long_description'], detail['long_description']))
            if 'short_description' in detail:
                graph.add((event, VOCAB['short_description'], detail['short_description']))
    return dataset, graph
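A minimal driving sketch for convert_dataset; the JSON file name and graph IRI are placeholders, and it assumes json, datetime and the namespace setup sketched earlier are already imported at module level.

events_dataset = Dataset()
events_graph_uri = URIRef('http://example.org/events/resource/museum_events_graph')
events_dataset, events_graph = convert_dataset('museum_events.json', events_dataset,
                                               events_graph_uri, museums=True)
print(len(events_graph))  # number of triples produced in the events graph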
def get_variable_uri(dataset, variable):
    """Generates a variable IRI for a given combination of dataset and variable"""
    BASE = get_base_uri(dataset)
    return iribaker.to_iri(BASE[variable])
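With the same hypothetical get_base_uri sketched after get_value_uri above, this would produce IRIs of the form <dataset base>/<variable>:

variable_iri = get_variable_uri('hisco', 'occupation')
# e.g. 'http://data.socialhistory.org/resource/hisco/occupation' under that sketch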
def convert_csv(path, dataset, graph_uri):
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(path)
        graph_uri = URIRef('http://localhost:5820/test/resource/movement_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset
        for row in csv_contents[2:]:
            # Pre-processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['Country'].strip()))
            country_name = Literal(row['Country'].strip(), datatype=XSD['string'])
            net_migration = URIRef(to_iri(resource + row['Net migration'].strip()))
            try:
                net_migration_value = Literal(int(row['Net migration']) * 1000, datatype=XSD['int'])
            except Exception as e:
                net_migration_value = Literal('N/A', datatype=XSD['string'])
            international_migrant_stock = URIRef(to_iri(resource + row['International migrant stock'].strip()))
            try:
                international_migrant_stock_value = Literal(int(row['International migrant stock']) * 1000,
                                                            datatype=XSD['int'])
            except Exception as e:
                international_migrant_stock_value = Literal('N/A', datatype=XSD['string'])
            tertiary_educated_emigration = URIRef(to_iri(
                resource + row['Emigration rate of tertiary educated to OECD countries'].strip()))
            try:
                tertiary_educated_emigration_value_prct = Literal(
                    float(row['Emigration rate of tertiary educated to OECD countries']), datatype=XSD['float'])
            except Exception as e:
                tertiary_educated_emigration_value_prct = Literal('N/A', datatype=XSD['string'])
            refugees_by_country_of_origin = URIRef(to_iri(resource + row['Refugees By country of origin'].strip()))
            try:
                # make them thousands
                refugees_by_country_of_origin_value = Literal(
                    int(float(row['Refugees By country of origin']) * 1000), datatype=XSD['int'])
            except Exception as e:
                refugees_by_country_of_origin_value = Literal('N/A', datatype=XSD['string'])
            refugees_by_country_of_asylum = URIRef(to_iri(resource + row['Refugees By country of asylum'].strip()))
            try:
                # make them thousands and int
                refugees_by_country_of_asylum_value = Literal(
                    int(float(row['Refugees By country of asylum']) * 1000), datatype=XSD['int'])
            except Exception as e:
                refugees_by_country_of_asylum_value = Literal('N/A', datatype=XSD['string'])
            personal_remittances_received = URIRef(to_iri(resource + row['Personal remittances received'].strip()))
            try:
                # make them millions
                personal_remittances_received_value = Literal(
                    long(row['Personal remittances received']) * 1000000, datatype=XSD['long'])
            except Exception as e:
                personal_remittances_received_value = Literal('N/A', datatype=XSD['string'])
            personal_remittances_paid = URIRef(to_iri(resource + row['Personal remittances paid'].strip()))
            try:
                # make them millions
                personal_remittances_paid_value = Literal(
                    long(row['Personal remittances paid']) * 1000000, datatype=XSD['long'])
            except Exception as e:
                personal_remittances_paid_value = Literal('N/A', datatype=XSD['string'])

            # Add data to the named graph
            graph.add((country, RDF.type, VOCAB['Country']))
            graph.add((country, RDFS.label, country_name))
            graph.add((country, VOCAB['net_migration'], net_migration_value))
            graph.add((country, VOCAB['international_migrant_stock'], international_migrant_stock_value))
            graph.add((country, VOCAB['emigration_rate_to_OECD'], tertiary_educated_emigration_value_prct))
            graph.add((country, VOCAB['refugees_by_country_of_origin'], refugees_by_country_of_origin_value))
            graph.add((country, VOCAB['refugees_by_country_of_asylum'], refugees_by_country_of_asylum_value))
            graph.add((country, VOCAB['personal_remittances_received'], personal_remittances_received_value))
            graph.add((country, VOCAB['personal_remittances_paid'], personal_remittances_paid_value))

            # Add data to the default graph of the dataset
            dataset.add((country, RDF.type, VOCAB['Country']))
            dataset.add((country, RDFS.label, country_name))
            dataset.add((net_migration, RDF.type, VOCAB['Net_migration']))
            dataset.add((net_migration, VOCAB['value'], net_migration_value))
            dataset.add((net_migration, VOCAB['year'], Literal('2012', datatype=XSD['gYear'])))
            dataset.add((international_migrant_stock, RDF.type, VOCAB['International_migrant_stock']))
            dataset.add((international_migrant_stock, VOCAB['value'], international_migrant_stock_value))
            dataset.add((international_migrant_stock, VOCAB['year'], Literal('2010', datatype=XSD['gYear'])))
            dataset.add((tertiary_educated_emigration, RDF.type, VOCAB['Emigration_rate_to_OECD']))
            dataset.add((tertiary_educated_emigration, VOCAB['value'], tertiary_educated_emigration_value_prct))
            dataset.add((tertiary_educated_emigration, VOCAB['year'], Literal('2000', datatype=XSD['gYear'])))
            dataset.add((refugees_by_country_of_origin, RDF.type, VOCAB['Refugees_by_country_of_origin']))
            dataset.add((refugees_by_country_of_origin, VOCAB['value'], refugees_by_country_of_origin_value))
            dataset.add((refugees_by_country_of_origin, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))
            dataset.add((refugees_by_country_of_asylum, RDF.type, VOCAB['Refugees_by_country_of_asylum']))
            dataset.add((refugees_by_country_of_asylum, VOCAB['value'], refugees_by_country_of_asylum_value))
            dataset.add((refugees_by_country_of_asylum, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))
            dataset.add((personal_remittances_received, RDF.type, VOCAB['Personal_remittances_received']))
            dataset.add((personal_remittances_received, VOCAB['value'], personal_remittances_received_value))
            dataset.add((personal_remittances_received, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))
            dataset.add((personal_remittances_paid, RDF.type, VOCAB['Personal_remittances_paid']))
            dataset.add((personal_remittances_paid, VOCAB['value'], personal_remittances_paid_value))
            dataset.add((personal_remittances_paid, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))

        # Provenance for the indicator properties, attached to the same terms used above
        info = u'Frédéric Docquier, B. Lindsay Lowell, and Abdeslam Marfouk\'s "A Gendered Assessment of Highly Skilled Emigration" (2009)'
        graph.add((VOCAB['net_migration'], URIRef(to_iri(prov + 'wasDerivedFrom')),
                   Literal('United Nations Population Division, World Population Prospects',
                           datatype=XSD['string'])))
        graph.add((VOCAB['international_migrant_stock'], URIRef(to_iri(prov + 'wasDerivedFrom')),
                   Literal('United Nations Population Division, Trends in Total Migrant Stock: 2012 Revision.',
                           datatype=XSD['string'])))
        graph.add((VOCAB['emigration_rate_to_OECD'], URIRef(to_iri(prov + 'wasDerivedFrom')),
                   Literal(info, datatype=XSD['string'])))
        graph.add((VOCAB['refugees_by_country_of_origin'], URIRef(to_iri(prov + 'wasDerivedFrom')),
                   Literal('United Nations High Commissioner for Refugees (UNHCR), Statistical Yearbook and data files, '
                           'complemented by statistics on Palestinian refugees under the mandate of the UNRWA as published on its website. '
                           'Data from UNHCR are available online at: www.unhcr.org/statistics/populationdatabase.',
                           datatype=XSD['string'])))
        graph.add((VOCAB['refugees_by_country_of_asylum'], URIRef(to_iri(prov + 'wasDerivedFrom')),
                   Literal('United Nations High Commissioner for Refugees (UNHCR), Statistical Yearbook and data files, '
                           'complemented by statistics on Palestinian refugees under the mandate of the UNRWA as published on its website. '
                           'Data from UNHCR are available online at: www.unhcr.org/statistics/populationdatabase.',
                           datatype=XSD['string'])))
        graph.add((VOCAB['personal_remittances_received'], URIRef(to_iri(prov + 'wasDerivedFrom')),
                   Literal('World Bank staff estimates based on IMF balance of payments data.',
                           datatype=XSD['string'])))
        graph.add((VOCAB['personal_remittances_paid'], URIRef(to_iri(prov + 'wasDerivedFrom')),
                   Literal('World Bank staff estimates based on IMF balance of payments data.',
                           datatype=XSD['string'])))
    return dataset, graph
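Once the converters have run, the populated Dataset can be written out for inspection or upload; a minimal sketch, with the output file name as an example only:

# Serialize the default graph plus all named graphs as TriG.
dataset.serialize(destination='migration_portal.trig', format='trig')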