Example #1
    def _create_claim_graph(self,
                            subject,
                            subject_label,
                            object,
                            object_label,
                            predicate,
                            type='Statement'):
        # Claim graph
        claim_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Claims'))
        claim_graph = self.dataset.graph(claim_graph_uri)

        # Statement
        statement_id = hash_statement_id(
            [subject_label, predicate, object_label])

        statement = URIRef(to_iri(self.namespaces['LW'] + statement_id))
        statement_type1 = URIRef(to_iri(self.namespaces['GRASP'] + type))
        statement_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))
        statement_type3 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))

        # Create graph and add triple
        graph = self.dataset.graph(statement)
        graph.add((subject, self.namespaces['N2MU'][predicate], object))

        claim_graph.add((statement, RDF.type, statement_type1))
        claim_graph.add((statement, RDF.type, statement_type2))
        claim_graph.add((statement, RDF.type, statement_type3))

        return claim_graph, statement
def convert_asylum_csv(path, dataset, graph_uri):
    with open(path,'r') as csvfile:
        country = URIRef(to_iri(dbr + 'Kingdom of the Netherlands'))
        csv_contents = csv_parser(path)
        enum = 0
        graph = dataset.graph(graph_uri)                                   # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            asylum_seeker = URIRef(to_iri(resource + 'Asylum_seekers_' + str(enum)))
            try:
                gender = row['Geslacht'].strip()
                if gender == 'Vrouwen':
                    gender = URIRef(to_iri(sdmx_code + 'sex-F'))
                else:
                    gender = URIRef(to_iri(sdmx_code + 'sex-M'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            try:
                nationality_value = Literal(row['Nationaliteit'].strip(), lang='nl')
                nationality = URIRef(to_iri(resource + nationality_value))
            except Exception as e:
                nationality = Literal('N/A', datatype=XSD['string'])

            # Preprocess dates
            temp_date = row['Perioden']
            date = temp_date.split()
            year = date[0].strip()
            month = date[1] if len(date) > 1 else None  # guard against single-token periods

            test = dateparser.parse(row['Perioden'], languages=['nl', 'en'])
            temp_date = '{}-{:02d}'.format(test.year, test.month)  # zero-pad single-digit months

            try:
                date = Literal(temp_date, datatype=XSD['gYearMonth'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                value = Literal(row['aantal'].strip(), datatype=XSD['integer'])
            except Exception as e:
                value = Literal('N/A', datatype=XSD['string'])

            graph.add((country, RDF.type, VOCAB['Country']))
            graph.add((country, VOCAB['asylum_seekers'], asylum_seeker))

            dataset.add((asylum_seeker, VOCAB['gender'], gender))
            dataset.add((asylum_seeker, VOCAB['nationality'], nationality))
            dataset.add((asylum_seeker, VOCAB['application_country'],country))
            dataset.add((asylum_seeker, VOCAB['application_period'], date))
            dataset.add((asylum_seeker, VOCAB['value'], value))

            enum += 1

    return dataset, graph
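
A minimal usage sketch for convert_asylum_csv; it assumes the module-level names it relies on (csv_parser, dateparser, dbr, resource, sdmx_code, VOCAB) are defined elsewhere in the file, and the file name is illustrative:

from rdflib import Dataset, URIRef

dataset = Dataset()
graph_uri = URIRef('http://example.org/resource/asylumGraph')  # hypothetical graph URI
dataset, graph = convert_asylum_csv('asylum.csv', dataset, graph_uri)
print(len(graph))  # number of triples in the named graph
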
Example #3
def makegraph(codebook, variable, vocab_name):

    base = 'http://data.socialhistory.org/resource/' + vocab_name + '/'
    vrb_iri = to_iri(base + variable + '/')
    VCB_NAMESPACE = Namespace(vrb_iri)
    SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')

    g = Graph()
    g.bind(vocab_name, VCB_NAMESPACE)
    g.bind('skos', SKOS)
    
    
    g.add((VCB_NAMESPACE[variable], RDF.type, SKOS['Scheme']))
    
    g.add((VCB_NAMESPACE[variable], SKOS['definition'], Literal(codebook['def'][0])))
    if len(codebook) == 1:
        return g

    for i in range(len(codebook['code'])):
        
        iri = to_iri(VCB_NAMESPACE[str(codebook['code'][i])])

        g.add((term.URIRef(iri), RDF.type, SKOS['Concept']))
        g.add((term.URIRef(iri), SKOS['inScheme'], VCB_NAMESPACE[variable]))
        g.add((term.URIRef(iri), SKOS['prefLabel'], Literal(codebook['label'][i])))


        if RepresentsInt(codebook['code'][i]): 
            g.add((term.URIRef(iri), RDF.value, Literal(codebook['code'][i], datatype=XSD.int)))
    
    return g
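
A short usage sketch for makegraph. The codebook layout (a mapping with 'def', 'code' and 'label' entries) is inferred from the function body, the values are illustrative, and the RepresentsInt helper is assumed to be defined alongside it:

codebook = {
    'def': ['Marital status of the respondent'],
    'code': [1, 2],
    'label': ['married', 'single'],
}
g = makegraph(codebook, 'marital_status', 'hisco')
print(g.serialize(format='turtle'))
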
def convert_to_rdf(input_file, output_file):
    rows = 0

    data = __load__(input_file)
    RES, WEA, STA = __setup_namespace__()
    graph = __setup_graph__(RES)
    graph.parse(voc_location + 'weather.ttl', format='turtle')

    filter2020 = data

    for index, weather_data in filter2020.iterrows():

        # Collision_id is primary key
        stationId = URIRef(to_iri(station + str(weather_data['station_id'])))

        Date = URIRef(to_iri(resource + str(weather_data['date'])))

        # graph.add((stationId,WEA['isOn'],Date))
        # data property
        station_id = Literal(weather_data['station_id'],
                             datatype=XSD['string'])
        date = Literal(str(weather_data['date']), datatype=XSD['date'])

        # borough_data = str(accident_data['BOROUGH']).capitalize()
        instance = URIRef(
            to_iri(weatherVocab + str(station_id) + '/' +
                   str(weather_data['date'])))
        graph.add((instance, RDF.type, instance))
        # graph.add((instance, STA['station_id'], stationId))
        graph.add((instance, WEA['stationID'], stationId))
        graph.add((instance, RDFS.label, station_id))
        graph.add((instance, RDFS.label, date))
        # graph.add((Weather, WEA['isinstance'], instance))

        if not pd.isnull(weather_data['AWND']):
            graph.add((instance, WEA['hasAWND'],
                       Literal(weather_data['AWND'], datatype=XSD['int'])))

        if not pd.isnull(weather_data['TMAX']):
            graph.add((instance, WEA['hasTMAX'],
                       Literal(weather_data['TMAX'], datatype=XSD['int'])))

        if not pd.isnull(weather_data['TMIN']):
            graph.add((instance, WEA['hasTMIN'],
                       Literal(weather_data['TMIN'], datatype=XSD['int'])))

        if not pd.isnull(weather_data['TAVG']):
            graph.add((instance, WEA['hasTAVG'],
                       Literal(weather_data['TAVG'], datatype=XSD['int'])))

        if not pd.isnull(weather_data['WESF']):
            graph.add((instance, WEA['hasWESF'],
                       Literal(weather_data['WESF'], datatype=XSD['int'])))

        # just for debugging purposes
        if (index % 10000) == 0:
            print("done with {:,} rows".format(index))
            rows += 1

    __save__(graph, output_file)
Example #5
    def _create_leolani_world(self, capsule, type='Statement'):
        # Instance graph
        instance_graph_uri = URIRef(to_iri(self.namespaces['LW'] +
                                           'Instances'))
        instance_graph = self.dataset.graph(instance_graph_uri)

        # Subject
        if type == 'Statement':
            subject, subject_label = self._generate_subject(
                capsule, instance_graph)
        elif type == 'Experience':
            subject = self._generate_leolani(
                instance_graph) if self.my_uri is None else self.my_uri
            subject_label = 'leolani'

        # Object
        if capsule['object']['type'] == '':  # We only get the label
            object_vocab = OWL
            object_type = 'Thing'
        else:
            object_vocab = self.namespaces['N2MU']
            object_type = capsule['object']['type']

        object_id = capsule['object']['label']

        object = URIRef(to_iri(self.namespaces['LW'] + object_id))
        object_label = Literal(object_id)
        object_type1 = URIRef(to_iri(object_vocab + object_type))
        object_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((object, RDFS.label, object_label))
        instance_graph.add((object, RDF.type, object_type1))
        instance_graph.add((object, RDF.type, object_type2))

        if type == 'Statement':
            claim_graph, statement = self._create_claim_graph(
                subject,
                subject_label,
                object,
                object_label,
                capsule['predicate']['type'],
                type='Statement')
        elif type == 'Experience':
            claim_graph, statement = self._create_claim_graph(
                subject,
                subject_label,
                object,
                object_label,
                'sees',
                type='Experience')

        return instance_graph, claim_graph, subject, object, statement
def main():
    #reading dataset from csv
    fileName = "DBlist.csv"
    df = pd.read_csv(fileName)

    # A namespace for our resources
    data = 'http://dbpedia.org/ontology/resource/'
    DATA = Namespace(data)

    # A namespace for the schema (Classes)
    schema = 'http://dbpedia.org/ontology/'
    CLASS = Namespace(schema)

    # Creating a graph
    graph = Graph()
    graph.bind("owl", OWL)
    graph.bind("rdfs", RDFS)

    logging.info("Reading all data from " + fileName)
    for j in range(len(df)):

        #adding classes to the graph
        classN = URIRef(to_iri(schema + df.loc[j, 'Class_Name']))
        name = Literal(df.loc[j, 'Class_Name'],
                       datatype=XSD['string'])  #the class name label

        graph.add((classN, RDF.type, OWL.Class))
        graph.add((classN, RDF.type, RDFS.Class))
        graph.add((classN, RDFS.label, name))

        # in case there are no instances (only DBpedia)
        if df.loc[j, 'Number_of_Instances'] == 0:
            continue
        MyList = df.loc[j, 'Instances_Names'].split('|')

        # adding instances of a class to the graph
        for c in range(len(MyList)):

            MyList[c] = MyList[c].strip(' " ').replace(" ' ", '')
            instance = URIRef(to_iri(data + MyList[c]))
            graph.add((instance, RDF.type, classN))
            instanceLabel = Literal(
                MyList[c], datatype=XSD['string'])  #creating the label
            graph.add((instance, RDFS.label, instanceLabel))

    outFile = 'TestCase/DBpedia.xml'
    logging.info("Writing the graph to " + outFile)

    with open(outFile, 'wb') as f:
        graph.serialize(f, format='xml')
Example #7
    def convert_info(self):
        """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""

        results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                               WHERE { ?s ?p ?o .
                                                       FILTER(?p = csvw:valueUrl ||
                                                              ?p = csvw:propertyUrl ||
                                                              ?p = csvw:aboutUrl)}""")

        for (s, p, o) in results:
            # Use iribaker
            try:
                # Python 2
                escaped_object = URIRef(iribaker.to_iri(unicode(o)))
            except NameError:
                # Python 3
                escaped_object = URIRef(iribaker.to_iri(str(o)))
                # print(escaped_object)

            # If the escaped IRI of the object is different from the original,
            # update the graph.
            if escaped_object != o:
                self.metadata_graph.set((s, p, escaped_object))
                # Add the provenance of this operation.
                try:
                    # Python 2
                    self.np.pg.add((escaped_object,
                                PROV.wasDerivedFrom,
                                Literal(unicode(o), datatype=XSD.string)))
                except NameError:
                    # Python 3
                    self.np.pg.add((escaped_object,
                                PROV.wasDerivedFrom,
                                Literal(str(o), datatype=XSD.string)))
                    # print(str(o))

        # Walk through the metadata graph to remove illegal "Resource" blank nodes caused by the Python 3 transition.
        for s, p, o in self.metadata_graph.triples((None, None, None)):
            if s.startswith("Resource("):
                self.metadata_graph.remove((s,p,o))
                self.metadata_graph.add((BNode(str(s)[9:-1]), p, o))
                logger.debug("removed a triple because it was not formatted right. (started with \"Resource\")")

        # Add the information of the schema file to the provenance graph of the
        # nanopublication
        self.np.ingest(self.metadata_graph, self.np.pg.identifier)

        # for s,p,o in self.np.triples((None,None,None)):
        #     print(s.__repr__,p.__repr__,o.__repr__)

        return
def convert_parking_dataset(path, dataset, graph_uri):
    with open(path, 'r') as f:
        json_data = json.load(f)

    graph = dataset.graph(graph_uri)
    country = URIRef(to_iri(dbr + 'Kingdom of the Netherlands'))
    city = URIRef(to_iri(dbr + 'Amsterdam'))

    for data in json_data['gehandicaptenparkeerplaatsen']:
        slot_data = data['node']

        data_address = slot_data['Adres'].strip()
        if data_address == '':
            continue
        slot = URIRef(to_iri(resource + data_address))

        slot_loc = URIRef(to_iri(resource + 'Amsterdam/' + data_address))
        slot_loc_address = Literal(data_address)

        data_quantity = slot_data['Aantal'].strip()
        slot_quantity = Literal(int(data_quantity), datatype=XSD['unsignedInt']) if data_quantity != '' else None

        data_info = slot_data['Locatie-info']
        slot_info = Literal(data_info) if data_info != '' else None
        slot_loc_borough = URIRef(to_iri(resource + 'Amsterdam/' + slot_data['Stadsdeel'].strip()))

        slot_coordinates = json.loads(slot_data['locatie'].strip())
        slot_loc_lat = Literal(float(slot_coordinates['coordinates'][1]))
        slot_loc_long = Literal(float(slot_coordinates['coordinates'][0]))


        graph.add((slot, RDF.type, VOCAB['ParkingSlot']))
        graph.add((slot, RDFS.label, slot_loc_address))
        if slot_quantity:
            graph.add((slot, VOCAB['quantity'], slot_quantity))
        if slot_info:
            graph.add((slot, VOCAB['info'], slot_info))
        graph.add((slot, VOCAB['slotLocation'], slot_loc))

        graph.add((slot_loc, RDF.type, VOCAB['Location']))
        graph.add((slot_loc, RDFS.label, slot_loc_address))
        graph.add((slot_loc, DBO['address'], slot_loc_address))
        graph.add((slot_loc, DBO['city'], city))
        graph.add((slot_loc, DBO['country'], country))
        graph.add((slot_loc_borough, RDF.type, VOCAB['Borough']))
        graph.add((slot_loc, VOCAB['borough'], slot_loc_borough))
        graph.add((slot_loc, GEO['lat'], slot_loc_lat))
        graph.add((slot_loc, GEO['long'], slot_loc_long))

    return dataset, graph
Example #9
    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef"""

        try:
            # Python 2
            unicode_url_pattern = unicode(url_pattern)
        except NameError:
            # Python 3: unwrap a repr such as Resource(...) down to the bare pattern
            unicode_url_pattern = str(url_pattern).split(')')[0].split('(')[-1]
        # print(unicode_url_pattern)

        url = self.render_pattern(unicode_url_pattern, row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except Exception:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print(iri)
        return URIRef(iri)
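
The baking-and-validation core of expandURL can be exercised on its own; this sketch uses only iribaker and rfc3987, the same libraries the method itself calls, with a hypothetical rendered pattern:

import iribaker
import rfc3987

url = 'http://example.org/resource/John Doe'  # hypothetical rendered pattern
iri = iribaker.to_iri(url)
rfc3987.parse(iri, rule='IRI')  # raises ValueError if the baked IRI is still invalid
print(iri)
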
Example #10
    def convert_info(self):
        """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""

        results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                               WHERE { ?s ?p ?o .
                                                       FILTER(?p = csvw:valueUrl ||
                                                              ?p = csvw:propertyUrl ||
                                                              ?p = csvw:aboutUrl)}""")

        for (s, p, o) in results:
            # Use iribaker
            escaped_object = URIRef(iribaker.to_iri(unicode(o)))

            # If the escaped IRI of the object is different from the original,
            # update the graph.
            if escaped_object != o:
                self.metadata_graph.set((s, p, escaped_object))
                # Add the provenance of this operation.
                self.np.pg.add((escaped_object,
                                PROV.wasDerivedFrom,
                                Literal(unicode(o), datatype=XSD.string)))

        # Add the information of the schema file to the provenance graph of the
        # nanopublication
        self.np.ingest(self.metadata_graph, self.np.pg.identifier)

        return
Example #11
    def fill_entity(self, label, types, namespace='LW', uri=None):
        # type: (str, list, str, str) -> Entity
        """
        Create an RDF entity given its label, types and its namespace
        Parameters
        ----------
        label: str
            Label of entity
        types: List[str]
            List of types for this entity
        uri: str
            URI of the entity, if available (e.g. when extracting concepts from Wikidata)
        namespace: str
            Namespace where entity belongs to

        Returns
        -------
            Entity object with given label
        """
        if types in [None, ''] and label != '':
            self._log.warning('Unknown type: {}'.format(label))
            return self.fill_entity_from_label(label, namespace)
        else:
            entity_id = self.create_resource_uri(
                namespace, label) if not uri else URIRef(to_iri(uri))
            return Entity(entity_id, Literal(label), types)
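
A usage sketch, assuming a brain-like object exposing fill_entity and an 'LW' entry in its namespaces (both names are illustrative):

entity = brain.fill_entity('piek', types=['person'], namespace='LW')
print(entity)  # Entity minted in the LW namespace with label 'piek'
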
Example #13
    def resource(self, resource_type, resource_name):
        """Produce a resource-URI based on the ``_RESOURCE_URI_PATTERN`` constant"""
        raw_iri = self._RESOURCE_URI_PATTERN.format(resource_type,
                                                    resource_name)
        iri = to_iri(raw_iri)

        return URIRef(iri)
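
A sketch of the same pattern outside the class, with a hypothetical value standing in for the _RESOURCE_URI_PATTERN constant:

from iribaker import to_iri
from rdflib import URIRef

_RESOURCE_URI_PATTERN = 'http://example.org/{}/{}'  # hypothetical pattern
print(URIRef(to_iri(_RESOURCE_URI_PATTERN.format('person', 'John Doe'))))
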
Example #14
    def _generate_leolani(self, instance_graph):
        # Create Leolani
        leolani_id = 'leolani'
        leolani_label = 'leolani'

        leolani = URIRef(to_iri(self.namespaces['LW'] + leolani_id))
        leolani_label = Literal(leolani_label)
        leolani_type1 = URIRef(to_iri(self.namespaces['N2MU'] + 'robot'))
        leolani_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((leolani, RDFS.label, leolani_label))
        instance_graph.add((leolani, RDF.type, leolani_type1))
        instance_graph.add((leolani, RDF.type, leolani_type2))

        self.my_uri = leolani

        return leolani
Example #15
def safe_url(NS, local):
    """Generates a URIRef from the namespace + local part that is safe for
    use in RDF graphs

    Arguments:
    NS      -- an rdflib Namespace object
    local   -- the local name of the resource
    """
    return URIRef(iribaker.to_iri(NS[local]))
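
A short usage sketch; the EX namespace is illustrative:

import iribaker
from rdflib import Namespace, URIRef

EX = Namespace('http://example.org/resource/')
print(safe_url(EX, 'Kingdom of the Netherlands'))  # illegal IRI characters are normalized by iribaker
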
Example #16
    def __init__(self, dataset, file_object=None):
        self.dataset = dataset

        if 'name' not in dataset:
            (head, dataset_local_name) = os.path.split(dataset['filename'])
            (dataset_name, extension) = os.path.splitext(dataset_local_name)
            self.dataset_name = dataset_name
        else:
            self.dataset_name = dataset['name']

        if 'version' in dataset:
            self.dataset_uri = iribaker.to_iri(
                config.QBR_BASE + dataset['version'] + '/' + self.dataset_name)
        else:
            self.dataset_uri = iribaker.to_iri(
                config.QBR_BASE + self.dataset_name)

        print "Initialized adapter"
        return
def convert_to_rdf(input_file, output_file):
    rows = 0
    data = __load__(input_file)
    RES, WEA, STA = __setup_namespace__()
    graph = __setup_graph__(RES)
    graph.parse(ontology + 'weather_type.ttl', format='turtle')

    filter2020 = data

    for index, data in filter2020.iterrows():
        station_id = URIRef(to_iri(station + str(data['STATION_ID'])))
        date = URIRef(to_iri(resource + str(data['DATE'])))
        STATION_ID = Literal(data['STATION_ID'], datatype=XSD['string'])
        date_raw = str(data['DATE'])
        dt = datetime(int(date_raw[0:4]), int(date_raw[4:6]),
                      int(date_raw[6:8])).isoformat()
        DATE = Literal(dt, datatype=XSD['dateTime'])

        instance = URIRef(
            to_iri(weatherVocab + str(STATION_ID) + '/' +
                   str(data['DATE'])))

        graph.add((instance, RDF.type, instance))

        graph.add((instance, STA['station_id'], station_id))
        # graph.add((instance, WEA['station_id'], STATION_ID))
        graph.add((instance, RDFS.label, STATION_ID))
        graph.add((instance, RDFS.label, DATE))

        graph.add((instance, WEA['hasWeatherID'],
                   Literal(data['WEATHER_ID'], datatype=XSD['string'])))
        graph.add((instance, WEA['hasWeatherType'],
                   Literal(data['WEATHER_TYPE'], datatype=XSD['string'])))
        graph.add(
            (instance, WEA['onDate'], Literal(dt, datatype=XSD['dateTime'])))

    __save__(graph, output_file)


#
# input_file = '../data/csv/NY_weather_type_pivot.csv'
# output_file = '../data/rdf/NY_weather_type.rdf'
# convert_to_rdf(input_file,output_file)
Example #19
def standard_mode(table, metadata):
        
    FILE_URL = Namespace(table.url + '#')
        
    g = Graph()
    g.bind('csvw', CSVW)

    tg_bn = BNode()
    t_bn = BNode()
    
    g.add((tg_bn, RDF.type, CSVW.TableGroup))
    g.add((tg_bn, CSVW.table, t_bn))
    
    g.add((t_bn, CSVW.url, URIRef(table.url)))  
    g.add((t_bn, RDF.type, CSVW.Table))

    for s, p, o in metadata.triples((None, URIRef('http://www.w3.org/ns/csvw#column'), None)):
        collection_resource = metadata.value(s, URIRef('http://www.w3.org/ns/csvw#column'))
        collection = Collection(metadata, collection_resource)

    for row in table.rows:
        
        r_bn = BNode()
        rd_bn = BNode()
        
        g.add((t_bn, CSVW.row, r_bn))
        g.add((r_bn, RDF.type, CSVW.Row))
        g.add((r_bn, CSVW.rownum, Literal(row.number, datatype=XSD.integer)))
        g.add((r_bn, CSVW.describes, rd_bn))
        g.add((r_bn, CSVW.url, FILE_URL['row=' + str(row.number + 1)]))


        for cell in row.cells:
             
            if cell.value != "":

                # NB: taking only the last character of the column URI limits this to at most 9 columns
                index = int(str(cell.column)[-1:]) - 1
                column_name = metadata.value(collection[index], URIRef('http://www.w3.org/ns/csvw#title'))

                iri = iribaker.to_iri(FILE_URL[column_name.replace(" ", "%20")])

                try:
                    g.add((rd_bn, URIRef(iri), Literal(cell.value)))
                
                except Exception:
                    print("Exception!")
                    print(column_name)
                    print(FILE_URL[column_name])
                    print(iri)
                    print(Literal(cell.value))

                    traceback.print_exc(file=sys.stdout)
                    print()
        
    return g
Example #20
def iri():
    """
    Bake an IRI using iribaker
    Checks an IRI for compliance with RFC and converts invalid characters to underscores, if possible.
    **NB**: No roundtripping, this procedure may result in identity smushing: two input-IRI's may be
    mapped to the same output-IRI.
    ---
      tags:
        - Base
      consumes:
        - text/json
      parameters:
        - name: iri
          in: query
          description: The IRI to be checked for compliance
          required: true
          type: string
      responses:
        '200':
          description: IRI converted
          schema:
            description: A converted IRI result
            type: object
            properties:
                iri:
                    description: The fully compliant IRI
                    type: string
                source:
                    description: The input IRI
                    type: string
            required:
                - iri
                - source
        default:
          description: Unexpected error
          schema:
            id: Message
            type: object
            properties:
              code:
                type: integer
                format: int32
              message:
                type: string
    """

    unsafe_iri = request.args.get('iri', None)

    if unsafe_iri is not None:
        response = {'iri': iribaker.to_iri(unsafe_iri), 'source': unsafe_iri}
        return jsonify(response)
    else:
        raise Exception(
            "No IRI was provided; pass one as the 'iri' query parameter")
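
A sketch of calling this Flask endpoint, assuming it is routed at /iri on a local development server:

import requests

r = requests.get('http://localhost:5000/iri',
                 params={'iri': 'http://example.org/John Doe'})
print(r.json())  # {'iri': <baked IRI>, 'source': 'http://example.org/John Doe'}
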
Example #22
    def _generate_subject(self, capsule, instance_graph):
        if capsule['subject']['type'] == '':  # We only get the label
            subject_vocab = OWL
            subject_type = 'Thing'
        else:
            subject_vocab = self.namespaces['N2MU']
            subject_type = capsule['subject']['type']

        subject_id = capsule['subject']['label']

        subject = URIRef(to_iri(self.namespaces['LW'] + subject_id))
        subject_label = Literal(subject_id)
        subject_type1 = URIRef(to_iri(subject_vocab + subject_type))
        subject_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((subject, RDFS.label, subject_label))
        instance_graph.add((subject, RDF.type, subject_type1))
        instance_graph.add((subject, RDF.type, subject_type2))

        return subject, subject_label
Example #24
    def _create_perspective_graph(self, capsule, turn_label, type='Statement'):
        # Perspective graph
        perspective_graph_uri = URIRef(
            to_iri(self.namespaces['LTa'] + 'Perspectives'))
        perspective_graph = self.dataset.graph(perspective_graph_uri)

        # Mention
        if type == 'Statement':
            mention_id = turn_label + '_char%s' % capsule['position']
        elif type == 'Experience':
            mention_id = turn_label + '_pixel%s' % capsule['position']
        mention = URIRef(to_iri(self.namespaces['LTa'] + mention_id))
        mention_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Mention'))

        perspective_graph.add((mention, RDF.type, mention_type))

        # Attribution
        attribution_id = mention_id + '_CERTAIN'
        attribution = URIRef(to_iri(self.namespaces['LTa'] + attribution_id))
        attribution_type = URIRef(
            to_iri(self.namespaces['GRASP'] + 'Attribution'))
        attribution_value = URIRef(to_iri(self.namespaces['GRASP'] +
                                          'CERTAIN'))

        perspective_graph.add((attribution, RDF.type, attribution_type))
        perspective_graph.add((attribution, RDF.value, attribution_value))

        return perspective_graph, mention, attribution
Example #25
    def create_resource_uri(self, namespace, resource_name):
        """
        Create a URI for the given resource (entity, predicate, named graph, etc.) in the given namespace
        Parameters
        ----------
        namespace: str
            Namespace where entity belongs to
        resource_name: str
            Label of resource

        Returns
        -------
        uri: str
            Representing the URI of the resource

        """
        if namespace in self.namespaces:
            uri = URIRef(to_iri(self.namespaces[namespace] + resource_name))
        else:
            uri = URIRef(to_iri('{}:{}'.format(namespace, resource_name)))

        return uri
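
A usage sketch, assuming self.namespaces maps prefixes such as 'LW' to base IRIs and that a brain-like object exposes the method:

uri = brain.create_resource_uri('LW', 'piek')     # known namespace: full IRI
curie = brain.create_resource_uri('ex', 'thing')  # unknown namespace: 'ex:thing' baked as-is
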
def convert_to_rdf(input_file, output_file):
    rows = 0

    data = __load__(input_file)
    RES, VOCAB, GEO, STA = __setup_namespace__()
    graph = __setup_graph__(RES, VOCAB)
    graph.parse(voc_location + 'NY_station_2.ttl', format='turtle')

    filter2020 = data

    # print(filter2020)

    for index, data in filter2020.iterrows():

        # station ID is primary key
        station = URIRef(to_iri(stationVocab + str(data['GHCND'])))
        station_id = Literal(data['GHCND'], datatype=XSD['string'])
        graph.add((station, STA['station_id'], station_id))
        # graph.add((station, RDF.lable, station))

        # print(graph)

        lat = Literal(
            data['LAT_DEC'] if not pd.isnull(data['LAT_DEC']) else 0,
            datatype=XSD['double'])
        lon = Literal(
            data['LON_DEC'] if not pd.isnull(data['LON_DEC']) else 0,
            datatype=XSD['double'])
        graph.add((station, GEO['lat'], lat))
        graph.add((station, GEO['long'], lon))

        name = Literal(data['STATION_NAME'], datatype=XSD['string'])
        graph.add((station, RDFS.label, name))

        # countrytag = URIRef(to_iri(stationVocab + '/' + str(data['CC'])))
        country = Literal(data['CC'], datatype=XSD['string'])
        # graph.add((countrytag, RDF.type, countrytag))
        graph.add((station, STA['country'], country))

        # statetag = URIRef(to_iri(stationVocab + '/' +  str(data['ST'])))
        state = Literal(data['ST'], datatype=XSD['string'])
        graph.add((station, STA['state'], state))
        # graph.add((statetag, RDF.type, statetag))

        # countytag = URIRef(to_iri(stationVocab + '/' +  str(data['COUNTY'])))
        county = Literal(data['COUNTY'], datatype=XSD['string'])
        graph.add((station, STA['county'], county))
        # graph.add((countytag, RDF.type, countytag))

    __save__(graph, output_file)
def uri_to_iri(uri):
    """
    convert URI to IRI (used for RDF)
    this function also validates the URI and throws a ValueError if the
    provided URI is invalid
    """
    result = urlparse(uri)
    if not result.scheme or not result.netloc or result.netloc == '-':
        raise ValueError("Provided URI does not have a valid schema or netloc")

    try:
        iri = iribaker.to_iri(uri)
        return iri
    except Exception:
        raise ValueError("Provided URI can't be converted to IRI")
Example #28
    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef"""
        url = self.render_pattern(unicode(url_pattern), row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except Exception:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print "Baked: ", iri
        return URIRef(iri)
Example #29
    def fill_entity_from_label(self, label, namespace='LW', uri=None):
        # type: (str, str, str) -> Entity
        """
        Create an RDF entity given its label and its namespace
        Parameters
        ----------
        label: str
            Label of entity
        uri: str
            URI of the entity, if available (e.g. when extracting concepts from Wikidata)
        namespace: str
            Namespace where entity belongs to

        Returns
        -------
            Entity object with given label and no type information
        """
        entity_id = self.create_resource_uri(
            namespace, label) if not uri else URIRef(to_iri(uri))

        return Entity(entity_id, Literal(label), [''])
Example #30
    def fill_predicate(self, label, namespace='N2MU', uri=None):
        # type: (str, str, str) -> Predicate
        """
        Create an RDF predicate given its label and its namespace
        Parameters
        ----------
        label: str
            Label of predicate
        uri: str
            URI of the predicate, if available (e.g. when extracting concepts from Wikidata)
        namespace: str
            Namespace where predicate belongs to

        Returns
        -------

            Predicate object with given label
        """
        predicate_id = self.create_resource_uri(
            namespace, label) if not uri else URIRef(to_iri(uri))

        return Predicate(predicate_id, Literal(label))
Example #31
def validateTerm(term, headers):
    # IRIs have a URIRef type
    if isinstance(term, URIRef):
        iri = None
        template = Template(term)
        #E.g. http://example.com/{{jinja_statement}} --> http://example.com/None

        rendered_template = None
        try:
            rendered_template = template.render(**headers)
            #E.g. http://example.com/{csv_column_name} --> http://example.com/None
        except TypeError as e:
            # This could happen when LD concepts interact with Jinja concepts, e.g. {{ _row + 'some_string' }}
            # In that case we take the {{ }} out, and assume the template is fine
            # In the rare cases it isn't, the conversion will fail
            rendered_template = re.sub(r'/{{.+}}', '', str(term))

        try:
            potentially_valid_iri = rendered_template.format(**headers)
            iri = iribaker.to_iri(potentially_valid_iri)
            rfc3987.parse(iri, rule='IRI')
        except ValueError as e:
            logger.error(f"Found an invalid IRI: {iri}")
            raise e
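
A sketch of validating a templated term; headers mirrors the column dict validateTerm receives, and the IRI is illustrative:

from rdflib import URIRef

headers = {'name': None}
validateTerm(URIRef('http://example.com/{{name}}'), headers)  # renders, bakes and RFC-checks the IRI
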
def convert_to_rdf(input_file, output_file):
    rows = 0

    data = __load__(input_file)
    GEO, ACT = __setup_namespace__()
    graph = __setup_graph__()
    # add accident ontology created via protege
    graph.parse(ontology_location + 'accident.ttl', format='turtle')

    # filter for a specific year
    filter2020 = data.loc[data['CRASH DATE'].str.split('/', expand=True)[2] ==
                          '2020']
    # filter2020.to_csv('./data/csv/accident-NY-2020.csv', index=False)

    for index, accident_data in filter2020.iterrows():

        # Collision_id is primary key
        accident = URIRef(
            to_iri(accidentVocab + str(accident_data['COLLISION_ID'])))
        vehicleAccident = URIRef(to_iri(accidentVocab + 'VehicleAccident'))
        collision_id = Literal(accident_data['COLLISION_ID'],
                               datatype=XSD['integer'])

        # add accident to graph
        graph.add((accident, RDFS.label, collision_id))
        graph.add((accident, RDF.type, vehicleAccident))

        # setup and add crash date to graph as resource
        crash_date_raw = accident_data['CRASH DATE'].split('/')
        crash_date_formatted = crash_date_raw[2] + "-" + crash_date_raw[
            0] + "-" + crash_date_raw[1]
        dt = datetime(int(crash_date_raw[2]), int(crash_date_raw[0]),
                      int(crash_date_raw[1])).isoformat()
        crash_date = Literal(dt, datatype=XSD['dateTime'])
        graph.add((accident, ACT['hasDate'], crash_date))

        borough_raw = str(accident_data['BOROUGH']).split(" ")
        borough_data = [b.capitalize() for b in borough_raw]

        # setup and add borough data as resource, only if its defined in current instance
        # borough_data = str(accident_data['BOROUGH']).capitalize()
        if (''.join(borough_data) != 'Nan'):
            borough = URIRef(to_iri(accidentVocab + ''.join(borough_data)))
            graph.add((borough, RDF.type, borough))
            graph.add((borough, RDFS.label, Literal(''.join(borough_data))))
            graph.add((accident, ACT['hasBorough'], borough))

        # setup and add zipcode data as resource, only if its defined in current instance
        if not pd.isnull(accident_data['ZIP CODE']):
            zip_code = URIRef(
                to_iri(accidentVocab + str(int(accident_data['ZIP CODE']))))
            zipCode_type = URIRef(to_iri(accidentVocab + 'ZipCode'))
            graph.add(
                (zip_code, RDFS.label, Literal(int(accident_data['ZIP CODE']))))
            graph.add((zip_code, RDF.type, zipCode_type))
            graph.add((accident, ACT['inZipCode'], zip_code))
            if ''.join(borough_data) != 'Nan':
                graph.add((borough, ACT['containsZipCode'], zip_code))
                graph.add((zip_code, ACT['belongsToBorough'], borough))

        # setup and add geo coordinates to graph as literals
        lat = Literal(accident_data['LATITUDE']
                      if pd.isnull(accident_data['LATITUDE']) == False else 0,
                      datatype=XSD['double'])
        lon = Literal(accident_data['LONGITUDE']
                      if pd.isnull(accident_data['LONGITUDE']) == False else 0,
                      datatype=XSD['double'])
        graph.add((accident, GEO['lat'], lat))
        graph.add((accident, GEO['long'], lon))

        # setup and add location to graph as resource (used to map to borough if only location is available)
        # 3 decimal values will ensure a precision of 111m
        location_data = '%.3f' % (accident_data['LATITUDE']) + ',' + '%.3f' % (
            accident_data['LONGITUDE'])
        if (location_data != "nan,nan" and location_data != '0.000,0.000'):
            location = URIRef(to_iri(accidentVocab + location_data))
            location_type = URIRef(to_iri(accidentVocab + 'Location'))
            graph.add((location, RDF.type, location_type))
            graph.add((location, RDFS.label, Literal(location_data)))
            if ''.join(borough_data) != 'Nan':
                graph.add((borough, ACT['containsLocation'], location))
                graph.add((location, ACT['inBorough'], borough))
            graph.add((accident, ACT['inLocation'], location))

        # setup and add street name to graph as Literal
        if (pd.isnull(accident_data['ON STREET NAME']) == False):
            streets = accident_data['ON STREET NAME'].rstrip().split(" ")
            street = [s.capitalize() for s in streets]
            street_data = Literal(''.join(street), datatype=XSD['string'])
            # street_data = URIRef(to_iri(accidentVocab + ''.join(street)))
            graph.add((accident, ACT['hasStreetName'], street_data))

        # setup and add person and pedestrian data to graph as literals
        persons_injured = Literal(int(
            accident_data['NUMBER OF PERSONS INJURED']),
                                  datatype=XSD['integer'])
        graph.add((accident, ACT['hasPersonsInjured'], persons_injured))

        persons_killed = Literal(int(
            accident_data['NUMBER OF PERSONS KILLED']),
                                 datatype=XSD['integer'])
        graph.add((accident, ACT['hasPersonsKilled'], persons_killed))

        pedestrians_injured = Literal(int(
            accident_data['NUMBER OF PEDESTRIANS INJURED']),
                                      datatype=XSD['integer'])
        graph.add(
            (accident, ACT['hasPedestriansInjured'], pedestrians_injured))

        pedestrians_killed = Literal(int(
            accident_data['NUMBER OF PEDESTRIANS KILLED']),
                                     datatype=XSD['integer'])
        graph.add((accident, ACT['hasPedestriansKilled'], pedestrians_killed))

        # setup and add vehicle types involved in the accident to graph as resource
        vehicleType1_split = str(
            accident_data['VEHICLE TYPE CODE 1']).split(' ')
        vehicleType1_data = ''.join(
            [v.capitalize() for v in vehicleType1_split])
        if (len(vehicleType1_data.split('/')) > 1):
            vehicleType1_data = vehicleType1_data.split('/')[0]

        if (vehicleType1_data != 'Nan' and isValidVehicle(vehicleType1_data)):
            vehicleType1 = URIRef(to_iri(accidentVocab + vehicleType1_data))
            graph.add((vehicleType1, RDF.type, vehicleType1))
            graph.add((vehicleType1, RDFS.label, Literal(vehicleType1_data)))
            graph.add((accident, ACT['hasVehicleType'], vehicleType1))

        if (pd.isnull(accident_data['VEHICLE TYPE CODE 2']) == False):
            vehicleType2_split = str(
                accident_data['VEHICLE TYPE CODE 2']).split(' ')
            vehicleType2_data = ''.join(
                [v.capitalize() for v in vehicleType2_split])
            if (len(vehicleType2_data.split('/')) > 1):
                vehicleType2_data = vehicleType2_data.split('/')[0]

            if (vehicleType2_data != 'Nan'
                    and isValidVehicle(vehicleType2_data)):
                vehicleType2 = URIRef(to_iri(accidentVocab +
                                             vehicleType2_data))
                graph.add((vehicleType2, RDF.type, vehicleType2))
                graph.add(
                    (vehicleType2, RDFS.label, Literal(vehicleType2_data)))
                graph.add((accident, ACT['hasVehicleType'], vehicleType2))

        if (pd.isnull(accident_data['VEHICLE TYPE CODE 3']) == False):
            vehicleType3_split = str(
                accident_data['VEHICLE TYPE CODE 3']).split(' ')
            vehicleType3_data = ''.join(
                [v.capitalize() for v in vehicleType3_split])
            if (len(vehicleType3_data.split('/')) > 1):
                vehicleType3_data = vehicleType3_data.split('/')[0].replace(
                    " ", "").capitalize()

            if (vehicleType3_data != 'Nan'
                    and isValidVehicle(vehicleType3_data)):
                vehicleType3 = URIRef(to_iri(accidentVocab +
                                             vehicleType3_data))
                graph.add((vehicleType3, RDF.type, vehicleType3))
                graph.add(
                    (vehicleType3, RDFS.label, Literal(vehicleType3_data)))
                graph.add((accident, ACT['hasVehicleType'], vehicleType3))

        # setup and add contributing factors to graph as resource
        if __check_if_Invalid__(accident_data,
                                'CONTRIBUTING FACTOR VEHICLE 1'):
            contributing_factor_1 = Literal(
                accident_data['CONTRIBUTING FACTOR VEHICLE 1'],
                datatype=XSD['string'])
            graph.add((accident, ACT['hasContributingFactor'],
                       contributing_factor_1))

        if __check_if_Invalid__(accident_data,
                                'CONTRIBUTING FACTOR VEHICLE 2'):
            contributing_factor_2 = Literal(
                accident_data['CONTRIBUTING FACTOR VEHICLE 2'],
                datatype=XSD['string'])
            graph.add((accident, ACT['hasContributingFactor'],
                       contributing_factor_2))

        if __check_if_Invalid__(accident_data,
                                'CONTRIBUTING FACTOR VEHICLE 3'):
            contributing_factor_3 = Literal(
                accident_data['CONTRIBUTING FACTOR VEHICLE 3'],
                datatype=XSD['string'])
            graph.add((accident, ACT['hasContributingFactor'],
                       contributing_factor_3))

        # just for debugging purposes
        if (index % 10000) == 0:
            print("------> Done with {:,} rows...".format(index))
            rows += 1

        # only processing 50,000 rows so it can be loaded into protege within reasonable time
        # if(rows == 5):
        #     break

    __save__(graph, output_file)
Example #33
    def process(self, count, rows, chunksize):
        """Process the rows fed to the converter. Count and chunksize are used to determine the
        current row number (needed for default observation identifiers)"""
        obs_count = count * chunksize

        # logger.info("Row: {}".format(obs_count)) #removed for readability

        # We iterate row by row, and then column by column, as given by the CSVW mapping file.
        mult_proc_counter = 0
        iter_error_counter = 0
        for row in rows:
            # This fixes issue:10
            if row is None:
                mult_proc_counter += 1
                # logger.debug( #removed for readability
                #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
                continue

            # set the '_row' value in case we need to generate 'default' URIs for each observation ()
            # logger.debug("row: {}".format(obs_count)) #removed for readability
            row[u'_row'] = obs_count
            count += 1

            # The self.columns dictionary gives the mapping definition per column in the 'columns'
            # array of the CSVW tableSchema definition.
            for c in self.columns:

                c = Item(self.metadata_graph, c)
                # default about URL
                s = self.expandURL(self.aboutURLSchema, row)

                try:
                    # Can also be used to prevent the triggering of virtual
                    # columns!

                    # Get the raw value from the cell in the CSV file
                    value = row[unicode(c.csvw_name)]
                    # This checks whether we should continue parsing this cell, or skip it.
                    if self.isValueNull(value, c):
                        continue

                    # If the null values are specified in an array, we need to parse it as a collection (list)
                    elif isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))

                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue
                except:
                    # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                    # logger.debug(traceback.format_exc()) #removed for readability
                    iter_error_counter +=1
                    if isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue

                try:
                    # This overrides the subject resource 's' that has been created earlier based on the
                    # schema wide aboutURLSchema specification.
                    if unicode(c.csvw_virtual) == u'true' and c.csvw_aboutUrl is not None:
                        s = self.expandURL(c.csvw_aboutUrl, row)

                    if c.csvw_valueUrl is not None:
                        # This is an object property, because the value needs to be cast to a URL
                        p = self.expandURL(c.csvw_propertyUrl, row)
                        o = self.expandURL(c.csvw_valueUrl, row)
                        if self.isValueNull(os.path.basename(unicode(o)), c):
                            logger.debug("skipping empty value")
                            continue

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                            # Special case: this is a virtual column with object values that are URIs
                            # For now using a test special property
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                            o = URIRef(iribaker.to_iri(value))

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                            about_url = str(c.csvw_aboutUrl)
                            about_url = about_url[about_url.find("{"):about_url.find("}")+1]
                            s = self.expandURL(about_url, row)
                            # logger.debug("s: {}".format(s))
                            value_url = str(c.csvw_valueUrl)
                            value_url = value_url[value_url.find("{"):value_url.find("}")+1]
                            o = self.expandURL(value_url, row)
                            # logger.debug("o: {}".format(o))

                        # For coded properties, the collectionUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Collection with that URL.
                        if c.csvw_collectionUrl is not None:
                            collection = self.expandURL(c.csvw_collectionUrl, row)
                            self.g.add((collection, RDF.type, SKOS['Collection']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((collection, SKOS['member'], o))

                        # For coded properties, the schemeUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Scheme with that URL.
                        if c.csvw_schemeUrl is not None:
                            scheme = self.expandURL(c.csvw_schemeUrl, row)
                            self.g.add((scheme, RDF.type, SKOS['Scheme']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((o, SKOS['inScheme'], scheme))
                    else:
                        # This is a datatype property
                        if c.csvw_value is not None:
                            value = self.render_pattern(unicode(c.csvw_value), row)
                        elif c.csvw_name is not None:
                            # print s
                            # print c.csvw_name, self.encoding
                            # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)])
                            # print row[unicode(c.csvw_name)].encode('utf-8')
                            # print '...'
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                        else:
                            raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                        # If propertyUrl is specified, use it, otherwise use
                        # the column name
                        if c.csvw_propertyUrl is not None:
                            p = self.expandURL(c.csvw_propertyUrl, row)
                        else:
                            if "" in self.metadata_graph.namespaces():
                                propertyUrl = self.metadata_graph.namespaces()[""][
                                    unicode(c.csvw_name)]
                            else:
                                propertyUrl = "{}{}".format(get_namespaces()['sdv'],
                                    unicode(c.csvw_name))

                            p = self.expandURL(propertyUrl, row)

                        if c.csvw_datatype is not None:
                            if URIRef(c.csvw_datatype) == XSD.anyURI:
                                # The xsd:anyURI datatype will be cast to a proper IRI resource.
                                o = URIRef(iribaker.to_iri(value))
                            elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                                # If it is a string datatype that has a language, we turn it into a
                                # language tagged literal
                                # We also render the lang value in case it is a
                                # pattern.
                                o = Literal(value, lang=self.render_pattern(
                                    c.csvw_lang, row))
                            else:
                                o = Literal(value, datatype=c.csvw_datatype, normalize=False)
                        else:
                            # It's just a plain literal without datatype.
                            o = Literal(value)

                    # Add the triple to the assertion graph
                    self.g.add((s, p, o))

                    # Add provenance relating the propertyUrl to the column id
                    if '@id' in c:
                        self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))

                except:
                    # print row[0], value
                    traceback.print_exc()

            # We increment the observation (row number) by one
            obs_count += 1

        logger.debug(
            "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
        logger.debug(
            "{} errors encountered while trying to iterate over a NoneType...".format(mult_proc_counter))
        logger.info("... done")
        return self.ds.serialize(format=self.output_format)
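To make the datatype branching above concrete, here is a minimal, self-contained sketch of how a single cell value becomes an RDF term. The helper name cell_to_term and the sample value are hypothetical; only rdflib and iribaker are assumed.

from rdflib import Literal, URIRef, XSD
import iribaker

def cell_to_term(value, datatype=None, lang=None):
    # Hypothetical helper mirroring the branch above
    if datatype is not None and URIRef(datatype) == XSD.anyURI:
        # xsd:anyURI values are cast to proper IRI resources
        return URIRef(iribaker.to_iri(value))
    elif datatype is not None and URIRef(datatype) == XSD.string and lang is not None:
        # Strings with a language become language-tagged literals
        return Literal(value, lang=lang)
    elif datatype is not None:
        return Literal(value, datatype=datatype)
    # Plain literal without datatype
    return Literal(value)

# cell_to_term('http://example.org/a b', XSD.anyURI) -> URIRef(u'http://example.org/a%20b')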
def convert_unemployment_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)
        enum = 0
        graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/trump/resource/unemployment_eu_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)                                   # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['GEO'].strip()))
            country_name = Literal(row['GEO'].strip(), datatype=XSD['string'])

            unemployment_rate = URIRef(to_iri(resource + 'Unemployment_rate' + str(enum)))
            try:
                # Reading the SEX column raises KeyError when it is absent;
                # the coded value stored is always the sdmx 'Total' code
                gender = row['SEX'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender =  Literal('N/A', datatype= XSD['string'])

            #Preprocess dates
            temp_date = row['TIME'].strip()
            try:
                date = Literal(temp_date,datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype= XSD['string'])

            try:
                unemployment_value = Literal(row['Value'].strip(), datatype= XSD['float'])
            except Exception as e:
                unemployment_value = Literal('N/A', datatype= XSD['string'])

            try:
                unit = row['UNIT']
                if 'total' in unit:
                    unit_value = Literal('Total population', datatype = XSD['string'])
                else:
                    unit_value = Literal('Active population', datatype = XSD['string'])
            except Exception as e:
                unit_value = Literal('N/A', datatype = XSD['string'])

            try:
                age_group = Literal(row['AGE'].strip(), datatype = XSD['string'])
            except Exception as e:
                age_group = Literal('N/A', datatype = XSD['string'])


            print 'Country : '+ country_name + ', in year ' + date + ', had unemployment rate : ' \
            + unemployment_value + ', for age group : '+ age_group + ', unit : ' + unit_value


            graph.add((country, RDF.type, VOCAB['Country']))
            graph.add((country, VOCAB['unemployment_rate'], unemployment_rate))


            dataset.add((unemployment_rate, VOCAB['gender'], gender))
            dataset.add((unemployment_rate, VOCAB['indicator_value'], unemployment_value))
            dataset.add((unemployment_rate, VOCAB['time_period'],date))
            dataset.add((unemployment_rate, VOCAB['country'], country))
            dataset.add((unemployment_rate, VOCAB['unit'], unit_value))

            enum += 1

    return dataset, graph
def convert_inflow_csv(path, dataset, graph_uri):
    filename = path
    with open(path,'r') as csvfile:
        csv_contents = csv_parser(filename)
        enum = 0
        graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/the_migration_portal/resource/inflow_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            from_country_code = URIRef(to_iri(geo_country_code + row['Code'].strip()+"/"))

            temp_from_country_name = row['Country of birth/nationality'].strip().replace(",","")

            from_country = URIRef(to_iri(dbr + temp_from_country_name))
            from_country_name = Literal(temp_from_country_name, datatype=XSD['string'])

            to_country_code = URIRef(to_iri(geo_country_code + row['COU'].strip()+"/"))

            temp_to_country_name = row['Country'].strip().replace(",","")
            to_country =  URIRef(to_iri(dbr + temp_to_country_name ))
            to_country_name = Literal(temp_to_country_name, datatype=XSD['string'])

            inflow = URIRef(to_iri(resource + 'Inflow' + str(enum)))

            try:
                gender = row['Gender'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender =  Literal('N/A', datatype= XSD['string'])

            #Preprocess dates
            try:
                date = Literal(row['Year'].strip(),datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype= XSD['string'])

            try:
                value_str = row['Value'].strip()
                try:
                    # Values are whole counts; store them as xsd:int
                    inflow_value = Literal(int(value_str), datatype= XSD['int'])
                except ValueError:
                    inflow_value = Literal(float(value_str), datatype= XSD['float'])
            except Exception as e:
                inflow_value = Literal('N/A', datatype= XSD['string'])

            #print 'From Country : '+ from_country_name + ' to country ' + to_country_name + ', in year ' + date + ', inflow value : ' \
            #+ inflow_value
            print 'Converting row ' + str(enum)

            dataset.add((from_country, RDF.type, DBO['Country']))
            dataset.add((from_country, RDFS.label, from_country_name))
            dataset.add((from_country, GCC['country_code'], from_country_code))

            graph.add((inflow, RDF.type, VOCAB['Inflow_of_population']))

            graph.add((inflow, VOCAB['to_country'], to_country))
            graph.add((inflow, VOCAB['from_country'], from_country))
            graph.add((inflow, VOCAB['movement_time_period'], date))
            graph.add((inflow, VOCAB['movement_value'], inflow_value))
            graph.add((inflow, VOCAB['gender'], gender))

            enum += 1

    return dataset, graph
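All of these convert_* helpers share one calling convention: they take a CSV path, an rdflib Dataset and a graph URI, and return the updated dataset together with the named graph they filled. A minimal driver might look like the sketch below; the file name and graph URI are hypothetical, and the module-level helpers and namespaces (csv_parser, dbr, resource, sdmx_code, VOCAB) are assumed to be bound.

from rdflib import Dataset, URIRef

dataset = Dataset()
graph_uri = URIRef('http://example.org/resource/inflow_graph')  # hypothetical graph name
dataset, graph = convert_inflow_csv('inflow.csv', dataset, graph_uri)  # hypothetical input file
dataset.serialize(destination='inflow.trig', format='trig')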
Example No. 36
def build_schema(infile, outfile, delimiter=None, quotechar='\"', encoding=None, dataset_name=None, base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the resulting JSON CSVW schema to ``outfile``.

    Takes various optional parameters for instructing the CSV reader, but is also quite good at guessing the right values.
    """

    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        detector = UniversalDetector()
        with open(infile, 'r') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(detector.result['encoding'],
                                                                   detector.result['confidence']))

    if delimiter is None:
        with open(infile, 'rb') as csvfile:
            # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
            dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter
            csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(dialect, dialect.delimiter))
        delimiter = dialect.delimiter


    logger.info("Delimiter is: {}".format(delimiter))

    if base.endswith('/'):
        base = base[:-1]

    metadata = {
        "@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        "@context": ["https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
                     {"@language": "en",
                      "@base": "{}/".format(base)},
                     get_namespaces(base)],
        "url": url,
        "dialect": {"delimiter": delimiter,
                    "encoding": encoding,
                    "quoteChar": quotechar
                    },
        "dc:title": dataset_name,
        "dcat:keyword": [],
        "dc:publisher": {
            "schema:name": "CLARIAH Structured Data Hub - Datalegend",
            "schema:url": {"@id": "http://datalegend.net"}
        },
        "dc:license": {"@id": "http://opendefinition.org/licenses/cc-by/"},
        "dc:modified": {"@value": today, "@type": "xsd:date"},
        "tableSchema": {
            "columns": [],
            "primaryKey": None,
            "aboutUrl": "{_row}"
        }
    }

    with io.open(infile, 'r', encoding=encoding) as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)

        header = r.next()

        logger.info("Found headers: {}".format(header))

        if u'' in header:
            logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse")
        if len(set(header)) < len(header):
            logger.warning("WARNING: You have two or more column headers that are syntactically the same. Conversion might produce incorrect results because of conflated URIs or worse")

        # First column is primary key
        metadata['tableSchema']['primaryKey'] = header[0]

        for head in header:
            col = {
                "@id": iribaker.to_iri("{}/{}/column/{}".format(base, url, head)),
                "name": head,
                "titles": [head],
                "dc:description": head,
                "datatype": "string"
            }

            metadata['tableSchema']['columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
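A typical invocation is sketched below; the file names are hypothetical, and build_schema detects encoding and delimiter itself when they are not given.

build_schema('data.csv', 'data.csv-metadata.json',
             dataset_name='My dataset',
             base='https://iisg.amsterdam/')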
Example No. 37
def build_schema(infile, outfile, delimiter=',', quotechar='\"', dataset_name=None):
    """Builds a basic QBer-style schema (probably deprecated)"""
    
    if dataset_name is None:
        dataset_name = os.path.basename(infile)

    dataset_uri = to_iri(SDR[dataset_name])

    metadata = {
        "dataset": {
            "file": infile,
            "name": dataset_name,
            "uri": dataset_uri,
            "variables": {}
        }
    }

    with open(infile, 'r') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)

        header = r.next()

        logger.debug(header)

        for variable in header:
            variable = variable.decode('utf-8')
            variable_iri = to_iri(_RESOURCE_URI_PATTERN.format('variable', variable))
            col = {
                "category": "identifier",
                "category_comment": "`category` can be one of identifier, coded or other",
                "description": "The variable '{}' as taken from the '{}' dataset.".format(variable, dataset_name),
                "label": variable,
                "uri": variable_iri,
                "original": {
                    "label": variable,
                    "uri": variable_iri
                },
                "type": "http://purl.org/linked-data/cube#DimensionProperty",
                "valueUrl": "{}/{{{}}}".format(variable_iri,variable),
                "datatype_REMOVEME": "Any XML Schema datatype, only applicable for variables of type `other`",
                "transform_REMOVEME": "Any body of a JavaScript function, that returns some value based on an input `v`, the actual value of a variable",
                "values_REMOVEME": [
                    {
                        "comment": "`values` is a list of variable values that has the form specified here",
                        "count": "The frequency of this value for this variable",
                        "label": "The value itself, used as Literal value or as label in case of `identifier` or `coded`",
                        "original": {
                            "label": "The original value, in case of a mapped/modified value",
                            "uri": "The original URI of the value (typically follows the `valueUrl` template)"
                        },
                        "uri": "The URI for the value, ignored in case of `other`"
                    }
                ]
            }

            metadata['dataset']['variables'][variable] = col

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
def convert_population_csv(path, dataset, graph_uri):
    filename = path
    with open(path,'r') as csvfile:
        csv_contents = csv_parser(filename)
        enum = 0
        #graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/trump/resource/population_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)                                   # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['GEO'].strip()))
            country_name = Literal(row['GEO'].strip(), datatype=XSD['string'])
            population = URIRef(to_iri(resource + 'Population' + str(enum)))
            pop_type = Literal(row['CITIZEN'].strip(),datatype=XSD['string'])
            try:
                gender = row['SEX'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender =  Literal('N/A', datatype= XSD['string'])

            #Preprocess dates
            temp_date = row['TIME'].strip()
            try:
                date = Literal(temp_date,datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype= XSD['string'])

            try:
                temp = row['Value'].strip().replace(',','')
                if temp == ':':
                    # ':' marks a missing value in the source data
                    population_value = Literal('N/A', datatype=XSD['string'])
                else:
                    population_value = Literal(temp, datatype=XSD['int'])
            except Exception as e:
                population_value = Literal('N/A', datatype=XSD['string'])

            try:
                age_group = Literal(row['AGE'].strip(), datatype = XSD['string'])
            except Exception as e:
                age_group = Literal('N/A', datatype = XSD['string'])

            print 'Country : '+ country_name + ', in year ' + date + ', had population : ' \
            + population_value + ', for age group : '+ age_group

            population_label = Literal('Population_' + country_name +'_'+ date, datatype=XSD['string'])


            dataset.add((country, RDF.type, DBO['Country']))
            dataset.add((country, RDFS.label, country_name))
            dataset.add((country, VOCAB['population'], population))

            graph.add((population, RDF.type, VOCAB['Population']))
            graph.add((population, RDFS.label, population_label))
            graph.add((population, VOCAB['country'], country))
            graph.add((population, VOCAB['gender'], gender))
            graph.add((population, VOCAB['population_type'], pop_type))
            graph.add((population, VOCAB['population_value'], population_value))
            graph.add((population, VOCAB['time_period'],date))

            enum += 1

    return dataset, graph
Example No. 39
    def get_values(self):
        """
        Return all unique values, and converts it to samples for each column.
        """

        # Get all unique values for each column
        stats = {}
        for col in self.data.columns:
            istats = []

            counts = self.data[col].value_counts()

            # print self.data[col][0]

            for i in counts.index:
                print col, i
                # The URI for the variable value
                i_uri = iribaker.to_iri(u"{}/value/{}/{}"
                                        .format(self.dataset_uri, col, i))

                # Capture the counts and label in a dictionary for the value
                stat = {
                    'original': {
                        'uri': i_uri,
                        'label': i
                    },
                    'label': i,
                    'uri': i_uri,
                    'count': counts[i]
                }

                # And append it to the list of variable values
                istats.append(stat)

            # The URI for the variable
            variable_uri = iribaker.to_iri("{}/variable/{}"
                                           .format(self.dataset_uri, col))
            # The URI for a (potential) codelist for the variable
            codelist_uri = iribaker.to_iri("{}/codelist/{}"
                                           .format(self.dataset_uri, col))

            codelist_label = "Codelist generated from the values for '{}'".format(
                col)

            codelist = {
                'original': {
                    'uri': codelist_uri,
                    'label': codelist_label
                },
                'uri': codelist_uri,
                'label': codelist_label
            }

            stats[col] = {
                'original': {
                    'uri': variable_uri,
                    'label': col
                },
                'uri': variable_uri,
                'label': col,
                'description': "The variable '{}' as taken "
                               "from the '{}' dataset."
                               .format(col, self.dataset_name),
                'category': 'identifier',
                'type': 'http://purl.org/linked-data/cube#DimensionProperty',  # This is the default
                'values': istats,
                'codelist': codelist
            }

        return stats
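To illustrate what get_values computes per column, here is a small sketch with hypothetical data; only pandas and iribaker are assumed.

import pandas as pd
import iribaker

data = pd.DataFrame({'sex': ['F', 'F', 'M']})   # hypothetical sample column
dataset_uri = 'https://example.org/dataset'     # hypothetical dataset URI

counts = data['sex'].value_counts()             # F -> 2, M -> 1
value_uris = [iribaker.to_iri(u"{}/value/{}/{}".format(dataset_uri, 'sex', i))
              for i in counts.index]
# value_uris == [u'https://example.org/dataset/value/sex/F',
#                u'https://example.org/dataset/value/sex/M']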
Example No. 40
def get_value_uri(dataset, variable, value):
    """Generates a variable value IRI for a given combination of dataset, variable and value"""
    BASE = get_base_uri(dataset)

    return iribaker.to_iri(BASE['code/' + variable + '/' + value])
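For example, assuming get_base_uri returns an rdflib Namespace (the base below is hypothetical), the helper composes code IRIs like this:

from rdflib import Namespace
import iribaker

BASE = Namespace('https://example.org/my-dataset/')  # hypothetical base namespace
uri = iribaker.to_iri(BASE['code/' + 'sex' + '/' + 'F'])
# uri == u'https://example.org/my-dataset/code/sex/F'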
def convert_unemployment_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)
        enum = 0
        #graph_uri = URIRef('http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/trump/resource/unemployment_graph')  # The URI for our graph
        graph = dataset.graph(
            graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['GEO'].strip()))
            country_name = Literal(row['GEO'].strip(), datatype=XSD['string'])
            #Fix Germany
            unemployment_rate = URIRef(
                to_iri(resource + 'Unemployment_rate' + str(enum)))
            try:
                gender = row['SEX'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            #Preprocess dates
            temp_date = row['TIME'].strip()
            try:
                date = Literal(temp_date, datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                unemployment_value = Literal(row['Value'].strip(),
                                             datatype=XSD['float'])
            except Exception as e:
                unemployment_value = Literal('N/A', datatype=XSD['string'])

            try:
                unit = row['UNIT']
                if 'total' in unit:
                    unit_value = Literal('Total population',
                                         datatype=XSD['string'])
                else:
                    unit_value = Literal('Active population',
                                         datatype=XSD['string'])
            except Exception as e:
                unit_value = Literal('N/A', datatype=XSD['string'])

            try:
                age_group = Literal(row['AGE'].strip(), datatype=XSD['string'])
            except Exception as e:
                age_group = Literal('N/A', datatype=XSD['string'])


            print 'Country : '+ country_name + ', in year ' + date + ', had unemployment rate : ' \
            + unemployment_value + ', for age group : '+ age_group + ', unit : ' + unit_value

            unemployment_rate_label = Literal('Unemployment_rate_' +
                                              country_name + '_' + date,
                                              datatype=XSD['string'])

            dataset.add((country, RDF.type, DBO['Country']))
            dataset.add((country, RDFS.label, country_name))
            dataset.add(
                (country, VOCAB['unemployment_rate'], unemployment_rate))

            graph.add(
                (unemployment_rate, RDF.type, VOCAB['Unemployment_rate']))
            graph.add((unemployment_rate, RDFS.label, unemployment_rate_label))
            graph.add((unemployment_rate, VOCAB['gender'], gender))
            graph.add((unemployment_rate, VOCAB['indicator_value'],
                       unemployment_value))
            graph.add((unemployment_rate, VOCAB['time_period'], date))
            graph.add((unemployment_rate, VOCAB['country'], country))
            graph.add((unemployment_rate, VOCAB['unit'], unit_value))

            enum += 1

    return dataset, graph
def convert_inflow_csv(path, dataset, graph_uri):
    filename = path
    with open(path, 'r') as csvfile:
        csv_contents = csv_parser(filename)
        enum = 0
        graph_uri = URIRef(
            'http://stardog.clariah-sdh.eculture.labs.vu.nl/databases/the_migration_portal/resource/inflow_graph'
        )  # The URI for our graph
        graph = dataset.graph(
            graph_uri)  # new graph object with our URI from the dataset

        for row in csv_contents[1:]:
            # Pre processing of the data + creation of triples
            from_country_code = URIRef(
                to_iri(geo_country_code + row['Code'].strip() + "/"))

            temp_from_country_name = row['Country of birth/nationality'].strip(
            ).replace(",", "")

            from_country = URIRef(to_iri(dbr + temp_from_country_name))
            from_country_name = Literal(temp_from_country_name,
                                        datatype=XSD['string'])

            to_country_code = URIRef(
                to_iri(geo_country_code + row['COU'].strip() + "/"))

            temp_to_country_name = row['Country'].strip().replace(",", "")
            to_country = URIRef(to_iri(dbr + temp_to_country_name))
            to_country_name = Literal(temp_to_country_name,
                                      datatype=XSD['string'])

            inflow = URIRef(to_iri(resource + 'Inflow' + str(enum)))

            try:
                gender = row['Gender'].strip()
                gender = URIRef(to_iri(sdmx_code + 'Total'))
            except Exception as e:
                gender = Literal('N/A', datatype=XSD['string'])

            #Preprocess dates
            try:
                date = Literal(row['Year'].strip(), datatype=XSD['gYear'])
            except Exception as e:
                date = Literal('N/A', datatype=XSD['string'])

            try:
                value_str = row['Value'].strip()
                try:
                    # Values are whole counts; store them as xsd:int
                    inflow_value = Literal(int(value_str), datatype=XSD['int'])
                except ValueError:
                    inflow_value = Literal(float(value_str),
                                           datatype=XSD['float'])
            except Exception as e:
                inflow_value = Literal('N/A', datatype=XSD['string'])

            #print 'From Country : '+ from_country_name + ' to country ' + to_country_name + ', in year ' + date + ', inflow value : ' \
            #+ inflow_value
            print 'Converting row ' + str(enum)

            dataset.add((from_country, RDF.type, DBO['Country']))
            dataset.add((from_country, RDFS.label, from_country_name))
            dataset.add((from_country, GCC['country_code'], from_country_code))

            graph.add((inflow, RDF.type, VOCAB['Inflow_of_population']))

            graph.add((inflow, VOCAB['to_country'], to_country))
            graph.add((inflow, VOCAB['from_country'], from_country))
            graph.add((inflow, VOCAB['movement_time_period'], date))
            graph.add((inflow, VOCAB['movement_value'], inflow_value))
            graph.add((inflow, VOCAB['gender'], gender))

            enum += 1

    return dataset, graph
def convert_dataset(path, dataset, graph_uri, museums=True):
    with open(path, 'r') as f:
        json_data = json.load(f)

    graph = dataset.graph(graph_uri)
    country = URIRef(to_iri(dbr + 'Kingdom of the Netherlands'))

    for event_data in json_data:
        event = URIRef(to_iri(resource + event_data['title'].strip()))
        title = Literal(event_data['title'].strip(), datatype=XSD['string'])

        dates = event_data['dates']
        if dates != []:
            single_dates = [Literal(datetime.strptime(d, '%d-%m-%Y').date())
                for d in dates['singles']] if dates.has_key('singles') else []
            start_date = Literal(datetime.strptime(dates['startdate'], '%d-%m-%Y').date()) \
                if dates.has_key('startdate') else None
            end_date = Literal(datetime.strptime(dates['enddate'], '%d-%m-%Y').date()) \
                if dates.has_key('enddate') and dates['enddate'] != '' else None

        location_dict = event_data['location']
        location_d_name = location_dict['name'].strip()
        if location_d_name != '':
            place = URIRef(to_iri(resource + location_d_name))
            place_name = Literal(location_d_name, datatype=XSD['string'])
            location_city_str = location_dict['city'].strip().capitalize()
            location = URIRef(to_iri(resource + location_city_str + '/' + location_dict['adress'].strip()))
            location_city = URIRef(to_iri(dbr + location_city_str))
            location_address = Literal(location_dict['adress'].strip())
            location_zip = Literal(location_dict['zipcode'].strip())
            location_lat = Literal(float(location_dict['latitude'].replace(',', '.')))
            location_lon = Literal(float(location_dict['longitude'].replace(',', '.')))

        # Default to an empty list so 'medias' is always bound below
        medias = []
        if event_data['media']:
            medias = [(Literal(m['url'].strip(), datatype=XSD['anyURI']), m['main'].strip() == 'true') for m in event_data['media']]

        urls = [Literal(url.strip(), datatype=XSD['anyURI']) for url in event_data['urls']]

        details_dict = event_data['details']
        details = []
        for lang in details_dict.iterkeys():
            detail = {}
            if details_dict[lang]['calendarsummary'].strip() != '':
                detail['calendar_summary'] = Literal(details_dict[lang]['calendarsummary'].strip(), lang=lang)
            if details_dict[lang]['longdescription'].strip() != '':
                detail['long_description'] = Literal(details_dict[lang]['longdescription'].strip(), lang=lang)
            if details_dict[lang]['shortdescription'].strip() != '':
                detail['short_description'] = Literal(details_dict[lang]['shortdescription'].strip(), lang=lang)
            details.append(detail)


        graph.add((event, RDF.type, VOCAB['Event']))
        graph.add((event, RDFS.label, title))

        if dates != []:
            for single_date in single_dates:
                graph.add((event, VOCAB['single_date'],  single_date))
            if start_date:
                graph.add((event, VOCAB['start_date'], start_date))
            if end_date:
                graph.add((event, VOCAB['end_date'], end_date))

        if location_d_name != '':
            if museums:
                graph.add((event, VOCAB['exhibitionVenue'], place))
            else:
                graph.add((event, VOCAB['playVenue'], place))
            graph.add((place, RDF.type, VOCAB['Venue']))
            graph.add((place, RDFS.label, place_name))
            graph.add((place, VOCAB['venueLocation'], location))
            graph.add((location, RDF.type, VOCAB['Location']))
            graph.add((location, RDFS.label, location_address))
            graph.add((location, DBO['address'], location_address))
            graph.add((location, DBO['city'], location_city))
            graph.add((location, DBO['postalCode'], location_zip))
            graph.add((location, DBO['country'], country))
            graph.add((location, GEO['lat'], location_lat))
            graph.add((location, GEO['long'], location_lon))

        if medias:
            for m in medias:
                graph.add((event, VOCAB['main_media'] if m[1] else VOCAB['media'], m[0]))

        for url in urls:
            graph.add((event, VOCAB['url'], url))

        for detail in details:
            if detail.has_key('calendar_summary'):
                graph.add((event, VOCAB['calendar_summary'], detail['calendar_summary']))
            if detail.has_key('long_description'):
                graph.add((event, VOCAB['long_description'], detail['long_description']))
            if detail.has_key('short_description'):
                graph.add((event, VOCAB['short_description'], detail['short_description']))

    return dataset, graph
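The date handling above leans on datetime.strptime with the '%d-%m-%Y' pattern; rdflib then infers xsd:date from the resulting Python date object. A small sketch with a hypothetical value:

from datetime import datetime
from rdflib import Literal

start_date = Literal(datetime.strptime('01-09-2017', '%d-%m-%Y').date())
# -> Literal(u'2017-09-01', datatype=XSD.date)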
Example No. 44
def get_variable_uri(dataset, variable):
    """Generates a variable IRI for a given combination of dataset and variable"""
    BASE = get_base_uri(dataset)

    return iribaker.to_iri(BASE[variable])
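With the same kind of hypothetical base namespace, get_variable_uri simply appends the variable name:

from rdflib import Namespace
import iribaker

BASE = Namespace('https://example.org/my-dataset/')  # hypothetical base namespace
variable_uri = iribaker.to_iri(BASE['sex'])
# variable_uri == u'https://example.org/my-dataset/sex'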
Example No. 45
def convert_csv(path, dataset, graph_uri):
    filename = path
    with open(path,'r') as csvfile:
        csv_contents = csv_parser(filename)

        graph_uri = URIRef('http://localhost:5820/test/resource/movement_graph')  # The URI for our graph
        graph = dataset.graph(graph_uri)                                   # new graph object with our URI from the dataset

        for row in csv_contents[2:]:
            # Pre processing of the data + creation of triples
            country = URIRef(to_iri(dbr + row['Country'].strip()))
            country_name = Literal(row['Country'].strip(), datatype=XSD['string'])

            net_migration = URIRef(to_iri(resource + row['Net migration'].strip()))
            try:
                net_migration_value = Literal((int(row['Net migration']) * 1000), datatype=XSD['int'])
            except Exception as e:
                net_migration_value = Literal('N/A',datatype=XSD['string'])

            international_migrant_stock = URIRef(to_iri(resource + row['International migrant stock'].strip()))
            try:
                international_migrant_stock_value = Literal((int(row['International migrant stock']) * 1000), datatype=XSD['int'])
            except Exception as e:
                international_migrant_stock_value = Literal('N/A',datatype=XSD['string'])

            tertiary_educated_emigration = URIRef(to_iri(resource + \
            row['Emigration rate of tertiary educated to OECD countries'].strip()))
            try:
                tertiary_educated_emigration_value_prct = float(row['Emigration rate of tertiary educated to OECD countries'])
                tertiary_educated_emigration_value_prct = Literal(tertiary_educated_emigration_value_prct, datatype=XSD['float'])
            except Exception as e:
                tertiary_educated_emigration_value_prct = Literal('N/A',datatype=XSD['string'])

            refugees_by_country_of_origin = URIRef(to_iri(resource + \
            row['Refugees By country of origin'].strip()))
            try:
                refugees_by_country_of_origin_value = int(float(row['Refugees By country of origin']) * 1000) # make them thousands
                refugees_by_country_of_origin_value = Literal(refugees_by_country_of_origin_value, datatype=XSD['int'])
            except Exception as e:
                refugees_by_country_of_origin_value = Literal('N/A', datatype=XSD['string'])

            refugees_by_country_of_asylum = URIRef(to_iri(resource + \
            row['Refugees By country of asylum'].strip()))
            try:
                refugees_by_country_of_asylum_value = int(float(row['Refugees By country of asylum']) * 1000) # make them thousands and int
                refugees_by_country_of_asylum_value = Literal(refugees_by_country_of_asylum_value, datatype=XSD['int'])
            except Exception as e:
                refugees_by_country_of_asylum_value = Literal('N/A', datatype=XSD['string'])

            personal_remittances_received = URIRef(to_iri(resource + row['Personal remittances received'].strip()))
            try:
                personal_remittances_received_value = long(row['Personal remittances received']) * 1000000 # make them millions and int
                personal_remittances_received_value = Literal(personal_remittances_received_value, datatype=XSD['long'])
            except Exception as e:
                personal_remittances_received_value = Literal('N/A', datatype=XSD['string'])

            personal_remittances_paid = URIRef(to_iri(resource + row['Personal remittances paid'].strip()))
            try:
                personal_remittances_paid_value = long(row['Personal remittances paid']) * 1000000 # make them millions
                personal_remittances_paid_value = Literal(personal_remittances_paid_value, datatype=XSD['long'])
            except Exception as e:
                personal_remittances_paid_value = Literal('N/A', datatype=XSD['string'])

            # Add data to graph_uri_base
            graph.add((country, RDF.type, VOCAB['Country']))
            graph.add((country, RDFS.label, country_name))
            graph.add((country, VOCAB['net_migration'], net_migration_value))
            graph.add((country, VOCAB['international_migrant_stock'], international_migrant_stock_value))
            graph.add((country, VOCAB['emigration_rate_to_OECD'], tertiary_educated_emigration_value_prct))
            graph.add((country, VOCAB['refugees_by_country_of_origin'],refugees_by_country_of_origin_value))
            graph.add((country, VOCAB['refugees_by_country_of_asylum'],refugees_by_country_of_asylum_value))
            graph.add((country, VOCAB['personal_remittances_received'],personal_remittances_received_value))
            graph.add((country, VOCAB['personal_remittances_paid'], personal_remittances_paid_value))

            dataset.add((country, RDF.type, VOCAB['Country']))
            dataset.add((country, RDFS.label, country_name))

            dataset.add((net_migration, RDF.type , VOCAB['Net_migration']))
            dataset.add((net_migration, VOCAB['value'], net_migration_value))
            dataset.add((net_migration, VOCAB['year'], Literal('2012', datatype=XSD['gYear'])))

            dataset.add((international_migrant_stock, RDF.type, VOCAB['International_migrant_stock']))
            dataset.add((international_migrant_stock,VOCAB['value'],international_migrant_stock_value))
            dataset.add((international_migrant_stock, VOCAB['year'], Literal('2010', datatype=XSD['gYear'])))

            dataset.add((tertiary_educated_emigration, RDF.type, VOCAB['Emigration_rate_to_OECD']))
            dataset.add((tertiary_educated_emigration, VOCAB['value'], tertiary_educated_emigration_value_prct))
            dataset.add((tertiary_educated_emigration, VOCAB['year'], Literal('2000', datatype=XSD['gYear'])))

            dataset.add((refugees_by_country_of_origin,RDF.type,VOCAB['Refugees_by_country_of_origin']))
            dataset.add((refugees_by_country_of_origin, VOCAB['value'], refugees_by_country_of_origin_value))
            dataset.add((refugees_by_country_of_origin, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))

            dataset.add((refugees_by_country_of_asylum,RDF.type,VOCAB['Refugees_by_country_of_asylum']))
            dataset.add((refugees_by_country_of_asylum, VOCAB['value'],
            refugees_by_country_of_asylum_value))
            dataset.add((refugees_by_country_of_asylum, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))

            dataset.add((personal_remittances_received,RDF.type,VOCAB['Personal_remittances_received']))
            dataset.add((personal_remittances_received, VOCAB['value'],
            personal_remittances_received_value))
            dataset.add((personal_remittances_received, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))

            dataset.add((personal_remittances_paid, RDF.type, VOCAB['Personal_remittances_paid']))
            dataset.add((personal_remittances_paid, VOCAB['value'], personal_remittances_paid_value))
            dataset.add((personal_remittances_paid, VOCAB['year'], Literal('2014', datatype=XSD['gYear'])))

    info = u'Frédéric Docquier, B. Lindsay Lowell, and Abdeslam Marfouk\'s "A Gendered Assessment of Highly Skilled Emigration" (2009)'

    graph.add((VOCAB['net_migration'], URIRef(to_iri(prov + 'wasDerivedFrom')),
    Literal('United Nations Population Division, World Population Prospects', datatype=XSD['string'])))

    graph.add((VOCAB['international_migrant_stock'], URIRef(to_iri(prov + 'wasDerivedFrom')),
    Literal('United Nations Population Division, Trends in Total Migrant Stock: 2012 Revision.', datatype=XSD['string'])))

    graph.add((VOCAB['emigration_rate_to_OECD'], URIRef(to_iri(prov + 'wasDerivedFrom')),
    Literal(info, datatype=XSD['string'])))

    graph.add((VOCAB['refugees_by_country_of_origin'], URIRef(to_iri(prov + 'wasDerivedFrom')),
    Literal('United Nations High Commissioner for Refugees (UNHCR), Statistical Yearbook and data files, \
    complemented by statistics on Palestinian refugees under the mandate of the UNRWA as published on its website.\
    Data from UNHCR are available online at: www.unhcr.org/statistics/populationdatabase.', datatype=XSD['string'])))

    graph.add((VOCAB['refugees_by_country_of_asylum'], URIRef(to_iri(prov + 'wasDerivedFrom')),
    Literal('United Nations High Commissioner for Refugees (UNHCR), Statistical Yearbook and data files, \
    complemented by statistics on Palestinian refugees under the mandate of the UNRWA as published on its website. \
    Data from UNHCR are available online at: www.unhcr.org/statistics/populationdatabase.', datatype=XSD['string'])))

    graph.add((VOCAB['personal_remittances_received'], URIRef(to_iri(prov + 'wasDerivedFrom')),
    Literal('World Bank staff estimates based on IMF balance of payments data.', datatype=XSD['string'])))

    graph.add((VOCAB['personal_remittances_paid'], URIRef(to_iri(prov + 'wasDerivedFrom')),
    Literal('World Bank staff estimates based on IMF balance of payments data.', datatype=XSD['string'])))

    return dataset, graph
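Because each indicator property carries a prov:wasDerivedFrom note, the provenance can be read back with a simple pattern query. A sketch, assuming the prov variable above holds the PROV-O namespace string and using a hypothetical input file:

from rdflib import Dataset, URIRef

dataset, graph = convert_csv('movement.csv', Dataset(), None)  # hypothetical input file
PROV_DERIVED = URIRef('http://www.w3.org/ns/prov#wasDerivedFrom')
for prop, source in graph.subject_objects(PROV_DERIVED):
    print prop, source   # each indicator property and its documented source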