Example #1
0
    def _build_config_lookup(self, config):
        """Populates self._config_dict from a csv mapping file.

        Each row is keyed by its FIELD_NAME column; empty-valued cells are
        dropped from the stored record.

        Args:
            config (str): Path to the csv mapping file.

        Returns:
            None

        """
        if config is None:
            log("Mapping file not provided")
            return

        if not os.path.isfile(config):
            log("{} file not found".format(config))
            return

        with open(config) as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Drop empty values. Use != rather than "is not": identity
                # comparison against a string literal is unreliable.
                # .items() instead of .iteritems() for Py2/Py3 compatibility.
                res = {k: v for k, v in row.items() if v != ""}
                self._config_dict[row[FIELD_NAME]] = res
Example #2
0
    def _build_config_lookup(self, config):
        """Populates self._config_dict from a csv mapping file.

        Each row is keyed by its FIELD_NAME column; empty-valued cells are
        dropped from the stored record.

        Args:
            config (str): Path to the csv mapping file.

        Returns:
            None

        """
        if config is None:
            log("Mapping file not provided")
            return

        if not os.path.isfile(config):
            log("{} file not found".format(config))
            return

        with open(config) as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Drop empty values. Use != rather than "is not": identity
                # comparison against a string literal is unreliable.
                # .items() instead of .iteritems() for Py2/Py3 compatibility.
                res = {k: v for k, v in row.items() if v != ""}
                self._config_dict[row[FIELD_NAME]] = res
Example #3
0
    def build_graph(self, dd, mapping):
        """Constructs a graph from the data dictionary using a config file.

        Args:
            dd (str): Path to the data dictionary csv file.
            mapping (str): Path to a csv formatted config file with
                supplementary information. Currently unused — the lookup
                call below is disabled.

        Returns:
            None

        """
        log("Processing: {}".format(dd))
        # NOTE(review): config lookup intentionally left disabled here;
        # confirm intent before re-enabling or deleting.
        # self._build_config_lookup(mapping)
        self._build_datadict(dd)
    def build_graph(self, dd, mapping):
        """Constructs a graph from the data dictionary using a config file.

        Args:
            dd (str): Path to the data dictionary csv file.
            mapping (str): Path to a csv formatted config file with
                supplementary information. Currently unused — the lookup
                call below is disabled.

        Returns:
            None

        """
        log("Processing: {}".format(dd))
        # NOTE(review): config lookup intentionally left disabled here;
        # confirm intent before re-enabling or deleting.
        # self._build_config_lookup(mapping)
        self._build_datadict(dd)
Example #5
0
    def add_metadata(self, metadata_path):
        """Adds the dataset metadata to the graph

        Args:
            metadata_path (str): Path to a csv formatted file with the dataset
                metadata

        Returns:
            None

        """
        if metadata_path is None:
            log("Metadata file is not provided")
            return

        if not os.path.isfile(metadata_path):
            log("{} file not found".format(metadata_path))
            return
        log("Metadata processing: {}".format(metadata_path))

        with open(metadata_path) as f:
            for row in csv.DictReader(f):
                # One dcat-style dataset node per metadata row.
                term = URIRef(row[DATASET_ID])
                statements = [
                    (self.terms.rdf_type, self.terms.dataset_type),
                    (self.terms.title, Literal(row[TITLE])),
                    (self.terms.description, Literal(row[DESCRIPTION])),
                    (self.terms.publisher, Literal(row[PUBLISHER])),
                    (self.terms.issued,
                     Literal(row[ISSUED], datatype=XSD['date'])),
                    (self.terms.subject, URIRef(row[SUBJECT])),
                ]
                for predicate, obj in statements:
                    self._g.add((term, predicate, obj))
Example #6
0
    def add_metadata(self, metadata_path):
        """Adds the dataset metadata to the graph

        Args:
            metadata_path (str): Path to a csv formatted file with the dataset
                metadata

        Returns:
            None

        """
        if metadata_path is None:
            log("Metadata file is not provided")
            return

        if not os.path.isfile(metadata_path):
            log("{} file not found".format(metadata_path))
            return
        log("Metadata processing: {}".format(metadata_path))

        with open(metadata_path) as f:
            for row in csv.DictReader(f):
                # One dcat-style dataset node per metadata row.
                term = URIRef(row[DATASET_ID])
                statements = [
                    (self.terms.rdf_type, self.terms.dataset_type),
                    (self.terms.title, Literal(row[TITLE])),
                    (self.terms.description, Literal(row[DESCRIPTION])),
                    (self.terms.publisher, Literal(row[PUBLISHER])),
                    (self.terms.issued,
                     Literal(row[ISSUED], datatype=XSD['date'])),
                    (self.terms.subject, URIRef(row[SUBJECT])),
                ]
                for predicate, obj in statements:
                    self._g.add((term, predicate, obj))
Example #7
0
    def add_observations(self, observations):
        """Adds a set of observations to the RDF graph

        Args:
            observations (str): Path to the observations csv file.

        Returns:
            None

        Raises:
            KeyError: If a field has no rdfs:range recorded in the graph.

        """
        log("Processing: {}".format(observations))

        # Constants.
        if self._datadict:
            dd = self.ns.get(PROJECT)[self._datadict]
        else:
            dd = URIRef(self._datadict)
        dataset_iri = self._get_dataset_iri()
        reader = get_dict_reader(observations)
        # Removed the unused `index` counter (dead code).
        for row in reader:
            obs = self._get_sha1_iri(row)
            slice_vals = [row.get(i) for i in self._dimensions[1:]]
            slice_iri = self._get_sha1_iri(slice_vals)
            self._g.add(
                (obs, self.terms.rdf_type, self.terms.observation_type))
            self._g.add((obs, self.terms.dataset, dataset_iri))
            self._g.add(
                (slice_iri, self.terms.rdf_type, self.terms.slice_type))
            self._g.add((dataset_iri, self.terms.slice, slice_iri))
            self._g.add((slice_iri, self.terms.slice_structure, dd))
            # .items() instead of .iteritems() for Py2/Py3 compatibility.
            for field_name, value in row.items():
                # *_label columns are presentation-only; skip them.
                if field_name.endswith('_label'):
                    continue
                field_name_iri = self.ns.get(PROJECT)[field_name]
                # Get the rdfs:range to determine datatype.
                rdfs_ranges = list(
                    self._g.objects(field_name_iri, self.terms.rdfs_range))
                if not rdfs_ranges:
                    error = "rdfs:range not set for {}.".format(field_name_iri)
                    log(error)
                    raise KeyError(error)
                rdfs_range_iri = rdfs_ranges[0]
                # Only include the first dimension at the observation level.
                if field_name not in self._dimensions[1:]:
                    # If the range is not an XSD Literal (i.e., this is an
                    # object property), use coded iri.
                    xsd = str(XSD[''].defrag())
                    if str(rdfs_range_iri.defrag()) != xsd:
                        coded_iri = self._convert_literal_to_coded_iri(
                            rdfs_range_iri, value)
                        self._g.add((obs, field_name_iri, coded_iri))
                    else:
                        self._g.add((obs, field_name_iri,
                                     Literal(value,
                                             datatype=rdfs_range_iri)))
                    self._g.add((slice_iri, self.terms.observation, obs))
                else:
                    # Add slice indices.
                    coded_iri = self._convert_literal_to_coded_iri(
                        rdfs_range_iri, value)
                    self._g.add((slice_iri, field_name_iri, coded_iri))
Example #8
0
    def add_observations(self, observations):
        """Adds a set of observations to the RDF graph

        Args:
            observations (str): Path to the observations csv file.

        Returns:
            None

        Raises:
            KeyError: If a field has no rdfs:range recorded in the graph.

        """
        if not os.path.isfile(observations):
            # Use log() for consistency with the rest of the module
            # (was a stray print()).
            log("{} file not found".format(observations))
            return
        log("Processing: {}".format(observations))

        # constants
        if self._datadict:
            dd = self.ns.get('sibis')[self._datadict]
        else:
            dd = URIRef(self._datadict)
        dataset_uriref = list(
            self._g.subjects(self.terms.rdf_type, self.terms.dataset_type))
        if dataset_uriref:
            dataset_uri = dataset_uriref[0]
        else:
            dataset_uri = URIRef("")

        with open(observations) as f:
            reader = csv.DictReader(f)
            # Removed the unused `index` counter (dead code).
            for row in reader:
                obs = self._get_sha1_iri(row)
                slice_vals = [row.get(i) for i in self._dimensions[1:]]
                slice_iri = self._get_sha1_iri(slice_vals)
                self._g.add(
                    (obs, self.terms.rdf_type, self.terms.observation_type))
                self._g.add((obs, self.terms.dataset, dataset_uri))
                self._g.add(
                    (slice_iri, self.terms.rdf_type, self.terms.slice_type))
                self._g.add((slice_iri, self.terms.slice_structure, dd))
                # .items() instead of .iteritems() for Py2/Py3 compatibility.
                for key, vals in self._config_dict.items():
                    field_name = vals[FIELD_NAME]
                    field_name_iri = self.ns.get("ncanda")[field_name]
                    # Only include the first dimension at the observation level.
                    if field_name not in self._dimensions[1:]:
                        # Get the rdfs:range to determine datatype.
                        rdfs_ranges = list(
                            self._g.objects(field_name_iri,
                                            self.terms.rdfs_range))
                        if not rdfs_ranges:
                            # Fail with a clear message instead of an
                            # unexplained IndexError on rdfs_ranges[0].
                            error = "rdfs:range not set for {}.".format(
                                field_name_iri)
                            log(error)
                            raise KeyError(error)
                        rdfs_range_iri = rdfs_ranges[0]
                        # If the range is not an XSD Literal (i.e., this is an
                        # object property), set to xsd:anyURI.
                        xsd = str(XSD[''].defrag())
                        if str(rdfs_range_iri.defrag()) != xsd:
                            rdfs_range_iri = XSD['anyURI']
                        # TODO: Use concept from code list instead of a Literal
                        # for coded values.
                        self._g.add((obs, field_name_iri,
                                     Literal(row[key],
                                             datatype=rdfs_range_iri)))
                        self._g.add((slice_iri, self.terms.observation, obs))
                    else:
                        # Add slice indices.
                        self._g.add(
                            (slice_iri, field_name_iri, Literal(row[key])))
Example #9
0
    def build_graph(self, dd, mapping):
        """Constructs a graph from the data dictionary using a config file.

        Args:
            dd (str): Path to the data dictionary csv file.
            mapping (str): Path to a csv formatted config with supplementary
                information file.

        Returns:
            None

        """
        self._build_config_lookup(mapping)
        if dd is None:
            log("Data dictionary file not provided")
            return

        if not os.path.isfile(dd):
            log("{} file not found".format(dd))
            return
        log("Processing: {}".format(dd))

        self._datadict = os.path.basename(dd)
        with open(dd) as f:
            reader = csv.DictReader(f)
            for row in reader:
                field_name = row[FIELD_NAME]
                field_label = row[FIELD_LABEL]
                self._fields.append(field_name)
                node = self.ns.get("ncanda")[field_name]
                # Default to MeasureProperty.
                prop = self.terms.measure_property_type
                # Prefer the explicit label; otherwise derive a
                # "Field Name" style label from field_name.
                if field_label:
                    label = field_label
                else:
                    # Bug fix: split field_name, not the (empty in this
                    # branch) field_label.
                    split = [i.capitalize() for i in field_name.split('_')]
                    label = ' '.join(split)
                self._g.add((node, self.terms.rdfs_label, Literal(label)))
                # Set prop for dimension properties.
                if (field_name in self._config_dict
                        and DIMENSION in self._config_dict[field_name]):
                    if self._config_dict[field_name][DIMENSION] == "y":
                        prop = self.terms.dimension_property_type
                self._g.add((node, self.terms.rdf_type, prop))
                self._g.add(
                    (node, self.terms.rdf_type, self.terms.property_type))
                # Annotate with Concepts.
                if (field_name in self._config_dict
                        and CONCEPT in self._config_dict[field_name]):
                    obj = URIRef(self._config_dict[field_name][CONCEPT])
                    self._g.add((node, self.terms.concept, obj))
                # Annotate with Range.
                if (field_name in self._config_dict
                        and RANGE in self._config_dict[field_name]):
                    xsd_type = URIRef(self._config_dict[field_name][RANGE])
                else:
                    xsd_type = self._data_element_type(row)
                self._g.add((node, self.terms.rdfs_range, xsd_type))
                # Annotate with Units.
                if (field_name in self._config_dict
                        and UNITS in self._config_dict[field_name]):
                    obj = URIRef(self._config_dict[field_name][UNITS])
                    self._g.add((node, self.terms.unit_measure, obj))
                # Annotate with Statistic.
                if (field_name in self._config_dict
                        and STATISTIC in self._config_dict[field_name]):
                    obj = URIRef(self._config_dict[field_name][STATISTIC])
                    self._g.add((node, self.terms.statistic, obj))
                # Todo: Create qb:codeList for dimension and categorical data
                if (field_name in self._config_dict and row[CHOICES]):
                    # Create a skos:Concept Class.
                    class_label = ''.join(
                        [i.capitalize() for i in field_name.split('_')])
                    class_uri = self.ns.get("ncanda")[class_label]
                    self._g.add(
                        (class_uri, self.terms.rdf_type, self.terms.owl_class))
                    self._g.add((class_uri, self.terms.rdfs_subclass_of,
                                 self.terms.concept_type))
                    title = "Code List Class for '{}' term."
                    self._g.add((class_uri, self.terms.rdfs_label,
                                 Literal(title.format(field_label))))
                    # Create a skos:ConceptScheme.
                    scheme_label = "{}_concept_scheme".format(field_name)
                    concept_scheme_uri = self.ns.get("ncanda")[scheme_label]
                    self._g.add((concept_scheme_uri, self.terms.rdf_type,
                                 self.terms.concept_scheme_type))
                    self._g.add((concept_scheme_uri, self.terms.notation,
                                 Literal(field_name)))
                    self._g.add(
                        (concept_scheme_uri, self.terms.rdfs_label,
                         Literal(
                             "Code List for '{}' term.".format(field_label))))
                    self._g.add((class_uri, self.terms.rdfs_see_also,
                                 concept_scheme_uri))
                    choices = row[CHOICES].split("|")
                    # Create skos:Concept for each code.
                    for choice in choices:
                        # Split on the first comma only, so code labels may
                        # themselves contain commas.
                        k, v = choice.split(',', 1)
                        code = k.strip()
                        code_label = v.strip()
                        choice_uri = self.ns.get("ncanda")['-'.join(
                            [field_name, code])]
                        self._g.add((choice_uri, self.terms.rdf_type,
                                     self.terms.concept_type))
                        self._g.add(
                            (choice_uri, self.terms.rdf_type, class_uri))
                        self._g.add(
                            (choice_uri, self.terms.notation, Literal(code)))
                        self._g.add((choice_uri, self.terms.top_concept_of,
                                     concept_scheme_uri))
                        self._g.add((choice_uri, self.terms.pref_label,
                                     Literal(code_label)))
                        self._g.add((concept_scheme_uri,
                                     self.terms.has_top_concept, choice_uri))
                        self._g.add((concept_scheme_uri, self.terms.in_scheme,
                                     choice_uri))
Example #10
0
    def build_graph(self, dd, mapping):
        """Constructs a graph from the data dictionary using a config file.

        Args:
            dd (str): Path to the data dictionary csv file.
            mapping (str): Path to a csv formatted config with supplementary
                information file.

        Returns:
            None

        """
        self._build_config_lookup(mapping)
        if dd is None:
            log("Data dictionary file not provided")
            return

        if not os.path.isfile(dd):
            log("{} file not found".format(dd))
            return
        log("Processing: {}".format(dd))

        self._datadict = os.path.basename(dd)
        with open(dd) as f:
            reader = csv.DictReader(f)
            for row in reader:
                field_name = row[FIELD_NAME]
                field_label = row[FIELD_LABEL]
                self._fields.append(field_name)
                node = self.ns.get("ncanda")[field_name]
                # Default to MeasureProperty.
                prop = self.terms.measure_property_type
                # Prefer the explicit label; otherwise derive a
                # "Field Name" style label from field_name.
                if field_label:
                    label = field_label
                else:
                    # Bug fix: split field_name, not the (empty in this
                    # branch) field_label.
                    split = [i.capitalize() for i in field_name.split('_')]
                    label = ' '.join(split)
                self._g.add((node, self.terms.rdfs_label, Literal(label)))
                # Set prop for dimension properties.
                if (field_name in self._config_dict and
                        DIMENSION in self._config_dict[field_name]):
                    if self._config_dict[field_name][DIMENSION] == "y":
                        prop = self.terms.dimension_property_type
                self._g.add((node, self.terms.rdf_type, prop))
                self._g.add((node,
                             self.terms.rdf_type,
                             self.terms.property_type))
                # Annotate with Concepts.
                if (field_name in self._config_dict and
                        CONCEPT in self._config_dict[field_name]):
                    obj = URIRef(self._config_dict[field_name][CONCEPT])
                    self._g.add((node, self.terms.concept, obj))
                # Annotate with Range.
                if (field_name in self._config_dict and
                        RANGE in self._config_dict[field_name]):
                    xsd_type = URIRef(self._config_dict[field_name][RANGE])
                else:
                    xsd_type = self._data_element_type(row)
                self._g.add((node, self.terms.rdfs_range, xsd_type))
                # Annotate with Units.
                if (field_name in self._config_dict and
                        UNITS in self._config_dict[field_name]):
                    obj = URIRef(self._config_dict[field_name][UNITS])
                    self._g.add((node, self.terms.unit_measure, obj))
                # Annotate with Statistic.
                if (field_name in self._config_dict and
                        STATISTIC in self._config_dict[field_name]):
                    obj = URIRef(self._config_dict[field_name][STATISTIC])
                    self._g.add((node, self.terms.statistic, obj))
                # Todo: Create qb:codeList for dimension and categorical data
                if (field_name in self._config_dict and
                        row[CHOICES]):
                    # Create a skos:Concept Class.
                    class_label = ''.join([i.capitalize()
                                           for i in field_name.split('_')])
                    class_uri = self.ns.get("ncanda")[class_label]
                    self._g.add((class_uri,
                                 self.terms.rdf_type,
                                 self.terms.owl_class))
                    self._g.add((class_uri,
                                 self.terms.rdfs_subclass_of,
                                 self.terms.concept_type))
                    title = "Code List Class for '{}' term."
                    self._g.add((class_uri,
                                 self.terms.rdfs_label,
                                 Literal(title.format(
                                     field_label))))
                    # Create a skos:ConceptScheme.
                    scheme_label = "{}_concept_scheme".format(field_name)
                    concept_scheme_uri = self.ns.get("ncanda")[scheme_label]
                    self._g.add((concept_scheme_uri,
                                 self.terms.rdf_type,
                                 self.terms.concept_scheme_type))
                    self._g.add((concept_scheme_uri,
                                 self.terms.notation,
                                 Literal(field_name)))
                    self._g.add((concept_scheme_uri,
                                 self.terms.rdfs_label,
                                 Literal("Code List for '{}' term.".format(
                                     field_label))))
                    self._g.add((class_uri,
                                 self.terms.rdfs_see_also,
                                 concept_scheme_uri))
                    choices = row[CHOICES].split("|")
                    # Create skos:Concept for each code.
                    for choice in choices:
                        # Split on the first comma only, so code labels may
                        # themselves contain commas.
                        k, v = choice.split(',', 1)
                        code = k.strip()
                        code_label = v.strip()
                        choice_uri = self.ns.get("ncanda")['-'.join(
                            [field_name, code])]
                        self._g.add((choice_uri,
                                     self.terms.rdf_type,
                                     self.terms.concept_type))
                        self._g.add((choice_uri,
                                     self.terms.rdf_type,
                                     class_uri))
                        self._g.add((choice_uri,
                                     self.terms.notation,
                                     Literal(code)))
                        self._g.add((choice_uri,
                                     self.terms.top_concept_of,
                                     concept_scheme_uri))
                        self._g.add((choice_uri,
                                     self.terms.pref_label,
                                     Literal(code_label)))
                        self._g.add((concept_scheme_uri,
                                     self.terms.has_top_concept,
                                     choice_uri))
                        self._g.add((concept_scheme_uri,
                                     self.terms.in_scheme,
                                     choice_uri))
Example #11
0
    def add_observations(self, observations):
        """Adds a set of observations to the RDF graph

        Args:
            observations (str): Path to the observations csv file.

        Returns:
            None

        Raises:
            KeyError: If a field has no rdfs:range recorded in the graph.

        """
        if not os.path.isfile(observations):
            # Use log() for consistency with the rest of the module
            # (was a stray print()).
            log("{} file not found".format(observations))
            return
        log("Processing: {}".format(observations))

        # constants
        if self._datadict:
            dd = self.ns.get('sibis')[self._datadict]
        else:
            dd = URIRef(self._datadict)
        dataset_uriref = list(self._g.subjects(self.terms.rdf_type,
                                               self.terms.dataset_type))
        if dataset_uriref:
            dataset_uri = dataset_uriref[0]
        else:
            dataset_uri = URIRef("")

        with open(observations) as f:
            reader = csv.DictReader(f)
            # Removed the unused `index` counter (dead code).
            for row in reader:
                obs = self._get_sha1_iri(row)
                slice_vals = [row.get(i) for i in self._dimensions[1:]]
                slice_iri = self._get_sha1_iri(slice_vals)
                self._g.add((obs,
                             self.terms.rdf_type,
                             self.terms.observation_type))
                self._g.add((obs, self.terms.dataset, dataset_uri))
                self._g.add((slice_iri,
                             self.terms.rdf_type,
                             self.terms.slice_type))
                self._g.add((slice_iri, self.terms.slice_structure, dd))
                # .items() instead of .iteritems() for Py2/Py3 compatibility.
                for key, vals in self._config_dict.items():
                    field_name = vals[FIELD_NAME]
                    field_name_iri = self.ns.get("ncanda")[field_name]
                    # Only include the first dimension at the observation level.
                    if field_name not in self._dimensions[1:]:
                        # Get the rdfs:range to determine datatype.
                        rdfs_ranges = list(self._g.objects(
                            field_name_iri, self.terms.rdfs_range))
                        if not rdfs_ranges:
                            # Fail with a clear message instead of an
                            # unexplained IndexError on rdfs_ranges[0].
                            error = "rdfs:range not set for {}.".format(
                                field_name_iri)
                            log(error)
                            raise KeyError(error)
                        rdfs_range_iri = rdfs_ranges[0]
                        # If the range is not an XSD Literal (i.e., this is an
                        # object property), set to xsd:anyURI.
                        xsd = str(XSD[''].defrag())
                        if str(rdfs_range_iri.defrag()) != xsd:
                            rdfs_range_iri = XSD['anyURI']
                        # TODO: Use concept from code list instead of a Literal
                        # for coded values.
                        self._g.add((obs,
                                     field_name_iri,
                                     Literal(row[key],
                                             datatype=rdfs_range_iri)))
                        self._g.add((slice_iri, self.terms.observation, obs))
                    else:
                        # Add slice indices.
                        self._g.add((slice_iri,
                                     field_name_iri,
                                     Literal(row[key])))
Example #12
0
    def add_observations(self, observations):
        """Adds a set of observations to the RDF graph

        Args:
            observations (str): Path to the observations csv file.

        Returns:
            None

        Raises:
            KeyError: If a field has no rdfs:range recorded in the graph.

        """
        log("Processing: {}".format(observations))

        # Constants.
        if self._datadict:
            dd = self.ns.get(PROJECT)[self._datadict]
        else:
            dd = URIRef(self._datadict)
        dataset_iri = self._get_dataset_iri()
        reader = get_dict_reader(observations)
        # Removed the unused `index` counter (dead code).
        for row in reader:
            obs = self._get_sha1_iri(row)
            slice_vals = [row.get(i) for i in self._dimensions[1:]]
            slice_iri = self._get_sha1_iri(slice_vals)
            self._g.add((obs,
                         self.terms.rdf_type,
                         self.terms.observation_type))
            self._g.add((obs, self.terms.dataset, dataset_iri))
            self._g.add((slice_iri,
                         self.terms.rdf_type,
                         self.terms.slice_type))
            self._g.add((dataset_iri, self.terms.slice, slice_iri))
            self._g.add((slice_iri, self.terms.slice_structure, dd))
            # .items() instead of .iteritems() for Py2/Py3 compatibility.
            for field_name, value in row.items():
                # *_label columns are presentation-only; skip them.
                if field_name.endswith('_label'):
                    continue
                field_name_iri = self.ns.get(PROJECT)[field_name]
                # Get the rdfs:range to determine datatype.
                rdfs_ranges = list(self._g.objects(
                    field_name_iri, self.terms.rdfs_range))
                if not rdfs_ranges:
                    error = "rdfs:range not set for {}.".format(field_name_iri)
                    log(error)
                    raise KeyError(error)
                rdfs_range_iri = rdfs_ranges[0]
                # Only include the first dimension at the observation level.
                if field_name not in self._dimensions[1:]:
                    # If the range is not an XSD Literal (i.e., this is an
                    # object property), use coded iri.
                    xsd = str(XSD[''].defrag())
                    if str(rdfs_range_iri.defrag()) != xsd:
                        coded_iri = self._convert_literal_to_coded_iri(
                            rdfs_range_iri, value)
                        self._g.add((obs,
                                     field_name_iri,
                                     coded_iri))
                    else:
                        self._g.add((obs,
                                     field_name_iri,
                                     Literal(value,
                                             datatype=rdfs_range_iri)))
                    self._g.add((slice_iri, self.terms.observation, obs))
                else:
                    # Add slice indices.
                    coded_iri = self._convert_literal_to_coded_iri(
                        rdfs_range_iri, value)
                    self._g.add((slice_iri, field_name_iri, coded_iri))