def _build_config_lookup(self, config):
    """Loads the supplementary mapping csv into self._config_dict.

    Each row is keyed by its FIELD_NAME column; empty-string values are
    dropped so later `in` membership checks only see populated columns.

    Args:
        config (str): Path to the csv mapping file, or None.

    Returns:
        None
    """
    if config is None:
        log("Mapping file not provided")
        return
    if not os.path.isfile(config):
        log("{} file not found".format(config))
        return
    with open(config) as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Drop empty values. Use != rather than `is not`: identity
            # comparison against a string literal is implementation
            # dependent and can fail to filter equal-but-distinct
            # empty strings.
            res = dict((k, v) for k, v in row.iteritems() if v != "")
            self._config_dict[row[FIELD_NAME]] = res
def build_graph(self, dd, mapping):
    """Builds the RDF graph from a data dictionary csv file.

    Args:
        dd (str): Path to the data dictionary csv file.
        mapping (str): Path to a csv formatted config with supplementary
            information file (currently unused by this variant).

    Returns:
        None
    """
    log("Processing: {}".format(dd))
    # NOTE(review): config lookup is disabled in this variant — confirm
    # whether it should be re-enabled.
    # self._build_config_lookup(mapping)
    self._build_datadict(dd)
def add_metadata(self, metadata_path):
    """Adds the dataset metadata to the graph

    Args:
        metadata_path (str): Path to a csv formatted file with the
            dataset metadata

    Returns:
        None
    """
    if metadata_path is None:
        log("Metadata file is not provided")
        return
    if not os.path.isfile(metadata_path):
        log("{} file not found".format(metadata_path))
        return
    log("Metadata processing: {}".format(metadata_path))
    with open(metadata_path) as md_file:
        for record in csv.DictReader(md_file):
            dataset = URIRef(record[DATASET_ID])
            # Type the dataset node, then attach each metadata value.
            self._g.add((dataset, self.terms.rdf_type,
                         self.terms.dataset_type))
            self._g.add((dataset, self.terms.title,
                         Literal(record[TITLE])))
            self._g.add((dataset, self.terms.description,
                         Literal(record[DESCRIPTION])))
            self._g.add((dataset, self.terms.publisher,
                         Literal(record[PUBLISHER])))
            # The issued date is typed as xsd:date.
            self._g.add((dataset, self.terms.issued,
                         Literal(record[ISSUED], datatype=XSD['date'])))
            # The subject is itself an IRI rather than a literal.
            self._g.add((dataset, self.terms.subject,
                         URIRef(record[SUBJECT])))
def add_metadata(self, metadata_path):
    """Adds the dataset metadata to the graph

    Args:
        metadata_path (str): Path to a csv formatted file with the
            dataset metadata

    Returns:
        None
    """
    if metadata_path is None:
        log("Metadata file is not provided")
        return
    if not os.path.isfile(metadata_path):
        log("{} file not found".format(metadata_path))
        return
    log("Metadata processing: {}".format(metadata_path))
    with open(metadata_path) as fh:
        for row in csv.DictReader(fh):
            subject_node = URIRef(row[DATASET_ID])
            # Build the predicate/object pairs up front, then attach
            # them all to the dataset node.
            triples = [
                (self.terms.rdf_type, self.terms.dataset_type),
                (self.terms.title, Literal(row[TITLE])),
                (self.terms.description, Literal(row[DESCRIPTION])),
                (self.terms.publisher, Literal(row[PUBLISHER])),
                # The issued date is typed as xsd:date.
                (self.terms.issued,
                 Literal(row[ISSUED], datatype=XSD['date'])),
                # The subject is itself an IRI rather than a literal.
                (self.terms.subject, URIRef(row[SUBJECT])),
            ]
            for predicate, obj in triples:
                self._g.add((subject_node, predicate, obj))
def add_observations(self, observations):
    """Adds a set of observations to the RDF graph

    Each csv row becomes an observation; rows sharing the same values
    for the non-leading dimensions are attached to a common slice node.

    Args:
        observations (str): Path to the observations csv file.

    Returns:
        None

    Raises:
        KeyError: If a field has no rdfs:range recorded in the graph.
    """
    log("Processing: {}".format(observations))
    # Constants.
    if self._datadict:
        dd = self.ns.get(PROJECT)[self._datadict]
    else:
        dd = URIRef(self._datadict)
    dataset_iri = self._get_dataset_iri()
    reader = get_dict_reader(observations)
    for row in reader:
        # The observation IRI hashes the full row; the slice IRI hashes
        # only the trailing dimension values so rows can share a slice.
        obs = self._get_sha1_iri(row)
        slice_vals = [row.get(i) for i in self._dimensions[1:]]
        slice_iri = self._get_sha1_iri(slice_vals)
        self._g.add(
            (obs, self.terms.rdf_type, self.terms.observation_type))
        self._g.add((obs, self.terms.dataset, dataset_iri))
        self._g.add(
            (slice_iri, self.terms.rdf_type, self.terms.slice_type))
        self._g.add((dataset_iri, self.terms.slice, slice_iri))
        self._g.add((slice_iri, self.terms.slice_structure, dd))
        for field_name, value in row.iteritems():
            # Columns ending in '_label' are companions to coded
            # columns; skip them.
            if field_name.endswith('_label'):
                continue
            field_name_iri = self.ns.get(PROJECT)[field_name]
            # Get the rdfs:range to determine datatype.
            rdfs_ranges = list(
                self._g.objects(field_name_iri, self.terms.rdfs_range))
            if rdfs_ranges:
                rdfs_range_iri = rdfs_ranges[0]
            else:
                error = "rdfs:range not set for {}.".format(field_name_iri)
                log(error)
                raise KeyError(error)
            # Only include the first dimension at the observation level.
            if field_name not in self._dimensions[1:]:
                # If the range is not an XSD Literal (i.e., this is an
                # object property), use coded iri.
                xsd = str(XSD[''].defrag())
                if str(rdfs_range_iri.defrag()) != xsd:
                    coded_iri = self._convert_literal_to_coded_iri(
                        rdfs_range_iri, value)
                    self._g.add((obs, field_name_iri, coded_iri))
                else:
                    self._g.add((obs, field_name_iri,
                                 Literal(value, datatype=rdfs_range_iri)))
                self._g.add((slice_iri, self.terms.observation, obs))
            else:
                # Add slice indices.
                coded_iri = self._convert_literal_to_coded_iri(
                    rdfs_range_iri, value)
                self._g.add((slice_iri, field_name_iri, coded_iri))
def add_observations(self, observations):
    """Adds a set of observations to the RDF graph

    Args:
        observations (str): Path to the observations csv file.

    Returns:
        None

    Raises:
        KeyError: If a field has no rdfs:range recorded in the graph.
    """
    if not os.path.isfile(observations):
        # Use log() like the rest of the module instead of print().
        log("{} file not found".format(observations))
        return
    log("Processing: {}".format(observations))
    # constants
    if self._datadict:
        dd = self.ns.get('sibis')[self._datadict]
    else:
        dd = URIRef(self._datadict)
    # Locate the dataset node previously added to the graph; fall back
    # to an empty IRI when none exists.
    dataset_uriref = list(
        self._g.subjects(self.terms.rdf_type, self.terms.dataset_type))
    if dataset_uriref:
        dataset_uri = dataset_uriref[0]
    else:
        dataset_uri = URIRef("")
    with open(observations) as f:
        reader = csv.DictReader(f)
        for row in reader:
            obs = self._get_sha1_iri(row)
            slice_vals = [row.get(i) for i in self._dimensions[1:]]
            slice_iri = self._get_sha1_iri(slice_vals)
            self._g.add(
                (obs, self.terms.rdf_type, self.terms.observation_type))
            self._g.add((obs, self.terms.dataset, dataset_uri))
            self._g.add(
                (slice_iri, self.terms.rdf_type, self.terms.slice_type))
            self._g.add((slice_iri, self.terms.slice_structure, dd))
            for key, vals in self._config_dict.iteritems():
                field_name = vals[FIELD_NAME]
                field_name_iri = self.ns.get("ncanda")[field_name]
                # Only include the first dimension at the observation
                # level.
                if field_name not in self._dimensions[1:]:
                    # Get the rdfs:range to determine datatype.
                    rdfs_ranges = list(self._g.objects(
                        field_name_iri, self.terms.rdfs_range))
                    if not rdfs_ranges:
                        # Fail with a clear message instead of a bare
                        # IndexError when the range was never recorded.
                        error = "rdfs:range not set for {}.".format(
                            field_name_iri)
                        log(error)
                        raise KeyError(error)
                    rdfs_range_iri = rdfs_ranges[0]
                    # If the range is not an XSD Literal (i.e., this is an
                    # object property), set to xsd:anyURI.
                    xsd = str(XSD[''].defrag())
                    if str(rdfs_range_iri.defrag()) != xsd:
                        rdfs_range_iri = XSD['anyURI']
                    # TODO: Use concept from code list instead of a Literal
                    # for coded values.
                    self._g.add((obs, field_name_iri,
                                 Literal(row[key],
                                         datatype=rdfs_range_iri)))
                    self._g.add((slice_iri, self.terms.observation, obs))
                else:
                    # Add slice indices.
                    self._g.add(
                        (slice_iri, field_name_iri, Literal(row[key])))
def build_graph(self, dd, mapping):
    """Constructs a graph from the data dictionary using a config file.

    Args:
        dd (str): Path to the data dictionary csv file.
        mapping (str): Path to a csv formatted config with supplementary
            information file.

    Returns:
        None
    """
    self._build_config_lookup(mapping)
    if dd is None:
        log("Data dictionary file not provided")
        return
    if not os.path.isfile(dd):
        log("{} file not found".format(dd))
        return
    log("Processing: {}".format(dd))
    self._datadict = os.path.basename(dd)
    with open(dd) as f:
        reader = csv.DictReader(f)
        for row in reader:
            field_name = row[FIELD_NAME]
            field_label = row[FIELD_LABEL]
            self._fields.append(field_name)
            node = self.ns.get("ncanda")[field_name]
            # Default to MeasureProperty.
            prop = self.terms.measure_property_type
            # Use the explicit label when present; otherwise derive a
            # "Field Name" style label from the field name.
            if field_label:
                label = field_label
            else:
                # BUG FIX: derive the fallback from field_name; the
                # original split field_label, which is empty in this
                # branch, so the label was always ''.
                split = [i.capitalize() for i in field_name.split('_')]
                label = ' '.join(split)
            self._g.add((node, self.terms.rdfs_label, Literal(label)))
            # Set prop for dimension properties.
            if (field_name in self._config_dict
                    and DIMENSION in self._config_dict[field_name]):
                if self._config_dict[field_name][DIMENSION] == "y":
                    prop = self.terms.dimension_property_type
            self._g.add((node, self.terms.rdf_type, prop))
            self._g.add(
                (node, self.terms.rdf_type, self.terms.property_type))
            # Annotate with Concepts.
            if (field_name in self._config_dict
                    and CONCEPT in self._config_dict[field_name]):
                obj = URIRef(self._config_dict[field_name][CONCEPT])
                self._g.add((node, self.terms.concept, obj))
            # Annotate with Range.
            if (field_name in self._config_dict
                    and RANGE in self._config_dict[field_name]):
                xsd_type = URIRef(self._config_dict[field_name][RANGE])
            else:
                xsd_type = self._data_element_type(row)
            self._g.add((node, self.terms.rdfs_range, xsd_type))
            # Annotate with Units.
            if (field_name in self._config_dict
                    and UNITS in self._config_dict[field_name]):
                obj = URIRef(self._config_dict[field_name][UNITS])
                self._g.add((node, self.terms.unit_measure, obj))
            # Annotate with Statistic.
            if (field_name in self._config_dict
                    and STATISTIC in self._config_dict[field_name]):
                obj = URIRef(self._config_dict[field_name][STATISTIC])
                self._g.add((node, self.terms.statistic, obj))
            # Todo: Create qb:codeList for dimension and categorical data
            if (field_name in self._config_dict and row[CHOICES]):
                # Create a skos:Concept Class.
                class_label = ''.join(
                    [i.capitalize() for i in field_name.split('_')])
                class_uri = self.ns.get("ncanda")[class_label]
                self._g.add(
                    (class_uri, self.terms.rdf_type, self.terms.owl_class))
                self._g.add((class_uri, self.terms.rdfs_subclass_of,
                             self.terms.concept_type))
                title = "Code List Class for '{}' term."
                self._g.add((class_uri, self.terms.rdfs_label,
                             Literal(title.format(field_label))))
                # Create a skos:ConceptScheme.
                scheme_label = "{}_concept_scheme".format(field_name)
                concept_scheme_uri = self.ns.get("ncanda")[scheme_label]
                self._g.add((concept_scheme_uri, self.terms.rdf_type,
                             self.terms.concept_scheme_type))
                self._g.add((concept_scheme_uri, self.terms.notation,
                             Literal(field_name)))
                self._g.add(
                    (concept_scheme_uri, self.terms.rdfs_label, Literal(
                        "Code List for '{}' term.".format(field_label))))
                self._g.add((class_uri, self.terms.rdfs_see_also,
                             concept_scheme_uri))
                choices = row[CHOICES].split("|")
                # Create skos:Concept for each code.
                for choice in choices:
                    # Split on the first comma only so code labels that
                    # themselves contain commas do not raise ValueError.
                    k, v = choice.split(',', 1)
                    code = k.strip()
                    code_label = v.strip()
                    choice_uri = self.ns.get("ncanda")['-'.join(
                        [field_name, code])]
                    self._g.add((choice_uri, self.terms.rdf_type,
                                 self.terms.concept_type))
                    self._g.add(
                        (choice_uri, self.terms.rdf_type, class_uri))
                    self._g.add(
                        (choice_uri, self.terms.notation, Literal(code)))
                    self._g.add((choice_uri, self.terms.top_concept_of,
                                 concept_scheme_uri))
                    self._g.add((choice_uri, self.terms.pref_label,
                                 Literal(code_label)))
                    self._g.add((concept_scheme_uri,
                                 self.terms.has_top_concept, choice_uri))
                    self._g.add((concept_scheme_uri, self.terms.in_scheme,
                                 choice_uri))
def build_graph(self, dd, mapping):
    """Constructs a graph from the data dictionary using a config file.

    Args:
        dd (str): Path to the data dictionary csv file.
        mapping (str): Path to a csv formatted config with supplementary
            information file.

    Returns:
        None
    """
    self._build_config_lookup(mapping)
    if dd is None:
        log("Data dictionary file not provided")
        return
    if not os.path.isfile(dd):
        log("{} file not found".format(dd))
        return
    log("Processing: {}".format(dd))
    self._datadict = os.path.basename(dd)
    with open(dd) as f:
        reader = csv.DictReader(f)
        for row in reader:
            field_name = row[FIELD_NAME]
            field_label = row[FIELD_LABEL]
            self._fields.append(field_name)
            node = self.ns.get("ncanda")[field_name]
            # Default to MeasureProperty.
            prop = self.terms.measure_property_type
            # Use the explicit label when present; otherwise derive a
            # "Field Name" style label from the field name.
            if field_label:
                label = field_label
            else:
                # BUG FIX: derive the fallback from field_name; the
                # original split field_label, which is empty in this
                # branch, so the label was always ''.
                split = [i.capitalize() for i in field_name.split('_')]
                label = ' '.join(split)
            self._g.add((node, self.terms.rdfs_label, Literal(label)))
            # Set prop for dimension properties.
            if (field_name in self._config_dict
                    and DIMENSION in self._config_dict[field_name]):
                if self._config_dict[field_name][DIMENSION] == "y":
                    prop = self.terms.dimension_property_type
            self._g.add((node, self.terms.rdf_type, prop))
            self._g.add((node, self.terms.rdf_type,
                         self.terms.property_type))
            # Annotate with Concepts.
            if (field_name in self._config_dict
                    and CONCEPT in self._config_dict[field_name]):
                obj = URIRef(self._config_dict[field_name][CONCEPT])
                self._g.add((node, self.terms.concept, obj))
            # Annotate with Range.
            if (field_name in self._config_dict
                    and RANGE in self._config_dict[field_name]):
                xsd_type = URIRef(self._config_dict[field_name][RANGE])
            else:
                xsd_type = self._data_element_type(row)
            self._g.add((node, self.terms.rdfs_range, xsd_type))
            # Annotate with Units.
            if (field_name in self._config_dict
                    and UNITS in self._config_dict[field_name]):
                obj = URIRef(self._config_dict[field_name][UNITS])
                self._g.add((node, self.terms.unit_measure, obj))
            # Annotate with Statistic.
            if (field_name in self._config_dict
                    and STATISTIC in self._config_dict[field_name]):
                obj = URIRef(self._config_dict[field_name][STATISTIC])
                self._g.add((node, self.terms.statistic, obj))
            # Todo: Create qb:codeList for dimension and categorical data
            if (field_name in self._config_dict and row[CHOICES]):
                # Create a skos:Concept Class.
                class_label = ''.join([i.capitalize()
                                       for i in field_name.split('_')])
                class_uri = self.ns.get("ncanda")[class_label]
                self._g.add((class_uri, self.terms.rdf_type,
                             self.terms.owl_class))
                self._g.add((class_uri, self.terms.rdfs_subclass_of,
                             self.terms.concept_type))
                title = "Code List Class for '{}' term."
                self._g.add((class_uri, self.terms.rdfs_label,
                             Literal(title.format(field_label))))
                # Create a skos:ConceptScheme.
                scheme_label = "{}_concept_scheme".format(field_name)
                concept_scheme_uri = self.ns.get("ncanda")[scheme_label]
                self._g.add((concept_scheme_uri, self.terms.rdf_type,
                             self.terms.concept_scheme_type))
                self._g.add((concept_scheme_uri, self.terms.notation,
                             Literal(field_name)))
                self._g.add((concept_scheme_uri, self.terms.rdfs_label,
                             Literal("Code List for '{}' term.".format(
                                 field_label))))
                self._g.add((class_uri, self.terms.rdfs_see_also,
                             concept_scheme_uri))
                choices = row[CHOICES].split("|")
                # Create skos:Concept for each code.
                for choice in choices:
                    # Split on the first comma only so code labels that
                    # themselves contain commas do not raise ValueError.
                    k, v = choice.split(',', 1)
                    code = k.strip()
                    code_label = v.strip()
                    choice_uri = self.ns.get("ncanda")['-'.join(
                        [field_name, code])]
                    self._g.add((choice_uri, self.terms.rdf_type,
                                 self.terms.concept_type))
                    self._g.add((choice_uri, self.terms.rdf_type,
                                 class_uri))
                    self._g.add((choice_uri, self.terms.notation,
                                 Literal(code)))
                    self._g.add((choice_uri, self.terms.top_concept_of,
                                 concept_scheme_uri))
                    self._g.add((choice_uri, self.terms.pref_label,
                                 Literal(code_label)))
                    self._g.add((concept_scheme_uri,
                                 self.terms.has_top_concept, choice_uri))
                    self._g.add((concept_scheme_uri, self.terms.in_scheme,
                                 choice_uri))
def add_observations(self, observations):
    """Adds a set of observations to the RDF graph

    Args:
        observations (str): Path to the observations csv file.

    Returns:
        None

    Raises:
        KeyError: If a field has no rdfs:range recorded in the graph.
    """
    if not os.path.isfile(observations):
        # Use log() like the rest of the module instead of print().
        log("{} file not found".format(observations))
        return
    log("Processing: {}".format(observations))
    # constants
    if self._datadict:
        dd = self.ns.get('sibis')[self._datadict]
    else:
        dd = URIRef(self._datadict)
    # Locate the dataset node previously added to the graph; fall back
    # to an empty IRI when none exists.
    dataset_uriref = list(self._g.subjects(self.terms.rdf_type,
                                           self.terms.dataset_type))
    if dataset_uriref:
        dataset_uri = dataset_uriref[0]
    else:
        dataset_uri = URIRef("")
    with open(observations) as f:
        reader = csv.DictReader(f)
        for row in reader:
            obs = self._get_sha1_iri(row)
            slice_vals = [row.get(i) for i in self._dimensions[1:]]
            slice_iri = self._get_sha1_iri(slice_vals)
            self._g.add((obs, self.terms.rdf_type,
                         self.terms.observation_type))
            self._g.add((obs, self.terms.dataset, dataset_uri))
            self._g.add((slice_iri, self.terms.rdf_type,
                         self.terms.slice_type))
            self._g.add((slice_iri, self.terms.slice_structure, dd))
            for key, vals in self._config_dict.iteritems():
                field_name = vals[FIELD_NAME]
                field_name_iri = self.ns.get("ncanda")[field_name]
                # Only include the first dimension at the observation
                # level.
                if field_name not in self._dimensions[1:]:
                    # Get the rdfs:range to determine datatype.
                    rdfs_ranges = list(self._g.objects(
                        field_name_iri, self.terms.rdfs_range))
                    if not rdfs_ranges:
                        # Fail with a clear message instead of a bare
                        # IndexError when the range was never recorded.
                        error = "rdfs:range not set for {}.".format(
                            field_name_iri)
                        log(error)
                        raise KeyError(error)
                    rdfs_range_iri = rdfs_ranges[0]
                    # If the range is not an XSD Literal (i.e., this is an
                    # object property), set to xsd:anyURI.
                    xsd = str(XSD[''].defrag())
                    if str(rdfs_range_iri.defrag()) != xsd:
                        rdfs_range_iri = XSD['anyURI']
                    # TODO: Use concept from code list instead of a Literal
                    # for coded values.
                    self._g.add((obs, field_name_iri,
                                 Literal(row[key],
                                         datatype=rdfs_range_iri)))
                    self._g.add((slice_iri, self.terms.observation, obs))
                else:
                    # Add slice indices.
                    self._g.add((slice_iri, field_name_iri,
                                 Literal(row[key])))
def add_observations(self, observations):
    """Adds a set of observations to the RDF graph

    Each csv row becomes an observation; rows sharing the same values
    for the non-leading dimensions are attached to a common slice node.

    Args:
        observations (str): Path to the observations csv file.

    Returns:
        None

    Raises:
        KeyError: If a field has no rdfs:range recorded in the graph.
    """
    log("Processing: {}".format(observations))
    # Constants.
    if self._datadict:
        dd = self.ns.get(PROJECT)[self._datadict]
    else:
        dd = URIRef(self._datadict)
    dataset_iri = self._get_dataset_iri()
    reader = get_dict_reader(observations)
    for row in reader:
        # The observation IRI hashes the full row; the slice IRI hashes
        # only the trailing dimension values so rows can share a slice.
        obs = self._get_sha1_iri(row)
        slice_vals = [row.get(i) for i in self._dimensions[1:]]
        slice_iri = self._get_sha1_iri(slice_vals)
        self._g.add((obs, self.terms.rdf_type,
                     self.terms.observation_type))
        self._g.add((obs, self.terms.dataset, dataset_iri))
        self._g.add((slice_iri, self.terms.rdf_type,
                     self.terms.slice_type))
        self._g.add((dataset_iri, self.terms.slice, slice_iri))
        self._g.add((slice_iri, self.terms.slice_structure, dd))
        for field_name, value in row.iteritems():
            # Columns ending in '_label' are companions to coded
            # columns; skip them.
            if field_name.endswith('_label'):
                continue
            field_name_iri = self.ns.get(PROJECT)[field_name]
            # Get the rdfs:range to determine datatype.
            rdfs_ranges = list(self._g.objects(
                field_name_iri, self.terms.rdfs_range))
            if rdfs_ranges:
                rdfs_range_iri = rdfs_ranges[0]
            else:
                error = "rdfs:range not set for {}.".format(field_name_iri)
                log(error)
                raise KeyError(error)
            # Only include the first dimension at the observation level.
            if field_name not in self._dimensions[1:]:
                # If the range is not an XSD Literal (i.e., this is an
                # object property), use coded iri.
                xsd = str(XSD[''].defrag())
                if str(rdfs_range_iri.defrag()) != xsd:
                    coded_iri = self._convert_literal_to_coded_iri(
                        rdfs_range_iri, value)
                    self._g.add((obs, field_name_iri, coded_iri))
                else:
                    self._g.add((obs, field_name_iri,
                                 Literal(value,
                                         datatype=rdfs_range_iri)))
                self._g.add((slice_iri, self.terms.observation, obs))
            else:
                # Add slice indices.
                coded_iri = self._convert_literal_to_coded_iri(
                    rdfs_range_iri, value)
                self._g.add((slice_iri, field_name_iri, coded_iri))