def list_hierarchy(class_name, bases):
    """ Creates a list of the class hierarchy

    Args:
    -----
        class_name: name of the current class
        bases: list/tuple of bases for the current class
    """
    class_list = [Uri(class_name)]
    for base in bases:
        if base.__name__ not in IGNORE_CLASSES:
            class_list.append(Uri(base.__name__))
    return list(set(class_list))
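# A minimal usage sketch for list_hierarchy (the classes below are
# hypothetical; any base whose name appears in IGNORE_CLASSES is skipped):
#
#     class schema_Thing(RdfClassBase): ...
#     class schema_Person(schema_Thing): ...
#
#     list_hierarchy('schema_Person', (schema_Thing,))
#     # -> [Uri('schema_Person'), Uri('schema_Thing')]
#     # (order not guaranteed, since duplicates are removed via a set)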
class CSVProcessor(Processor):
    """CSV RDF Mapping Processor"""
    rdf_name = Uri("kdr:RmlCSVProcessor")

    def __init__(self, **kwargs):
        if "fields" in kwargs:
            self.fields = kwargs.pop("fields")
        rml_rules = kwargs.pop("rml_rules", [])
        csv_file = kwargs.pop("csv_file")
        # csv.DictReader expects a text-mode file object in Python 3
        self.reader = csv.DictReader(open(csv_file, 'r', newline=''))
        super(CSVProcessor, self).__init__(rml_rules, **kwargs)

    def __generate_reference__(self, triple_map, **kwargs):
        """Extracts the value of a column either by key or by position"""
        pass

    def execute(self, triple_map, **kwargs):
        """Method executes mapping between CSV source and output RDF

        args:
            triple_map(SimpleNamespace): Triple Map
        """
        pass

    def run(self, **kwargs):
        """Method runs through the CSV reader and applies the rules to
        each row."""
        pass
def get_uri_list(self, **kwargs):
    """ Returns a list of Uris to index """
    index_status_filter = """
            optional {{ ?s dcterm:modified ?modTime }} .
            optional {{ ?s kds:esIndexTime ?time }} .
            optional {{ ?s kds:esIndexError ?error }}
            filter (
                !(bound(?time))
                || ?time<?modTime
                || (bound(?error) && ?time < {idx_start_time}))
            """.format(idx_start_time=self.idx_start_time.sparql)
    items_query_template = """
            SELECT DISTINCT ?s ?es_id
            {{
                VALUES ?rdftypes {{\n\t\t{rdf_types} }} .
                ?s a ?rdftypes .
                BIND(SHA1(STR(?s)) as ?es_id) .
                {status_filter}
            }}
            {order_by}
            """
    status_filter = index_status_filter \
            if not kwargs.get("no_status") else ""
    order_by = kwargs.get("order_by", "")
    sparql = items_query_template.format(
        rdf_types="\n\t\t".join(self.rdf_types),
        status_filter=status_filter,
        order_by=order_by)
    results = [(Uri(item['s']['value']),
                item['es_id']['value'],)
               for item in self.tstore_conn.query(sparql=sparql)]
    return results
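# For illustration, with self.rdf_types == ['schema:Person'] and no kwargs,
# the rendered query looks roughly like this (prefixes and whitespace
# trimmed):
#
#     SELECT DISTINCT ?s ?es_id
#     {
#         VALUES ?rdftypes { schema:Person } .
#         ?s a ?rdftypes .
#         BIND(SHA1(STR(?s)) as ?es_id) .
#         optional { ?s dcterm:modified ?modTime } .
#         optional { ?s kds:esIndexTime ?time } .
#         optional { ?s kds:esIndexError ?error }
#         filter ( !(bound(?time)) || ?time<?modTime || ... )
#     }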
def get_all_item_data(items, conn, graph=None, output='json', **kwargs):
    """ queries a triplestore with the provided template or uses a generic
    template that returns triples 3 edges out in either direction from the
    provided item_uri

    args:
        items: the starting uri or list of uris for the query
        conn: the rdfframework triplestore connection to query against
        output: 'json' or 'rdf'

    kwargs:
        template: template to use in place of the generic template
        rdfclass: rdfclass the items are based on.
        filters: list of filters to apply
    """
    # set the jinja2 template to use
    if kwargs.get('template'):
        template = kwargs.pop('template')
    else:
        template = "sparqlAllItemDataTemplate.rq"
    # build the keyword arguments for the template
    template_kwargs = {"prefix": NSM.prefix(), "output": output}
    if isinstance(items, list):
        template_kwargs['uri_list'] = items
    else:
        template_kwargs['item_uri'] = Uri(items).sparql
    if kwargs.get("special_union"):
        template_kwargs['special_union'] = kwargs.get("special_union")
    if kwargs.get('rdfclass'):
        template_kwargs.update(kwargs['rdfclass'].query_kwargs)
    if kwargs.get("filters"):
        template_kwargs['filters'] = make_sparql_filter(kwargs.get('filters'))
    sparql = render_without_request(template, **template_kwargs)
    return conn.query(sparql, **kwargs)
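# A hedged usage sketch (the connection attribute and URI below are
# illustrative; conn must be an rdfframework triplestore connection):
#
#     data = get_all_item_data('<http://example.org/item/1>',
#                              conn=conn,
#                              output='json')
#     # 'data' holds the bindings for all triples within 3 edges of the
#     # item, ready to feed into an RdfDataset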
class AddClassHierarchyProcessor(PropertyProcessor, metaclass=PropSingleton): """ Adds the rdf:Class hierarchy URIs to the property's list of values. This is useful for indexing in elasticsearch when dealing with rdf:type. This way when doing a term search for a particular rdf:type all of the subclasses for that type will be included as well. Example: -------- For a property with 'schema_Person' as the associated class, ['schema:Thing', 'schema:Person'] will be added to the property list of values since 'schema:Person' is a subclass of 'schema:Thing' """ definition_uri = Uri('kdr:AddClassHierarchyProcessor') def __call__(self, prop): data = self.__data_source__(prop) rtn_list = [item for item in data] for prop_uri in prop.bound_class.hierarchy: rtn_list.append(prop_uri) rtn_list = list(set(rtn_list)) self.__set_data__(prop, rtn_list)
class ConvertObjectToStringProcessor(PropertyProcessor):
    """ Converts the object values of the property to strings

    Args:
    -----
        params: {'kds_lookupProperty': the name of the rdf property in the
                object value to convert to a string}

    Returns:
    --------
        strings for each object value
    """
    definition_uri = Uri('kdr:ConvertObjectToStringProcessor')

    def __init__(self, params=[{}], data_attr=None, classnames=[]):
        super().__init__(params, data_attr, classnames)
        str_prop = params[0].get('kds_lookupProperty')
        if str_prop:
            self.str_prop = str_prop[0]
        else:
            self.str_prop = None

    def __call__(self, prop):
        data = self.__data_source__(prop)
        rtn_list = []
        if self.str_prop:
            for val in data:
                if val.get(self.str_prop):
                    # accumulate across values instead of overwriting the
                    # list on each iteration
                    rtn_list += [str(item) for item in val[self.str_prop]]
        else:
            rtn_list = [str(item) for item in data]
        self.__set_data__(prop, rtn_list)
class AddClassProcessor(PropertyProcessor, metaclass=PropSingleton): """ Adds the rdf:Class URI to the property's list of values """ definition_uri = Uri('kdr:AddClassProcessor') def __call__(self, prop): prop += prop.bound_class.class_names
def get_def(prop_defs, def_fields, default_val=None):
    """ returns the cross-correlated fields for dealing with multiple
    vocabularies

    args:
        prop_defs: the property definition object
        def_fields: list of the mapped field names
        default_val: Default value if none of the fields are found
    """
    rtn_list = []
    for fld in def_fields:
        if prop_defs.get(fld):
            rtn_list += prop_defs.get(fld)
    if not rtn_list and default_val:
        rtn_list.append(default_val)
    elif rtn_list:
        try:
            rtn_list = list(set(rtn_list))
        except TypeError:
            # This deals with a domain that requires a conjunction of two
            # rdf_Classes
            new_rtn = []
            for item in rtn_list:
                if isinstance(item, MODULE.rdfclass.RdfClassBase):
                    new_rtn.append(
                        "|".join(merge_rdf_list(item['owl_unionOf'])))
                elif isinstance(item, list):
                    new_rtn.append("|".join(item))
                else:
                    new_rtn.append(item)
            rtn_list = list(set(new_rtn))
            new_rtn = []
            for item in rtn_list:
                if "|" in item:
                    new_rtn.append([Uri(domain)
                                    for domain in item.split("|")])
                else:
                    new_rtn.append(Uri(item))
            rtn_list = new_rtn
    return rtn_list
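# A worked example (field names illustrative): when a property is defined in
# two vocabularies, the mapped fields are merged and deduplicated:
#
#     get_def({'rdfs_domain': [Uri('schema_Person')],
#              'schema_domainIncludes': [Uri('schema_Person'),
#                                        Uri('schema_Organization')]},
#             ['rdfs_domain', 'schema_domainIncludes'])
#     # -> [Uri('schema_Person'), Uri('schema_Organization')]
#     #    (order not guaranteed)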
def unique_append(self, value):
    """ appends an item to the list only if it is not already present

    #! consider converting the underlying storage to a set
    """
    if value not in self:
        try:
            super(self.__class__, self).append(Uri(value))
        except AttributeError as err:
            if isinstance(value, MODULE.rdfclass.RdfClassBase):
                super(self.__class__, self).append(value)
            else:
                raise err
def set_list_predicates(self): """ Reads through the rml mappings and determines all fields that should map to a list/array with a json output """ results = self.rml.query(""" SELECT DISTINCT ?subj_class ?list_field { ?bn rr:datatype rdf:List . ?bn rr:predicate ?list_field . ?s ?p ?bn . ?s rr:subjectMap ?sm_bn . ?sm_bn rr:class ?subj_class . }""") list_preds = [(Uri(row[0]).sparql, Uri(row[1]).sparql) for row in results] array_fields = {} for tup in list_preds: try: array_fields[tup[0]].append(tup[1]) except KeyError: array_fields[tup[0]] = [tup[1]] self.array_fields = array_fields
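# The resulting self.array_fields maps each subject class to the predicates
# that must serialize as JSON arrays, e.g. (values illustrative):
#
#     {'bf:Instance': ['bf:identifiedBy', 'bf:subject']}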
def make_property(prop_defs, prop_name, cls_names=[], hierarchy=[]):
    """ Generates a property class from the definition dictionary

    args:
        prop_defs: the dictionary defining the property
        prop_name: the base name of the property
        cls_names: the names of the rdf_classes with which the property is
                associated
        hierarchy: the class hierarchy for the associated classes
    """
    register = False
    try:
        cls_names.remove('RdfClassBase')
    except ValueError:
        pass
    if cls_names:
        new_name = "%s_%s" % (prop_name.pyuri, "_".join(cls_names))
        prop_defs['kds_appliesToClass'] = cls_names
    else:
        cls_names = [Uri('kdr_AllClasses')]
        register = True
        new_name = prop_name
    new_prop = types.new_class(
        new_name,
        (
            RdfPropertyBase,
            list,
        ),
        {
            'metaclass': RdfPropertyMeta,
            'prop_defs': prop_defs,
            'class_names': cls_names,
            'prop_name': prop_name,
            'hierarchy': hierarchy
        })
    if register:
        global properties
        global domain_props
        properties[new_name] = new_prop
        for domain in new_prop.rdfs_domain:
            try:
                domain_props[domain][prop_name] = prop_defs
            except KeyError:
                domain_props[domain] = {}
                domain_props[domain][prop_name] = prop_defs
            except TypeError:
                pass
    return new_prop
def test_uri(value):
    """ tests whether the value is a uri or bnode

    Returns: Uri or BlankNode
    """
    if not isinstance(value, (Uri, BlankNode)):
        try:
            if value.startswith("_:"):
                return BlankNode(value)
            else:
                return Uri(value)
        except (AttributeError, TypeError):
            return BlankNode()
    else:
        return value
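# Expected behavior sketch:
#
#     test_uri("_:b0")                  # -> BlankNode('_:b0')
#     test_uri("http://example.org/x")  # -> Uri('http://example.org/x')
#     test_uri(None)                    # -> a new anonymous BlankNode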
def set_context(self):
    """ Reads through the namespaces in the RML and, compared against the
    RdfNsManager namespaces, generates a context for JSON-LD output """
    results = self.rml.query("""
        SELECT ?o
        {
            { ?s rr:class ?o }
            UNION
            { ?s rr:predicate ?o }
        }""")
    namespaces = [
        Uri(row[0]).value[0] for row in results
        if isinstance(row[0], rdflib.URIRef)
    ]
    self.context = {ns[0]: ns[1] for ns in namespaces if ns[0]}
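# The generated self.context is a JSON-LD style prefix-to-namespace mapping,
# e.g. (values illustrative):
#
#     {'bf': 'http://id.loc.gov/ontologies/bibframe/',
#      'schema': 'http://schema.org/'}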
def __prepare__(mcs, name, bases, **kwargs):
    try:
        cls_defs = kwargs.pop('cls_defs')
        props = get_properties(name)
        doc_string = make_doc_string(name, cls_defs, bases, props)
        new_def = {}
        new_def['__doc__'] = doc_string
        new_def['doc'] = property(print_doc)
        new_def['properties'] = list_base_properties(bases)
        new_def['hierarchy'] = list_hierarchy(name, bases)
        new_def['id'] = None
        new_def['class_names'] = [name]
        es_defs = es_get_class_defs(cls_defs, name)
        if hasattr(bases[0], 'es_defs'):
            es_defs.update(bases[0].es_defs)
        new_def['es_defs'] = get_rml_processors(es_defs)
        new_def['query_kwargs'] = get_query_kwargs(es_defs)
        new_def['uri'] = Uri(name).sparql_uri
        for prop, value in props.items():
            new_def[prop] = MODULE.rdfclass.make_property(
                value, prop, new_def['class_names'])
            new_def['properties'][prop] = new_def[prop]
        if __a__ not in new_def.keys():
            new_def[__a__] = MODULE.rdfclass.properties.get(__a__)
            new_def['properties'][__a__] = new_def[__a__]
        new_def['cls_defs'] = cls_defs
        new_def['es_props'] = []
        for prop_name, prop in new_def['properties'].items():
            rng_def = get_prop_range_def(
                get_prop_range_defs(new_def['class_names'],
                                    prop.kds_rangeDef))
            if rng_def.get('kds_esLookup'):
                new_def['es_props'].append(prop_name)
        return new_def
    except KeyError:
        # no cls_defs were supplied; return an empty definition
        return {}
def __init__(self, data=None, base_uri=None, **kwargs):
    start = datetime.datetime.now()
    self.smap = 's'
    self.pmap = 'p'
    self.omap = 'o'
    self.rmap = {}
    if base_uri:
        base_uri = Uri(base_uri)
    self.base_uri = base_uri
    if kwargs.get("debug"):
        log.setLevel(logging.DEBUG)
    # relate_obj_types sets which object types are related back to the
    # class they belong to
    self.relate_obj_types = ['bnode', 'uri']
    if kwargs.get("bnode_only"):
        self.relate_obj_types = ['bnode']
    if data:
        self.load_data(data, **kwargs)
        log.debug("loaded %s triples in %s" %
                  (len(data), (datetime.datetime.now() - start)))
def filter_prop_defs(prop_defs, hierarchy, cls_names):
    """ Reads through the prop_defs and returns a dictionary filtered by the
    current class

    args:
        prop_defs: the definitions from the rdf vocabulary definition
        hierarchy: the class hierarchy to tie the property to
        cls_names: the names of the classes
    """

    def _is_valid(test_list, valid_list):
        """ reads the list of classes in appliesToClass and returns whether
        the test_list matches

        args:
            test_list: the list of classes to test against
            valid_list: list of possible matches
        """
        for test in test_list:
            if test in valid_list:
                return True
        return False

    new_dict = {}
    valid_classes = [Uri('kdr_AllClasses')] + cls_names + hierarchy
    for def_name, value in prop_defs.items():
        new_dict[def_name] = []
        empty_def = []
        try:
            for item in value:
                if item.get('kds_appliesToClass'):
                    if _is_valid(item['kds_appliesToClass'], valid_classes):
                        new_dict[def_name].append(item)
                else:
                    empty_def.append(item)
            if not new_dict[def_name]:
                new_dict[def_name] = empty_def
        except AttributeError:
            new_dict[def_name] = value
    return new_dict
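# An illustrative call: a definition tagged with kds_appliesToClass survives
# only if it matches the class names or hierarchy; untagged definitions act
# as the fallback:
#
#     filter_prop_defs(
#         {'rdfs_range': [{'kds_appliesToClass': [Uri('schema_Person')]},
#                         {'kds_appliesToClass': [Uri('schema_Event')]}]},
#         hierarchy=[Uri('schema_Thing')],
#         cls_names=[Uri('schema_Person')])
#     # -> {'rdfs_range': [{'kds_appliesToClass': [Uri('schema_Person')]}]}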
def run(self, tag=None, output=None, **kwargs):
    """ runs the extractor

    Args:
    -----
        output: ['filepath', None]
    """
    start = datetime.datetime.now()
    count = 0
    if tag:
        tag = Uri(tag)
        xml_generator = etree.iterparse(self.source, tag=tag.etree)
    else:
        xml_generator = etree.iterparse(self.source)
    for event, element in xml_generator:
        type_tags = element.findall(_RDF_TYPE_TAG)
        rdf_types = [
            el.get(_RES_TAG) for el in type_tags if el.get(_RES_TAG)
        ]
        if str(self.filter_val) in rdf_types:
            count += 1
        element.clear()
    print("Found '{}' items in {}".format(
        count, (datetime.datetime.now() - start)))
def add_property(self, pred, obj):
    """ adds a property and its value to the class instance

    args:
        pred: the predicate/property to add
        obj: the value/object to add
    """
    pred = Uri(pred)
    try:
        self[pred].append(obj)
    except KeyError:
        try:
            new_prop = self.properties[pred]
        except AttributeError:
            self.properties = {}
            self.add_property(pred, obj)
            return
        except KeyError:
            try:
                new_prop = MODULE.rdfclass.properties[pred]
            except KeyError:
                new_prop = MODULE.rdfclass.make_property({},
                                                         pred,
                                                         self.class_names)
            try:
                self.properties[pred] = new_prop
            except AttributeError:
                self.properties = {pred: new_prop}
        init_prop = new_prop(self, get_attr(self, "dataset"))
        setattr(self, pred, init_prop)
        self[pred] = init_prop
        self[pred].append(obj)
    if self.dataset:
        self.dataset.add_rmap_item(self, pred, obj)
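# A hedged usage sketch (the class and dataset are hypothetical):
#
#     person = bf_Person(dataset)
#     person.add_property('schema_name', 'Jane Doe')
#     person['schema_name']   # -> ['Jane Doe']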
class SPARQLBatchProcessor(Processor):
    """Class batches all triple_maps queries into a single SPARQL query in an
    attempt to reduce the time spent in the triplestore/network
    bottleneck"""
    rdf_name = Uri("kdr:RmlSPARQLBatchProcessor")

    def __init__(self, rml_rules, triplestore_url=None, triplestore=None,
                 **kwargs):
        super(SPARQLBatchProcessor, self).__init__(rml_rules, **kwargs)
        __set_prefix__()
        # set both attributes so __get_bindings__ can test either one
        self.triplestore_url = triplestore_url
        self.triplestore = triplestore

    def __get_bindings__(self, sparql):
        bindings = []
        if self.triplestore_url is not None:
            result = requests.post(self.triplestore_url,
                                   data={
                                       "query": sparql,
                                       "format": "json"
                                   })
            bindings = result.json().get("results").get("bindings")
        elif self.triplestore is not None:
            result = self.triplestore.query(sparql)
            bindings = result.bindings
        return bindings

    def __construct_compound_query__(self, triple_map):
        select_clause = PREFIX + """
SELECT"""
        where_clause = """
WHERE {{"""
        for pred_map in triple_map.predicateObjectMap:
            if pred_map.constant is not None or\
               pred_map.reference is not None:
                continue
            select_line = pred_map.query.splitlines()[0]
            for term in select_line.split():
                if term.startswith("?") and term not in select_clause:
                    select_clause += " {}".format(term)
            where_clause += "\nOPTIONAL{{\n\t" +\
                            pred_map.query +\
                            "\n}}\n"
        return select_clause + where_clause + "}}"

    def run(self, **kwargs):
        kwargs['output'] = self.__graph__()
        super(SPARQLBatchProcessor, self).run(**kwargs)
        self.output = kwargs['output']
        return kwargs['output']

    def execute(self, triple_map, output, **kwargs):
        """Method iterates through triple map's predicate object maps
        and processes query.

        Args:
            triple_map(SimpleNamespace): Triple Map
        """
        sparql = PREFIX + triple_map.logicalSource.query.format(**kwargs)
        bindings = self.__get_bindings__(sparql)
        iterator = str(triple_map.logicalSource.iterator)
        for binding in bindings:
            entity_dict = binding.get(iterator)
            if isinstance(entity_dict, rdflib.term.Node):
                entity = entity_dict
            elif isinstance(entity_dict, dict):
                raw_value = entity_dict.get('value')
                if entity_dict.get('type').startswith('bnode'):
                    entity = rdflib.BNode(raw_value)
                else:
                    entity = rdflib.URIRef(raw_value)
            if triple_map.subjectMap.class_ is not None:
                output.add(
                    (entity, rdflib.RDF.type, triple_map.subjectMap.class_))
            sparql_query = self.__construct_compound_query__(
                triple_map).format(**kwargs)
            properties = self.__get_bindings__(sparql_query)
            for pred_obj_map in triple_map.predicateObjectMap:
                predicate = pred_obj_map.predicate
                if pred_obj_map.constant is not None:
                    output.add((entity, predicate, pred_obj_map.constant))
                    continue
                if "#" in str(predicate):
                    key = str(predicate).split("#")[-1]
                else:
                    key = str(predicate).split("/")[-1]
                for property_ in properties:
                    if key in property_.keys():
                        info = {"about": property_.get(key)}
                        object_ = __get_object__(info)
                        output.add((entity, predicate, object_))
def __init__(self, source, output=None, **kwargs): self.source = source self.output = output self.filter_tag = Uri("rdf:type") self.filter_val = Uri("bf:Topic") self.rdf_type = Uri("rdf:type")
""" import pdb import datetime import click try: from lxml import etree except ImportError: log.warning("'lxml' package not available. Using ptyhon 'xml'") import xml.etree.ElementTree as etree from rdfframework.datatypes import Uri, RdfNsManager RdfNsManager({'bf': 'http://id.loc.gov/ontologies/bibframe/'}) _RES_TAG = Uri("rdf:resource").etree _RDF_TYPE_TAG = Uri("rdf:type").etree class Extractor(object): """ Extracts all nodes specified nodes from an xml file Args: ----- source: the filepath to the xml file output: the filepath to output the results """ def __init__(self, source, output=None, **kwargs): self.source = source self.output = output
def prepare_prop_defs(prop_defs, prop_name, cls_names):
    """
    Examines and adds any missing defs to the prop_defs dictionary for
    use with the RdfPropertyMeta.__prepare__ method

    Args:
    -----
        prop_defs: the definitions from the rdf vocabulary definition
        prop_name: the property name
        cls_names: the names of the associated classes

    Returns:
    --------
        prop_defs
    """

    def get_def(prop_defs, def_fields, default_val=None):
        """ returns the cross-correlated fields for dealing with multiple
        vocabularies

        args:
            prop_defs: the property definition object
            def_fields: list of the mapped field names
            default_val: Default value if none of the fields are found
        """
        rtn_list = []
        for fld in def_fields:
            if prop_defs.get(fld):
                rtn_list += prop_defs.get(fld)
        if not rtn_list and default_val:
            rtn_list.append(default_val)
        elif rtn_list:
            try:
                rtn_list = list(set(rtn_list))
            except TypeError:
                # This deals with a domain that requires a conjunction of
                # two rdf_Classes
                new_rtn = []
                for item in rtn_list:
                    if isinstance(item, MODULE.rdfclass.RdfClassBase):
                        new_rtn.append(
                            "|".join(merge_rdf_list(item['owl_unionOf'])))
                    elif isinstance(item, list):
                        new_rtn.append("|".join(item))
                    else:
                        new_rtn.append(item)
                rtn_list = list(set(new_rtn))
                new_rtn = []
                for item in rtn_list:
                    if "|" in item:
                        new_rtn.append([Uri(domain)
                                        for domain in item.split("|")])
                    else:
                        new_rtn.append(Uri(item))
                rtn_list = new_rtn
        return rtn_list

    required_def_defaults = {
        Uri('kds_rangeDef'): [{}],
        Uri('rdfs_range'): [Uri("xsd_string")],
        Uri('rdfs_domain'): cls_names,
        Uri('rdfs_label'): [NSM.nouri(prop_name)],
        Uri('kds_formDefault'): [{
            Uri('kds:appliesToClass'): Uri('kdr:AllClasses'),
            Uri('kds:formFieldName'): "emailaddr",
            Uri('kds:formLabelName'): [NSM.nouri(prop_name)],
            Uri('kds:formFieldHelp'): find_values(DESCRIPTION_FIELDS,
                                                  prop_defs,
                                                  None),
            Uri('kds:fieldType'): {
                Uri('rdf:type'): Uri('kdr:TextField')
            }
        }],
        Uri('kds_propertyValidation'): [],
        Uri('kds_propertySecurity'): [],
        Uri('kds_propertyProcessing'): []
    }
    for prop in required_def_defaults:
        if prop not in prop_defs.keys():
            prop_defs[prop] = required_def_defaults[prop]
    prop_defs['rdfs_domain'] = get_def(prop_defs, DOMAIN_FIELDS, cls_names)
    prop_defs['rdfs_range'] = get_def(prop_defs, RANGE_FIELDS,
                                      Uri('xsd_string'))
    return prop_defs
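# Illustration: calling prepare_prop_defs with an empty definition fills in
# the guaranteed keys, e.g. (names illustrative):
#
#     defs = prepare_prop_defs({}, Uri('schema_name'),
#                              [Uri('schema_Person')])
#     defs['rdfs_range']   # -> [Uri('xsd_string')]
#     defs['rdfs_domain']  # -> [Uri('schema_Person')]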
import pprint
import logging
import multiprocessing as mp

from rdfframework.utilities import DictClass, make_list, SimpleMapReduce
from rdfframework.configuration import RdfConfigManager
from rdfframework.datatypes import pyrdf, BaseRdfDataType, Uri
from rdfframework.rdfclass import RdfClassBase, remove_parents, list_hierarchy
from .jsonquery import json_qry

MODULE = __import__(__name__)

CFG = RdfConfigManager()
pool_size = mp.cpu_count() - 1 or 1
__a__ = Uri("rdf:type")


def convert_row_main(val, i, key, output):
    """ Converts a single query-result value to its rdfframework datatype.

    #! The original body was commented out in the source; the draft
    #! converted each key/value pair of a result row with pyrdf(). Kept as
    #! a stub so the module remains importable.
    """
    # draft logic from the commented-out original:
    #     rtn_obj[key] = pyrdf(value)
    pass
class SPARQLProcessor(Processor):
    """SPARQLProcessor provides a RML Processor for external SPARQL
    endpoints"""
    rdf_name = Uri("kdr:RmlSPARQLProcessor")

    def __init__(self, rml_rules, **kwargs):
        super(SPARQLProcessor, self).__init__(rml_rules, **kwargs)
        __set_prefix__()
        #! self.triplestore = kwargs.get("triplestore", self.__graph__())
        # Sets defaults
        self.limit, self.offset = 5000, 0
        self.data_query = self.rml.value(
            subject=NS_MGR.kds.DataQuery.rdflib,
            predicate=NS_MGR.rml.query.rdflib)

    def __get_bindings__(self, sparql, output_format):
        """Internal method queries triplestore or remote
        sparql endpoint and returns the bindings

        Args:
        ----
            sparql: String of SPARQL query
            output_format: String of the output format type
        """
        return self.ext_conn.query(sparql,
                                   rtn_format=output_format,
                                   debug=False)

    def run(self, **kwargs):
        kwargs['output'] = self.__graph__()
        if "limit" in kwargs:
            self.limit = kwargs.get('limit')
        if "offset" in kwargs:
            self.offset = kwargs.get('offset')
        if kwargs.get("no_json"):
            self.use_json_qry = False
        else:
            self.use_json_qry = self.default_use_json_qry
        if self.use_json_qry:
            if not kwargs.get('dataset'):
                if self.data_query:
                    sparql = PREFIX + self.data_query.format(**kwargs)
                    data = self.ext_conn.query(sparql)
                else:
                    try:
                        data = get_all_item_data(
                            items=kwargs[kwargs['iri_key']],
                            conn=self.ext_conn,
                            output='json',
                            debug=False)
                        log.debug("data triple count: %s", len(data))
                    except KeyError:
                        raise KeyError(
                            "missing kwarg['iri_key'] defining which"
                            " kwarg to use that contains the subject"
                            " uri used to query for data. Example: "
                            "iri_key='instance_iri', instance_iri="
                            "<http://some.iri>")
                kwargs['dataset'] = RdfDataset(data)
        super(SPARQLProcessor, self).run(**kwargs)
        self.output = kwargs['output']
        return kwargs['output']

    def execute(self, triple_map, output, **kwargs):
        """Executes the triple map against the data source and adds the
        resulting triples to the output graph.

        Args:
        ----
            triple_map(SimpleNamespace): Triple Map
        """
        subjects = []
        if NS_MGR.ql.JSON.rdflib in \
                triple_map.logicalSource.reference_formulations:
            output_format = "json"
        else:
            output_format = "xml"
        if 'limit' not in kwargs:
            kwargs['limit'] = self.limit
        if 'offset' not in kwargs:
            kwargs['offset'] = self.offset
        iterator = str(triple_map.logicalSource.iterator)
        key, json_query = None, None
        if hasattr(triple_map.logicalSource, 'json_query') \
                and self.use_json_qry:
            key = kwargs.get(str(triple_map.logicalSource.json_key))
            if not key:
                key = [val for val in kwargs.values()
                       if isinstance(val, rdflib.URIRef)][0]
            json_query = triple_map.logicalSource.json_query
            bindings = kwargs['dataset'].json_qry(json_query, {'$': key})
        else:
            sparql = PREFIX + triple_map.logicalSource.query.format(**kwargs)
            bindings = self.__get_bindings__(sparql, output_format)
        for binding in bindings:
            if key:
                try:
                    entity_raw = binding.subject.rdflib
                except AttributeError:
                    entity_raw = binding
            else:
                entity_raw = binding.get(iterator)
            if isinstance(entity_raw,
                          (rdflib.URIRef, rdflib.BNode, BaseRdfDataType)):
                entity = entity_raw
            else:
                raw_value = entity_raw.get('value')
                if entity_raw.get('type').startswith('bnode'):
                    entity = BlankNode(raw_value)
                else:
                    entity = Uri(raw_value)
            if triple_map.subjectMap.class_ is not None:
                sub = entity
                if isinstance(entity, BaseRdfDataType):
                    sub = entity.rdflib
                output.add((sub,
                            NS_MGR.rdf.type.rdflib,
                            triple_map.subjectMap.class_))
            for pred_obj_map in triple_map.predicateObjectMap:
                predicate = pred_obj_map.predicate
                kwargs[iterator] = entity
                if pred_obj_map.parentTriplesMap is not None:
                    self.__handle_parents__(
                        output=output,
                        parent_map=pred_obj_map.parentTriplesMap,
                        subject=entity,
                        predicate=predicate,
                        **kwargs)
                    continue
                if pred_obj_map.reference is not None:
                    ref_key = str(pred_obj_map.reference)
                    if pred_obj_map.json_query:
                        if ref_key in binding:
                            for item in binding[ref_key]:
                                output.add((entity, predicate, item.rdflib))
                        continue
                    else:
                        if ref_key in binding:
                            object_ = __get_object__(binding[ref_key])
                            output.add((entity, predicate, object_))
                        continue
                if pred_obj_map.constant is not None:
                    if isinstance(entity, BaseRdfDataType):
                        entity = entity.rdflib
                    output.add((entity, predicate, pred_obj_map.constant))
                    continue
                json_query = None
                if pred_obj_map.json_query and self.use_json_qry:
                    json_query = pred_obj_map.json_query
                    pre_obj_bindings = kwargs['dataset'].json_qry(
                        json_query, {'$': entity})
                else:
                    sparql_query = \
                        PREFIX + pred_obj_map.query.format(**kwargs)
                    pre_obj_bindings = self.__get_bindings__(
                        sparql_query, output_format)
                for row in pre_obj_bindings:
                    if json_query and self.use_json_qry:
                        if isinstance(entity, BaseRdfDataType):
                            entity = entity.rdflib
                        output.add((entity, predicate, row.rdflib))
                    else:
                        object_ = __get_object__(row)
                        if object_ is None:
                            continue
                        if isinstance(entity, BaseRdfDataType):
                            entity = entity.rdflib
                        output.add((entity, predicate, object_))
            subjects.append(entity)
        return subjects
class XMLProcessor(Processor):
    """XML RDF Mapping Processor"""
    rdf_name = Uri("kdr:RmlXMLProcessor")

    def __init__(self, **kwargs):
        rml_rules = kwargs.pop("rml_rules", [])
        super(XMLProcessor, self).__init__(rml_rules, **kwargs)
        if "namespaces" in kwargs:
            self.xml_ns = kwargs.pop("namespaces")
        else:
            self.xml_ns = dict()
        self.constants.update(kwargs)

    def __generate_reference__(self, triple_map, **kwargs):
        """Internal method takes a triple_map and returns the result of
        applying the XPath to the current DOM context

        Args:
        -----
            triple_map: SimpleNamespace
            element: etree.Element
        """
        element = kwargs.get("element")
        found_elements = element.xpath(triple_map.reference,
                                       namespaces=self.xml_ns)
        for elem in found_elements:
            if elem.text is None:
                continue
            raw_text = elem.text.strip()
            #! Quick and dirty test for valid URI
            if not raw_text.startswith("http"):
                continue
            return rdflib.URIRef(raw_text)

    def __reference_handler__(self, output, **kwargs):
        """Internal method for handling rr:reference in triples map

        Keyword Args:
        -------------
            predicate_obj_map: SimpleNamespace
            element: etree.Element
            subject: rdflib.URIRef
        """
        subjects = []
        pred_obj_map = kwargs.get("predicate_obj_map")
        element = kwargs.get("element")
        subject = kwargs.get("subject")
        if pred_obj_map.reference is None:
            return subjects
        predicate = pred_obj_map.predicate
        found_elements = element.xpath(str(pred_obj_map.reference),
                                       namespaces=self.xml_ns)
        for found_elem in found_elements:
            if not hasattr(pred_obj_map, "datatype") or \
               pred_obj_map.datatype is None:
                datatype = None
            else:
                datatype = pred_obj_map.datatype
            if isinstance(found_elem, str):
                # Handle xpath attributes
                object_ = self.__generate_object_term__(datatype,
                                                        found_elem)
                output.add((subject, predicate, object_))
                continue
            if found_elem.text is None or len(found_elem.text) < 1:
                continue
            if pred_obj_map.constant is not None:
                output.add((subject, predicate, pred_obj_map.constant))
                continue
            if pred_obj_map.delimiters != []:
                subjects.extend(
                    self.__generate_delimited_objects__(
                        output,
                        triple_map=pred_obj_map,
                        subject=subject,
                        predicate=predicate,
                        element=found_elem,
                        delimiters=pred_obj_map.delimiters,
                        datatype=datatype))
            else:
                object_ = self.__generate_object_term__(
                    datatype, found_elem.text)
                output.add((subject, predicate, object_))
        return subjects

    def execute(self, triple_map, output, **kwargs):
        """Method executes mapping between source

        Args:
        -----
            triple_map: SimpleNamespace, Triple Map
        """
        subjects = []
        found_elements = self.source.xpath(
            str(triple_map.logicalSource.iterator),
            namespaces=self.xml_ns)
        for element in found_elements:
            subject = self.generate_term(term_map=triple_map.subjectMap,
                                         element=element,
                                         **kwargs)
            start = len(output)
            for row in triple_map.predicateObjectMap:
                predicate = row.predicate
                if row.template is not None:
                    obj_ = self.generate_term(term_map=row, **kwargs)
                    output.add((subject, predicate, obj_))
                if row.parentTriplesMap is not None:
                    self.__handle_parents__(output,
                                            parent_map=row.parentTriplesMap,
                                            subject=subject,
                                            predicate=predicate,
                                            **kwargs)
                new_subjects = self.__reference_handler__(
                    output,
                    predicate_obj_map=row,
                    element=element,
                    subject=subject)
                subjects.extend(new_subjects)
                if row.constant is not None:
                    output.add((subject, predicate, row.constant))
            if start < len(output):
                if triple_map.subjectMap.class_ is not None:
                    output.add((subject,
                                NS_MGR.rdf.type.rdflib,
                                triple_map.subjectMap.class_))
                subjects.append(subject)
        return subjects

    def run(self, xml, **kwargs):
        """Method takes either an etree.ElementTree or raw XML text
        as the first argument.

        Args:
            xml: etree.ElementTree or raw XML text
        """
        kwargs['output'] = self.__graph__()
        if isinstance(xml, str):
            try:
                self.source = etree.XML(xml)
            except ValueError:
                try:
                    self.source = etree.XML(xml.encode())
                except:
                    raise ValueError("Cannot parse XML source: {}".format(
                        sys.exc_info()[0]))
        else:
            self.source = xml
        super(XMLProcessor, self).run(**kwargs)
        self.output = kwargs['output']
        return kwargs['output']
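# A hedged usage sketch (the rules file and namespace map are illustrative):
#
#     processor = XMLProcessor(
#         rml_rules=['xml-map.ttl'],
#         namespaces={'bf': 'http://id.loc.gov/ontologies/bibframe/'})
#     graph = processor.run(xml_string)   # returns the populated graph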
class JSONProcessor(Processor):
    """JSON RDF Mapping Processor"""
    rdf_name = Uri("kdr:RmlJSONProcessor")

    def __init__(self, **kwargs):
        try:
            rml_rules = kwargs.pop("rml_rules")
        except KeyError:
            rml_rules = []
        super(JSONProcessor, self).__init__(rml_rules, **kwargs)

    def __generate_reference__(self, triple_map, **kwargs):
        json_obj = kwargs.get("obj")
        path_expr = jsonpath_ng.parse(triple_map.reference)
        results = [r.value.strip() for r in path_expr.find(json_obj)]
        for row in results:
            if rdflib.term._is_valid_uri(row):
                return rdflib.URIRef(row)

    def __reference_handler__(self, output, **kwargs):
        """Internal method for handling rr:reference in triples map

        Keyword Args:
        -------------
            predicate_obj_map: SimpleNamespace
            obj: dict
            subject: rdflib.URIRef
        """
        subjects = []
        pred_obj_map = kwargs.get("predicate_obj_map")
        obj = kwargs.get("obj")
        subject = kwargs.get("subject")
        if pred_obj_map.reference is None:
            return subjects
        predicate = pred_obj_map.predicate
        ref_exp = jsonpath_ng.parse(str(pred_obj_map.reference))
        found_objects = [r.value for r in ref_exp.find(obj)]
        for row in found_objects:
            output.add((subject, predicate, rdflib.Literal(row)))
        return subjects

    def execute(self, triple_map, output, **kwargs):
        """Method executes mapping between JSON source and output RDF

        Args:
        -----
            triple_map: SimpleNamespace
        """
        subjects = []
        logical_src_iterator = str(triple_map.logicalSource.iterator)
        json_object = kwargs.get('obj', self.source)
        # '.' is treated as a generic iterator for the whole JSON object
        if logical_src_iterator == ".":
            results = [
                None,
            ]
        else:
            json_path_exp = jsonpath_ng.parse(logical_src_iterator)
            # the first match is expected to be the list of records to
            # iterate over
            results = [r.value for r in json_path_exp.find(json_object)][0]
        for row in results:
            subject = self.generate_term(term_map=triple_map.subjectMap,
                                         **kwargs)
            for pred_obj_map in triple_map.predicateObjectMap:
                predicate = pred_obj_map.predicate
                if pred_obj_map.template is not None:
                    output.add((subject,
                                predicate,
                                self.generate_term(term_map=pred_obj_map,
                                                   **kwargs)))
                if pred_obj_map.parentTriplesMap is not None:
                    self.__handle_parents__(
                        output,
                        parent_map=pred_obj_map.parentTriplesMap,
                        subject=subject,
                        predicate=predicate,
                        obj=row,
                        **kwargs)
                if pred_obj_map.reference is not None:
                    ref_exp = jsonpath_ng.parse(str(pred_obj_map.reference))
                    found_objects = [r.value for r in ref_exp.find(row)]
                    for obj in found_objects:
                        if rdflib.term._is_valid_uri(obj):
                            rdf_obj = rdflib.URIRef(str(obj))
                        else:
                            rdf_obj = rdflib.Literal(str(obj))
                        output.add((subject, predicate, rdf_obj))
                if pred_obj_map.constant is not None:
                    output.add((subject, predicate, pred_obj_map.constant))
            subjects.append(subject)
        return subjects

    def run(self, source, **kwargs):
        """Method takes a JSON source and any keywords and transforms from
        JSON to Lean BIBFRAME 2.0 triples

        Args:
        ----
            source: str, dict
        """
        import json
        kwargs['output'] = self.__graph__()
        if isinstance(source, str):
            source = json.loads(source)
        self.source = source
        super(JSONProcessor, self).run(**kwargs)
        self.output = kwargs['output']
        return self.output
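# A hedged usage sketch (the rules file and source are illustrative):
#
#     processor = JSONProcessor(rml_rules=['json-map.ttl'])
#     graph = processor.run('{"name": "Jane Doe"}')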
class CSVRowProcessor(Processor):
    """RML Processor for CSV/TSV or other delimited file supported by
    the python standard library module csv"""
    rdf_name = Uri("kdr:RmlCSVRowProcessor")

    def __init__(self, **kwargs):
        if "rml_rules" in kwargs:
            rml_rules = kwargs.pop("rml_rules")
        else:
            rml_rules = []
        super(CSVRowProcessor, self).__init__(rml_rules, **kwargs)

    def __generate_reference__(self, triple_map, **kwargs):
        """Generates a RDF entity based on triple map

        Args:
            triple_map(SimpleNamespace): Triple Map
        """
        raw_value = self.source.get(str(triple_map.reference))
        if raw_value is None or len(raw_value) < 1:
            return
        if hasattr(triple_map, "datatype"):
            if triple_map.datatype == NS_MGR.xsd.anyURI.rdflib:
                output = rdflib.URIRef(raw_value)
            else:
                output = rdflib.Literal(raw_value,
                                        datatype=triple_map.datatype)
        else:
            output = rdflib.Literal(raw_value)
        return output

    def execute(self, triple_map, output, **kwargs):
        """Method executes mapping between CSV source and output RDF

        args:
            triple_map(SimpleNamespace): Triple Map
        """
        subject = self.generate_term(term_map=triple_map.subjectMap,
                                     **kwargs)
        start_size = len(output)
        all_subjects = []
        for pred_obj_map in triple_map.predicateObjectMap:
            predicate = pred_obj_map.predicate
            if pred_obj_map.template is not None:
                object_ = self.generate_term(term_map=pred_obj_map, **kwargs)
                if len(str(object_)) > 0:
                    output.add((subject, predicate, object_))
            if pred_obj_map.parentTriplesMap is not None:
                # pass the output graph through, as the other processors do
                self.__handle_parents__(
                    output,
                    parent_map=pred_obj_map.parentTriplesMap,
                    subject=subject,
                    predicate=predicate,
                    **kwargs)
            if pred_obj_map.reference is not None:
                object_ = self.generate_term(term_map=pred_obj_map, **kwargs)
                if object_ and len(str(object_)) > 0:
                    output.add((subject, predicate, object_))
            if pred_obj_map.constant is not None:
                output.add((subject, predicate, pred_obj_map.constant))
        finish_size = len(output)
        if finish_size > start_size:
            output.add((subject,
                        NS_MGR.rdf.type.rdflib,
                        triple_map.subjectMap.class_))
            all_subjects.append(subject)
        return all_subjects

    def run(self, row, **kwargs):
        """Method takes a row and depending if a dict or list,
        runs RML rules.

        Args:
        -----
            row(Dict, List): Row from CSV Reader
        """
        self.source = row
        kwargs['output'] = self.__graph__()
        super(CSVRowProcessor, self).run(**kwargs)
        return kwargs['output']
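# A hedged usage sketch (the file and rules are illustrative); the processor
# is built once and then fed one row at a time:
#
#     import csv
#     processor = CSVRowProcessor(rml_rules=['csv-map.ttl'])
#     with open('people.csv', newline='') as src:
#         for row in csv.DictReader(src):
#             graph = processor.run(row)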
class RdfClassFactory(RdfBaseFactory):
    """ Extends RdfBaseFactory to property creation specific querying """
    log_level = logging.INFO
    classes_key = set([Uri(item) for item in RDF_CLASSES])
    inferred_key = set([Uri(item) for item in INFERRED_CLASS_PROPS])
    rdf_type = Uri('rdf_type')

    def __init__(self, conn, reset=False, nsm=NSM, cfg=CFG):
        if cfg.props_initialized != True:
            err_msg = [
                "RdfPropertyFactory must be run prior to",
                "the initialization of RdfClassFactory!"
            ]
            raise RuntimeError(" ".join(err_msg))
        sparql_template = "sparqlDefinitionClassesAll.rq"
        super().__init__(conn, sparql_template, reset, nsm, cfg)

    def make(self):
        """ reads through the definitions and generates a python class for
        each definition """
        log.setLevel(self.log_level)
        created = []
        self.set_class_dict()
        start = datetime.datetime.now()
        log.info(" # of classes to create: %s" % len(self.class_dict))
        log.debug(" creating classes that are not subclassed")
        for name, cls_defs in self.class_dict.items():
            if not self.class_dict[name].get('rdfs_subClassOf'):
                created.append(name)
                setattr(
                    MODULE.rdfclass, name,
                    types.new_class(name, (RdfClassBase, ),
                                    {'cls_defs': cls_defs}))
        log.debug(" created %s classes in: %s",
                  len(created), (datetime.datetime.now() - start))
        for name in created:
            del self.class_dict[name]
        left = len(self.class_dict)
        classes = []
        while left > 0:
            for name, cls_defs in self.class_dict.items():
                # bases must accumulate across all parents, so start the
                # tuple before testing the parent list
                bases = tuple()
                parents = self.class_dict[name].get('rdfs_subClassOf')
                if not parents:
                    bases += (RdfClassBase, )
                else:
                    for parent in make_list(parents):
                        if parent in created or parent in classes:
                            if parent in classes:
                                bases += (RdfClassBase, )
                            else:
                                base = getattr(MODULE.rdfclass, parent)
                                bases += (base, ) + base.__bases__
                if len(bases) > 0:
                    created.append(name)
                    setattr(
                        MODULE.rdfclass, name,
                        types.new_class(name, bases,
                                        {'cls_defs': cls_defs}))
            for name in created:
                try:
                    del self.class_dict[name]
                except KeyError:
                    pass
            if left == len(self.class_dict):
                # no progress was made; drop references to parent classes
                # that are not defined in the dataset so the loop can finish
                missing_parents = []
                for name in self.class_dict:
                    missing_parents += \
                        self.class_dict[name].get('rdfs_subClassOf', [])
                missing_parents = set(missing_parents)
                classes = list(missing_parents.difference(
                    set(self.class_dict.keys())))
                for name in self.class_dict:
                    if name in classes:
                        classes.remove(name)
                    for p_name in self.class_dict[name].get(
                            'rdfs_subClassOf', []).copy():
                        if p_name in classes:
                            self.class_dict[name]['rdfs_subClassOf'].remove(
                                p_name)
            left = len(self.class_dict)
        # self.tie_properties(created)
        log.info(" created all classes in %s",
                 (datetime.datetime.now() - start))

    def set_class_dict(self):
        """ Reads through the dataset and assigns self.class_dict the key
        value pairs for the classes in the dataset """
        self.class_dict = {}
        for name, cls_defs in self.defs.items():
            def_type = set(cls_defs.get(self.rdf_type, []))
            if name.type == 'bnode':
                continue
            # a class can be determined by checking to see if it is of an
            # rdf_type listed in the classes_key or has a property that is
            # listed in the inferred_key
            if def_type.intersection(self.classes_key) or \
                    any(cls_defs.get(item) for item in self.inferred_key):
                self.class_dict[name] = cls_defs

    def tie_properties(self, class_list):
        """ Runs through the classes and ties the properties to the class

        args:
            class_list: a list of class names to run
        """
        log.setLevel(self.log_level)
        start = datetime.datetime.now()
        log.info(" Tying properties to the class")
        for cls_name in class_list:
            cls_obj = getattr(MODULE.rdfclass, cls_name)
            for prop_name, prop_obj in cls_obj.properties.items():
                setattr(cls_obj, prop_name, link_property(prop_obj, cls_obj))
        log.info(" Finished tying properties in: %s",
                 (datetime.datetime.now() - start))
def json_qry(dataset, qry_str, params={}):
    """ Takes a json query string and returns the results

    args:
        dataset: RdfDataset to query against
        qry_str: query string
        params: dictionary of params
    """
    if '$' not in qry_str:
        qry_str = ".".join(['$', qry_str.strip()])
    dollar_val = params.get("$", dataset)
    if isinstance(dollar_val, rdflib.URIRef):
        dollar_val = Uri(dollar_val)
    if qry_str.strip() == '$':
        return [dollar_val]
    parsed_qry = parse_json_qry(qry_str)
    qry_parts = parsed_qry['qry_parts']
    post_actions = parsed_qry['params']
    rtn_list = UniqueList()
    if params.get('dataset'):
        dataset = params['dataset']
    for or_part in qry_parts:
        if or_part[1] == 0:
            if isinstance(dollar_val, dict):
                result = dollar_val
            else:
                try:
                    result = dataset[dollar_val]
                except KeyError:
                    try:
                        result = dataset[Uri(dollar_val)]
                    except KeyError:
                        try:
                            result = dataset[BlankNode(dollar_val)]
                        except KeyError:
                            continue
            forward = True
            for part in or_part[0][1:]:
                if part == "*":
                    forward = not forward
                else:
                    if forward:
                        result = get_json_qry_item(result, part)
                    else:
                        result = get_reverse_json_qry_item(result,
                                                           part,
                                                           False)
        else:
            result = dataset
            parts = or_part[0].copy()
            parts.reverse()
            forward = False
            for part in parts[1:]:
                if part == "*":
                    forward = not forward
                else:
                    if forward:
                        result = get_json_qry_item(result, part)
                    else:
                        result = get_reverse_json_qry_item(
                            result, part, False, dollar_val)
        rtn_list += result
    for action in post_actions:
        rtn_list = action(rtn_list)
    return rtn_list
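# A hedged usage sketch (dataset contents and URIs are illustrative):
#
#     ds = RdfDataset(data)
#     # follow two predicates forward from the root binding
#     json_qry(ds, '$.bf_instanceOf.bf_title', {})
#     # start from a specific subject bound to '$'
#     json_qry(ds, 'bf_title', {'$': Uri('<http://example.org/inst/1>')})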