def materialize_entity(ctx, etype, unique=None):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type
    and a data mapping according to the resource type. Implements the Libhub
    Resource Hash Convention. As a convenience, if a vocabulary base is
    provided in the context, concatenate it to etype and the data keys.

    ctx - context information governing creation of the new entity
    etype - type IRI for the new entity
    unique - scalar or ordered dict of data to use in generating its unique ID,
        or None, in which case one is just randomly generated
    '''
    if ctx.base:
        etype = ctx.base + etype
    unique_full = unique
    if isinstance(unique, OrderedDict):
        unique_full = OrderedDict()
        for (k, v) in unique.items():
            unique_full[
                k if iri.is_absolute(k) else iri.absolutize(k, ctx.base)
            ] = v

    if unique_full:
        plaintext = json.dumps([etype, unique_full], cls=OrderedJsonEncoder)
        eid = ctx.idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(ctx.idgen)
    return eid
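#Hedged sketch (illustration only; toy_idgen is hypothetical and NOT the
#library's default_idgen, whose actual hashing scheme may differ) of the ID
#generator protocol materialize_entity relies on: `idgen.send(plaintext)`
#must return an ID derived from the hash plaintext, and plain `next(idgen)`
#must return a random ID when there is no distinguishing data.
import base64, hashlib, os

def toy_idgen(base=''):
    plaintext = yield  #Prime with next() before first use
    while True:
        digest = (os.urandom(8) if plaintext is None
                  else hashlib.md5(plaintext.encode('utf-8')).digest()[:8])
        plaintext = yield base + base64.urlsafe_b64encode(digest).decode('ascii').rstrip('=')

idgen = toy_idgen('http://example.org/entity/')
next(idgen)                       #Prime the generator
print(idgen.send('["Person"]'))   #Deterministic: same plaintext -> same ID
print(next(idgen))                #Random ID: no distinguishing data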
def _link(ctx):
    (origin, _, t, a) = ctx.current_link
    if derive_origin:
        #Have enough info to derive the origin from context. Ignore origin in current link
        origin = derive_origin(ctx)
    #If need be, call the Versa action function to determine the relationship to the materialized resource
    rels = rel(ctx) if callable(rel) else rel
    if not isinstance(rels, list):
        rels = [rels]
    values = value(ctx) if callable(value) else (t if value is None else value)
    if not isinstance(values, list):
        values = [values]

    def recurse_values(vs):
        for v in vs:
            if callable(v):
                yield from recurse_values(v(ctx))
            else:
                yield v

    for _value in recurse_values(values):
        #If asked to convert the value to a resource, do so, as long as it is absolute and ignore_refs is false
        if res and not (ignore_refs and not iri.is_absolute(_value)):
            try:
                _value = I(_value)
            except ValueError:
                ctx.extras['logger'].warning(
                    'Requirement to convert link target to IRI failed for invalid input, '
                    'causing the corresponding output link to be omitted entirely: {0}'.format(
                        repr((I(origin), I(iri.absolutize(rel, ctx.base)), _value))))
                #XXX How do we really want to handle this error?
                #return []
                continue

        for r in rels:
            ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)), _value, {})
    return
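#Standalone illustration (toy data; not library code) of the recurse_values
#helper above: callables anywhere in the value list are invoked with the
#context, and their results are flattened recursively into plain values.
def recurse_values(vs, ctx=None):
    for v in vs:
        if callable(v):
            yield from recurse_values(v(ctx), ctx)
        else:
            yield v

nested = ['a', lambda ctx: ['b', lambda ctx: ['c']]]
print(list(recurse_values(nested)))  #-> ['a', 'b', 'c']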
def _toiri(ctx):
    _arg = arg(ctx) if is_pipeline_action(arg) else arg
    _arg = [_arg] if not isinstance(_arg, list) else _arg
    ret = []
    for u in _arg:
        iu = u
        if not (ignore_refs and not iri.is_absolute(iu)):
            #Coerce into an IRI ref, but fall out as untyped text otherwise
            try:
                iu = I(iu)
            except ValueError:
                #Attempt to recover by percent encoding
                try:
                    iu = I(iri.percent_encode(iu))
                except ValueError as e:
                    ctx.extras['logger'].warning(
                        'Unable to convert "{}" to IRI reference:\n{}'.format(iu, e))
            if base is not None and isinstance(iu, I):
                iu = I(iri.absolutize(iu, base))
        ret.append(iu)
    return ret
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low-level routine for generating an ID value using the hash algorithm
    outlined by the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash
    Convention). Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID,
        or None, in which case one is just randomly generated
    vocabbase - for convenience, if provided, used to resolve relative etype & data keys
    '''
    #XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase:
        etype = vocabbase + etype
    unique_computed = []
    for k, v in unique or []:
        if vocabbase:
            #XXX OK, absolutize used here. Go figure
            k = k if iri.is_absolute(k) else iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        #XXX Is OrderedJsonEncoder needed now that we're using a list of tuples rather than an ordered dict?
        plaintext = json.dumps([etype, unique_computed], cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return eid
def _link(ctx):
    (origin, _, t, a) = ctx.current_link
    if derive_origin:
        #Have enough info to derive the origin from context. Ignore origin in current link
        origin = derive_origin(ctx)
    #If need be, call the Versa action function to determine the relationship to the materialized resource
    rels = rel(ctx) if callable(rel) else rel
    if not isinstance(rels, list):
        rels = [rels]
    _value = value(ctx) if callable(value) else (t if value is None else value)
    #Just work with the first provided statement, for now
    if res and not (ignore_refs and not iri.is_absolute(_value)):
        try:
            _value = I(_value)
        except ValueError:
            ctx.extras['logger'].warning(
                'Requirement to convert link target to IRI failed for invalid input, '
                'causing the corresponding output link to be omitted entirely: {0}'.format(
                    repr((I(origin), I(iri.absolutize(rel, ctx.base)), _value))))
            #XXX How do we really want to handle this error?
            return []
    for r in rels:
        ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)), _value, {})
    return
def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None,
                       addtype=True, loop=None, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type
    and a data mapping according to the resource type. Implements the Libhub
    Resource Hash Convention. As a convenience, if a vocabulary base is
    provided, concatenate it to etype and the data keys.

    data - list of key/value pairs used to compute the hash. If empty, the
        hash will be a default for the entity type

    WARNING: THIS FUNCTION MANGLES THE data ARG
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', default_idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    if addtype:
        data.insert(0, [TYPE_REL, etype])
    data_full = [((vocabbase + k if not iri.is_absolute(k) else k), v)
                 for (k, v) in data]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            for p in plugin[BF_MATRES_TASK](loop, output_model, params):
                pass
        #logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None,
                       addtype=True, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type
    and a data mapping according to the resource type. Implements the Libhub
    Resource Hash Convention. As a convenience, if a vocabulary base is
    provided, concatenate it to etype and the data keys.

    data - list of key/value pairs used to compute the hash. If empty, the
        hash will be a default for the entity type

    WARNING: THIS FUNCTION MANGLES THE data ARG
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', default_idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    if addtype:
        data.insert(0, [VTYPE_REL, etype])
    data_full = [((vocabbase + k if not iri.is_absolute(k) else k), v)
                 for (k, v) in data]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), VTYPE_REL, I(etype))

    params['materialized_id'] = eid
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            for p in plugin[BF_MATRES_TASK](output_model, params):
                pass
    return eid
def resource_id(etype, fprint=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Lowest-level routine for generating an ID value using the Versa convention.

    The Versa convention originated as the hash algorithm outlined by the
    Libhub initiative for BIBFRAME Lite, and is now codified in the document
    [Computing Versa Resource Hashes](https://github.com/uogbuji/versa/wiki/Computing-Versa-Resource-Hashes).

    etype - type IRI for the new entity (if the entity has multiple types, this
        is the primary, and additional types can be provided in the fingerprint set)
    fprint - fingerprint set. List of key/value tuples of data to use in generating
        its unique ID, or None, in which case one is just randomly generated
    vocabbase - for convenience, if provided, used to resolve relative etype & fingerprint keys

    >>> from versa.pipeline import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase(etype)

    fprint_processed = []
    for k, v in fprint or []:
        if vocabbase and not iri.is_absolute(k):
            k = vocabbase(k)
        fprint_processed.append((k, v))

    if fprint_processed:
        fprint_processed.append((VTYPE_REL, etype))
        fprint_processed.sort()
        plaintext = json.dumps(fprint_processed, separators=(',', ':'), cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)

    return I(eid)
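#Minimal sketch (toy hash, NOT the codified Versa algorithm) of why the
#fingerprint set is sorted before hashing in resource_id above: the computed
#ID is independent of the order in which the caller supplies the pairs.
import hashlib, json

def toy_fingerprint_hash(etype, fprint):
    pairs = sorted(fprint + [('@type', etype)])  #'@type' stands in for VTYPE_REL
    plaintext = json.dumps(pairs, separators=(',', ':'))
    return hashlib.md5(plaintext.encode('utf-8')).hexdigest()

a = toy_fingerprint_hash('Person', [('name', 'Ada'), ('birthDate', '1815-12-10')])
b = toy_fingerprint_hash('Person', [('birthDate', '1815-12-10'), ('name', 'Ada')])
assert a == b  #Same fingerprint set -> same ID, regardless of pair order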
def materialize_entity(etype, ctx_params=None, loop=None, model_to_update=None,
                       data=None, addtype=True):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type
    and a data mapping according to the resource type. Implements the Libhub
    Resource Hash Convention. As a convenience, if a vocabulary base is
    provided, concatenate it to etype and the data keys.
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    existing_ids = ctx_params.get('existing_ids')
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids')
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    if addtype:
        data.insert(0, [TYPE_REL, etype])
    data_full = [((vocabbase + k if not iri.is_absolute(k) else k), v)
                 for (k, v) in data]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            for p in plugin[BF_MATRES_TASK](loop, output_model, params):
                pass
        #logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
def _res(ctx):
    _arg = arg(ctx) if callable(arg) else arg
    _arg = [_arg] if not isinstance(_arg, list) else _arg
    ret = []
    for u in _arg:
        iu = None
        try:
            iu = I(u)
        except ValueError:
            #Attempt to recover by percent encoding
            try:
                iu = I(iri.percent_encode(u))
            except ValueError as e:
                ctx.logger('Unable to convert "{}" to IRI reference:\n{}'.format(u, e))
        if iu and not iri.is_absolute(iu) and base is not None:
            iu = I(iri.absolutize(iu, base))
        ret.append(iu)
    return ret
def _res(ctx):
    _arg = arg(ctx) if callable(arg) else arg
    _arg = [_arg] if not isinstance(_arg, list) else _arg
    ret = []
    for u in _arg:
        iu = u
        if not (ignore_refs and not iri.is_absolute(iu)):
            #Coerce into an IRI ref, but fall out as untyped text otherwise
            try:
                iu = I(iu)
            except ValueError:
                #Attempt to recover by percent encoding
                try:
                    iu = I(iri.percent_encode(iu))
                except ValueError as e:
                    ctx.extras['logger'].warning(
                        'Unable to convert "{}" to IRI reference:\n{}'.format(iu, e))
            if base is not None and isinstance(iu, I):
                iu = I(iri.absolutize(iu, base))
        ret.append(iu)
    return ret
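#Behavior sketch for _res above (assumes the same module-level I and iri
#imports these snippets rely on, from amara3; 'has space' is just a
#representative invalid IRI ref): invalid input is retried after percent
#encoding, and a relative ref is then absolutized against the enclosing
#scope's `base`.
raw = 'has space'
try:
    ref = I(raw)
except ValueError:
    ref = I(iri.percent_encode(raw))   #-> I('has%20space')
ref = I(iri.absolutize(ref, 'http://example.org/'))
print(ref)  #-> http://example.org/has%20space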
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low-level routine for generating an ID value using the hash algorithm
    outlined by the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash Convention).
    https://github.com/zepheira/pybibframe/wiki/From-Records-to-Resources:-the-Library.Link-resource-ID-generation-algorithm

    Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID,
        or None, in which case one is just randomly generated
    vocabbase - for convenience, if provided, used to resolve relative etype & data keys

    >>> from bibframe.util import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    #XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase:
        etype = vocabbase + etype
    unique_computed = []
    for k, v in unique or []:
        if vocabbase:
            #XXX OK, absolutize used here. Go figure
            k = k if iri.is_absolute(k) else iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        unique_computed.insert(0, [VTYPE_REL, etype])
        plaintext = json.dumps(unique_computed, separators=(',', ':'))
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return eid
    :return: target of the context's current link
    '''
    #Action function generator to multiplex a relationship at processing time
    def _target(ctx):
        '''
        Versa action function. Utility to return the target of the context's current link

        :param ctx: Versa context used in processing (e.g. includes the prototype link)
        :return: target of the context's current link
        '''
        return ctx.current_link[TARGET]
    return _target


NS_PATCH = lambda ns, k, v: (ns + k, v) if not iri.is_absolute(k) else (k, v)


def all_subfields(ctx):
    '''
    Utility to return a hash key from all subfields mentioned in the MARC prototype link

    :param ctx: Versa context used in processing (e.g. includes the prototype link)
    :return: Tuple of key/value tuples from the attributes; suitable for hashing
    '''
    attrs = ctx.current_link[ATTRIBUTES]
def __init__(self, obj, siri=None, encoding=None, streamopenmode='rb',
             sourcetype=inputsourcetype.unknown):
    '''
    obj - byte string, proper string (only if you really know what you're doing),
        file-like object (stream), file path or URI.
    siri - optional override IRI. The base IRI for the input source will be set to this value

    >>> from amara3 import inputsource
    >>> inp = inputsource('abc')
    >>> inp.stream
    <_io.StringIO object at 0x1056fbf78>
    >>> inp.iri
    >>> print(inp.iri)
    None
    >>> inp = inputsource(['abc', 'def']) #Now multiple streams in one source
    >>> inp.stream
    <_io.StringIO object at 0x1011aff78>
    >>> print(inp.iri)
    None
    >>> inp = next(inp)
    >>> inp.stream
    <_io.StringIO object at 0x1011af5e8>
    >>> print(inp.iri)
    None
    >>>
    '''
    # from amara3 import inputsource; inp = inputsource('foo.zip')
    # from amara3 import inputsource; inp = inputsource('test/resource/std-examples.zip')
    # s = inp.stream.read(100)
    # s
    # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<!-- edited with XML Spy v4.3 U (http://www.xmlspy.com) by M'
    # s
    # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<collection xmlns="http://www.loc.gov/MARC21/slim">\r\n  <reco'
    self.stream = None
    self.iri = siri
    self.sourcetype = sourcetype

    if obj in ('', b''):
        raise ValueError("Cannot parse an empty string as XML")

    if hasattr(obj, 'read'):
        #Create dummy URI to use as base
        #uri = uri or uuid4().urn
        self.stream = obj
    #elif sourcetype == inputsourcetype.xmlstring:
        #See this article about XML detection heuristics
        #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
        #uri = uri or uuid4().urn
    elif self.sourcetype == inputsourcetype.iri or (siri and iri.matches_uri_syntax(obj)):
        self.iri = siri or obj
        self.stream = urlopen(self.iri)
    elif self.sourcetype == inputsourcetype.filename or (siri and iri.is_absolute(obj) and not os.path.isfile(obj)):
        #FIXME: convert path to URI
        self.iri = siri or iri.os_path_to_uri(obj)
        self.stream = open(obj, streamopenmode)
    elif self.sourcetype == inputsourcetype.string or isinstance(obj, (str, bytes)):
        self.stream = StringIO(obj)
        #If obj is beyond a certain length, don't even try it as a URI
        #if len(obj) < MAX_URI_LENGTH_FOR_HEURISTIC:
        #    self.iri = iri.os_path_to_uri(obj)
        #    self.stream = urlopen(siri)
    else:
        raise ValueError("Unable to recognize as an inputsource")
    return
    '''
    #Action function generator to multiplex a relationship at processing time
    def _origin(ctx):
        '''
        Versa action function. Utility to return the origin of the context's current link

        :param ctx: Versa context used in processing (e.g. includes the prototype link)
        :return: origin of the context's current link
        '''
        return ctx.current_link[ORIGIN]
    return _origin


NS_PATCH = lambda ns, k, v: (ns + k, v) if not iri.is_absolute(k) else (k, v)


def all_subfields(ctx):
    '''
    Utility to return a hash key from all subfields mentioned in the MARC prototype link

    :param ctx: Versa context used in processing (e.g. includes the prototype link)
    :return: Tuple of key/value tuples from the attributes; suitable for hashing
    '''
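#Usage sketch for NS_PATCH (hypothetical namespace and keys): a relative key
#gets the namespace prefixed; an absolute key passes through unchanged.
print(NS_PATCH('http://example.org/vocab/', 'name', 'Ada'))
#-> ('http://example.org/vocab/name', 'Ada')
print(NS_PATCH('http://example.org/vocab/', 'http://schema.org/name', 'Ada'))
#-> ('http://schema.org/name', 'Ada')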