def linkreport(config=None, **kwargs):
    '''
    Coroutine which builds a report of the links encountered in converted records

    Prime it with next(), then send() one params dict per record. When close()
    is called the accumulated report is written to config['output-file'].

    config -- dict of configuration variables; only 'output-file' is read, and
        only at close() time
    kwargs -- accepted for interface compatibility; not used here

    Each dict sent in must provide:
        params['model']: model object queried via simple_lookup() and model.match()
        params['workid']: ID of the work, used as the div id
        params['instanceids']: list of instance IDs; only the first one is
            consulted, for the ISBN
    '''
    #Any configuration variables passed in
    if config is None:
        config = {}
    #One finished fragment per record; joined only once, when the report is written
    fragments = []
    try:
        while True:
            params = yield
            model = params['model']
            #Get the title
            #First get the work ID
            workid = params['workid']
            #simple_lookup() is a little helper for getting a property from a resource
            title = simple_lookup(model, workid, TITLE_REL)
            #Get the ISBN, just pick the first one
            isbn = ''
            if params['instanceids']:
                isbn = simple_lookup(model, params['instanceids'][0], ISBN_REL)
            #Note the closing quote after the id value; it was missing before,
            #which made the id attribute swallow the isbn attribute
            envelope = ['<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(workid, isbn, title)]
            #iterate over all the relationship targets to see which is a link
            for stmt in model.match():
                target = stmt[TARGET]
                #Keep URI-like targets whose second split_uri_ref() component is
                #not BFHOST (presumably the host part -- verify against iri module)
                if iri.matches_uri_syntax(target) and iri.split_uri_ref(target)[1] != BFHOST:
                    envelope.append('<a href="{0}">{0}</a>\n'.format(target))
            envelope.append('</div>\n')
            #Only complete records make it into the report
            fragments.append(''.join(envelope))
    except GeneratorExit:
        #Reached when close() is called on this coroutine
        with open(config['output-file'], "w") as outf:
            outf.write(''.join(fragments))
def handle_record_links(self, loop, model, params):
    '''
    Task of the main event loop for MARC conversion; adds one record's
    links to the running link report

    loop -- event loop the task belongs to (not used in this method)
    model -- raw Versa model with converted resource information from the
        MARC/XML record being processed
    params -- parameters passed in from processing:
        params['workid']: ID of the work constructed from the MARC record
        params['instanceids']: list of IDs of instances constructed from the
            MARC record; only the first is consulted, for the ISBN

    Each qualifying link target is added to self._links_found, and the finished
    fragment is appended to self._outstr
    '''
    work_id = params['workid']
    #simple_lookup() fetches a single property value from a resource
    work_title = simple_lookup(model, work_id, TITLE_REL)

    #ISBN comes from the first instance, if there is one
    instance_ids = params['instanceids']
    first_isbn = simple_lookup(model, instance_ids[0], ISBN_REL) if instance_ids else ''

    pieces = ['<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(work_id, first_isbn, work_title)]

    #Check every relationship target to see whether it is a link
    for statement in model.match():
        target = statement[TARGET]
        if iri.matches_uri_syntax(target) and iri.split_uri_ref(target)[1] != BFHOST:
            self._links_found.add(target)
            pieces.append('<a href="{0}">{0}</a>\n'.format(target))

    pieces.append('</div>\n')
    #The report text is only extended once the whole record has been processed
    self._outstr += ''.join(pieces)
    return
def _abbreviate(self, rid):
    '''
    Abbreviate a relationship or resource ID target for efficient storage
    in the DB. Works only with a prefix/suffix split of hierarchical HTTP-like IRIs,
    e.g. 'http://example.org/spam/eggs' becomes something like '{a23}eggs'
    and afterward there will be an entry in the prefix map from 'a23' to 'http://example.org/spam/'
    The map can then easily be used with str.format

    rid -- value to abbreviate. Anything that is not a str, has no '/', or
        does not match URI syntax is returned unchanged

    Returns the abbreviated string. Literal braces in the part after the last
    '/' are doubled so that str.format restores them rather than treating
    them as replacement fields

    Side effects: when the prefix is new it is added to the map stored under
    self._db['@_abbreviations'], and self._abbr_index is incremented
    '''
    if not isinstance(rid, str) or '/' not in rid or not iri.matches_uri_syntax(rid):
        return rid

    head, tail = rid.rsplit('/', 1)
    head += '/'

    pmap = self._db['@_abbreviations']
    assert pmap is not None, 'abbreviation map is missing from the DB'
    #FIXME: probably called too often to do this every time
    inv_pmap = {v: k for k, v in pmap.items()}
    if head in inv_pmap:
        prefix = inv_pmap[head]
    else:
        prefix = f'a{self._abbr_index}'
        pmap[prefix] = head
        self._abbr_index += 1
        #The map only needs storing again when it has actually changed
        self._db['@_abbreviations'] = pmap

    #Without doubling, a '{' or '}' in the tail would break str.format expansion
    escaped_tail = tail.replace('{', '{{').replace('}', '}}')
    return '{' + prefix + '}' + escaped_tail
def _abbreviate(self, rid, txn):
    '''
    Shorten a relationship or resource ID so it takes less room in the DB.

    Only hierarchical, HTTP-like IRIs are handled: the text up to and including
    the last '/' is swapped for a short key held in the prefix map, e.g.
    'http://example.org/spam/eggs' becomes something like '{a23}eggs', with
    the prefix map then holding 'a23' -> 'http://example.org/spam/'.
    The result is shaped so the map can be applied with str.format; literal
    braces in the trailing part are doubled for that reason.

    rid -- value to abbreviate; returned untouched unless it is a str that
        contains '/' and matches URI syntax
    txn -- transaction-like object; its get()/put() are used to read and
        update the msgpack-encoded prefix map under the key b'@_abbreviations'
    '''
    can_abbreviate = isinstance(rid, str) and '/' in rid and iri.matches_uri_syntax(rid)
    if not can_abbreviate:
        return rid

    namespace, _, local_part = rid.rpartition('/')
    namespace += '/'

    packed_map = txn.get(b'@_abbreviations')
    assert packed_map is not None
    abbreviations = msgpack.loads(packed_map, raw=False)

    #FIXME: probably called too often to do this every time
    key_by_namespace = {v: k for k, v in abbreviations.items()}
    try:
        prefix = key_by_namespace[namespace]
    except KeyError:
        #First sighting of this namespace: mint a key and persist the grown map
        prefix = f'a{self._abbr_index}'
        abbreviations[prefix] = namespace
        self._abbr_index += 1
        txn.put(b'@_abbreviations', msgpack.dumps(abbreviations, use_bin_type=True))

    #Double any literal braces so a later str.format leaves them intact
    escaped_local = local_part.translate(str.maketrans({'{': '{{', '}': '}}'}))
    return ''.join(('{', prefix, '}', escaped_local))
def _abbreviate(self, rid):
    '''
    Abbreviate a relationship or resource ID target for efficient storage
    in the DB. Works only with a prefix/suffix split of hierarchical HTTP-like IRIs,
    e.g. 'http://example.org/spam/eggs' becomes something like '{a23}eggs'
    and afterward there will be an entry in the prefix map from 'a23' to 'http://example.org/spam/'
    The map can then easily be used with str.format

    rid -- value to abbreviate. Anything that is not a str, has no '/', or
        does not match URI syntax is returned unchanged

    Returns the abbreviated string. Literal braces in the part after the last
    '/' are doubled so that str.format restores them rather than treating
    them as replacement fields

    Side effects: when the prefix is new, the '@_abbreviations' document in
    self._db_coll is replaced with the grown map and self._abbr_index is
    incremented
    '''
    #The '/' test guards the two-way unpacking below: a slash-less URI such as
    #'urn:isbn:1' would otherwise raise ValueError. It is checked before the
    #URI syntax test because it is the cheaper of the two
    if not isinstance(rid, str) or '/' not in rid or not iri.matches_uri_syntax(rid):
        return rid

    head, tail = rid.rsplit('/', 1)
    head += '/'

    abbrev_obj = self._db_coll.find_one({'origin': '@_abbreviations'})
    assert abbrev_obj is not None, 'abbreviation map is missing from the DB'
    pmap = abbrev_obj['map']
    #FIXME: probably called too often to do this every time
    inv_pmap = {v: k for k, v in pmap.items()}
    if head in inv_pmap:
        prefix = inv_pmap[head]
    else:
        prefix = f'a{self._abbr_index}'
        pmap[prefix] = head
        self._abbr_index += 1
        #The stored document only needs replacing when the map has changed
        self._db_coll.replace_one(
            {'origin': '@_abbreviations'},
            {'origin': '@_abbreviations', 'map': pmap},
        )

    #Without doubling, a '{' or '}' in the tail would break str.format expansion
    escaped_tail = tail.replace('{', '{{').replace('}', '}}')
    return '{' + prefix + '}' + escaped_tail
def handle_record_links(self, loop, model, params):
    '''
    Event loop task for MARC conversion which extends the link report with
    the links found in one converted record

    loop -- event loop running the task (not referenced in this method)
    model -- raw Versa model with converted resource information from the
        MARC/XML record being processed
    params -- parameters passed in from processing:
        params['workid']: ID of the work constructed from the MARC record
        params['instanceids']: list of IDs of instances constructed from the
            MARC record; the ISBN is taken from the first one only

    Updates self._links_found with each qualifying target and appends the
    record's markup to self._outstr
    '''
    #Work ID first, then its title via the simple_lookup() property helper
    wid = params['workid']
    title_text = simple_lookup(model, wid, TITLE_REL)

    #Default to a blank ISBN when the record produced no instances
    isbn_text = ''
    instances = params['instanceids']
    if instances:
        isbn_text = simple_lookup(model, instances[0], ISBN_REL)

    opening = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(
        wid, isbn_text, title_text)
    anchors = []

    #Walk all relationship targets, skipping whatever is not a reportable link
    for stmt in model.match():
        link = stmt[TARGET]
        if not iri.matches_uri_syntax(link):
            continue
        if iri.split_uri_ref(link)[1] == BFHOST:
            continue
        self._links_found.add(link)
        anchors.append('<a href="{0}">{0}</a>\n'.format(link))

    #Single append at the end, so a failure part-way leaves the report text as it was
    self._outstr += opening + ''.join(anchors) + '</div>\n'
    return
def __init__(self, obj, siri=None, encoding=None, streamopenmode='rb',
             sourcetype=inputsourcetype.unknown):
    '''
    Set up the input source, choosing how to obtain a stream from obj

    obj - byte string, proper string (only if you really know what you're doing),
        file-like object (stream), file path or URI.
    siri - optional override IRI. Base IRI for the input source will be set to this value
    encoding - accepted but not used by this initializer
    streamopenmode - mode passed to open() when obj is handled as a file path
    sourcetype - inputsourcetype member forcing how obj is interpreted.
        With the default (unknown), the IRI and file path heuristics are only
        tried when siri is given; otherwise a str or bytes obj is wrapped in
        an in-memory stream

    Raises ValueError if obj is an empty string, or cannot be recognized
    as any kind of input source

    >>> from amara3 import inputsource
    >>> inp = inputsource('abc')
    >>> inp.stream
    <_io.StringIO object at 0x1056fbf78>
    >>> print(inp.iri)
    None

    NOTE(review): earlier versions of this docstring also showed a list of
    strings being accepted as multiple streams; nothing in this initializer
    handles a list, so confirm where (or whether) that is supported
    '''
    #Local import so the bytes branch below does not depend on module-level imports
    from io import BytesIO

    self.stream = None
    self.iri = siri
    self.sourcetype = sourcetype

    if obj in ('', b''):
        raise ValueError("Cannot parse an empty string as XML")

    if hasattr(obj, 'read'):
        #Already a stream; use as is, and the base IRI stays as whatever siri was
        self.stream = obj
    elif self.sourcetype == inputsourcetype.iri or (
            siri and iri.matches_uri_syntax(obj)):
        self.iri = siri or obj
        #Fetch from obj itself; siri only overrides the base IRI, in the same
        #way the file path branch opens obj. This used to pass the iri module
        self.stream = urlopen(obj)
    elif self.sourcetype == inputsourcetype.filename or (
            siri and iri.is_absolute(obj) and not os.path.isfile(obj)):
        #NOTE(review): the heuristic picks this branch when obj is *not* an
        #existing file, which looks inverted -- confirm the intent
        #FIXME: convert path to URI
        self.iri = siri or iri.os_path_to_uri(obj)
        self.stream = open(obj, streamopenmode)
    elif isinstance(obj, bytes):
        #StringIO rejects bytes with a TypeError, so byte strings need BytesIO
        self.stream = BytesIO(obj)
    elif self.sourcetype == inputsourcetype.string or isinstance(obj, str):
        self.stream = StringIO(obj)
    else:
        raise ValueError("Unable to recognize as an inputsource")
def __init__(self, obj, siri=None, encoding=None, streamopenmode='rb',
             sourcetype=inputsourcetype.unknown):
    '''
    Set up the input source, choosing how to obtain a stream from obj

    obj - byte string, proper string (only if you really know what you're doing),
        file-like object (stream), file path or URI.
    siri - optional override IRI. Base IRI for the input source will be set to this value
    encoding - accepted but not used by this initializer
    streamopenmode - mode passed to open() when obj is handled as a file path
    sourcetype - inputsourcetype member forcing how obj is interpreted.
        With the default (unknown), the IRI and file path heuristics are only
        tried when siri is given; otherwise a str or bytes obj is wrapped in
        an in-memory stream

    Raises ValueError if obj is an empty string, or cannot be recognized
    as any kind of input source

    >>> from amara3 import inputsource
    >>> inp = inputsource('abc')
    >>> inp.stream
    <_io.StringIO object at 0x1056fbf78>
    >>> print(inp.iri)
    None

    NOTE(review): earlier versions of this docstring also showed a list of
    strings being accepted as multiple streams; nothing in this initializer
    handles a list, so confirm where (or whether) that is supported
    '''
    #Local import so the bytes branch below does not depend on module-level imports
    from io import BytesIO

    self.stream = None
    self.iri = siri
    self.sourcetype = sourcetype

    if obj in ('', b''):
        raise ValueError("Cannot parse an empty string as XML")

    if hasattr(obj, 'read'):
        #Already a stream; use as is, and the base IRI stays as whatever siri was
        self.stream = obj
    elif self.sourcetype == inputsourcetype.iri or (
            siri and iri.matches_uri_syntax(obj)):
        self.iri = siri or obj
        #Fetch from obj itself; siri only overrides the base IRI, in the same
        #way the file path branch opens obj. This used to pass the iri module
        self.stream = urlopen(obj)
    elif self.sourcetype == inputsourcetype.filename or (
            siri and iri.is_absolute(obj) and not os.path.isfile(obj)):
        #NOTE(review): the heuristic picks this branch when obj is *not* an
        #existing file, which looks inverted -- confirm the intent
        #FIXME: convert path to URI
        self.iri = siri or iri.os_path_to_uri(obj)
        self.stream = open(obj, streamopenmode)
    elif isinstance(obj, bytes):
        #StringIO rejects bytes with a TypeError, so byte strings need BytesIO
        self.stream = BytesIO(obj)
    elif self.sourcetype == inputsourcetype.string or isinstance(obj, str):
        self.stream = StringIO(obj)
    else:
        raise ValueError("Unable to recognize as an inputsource")