Esempio n. 1
0
def linkreport(config=None, **kwargs):
    '''
    Coroutine which builds a report of the links encountered in each record
    sent to it, and writes the report to a file when it is closed.

    Prime with next(), then send() one params dict per record:
        params['model']: model whose match() yields the statements to scan
        params['workid']: ID of the work; used as the div id and for the title lookup
        params['instanceids']: sequence of instance IDs; if non-empty, the first
            one is used for the ISBN lookup

    config -- optional dict of configuration variables. config['output-file']
        is the path the report is written to once close() is called
    kwargs -- accepted but not used here
    '''
    #Any configuration variables passed in
    if config is None:
        config = {}
    #Initialize the output: one envelope per record, joined only when written
    envelopes = []
    try:
        while True:
            params = yield
            model = params['model']
            #Get the title
            #First get the work ID
            workid = params['workid']
            #simple_lookup() is a little helper for getting a property from a resource
            title = simple_lookup(model, workid, TITLE_REL)
            #Get the ISBN, just pick the first one
            isbn = ''
            if params['instanceids']:
                inst1 = params['instanceids'][0]
                isbn = simple_lookup(model, inst1, ISBN_REL)

            #Both attribute values must be closed with a quote, or the markup is malformed
            pieces = ['<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(workid, isbn, title)]
            #iterate over all the relationship targets to see which is a link
            for stmt in model.match():
                target = stmt[TARGET]
                if iri.matches_uri_syntax(target) and iri.split_uri_ref(target)[1] != BFHOST:
                    pieces.append('<a href="{0}">{0}</a>\n'.format(target))
            pieces.append('</div>\n')
            envelopes.append(''.join(pieces))
    except GeneratorExit:
        #Reached when close() is called on this coroutine
        with open(config['output-file'], "w") as outf:
            outf.write(''.join(envelopes))
Esempio n. 2
0
    def handle_record_links(self, loop, model, params):
        '''
        Task coroutine of the main event loop for MARC conversion, called once
        per record. Adds an entry to the report of links encountered in the
        MARC/XML: a <div> envelope is appended to self._outstr and every link
        target is added to self._links_found.

        loop -- event loop; not used in this method
        model -- raw Versa model with converted resource information from the
            MARC details from each MARC/XML record processed
        params -- parameters passed in from processing:
            params['workid']: ID of the work constructed from the MARC record
            params['instanceids']: list of IDs of instances constructed from the MARC record
        '''
        #The work ID keys the title lookup and becomes the div id
        work = params['workid']
        #simple_lookup() is a little helper for getting a property from a resource
        work_title = simple_lookup(model, work, TITLE_REL)
        #ISBN comes from the first instance, when there is one
        instance_ids = params['instanceids']
        first_isbn = simple_lookup(model, instance_ids[0], ISBN_REL) if instance_ids else ''

        pieces = ['<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(work, first_isbn, work_title)]
        #Any statement target with URI syntax that does not point at BFHOST counts as a link
        for statement in model.match():
            target = statement[TARGET]
            if not iri.matches_uri_syntax(target):
                continue
            if iri.split_uri_ref(target)[1] == BFHOST:
                continue
            self._links_found.add(target)
            pieces.append('<a href="{0}">{0}</a>\n'.format(target))
        pieces.append('</div>\n')
        self._outstr += ''.join(pieces)
Esempio n. 3
0
 def _abbreviate(self, rid):
     '''
     Abbreviate a relationship or resource ID target for efficient storage
     in the DB. Works only with a prefix/suffix split of hierarchical HTTP-like IRIs,
     e.g. 'http://example.org/spam/eggs' becomes something like '{a23}eggs'
     and afterward there will be an entry in the prefix map from 'a23' to 'http://example.org/spam/'
     The map can then easily be used with str.format

     rid -- value to abbreviate. Returned unchanged unless it is a str which
         contains '/' and matches URI syntax

     The prefix map is read from self._db['@_abbreviations'] and written back
     whenever a new prefix is allocated (using self._abbr_index as the counter).
     Returns the abbreviated string, with any braces in the tail doubled.
     '''
     if not isinstance(rid, str) or '/' not in rid or not iri.matches_uri_syntax(rid):
         return rid
     head, tail = rid.rsplit('/', 1)
     head += '/'
     pmap = self._db['@_abbreviations']
     assert pmap is not None
     #FIXME: probably called too often to do this every time
     inv_pmap = {v: k for k, v in pmap.items()}
     if head in inv_pmap:
         prefix = inv_pmap[head]
     else:
         prefix = f'a{self._abbr_index}'
         pmap[prefix] = head
         self._abbr_index += 1
         self._db['@_abbreviations'] = pmap
     #Double any literal braces in the tail so that a later str.format()
     #expansion reproduces them rather than reading them as replacement fields
     post_rid = '{' + prefix + '}' + tail.replace('{', '{{').replace('}', '}}')
     return post_rid
Esempio n. 4
0
 def _abbreviate(self, rid, txn):
     '''
     Shorten a relationship or resource ID target so it takes less room in
     the DB. Only hierarchical, HTTP-like IRIs are handled, by splitting at
     the last '/': 'http://example.org/spam/eggs' turns into something like
     '{a23}eggs', with the prefix map gaining an entry from 'a23' to
     'http://example.org/spam/'. Braces in the tail are doubled, so the
     result can be expanded again with str.format and the prefix map.

     rid -- value to abbreviate. Handed back untouched unless it is a str
         which contains '/' and matches URI syntax
     txn -- object whose get() and put() are used to read and store the
         msgpack-encoded prefix map under the key b'@_abbreviations'
     '''
     abbreviable = (isinstance(rid, str) and '/' in rid
                    and iri.matches_uri_syntax(rid))
     if not abbreviable:
         return rid
     base, _, leaf = rid.rpartition('/')
     base += '/'
     packed = txn.get(b'@_abbreviations')
     assert packed is not None
     abbrevs = msgpack.loads(packed, raw=False)
     #FIXME: probably called too often to do this every time
     by_base = {full: short for short, full in abbrevs.items()}
     if base not in by_base:
         #First time this base is seen: allocate the next prefix and persist the map
         prefix = f'a{self._abbr_index}'
         abbrevs[prefix] = base
         self._abbr_index += 1
         txn.put(b'@_abbreviations',
                 msgpack.dumps(abbrevs, use_bin_type=True))
     else:
         prefix = by_base[base]
     #Literal braces must be doubled to survive str.format
     escaped_leaf = leaf.replace('{', '{{')
     escaped_leaf = escaped_leaf.replace('}', '}}')
     return '{' + prefix + '}' + escaped_leaf
Esempio n. 5
0
 def _abbreviate(self, rid):
     '''
     Abbreviate a relationship or resource ID target for efficient storage
     in the DB. Works only with a prefix/suffix split of hierarchical HTTP-like IRIs,
     e.g. 'http://example.org/spam/eggs' becomes something like '{a23}eggs'
     and afterward there will be an entry in the prefix map from 'a23' to 'http://example.org/spam/'
     The map can then easily be used with str.format

     rid -- value to abbreviate. Returned unchanged unless it is a str which
         contains '/' and matches URI syntax

     The prefix map is read from the document in self._db_coll whose 'origin'
     is '@_abbreviations', and that document is replaced whenever a new
     prefix is allocated (using self._abbr_index as the counter).
     Returns the abbreviated string, with any braces in the tail doubled.
     '''
     #Without a '/' there is nothing to split on (e.g. a urn: or mailto: URI),
     #and the two-name unpack of rsplit() below would raise ValueError
     if not isinstance(rid, str) or '/' not in rid or not iri.matches_uri_syntax(rid):
         return rid
     head, tail = rid.rsplit('/', 1)
     head += '/'
     abbrev_obj = self._db_coll.find_one({'origin': '@_abbreviations'})
     assert abbrev_obj is not None
     pmap = abbrev_obj['map']
     #FIXME: probably called too often to do this every time
     inv_pmap = {v: k for k, v in pmap.items()}
     if head in inv_pmap:
         prefix = inv_pmap[head]
     else:
         prefix = f'a{self._abbr_index}'
         pmap[prefix] = head
         self._abbr_index += 1
         self._db_coll.replace_one({'origin': '@_abbreviations'}, {
             'origin': '@_abbreviations',
             'map': pmap
         })
     #Double any literal braces in the tail so that a later str.format()
     #expansion reproduces them rather than reading them as replacement fields
     post_rid = '{' + prefix + '}' + tail.replace('{', '{{').replace('}', '}}')
     return post_rid
Esempio n. 6
0
    def handle_record_links(self, loop, model, params):
        '''
        Task coroutine of the main event loop for MARC conversion, called once
        per record. Updates the report of links encountered in the MARC/XML by
        appending a <div> envelope to self._outstr, and adds each link target
        to self._links_found.

        loop -- event loop; not used in this method
        model -- raw Versa model with converted resource information from the
            MARC details from each MARC/XML record processed
        params -- parameters passed in from processing:
            params['workid']: ID of the work constructed from the MARC record
            params['instanceids']: list of IDs of instances constructed from the MARC record
        '''
        def is_link(target):
            #URI-shaped, and the second component reported by split_uri_ref()
            #(presumably the host/authority -- confirm) is not BFHOST
            return iri.matches_uri_syntax(target) and iri.split_uri_ref(
                target)[1] != BFHOST

        #Title is looked up on the work
        #simple_lookup() is a little helper for getting a property from a resource
        work_id = params['workid']
        title_text = simple_lookup(model, work_id, TITLE_REL)
        #ISBN is looked up on the first instance only; blank if there are none
        isbn_text = ''
        instances = params['instanceids']
        if instances:
            isbn_text = simple_lookup(model, instances[0], ISBN_REL)

        report_entry = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(
            work_id, isbn_text, title_text)
        #Scan the target of every statement in the model
        for statement in model.match():
            candidate = statement[TARGET]
            if is_link(candidate):
                self._links_found.add(candidate)
                report_entry += '<a href="{0}">{0}</a>\n'.format(candidate)
        report_entry += '</div>\n'
        self._outstr += report_entry
Esempio n. 7
0
    def __init__(self,
                 obj,
                 siri=None,
                 encoding=None,
                 streamopenmode='rb',
                 sourcetype=inputsourcetype.unknown):
        '''
        Set up self.stream and self.iri for an input source.

        obj - byte string, proper string (only if you really know what you're doing),
            file-like object (stream), file path or URI.
        siri - optional override IRI.  Base IRI for the input source will be set to
            this value
        encoding - accepted but not used by this constructor
        streamopenmode - mode passed to open() when obj is treated as a file path
        sourcetype - inputsourcetype value which forces obj to be treated as an
            IRI, a file name or a string; with the default (unknown) the
            checks below decide

        Raises ValueError if obj is an empty string, or is not recognized.

        >>> from amara3 import inputsource
        >>> inp = inputsource('abc')
        >>> inp.stream
        <_io.StringIO object at 0x1056fbf78>
        >>> inp.iri
        >>> print(inp.iri)
        None
        >>> inp = inputsource(['abc', 'def']) #Now multiple streams in one source
        >>> inp.stream
        <_io.StringIO object at 0x1011aff78>
        >>> print(inp.iri)
        None
        >>> inp = next(inp)
        >>> inp.stream
        <_io.StringIO object at 0x1011af5e8>
        >>> print(inp.iri)
        None
        >>>

        NOTE(review): nothing in this constructor handles a list, so the
        multiple-stream example above depends on code elsewhere -- confirm.
        '''
        # from amara3 import inputsource; inp = inputsource('foo.zip')
        # from amara3 import inputsource; inp = inputsource('test/resource/std-examples.zip')
        # s = inp.stream.read(100)
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<!-- edited with XML Spy v4.3 U (http://www.xmlspy.com) by M'
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<collection xmlns="http://www.loc.gov/MARC21/slim">\r\n  <reco'

        self.stream = None
        self.iri = siri
        self.sourcetype = sourcetype

        if obj in ('', b''):
            raise ValueError("Cannot parse an empty string as XML")

        if hasattr(obj, 'read'):
            #Already a stream; use it as is
            #Create dummy Uri to use as base
            #uri = uri or uuid4().urn
            self.stream = obj
        #elif sourcetype == inputsourcetype.xmlstring:
        #See this article about XML detection heuristics
        #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
        #uri = uri or uuid4().urn
        elif self.sourcetype == inputsourcetype.iri or (
                siri and iri.matches_uri_syntax(obj)):
            self.iri = siri or obj
            #Fetch the resource named by obj; siri only overrides the base IRI,
            #as in the file name branch below. (Passing the bare name `iri`
            #here would hand the iri module itself to urlopen.)
            self.stream = urlopen(obj)
        #NOTE(review): `not os.path.isfile(obj)` routes absolute names which
        #are NOT existing files to open() -- confirm this is intended
        elif self.sourcetype == inputsourcetype.filename or (
                siri and iri.is_absolute(obj) and not os.path.isfile(obj)):
            #FIXME: convert path to URI
            self.iri = siri or iri.os_path_to_uri(obj)
            self.stream = open(obj, streamopenmode)
        elif self.sourcetype == inputsourcetype.string or isinstance(
                obj, (str, bytes)):
            if isinstance(obj, bytes):
                #StringIO rejects bytes with TypeError; byte strings need BytesIO
                from io import BytesIO
                self.stream = BytesIO(obj)
            else:
                self.stream = StringIO(obj)
            #If obj is beyond a certain length, don't even try it as a URI
            #if len(obj) < MAX_URI_LENGTH_FOR_HEURISTIC:
            #    self.iri = iri.os_path_to_uri(obj)
            #    self.stream = urlopen(siri)
        else:
            raise ValueError("Unable to recognize as an inputsource")
Esempio n. 8
0
    def __init__(self, obj, siri=None, encoding=None, streamopenmode='rb',
                    sourcetype=inputsourcetype.unknown):
        '''
        Set up self.stream and self.iri for an input source.

        obj - byte string, proper string (only if you really know what you're doing),
            file-like object (stream), file path or URI.
        siri - optional override IRI.  Base IRI for the input source will be set to
            this value
        encoding - accepted but not used by this constructor
        streamopenmode - mode passed to open() when obj is treated as a file path
        sourcetype - inputsourcetype value which forces obj to be treated as an
            IRI, a file name or a string; with the default (unknown) the
            checks below decide

        Raises ValueError if obj is an empty string, or is not recognized.

        >>> from amara3 import inputsource
        >>> inp = inputsource('abc')
        >>> inp.stream
        <_io.StringIO object at 0x1056fbf78>
        >>> inp.iri
        >>> print(inp.iri)
        None
        >>> inp = inputsource(['abc', 'def']) #Now multiple streams in one source
        >>> inp.stream
        <_io.StringIO object at 0x1011aff78>
        >>> print(inp.iri)
        None
        >>> inp = next(inp)
        >>> inp.stream
        <_io.StringIO object at 0x1011af5e8>
        >>> print(inp.iri)
        None
        >>>

        NOTE(review): nothing in this constructor handles a list, so the
        multiple-stream example above depends on code elsewhere -- confirm.
        '''
        # from amara3 import inputsource; inp = inputsource('foo.zip')
        # from amara3 import inputsource; inp = inputsource('test/resource/std-examples.zip')
        # s = inp.stream.read(100)
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<!-- edited with XML Spy v4.3 U (http://www.xmlspy.com) by M'
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<collection xmlns="http://www.loc.gov/MARC21/slim">\r\n  <reco'

        self.stream = None
        self.iri = siri
        self.sourcetype = sourcetype

        if obj in ('', b''):
            raise ValueError("Cannot parse an empty string as XML")

        if hasattr(obj, 'read'):
            #Already a stream; use it as is
            #Create dummy Uri to use as base
            #uri = uri or uuid4().urn
            self.stream = obj
        #elif sourcetype == inputsourcetype.xmlstring:
            #See this article about XML detection heuristics
            #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
            #uri = uri or uuid4().urn
        elif self.sourcetype == inputsourcetype.iri or (siri and iri.matches_uri_syntax(obj)):
            self.iri = siri or obj
            #Fetch the resource named by obj; siri only overrides the base IRI,
            #as in the file name branch below. (Passing the bare name `iri`
            #here would hand the iri module itself to urlopen.)
            self.stream = urlopen(obj)
        #NOTE(review): `not os.path.isfile(obj)` routes absolute names which
        #are NOT existing files to open() -- confirm this is intended
        elif self.sourcetype == inputsourcetype.filename or (siri and iri.is_absolute(obj) and not os.path.isfile(obj)):
            #FIXME: convert path to URI
            self.iri = siri or iri.os_path_to_uri(obj)
            self.stream = open(obj, streamopenmode)
        elif self.sourcetype == inputsourcetype.string or isinstance(obj, (str, bytes)):
            if isinstance(obj, bytes):
                #StringIO rejects bytes with TypeError; byte strings need BytesIO
                from io import BytesIO
                self.stream = BytesIO(obj)
            else:
                self.stream = StringIO(obj)
            #If obj is beyond a certain length, don't even try it as a URI
            #if len(obj) < MAX_URI_LENGTH_FOR_HEURISTIC:
            #    self.iri = iri.os_path_to_uri(obj)
            #    self.stream = urlopen(siri)
        else:
            raise ValueError("Unable to recognize as an inputsource")