Example #1
    def handle_record_links(self, loop, model, params):
        '''
        Task coroutine of the main event loop for MARC conversion, called with
        details from each MARC/XML record processed. In this case, update a
        report of links encountered in the MARC/XML.

        model -- raw Versa model with converted resource information from the MARC record
        params -- parameters passed in from processing:
            params['workid']: ID of the work constructed from the MARC record
            params['instanceids']: list of IDs of instances constructed from the MARC record
        '''
        #print ('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
        items = {}
        #Get the title
        #First get the work ID
        workid = params['workid']
        #simple_lookup() is a little helper for getting a property from a resource
        title = simple_lookup(model, workid, TITLE_REL)
        #Get the ISBN, just pick the first one
        isbn = ''
        if params['instanceids']:
            inst1 = params['instanceids'][0]
            isbn = simple_lookup(model, inst1, ISBN_REL)

        envelope = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(workid, isbn, title)
        #iterate over all the relationship targets to see which is a link
        for stmt in model.match():
            if iri.matches_uri_syntax(stmt[TARGET]) and iri.split_uri_ref(stmt[TARGET])[1] != BFHOST:
                self._links_found.add(stmt[TARGET])
                envelope += '<a href="{0}">{0}</a>\n'.format(stmt[TARGET], stmt[TARGET])
        envelope += '</div>\n'
        self._outstr += envelope
        #print ('DONE BF_MARCREC_TASK', linkreport.PLUGIN_ID)
        return
Example #2
 def __init__(self, frontpage, **kwargs):
     self._frontpage = frontpage
     _, self._fphost, _, _, _ = iri.split_uri_ref(self._frontpage)
     self.outfolder = kwargs['csvexport']
     from bibframe import BFZ, BL, BA, REL, MARC
     self.rules = [
         (BL + 'controlCode', 'controlCode'),
         (BL + 'instantiates', 'instantiates'),
         (BL + 'link', 'link'),
         (BL + 'title', 'title'),
         (BL + 'name', 'name'),
         (BL + 'providerDate', 'providerDate'),
         (BL + 'providerPlace', 'providerPlace'),
         (BL + 'creator', 'creator'),
         (BL + 'genre', 'genre'),
         (BL + 'language', 'language'),
         (BL + 'subject', 'subject'),
         (BL + 'controlCode', 'controlCode'),
         (BL + 'focus', 'focus'),
         (BL + 'date', 'date'),
         (MARC + 'isbn', 'isbn'),
         (MARC + 'lccn', 'lccn'),
         (MARC + 'titleStatement', 'titleStatement'),
         (MARC + 'lcCallNumber', 'lcCallNumber'),
         (MARC + 'lcItemNumber', 'lcItemNumber'),
         (MARC + 'literaryForm', 'literaryForm'),
         (MARC + 'seriesStatement', 'seriesStatement'),
         (MARC + 'formSubdivision', 'formSubdivision'),
     ]
     return
Example #3
def linkreport(config=None, **kwargs):
    #Any configuration variables passed in
    if config is None: config = {}
    try:
        #Initialize the output
        outstr = ''
        while True:
            params = yield
            model = params['model']
            items = {}
            #Get the title
            #First get the work ID
            workid = params['workid']
            #simple_lookup() is a little helper for getting a property from a resource
            title = simple_lookup(model, workid, TITLE_REL)
            #Get the ISBN, just pick the first one
            isbn = ''
            if params['instanceids']:
                inst1 = params['instanceids'][0]
                isbn = simple_lookup(model, inst1, ISBN_REL)

            envelope = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(workid, isbn, title)
            #iterate over all the relationship targets to see which is a link
            for stmt in model.match():
                if iri.matches_uri_syntax(stmt[TARGET]) and iri.split_uri_ref(stmt[TARGET])[1] != BFHOST:
                    envelope += '<a href="{0}">{0}</a>\n'.format(stmt[TARGET], stmt[TARGET])
            envelope += '</div>\n'
            outstr += envelope
    except GeneratorExit:
        #Reached when close() is called on this coroutine
        with open(config['output-file'], "w") as outf:
            outf.write(outstr)
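A minimal, hypothetical driver for the linkreport coroutine above; the converted_records iterable and the output file name are illustrative, not part of the original code:

reporter = linkreport(config={'output-file': 'linkreport.html'})
next(reporter)  #Prime the coroutine so it pauses at its first yield
#converted_records is assumed to yield (Versa model, work ID, list of instance IDs) per MARC record
for model, workid, instanceids in converted_records:
    reporter.send({'model': model, 'workid': workid, 'instanceids': instanceids})
reporter.close()  #Raises GeneratorExit inside the coroutine, which then writes the accumulated report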
Example #5
def test_normalize_case():
    for uri, expected0, expected1 in case_normalization_tests:
        testname = uri
        uri = iri.split_uri_ref(uri)
        assert expected0 == iri.unsplit_uri_ref(
            iri.normalize_case(uri)), testname
        assert expected1 == iri.unsplit_uri_ref(
            iri.normalize_case(uri, doHost=1)), testname + ' (host too)'
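For context, each entry in case_normalization_tests pairs an input URI reference with its expected form after case normalization, without and then with host lowercasing. A hypothetical entry, assuming RFC 3986-style case normalization (scheme lowercased, hex digits in percent-encoded octets uppercased, host lowercased only when doHost is set); the values are illustrative:

case_normalization_tests = [
    #(input URI, expected from normalize_case(uri), expected from normalize_case(uri, doHost=1))
    ('HTTP://Example.ORG/a%2fb', 'http://Example.ORG/a%2Fb', 'http://example.org/a%2Fb'),
]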
Example #6
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #csvexport_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)

            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                model = memory.connection()
                rdfalite.toversa(body, model, respurl)
                #Lock the file for writing
                resstem = resid[:HASH_WIDTH]
                csvexport_sink.locks.setdefault(resstem, Lock())
                #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
                print('Awaiting lock on {}; TASK [{}].'.format(
                    resstem, task_id),
                      file=sys.stderr)
                await csvexport_sink.locks[resstem]
                print('Acquired lock on {}; TASK [{}].'.format(
                    resstem, task_id),
                      file=sys.stderr)

                try:
                    resstem_fpath = os.path.join(self.outfolder,
                                                 resstem + '.csv')
                    csvexists = os.path.exists(resstem_fpath)
                    #with gzip.open(resstem_fpath, 'at', newline='') as resstem_fp:
                    with open(resstem_fpath, 'at', newline='') as resstem_fp:
                        resstem_csv = csv.writer(resstem_fp,
                                                 delimiter=',',
                                                 quotechar='"',
                                                 quoting=csv.QUOTE_MINIMAL)
                        vcsv.write(model,
                                   resstem_csv,
                                   self.rules,
                                   not csvexists,
                                   base=respurl,
                                   logger=csvexport_sink.logger)
                finally:
                    csvexport_sink.locks[resstem].release()
                    #csvexport_sink.logger.debug('Released lock on {}; TASK [{}].'.format(resstem, task_id))
                    print('Released lock on {}; TASK [{}].'.format(
                        resstem, task_id),
                          file=sys.stderr)

            #self.save_ntriples()
            return linkset
        return None
Example #7
 def __init__(self, baseurl=None):
     if baseurl:
         model, _ = load_rdfa_page(baseurl)
         if not model:
             raise RuntimeError(baseurl, 'doesn\'t appear to be a Library.Link site')
         #<dd property="dcterms:modified">2018-04-17T04:17:32Z</dd>
         
         self.lastmod = next(versautil.lookup(model, None, 'http://purl.org/dc/terms/modified'), None)
         self.sitemap = iri.absolutize('/harvest/sitemap.xml', baseurl)
         self.url = baseurl
         protocol, self.host, path, query, fragment = iri.split_uri_ref(baseurl)
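Assuming this is the constructor of the liblink_site class used in Example #9, typical use would look roughly as follows (the URL is illustrative, and constructing the object fetches and parses the site's front page via load_rdfa_page):

site = liblink_site('http://link.example.org/')  #Hypothetical Library.Link site URL
print(site.url, site.host, site.sitemap, site.lastmod)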
Example #8
    async def lln_handle_one_link(self, source, link):
        _, fphost, _, _, _ = iri.split_uri_ref(self._front_page)
        connector = aiohttp.TCPConnector(verify_ssl=False)
        #FIXME: Switch to the line below when we can use 3.6 across the board
        uastring = random.choice(STACKED_USER_AGENTS)
        #uastring = random.choices(USER_AGENT_STRINGS, USER_AGENT_WEIGHTS)
        headers = {'User-Agent': uastring, 'Referer': source}
        if not source:
            del headers['Referer']
        body = None
        try:
            async with aiohttp.ClientSession(headers=headers,
                                             connector=connector) as session:
                try:
                    continue_to_get = False
                    async with session.head(link) as resp:
                        #self._logger.debug('[TASK {}] Content type ({}): {}'.format(self._task_id, link, resp.headers.get('CONTENT-TYPE')))
                        #LIBRARY_LINK_HEADER tests it's an LLn page in the first place
                        if resp.status == 200 and resp.headers.get(
                                'CONTENT-TYPE'
                        ) in HTML_CTYPES and LIBRARY_LINK_HEADER in resp.headers:
                            #with async_timeout.timeout(10):
                            continue_to_get = True
                    if continue_to_get:
                        async with session.get(link) as resp:
                            respurl = str(resp.url)
                            #Handle possible redirection
                            if respurl not in self._seen:
                                body = await resp.text()  #.read()
                except (aiohttp.ClientOSError, aiohttp.ClientResponseError,
                        aiohttp.client_exceptions.ServerDisconnectedError,
                        aiohttp.client_exceptions.InvalidURL) as e:
                    self._logger.debug('Error: {} [TASK {}] -> {}'.format(
                        link, self._task_id, repr(e)))

            if body:
                ls = await self._sink.send(
                    (body, respurl, resp.headers, source, self._task_id))
                #Trim links which have already been seen when queued (saves memory)
                if ls:
                    #XXX: Use set intersection?
                    self._linkset_q.put_nowait(
                        (respurl,
                         liblink_set(link for link in ls
                                     if link not in self._seen)))
        except Exception as e:
            self._logger.exception(repr(e))
            self._logger.debug(
                'Above error in context of TASK {}, LINK {}, source {}'.format(
                    self._task_id, link, source))
Example #9
 def sink(accumulator):
     while True:
         e = yield
         loc = next(select_name(e, 'loc'))
         lastmod = next(select_name(e, 'lastmod'))
         s = liblink_site()
         s.sitemap = loc.xml_value
         s.url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
         s.base_url = s.url #Legacy property name
         #Early warning for funky URLs breaking stuff downstream
         assert not tail
         protocol, s.host, path, query, fragment = iri.split_uri_ref(s.sitemap)
         s.lastmod = lastmod.xml_value
         accumulator.append(s)
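A sketch of how a coroutine sink like this might be driven; parsing the sitemap XML into <url> elements is assumed and not shown here:

sites = []
s = sink(sites)
next(s)  #Prime the coroutine so it pauses at its first yield
for url_elem in sitemap_url_elements:  #Assumed iterable of parsed <url> elements from the sitemap
    s.send(url_elem)
#sites now holds one liblink_site record per sitemap entry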
Example #10
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #csvexport_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
            
            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                model = memory.connection()
                rdfalite.toversa(body, model, respurl)
                #Lock the file for writing
                resstem = resid[:HASH_WIDTH]
                csvexport_sink.locks.setdefault(resstem, Lock())
                #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
                print('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
                await csvexport_sink.locks[resstem]
                print('Acquired lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)

                try:
                    resstem_fpath = os.path.join(self.outfolder, resstem + '.csv')
                    csvexists = os.path.exists(resstem_fpath)
                    #with gzip.open(resstem_fpath, 'at', newline='') as resstem_fp:
                    with open(resstem_fpath, 'at', newline='') as resstem_fp:
                        resstem_csv = csv.writer(resstem_fp, delimiter=',',
                                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        vcsv.write(model, resstem_csv, self.rules, not csvexists, base=respurl, logger=csvexport_sink.logger)
                finally:
                    csvexport_sink.locks[resstem].release()
                    #csvexport_sink.logger.debug('Released lock on {}; TASK [{}].'.format(resstem, task_id))
                    print('Released lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)

            #self.save_ntriples()
            return linkset
        return None
Example #11
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #ntriplify_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)

            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                #Lock the file for writing
                resstem = resid[:3]
                ntriplify_sink.locks.setdefault(resstem, Lock())
                await ntriplify_sink.locks[resstem]
                try:
                    #Note:
                    #timeit.timeit('rdfalite.toversa(open("index.html").read(), model, "http://link.delawarelibrary.org/portal/Nest-Esther-Ehrlich-overdrive-ebook/F-h_bGCl5lk/")', setup='from versa.driver import memory; from versa.reader import rdfalite; model = memory.connection()', number=10)
                    #4.412366830001702
                    #timeit.timeit('g = rdflib.Graph(); g.parse("index.html", format="html")', setup='import rdflib', number=10)
                    #[snip tons of warnings]
                    #16.82040351499745
                    #IOW Versa is 4X faster than RDFlib for this task, and more robust
                    with open(os.path.join(self.outfolder, resstem + '.nt'),
                              'a') as resstem_fp:
                        model = memory.connection()
                        rdfalite.toversa(body, model, respurl)
                        ntriples.write(model,
                                       out=resstem_fp,
                                       base=respurl,
                                       logger=ntriplify_sink.logger)
                finally:
                    ntriplify_sink.locks[resstem].release()

            #self.save_ntriples()
            return linkset
        return None
Example #12
    def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if LIBRARY_LINK_HEADER not in respheaders:
            #Not even an LLN page at all
            return
        linkset = None
        if self._fphost == respurlhost:
            output_model = memory.connection()
            quickinfo_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
            #Subpage of the target site
            rdfalite.toversa(body, output_model, respurl)
            resname = versautil.simple_lookup(output_model, respurl, SCHEMAORG + 'name')
            print(respurl, '|', resname, file=quickinfo_sink.outfp)
            #orgentity = util.simple_lookup_byvalue(model, RDFTYPE, SCHEMAORG + 'Organization')
            #name = util.simple_lookup(model, orgentity, BL + 'name')
            #name = util.simple_lookup(model, baseurl + '#_default', BL + 'name')

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
        return linkset
Example #13
def llnurl_ident(url):
    '''
    Return the identifying pair of (site, hash) from an LLN URL

    >>> from librarylink.util import llnurl_ident
    >>> llnurl_ident('http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/')
    ('link.worthingtonlibraries.org', '9bz8W30aSZY')
    >>> llnurl_ident('http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/')
    ('link.worthingtonlibraries.org', 'cZlfLtSpcng')
    '''
    scheme, host, path, query, fragment = iri.split_uri_ref(url)
    try:
        if path.startswith('/resource/'):
            rhash = path.partition('/resource/')[-1].split('/')[0]
        elif '/portal/' in url:
            rhash = path.partition('/portal/')[-1].split('/')[1]
        else:
            raise ValueError('Invalid LLN URL: ' + repr(url))
    except IndexError as e:
        #FIXME L10N
        raise ValueError('Invalid LLN URL: ' + repr(url))
    return host, rhash
Example #14
    async def lln_handle_one_link(self, source, link):
        _, fphost, _, _, _ = iri.split_uri_ref(self._front_page)
        connector = aiohttp.TCPConnector(verify_ssl=False)
        #FIXME: Switch to the line below when we can use 3.6 across the board
        uastring = random.choice(STACKED_USER_AGENTS)
        #uastring = random.choices(USER_AGENT_STRINGS, USER_AGENT_WEIGHTS)
        headers = {'User-Agent': uastring, 'Referer': source}
        if not source:
            del headers['Referer']
        body = None
        try:
            async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
                try:
                    continue_to_get = False
                    async with session.head(link) as resp:
                        #self._logger.debug('[TASK {}] Content type ({}): {}'.format(self._task_id, link, resp.headers.get('CONTENT-TYPE')))
                        #LIBRARY_LINK_HEADER tests it's an LLn page in the first place
                        if resp.status == 200 and resp.headers.get('CONTENT-TYPE') in HTML_CTYPES and LIBRARY_LINK_HEADER in resp.headers:
                            #with async_timeout.timeout(10):
                            continue_to_get = True
                    if continue_to_get:
                        async with session.get(link) as resp:
                            respurl = str(resp.url)
                            #Handle possible redirection
                            if respurl not in self._seen:
                                body = await resp.text() #.read()
                except (aiohttp.ClientOSError, aiohttp.ClientResponseError, aiohttp.client_exceptions.ServerDisconnectedError, aiohttp.client_exceptions.InvalidURL) as e:
                    self._logger.debug('Error: {} [TASK {}] -> {}'.format(link, self._task_id, repr(e)))

            if body:
                ls = await self._sink.send((body, respurl, resp.headers, source, self._task_id))
                #Trim links which have already been seen when queued (saves memory)
                if ls:
                    #XXX: Use set intersection?
                    self._linkset_q.put_nowait((respurl, liblink_set( link for link in ls if link not in self._seen )))
        except Exception as e:
            self._logger.exception(repr(e))
            self._logger.debug('Above error in context of TASK {}, LINK {}, source {}'.format(self._task_id, link, source))
Example #15
def simplify_link(url):
    '''
    Return a simplified & unique form of an LLN URL

    >>> from librarylink.util import simplify_link
    >>> simplify_link('http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/')
    'http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    >>> simplify_link('http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/')
    'http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/'
    >>> simplify_link('http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/borrow/')
    'http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/'
    >>> simplify_link('http://link.worthingtonlibraries.org/res/9bz8W30aSZY/boo/') is None
    True
    >>> simplify_link('http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/boo/')
    'http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    >>> simplify_link('/res/9bz8W30aSZY/boo/') is None
    True
    >>> simplify_link('/resource/9bz8W30aSZY/boo/')
    '/resource/9bz8W30aSZY/'
    >>> simplify_link('https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/')
    'https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    >>> simplify_link('https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/borrow/')
    'https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    '''
    scheme, auth, path, query, fragment = iri.split_uri_ref(url)
    try:
        if path.startswith('/resource/'):
            path = '/resource/' + path.partition('/resource/')[-1].split('/')[0] + '/'
            return iri.unsplit_uri_ref((scheme, auth, path, None, None))
        if '/portal/' in url:
            path = '/portal/' + '/'.join(path.partition('/portal/')[-1].split('/')[:2]) + '/'
            return iri.unsplit_uri_ref((scheme, auth, path, None, None))
        else:
            path = None
    except IndexError as e:
        #FIXME L10N
        raise ValueError('Invalid LLN URL: ' + repr(url))
    return path
Example #16
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #ntriplify_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
            
            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                #Lock the file for writing
                resstem = resid[:3]
                ntriplify_sink.locks.setdefault(resstem, Lock())
                await ntriplify_sink.locks[resstem]
                try:
                    #Note:
                    #timeit.timeit('rdfalite.toversa(open("index.html").read(), model, "http://link.delawarelibrary.org/portal/Nest-Esther-Ehrlich-overdrive-ebook/F-h_bGCl5lk/")', setup='from versa.driver import memory; from versa.reader import rdfalite; model = memory.connection()', number=10)
                    #4.412366830001702
                    #timeit.timeit('g = rdflib.Graph(); g.parse("index.html", format="html")', setup='import rdflib', number=10)
                    #[snip tons of warnings]
                    #16.82040351499745
                    #IOW Versa is 4X faster than RDFlib for this task, and more robust
                    with open(os.path.join(self.outfolder, resstem + '.nt'), 'a') as resstem_fp:
                        model = memory.connection()
                        rdfalite.toversa(body, model, respurl)
                        ntriples.write(model, out=resstem_fp, base=respurl, logger=ntriplify_sink.logger)
                finally:
                    ntriplify_sink.locks[resstem].release()

            #self.save_ntriples()
            return linkset
        return None
Example #17
    def handle_record_links(self, loop, model, params):
        '''
        Task coroutine of the main event loop for MARC conversion, called with
        details from each MARC/XML record processed. In this case, update a
        report of links encountered in the MARC/XML.

        model -- raw Versa model with converted resource information from the MARC record
        params -- parameters passed in from processing:
            params['workid']: ID of the work constructed from the MARC record
            params['instanceids']: list of IDs of instances constructed from the MARC record
        '''
        #print ('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
        items = {}
        #Get the title
        #First get the work ID
        workid = params['workid']
        #simple_lookup() is a little helper for getting a property from a resource
        title = simple_lookup(model, workid, TITLE_REL)
        #Get the ISBN, just pick the first one
        isbn = ''
        if params['instanceids']:
            inst1 = params['instanceids'][0]
            isbn = simple_lookup(model, inst1, ISBN_REL)

        envelope = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(
            workid, isbn, title)
        #iterate over all the relationship targets to see which is a link
        for stmt in model.match():
            if iri.matches_uri_syntax(stmt[TARGET]) and iri.split_uri_ref(
                    stmt[TARGET])[1] != BFHOST:
                self._links_found.add(stmt[TARGET])
                envelope += '<a href="{0}">{0}</a>\n'.format(
                    stmt[TARGET], stmt[TARGET])
        envelope += '</div>\n'
        self._outstr += envelope
        #print ('DONE BF_MARCREC_TASK', linkreport.PLUGIN_ID)
        return
Example #18
    def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if LIBRARY_LINK_HEADER not in respheaders:
            #Not even an LLN page at all
            return
        linkset = None
        if self._fphost == respurlhost:
            output_model = memory.connection()
            quickinfo_sink.logger.debug(
                '[TASK {}]: Target subpage {} -> {}'.format(
                    task_id, referrer, respurl))
            #Subpage of the target site
            rdfalite.toversa(body, output_model, respurl)
            resname = versautil.simple_lookup(output_model, respurl,
                                              SCHEMAORG + 'name')
            print(respurl, '|', resname, file=quickinfo_sink.outfp)
            #orgentity = util.simple_lookup_byvalue(model, RDFTYPE, SCHEMAORG + 'Organization')
            #name = util.simple_lookup(model, orgentity, BL + 'name')
            #name = util.simple_lookup(model, baseurl + '#_default', BL + 'name')

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
        return linkset
Example #19
 def __init__(self, frontpage, **kwargs):
     self._frontpage = frontpage
     _, self._fphost, _, _, _ = iri.split_uri_ref(self._frontpage)
     return
Example #20
 def __init__(self, frontpage, **kwargs):
     self._frontpage = frontpage
     _, self._fphost, _, _, _ = iri.split_uri_ref(self._frontpage)
     self.outfolder = kwargs['outrdfnt']
     return
Example #21
def test_normalize_case():
    for uri, expected0, expected1 in case_normalization_tests:
        testname = uri
        uri = iri.split_uri_ref(uri)
        assert expected0 == iri.unsplit_uri_ref(iri.normalize_case(uri)), testname
        assert expected1 == iri.unsplit_uri_ref(iri.normalize_case(uri, doHost=1)), testname + ' (host too)'