def handle_record_links(self, loop, model, params):
    '''
    Task coroutine of the main event loop for MARC conversion, called with
    details from each MARC/XML record processed. In this case, update a
    report of links encountered in the MARC/XML.

    model -- raw Versa model with converted resource information from the MARC record
    params -- parameters passed in from processing:
        params['workid']: ID of the work constructed from the MARC record
        params['instanceids']: list of IDs of instances constructed from the MARC record
    '''
    #print ('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
    #First get the work ID
    workid = params['workid']
    #simple_lookup() is a little helper for getting a property from a resource
    title = simple_lookup(model, workid, TITLE_REL)
    #Get the ISBN, just pick the first one
    isbn = ''
    if params['instanceids']:
        inst1 = params['instanceids'][0]
        isbn = simple_lookup(model, inst1, ISBN_REL)

    envelope = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(workid, isbn, title)
    #iterate over all the relationship targets to see which is a link
    for stmt in model.match():
        #Off-site link: well-formed URI whose host is not the BIBFRAME host
        if iri.matches_uri_syntax(stmt[TARGET]) and iri.split_uri_ref(stmt[TARGET])[1] != BFHOST:
            self._links_found.add(stmt[TARGET])
            #Only one placeholder is needed; the original passed a redundant second arg
            envelope += '<a href="{0}">{0}</a>\n'.format(stmt[TARGET])
    envelope += '</div>\n'
    self._outstr += envelope
    #print ('DONE BF_MARCREC_TASK', linkreport.PLUGIN_ID)
    return
def __init__(self, frontpage, **kwargs):
    '''
    frontpage -- URL of the site's front page; its host is used later to
        distinguish on-site from off-site pages
    kwargs['csvexport'] -- output folder for the per-resource-stem CSV files
    '''
    self._frontpage = frontpage
    _, self._fphost, _, _, _ = iri.split_uri_ref(self._frontpage)
    self.outfolder = kwargs['csvexport']
    from bibframe import BFZ, BL, BA, REL, MARC
    #Mapping of Versa relationship IRIs to CSV column names.
    #Note: the original listed (BL + 'controlCode', 'controlCode') twice,
    #which produced a duplicate CSV column; the duplicate is removed.
    self.rules = [
        (BL + 'controlCode', 'controlCode'),
        (BL + 'instantiates', 'instantiates'),
        (BL + 'link', 'link'),
        (BL + 'title', 'title'),
        (BL + 'name', 'name'),
        (BL + 'providerDate', 'providerDate'),
        (BL + 'providerPlace', 'providerPlace'),
        (BL + 'creator', 'creator'),
        (BL + 'genre', 'genre'),
        (BL + 'language', 'language'),
        (BL + 'subject', 'subject'),
        (BL + 'focus', 'focus'),
        (BL + 'date', 'date'),
        (MARC + 'isbn', 'isbn'),
        (MARC + 'lccn', 'lccn'),
        (MARC + 'titleStatement', 'titleStatement'),
        (MARC + 'lcCallNumber', 'lcCallNumber'),
        (MARC + 'lcItemNumber', 'lcItemNumber'),
        (MARC + 'literaryForm', 'literaryForm'),
        (MARC + 'seriesStatement', 'seriesStatement'),
        (MARC + 'formSubdivision', 'formSubdivision'),
    ]
def linkreport(config=None, **kwargs):
    '''
    Coroutine which accumulates an HTML report of off-site links across
    records, then writes it to config['output-file'] when the coroutine
    is closed.

    config['output-file'] -- path of the file to write the report to
    '''
    #Any configuration variables passed in
    if config is None: config = {}
    try:
        #Initialize the output
        outstr = ''
        while True:
            params = yield
            model = params['model']
            #First get the work ID
            workid = params['workid']
            #simple_lookup() is a little helper for getting a property from a resource
            title = simple_lookup(model, workid, TITLE_REL)
            #Get the ISBN, just pick the first one
            isbn = ''
            if params['instanceids']:
                inst1 = params['instanceids'][0]
                isbn = simple_lookup(model, inst1, ISBN_REL)

            #BUGFIX: the original was missing the closing quote after the id
            #attribute value, emitting malformed HTML: <div id="{0} isbn="...
            envelope = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(workid, isbn, title)
            #iterate over all the relationship targets to see which is a link
            for stmt in model.match():
                if iri.matches_uri_syntax(stmt[TARGET]) and iri.split_uri_ref(stmt[TARGET])[1] != BFHOST:
                    envelope += '<a href="{0}">{0}</a>\n'.format(stmt[TARGET])
            envelope += '</div>\n'
            outstr += envelope
    except GeneratorExit:
        #Reached when close() is called on this coroutine
        with open(config['output-file'], "w") as outf:
            outf.write(outstr)
def test_normalize_case():
    '''Verify iri.normalize_case round-trips through split/unsplit as expected.'''
    for raw_uri, want_plain, want_with_host in case_normalization_tests:
        parts = iri.split_uri_ref(raw_uri)
        got_plain = iri.unsplit_uri_ref(iri.normalize_case(parts))
        assert want_plain == got_plain, raw_uri
        got_with_host = iri.unsplit_uri_ref(iri.normalize_case(parts, doHost=1))
        assert want_with_host == got_with_host, raw_uri + ' (host too)'
async def send(self, data):
    '''
    Handle one crawled page. If it is an on-site Library.Link resource page,
    extract its RDFa into a Versa model and append one row to the CSV file
    for the resource's hash stem, writing a header row on first creation.

    data -- tuple of (body, respurl, respheaders, referrer, task_id):
        body text, response URL (e.g. after redirections), aiohttp header
        object from the response, referrer, controlling task ID

    Returns the set of links queued from the page, or None for off-site pages.
    '''
    (body, respurl, respheaders, referrer, task_id) = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if self._fphost == respurlhost:
        #csvexport_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
        root = html5.parse(body)
        linkset = self._queue_links(root, respurl)
        try:
            _, resid = llnurl_ident(respurl)
        except ValueError:
            resid = None
        if resid:
            model = memory.connection()
            rdfalite.toversa(body, model, respurl)
            #Lock the per-stem file so concurrent tasks don't interleave writes
            resstem = resid[:HASH_WIDTH]
            csvexport_sink.locks.setdefault(resstem, Lock())
            #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
            print('Awaiting lock on {}; TASK [{}].'.format(
                resstem, task_id), file=sys.stderr)
            #BUGFIX: bare `await lock` was deprecated in Python 3.8 and removed
            #in 3.9; acquire() must be awaited explicitly
            await csvexport_sink.locks[resstem].acquire()
            print('Acquired lock on {}; TASK [{}].'.format(
                resstem, task_id), file=sys.stderr)
            try:
                resstem_fpath = os.path.join(self.outfolder, resstem + '.csv')
                csvexists = os.path.exists(resstem_fpath)
                #with gzip.open(resstem_fpath, 'at', newline='') as resstem_fp:
                with open(resstem_fpath, 'at', newline='') as resstem_fp:
                    resstem_csv = csv.writer(resstem_fp, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    #Write header only when the file is newly created
                    vcsv.write(model, resstem_csv, self.rules, not csvexists, base=respurl, logger=csvexport_sink.logger)
            finally:
                csvexport_sink.locks[resstem].release()
                #csvexport_sink.logger.debug('Released lock on {}; TASK [{}].'.format(resstem, task_id))
                print('Released lock on {}; TASK [{}].'.format(
                    resstem, task_id), file=sys.stderr)
        #self.save_ntriples()
        return linkset
    return None
def __init__(self, baseurl=None):
    '''
    Populate site info (lastmod, sitemap, url, host) from a Library.Link
    site front page URL. A falsy baseurl leaves the instance unpopulated.
    Raises RuntimeError if the page doesn't parse as a Library.Link site.
    '''
    if not baseurl:
        return
    model, _ = load_rdfa_page(baseurl)
    if not model:
        raise RuntimeError(baseurl, 'doesn\'t appear to be a Library.Link site')
    #<dd property="dcterms:modified">2018-04-17T04:17:32Z</dd>
    self.lastmod = next(versautil.lookup(model, None, 'http://purl.org/dc/terms/modified'), None)
    self.sitemap = iri.absolutize('/harvest/sitemap.xml', baseurl)
    self.url = baseurl
    protocol, self.host, path, query, fragment = iri.split_uri_ref(baseurl)
async def lln_handle_one_link(self, source, link):
    '''
    Fetch one link: a HEAD request first to confirm it's a Library.Link HTML
    page, then a GET for the body. Pass the body to the sink and queue any
    newly discovered links for crawling.

    source -- referring page URL (falsy for crawl seeds; no Referer header sent)
    link -- URL to fetch
    '''
    connector = aiohttp.TCPConnector(verify_ssl=False)
    #FIXME: Switch to the line below when we can use 3.6 across the board
    uastring = random.choice(STACKED_USER_AGENTS)
    #uastring = random.choices(USER_AGENT_STRINGS, USER_AGENT_WEIGHTS)
    headers = {'User-Agent': uastring, 'Referer': source}
    if not source:
        del headers['Referer']
    body = None
    try:
        async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
            try:
                continue_to_get = False
                async with session.head(link) as resp:
                    #self._logger.debug('[TASK {}] Content type ({}): {}'.format(self._task_id, link, resp.headers.get('CONTENT-TYPE')))
                    #LIBRARY_LINK_HEADER tests it's an LLn page in the first place
                    if resp.status == 200 and resp.headers.get(
                        'CONTENT-TYPE'
                    ) in HTML_CTYPES and LIBRARY_LINK_HEADER in resp.headers:
                        #with async_timeout.timeout(10):
                        continue_to_get = True
                if continue_to_get:
                    async with session.get(link) as resp:
                        respurl = str(resp.url)  #Handle possible redirection
                        if respurl not in self._seen:
                            body = await resp.text()  #.read()
            except (aiohttp.ClientOSError, aiohttp.ClientResponseError,
                    aiohttp.client_exceptions.ServerDisconnectedError,
                    aiohttp.client_exceptions.InvalidURL) as e:
                self._logger.debug('Error: {} [TASK {}] -> {}'.format(
                    link, self._task_id, repr(e)))
        if body:
            ls = await self._sink.send(
                (body, respurl, resp.headers, source, self._task_id))
            #Trim links which have already been seen when queued (saves memory)
            if ls:
                #XXX: Use set intersection?
                self._linkset_q.put_nowait(
                    (respurl,
                     liblink_set(link for link in ls if link not in self._seen)))
    except Exception as e:
        #BUGFIX: Logger.exception() requires a message argument; the bare
        #call raised TypeError and masked the original error
        self._logger.exception('Unhandled error while handling link')
        self._logger.debug(
            'Above error in context of TASK {}, LINK {}, source {}'.format(
                self._task_id, link, source))
def sink(accumulator):
    '''
    Coroutine which receives sitemap entry elements and appends a populated
    liblink_site object to accumulator for each one received.
    '''
    while True:
        entry = yield
        loc_elem = next(select_name(entry, 'loc'))
        lastmod_elem = next(select_name(entry, 'lastmod'))
        site = liblink_site()
        site.sitemap = loc_elem.xml_value
        site.url, _, tail = site.sitemap.partition('harvest/sitemap.xml')
        site.base_url = site.url  #Legacy property name
        #Early warning for funky URLs breaking stuff downstream
        assert not tail
        protocol, site.host, path, query, fragment = iri.split_uri_ref(site.sitemap)
        site.lastmod = lastmod_elem.xml_value
        accumulator.append(site)
async def send(self, data):
    '''
    Handle one crawled page. If it is an on-site Library.Link resource page,
    extract its RDFa into a Versa model and append one row to the CSV file
    for the resource's hash stem, writing a header row on first creation.

    data -- tuple of (body, respurl, respheaders, referrer, task_id):
        body text, response URL (e.g. after redirections), aiohttp header
        object from the response, referrer, controlling task ID

    Returns the set of links queued from the page, or None for off-site pages.
    '''
    (body, respurl, respheaders, referrer, task_id) = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if self._fphost == respurlhost:
        #csvexport_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
        root = html5.parse(body)
        linkset = self._queue_links(root, respurl)
        try:
            _, resid = llnurl_ident(respurl)
        except ValueError:
            resid = None
        if resid:
            model = memory.connection()
            rdfalite.toversa(body, model, respurl)
            #Lock the per-stem file so concurrent tasks don't interleave writes
            resstem = resid[:HASH_WIDTH]
            csvexport_sink.locks.setdefault(resstem, Lock())
            #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
            print('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
            #BUGFIX: bare `await lock` was deprecated in Python 3.8 and removed
            #in 3.9; acquire() must be awaited explicitly
            await csvexport_sink.locks[resstem].acquire()
            print('Acquired lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
            try:
                resstem_fpath = os.path.join(self.outfolder, resstem + '.csv')
                csvexists = os.path.exists(resstem_fpath)
                #with gzip.open(resstem_fpath, 'at', newline='') as resstem_fp:
                with open(resstem_fpath, 'at', newline='') as resstem_fp:
                    resstem_csv = csv.writer(resstem_fp, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    #Write header only when the file is newly created
                    vcsv.write(model, resstem_csv, self.rules, not csvexists, base=respurl, logger=csvexport_sink.logger)
            finally:
                csvexport_sink.locks[resstem].release()
                #csvexport_sink.logger.debug('Released lock on {}; TASK [{}].'.format(resstem, task_id))
                print('Released lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
        #self.save_ntriples()
        return linkset
    return None
async def send(self, data):
    '''
    Handle one crawled page. If it is an on-site Library.Link resource page,
    extract its RDFa into a Versa model and append N-Triples to the .nt file
    for the resource's hash stem.

    data -- tuple of (body, respurl, respheaders, referrer, task_id):
        body text, response URL (e.g. after redirections), aiohttp header
        object from the response, referrer, controlling task ID

    Returns the set of links queued from the page, or None for off-site pages.
    '''
    (body, respurl, respheaders, referrer, task_id) = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if self._fphost == respurlhost:
        #ntriplify_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
        root = html5.parse(body)
        linkset = self._queue_links(root, respurl)
        try:
            _, resid = llnurl_ident(respurl)
        except ValueError:
            resid = None
        if resid:
            #Lock the per-stem file so concurrent tasks don't interleave writes
            resstem = resid[:3]
            ntriplify_sink.locks.setdefault(resstem, Lock())
            #BUGFIX: bare `await lock` was deprecated in Python 3.8 and removed
            #in 3.9; acquire() must be awaited explicitly
            await ntriplify_sink.locks[resstem].acquire()
            try:
                #Note:
                #timeit.timeit('rdfalite.toversa(open("index.html").read(), model, "http://link.delawarelibrary.org/portal/Nest-Esther-Ehrlich-overdrive-ebook/F-h_bGCl5lk/")', setup='from versa.driver import memory; from versa.reader import rdfalite; model = memory.connection()', number=10)
                #4.412366830001702
                #timeit.timeit('g = rdflib.Graph(); g.parse("index.html", format="html")', setup='import rdflib', number=10)
                #[snip tons of warnings]
                #16.82040351499745
                #IOW Versa is 4X faster than RDFlib for this task, and more robust
                with open(os.path.join(self.outfolder, resstem + '.nt'), 'a') as resstem_fp:
                    model = memory.connection()
                    rdfalite.toversa(body, model, respurl)
                    ntriples.write(model, out=resstem_fp, base=respurl, logger=ntriplify_sink.logger)
            finally:
                ntriplify_sink.locks[resstem].release()
        #self.save_ntriples()
        return linkset
    return None
def send(self, data):
    '''
    Report brief info (URL and schema.org name) for each on-site
    Library.Link page, printed to quickinfo_sink.outfp.

    data -- tuple of (body, respurl, respheaders, referrer, task_id):
        body text, response URL (e.g. after redirections), aiohttp header
        object from the response, referrer, controlling task ID

    Returns the set of links queued from the page, or None otherwise.
    '''
    body, respurl, respheaders, referrer, task_id = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if LIBRARY_LINK_HEADER not in respheaders:
        #Not even an LLN page at all
        return
    if self._fphost != respurlhost:
        return
    #Subpage of the target site
    output_model = memory.connection()
    quickinfo_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
    rdfalite.toversa(body, output_model, respurl)
    resname = versautil.simple_lookup(output_model, respurl, SCHEMAORG + 'name')
    print(respurl, '|', resname, file=quickinfo_sink.outfp)
    #orgentity = util.simple_lookup_byvalue(model, RDFTYPE, SCHEMAORG + 'Organization')
    #name = util.simple_lookup(model, orgentity, BL + 'name')
    #name = util.simple_lookup(model, baseurl + '#_default', BL + 'name')
    parsed = html5.parse(body)
    return self._queue_links(parsed, respurl)
def llnurl_ident(url):
    '''
    Return the identifying pair of (site, hash) from an LLN URL

    >>> from librarylink.util import llnurl_ident
    >>> llnurl_ident('http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/')
    ('link.worthingtonlibraries.org', '9bz8W30aSZY')
    >>> llnurl_ident('http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/')
    ('link.worthingtonlibraries.org', 'cZlfLtSpcng')
    '''
    scheme, host, path, query, fragment = iri.split_uri_ref(url)
    try:
        if path.startswith('/resource/'):
            rhash = path.partition('/resource/')[-1].split('/')[0]
        elif '/portal/' in url:
            #The hash is the second path segment after /portal/; a missing
            #segment raises IndexError, translated to ValueError below
            rhash = path.partition('/portal/')[-1].split('/')[1]
        else:
            raise ValueError('Invalid LLN URL: ' + repr(url))
    except IndexError as e:
        #FIXME L10N
        #Chain the original IndexError for easier debugging
        raise ValueError('Invalid LLN URL: ' + repr(url)) from e
    return host, rhash
async def lln_handle_one_link(self, source, link):
    '''
    Fetch one link: a HEAD request first to confirm it's a Library.Link HTML
    page, then a GET for the body. Pass the body to the sink and queue any
    newly discovered links for crawling.

    source -- referring page URL (falsy for crawl seeds; no Referer header sent)
    link -- URL to fetch
    '''
    connector = aiohttp.TCPConnector(verify_ssl=False)
    #FIXME: Switch to the line below when we can use 3.6 across the board
    uastring = random.choice(STACKED_USER_AGENTS)
    #uastring = random.choices(USER_AGENT_STRINGS, USER_AGENT_WEIGHTS)
    headers = {'User-Agent': uastring, 'Referer': source}
    if not source:
        del headers['Referer']
    body = None
    try:
        async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
            try:
                continue_to_get = False
                async with session.head(link) as resp:
                    #self._logger.debug('[TASK {}] Content type ({}): {}'.format(self._task_id, link, resp.headers.get('CONTENT-TYPE')))
                    #LIBRARY_LINK_HEADER tests it's an LLn page in the first place
                    if resp.status == 200 and resp.headers.get('CONTENT-TYPE') in HTML_CTYPES and LIBRARY_LINK_HEADER in resp.headers:
                        #with async_timeout.timeout(10):
                        continue_to_get = True
                if continue_to_get:
                    async with session.get(link) as resp:
                        respurl = str(resp.url)  #Handle possible redirection
                        if respurl not in self._seen:
                            body = await resp.text()  #.read()
            except (aiohttp.ClientOSError, aiohttp.ClientResponseError,
                    aiohttp.client_exceptions.ServerDisconnectedError,
                    aiohttp.client_exceptions.InvalidURL) as e:
                self._logger.debug('Error: {} [TASK {}] -> {}'.format(link, self._task_id, repr(e)))
        if body:
            ls = await self._sink.send((body, respurl, resp.headers, source, self._task_id))
            #Trim links which have already been seen when queued (saves memory)
            if ls:
                #XXX: Use set intersection?
                self._linkset_q.put_nowait((respurl, liblink_set(
                    link for link in ls if link not in self._seen
                )))
    except Exception as e:
        #BUGFIX: Logger.exception() requires a message argument; the bare
        #call raised TypeError and masked the original error
        self._logger.exception('Unhandled error while handling link')
        self._logger.debug('Above error in context of TASK {}, LINK {}, source {}'.format(self._task_id, link, source))
def simplify_link(url):
    '''
    Return a simplified & unique form of an LLN URL

    >>> from librarylink.util import simplify_link
    >>> simplify_link('http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/')
    'http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    >>> simplify_link('http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/')
    'http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/'
    >>> simplify_link('http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/borrow/')
    'http://link.worthingtonlibraries.org/portal/Unshakeable--your-financial-freedom-playbook/cZlfLtSpcng/'
    >>> simplify_link('http://link.worthingtonlibraries.org/res/9bz8W30aSZY/boo/') is None
    True
    >>> simplify_link('http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/boo/')
    'http://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    >>> simplify_link('/res/9bz8W30aSZY/boo/') is None
    True
    >>> simplify_link('/resource/9bz8W30aSZY/boo/')
    '/resource/9bz8W30aSZY/'
    >>> simplify_link('https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/')
    'https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    >>> simplify_link('https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/borrow/')
    'https://link.worthingtonlibraries.org/resource/9bz8W30aSZY/'
    '''
    scheme, auth, path, query, fragment = iri.split_uri_ref(url)
    try:
        if path.startswith('/resource/'):
            #Keep only the hash segment immediately after /resource/
            rhash = path.partition('/resource/')[-1].split('/')[0]
            return iri.unsplit_uri_ref((scheme, auth, '/resource/' + rhash + '/', None, None))
        if '/portal/' in url:
            #Keep the title and hash segments after /portal/
            segments = path.partition('/portal/')[-1].split('/')[:2]
            return iri.unsplit_uri_ref((scheme, auth, '/portal/' + '/'.join(segments) + '/', None, None))
        return None
    except IndexError as e:
        #FIXME L10N
        raise ValueError('Invalid LLN URL: ' + repr(url))
def handle_record_links(self, loop, model, params):
    '''
    Task coroutine of the main event loop for MARC conversion, called with
    details from each MARC/XML record processed. In this case, update a
    report of links encountered in the MARC/XML.

    model -- raw Versa model with converted resource information from the MARC record
    params -- parameters passed in from processing:
        params['workid']: ID of the work constructed from the MARC record
        params['instanceids']: list of IDs of instances constructed from the MARC record
    '''
    #print ('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
    #First get the work ID
    workid = params['workid']
    #simple_lookup() is a little helper for getting a property from a resource
    title = simple_lookup(model, workid, TITLE_REL)
    #Get the ISBN, just pick the first one
    isbn = ''
    if params['instanceids']:
        inst1 = params['instanceids'][0]
        isbn = simple_lookup(model, inst1, ISBN_REL)

    envelope = '<div id="{0}" isbn="{1}"><title>{2}</title>\n'.format(
        workid, isbn, title)
    #iterate over all the relationship targets to see which is a link
    for stmt in model.match():
        #Off-site link: well-formed URI whose host is not the BIBFRAME host
        if iri.matches_uri_syntax(stmt[TARGET]) and iri.split_uri_ref(
                stmt[TARGET])[1] != BFHOST:
            self._links_found.add(stmt[TARGET])
            #Only one placeholder is needed; the original passed a redundant second arg
            envelope += '<a href="{0}">{0}</a>\n'.format(stmt[TARGET])
    envelope += '</div>\n'
    self._outstr += envelope
    #print ('DONE BF_MARCREC_TASK', linkreport.PLUGIN_ID)
    return
def send(self, data):
    '''
    Report brief info (URL and schema.org name) for each on-site
    Library.Link page, printed to quickinfo_sink.outfp.

    data -- tuple of (body, respurl, respheaders, referrer, task_id):
        body text, response URL (e.g. after redirections), aiohttp header
        object from the response, referrer, controlling task ID

    Returns the set of links queued from the page, or None otherwise.
    '''
    body, respurl, respheaders, referrer, task_id = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if LIBRARY_LINK_HEADER not in respheaders:
        #Not even an LLN page at all
        return
    if self._fphost != respurlhost:
        return
    #Subpage of the target site
    output_model = memory.connection()
    quickinfo_sink.logger.debug(
        '[TASK {}]: Target subpage {} -> {}'.format(
            task_id, referrer, respurl))
    rdfalite.toversa(body, output_model, respurl)
    resname = versautil.simple_lookup(output_model, respurl, SCHEMAORG + 'name')
    print(respurl, '|', resname, file=quickinfo_sink.outfp)
    #orgentity = util.simple_lookup_byvalue(model, RDFTYPE, SCHEMAORG + 'Organization')
    #name = util.simple_lookup(model, orgentity, BL + 'name')
    #name = util.simple_lookup(model, baseurl + '#_default', BL + 'name')
    parsed = html5.parse(body)
    return self._queue_links(parsed, respurl)
def __init__(self, frontpage, **kwargs):
    '''
    frontpage -- URL of the site's front page; its host is recorded for
        later on-site/off-site link filtering
    '''
    self._frontpage = frontpage
    _, self._fphost, _, _, _ = iri.split_uri_ref(frontpage)
def __init__(self, frontpage, **kwargs):
    '''
    frontpage -- URL of the site's front page; its host is recorded for
        later on-site/off-site link filtering
    kwargs['outrdfnt'] -- output folder for the N-Triples files
    '''
    self._frontpage = frontpage
    _, self._fphost, _, _, _ = iri.split_uri_ref(frontpage)
    self.outfolder = kwargs['outrdfnt']
def test_normalize_case():
    '''Verify iri.normalize_case round-trips through split/unsplit as expected.'''
    for raw_uri, expect_default, expect_host in case_normalization_tests:
        split = iri.split_uri_ref(raw_uri)
        result_default = iri.unsplit_uri_ref(iri.normalize_case(split))
        assert expect_default == result_default, raw_uri
        result_host = iri.unsplit_uri_ref(iri.normalize_case(split, doHost=1))
        assert expect_host == result_host, raw_uri + ' (host too)'