def test_pipeline():
    idg = idgen(EXAMPLE_ORG)
    existing_ids = []
    mat = functools.partial(materialize, hashidgen=idg, existing_ids=existing_ids)
    TRANSFORMS = {
        'id': functools.partial(discard),
        'title': functools.partial(relabel, new_rel='name'),
        'author': functools.partial(mat, new_rel='author', unique=run('target'), typ='Person', properties={'name': run('target')}),
        'link': functools.partial(relabel, new_rel='link'),
        'cover': functools.partial(relabel, new_rel='cover'),
    }
    #'type': functools.partial(relabel, new_rel=VTYPE_REL),

    out_m = memory.connection(baseiri='http://example.org/')
    rid = SIMPLE_BOOK['id']
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)
    for k, v in SIMPLE_BOOK.items():
        link = (rid, k, v)
        func = TRANSFORMS.get(k)
        if func:
            in_m = memory.connection(baseiri='http://example.org/')
            ctx = context(link, in_m, out_m, base=SCHEMA_ORG)
            func(ctx)

    assert out_m.size() == 7
    assert next(out_m.match('http://example.org/book/catcher-in-the-rye', VTYPE_REL))[TARGET] == BOOK_TYPE
    assert next(out_m.match('http://example.org/book/catcher-in-the-rye', I(iri.absolutize('name', SCHEMA_ORG))))[TARGET] == 'The Catcher in the Rye'
def run_one(snippet, expected, desc, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    infile = tempfile.NamedTemporaryFile()
    infile.write(snippet.encode('utf-8'))
    infile.seek(0)
    outstream = StringIO()
    bfconvert([infile], model=m, out=outstream, config=config, canonical=canonical, loop=loop)
    #bfconvert(factory(infile), model=m, out=outstream, config=config, canonical=canonical, loop=loop)
    infile.close()
    outstream.seek(0)
    hashmap, m = hash_neutral_model(outstream)
    hashmap = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap.items() ]))

    expected_stream = StringIO(expected)
    hashmap_expected, m_expected = hash_neutral_model(expected_stream)
    hashmap_expected = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap_expected.items() ]))

    assert hashmap == hashmap_expected, "Changes to hashes found ({0}):\n{1}\n\nActual model structure diff:\n{2}".format(desc, file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found ({0}):\n{1}".format(desc, file_diff(repr(m_expected), repr(m)))
def test_pipeline1():
    idg = idgen(EXAMPLE_ORG)
    existing_ids = set()
    TRANSFORMS = {
        "id": discard(),
        "title": rename(rel="name"),
        "author": materialize("Person", rel="author", unique=run("target"), links={"name": run("target")}),
        "link": rename(rel="link"),
        "cover": rename(rel="cover"),
    }
    #'type': functools.partial(relabel, rel=VTYPE_REL),

    out_m = memory.connection(baseiri="http://example.org/")
    rid = SIMPLE_BOOK["id"]
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)
    for k, v in SIMPLE_BOOK.items():
        link = (rid, k, v, {})
        func = TRANSFORMS.get(k)
        if func:
            in_m = memory.connection(baseiri="http://example.org/")
            ctx = context(link, in_m, out_m, base=SCHEMA_ORG, idgen=idg)
            func(ctx)

    assert out_m.size() == 7, repr(out_m)
    assert next(out_m.match("http://example.org/book/catcher-in-the-rye", VTYPE_REL))[TARGET] == BOOK_TYPE
    assert (
        next(out_m.match("http://example.org/book/catcher-in-the-rye", I(iri.absolutize("name", SCHEMA_ORG))))[TARGET]
        == "The Catcher in the Rye"
    )
def test_work_fallback_author_in_marc_with_plusbib():
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()
    bfconvert([BytesIO(REGULAR_MARC_EXAMPLE)], model=m, out=s, config=WORK_FALLBACK_AUTHOR_IN_MARC_CONFIG_PLUS_BIB, canonical=True)
    s.seek(0)
    #with open('/tmp/foo.versa.json', 'w') as f:
    #    f.write(s.read())
    #s.seek(0)
    hashmap, m = hash_neutral_model(s)
    hashmap = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap.items() ]))

    removals = []
    #Strip out tag-XXX relationships
    for ix, (o, r, t, a) in m:
        #logging.debug(r)
        if r.startswith('http://bibfra.me/vocab/marcext/tag-') or r.startswith('http://bibfra.me/vocab/marcext/sf-'):
            removals.append(ix)
    m.remove(removals)

    hashmap_expected, m_expected = hash_neutral_model(StringIO(WORK_FALLBACK_AUTHOR_IN_MARC_EXPECTED_PLUS_BIB))
    hashmap_expected = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap_expected.items() ]))

    assert hashmap == hashmap_expected, "Changes to hashes found:\n{0}\n\nActual model structure diff:\n{1}".format(file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found:\n{0}".format(file_diff(repr(m_expected), repr(m)))
def run_one(snippet, expected, desc, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    instream = BytesIO(snippet.encode('utf-8'))
    outstream = StringIO()
    bfconvert(instream, model=m, out=outstream, config=config, canonical=canonical, loop=loop)
    outstream.seek(0)
    jsonload(m, outstream)

    expected_stream = StringIO(expected)
    jsonload(m_expected, expected_stream)

    assert m == m_expected, "Discrepancies found ({0}):\n{1}".format(desc, file_diff(repr(m_expected), repr(m)))
def run_one(name, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()
    with open(os.path.join(RESOURCEPATH, name+'.mrx'), 'rb') as indoc:
        bfconvert(indoc, model=m, out=s, config=config, canonical=canonical, loop=loop)
    s.seek(0)
    jsonload(m, s)

    with open(os.path.join(RESOURCEPATH, name+'.versa')) as indoc:
        jsonload(m_expected, indoc)

    assert m == m_expected, "Discrepancies found for {0}:\n{1}".format(name, file_diff(repr(m_expected), repr(m)))
async def rdfa_from_page(url, session=None, max_retries=1):
    '''
    Async helper to load RDFa page as text, plus load a Versa model with the metadata

    Returns a versa memory model, the raw site text and HTTP response info, except in error case where it returns None and the exception

    >>> from amara3.asynctools import go_async
    >>> from librarylink.util import rdfa_from_page
    >>> from versa import util as versautil
    >>> url = "http://link.crlibrary.org/portal/Estamos-en-un-libro-por-Mo-Willems--traducido/ZAxkTVTDCxE/"
    >>> model, sitetext, response = go_async(rdfa_from_page(url))
    >>> next(versautil.lookup(model, 'http://link.crlibrary.org/resource/zXft1yv0T9k/', 'http://schema.org/name'))
    'Libros y lectura -- Novela juvenil'
    '''
    retry_count = 0
    while True:
        model = memory.connection()
        try:
            if session is None:
                import aiohttp
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        body = await response.read()
                        rdfalite.toversa(body, model, url)
                        return model, body, response
            else:
                async with session.get(url) as response:
                    body = await response.read()
                    rdfalite.toversa(body, model, url)
                    return model, body, response
        except Exception as e:
            #print(url, f'[EXCEPTION {e}], context: {context}')
            retry_count += 1
            if retry_count >= max_retries:
                return None, e, None
async def network_isbn_info(isbn, session=None, max_retries=1):
    '''
    Async helper to get JSON content from network resource page

    Returns a JSON object

    >>> from amara3.asynctools import go_async
    >>> from librarylink.resource import network_isbn_info
    >>> obj = go_async(network_isbn_info(9780871290861))
    >>> obj['workExample'][0].get('holdings_count')
    19
    '''
    retry_count = 0
    url = LL_ISBN_STEMPLATE.format(**{'isbn': isbn})
    #print('processing', url, file=sys.stderr)
    while True:
        await asyncio.sleep(0.2)
        model = memory.connection()
        try:
            if session is None:
                import aiohttp
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        obj = await response.json()
                        return obj
            else:
                async with session.get(url) as response:
                    obj = await response.json()
                    return obj
        except Exception as e:
            #print(url, f'[EXCEPTION {e}], context: {context}', file=sys.stderr)
            retry_count += 1
            if retry_count >= max_retries:
                return None
def test_basics():
    "Basic query test"
    m = memory.connection()
    [ m.add(*l) for l in RELS_1 ]
    variables = {'DC': DC, 'H5': H5, 'H5L': H5L}
    ctx = context(tuple(RELS_1[0]), m, U + 'uo', base=None, extras=None, variables=variables)

    parsed = miniparse("?($a, H5 'title', *) and ?($b, H5L 'see-also', $a)")
    result = parsed.evaluate(ctx)
    assert result == {
        'a': set(['http://uche.ogbuji.net/ndewo/']),
        'b': set(['http://uche.ogbuji.net/'])
    }

    parsed = miniparse("?($a, H5L 'see-also', *)")
    result = parsed.evaluate(ctx)
    assert result == {
        'a': set(['http://uche.ogbuji.net/', 'http://uche.ogbuji.net/ndewo/'])
    }

    parsed = miniparse("?($a, H5 'title', *)")
    result = parsed.evaluate(ctx)
    assert result == {'a': set(['http://uche.ogbuji.net/ndewo/'])}
    return
def test_basics():
    "test ..."
    model = memory.connection()
    for (subj, pred, obj, attrs) in RELS_1:
        model.add(subj, pred, obj, attrs)

    results = model.match(origin='http://copia.ogbuji.net')
    logging.debug('BASICS PART 1')
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    #assert results == None, "Boo! "

    results = model.match(origin='http://uche.ogbuji.net', attrs={u'@lang': u'ig'})
    logging.debug('BASICS PART 2')
    results = tuple(list(results))
    #import pprint; pprint.pprint(results)
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    expected = (('http://uche.ogbuji.net', 'http://purl.org/dc/elements/1.1/title', 'Ulo Uche', {
        '@context': 'http://uche.ogbuji.net#_metadata',
        '@lang': 'ig'
    }),)
    assert results == expected, (results, expected)
def test_author_in_marc():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(None)

    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    bfconvert([BytesIO(AUTHOR_IN_MARC)], model=m, out=s, config=AUTHOR_IN_MARC_CONFIG, canonical=True, loop=loop)
    s.seek(0)
    #with open('/tmp/foo.versa.json', 'w') as f:
    #    f.write(s.read())
    #s.seek(0)
    #sys.exit(-1)
    hashmap, m = hash_neutral_model(s)
    hashmap = '\n'.join(sorted([repr((i[1], i[0])) for i in hashmap.items()]))

    removals = []
    #Strip out tag-XXX relationships
    for ix, (o, r, t, a) in m:
        #logging.debug(r)
        if r.startswith('http://bibfra.me/vocab/marcext/tag-') or r.startswith('http://bibfra.me/vocab/marcext/sf-'):
            removals.append(ix)
    m.remove(removals)

    #with open('/tmp/foo.versa.json', 'w') as f:
    #    f.write(repr(m))

    hashmap_expected, m_expected = hash_neutral_model(StringIO(AUTHOR_IN_MARC_EXPECTED))
    hashmap_expected = '\n'.join(sorted([repr((i[1], i[0])) for i in hashmap_expected.items()]))

    assert hashmap == hashmap_expected, "Changes to hashes found:\n{0}\n\nActual model structure diff:\n{1}".format(file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found:\n{0}".format(file_diff(repr(m_expected), repr(m)))
def test_model_consumed():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(None)

    m = memory.connection()
    with open(os.path.join(RESOURCEPATH, 'multiple-authlinks.xml'), 'rb') as indoc:
        bfconvert([indoc], entbase='http://example.org/', model=m, config=None, verbose=False, loop=loop)

    assert m.size() == 0, 'Model not consumed:\n'+repr(m)
def hash_neutral_model(stream):
    '''
    >>> VJSON = """[
    ["DoVM1hvc","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/lite/Person",{"@target-type": "@iri-ref"}],
    ["DoVM1hvc","http://bibfra.me/vocab/lite/date","1878-1967.",{}],
    ["DoVM1hvc","http://bibfra.me/vocab/lite/name","Sandburg, Carl,",{}],
    ["DoVM1hvc","http://bibfra.me/vocab/marcext/sf-a","Sandburg, Carl,",{}],
    ["DoVM1hvc","http://bibfra.me/vocab/marcext/sf-d","1878-1967.",{}],
    ["Ht2FQsIY","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/lite/Instance",{"@target-type": "@iri-ref"}],
    ["Ht2FQsIY","http://bibfra.me/vocab/lite/instantiates","XsrrgYIS",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/lite/Work",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/marc/Books",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/marc/LanguageMaterial",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/vocab/lite/creator","DoVM1hvc",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/vocab/marc/natureOfContents","encyclopedias",{}],
    ["XsrrgYIS","http://bibfra.me/vocab/marc/natureOfContents","legal articles",{}],
    ["XsrrgYIS","http://bibfra.me/vocab/marc/natureOfContents","surveys of literature",{}],
    ["XsrrgYIS","http://bibfra.me/vocab/marcext/tag-008","920219s1993 caua j 000 0 eng",{}]
    ]"""
    >>> from io import StringIO, BytesIO
    >>> s = StringIO(VJSON)
    >>> from bibframe.util import hash_neutral_model
    >>> hashmap, model = hash_neutral_model(s)
    >>> hashmap
    {'XsrrgYIS': '@R0', 'DoVM1hvc': '@R1', 'Ht2FQsIY': '@R2'}
    >>> [ (o, r, t, a) for (rid, (o, r, t, a)) in model ][0] #Safe ordering for memory model only, mind you
    ('@R1', 'http://bibfra.me/vocab/lite/name', 'Sandburg, Carl,', OrderedDict())
    '''
    stage1 = memory.connection()
    stage2 = memory.connection()
    stage3 = memory.connection()
    jsonload(stage1, stream)
    hashmap = {}

    #One pass for origins
    dummy = repr(stage1) #Mysterious bug (presumably in jsonload): attributes lose all their contents without this line
    for (rid, (o, r, t, a)) in sorted(stage1, key=lambda x: x[1][0]): # sort by resource id
        hash_neutral_origin = hashmap.setdefault(o, '@R{}'.format(len(hashmap)))
        stage2.add(hash_neutral_origin, r, t, a)
    del stage1 #clean up

    #Another pass for targets
    for (rid, (o, r, t, a)) in sorted(stage2):
        hash_neutral_target = t
        if a.get("@target-type") == "@iri-ref":
            hash_neutral_target = hashmap.get(t, t)
        stage3.add(o, r, hash_neutral_target, a)
    return hashmap, stage3
def test_model_consumed():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(None)

    m = memory.connection()
    fname = os.path.join(RESOURCEPATH, 'multiple-authlinks.mrx')
    #bfconvert([inputsource(open(fname, 'rb'))], entbase='http://example.org/', model=m, config=None, verbose=False, loop=loop)
    bfconvert([open(fname, 'rb')], entbase='http://example.org/', model=m, config=None, verbose=False, loop=loop)

    assert m.size() == 0, 'Model not consumed:\n'+repr(m)
def test_index():
    model = memory.connection()
    r1 = model.add('s1', 'p0', 'lit0', {})
    r2 = model.add('s1', 'p1', 'lit1', {})
    r3 = model.add('s1', 'p2', 'lit2', {})

    assert model[r1][0] == 's1'
    assert model[r2][1] == 'p1'
    assert model[r3][2] == 'lit2'
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, logger=logging):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    config - configuration information
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None:
            rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            ))
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None:
        rdfttl.write(g.serialize(format="turtle"))

    for plugin in plugins:
        plugin.close()
    return
async def send(self, data):
    #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
    (body, respurl, respheaders, referrer, task_id) = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if self._fphost == respurlhost:
        #csvexport_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
        root = html5.parse(body)
        linkset = self._queue_links(root, respurl)
        try:
            _, resid = llnurl_ident(respurl)
        except ValueError:
            resid = None
        if resid:
            model = memory.connection()
            rdfalite.toversa(body, model, respurl)
            #Lock the file for
            resstem = resid[:HASH_WIDTH]
            csvexport_sink.locks.setdefault(resstem, Lock())
            #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
            print('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
            await csvexport_sink.locks[resstem]
            print('Acquired lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
            try:
                resstem_fpath = os.path.join(self.outfolder, resstem + '.csv')
                csvexists = os.path.exists(resstem_fpath)
                #with gzip.open(resstem_fpath, 'at', newline='') as resstem_fp:
                with open(resstem_fpath, 'at', newline='') as resstem_fp:
                    resstem_csv = csv.writer(resstem_fp, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    vcsv.write(model, resstem_csv, self.rules, not csvexists, base=respurl, logger=csvexport_sink.logger)
            finally:
                csvexport_sink.locks[resstem].release()
                #csvexport_sink.logger.debug('Released lock on {}; TASK [{}].'.format(resstem, task_id))
                print('Released lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
        #self.save_ntriples()
        return linkset
    return None
def run_one(name, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    fname = os.path.join(RESOURCEPATH, name+'.mrx')
    #bfconvert(factory(open(fname, 'rb')), model=m, out=s, config=config, canonical=canonical, loop=loop)
    #raise(Exception(repr(inputsource(open(fname, 'rb')))))
    bfconvert([inputsource(open(fname, 'rb'))], model=m, out=s, config=config, canonical=canonical, loop=loop)
    s.seek(0)
    hashmap, m = hash_neutral_model(s)
    hashmap = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap.items() ]))

    with open(os.path.join(RESOURCEPATH, name+'.versa')) as indoc:
        hashmap_expected, m_expected = hash_neutral_model(indoc)
        hashmap_expected = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap_expected.items() ]))

    assert hashmap == hashmap_expected, "Changes to hashes found for {0}:\n{1}\n\nActual model structure diff:\n{2}".format(name, file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found for {0}:\n{1}".format(name, file_diff(repr(m_expected), repr(m)))
def run_one(name, entbase=None, config=None, variation=''):
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    fpath = os.path.join(RESOURCEPATH, name+'.mrx')
    instream = BytesIO(open(fpath, 'rb').read())
    print('Running {} ...'.format('('+variation+')' if variation else ''), fpath)

    def main():
        #Need a new event loop per timeit iteration
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(None)
        instream.seek(io.SEEK_SET)
        bfconvert(instream, model=m, out=s, config=config, loop=loop)

    global_space = globals()
    global_space.update(locals())
    timing = timeit.timeit('main()', setup='', number=NLOOPS, globals=global_space)
    print('{} loops, best of 3: {:.2f} sec per loop.'.format(NLOOPS, timing))
def test_ordering_insertion():
    model = memory.connection()
    model.add('s1', 'p1', 'lit1', {})
    model.add('s1', 'p2', 'lit2', {})
    model.add('s1', 'p0', 'lit0', {}, index=1)
    model.add('s2', 'p3', 'lit3', {})

    assert list(model)[0][1][1] == 'p1'
    assert list(model)[1][1][1] == 'p0'
    assert list(model)[2][1][1] == 'p2'
    assert list(model)[3][1][1] == 'p3'
def test_copy():
    model = memory.connection()
    r1 = model.add('s1', 'p0', 'lit0', {})
    r2 = model.add('s1', 'p1', 'lit1', {})
    r3 = model.add('s1', 'p2', 'lit2', {})

    model2 = model.copy()
    assert model == model2

    model3 = model.copy(contents=False)
    assert model3.size() == 0
def test_basic_marc1():
    import os
    import amara
    from bibframe.reader.marc import process
    indoc = amara.parse(os.path.join(RESOURCEPATH, 'kford-holdings1.xml'))
    #Top level ma:collection is optional, so can't just assume /ma:collection/ma:record XPath
    recs = indoc.xml_select(u'//ma:record', prefixes=PREFIXES)
    #logging.debug(recs)
    m = memory.connection()
    m.create_space()
    process(recs, m, idbase='http://example.org/')
    logging.debug('MARC BASICS PART 1')
def test_pipeline2():
    idg = idgen(EXAMPLE_ORG)
    existing_ids = set()
    TRANSFORMS = [
        ("id", discard()),
        ("title", rename(rel="name")),
        # For testing; doesn't make much sense, really, otherwise
        ("author", materialize("Person", rel="author", unique=run("target"), links={"name": run("target")}, inverse=True)),
        ("link", rename(rel="link")),
        ("cover", rename(rel="cover")),
    ]

    out_m = memory.connection(baseiri="http://example.org/")
    rid = SIMPLE_BOOK["id"]
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)
    for k, v in SIMPLE_BOOK.items():
        link = (rid, k, v, {})
        for rel, func in TRANSFORMS:
            if k == rel:
                in_m = memory.connection(baseiri="http://example.org/")
                ctx = context(link, in_m, out_m, base=SCHEMA_ORG, idgen=idg)
                func(ctx)

    assert out_m.size() == 7, repr(out_m)
    assert next(out_m.match("http://example.org/book/catcher-in-the-rye", VTYPE_REL))[TARGET] == BOOK_TYPE
    assert (
        next(out_m.match("http://example.org/book/catcher-in-the-rye", I(iri.absolutize("name", SCHEMA_ORG))))[TARGET]
        == "The Catcher in the Rye"
    )

    author = next(
        out_m.match(None, I(iri.absolutize("author", SCHEMA_ORG))),
        "http://example.org/book/catcher-in-the-rye"
    )[ORIGIN]
    assert next(out_m.match(author, I(iri.absolutize("name", SCHEMA_ORG))), None)[TARGET] == "J.D. Salinger"
def run(infile):
    m = memory.connection()
    from_markdown(infile.read(), m)
    #from versa.util import jsondump
    #jsondump(m, open('/tmp/foo.json', 'w'))
    for poem in resources_by_type(m, 'http://uche.ogbuji.net/poems/poem'):
        choice = '@choice' in list(map(operator.itemgetter(TARGET), m.match(poem, 'http://www.w3.org/2005/Atom/category')))
        if not choice:
            continue
        d = parse_date(simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/updated'))
        source = next(m.match(poem, 'http://www.w3.org/2005/Atom/source'))
        source = source[ATTRIBUTES]['title']
        title = simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/title')
        print('\t'.join(("'" + title + "'", 'Poem', d.strftime('%B, %Y'), source)))
    print()
def test_removal():
    model = memory.connection()
    model.add('s1', 'p0', 'lit0', {})
    model.add('s1', 'p1', 'lit1', {})
    model.add('s1', 'p2', 'lit2', {})
    model.add('s2', 'p3', 'lit3', {})

    model.remove([3, 0])
    assert list(model)[0][1][2] == 'lit1'
    assert list(model)[1][1][2] == 'lit2'
    assert model.size() == 2

    model.remove(0)
    assert list(model)[0][1][2] == 'lit2'
    assert model.size() == 1
def test_book_cases(label, transforms, asserter):
    idg = idgen(EXAMPLE_ORG)
    existing_ids = set()

    out_m = memory.connection(baseiri='http://example.org/')
    rid = SIMPLE_BOOK['id']
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)
    for k, v in SIMPLE_BOOK.items():
        ctxlink = (rid, k, v, {})
        func = transforms.get(k)
        if func:
            ctx = context(ctxlink, IN_M, out_m, base=SCHEMA_ORG, idgen=idg)
            func(ctx)

    asserter(out_m)
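# Illustrative sketch only (an assumption, not one of the real parametrize cases for
# test_book_cases): a (label, transforms, asserter) triple of the shape the test
# expects, built from the same discard/rename helpers and SIMPLE_BOOK fixture used in
# test_pipeline1 above.
def _example_title_asserter(out_m):
    # The renamed 'title' link should appear as schema.org name on the book resource
    assert next(out_m.match('http://example.org/book/catcher-in-the-rye',
                            I(iri.absolutize('name', SCHEMA_ORG))))[TARGET] == 'The Catcher in the Rye'

EXAMPLE_BOOK_CASE = (
    'title-only',
    {'id': discard(), 'title': rename(rel='name')},
    _example_title_asserter,
)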
def load_rdfa_page(site, max_retries=1):
    '''
    Helper to load RDFa page as text, plus load a Versa model with the metadata

    Returns a versa memory model and the raw site text, except in error case where it returns None and the error
    '''
    retry_count = 0
    while True:
        model = memory.connection()
        try:
            with urllib.request.urlopen(site) as resourcefp:
                sitetext = resourcefp.read()
                rdfalite.toversa(sitetext, model, site)
            break #Success, so break out of retry loop
        except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected) as e:
            retry_count += 1
            if retry_count >= max_retries:
                return None, e
    return model, sitetext
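# Usage sketch (illustrative only; the URL below is a placeholder, not taken from this
# module, and the error handling simply re-raises the exception returned on failure):
#
#   model, sitetext = load_rdfa_page('http://link.example-library.org')
#   if model is None:
#       raise sitetext  # on failure the second value is the exception from the last retry
#   # otherwise `model` holds the RDFa-derived Versa links and `sitetext` the raw HTML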
def test_versa_syntax1():
    config = {
        'autotype-h1': 'http://example.org/r1',
        'autotype-h2': 'http://example.org/r2',
        'interpretations': {
            VERSA_BASEIRI + 'refines': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'properties': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'synonyms': VERSA_BASEIRI + 'resourceset'
        }
    }

    m = memory.connection(baseiri='http://example.org/')
    #from_markdown(VERSA_LITERATE1, m, encoding='utf-8')
    doc = open(os.path.join(RESOURCEPATH, 'ubibframe.md')).read()
    from_markdown(doc, m, config=config)
    logging.debug('VERSA LITERATE EXAMPLE 1')
    for link in m.match():
        logging.debug('Result: {0}'.format(repr(link)))
        #assert result == ()
    #assert results == None, "Boo! "
def test_versa_syntax1():
    config = {
        'autotype-h1': 'http://example.org/r1',
        'autotype-h2': 'http://example.org/r2',
        'interpretations': {
            VERSA_BASEIRI + 'refines': VERSA_BASEIRI + 'resource',
            VERSA_BASEIRI + 'properties': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'synonyms': VERSA_BASEIRI + 'resourceset'
        }
    }

    m = memory.connection(baseiri='http://example.org/')
    #from_markdown(VERSA_LITERATE1, m, encoding='utf-8')
    doc = open(os.path.join(RESOURCEPATH, 'ubibframe.md')).read()
    from_markdown(doc, m, config=config)
    logging.debug('VERSA LITERATE EXAMPLE 1')
    for stmt in m.match():
        logging.debug('Result: {0}'.format(repr(stmt)))
        #assert result == ()
    #assert results == None, "Boo! "
async def send(self, data):
    #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
    (body, respurl, respheaders, referrer, task_id) = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if self._fphost == respurlhost:
        #ntriplify_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
        root = html5.parse(body)
        linkset = self._queue_links(root, respurl)
        try:
            _, resid = llnurl_ident(respurl)
        except ValueError:
            resid = None
        if resid:
            #Lock the file for
            resstem = resid[:3]
            ntriplify_sink.locks.setdefault(resstem, Lock())
            await ntriplify_sink.locks[resstem]
            try:
                #Note:
                #timeit.timeit('rdfalite.toversa(open("index.html").read(), model, "http://link.delawarelibrary.org/portal/Nest-Esther-Ehrlich-overdrive-ebook/F-h_bGCl5lk/")', setup='from versa.driver import memory; from versa.reader import rdfalite; model = memory.connection()', number=10)
                #4.412366830001702
                #timeit.timeit('g = rdflib.Graph(); g.parse("index.html", format="html")', setup='import rdflib', number=10)
                #[snip tons of warnings]
                #16.82040351499745
                #IOW Versa is 4X faster than RDFlib for this task, and more robust
                with open(os.path.join(self.outfolder, resstem + '.nt'), 'a') as resstem_fp:
                    model = memory.connection()
                    rdfalite.toversa(body, model, respurl)
                    ntriples.write(model, out=resstem_fp, base=respurl, logger=ntriplify_sink.logger)
            finally:
                ntriplify_sink.locks[resstem].release()
        #self.save_ntriples()
        return linkset
    return None
def send(self, data):
    #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
    (body, respurl, respheaders, referrer, task_id) = data
    _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
    if LIBRARY_LINK_HEADER not in respheaders:
        #Not even an LLN page at all
        return
    if self._fphost == respurlhost:
        output_model = memory.connection()
        quickinfo_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
        #Subpage of the target site
        rdfalite.toversa(body, output_model, respurl)
        resname = versautil.simple_lookup(output_model, respurl, SCHEMAORG + 'name')
        print(respurl, '|', resname, file=quickinfo_sink.outfp)
        #orgentity = util.simple_lookup_byvalue(model, RDFTYPE, SCHEMAORG + 'Organization')
        #name = util.simple_lookup(model, orgentity, BL + 'name')
        #name = util.simple_lookup(model, baseurl + '#_default', BL + 'name')
        root = html5.parse(body)
        linkset = self._queue_links(root, respurl)
        return linkset
def test_basics():
    "test ..."
    model = memory.connection()
    for (subj, pred, obj, attrs) in RELS_1:
        model.add(subj, pred, obj, attrs)

    results = model.match(origin='http://copia.ogbuji.net')
    logging.debug('BASICS PART 1')
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    #assert results == None, "Boo! "

    results = model.match(origin='http://uche.ogbuji.net', attrs={u'@lang': u'ig'})
    logging.debug('BASICS PART 2')
    results = tuple(list(results))
    import pprint; pprint.pprint(results)
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    expected = (('http://uche.ogbuji.net', 'http://purl.org/dc/elements/1.1/title', 'Ulo Uche', {'@context': 'http://uche.ogbuji.net#_metadata', '@lang': 'ig'}),)
    assert results == expected, (results, expected)
def run(infile):
    m = memory.connection()
    from_markdown(infile.read(), m)
    #from versa.util import jsondump
    #jsondump(m, open('/tmp/foo.json', 'w'))
    print('<descriptionSet>')
    for poem in resources_by_type(m, 'http://uche.ogbuji.net/poems/poem'):
        choice = '@choice' in list(map(operator.itemgetter(TARGET), m.match(poem, 'http://www.w3.org/2005/Atom/category')))
        if not choice:
            continue
        print('<description>')
        d = parse_date(simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/updated'))
        source = next(m.match(poem, 'http://www.w3.org/2005/Atom/source'))
        source = source[ATTRIBUTES]['title']
        title = simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/title')
        print(' <title>{0}</title>\n <date>{1}</date>\n <publisher>{2}</publisher>'.format(title, d.strftime('%B, %Y'), source))
        hlink = list(map(operator.itemgetter(TARGET), m.match(poem, 'http://www.w3.org/2005/Atom/link')))
        if hlink:
            hlink = hlink[0]
            print(' <link href="{0}"/>'.format(hlink))
        print('</description>')
    print('</descriptionSet>')
def start_element(self, name, attributes):
    if self._lax:
        (head, sep, tail) = name.partition(':')
        local = tail or head
        ns = MARCXML_NS #Just assume all elements are MARC/XML
    else:
        ns, local = name.split(NSSEP) if NSSEP in name else (None, name)
    if ns == MARCXML_NS:
        #Ignore the 'collection' element
        #What to do with the record/@type
        if local == 'record':
            self.no_records = False
            #XXX: Entity base IRI needed?
            self._record_id = 'record-{0}:{1}'.format(self._parser.CurrentLineNumber, self._parser.CurrentColumnNumber)
            #Versa model with a representation of the record
            #For input model plugins, important that natural ordering be preserved
            self._record_model = memory.connection(attr_cls=OrderedDict) #logger=logger)
        elif local == 'leader':
            self._chardata_dest = ''
            self._link_iri = MARCXML_NS + '/leader'
            self._marc_attributes = OrderedDict()
            self._getcontent = True
        elif local == 'controlfield':
            self._chardata_dest = ''
            self._link_iri = MARCXML_NS + '/control/' + attributes['tag'].strip()
            #Control tags have neither indicators nor subfields
            self._marc_attributes = OrderedDict({'tag': attributes['tag'].strip()})
            self._getcontent = True
        elif local == 'datafield':
            self._link_iri = MARCXML_NS + '/data/' + attributes['tag'].strip()
            self._marc_attributes = OrderedDict(([k, v.strip()] for (k, v) in attributes.items() if ' ' not in k))
        elif local == 'subfield':
            self._chardata_dest = ''
            self._subfield = attributes['code'].strip()
            if not self._subfield or ord(self._subfield) not in VALID_SUBFIELDS:
                self._subfield = '_'
            self._getcontent = True
    return
def jsonize_site(site, rules=None):
    '''
    >>> from librarylink.util import jsonize_site
    >>> obj = jsonize_site('http://link.denverlibrary.org')
    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)

    >>> rules = {'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'], 'rename-predicates': {'http://library.link/vocab/branchOf': 'http://schema.org/branch'}}
    >>> obj = jsonize_site('http://link.denverlibrary.org', rules=rules)
    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)
    '''
    from versa.util import uniquify
    from versa.writer import jsonld
    rules = rules or {}
    ignore_pred = rules.get('ignore-predicates', set())
    rename_pred = rules.get('rename-predicates', {})
    ignore_oftypes = rules.get('ignore-oftypes', [])
    invert = rules.get('invert', {})
    context = rules.get('context', {})
    pre_model, _ = load_rdfa_page(site)
    if not pre_model:
        return None
    uniquify(pre_model)
    post_model = memory.connection()
    for o, r, t, a in pre_model.match():
        #print(o, r, t)
        for oldp, newp in rename_pred.items():
            if r == oldp:
                r = newp
        for rpre, rpost in invert.items():
            if r == rpre:
                assert isinstance(t, I)
                o, r, t = t, rpost, o
        for igp in ignore_pred:
            if r.startswith(igp):
                break
        else:
            post_model.add(o, r, t, a)

    obj = jsonld.bind(post_model, context=context, ignore_oftypes=ignore_oftypes)
    return obj
def bfconvert(inputs, handle_marc_source=handle_marcxml_source, entbase=None, model=None, out=None, limit=None, rdfttl=None, rdfxml=None, xml=None, config=None, verbose=False, logger=logging, loop=None, canonical=False, lax=False, defaultsourcetype=inputsourcetype.unknown):
    '''
    inputs - One or more open file-like objects, strings with MARC content, or filenames or IRIs. If a filename or IRI, it's a good idea to indicate this via the defaultsourcetype parameter
    handle_marc_source - Function to turn a source of MARC data (e.g. XML or JSON) into the internal format for processing
    entbase - Base IRI to be used for creating resources.
    model - model instance for internal use
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML output should be written
    config - configuration information
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    loop - optional asyncio event loop to use
    canonical - output Versa's canonical form?
    lax - If True signal to the handle_marc_source function that relaxed syntax rules should be applied (e.g. accept XML with namespace problems)
    defaultsourcetype - Signal indicating how best to interpret inputs to create an inputsource
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    def resolve_class(fullname):
        '''
        Given a full name for a Python class, return the class object
        '''
        import importlib
        modpath, name = fullname.rsplit('.', 1)
        module = importlib.import_module(modpath)
        cls = getattr(module, name)
        return cls

    attr_cls = resolve_class(config.get('versa-attr-cls', 'builtins.dict'))

    model_factory = functools.partial(memory.connection, attr_cls=attr_cls) #, logger=logger)

    if 'marc_record_handler' in config:
        handle_marc_source = AVAILABLE_MARC_HANDLERS[config['marc_record_handler']]

    readmode = handle_marc_source.readmode
    #inputs = ( inputsource(open(i, readmode)) for i in inputs )
    #if not isinstance(inputs[0], inputsource):
    #    inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )
    if handle_marc_source.makeinputsource:
        inputs = factory(inputs, defaultsourcetype=defaultsourcetype, streamopenmode=readmode)
    #inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )

    ids = marc.idgen(entbase)
    if model is None:
        model = model_factory()

    if any((rdfttl, rdfxml)):
        import rdflib
        BFNS = rdflib.Namespace(BFZ)
        BFCNS = rdflib.Namespace(BFZ + 'cftag/')
        BFDNS = rdflib.Namespace(BFZ + 'dftag/')
        g = rdflib.Graph()

    #Intentionally not using either factory
    if canonical:
        global_model = memory.connection() #logger=logger)

    if xml is not None:
        xmlw = writer.raw(xml, indent=' ')
        xmlw.start_element('bibframe')

    extant_resources = None
    #extant_resources = set()
    def postprocess():
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if any((rdfttl, rdfxml)):
            rdf.process(model, g, to_ignore=extant_resources, logger=logger)
        if canonical:
            global_model.add_many([(o, r, t, a) for (rid, (o, r, t, a)) in model])
        if xml is not None:
            microxml.process(model, xmlw, to_ignore=extant_resources, logger=logger)
        model.create_space()

    #Set up event loop if not provided
    if not loop:
        loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get('vocab-base-uri', BL)

    transform_iris = config.get('transforms', [])
    marcspecials_vocab = config.get('marcspecials-vocab')
    transforms = transform_set(transform_iris, marcspecials_vocab)

    lookups = config.get('lookups', {})

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get('plugins', []):
        try:
            pinfo = g_services[pc['id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception('Unknown plugin {0}'.format(pc['id']))

    limiting = [0, limit]
    #logger=logger,

    #raise(Exception(repr(inputs)))
    for source in inputs:
        @asyncio.coroutine
        #Wrap the parse operation to make it a task in the event loop
        def wrap_task(): #source=source
            sink = marc.record_handler(loop, model, entbase=entbase, vocabbase=vb, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger, transforms=transforms, canonical=canonical, lookups=lookups, model_factory=model_factory)
            args = dict(lax=lax)
            handle_marc_source(source, sink, args, logger, model_factory)
            sink.close()
            yield
        task = asyncio.ensure_future(wrap_task(), loop=loop)

        try:
            loop.run_until_complete(task)
        except Exception as ex:
            raise ex
        finally:
            loop.close()

    if canonical:
        out.write(repr(global_model))

    if any((rdfttl, rdfxml)):
        if vb == BFZ:
            g.bind('bf', BFNS)
            g.bind('bfc', BFCNS)
            g.bind('bfd', BFDNS)
        else:
            g.bind('vb', rdflib.Namespace(vb))
        if entbase:
            g.bind('ent', entbase)

        if rdfttl is not None:
            logger.debug('Converting to RDF (Turtle).')
            rdfttl.write(g.serialize(format="turtle"))
        if rdfxml is not None:
            logger.debug('Converting to RDF (XML).')
            rdfxml.write(g.serialize(format="pretty-xml"))

    if xml is not None:
        logger.debug('Converting to XML.')
        xmlw.end_element('bibframe')
    return
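# Minimal usage sketch for bfconvert, modeled on the tests above (test_author_in_marc,
# test_model_consumed). The file path is a placeholder; assumes StringIO, asyncio and
# memory are imported as in those tests:
#
#   loop = asyncio.new_event_loop()
#   asyncio.set_event_loop(None)
#   m = memory.connection()
#   out = StringIO()
#   with open('records.mrx', 'rb') as marcfile:  # placeholder MARC/XML input
#       bfconvert([marcfile], entbase='http://example.org/', model=m, out=out, canonical=True, loop=loop)
#   out.seek(0)  # `out` now holds the canonical Versa dump of the converted records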