Example #1
def test_pipeline():
    idg = idgen(EXAMPLE_ORG)
    existing_ids = []
    mat = functools.partial(materialize, hashidgen=idg, existing_ids=existing_ids)

    TRANSFORMS = {
        'id': functools.partial(discard),
        'title': functools.partial(relabel, new_rel='name'),
        'author': functools.partial(mat, new_rel='author', unique=run('target'), typ='Person', properties={'name': run('target')}),
        'link': functools.partial(relabel, new_rel='link'),
        'cover': functools.partial(relabel, new_rel='cover'),
    }
    #'type': functools.partial(relabel, new_rel=VTYPE_REL),

    out_m = memory.connection(baseiri='http://example.org/')

    rid = SIMPLE_BOOK['id']
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)
    for k, v in SIMPLE_BOOK.items():
        link = (rid, k, v)
        func = TRANSFORMS.get(k)
        if func:
            in_m = memory.connection(baseiri='http://example.org/')
            ctx = context(link, in_m, out_m, base=SCHEMA_ORG)
            func(ctx)
    
    assert out_m.size() == 7
    assert next(out_m.match('http://example.org/book/catcher-in-the-rye', VTYPE_REL))[TARGET] == BOOK_TYPE
    assert next(out_m.match('http://example.org/book/catcher-in-the-rye', I(iri.absolutize('name', SCHEMA_ORG))))[TARGET] == 'The Catcher in the Rye'
Example #2
def run_one(snippet,
            expected,
            desc,
            entbase=None,
            config=None,
            loop=None,
            canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    infile = tempfile.NamedTemporaryFile()
    infile.write(snippet.encode('utf-8'))
    infile.seek(0)
    outstream = StringIO()
    bfconvert([infile],
              model=m,
              out=outstream,
              config=config,
              canonical=canonical,
              loop=loop)
    #bfconvert(factory(infile), model=m, out=outstream, config=config, canonical=canonical, loop=loop)
    infile.close()
    outstream.seek(0)
    hashmap, m = hash_neutral_model(outstream)
    hashmap = '\n'.join(sorted([repr((i[1], i[0])) for i in hashmap.items()]))

    expected_stream = StringIO(expected)
    hashmap_expected, m_expected = hash_neutral_model(expected_stream)
    hashmap_expected = '\n'.join(
        sorted([repr((i[1], i[0])) for i in hashmap_expected.items()]))

    assert hashmap == hashmap_expected, "Changes to hashes found ({0}):\n{1}\n\nActual model structure diff:\n{2}".format(
        desc, file_diff(hashmap_expected, hashmap),
        file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found ({0}):\n{1}".format(
        desc, file_diff(repr(m_expected), repr(m)))
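Below is a hedged usage sketch for the run_one harness above; SNIPPET and EXPECTED are hypothetical placeholders for a MARC/XML document and its expected Versa JSON dump, not real fixtures:

#Hypothetical fixtures; real tests would supply actual MARC/XML and Versa JSON text
SNIPPET = '<collection xmlns="http://www.loc.gov/MARC21/slim">...</collection>'
EXPECTED = '[...]'
run_one(SNIPPET, EXPECTED, 'minimal smoke test')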
Example #3
def test_pipeline1():
    idg = idgen(EXAMPLE_ORG)
    existing_ids = set()

    TRANSFORMS = {
        "id": discard(),
        "title": rename(rel="name"),
        "author": materialize("Person", rel="author", unique=run("target"), links={"name": run("target")}),
        "link": rename(rel="link"),
        "cover": rename(rel="cover"),
    }
    #'type': functools.partial(relabel, rel=VTYPE_REL),

    out_m = memory.connection(baseiri="http://example.org/")

    rid = SIMPLE_BOOK["id"]
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)
    for k, v in SIMPLE_BOOK.items():
        link = (rid, k, v, {})
        func = TRANSFORMS.get(k)
        if func:
            in_m = memory.connection(baseiri="http://example.org/")
            ctx = context(link, in_m, out_m, base=SCHEMA_ORG, idgen=idg)
            func(ctx)

    assert out_m.size() == 7, repr(out_m)
    assert next(out_m.match("http://example.org/book/catcher-in-the-rye", VTYPE_REL))[TARGET] == BOOK_TYPE
    assert (
        next(out_m.match("http://example.org/book/catcher-in-the-rye", I(iri.absolutize("name", SCHEMA_ORG))))[TARGET]
        == "The Catcher in the Rye"
    )
Example #4
def test_work_fallback_author_in_marc_with_plusbib():
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    bfconvert([BytesIO(REGULAR_MARC_EXAMPLE)], model=m, out=s, config=WORK_FALLBACK_AUTHOR_IN_MARC_CONFIG_PLUS_BIB, canonical=True)
    s.seek(0)

    #with open('/tmp/foo.versa.json', 'w') as f:
    #    f.write(s.read())
    #s.seek(0)

    hashmap, m = hash_neutral_model(s)
    hashmap = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap.items() ]))

    removals = []
    #Strip out tag-XXX relationships
    for ix, (o, r, t, a) in m:
        #logging.debug(r)
        if r.startswith('http://bibfra.me/vocab/marcext/tag-') or r.startswith('http://bibfra.me/vocab/marcext/sf-'):
            removals.append(ix)
    m.remove(removals)

    hashmap_expected, m_expected = hash_neutral_model(StringIO(WORK_FALLBACK_AUTHOR_IN_MARC_EXPECTED_PLUS_BIB))
    hashmap_expected = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap_expected.items() ]))

    assert hashmap == hashmap_expected, "Changes to hashes found:\n{0}\n\nActual model structure diff:\n{1}".format(file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found:\n{0}".format(file_diff(repr(m_expected), repr(m)))
Example #5
def run_one(snippet, expected, desc, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    instream = BytesIO(snippet.encode('utf-8'))
    outstream = StringIO()
    bfconvert(instream, model=m, out=outstream, config=config, canonical=canonical, loop=loop)
    outstream.seek(0)
    jsonload(m, outstream)

    expected_stream = StringIO(expected)
    jsonload(m_expected, expected_stream)

    assert m == m_expected, "Discrepancies found ({0}):\n{1}".format(desc, file_diff(repr(m_expected), repr(m)))
Example #6
def run_one(name, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    with open(os.path.join(RESOURCEPATH, name+'.mrx'), 'rb') as indoc:
        bfconvert(indoc, model=m, out=s, config=config, canonical=canonical, loop=loop)
        s.seek(0)
        jsonload(m, s)

    with open(os.path.join(RESOURCEPATH, name+'.versa')) as indoc:
        jsonload(m_expected, indoc)

    assert m == m_expected, "Discrepancies found for {0}:\n{1}".format(name, file_diff(repr(m_expected), repr(m)))
Example #7
async def rdfa_from_page(url, session=None, max_retries=1):
    '''
    Async helper to load RDFa page as text, plus load a Versa model with the metadata
    
    Returns a Versa memory model, the raw site text, and the HTTP response info; in the error case it returns (None, exception, None)

    >>> from amara3.asynctools import go_async
    >>> from librarylink.util import rdfa_from_page
    >>> from versa import util as versautil
    >>> url = "http://link.crlibrary.org/portal/Estamos-en-un-libro-por-Mo-Willems--traducido/ZAxkTVTDCxE/"
    >>> model, sitetext, response = go_async(rdfa_from_page(url))
    >>> next(versautil.lookup(model, 'http://link.crlibrary.org/resource/zXft1yv0T9k/', 'http://schema.org/name'))
    'Libros y lectura -- Novela juvenil'
    '''
    retry_count = 0
    while True:
        model = memory.connection()
        try:
            if session is None:
                import aiohttp
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        body = await response.read()
                        rdfalite.toversa(body, model, url)
                        return model, body, response
            else:
                async with session.get(url) as response:
                    body = await response.read()
                    rdfalite.toversa(body, model, url)
                    return model, body, response
        except Exception as e:
            #print(url, f'[EXCEPTION {e}], context: {context}')
            retry_count += 1
            if retry_count >= max_retries:
                return None, e, None
Example #8
async def network_isbn_info(isbn, session=None, max_retries=1):
    '''
    Async helper to get JSON content from network resource page
    
    Returns a JSON object

    >>> from amara3.asynctools import go_async
    >>> from librarylink.resource import network_isbn_info
    >>> obj = go_async(network_isbn_info(9780871290861))
    >>> obj['workExample'][0].get('holdings_count')
    19
    '''
    retry_count = 0
    url = LL_ISBN_STEMPLATE.format(**{'isbn': isbn})
    #print('processing', url, file=sys.stderr)
    while True:
        await asyncio.sleep(0.2)
        model = memory.connection()
        try:
            if session is None:
                import aiohttp
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        obj = await response.json()
                        return obj
            else:
                async with session.get(url) as response:
                    obj = await response.json()
                    return obj
        except Exception as e:
            #print(url, f'[EXCEPTION {e}], context: {context}', file=sys.stderr)
            retry_count += 1
            if retry_count >= max_retries:
                return None
Example #9
def test_basics():
    "Basic query test"
    m = memory.connection()
    for l in RELS_1:
        m.add(*l)
    variables = {'DC': DC, 'H5': H5, 'H5L': H5L}
    ctx = context(tuple(RELS_1[0]),
                  m,
                  U + 'uo',
                  base=None,
                  extras=None,
                  variables=variables)
    parsed = miniparse("?($a, H5 'title', *) and ?($b, H5L 'see-also', $a)")
    result = parsed.evaluate(ctx)
    assert result == {
        'a': set(['http://uche.ogbuji.net/ndewo/']),
        'b': set(['http://uche.ogbuji.net/'])
    }

    parsed = miniparse("?($a, H5L 'see-also', *)")
    result = parsed.evaluate(ctx)
    assert result == {
        'a': set(['http://uche.ogbuji.net/', 'http://uche.ogbuji.net/ndewo/'])
    }

    parsed = miniparse("?($a, H5 'title', *)")
    result = parsed.evaluate(ctx)
    assert result == {'a': set(['http://uche.ogbuji.net/ndewo/'])}
    return
Example #11
def test_basics():
    "test ..."
    model = memory.connection()
    for (subj, pred, obj, attrs) in RELS_1:
        model.add(subj, pred, obj, attrs)
    results = model.match(origin='http://copia.ogbuji.net')
    logging.debug('BASICS PART 1')
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    #assert results == None, "Boo! "

    results = model.match(origin='http://uche.ogbuji.net',
                          attrs={u'@lang': u'ig'})
    logging.debug('BASICS PART 2')
    results = tuple(results)
    #import pprint; pprint.pprint(results)
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    expected = (('http://uche.ogbuji.net',
                 'http://purl.org/dc/elements/1.1/title', 'Ulo Uche', {
                     '@context': 'http://uche.ogbuji.net#_metadata',
                     '@lang': 'ig'
                 }), )
    assert results == expected, (results, expected)
Example #12
def test_author_in_marc():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(None)

    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    bfconvert([BytesIO(AUTHOR_IN_MARC)],
              model=m,
              out=s,
              config=AUTHOR_IN_MARC_CONFIG,
              canonical=True,
              loop=loop)
    s.seek(0)

    #with open('/tmp/foo.versa.json', 'w') as f:
    #    f.write(s.read())
    #s.seek(0)

    #sys.exit(-1)

    hashmap, m = hash_neutral_model(s)
    hashmap = '\n'.join(sorted([repr((i[1], i[0])) for i in hashmap.items()]))

    removals = []
    #Strip out tag-XXX relationships
    for ix, (o, r, t, a) in m:
        #logging.debug(r)
        if r.startswith('http://bibfra.me/vocab/marcext/tag-') or r.startswith(
                'http://bibfra.me/vocab/marcext/sf-'):
            removals.append(ix)
    m.remove(removals)

    #with open('/tmp/foo.versa.json', 'w') as f:
    #    f.write(repr(m))

    hashmap_expected, m_expected = hash_neutral_model(
        StringIO(AUTHOR_IN_MARC_EXPECTED))
    hashmap_expected = '\n'.join(
        sorted([repr((i[1], i[0])) for i in hashmap_expected.items()]))

    assert hashmap == hashmap_expected, "Changes to hashes found:\n{0}\n\nActual model structure diff:\n{1}".format(
        file_diff(hashmap_expected, hashmap),
        file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found:\n{0}".format(
        file_diff(repr(m_expected), repr(m)))
Example #13
def test_model_consumed():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(None)
    m = memory.connection()
    with open(os.path.join(RESOURCEPATH, 'multiple-authlinks.xml'), 'rb') as indoc:
        bfconvert([indoc], entbase='http://example.org/', model=m, config=None, verbose=False, loop=loop)

    assert m.size() == 0, 'Model not consumed:\n'+repr(m)
Example #14
def hash_neutral_model(stream):
    '''
    >>> VJSON = """[
    ["DoVM1hvc","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/lite/Person",{"@target-type": "@iri-ref"}],
    ["DoVM1hvc","http://bibfra.me/vocab/lite/date","1878-1967.",{}],
    ["DoVM1hvc","http://bibfra.me/vocab/lite/name","Sandburg, Carl,",{}],
    ["DoVM1hvc","http://bibfra.me/vocab/marcext/sf-a","Sandburg, Carl,",{}],
    ["DoVM1hvc","http://bibfra.me/vocab/marcext/sf-d","1878-1967.",{}],
    ["Ht2FQsIY","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/lite/Instance",{"@target-type": "@iri-ref"}],
    ["Ht2FQsIY","http://bibfra.me/vocab/lite/instantiates","XsrrgYIS",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/lite/Work",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/marc/Books",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/purl/versa/type","http://bibfra.me/vocab/marc/LanguageMaterial",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/vocab/lite/creator","DoVM1hvc",{"@target-type": "@iri-ref"}],
    ["XsrrgYIS","http://bibfra.me/vocab/marc/natureOfContents","encyclopedias",{}],
    ["XsrrgYIS","http://bibfra.me/vocab/marc/natureOfContents","legal articles",{}],
    ["XsrrgYIS","http://bibfra.me/vocab/marc/natureOfContents","surveys of literature",{}],
    ["XsrrgYIS","http://bibfra.me/vocab/marcext/tag-008","920219s1993 caua j 000 0 eng",{}]
    ]"""
    >>> from io import StringIO, BytesIO
    >>> s = StringIO(VJSON)
    >>> from bibframe.util import hash_neutral_model
    >>> hashmap, model = hash_neutral_model(s)
    >>> hashmap
    {'XsrrgYIS': '@R0', 'DoVM1hvc': '@R1', 'Ht2FQsIY': '@R2'}
    >>> [ (o, r, t, a) for (rid, (o, r, t, a)) in model ][0] #Safe ordering for memory model only, mind you
    ('@R1', 'http://bibfra.me/vocab/lite/name', 'Sandburg, Carl,', OrderedDict())
    '''
    stage1 = memory.connection()
    stage2 = memory.connection()
    stage3 = memory.connection()
    jsonload(stage1, stream)
    hashmap = {}
    #One pass for origins
    dummy = repr(stage1) #Mysterious bug (presumably in jsonload): attributes lose all their contents without this line
    for (rid, (o, r, t, a)) in sorted(stage1, key=lambda x:x[1][0]): # sort by resource id
        hash_neutral_origin = hashmap.setdefault(o, '@R{}'.format(len(hashmap)))
        stage2.add(hash_neutral_origin, r, t, a)
    del stage1 #clean up
    #Another pass for targets
    for (rid, (o, r, t, a)) in sorted(stage2):
        hash_neutral_target = t
        if a.get("@target-type") == "@iri-ref":
            hash_neutral_target = hashmap.get(t, t)
        stage3.add(o, r, hash_neutral_target, a)
    return hashmap, stage3
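hash_neutral_model above makes two serialized models comparable regardless of their randomly hashed resource IDs. A minimal sketch, assuming json_a and json_b hold Versa JSON text:

from io import StringIO

#Assumed inputs: json_a and json_b are Versa JSON dumps of the two models
hashmap_a, model_a = hash_neutral_model(StringIO(json_a))
hashmap_b, model_b = hash_neutral_model(StringIO(json_b))
#The models compare equal if they differ only in their hash-based resource IDs
assert model_a == model_b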
Example #16
def test_model_consumed():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(None)
    m = memory.connection()
    fname = os.path.join(RESOURCEPATH, 'multiple-authlinks.mrx')
    #bfconvert([inputsource(open(fname, 'rb'))], entbase='http://example.org/', model=m, config=None, verbose=False, loop=loop)
    bfconvert([open(fname, 'rb')], entbase='http://example.org/', model=m, config=None, verbose=False, loop=loop)

    assert m.size() == 0, 'Model not consumed:\n'+repr(m)
Example #17
def test_index():
    model = memory.connection()
    r1 = model.add('s1','p0','lit0',{})
    r2 = model.add('s1','p1','lit1',{})
    r3 = model.add('s1','p2','lit2',{})

    assert model[r1][0] == 's1'
    assert model[r2][1] == 'p1'
    assert model[r3][2] == 'lit2'
Example #18
def test_index():
    model = memory.connection()
    r1 = model.add('s1', 'p0', 'lit0', {})
    r2 = model.add('s1', 'p1', 'lit1', {})
    r3 = model.add('s1', 'p2', 'lit2', {})

    assert model[r1][0] == 's1'
    assert model[r2][1] == 'p1'
    assert model[r3][2] == 'lit2'
Example #19
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, logger=logging):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    config - configuration information
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]
    config = config or {}
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None: rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            ))
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None: rdfttl.write(g.serialize(format="turtle"))
    for plugin in plugins:
        plugin.close()
    return
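A minimal, hypothetical invocation of this bfconvert variant; the filename and base IRI are illustrative, and the raw Versa JSON dump goes to stdout since out is not given:

with open('records.mrx') as marcfile: #hypothetical MARC/XML input file
    bfconvert([marcfile], base='http://example.org/')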
Example #20
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #csvexport_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)

            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                model = memory.connection()
                rdfalite.toversa(body, model, respurl)
                #Lock the file for
                resstem = resid[:HASH_WIDTH]
                csvexport_sink.locks.setdefault(resstem, Lock())
                #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
                print('Awaiting lock on {}; TASK [{}].'.format(
                    resstem, task_id),
                      file=sys.stderr)
                await csvexport_sink.locks[resstem]
                print('Acquired lock on {}; TASK [{}].'.format(
                    resstem, task_id),
                      file=sys.stderr)

                try:
                    resstem_fpath = os.path.join(self.outfolder,
                                                 resstem + '.csv')
                    csvexists = os.path.exists(resstem_fpath)
                    #with gzip.open(resstem_fpath, 'at', newline='') as resstem_fp:
                    with open(resstem_fpath, 'at', newline='') as resstem_fp:
                        resstem_csv = csv.writer(resstem_fp,
                                                 delimiter=',',
                                                 quotechar='"',
                                                 quoting=csv.QUOTE_MINIMAL)
                        vcsv.write(model,
                                   resstem_csv,
                                   self.rules,
                                   not csvexists,
                                   base=respurl,
                                   logger=csvexport_sink.logger)
                finally:
                    csvexport_sink.locks[resstem].release()
                    #csvexport_sink.logger.debug('Released lock on {}; TASK [{}].'.format(resstem, task_id))
                    print('Released lock on {}; TASK [{}].'.format(
                        resstem, task_id),
                          file=sys.stderr)

            #self.save_ntriples()
            return linkset
        return None
Example #21
def run_one(snippet, expected, desc, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    infile = tempfile.NamedTemporaryFile()
    infile.write(snippet.encode('utf-8'))
    infile.seek(0)
    outstream = StringIO()
    bfconvert([infile], model=m, out=outstream, config=config, canonical=canonical, loop=loop)
    #bfconvert(factory(infile), model=m, out=outstream, config=config, canonical=canonical, loop=loop)
    infile.close()
    outstream.seek(0)
    hashmap, m = hash_neutral_model(outstream)
    hashmap = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap.items() ]))

    expected_stream = StringIO(expected)
    hashmap_expected, m_expected = hash_neutral_model(expected_stream)
    hashmap_expected = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap_expected.items() ]))

    assert hashmap == hashmap_expected, "Changes to hashes found ({0}):\n{1}\n\nActual model structure diff:\n{2}".format(desc, file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found ({0}):\n{1}".format(desc, file_diff(repr(m_expected), repr(m)))
Example #22
def run_one(name, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    fname = os.path.join(RESOURCEPATH, name+'.mrx')
    #bfconvert(factory(open(fname, 'rb')), model=m, out=s, config=config, canonical=canonical, loop=loop)
    #raise(Exception(repr(inputsource(open(fname, 'rb')))))

    bfconvert([inputsource(open(fname, 'rb'))], model=m, out=s, config=config, canonical=canonical, loop=loop)
    s.seek(0)
    hashmap, m = hash_neutral_model(s)
    hashmap = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap.items() ]))

    with open(os.path.join(RESOURCEPATH, name+'.versa')) as indoc:
        hashmap_expected, m_expected = hash_neutral_model(indoc)
        hashmap_expected = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap_expected.items() ]))

    assert hashmap == hashmap_expected, "Changes to hashes found for {0}:\n{1}\n\nActual model structure diff:\n{2}".format(name, file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found for {0}:\n{1}".format(name, file_diff(repr(m_expected), repr(m)))
Example #23
def run_one(name, entbase=None, config=None, variation=''):
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    fpath = os.path.join(RESOURCEPATH, name+'.mrx')
    instream = BytesIO(open(fpath, 'rb').read())

    print('Running {} ...'.format('('+variation+')' if variation else ''), fpath)

    def main():
        #Need a new event loop per timeit iteration
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(None)
        instream.seek(io.SEEK_SET)
        bfconvert(instream, model=m, out=s, config=config, loop=loop)

    global_space = globals()
    global_space.update(locals())
    timing = timeit.timeit('main()', setup='', number=NLOOPS, globals=global_space)
    print('{} loops: {:.2f} sec total, {:.2f} sec per loop.'.format(NLOOPS, timing, timing/NLOOPS))
Example #24
def test_ordering_insertion():
    model = memory.connection()
    model.add('s1','p1','lit1',{})
    model.add('s1','p2','lit2',{})
    model.add('s1','p0','lit0',{},index=1)
    model.add('s2','p3','lit3',{})

    assert list(model)[0][1][1] == 'p1'
    assert list(model)[1][1][1] == 'p0'
    assert list(model)[2][1][1] == 'p2'
    assert list(model)[3][1][1] == 'p3'
Example #25
def test_copy():
    model = memory.connection()
    r1 = model.add('s1','p0','lit0',{})
    r2 = model.add('s1','p1','lit1',{})
    r3 = model.add('s1','p2','lit2',{})

    model2 = model.copy()
    assert model == model2

    model3 = model.copy(contents=False)
    assert model3.size() == 0
Example #26
def test_ordering_insertion():
    model = memory.connection()
    model.add('s1', 'p1', 'lit1', {})
    model.add('s1', 'p2', 'lit2', {})
    model.add('s1', 'p0', 'lit0', {}, index=1)
    model.add('s2', 'p3', 'lit3', {})

    assert list(model)[0][1][1] == 'p1'
    assert list(model)[1][1][1] == 'p0'
    assert list(model)[2][1][1] == 'p2'
    assert list(model)[3][1][1] == 'p3'
Example #27
def test_copy():
    model = memory.connection()
    r1 = model.add('s1', 'p0', 'lit0', {})
    r2 = model.add('s1', 'p1', 'lit1', {})
    r3 = model.add('s1', 'p2', 'lit2', {})

    model2 = model.copy()
    assert model == model2

    model3 = model.copy(contents=False)
    assert model3.size() == 0
Example #28
def run_one(name, entbase=None, config=None, loop=None, canonical=True):
    m = memory.connection()
    m_expected = memory.connection()
    s = StringIO()

    fname = os.path.join(RESOURCEPATH, name+'.mrx')
    #bfconvert(factory(open(fname, 'rb')), model=m, out=s, config=config, canonical=canonical, loop=loop)
    #raise(Exception(repr(inputsource(open(fname, 'rb')))))

    bfconvert([inputsource(open(fname, 'rb'))], model=m, out=s, config=config, canonical=canonical, loop=loop)
    s.seek(0)
    hashmap, m = hash_neutral_model(s)
    hashmap = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap.items() ]))

    with open(os.path.join(RESOURCEPATH, name+'.versa')) as indoc:
        hashmap_expected, m_expected = hash_neutral_model(indoc)
        hashmap_expected = '\n'.join(sorted([ repr((i[1], i[0])) for i in hashmap_expected.items() ]))

    assert hashmap == hashmap_expected, "Changes to hashes found for {0}:\n{1}\n\nActual model structure diff:\n{2}".format(name, file_diff(hashmap_expected, hashmap), file_diff(repr(m_expected), repr(m)))
    assert m == m_expected, "Discrepancies found for {0}:\n{1}".format(name, file_diff(repr(m_expected), repr(m)))
Example #29
def test_basic_marc1():
    import os
    import amara
    from bibframe.reader.marc import process
    indoc = amara.parse(os.path.join(RESOURCEPATH, 'kford-holdings1.xml'))
    #Top level ma:collection is optional, so can't just assume /ma:collection/ma:record XPath
    recs = indoc.xml_select(u'//ma:record', prefixes=PREFIXES)
    #logging.debug(recs)
    m = memory.connection()
    m.create_space()
    process(recs, m, idbase='http://example.org/')
    logging.debug('MARC BASICS PART 1')
Example #30
def test_pipeline2():
    idg = idgen(EXAMPLE_ORG)
    existing_ids = set()

    TRANSFORMS = [
        ("id", discard()),
        ("title", rename(rel="name")),
        # For testing; doesn't make much sense, really, otherwise
        (
            "author",
            materialize("Person", rel="author", unique=run("target"), links={"name": run("target")}, inverse=True),
        ),
        ("link", rename(rel="link")),
        ("cover", rename(rel="cover")),
    ]

    out_m = memory.connection(baseiri="http://example.org/")

    rid = SIMPLE_BOOK["id"]
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)
    for k, v in SIMPLE_BOOK.items():
        link = (rid, k, v, {})
        for rel, func in TRANSFORMS:
            if k == rel:
                in_m = memory.connection(baseiri="http://example.org/")
                ctx = context(link, in_m, out_m, base=SCHEMA_ORG, idgen=idg)
                func(ctx)

    assert out_m.size() == 7, repr(out_m)
    assert next(out_m.match("http://example.org/book/catcher-in-the-rye", VTYPE_REL))[TARGET] == BOOK_TYPE
    assert (
        next(out_m.match("http://example.org/book/catcher-in-the-rye", I(iri.absolutize("name", SCHEMA_ORG))))[TARGET]
        == "The Catcher in the Rye"
    )
    author = next(
        out_m.match(None, I(iri.absolutize("author", SCHEMA_ORG))), "http://example.org/book/catcher-in-the-rye"
    )[ORIGIN]
    assert next(out_m.match(author, I(iri.absolutize("name", SCHEMA_ORG))), None)[TARGET] == "J.D. Salinger"
Example #31
def run(infile):
    m = memory.connection()
    from_markdown(infile.read(), m)
    #from versa.util import jsondump
    #jsondump(m, open('/tmp/foo.json', 'w'))
    for poem in resources_by_type(m, 'http://uche.ogbuji.net/poems/poem'):
        choice = '@choice' in list(map(operator.itemgetter(TARGET), m.match(poem, 'http://www.w3.org/2005/Atom/category')))
        if not choice: continue
        d = parse_date(simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/updated'))
        source = next(m.match(poem, 'http://www.w3.org/2005/Atom/source'))
        source = source[ATTRIBUTES]['title']
        title = simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/title')
        print('\t'.join(("'" + title + "'", 'Poem', d.strftime('%B, %Y'), source)))
        print()
Example #32
def test_removal():
    model = memory.connection()
    model.add('s1','p0','lit0',{})
    model.add('s1','p1','lit1',{})
    model.add('s1','p2','lit2',{})
    model.add('s2','p3','lit3',{})
    model.remove([3,0])

    assert list(model)[0][1][2] == 'lit1'
    assert list(model)[1][1][2] == 'lit2'
    assert model.size() == 2

    model.remove(0)
    assert list(model)[0][1][2] == 'lit2'
    assert model.size() == 1
Example #33
def test_removal():
    model = memory.connection()
    model.add('s1', 'p0', 'lit0', {})
    model.add('s1', 'p1', 'lit1', {})
    model.add('s1', 'p2', 'lit2', {})
    model.add('s2', 'p3', 'lit3', {})
    model.remove([3, 0])

    assert list(model)[0][1][2] == 'lit1'
    assert list(model)[1][1][2] == 'lit2'
    assert model.size() == 2

    model.remove(0)
    assert list(model)[0][1][2] == 'lit2'
    assert model.size() == 1
Example #34
def test_book_cases(label, transforms, asserter):
    idg = idgen(EXAMPLE_ORG)
    existing_ids = set()
    out_m = memory.connection(baseiri='http://example.org/')

    rid = SIMPLE_BOOK['id']
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)

    for k, v in SIMPLE_BOOK.items():
        ctxlink = (rid, k, v, {})
        func = transforms.get(k)
        if func:
            ctx = context(ctxlink, IN_M, out_m, base=SCHEMA_ORG, idgen=idg)
            func(ctx)

    asserter(out_m)
Example #35
def test_basics():
    "Basic query test"
    m = memory.connection()
    for l in RELS_1:
        m.add(*l)
    variables = {'DC': DC, 'H5': H5, 'H5L': H5L}
    ctx = context(tuple(RELS_1[0]), m, U + 'uo', base=None, extras=None, variables=variables)
    parsed = miniparse("?($a, H5 'title', *) and ?($b, H5L 'see-also', $a)")
    result = parsed.evaluate(ctx)
    assert result == {'a': set(['http://uche.ogbuji.net/ndewo/']), 'b': set(['http://uche.ogbuji.net/'])}

    parsed = miniparse("?($a, H5L 'see-also', *)")
    result = parsed.evaluate(ctx)
    assert result == {'a': set(['http://uche.ogbuji.net/', 'http://uche.ogbuji.net/ndewo/'])}

    parsed = miniparse("?($a, H5 'title', *)")
    result = parsed.evaluate(ctx)
    assert result == {'a': set(['http://uche.ogbuji.net/ndewo/'])}
    return
Example #36
def load_rdfa_page(site, max_retries=1):
    '''
    Helper to load RDFa page as text, plus load a Versa model with the metadata
    
    Returns a versa memory model and the raw site text, except in the error case, where it returns None and the error
    '''
    retry_count = 0
    while True:
        model = memory.connection()
        try:
            with urllib.request.urlopen(site) as resourcefp:
                sitetext = resourcefp.read()
                rdfalite.toversa(sitetext, model, site)
            break #Success, so break out of retry loop
        except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected) as e:
            retry_count += 1
            if retry_count >= max_retries:
                return None, e
    return model, sitetext
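A hedged usage sketch for load_rdfa_page; the URL is illustrative:

model, sitetext = load_rdfa_page('http://link.example.org/', max_retries=3)
if model is None:
    #In the error case the second value is the exception, not the site text
    print('Failed to load page:', sitetext)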
Example #37
def test_versa_syntax1():
    config = {
        'autotype-h1': 'http://example.org/r1',
        'autotype-h2': 'http://example.org/r2',
        'interpretations': {
            VERSA_BASEIRI + 'refines': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'properties': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'synonyms': VERSA_BASEIRI + 'resourceset'
        }
    }

    m = memory.connection(baseiri='http://example.org/')
    #from_markdown(VERSA_LITERATE1, m, encoding='utf-8')
    doc = open(os.path.join(RESOURCEPATH, 'ubibframe.md')).read()
    from_markdown(doc, m, config=config)
    logging.debug('VERSA LITERATE EXAMPLE 1')
    results = list(m.match())
    for link in results:
        logging.debug('Result: {0}'.format(repr(link)))
    assert results, "No statements parsed from ubibframe.md"
Example #38
def test_versa_syntax1():
    config = {
        'autotype-h1': 'http://example.org/r1',
        'autotype-h2': 'http://example.org/r2',
        'interpretations': {
            VERSA_BASEIRI + 'refines': VERSA_BASEIRI + 'resource',
            VERSA_BASEIRI + 'properties': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'synonyms': VERSA_BASEIRI + 'resourceset'
        }
    }

    m = memory.connection(baseiri='http://example.org/')
    #from_markdown(VERSA_LITERATE1, m, encoding='utf-8')
    doc = open(os.path.join(RESOURCEPATH, 'ubibframe.md')).read()
    from_markdown(doc, m, config=config)
    logging.debug('VERSA LITERATE EXAMPLE 1')
    results = list(m.match())
    for stmt in results:
        logging.debug('Result: {0}'.format(repr(stmt)))
    assert results, "No statements parsed from ubibframe.md"
Example #39
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #csvexport_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
            
            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                model = memory.connection()
                rdfalite.toversa(body, model, respurl)
                #Lock the file for 
                resstem = resid[:HASH_WIDTH]
                csvexport_sink.locks.setdefault(resstem, Lock())
                #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
                print('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
                await csvexport_sink.locks[resstem]
                print('Acquired lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)

                try:
                    resstem_fpath = os.path.join(self.outfolder, resstem + '.csv')
                    csvexists = os.path.exists(resstem_fpath)
                    #with gzip.open(resstem_fpath, 'at', newline='') as resstem_fp:
                    with open(resstem_fpath, 'at', newline='') as resstem_fp:
                        resstem_csv = csv.writer(resstem_fp, delimiter=',',
                                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        vcsv.write(model, resstem_csv, self.rules, not csvexists, base=respurl, logger=csvexport_sink.logger)
                finally:
                    csvexport_sink.locks[resstem].release()
                    #csvexport_sink.logger.debug('Released lock on {}; TASK [{}].'.format(resstem, task_id))
                    print('Released lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)

            #self.save_ntriples()
            return linkset
        return None
Example #40
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #ntriplify_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)

            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                #Lock the file for
                resstem = resid[:3]
                ntriplify_sink.locks.setdefault(resstem, Lock())
                await ntriplify_sink.locks[resstem]
                try:
                    #Note:
                    #timeit.timeit('rdfalite.toversa(open("index.html").read(), model, "http://link.delawarelibrary.org/portal/Nest-Esther-Ehrlich-overdrive-ebook/F-h_bGCl5lk/")', setup='from versa.driver import memory; from versa.reader import rdfalite; model = memory.connection()', number=10)
                    #4.412366830001702
                    #timeit.timeit('g = rdflib.Graph(); g.parse("index.html", format="html")', setup='import rdflib', number=10)
                    #[snip tons of warnings]
                    #16.82040351499745
                    #IOW Versa is 4X faster than RDFlib for this task, and more robust
                    with open(os.path.join(self.outfolder, resstem + '.nt'),
                              'a') as resstem_fp:
                        model = memory.connection()
                        rdfalite.toversa(body, model, respurl)
                        ntriples.write(model,
                                       out=resstem_fp,
                                       base=respurl,
                                       logger=ntriplify_sink.logger)
                finally:
                    ntriplify_sink.locks[resstem].release()

            #self.save_ntriples()
            return linkset
        return None
Example #41
    def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if LIBRARY_LINK_HEADER not in respheaders:
            #Not even an LLN page at all
            return
        if self._fphost == respurlhost:
            output_model = memory.connection()
            quickinfo_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))
            #Subpage of the target site
            rdfalite.toversa(body, output_model, respurl)
            resname = versautil.simple_lookup(output_model, respurl, SCHEMAORG + 'name')
            print(respurl, '|', resname, file=quickinfo_sink.outfp)
            #orgentity = util.simple_lookup_byvalue(model, RDFTYPE, SCHEMAORG + 'Organization')
            #name = util.simple_lookup(model, orgentity, BL + 'name')
            #name = util.simple_lookup(model, baseurl + '#_default', BL + 'name')

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
            return linkset
        return None
Example #42
def test_basics():
    "test ..."
    model = memory.connection()
    for (subj, pred, obj, attrs) in RELS_1:
        model.add(subj, pred, obj, attrs)
    results = model.match(origin='http://copia.ogbuji.net')
    logging.debug('BASICS PART 1')
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    #assert results == None, "Boo! "

    results = model.match(origin='http://uche.ogbuji.net', attrs={u'@lang': u'ig'})
    logging.debug('BASICS PART 2')
    results = tuple(results)
    import pprint; pprint.pprint(results)
    for result in results:
        logging.debug('Result: {0}'.format(repr(result)))
        #assert result == ()
    expected = (('http://uche.ogbuji.net', 'http://purl.org/dc/elements/1.1/title', 'Ulo Uche', {'@context': 'http://uche.ogbuji.net#_metadata', '@lang': 'ig'}),)
    assert results == expected, (results, expected)
Example #43
def run(infile):
    m = memory.connection()
    from_markdown(infile.read(), m)
    #from versa.util import jsondump
    #jsondump(m, open('/tmp/foo.json', 'w'))
    print('<descriptionSet>')
    for poem in resources_by_type(m, 'http://uche.ogbuji.net/poems/poem'):
        choice = '@choice' in list(map(operator.itemgetter(TARGET), m.match(poem, 'http://www.w3.org/2005/Atom/category')))
        if not choice: continue
        print('<description>')
        d = parse_date(simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/updated'))
        source = next(m.match(poem, 'http://www.w3.org/2005/Atom/source'))
        source = source[ATTRIBUTES]['title']
        title = simple_lookup(m, poem, 'http://www.w3.org/2005/Atom/title')
        print('  <title>{0}</title>\n  <date>{1}</date>\n  <publisher>{2}</publisher>'.format(title, d.strftime('%B, %Y'), source))
        hlink = list(map(operator.itemgetter(TARGET), m.match(poem, 'http://www.w3.org/2005/Atom/link')))
        if hlink:
            hlink = hlink[0]
            print('  <link href="{0}"/>'.format(hlink))
        print('</description>')
    print('</descriptionSet>')
Example #44
    def start_element(self, name, attributes):
        if self._lax:
            (head, sep, tail) = name.partition(':')
            local = tail or head
            ns = MARCXML_NS #Just assume all elements are MARC/XML
        else:
            ns, local = name.split(NSSEP) if NSSEP in name else (None, name)
        if ns == MARCXML_NS:
            #Ignore the 'collection' element
            #What to do with the record/@type
            if local == 'record':
                self.no_records = False
                #XXX: Entity base IRI needed?
                self._record_id = 'record-{0}:{1}'.format(self._parser.CurrentLineNumber, self._parser.CurrentColumnNumber)
                #Versa model with a representation of the record
                #For input model plugins, important that natural ordering be preserved
                self._record_model = memory.connection(attr_cls=OrderedDict) #logger=logger)
            elif local == 'leader':
                self._chardata_dest = ''
                self._link_iri = MARCXML_NS + '/leader'
                self._marc_attributes = OrderedDict()
                self._getcontent = True
            elif local == 'controlfield':
                self._chardata_dest = ''
                self._link_iri = MARCXML_NS + '/control/' + attributes['tag'].strip()
                #Control tags have neither indicators nor subfields
                self._marc_attributes = OrderedDict({'tag': attributes['tag'].strip()})
                self._getcontent = True
            elif local == 'datafield':
                self._link_iri = MARCXML_NS + '/data/' + attributes['tag'].strip()
                self._marc_attributes = OrderedDict(([k, v.strip()] for (k, v) in attributes.items() if ' ' not in k))
            elif local == 'subfield':
                self._chardata_dest = ''
                self._subfield = attributes['code'].strip()
                if not self._subfield or ord(self._subfield) not in VALID_SUBFIELDS:
                    self._subfield = '_'
                self._getcontent = True
        return
Example #45
def jsonize_site(site, rules=None):
    '''
    >>> from librarylink.util import jsonize_site
    >>> obj = jsonize_site('http://link.denverlibrary.org')
    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)

    >>> rules = {'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'], 'rename-predicates': {'http://library.link/vocab/branchOf': 'http://schema.org/branch'}}
    >>> obj = jsonize_site('http://link.denverlibrary.org', rules=rules)
    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)
    '''
    from versa.util import uniquify
    from versa.writer import jsonld
    rules = rules or {}
    ignore_pred = rules.get('ignore-predicates', set())
    rename_pred = rules.get('rename-predicates', {})
    ignore_oftypes = rules.get('ignore-oftypes', [])
    invert = rules.get('invert', {})
    context = rules.get('context', {})
    pre_model, _ = load_rdfa_page(site)
    if not pre_model:
        return None
    uniquify(pre_model)
    post_model = memory.connection()
    for o, r, t, a in pre_model.match():
        #print(o, r, t)
        for oldp, newp in rename_pred.items():
            if r == oldp: r = newp
        for rpre, rpost in invert.items():
            if r == rpre:
                assert isinstance(t, I)
                o, r, t = t, rpost, o
        for igp in ignore_pred:
            if r.startswith(igp):
                break
        else:
            post_model.add(o, r, t, a)
    obj = jsonld.bind(post_model, context=context, ignore_oftypes=ignore_oftypes)
    return obj
Example #46
    async def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if self._fphost == respurlhost:
            #ntriplify_sink.logger.debug('[TASK {}]: Target subpage {} -> {}'.format(task_id, referrer, respurl))

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
            
            try:
                _, resid = llnurl_ident(respurl)
            except ValueError:
                resid = None
            if resid:
                #Lock the file for 
                resstem = resid[:3]
                ntriplify_sink.locks.setdefault(resstem, Lock())
                await ntriplify_sink.locks[resstem]
                try:
                    #Note:
                    #timeit.timeit('rdfalite.toversa(open("index.html").read(), model, "http://link.delawarelibrary.org/portal/Nest-Esther-Ehrlich-overdrive-ebook/F-h_bGCl5lk/")', setup='from versa.driver import memory; from versa.reader import rdfalite; model = memory.connection()', number=10)
                    #4.412366830001702
                    #timeit.timeit('g = rdflib.Graph(); g.parse("index.html", format="html")', setup='import rdflib', number=10)
                    #[snip tons of warnings]
                    #16.82040351499745
                    #IOW Versa is 4X faster than RDFlib for this task, and more robust
                    with open(os.path.join(self.outfolder, resstem + '.nt'), 'a') as resstem_fp:
                        model = memory.connection()
                        rdfalite.toversa(body, model, respurl)
                        ntriples.write(model, out=resstem_fp, base=respurl, logger=ntriplify_sink.logger)
                finally:
                    ntriplify_sink.locks[resstem].release()

            #self.save_ntriples()
            return linkset
        return None
Example #47
    def send(self, data):
        #Body text, response URL (e.g. after redirections), aiohttp.header object from response, referrer, controlling task ID
        (body, respurl, respheaders, referrer, task_id) = data
        _, respurlhost, _, _, _ = iri.split_uri_ref(respurl)
        if LIBRARY_LINK_HEADER not in respheaders:
            #Not even an LLN page at all
            return
        if self._fphost == respurlhost:
            output_model = memory.connection()
            quickinfo_sink.logger.debug(
                '[TASK {}]: Target subpage {} -> {}'.format(
                    task_id, referrer, respurl))
            #Subpage of the target site
            rdfalite.toversa(body, output_model, respurl)
            resname = versautil.simple_lookup(output_model, respurl,
                                              SCHEMAORG + 'name')
            print(respurl, '|', resname, file=quickinfo_sink.outfp)
            #orgentity = util.simple_lookup_byvalue(model, RDFTYPE, SCHEMAORG + 'Organization')
            #name = util.simple_lookup(model, orgentity, BL + 'name')
            #name = util.simple_lookup(model, baseurl + '#_default', BL + 'name')

            root = html5.parse(body)
            linkset = self._queue_links(root, respurl)
            return linkset
        return None
Example #48
def bfconvert(inputs,
              handle_marc_source=handle_marcxml_source,
              entbase=None,
              model=None,
              out=None,
              limit=None,
              rdfttl=None,
              rdfxml=None,
              xml=None,
              config=None,
              verbose=False,
              logger=logging,
              loop=None,
              canonical=False,
              lax=False,
              defaultsourcetype=inputsourcetype.unknown):
    '''
    inputs - One or more open file-like objects, strings with MARC content, or filenames or IRIs. If a filename or
                IRI, it's a good idea to indicate this via the defaultsourcetype parameter
    handle_marc_source - Function to turn a source of MARC data (e.g. XML or JSON) into the internal format for processing
    entbase - Base IRI to be used for creating resources.
    model - model instance for internal use
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML output should be written
    config - configuration information
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    loop - optional asyncio event loop to use
    canonical - output Versa's canonical form?
    lax - If True signal to the handle_marc_source function that relaxed syntax rules should be applied
            (e.g. accept XML with namespace problems)
    defaultsourcetype - Signal indicating how best to interpret inputs to create an inputsource
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug(
                'Limit must be a number, not "{0}". Ignoring.'.format(limit))

    def resolve_class(fullname):
        '''
        Given a full name for a Python class, return the class object
        '''
        import importlib
        modpath, name = fullname.rsplit('.', 1)
        module = importlib.import_module(modpath)
        cls = getattr(module, name)
        return cls

    attr_cls = resolve_class(config.get('versa-attr-cls', 'builtins.dict'))

    model_factory = functools.partial(memory.connection,
                                      attr_cls=attr_cls)  #,logger=logger)

    if 'marc_record_handler' in config:
        handle_marc_source = AVAILABLE_MARC_HANDLERS[
            config['marc_record_handler']]

    readmode = handle_marc_source.readmode
    #inputs = ( inputsource(open(i, readmode)) for i in inputs )
    #if not isinstance(inputs[0], inputsource):
    #    inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )

    if handle_marc_source.makeinputsource:
        inputs = factory(inputs,
                         defaultsourcetype=defaultsourcetype,
                         streamopenmode=readmode)
    #inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )

    ids = marc.idgen(entbase)
    if model is None: model = model_factory()

    if any((rdfttl, rdfxml)):
        import rdflib

        BFNS = rdflib.Namespace(BFZ)
        BFCNS = rdflib.Namespace(BFZ + 'cftag/')
        BFDNS = rdflib.Namespace(BFZ + 'dftag/')

        g = rdflib.Graph()
    #Intentionally not using either factory
    if canonical: global_model = memory.connection()  #logger=logger)

    if xml is not None:
        xmlw = writer.raw(xml, indent='  ')
        xmlw.start_element('bibframe')

    extant_resources = None

    #extant_resources = set()
    def postprocess():
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if any((rdfttl, rdfxml)):
            rdf.process(model, g, to_ignore=extant_resources, logger=logger)
        if canonical:
            global_model.add_many([(o, r, t, a)
                                   for (rid, (o, r, t, a)) in model])

        if xml is not None:
            microxml.process(model,
                             xmlw,
                             to_ignore=extant_resources,
                             logger=logger)

        model.create_space()

    #Set up event loop if not provided
    if not loop:
        loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get('vocab-base-uri', BL)

    transform_iris = config.get('transforms', [])
    marcspecials_vocab = config.get('marcspecials-vocab')
    transforms = transform_set(transform_iris, marcspecials_vocab)

    lookups = config.get('lookups', {})

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get('plugins', []):
        try:
            pinfo = g_services[pc['id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception('Unknown plugin {0}'.format(pc['id']))

    limiting = [0, limit]
    #logger=logger,

    #raise(Exception(repr(inputs)))
    for source in inputs:

        @asyncio.coroutine
        #Wrap the parse operation to make it a task in the event loop
        def wrap_task():  #source=source
            sink = marc.record_handler(loop,
                                       model,
                                       entbase=entbase,
                                       vocabbase=vb,
                                       limiting=limiting,
                                       plugins=plugins,
                                       ids=ids,
                                       postprocess=postprocess,
                                       out=out,
                                       logger=logger,
                                       transforms=transforms,
                                       canonical=canonical,
                                       lookups=lookups,
                                       model_factory=model_factory)

            args = dict(lax=lax)
            handle_marc_source(source, sink, args, logger, model_factory)
            sink.close()
            yield

        task = asyncio.ensure_future(wrap_task(), loop=loop)

        loop.run_until_complete(task)

    #Close the loop only after all input sources have been processed
    loop.close()

    if canonical:
        out.write(repr(global_model))

    if any((rdfttl, rdfxml)):
        if vb == BFZ:
            g.bind('bf', BFNS)
            g.bind('bfc', BFCNS)
            g.bind('bfd', BFDNS)
        else:
            g.bind('vb', rdflib.Namespace(vb))
        if entbase:
            g.bind('ent', entbase)

    if rdfttl is not None:
        logger.debug('Converting to RDF (Turtle).')
        rdfttl.write(g.serialize(format="turtle"))

    if rdfxml is not None:
        logger.debug('Converting to RDF (XML).')
        rdfxml.write(g.serialize(format="pretty-xml"))

    if xml is not None:
        logger.debug('Converting to XML.')
        xmlw.end_element('bibframe')
    return
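Finally, a minimal, hypothetical call to this bfconvert in the style of the test harnesses above; the filename is illustrative:

from io import StringIO

s = StringIO()
with open('records.mrx', 'rb') as marcfile: #hypothetical MARC/XML input
    bfconvert([marcfile], entbase='http://example.org/', out=s, canonical=True)
print(s.getvalue())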