Example #1
    def _toiri(ctx):
        _arg = arg(ctx) if is_pipeline_action(arg) else arg
        _arg = [_arg] if not isinstance(_arg, list) else _arg
        ret = []
        for u in _arg:
            iu = u
            if not (ignore_refs and not iri.is_absolute(iu)):
                # coerce into an IRIref, but fall back to untyped text otherwise
                try:
                    iu = I(iu)
                except ValueError as e:
                    # attempt to recover by percent encoding
                    try:
                        iu = I(iri.percent_encode(iu))
                    except ValueError as e:
                        ctx.extras['logger'].warning(
                            'Unable to convert "{}" to IRI reference:\n{}'.format(iu, e))

                if base is not None and isinstance(iu, I):
                    iu = I(iri.absolutize(iu, base))

            ret.append(iu)

        return ret
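
In context, _toiri reads as a closure produced by a pipeline action factory that captures arg, base, and ignore_refs. A minimal sketch of such a factory (the wrapper name and signature here are hypothetical, not necessarily the library's actual helper):

def toiri(arg, base=None, ignore_refs=False):
    # Hypothetical factory: captures the parameters the closure above relies on
    def _toiri(ctx):
        ...  # body as in this example
    return _toiri
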
Example #2
    def _link(ctx):
        (origin, _, t, a) = ctx.current_link
        if derive_origin:
            #Have enough info to derive the origin from context. Ignore origin in current link
            origin = derive_origin(ctx)

        #If need be call the Versa action function to determine the relationship to the materialized resource
        rels = rel(ctx) if callable(rel) else rel
        if not isinstance(rels, list): rels = [rels]

        _value = value(ctx) if callable(value) else (
            t if value is None else value)
        #Just work with the first provided statement, for now
        if res and not (ignore_refs and not iri.is_absolute(_value)):
            try:
                _value = I(_value)
            except ValueError:
                ctx.extras['logger'].warning(
                    'Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'
                    .format(
                        repr(
                            (I(origin), I(iri.absolutize(rel,
                                                         ctx.base)), _value))))
                #XXX How do we really want to handle this error?
                return []
        for r in rels:
            ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)),
                                 _value, {})
        return
Example #3
def expand_iri(iri_in, base):
    if iri_in.startswith('@'):
        return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
    iri_match = URI_EXPLICIT_PAT.match(iri_in)
    if iri_match:
        return I(iri.absolutize(iri_match.group(1), base))
    iri_match = URI_ABBR_PAT.match(iri_in)
    if iri_match:
        uri = iris[iri_match.group(1)]
        fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
    else:
        fulliri = I(iri.absolutize(iri_in, base))
    return fulliri
Example #4
def abbreviate(rel, bases):
    for base in bases:
        abbr = iri.relativize(rel, base, subPathOnly=True)
        if abbr:
            if base is VERSA_BASEIRI:
                abbr = '@' + abbr
            return abbr
    return I(rel)
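
A hedged usage sketch; the expected return values are illustrative. Note that abbreviate compares with `is`, so the '@' shorthand only fires when the very same VERSA_BASEIRI object appears in bases:

from versa import I, VERSA_BASEIRI  # assumed importable as in other examples here

bases = [VERSA_BASEIRI, 'http://example.org/vocab/']
abbreviate('http://example.org/vocab/title', bases)   # expected: 'title'
abbreviate(VERSA_BASEIRI + 'type', bases)             # expected: '@type'
abbreviate('http://other.org/x', bases)               # expected: I('http://other.org/x')
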
Example #5
def handle_resourcelist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
Example #6
def handle_resourceset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
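
A hedged usage sketch, assuming the in-memory driver used in other examples here:

from versa.driver.memory import newmodel

m = newmodel()
handle_resourceset('spam eggs',
                   rid='http://example.org/r1',
                   fullprop='http://example.org/vocab/tag',
                   base='http://example.org/',
                   model=m)
# m should now hold (r1, tag, I('http://example.org/spam'))
# and (r1, tag, I('http://example.org/eggs'))
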
Example #7
def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None, addtype=True, loop=None, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data mapping
    according to the resource type. Implements the Libhub Resource Hash Convention
    As a convenience, if a vocabulary base is provided, concatenate it to etype and the data keys

    data - list of key/value pairs used to compute the hash. If empty the hash will be a default for the entity type
            WARNING: THIS FUNCTION MANGLES THE data ARG
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', default_idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    if addtype: data.insert(0, [TYPE_REL, etype])
    data_full = [ ((vocabbase + k if not iri.is_absolute(k) else k), v) for (k, v) in data ]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            for p in plugin[BF_MATRES_TASK](loop, output_model, params): pass
        #logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
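
A hedged usage sketch; note the documented caveat that the data argument is mutated in place (the type pair is inserted at index 0). BL and default_idgen are assumed to be the imports shown in Example #18 and Example #15:

data = [['http://bibfra.me/vocab/lite/name', 'Gustavus Hindman Miller']]
eid = materialize_entity('Person', data=data)
# data now starts with the inserted [TYPE_REL, etype] pair,
# where etype resolves to 'http://bibfra.me/vocab/lite/Person'
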
Example #8
def duplicate_statements(model, oldorigin, neworigin, rfilter=None):
    '''
    Take links with a given origin, and create duplicate links with the same information but a new origin

    :param model: Versa model to be updated
    :param oldorigin: origin IRI of the links to be duplicated
    :param neworigin: new origin IRI for the duplicated links
    :param rfilter: optional callable; a link is duplicated only if it returns a true value
    :return: None
    '''
    for o, r, t, a in model.match(oldorigin):
        if rfilter is None or rfilter(o, r, t, a):
            model.add(I(neworigin), r, t, a)
    return
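
A hedged usage sketch: clone every link from one resource onto another, filtering out labels (VLABEL_REL as imported in other examples here; the IRIs are hypothetical):

duplicate_statements(m, 'http://example.org/r1', 'http://example.org/r2',
                     rfilter=lambda o, r, t, a: r != VLABEL_REL)
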
Example #9
def process(source, target, rdfsonly, base=None, logger=logging):
    '''
    Prepare a statement into a triple ready for rdflib graph

    '''
    for link in source.match():
        s, p, o = link[:3]
        #Skip docheader statements
        if s == (base or '') + '@docheader': continue
        if p in RESOURCE_MAPPING: p = RESOURCE_MAPPING[p]
        if o in RESOURCE_MAPPING: o = RESOURCE_MAPPING[o]
        if p == VERSA_BASEIRI + 'refines':
            tlinks = list(source.match(s, TYPE_REL))
            if tlinks:
                if tlinks[0][TARGET] == VERSA_BASEIRI + 'Resource':
                    p = I(RDFS_NAMESPACE + 'subClassOf')
                elif tlinks[0][TARGET] == VERSA_BASEIRI + 'Property':
                    p = I(RDFS_NAMESPACE + 'subPropertyOf')
        if p == VERSA_BASEIRI + 'properties':
            suri = I(iri.absolutize(s, base)) if base else s
            target.add(
                (URIRef(o), URIRef(RDFS_NAMESPACE + 'domain'), URIRef(suri)))
            continue
        if p == VERSA_BASEIRI + 'value':
            if o not in ['Literal', 'IRI']:
                ouri = I(iri.absolutize(o, base)) if base else o
                target.add((URIRef(s), URIRef(RDFS_NAMESPACE + 'range'),
                            URIRef(ouri)))
                continue
        s = URIRef(s)
        #Translate v:type to rdf:type
        p = RDF.type if p == TYPE_REL else URIRef(p)
        o = URIRef(o) if isinstance(o, I) else Literal(o)
        if not rdfsonly or p.startswith(RDF_NAMESPACE) or p.startswith(
                RDFS_NAMESPACE):
            target.add((s, p, o))
    return
Example #10
def materialize_entity(ctx, etype, fprint=None):
    '''
    Low-level routine for creating a resource. Takes the entity (resource) type
    and a data mapping according to the resource type. As a convenience, if a
    vocabulary base is provided in the context, concatenate it to etype and
    data keys

    ctx - context information governing creation of the new entity
    etype - type IRI for the new entity
    fprint - list of key/value tuples of data to use in generating
                unique ID, or None in which case one is randomly generated
    '''
    fprint_processed = []
    for ix, (k, v) in enumerate(fprint or []):
        fprint_processed.append((k, v(ctx) if is_pipeline_action(v) else v))
    return I(resource_id(etype, fprint=fprint_processed, idgen=ctx.idgen,
                vocabbase=ctx.base))
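
A hedged usage sketch, assuming a pipeline context ctx with idgen and base set, such as the one constructed inside create_resource in Example #12:

rid = materialize_entity(ctx, 'http://schema.org/Person',
                         fprint=[('http://schema.org/name', 'Ada Lovelace')])
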
Example #11
def jsonload(model, fp):
    '''
    Load Versa model dumped into JSON form, either raw or canonical
    '''
    dumped_list = json.load(fp)
    for link in dumped_list:
        if len(link) == 2:
            sid, (s, p, o, a) = link
        elif len(link) == 4: #canonical
            (s, p, o, a) = link
            tt = a.get('@target-type')
            if tt == '@iri-ref':
                o = I(o)
            a.pop('@target-type', None)
        else:
            continue
        model.add(s, p, o, a)
    return
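
A hedged illustration of the two accepted link shapes, raw (2 items) and canonical (4 items); the IRIs are hypothetical:

import io, json

raw = ['someid', ['http://ex.org/s', 'http://ex.org/p', 'a value', {}]]
canonical = ['http://ex.org/s', 'http://ex.org/p', 'http://ex.org/o',
             {'@target-type': '@iri-ref'}]  # target coerced to I()
m = newmodel()  # in-memory driver, as in other examples
jsonload(m, io.StringIO(json.dumps([raw, canonical])))
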
Example #12
def create_resource(output_model, rtypes, fprint, links, existing_ids=None, id_helper=None, preserve_fprint=False):
    '''
    General-purpose routine to create a new resource in the output model, based on provided resource types and fingerprinting info

    output_model    - Versa connection to model to be updated
    rtypes          - Type IRI or list of IRIs for the new resource, used to give the object a Versa type relationship
    fprint          - list of key/value pairs for determining a unique hash for the new resource
    links           - list of key/value pairs for setting properties on the new resource
    id_helper       - If a string, a base URL for the generated ID. If callable, a function used to return the entity. If None, set a default good enough for testing.
    existing_ids    - set of existing IDs to not recreate, or None, in which case a new resource will always be created
    '''
    rtypes = rtypes if isinstance(rtypes, list) else [rtypes]
    rtype, *moretypes = rtypes
    for t in moretypes:
        links.append([VTYPE_REL, t])

    if isinstance(id_helper, str):
        idg = idgen(id_helper)
    elif isinstance(id_helper, GeneratorType):
        idg = id_helper
    elif id_helper is None:
        idg = default_idgen(None)
    else:
        #FIXME: G11N
        raise ValueError('id_helper must be string (URL), callable or None')
    ctx = context(None, None, output_model, base=None, idgen=idg, existing_ids=existing_ids, extras=None)
    rid = I(materialize_entity(ctx, rtype, fprint=fprint))
    if existing_ids is not None:
        if rid in existing_ids:
            return (False, rid)
        existing_ids.add(rid)
    output_model.add(rid, VTYPE_REL, rtype)

    if preserve_fprint:
        attrs = { k:v for (k,v) in fprint }
        attrs[VTYPE_REL] = rtypes
        output_model.add(rid, VFPRINT_REL, rtype, attrs)

    for r, t in links:
        output_model.add(rid, r, t)
    return (True, rid)
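
A hedged usage sketch (SCH_NS as defined in other examples here; the base URL is hypothetical). created is False only when existing_ids is provided and already contains the computed ID:

created, rid = create_resource(
    output_model=m,
    rtypes=SCH_NS('Person'),
    fprint=[(SCH_NS('name'), 'Alice')],
    links=[(SCH_NS('name'), 'Alice')],
    id_helper='http://example.org/entity/')
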
Example #13
def resource_id(etype, fprint=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Lowest-level routine for generating an ID value using the Versa convention

    The Versa convention originated as the hash algorithm outlined by
    the Libhub initiative for BIBFRAME Lite, and is now codified in the document
    [Computing Versa Resource Hashes](https://github.com/uogbuji/versa/wiki/Computing-Versa-Resource-Hashes).

    etype - type IRI for the new entity (if the entity has multiple types, this is the primary, and additional types
    can be provided in the fingerprint set)
    fprint - fingerprint set. List of key/value tuples of data to use in generating its unique ID, or None, in which
    case one is just randomly generated
    vocabbase - for convenience, if provided, used to resolve relative etype & fingerprint keys

    >>> from versa.pipeline import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    params = {}
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase(etype)

    fprint_processed = []
    for k, v in fprint or []:
        if vocabbase and not iri.is_absolute(k):
            k = vocabbase(k)
        fprint_processed.append((k, v))

    if fprint_processed:
        fprint_processed.append((VTYPE_REL, etype))
        fprint_processed.sort()
        plaintext = json.dumps(fprint_processed, separators=(',', ':'), cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return I(eid)
Example #14
def instance_postprocess(params, skip_relationships=None):
    skip_relationships = list(skip_relationships or [])
    instanceids = params['instanceids']
    model = params['output_model']
    vocabbase = params['vocabbase']
    skip_relationships.extend([
        ISBN_REL, ISBN_VTYPE_REL,
        I(iri.absolutize('instantiates', vocabbase))
    ])

    def dupe_filter(o, r, t, a):
        #Filter out ISBN relationships
        return (r, t) != (VTYPE_REL, I(iri.absolutize('Instance', vocabbase))) \
            and r not in skip_relationships

    if len(instanceids) > 1:
        base_instance_id = instanceids[0]
        for instanceid in instanceids[1:]:
            duplicate_statements(model,
                                 base_instance_id,
                                 instanceid,
                                 rfilter=dupe_filter)
    return
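
A hedged sketch of the params mapping this function expects; the IDs and vocabulary base are hypothetical:

params = {
    'instanceids': [instance1_id, instance2_id],  # hypothetical instance resource IDs
    'output_model': m,
    'vocabbase': 'http://bibfra.me/vocab/lite/',  # assumed vocabulary base
}
instance_postprocess(params)  # copies the first instance's links onto each later instance
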
Example #15
import warnings
# from pathlib import Path

# import plac # Cmdline processing tool

from amara3 import iri

from versa import ORIGIN, RELATIONSHIP, TARGET
from versa import I, VERSA_BASEIRI, VTYPE_REL, VLABEL_REL
from versa import util
from versa.driver.memory import newmodel
from versa.serial import literate
from versa.pipeline import *
from versa.contrib.datachefids import idgen as default_idgen

BOOK_NS = I('https://example.org/')
DC_NS = I('http://purl.org/dc/terms/')
SCH_NS = I('https://schema.org/')

# Input data (e.g. as if parsed from DC XML)
# see e.g. the MODS https://library.britishcouncil.co.zw/cgi-bin/koha/opac-export.pl?op=export&bib=59705&format=mods

# Abstractly, Versa pipelines operate by mapping a set of input entities
# to an output entity, but in practice the input entities are often bundled
# into some sort of record format. We'll use such terminology interchangeably.

INPUT_RECORDS = []
INPUT_RECORDS.append('''\
# @docheader

* @iri:
Example #16
import time
from itertools import islice
import logging

import rdflib
from rdflib import URIRef, Literal, RDF

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.driver import memory
from versa.reader.md import from_markdown

TYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VNS = rdflib.Namespace(VERSA_BASEIRI)

RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NAMESPACE = 'http://www.w3.org/2000/01/rdf-schema#'

RESOURCE_MAPPING = {
    I(VERSA_BASEIRI + 'Resource'): I(RDFS_NAMESPACE + 'Class'),
    I(VERSA_BASEIRI + 'Property'): I(RDF_NAMESPACE + 'Property'),
    I(VERSA_BASEIRI + 'description'): I(RDFS_NAMESPACE + 'comment'),
    I(VERSA_BASEIRI + 'label'): I(RDFS_NAMESPACE + 'label'),
}


def prep(link):
    '''
Example #17
* alternateName: Hi-Tek
* name: Tony Cottrell
* birthDate: 1976-05-05
'''


@pytest.fixture
def expected_modout1():
    modout = newmodel()
    #literate.parse('''

    #''', modout)
    return modout


SCH_NS = I('https://schema.org/')
DOC_NS = I('http://example.org/records/')


def test_mosdef_only(testresourcepath, expected_modout1):
    modin = newmodel()
    literate.parse(INPUT_GRAPH_1, modin)


    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'):
        (if_(contains(follow(SCH_NS('byArtist')), DOC_NS('md')),
             materialize(COPY()))),
        SCH_NS('Person'): (materialize(COPY())),
Example #18
# bibframe

from versa import I

# use BFZ namespace to scope MARC tags that don't match transformation recipes
BFZ = I('http://bibfra.me/vocab/marcext/')
BFLC = I('http://bibframe.org/vocab/')

#A way to register services to specialize bibframe.py processing
#Maps URL to callable
g_services = {}

BF_INIT_TASK = 'http://bibfra.me/tool/pybibframe#task.init'
BF_INPUT_TASK = 'http://bibfra.me/tool/pybibframe#task.input-model'
BF_INPUT_XREF_TASK = 'http://bibfra.me/tool/pybibframe#task.input-xref-model'
BF_MARCREC_TASK = 'http://bibfra.me/tool/pybibframe#task.marcrec'
BF_MATRES_TASK = 'http://bibfra.me/tool/pybibframe#task.materialize-resource'
BF_FINAL_TASK = 'http://bibfra.me/tool/pybibframe#task.final'

BL = I('http://bibfra.me/vocab/lite/')
BA = I('http://bibfra.me/vocab/annotation/')
REL = I('http://bibfra.me/vocab/relation/')
MARC = I('http://bibfra.me/vocab/marc/')
RBMS = I('http://bibfra.me/vocab/rbms/')
AV = I('http://bibfra.me/vocab/audiovisual/')
ARCHIVE = I('http://bibfra.me/vocab/archive/')
MARCEXT = I('http://bibfra.me/vocab/marcext/')

POSTPROCESS_AS_INSTANCE = 'http://bibfra.me/tool/pybibframe#marc.postprocess.instance'

#def register_service(coro, iri=None):
Example #19
import warnings
from pathlib import Path

import click # Cmdline processing tool. pip install click

from amara3 import iri

from versa import ORIGIN, RELATIONSHIP, TARGET
from versa import I, VERSA_BASEIRI, VTYPE_REL, VLABEL_REL
from versa import util
from versa.driver.memory import newmodel
from versa.serial import csv, literate, mermaid
from versa.pipeline import *
from versa.contrib.datachefids import idgen as default_idgen

BOOK_NS = I('https://example.org/')
IMPLICIT_NS = I('http://example.org/vocab/')
SCH_NS = I('https://schema.org/')


FINGERPRINT_RULES = {
    # Fingerprint DC book by ISBN & output resource will be a SCH Book
    IMPLICIT_NS('Book'): materialize(SCH_NS('Book'),
                        fprint=[
                            (SCH_NS('isbn'), follow(IMPLICIT_NS('identifier'))),
                        ]
    )
}
Example #20
def test_basics_4(testresourcepath):
    '''
    Convert from schema.org to [MusicBrainz scheme](https://musicbrainz.org/doc/MusicBrainz_Database/Schema)
    '''
    import sys # Needed only for the debug= line below
    MB_NS = I('https://musicbrainz.org/doc/MusicBrainz_Database/Schema/')
    R_TYP = MB_NS('Release')
    RG_TYP = MB_NS('ReleaseGroup')
    A_TYP = MB_NS('Artist')
    DOC_NS = I('http://example.org/records/')

    modin = newmodel()
    modin_fpath = 'schemaorg/blackstar.md'
    literate.parse(open(os.path.join(testresourcepath, modin_fpath)).read(), modin)
    # Hand-add a comment property to the Mos Def resource to test that this value doesn't bleed e.g. to Kweli's output
    modin.add(DOC_NS('md'), SCH_NS('comment'), 'test')

    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'): ( 
            materialize(MB_NS('ReleaseGroup'),
                fprint=[
                    (MB_NS('title'), follow(SCH_NS('name'))),
                    (MB_NS('artist'), follow(SCH_NS('byArtist'), SCH_NS('name'))),
                ],
                links=[
                    (MB_NS('contains'), materialize(MB_NS('Release'),
                        fprint=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ],
                        links=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ]
                    ))
                ],
                vars={'catnum': follow(SCH_NS('catalogNumber'))},
                # debug=sys.stderr, # Uncomment to debug
            )
        ),

        SCH_NS('Person'): ( 
            materialize(MB_NS('Artist'),
                fprint=[
                    (MB_NS('name'), var('aname')),
                ],
                links=[
                    (MB_NS('name'), var('aname')),
                    (MB_NS('remark'), var('comment')),
                ],
                vars={'aname': follow(SCH_NS('name')), 'comment': follow(SCH_NS('comment'))},
            )
        )
    }

    TRANSFORM_RULES = {
        (SCH_NS('name'), R_TYP, RG_TYP): link(rel=MB_NS('title')),

        (SCH_NS('byArtist'), R_TYP): link(rel=MB_NS('by'), target=lookup('@resource')),
    }

    # Intentionally shadows the global LABELIZE_RULES
    LABELIZE_RULES = {
        MB_NS('ReleaseGroup'): follow(MB_NS('title')),
        MB_NS('Release'): follow(MB_NS('title')),
        MB_NS('Artist'): follow(MB_NS('name'))
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('='*10, 'test_basics_4', '='*10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 16
    assert len(list(util.all_origins(modout, only_types={MB_NS('ReleaseGroup')}))) == 1
    assert len(list(util.all_origins(modout, only_types={MB_NS('Artist')}))) == 2
    # assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
    # DOC_NS('md') -> I('i5GvPVm7ClA') in the transform
    assert [ l[0] for l in modout.match(None, MB_NS('remark'), 'test')] == [I('i5GvPVm7ClA')]
Example #21
py.test -s test/py/test_pipeline.py

'''

import os

# Requires pytest-mock
import pytest

from versa import I, VERSA_BASEIRI, VTYPE_REL, VLABEL_REL, ORIGIN, RELATIONSHIP, TARGET
from versa import util
from versa.driver.memory import newmodel
from versa.serial import csv, literate, mermaid
from versa.pipeline import *

SCH_NS = I('https://schema.org/')
BF_NS = I('http://bibfra.me/')


@pytest.fixture
def expected_modout1():
    modout = newmodel()
    #literate.parse('''

    #''', modout)
    return modout


WT = BF_NS('Work')
IT = BF_NS('Instance')
Example #22
import re
import os
import logging
import itertools

#from rdflib import Graph, BNode, Namespace
from rdflib import URIRef, Literal, RDF, RDFS

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET

from bibframe import BFZ, BFLC

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VLABEL_REL = I(iri.absolutize('label', VERSA_BASEIRI))

WORKCLASS = iri.absolutize('Work', BFZ)
INSTANCECLASS = iri.absolutize('Instance', BFZ)
INSTANCEREL = iri.absolutize('hasInstance', BFZ)

PROP_MAP = {
    VTYPE_REL: RDF.type,
    VLABEL_REL: RDFS.label,
}


def prep(stmt):
    '''
    Prepare a statement into a triple ready for rdflib
Example #23
import itertools
import asyncio

from itertools import tee, zip_longest

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_MATRES_TASK, BF_FINAL_TASK

RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NAMESPACE = 'http://www.w3.org/2000/01/rdf-schema#'

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
RDFS_LABEL = RDFS_NAMESPACE + 'label'


def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip_longest(a, b)
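
A quick illustration: the final element pairs with None because of zip_longest:

list(pairwise([1, 2, 3]))  # -> [(1, 2), (2, 3), (3, None)]
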


#A plug-in is a series of callables, each of which handles a phase of
#processing

#The only phase predefined for all plug-ins is BF_INIT_TASK

Example #24
    def _materialize(ctx):
        '''
        Inserts at least two main links in the context's output_model, one or more for
        the relationship from the origin to the materialized resource, one for the
        type of the materialized resource, and links according to the links parameter

        :param ctx: Runtime Versa context used in processing (e.g. includes the prototype link)
        :return: None

        This function is intricate in its use and shifting of Versa context, but the
        intricacies are all designed to make the marcpatterns mini language more natural.
        '''
        # FIXME: Part of the datachef sorting out
        if not ctx.idgen: ctx.idgen = idgen
        if debug is None:
            def log_debug(msg): return
        elif not hasattr(debug, 'write'):
            raise TypeError('debug argument to materialize must be file-like object or None')
        else:
            def log_debug(msg):
                print(msg, file=debug)

        # Set up variables to be made available in any derived contexts
        vars_items = list((vars or {}).items())
        if vars_items:
            # First make sure we're not tainting the passed-in context
            ctx = ctx.copy(variables=ctx.variables.copy())
            for k, v in vars_items:
                if None in (k, v): continue
                #v = v if isinstance(v, list) else [v]
                v = v(ctx) if is_pipeline_action(v) else v
                if v:
                    v = v[0] if isinstance(v, list) else v
                    ctx.variables[k] = v

        (o, r, t, a) = ctx.current_link
        if isinstance(typ, COPY):
            object_copy = typ
            object_copy.id = o
            _typ = next(util.resourcetypes(ctx.input_model, o), None)
            object_copy.links = []
            for stmt in ctx.input_model.match(o):
                if object_copy.rels is None or stmt[RELATIONSHIP] in typ.rels:
                    # FIXME: Attributes?
                    object_copy.links.append((stmt[RELATIONSHIP], stmt[TARGET]))
        else:
            _typ = typ(ctx) if is_pipeline_action(typ) else typ
            object_copy = None
        _fprint = fprint(ctx) if is_pipeline_action(fprint) else fprint
        # FIXME: On redesign implement split using function composition instead
        targets = [ sub_t.strip() for sub_t in t.split(split) if sub_t.strip() ] if split else [t]

        # If the rel in the incoming context is null and there is no rel passed in, nothing to attach
        # Especially useful signal in a pipeline's fingerprinting stage
        attach_ = False if rel is None and r is None else attach

        if '@added-links' not in ctx.extras: ctx.extras['@added-links'] = set()

        # Make sure we end up with a list or None
        rels = rel if isinstance(rel, list) else ([rel] if rel else [r])
        log_debug(f'materialize action. Type: {_typ}. Anchoring rels: {rels} Initial context current link: {ctx.current_link}')
        log_debug(f'Variables (including from vars= arg): {ctx.variables}')
        objids = []

        # Botanical analogy: stem context is from the caller (e.g. connection point of newly materialized resource)
        # vein contexts derive from the stem
        for target in targets:
            ctx_stem = ctx.copy(current_link=(ctx.current_link[ORIGIN], ctx.current_link[RELATIONSHIP], target, ctx.current_link[ATTRIBUTES]))
            if origin:
                # Have been given enough info to derive the origin from context. Ignore origin in current link
                o = origin(ctx_stem)
            if not o: #Defensive coding
                continue

            computed_fprint = [] if _fprint else None
            rtypes = set([_typ])
            if _fprint:
                # strip None values from computed unique list, including pairs where v is None
                for k, v in _fprint:
                    if None in (k, v): continue
                    for subitem in (v if isinstance(v, list) else [v]):
                        subval = subitem(ctx_stem) if is_pipeline_action(subitem) else subitem
                        if subval:
                            subval = subval if isinstance(subval, list) else [subval]
                            if k == VTYPE_REL: rtypes.update(set(subval))
                            computed_fprint.extend([(k, s) for s in subval])
            log_debug(f'Provided fingerprinting info: {computed_fprint}')

            if object_copy:
                objid = object_copy.id
            else:
                objid = materialize_entity(ctx_stem, _typ, fprint=computed_fprint)
            objids.append(objid)
            log_debug(f'Newly materialized object: {objid}')
            # rels = [ ('_' + curr_rel if curr_rel.isdigit() else curr_rel) for curr_rel in rels if curr_rel ]
            computed_rels = []
            for curr_relobj in rels:
                # e.g. scenario if passed in rel=ifexists(...)
                curr_rels = curr_relobj(ctx_stem) if is_pipeline_action(curr_relobj) else curr_relobj
                curr_rels = curr_rels if isinstance(curr_rels, list) else [curr_rels]
                for curr_rel in curr_rels:
                    if not curr_rel: continue
                    # FIXME: Fix properly, by slugifying & making sure slugify handles all numeric case (prepend '_')
                    curr_rel = '_' + curr_rel if curr_rel.isdigit() else curr_rel
                    if attach_:
                        _smart_add(ctx_stem.output_model, I(o), I(iri.absolutize(curr_rel, ctx_stem.base)), I(objid), (), ctx.extras['@added-links'])
                    computed_rels.append(curr_rel)
            # print((objid, ctx_.existing_ids))
            # XXX: Means links are only processed on new objects! This needs some thought
            if objid not in ctx_stem.existing_ids:
                if _typ:
                    _smart_add(ctx_stem.output_model, I(objid), VTYPE_REL, I(iri.absolutize(_typ, ctx_stem.base)), (), ctx.extras['@added-links'])
                if preserve_fprint:
                    # Consolidate types
                    computed_fprint = [ (k, v) for (k, v) in computed_fprint if k != VTYPE_REL ]
                    # computed_fprint += 
                    attrs = tuple(computed_fprint + [(VTYPE_REL, r) for r in rtypes])
                    _smart_add(ctx_stem.output_model, I(objid), VFPRINT_REL, _typ, attrs, ctx.extras['@added-links'])

                # XXX: Use Nones to mark blanks, or should Versa define some sort of null resource?
                all_links = object_copy.links + links if object_copy else links
                for l in all_links:
                    if len(l) == 2:
                        lo = I(objid)
                        lr, lt = l
                    elif len(l) == 3:
                        lo, lr, lt = l
                    # This context is in effect 

                    # First of all, hold on to the inbound origin so that it can be accessed in embedded actions
                    vein_vars = ctx_stem.variables.copy()
                    vein_vars['@stem'] = ctx_stem.current_link[ORIGIN]

                    # Newly materialized resource is the origin. The overall context target for embedded actions
                    ctx_vein = ctx_stem.copy(current_link=(objid, ctx_stem.current_link[RELATIONSHIP], ctx_stem.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                    lo = lo or ctx_vein.current_link[ORIGIN]
                    lr = lr or ctx_vein.current_link[RELATIONSHIP]
                    lt = lt or ctx_vein.current_link[TARGET]

                    lo = lo(ctx_vein) if is_pipeline_action(lo) else lo
                    lo = lo if isinstance(lo, list) else [lo]
                    lr = lr(ctx_vein) if is_pipeline_action(lr) else lr

                    # Update lr
                    # XXX This needs cleaning up
                    ctx_vein = ctx_stem.copy(current_link=(ctx_vein.current_link[ORIGIN], lr, ctx_vein.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                    # If k is a list of contexts use it to dynamically execute functions
                    if isinstance(lr, list):
                        if lr and isinstance(lr[0], context):
                            for newctx in lr:
                                #The function in question will generate any needed links in the output model
                                lt(newctx)
                            continue

                    # import traceback; traceback.print_stack() #For looking up the call stack e.g. to debug nested materialize
                    # Check that the links key is not None, which is a signal not to
                    # generate the item. For example if the key is an ifexists and the
                    # test expression result is False, it will come back as None,
                    # and we don't want to run the v function
                    if lr:
                        lt = lt(ctx_vein) if is_pipeline_action(lt) else lt

                        # If k or v come from pipeline functions as None it signals to skip generating anything else for this link item
                        if lt is not None:
                            # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                            if lr.isdigit(): lr = '_' + lr
                            _lr = I(iri.absolutize(lr, ctx_vein.base))
                            log_debug(f'Generated link: {lo, _lr, lt}')
                            if isinstance(lt, list):
                                for valitems in lt:
                                    if valitems:
                                        for loi in lo:
                                            _smart_add(ctx_vein.output_model, loi, _lr, valitems, (), ctx.extras['@added-links'])
                            else:
                                for loi in lo:
                                    _smart_add(ctx_vein.output_model, loi, _lr, lt, (), ctx.extras['@added-links'])
                ctx_stem.existing_ids.add(objid)
                for func in ctx.extras.get('@new-entity-hook', []):
                    func(objid)
        log_debug('End materialize')

        return objids
Example #25
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver.memory import newmodel
    >>> from versa.serial.literate import parse
    >>> m = newmodel()
    >>> parse(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        #Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                #just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)

    #Prep ID generator, in case needed
    idg = idgen(None)

    #Preprocess the Markdown to deal with IRI-valued property values
    def iri_ref_tool(m):
        body = m.group(1)
        lchar = '&lt;' if iri.matches_uri_ref_syntax(body) else '<'
        return lchar + m.group(1) + '>'

    md = IRIREF_CAND_PAT.sub(iri_ref_tool, md)

    #Parse the Markdown
    #Alternately:
    #from xml.sax.saxutils import escape, unescape
    #h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    #Note: even using safe_mode this should not be presumed safe from tainted input
    #h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md,
                          safe_mode='escape',
                          output_format='html5',
                          extensions=[comments])

    #doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = html5.parse(h)
    #root = tb.parse(h)
    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    #top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    # Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(
        select_value(select_name(descendants(root), 'h1'), '@docheader'),
        element('empty', parent=root))  # //h1[.="@docheader"]
    sections = filter(
        lambda x: x.xml_value != '@docheader',
        select_name_pattern(descendants(root), HEADER_PAT)
    )  # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties.
        This generator parses the list and yields the key/value pairs representing the properties.
        Some properties have attributes, expressed in Markdown as a nested list; if present, these
        attributes are yielded as well, else None is yielded.
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        try:
            sect_body_items = itertools.takewhile(
                lambda x: HEADER_PAT.match(x.xml_name) is None,
                select_elements(following_siblings(sect)))
        except StopIteration:
            return
        #results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [
            li for elem in select_name(sect_body_items, 'ul')
            for li in select_name(elem, 'li')
        ]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(
                        _('Syntax error in relationship expression: {0}'.
                          format(pair)))
                if matched.group(3): prop = matched.group(3).strip()
                if matched.group(4): prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                #prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        def prep_li(li):
            '''
            Take care of Markdown parsing minutiae. Also, exclude child uls

            * a/href embedded in the li means it was specified as <link_text>.
            Restore the angle brackets as expected by the li parser
            * Similar for cases where e.g. prop: <abc> gets turned into prop: <abc></abc>
            '''
            prepped = ''
            for ch in itertools.takewhile(
                    lambda x: not (isinstance(x, element) and x.xml_name ==
                                   'ul'), li.xml_children):
                if isinstance(ch, text):
                    prepped += ch
                elif isinstance(ch, element):
                    if ch.xml_name == 'a':
                        prepped += '<' + ch.xml_value + '>'
                    else:
                        prepped += '<' + ch.xml_name + '>'
            return prepped

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                #main = ''.join([ node.xml_value
                #        for node in itertools.takewhile(
                #            lambda x: x.xml_name != 'ul', select_elements(li)
                #            )
                #    ])
                main = prep_li(li)
                prop, val, typeindic = parse_li(main)
                subfield_list = [
                    parse_li(prep_li(sli)) for e in select_name(li, 'ul')
                    for sli in (select_name(e, 'li'))
                ]
                subfield_list = [(p, v, t) for (p, v, t) in subfield_list
                                 if p is not None]
                #Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None: val = ''
                yield prop, val, typeindic, subfield_list
            #Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(prep_li(li))
                if prop: yield prop, val, typeindic, None

    iris = {}

    # Gather the document-level metadata from the @docheader section
    base = schemabase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        #The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = schemabase = rtbase = uri
                # @property is legacy
                elif k == '@schema' or k == '@property':
                    schemabase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        #The @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            #Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, schemabase))] = v
            setup_interpretations(interp)
        #Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        #If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, schemabase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val,
                                                rid=rid,
                                                fullprop=fullprop,
                                                base=base,
                                                model=model)
                if val is not None: model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    #Default IRI prefixes if @iri/@base is set
    if not schemabase: schemabase = base
    if not rtbase: rtbase = base
    if not document_iri: document_iri = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #if U(sect) == '@docheader': continue #Not needed because excluded by ss
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(
                _('Syntax error in resource header: {0}'.format(
                    sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rtype:
            rtype = I(iri.absolutize(rtype, schemabase))

        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = next(idg)

        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_name)
        if rtype:
            model.add(rid, TYPE_REL, rtype)

        def expand_iri(iri_in, base):
            if iri_in.startswith('@'):
                return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
            iri_match = URI_EXPLICIT_PAT.match(iri_in)
            if iri_match:
                return I(iri.absolutize(iri_match.group(1), base))
            iri_match = URI_ABBR_PAT.match(iri_in)
            if iri_match:
                uri = iris[iri_match.group(1)]
                fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
            else:
                fulliri = I(iri.absolutize(iri_in, base))
            return fulliri

        #Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            for (aprop, aval, atype) in subfield_list or ():
                fullaprop = expand_iri(aprop, schemabase)
                if atype == RES_VAL:
                    val = expand_iri(aval, rtbase)
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[fullaprop] = URI_ABBR_PAT.sub(
                            uri + '\\2\\3', aval)
                    else:
                        attrs[fullaprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[fullaprop] = aval
                elif atype == UNKNOWN_VAL:
                    val_iri_match = URI_EXPLICIT_PAT.match(aval)
                    if val_iri_match:
                        aval = expand_iri(aval, rtbase)
                    elif fullaprop in interpretations:
                        aval = interpretations[fullaprop](aval,
                                                          rid=rid,
                                                          fullprop=fullaprop,
                                                          base=base,
                                                          model=model)
                    if aval is not None:
                        attrs[fullaprop] = aval

            fullprop = expand_iri(prop, schemabase)
            if typeindic == RES_VAL:
                val = expand_iri(val, rtbase)
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                if '@lang' not in attrs: attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                val_iri_match = URI_EXPLICIT_PAT.match(val)
                if val_iri_match:
                    val = expand_iri(val, rtbase)
                elif fullprop in interpretations:
                    val = interpretations[fullprop](val,
                                                    rid=rid,
                                                    fullprop=fullprop,
                                                    base=base,
                                                    model=model)
                if val is not None:
                    model.add(rid, fullprop, val, attrs)

            #resinfo = AB_RESOURCE_PAT.match(val)
            #if resinfo:
            #    val = resinfo.group(1)
            #    valtype = resinfo.group(3)
            #    if not val: val = model.generate_resource()
            #    if valtype: attrs[TYPE_REL] = valtype

    return document_iri
Example #26
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    VERSA_BASEIRI + 'text':
    lambda x, **kwargs: x,
    VERSA_BASEIRI + 'resource':
    lambda x, base=VERSA_BASEIRI, **kwargs: I(iri.absolutize(x, base)),
    VERSA_BASEIRI + 'resourceset':
    handle_resourceset,
}


def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None
Example #27
'''

import os
import json
import itertools
import asyncio

from versa import I, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_FINAL_TASK

ISBN_REL = I(iri.absolutize('isbn', BFZ))
TITLE_REL = I(iri.absolutize('title', BFZ))

BFHOST = 'bibfra.me'

#A plug-in is a series of callables, each of which handles a phase of
#processing

#The only phase predefined for all plug-ins is BF_INIT_TASK


#One convenient way to organize the Plug-in is as a class
#In this case we want to create a separate instance for each full processing event loop
class linkreport(object):
    PLUGIN_ID = 'http://bibfra.me/tool/pybibframe#linkreport'
Example #28
Test NTriples serializer
'''

import logging
import functools

# Requires pytest-mock
import pytest
from amara3 import iri

from versa import I
from versa.driver.memory import newmodel
from versa.serial.ntriples import *
# from versa.util import jsondump, jsonload

NT_SPEC = I('http://www.w3.org/2001/sw/RDFCore/ntriples/')
DC_CREATOR = I('http://purl.org/dc/elements/1.1/creator')
DC_PUBLISHER = I('http://purl.org/dc/elements/1.1/publisher')
W3C = I('http://www.w3.org/')


@pytest.fixture
def ntrips_1():
    return '''\
<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/elements/1.1/creator> "Dave Beckett" .
<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/elements/1.1/creator> "Art Barstow" .
<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/elements/1.1/publisher> <http://www.w3.org/> .
'''


def test_parse1(ntrips_1):
Example #29
    def handle_record_links(self, loop, model, params):
        '''
        Task coroutine of the main event loop for MARC conversion, called with each record.
        In this case, update a report of links encountered in the MARC/XML

        model -- raw Versa model with converted resource information from the MARC details from each MARC/XML record processed
        params -- parameters passed in from processing:
            params['workid']: ID of the work constructed from the MARC record
            params['instanceid']: list of IDs of instances constructed from the MARC record
        '''
        #print ('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
        #Get the configured default vocabulary base IRI
        vocabbase = params['vocabbase']
        for obj, _r, typ, _a in model.match(None, VTYPE_REL, None):
            # build labels based on model order, iterating over every property of
            # every resource, and building the label if that property is consulted
            rule = self._config['lookup'].get(typ)
            if rule is None: continue

            rules = rule if isinstance(rule, list) else [rule]

            label = ''
            for rule in rules:

                def chunk_eval(s):
                    # used when configuration is stored in JSON and one of these labelizer instructions is an eval-able string
                    # a known Python injection attack vector, so mentioned in README
                    if isinstance(s, str) and len(s) > 5:
                        s = eval(s, {'I': I}, locals())
                    return s

                marc_order = rule.get('marcOrder', False)
                separator = chunk_eval(rule.get('separator', ' '))
                wrapper = chunk_eval(rule.get('wrapper', None))
                multivalsep = chunk_eval(rule.get('multivalSeparator', ' | '))
                props = rule.get('properties', [])

                if marc_order:
                    link_stream = pairwise(
                        (l for l in model.match(obj, None, None)
                         if l[1] in props))
                else:
                    link_stream = pairwise(
                        (l for p in props for l in model.match(obj, p, None)))

                #print("LABELIZING {} of type {}".format(obj, typ))
                for (link1, link2) in link_stream:

                    _o1, rel1, target1, _a1 = link1
                    _o2, rel2, target2, _a2 = link2 if link2 is not None else (
                        None, None, None, None)

                    ctx = {
                        'currentProperty': rel1,
                        'currentValue': target1,
                        'nextProperty': rel2,
                        'nextValue': target2,
                    }

                    _wrapper = wrapper(ctx) if callable(wrapper) else wrapper
                    if _wrapper:
                        target1 = _wrapper[0] + target1 + _wrapper[1]

                    label += target1
                    if rel2 == rel1:
                        _multivalsep = multivalsep(ctx) if callable(
                            multivalsep) else multivalsep
                        label += _multivalsep
                    elif rel2 is not None:
                        _separator = separator(ctx) if callable(
                            separator) else separator
                        label += _separator
                    #print("current label", label)

                if label:
                    model.add(obj, I(RDFS_LABEL), label)
                    break  # we've found a rule that produces a label, so skip other rules

                label = ''

            if not label and 'default-label' in self._config:
                # if we've gone through all rules and not produced a label, yield specified default
                model.add(obj, I(RDFS_LABEL), self._config['default-label'])

        return
Example #30
def parse(nt,
          model,
          encoding='utf-8',
          disjoint=None,
          only_rel=None,
          exclude_rel=None):
    '''
    nt - string or file-like object with NTriples to parse
    model - Versa model into which to parse the data
    encoding - character encoding for NTriples (default UTF-8)
    disjoint - if not None, a list or set of link tuples against which parsed links
                should be compared, and omitted if matching.
    only_rel - if not None, a collection of link relations limiting the parsed
                NTriples statements to only be added to the model if the
                predicate matches one in only_rel
    exclude_rel - if not None, a collection of link relations limiting
                the parsed NTriples statements to be skipped if the predicate
                matches one in exclude_rel
    '''
    exclude_rel = exclude_rel or set()
    only_rel = only_rel or set()
    disjoint = disjoint or set()
    added_links = set()
    new_origins = set()

    # Make sure typing is not accidentally omitted
    if only_rel:
        only_rel.add(VTYPE_REL)

    def _add(o, r, t, a=None):
        '''
        Conditionally add a statement to model, if not a duplicate
        '''
        a = a or {}
        parts = (o, r, t, tuple(a.items()))
        if (parts in added_links) or (parts in disjoint):
            return False
        model.add(o, r, t, a)
        added_links.add((o, r, t, tuple(a.items())))
        return True

    nt_gen = nt
    if isinstance(nt, str):
        nt_gen = nt.splitlines()
    for line in nt_gen:
        m = NT_LINE_PAT.match(line.strip())
        if m:
            #print(list(enumerate(m.groups())))
            _, s, s_iri, s_blank, p_iri, o, _, o_iri, o_str, o_blank = tuple(
                m.groups())
            #print((s, s_iri, s_blank, p_iri, o, o_iri, o_str, o_blank))
            if p_iri == RDF_TYPE_REL:
                p_iri = VTYPE_REL

            if o_blank or s_blank:
                raise NotImplementedError('Blank nodes not yet implemented')

            p_iri = I(p_iri)
            if only_rel:
                if p_iri not in only_rel:
                    print('skipped', line)
                    continue
            else:
                if p_iri in exclude_rel:
                    continue

            if _add(I(s_iri), p_iri, I(o_iri) if o_iri else o_str):
                new_origins.add(I(s_iri))

    return
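
A hedged usage sketch, reusing a line of the fixture text from Example #28; note that VTYPE_REL is force-added to only_rel, so typing statements still pass the filter:

NT_DOC = '''\
<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/elements/1.1/creator> "Dave Beckett" .
'''
m = newmodel()  # in-memory driver, as in other examples
parse(NT_DOC, m, only_rel={I('http://purl.org/dc/elements/1.1/creator')})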