Example #1
def process(source, target, rdfsonly, base=None, logger=logging):
    '''
    Convert Versa statements into triples and add them to an rdflib graph
    '''
    for link in source.match():
        s, p, o = link[:3]
        #Skip docheader statements
        if s == (base or '') + '@docheader': continue
        if p in RESOURCE_MAPPING: p = RESOURCE_MAPPING[p]
        if o in RESOURCE_MAPPING: o = RESOURCE_MAPPING[o]
        if p == VERSA_BASEIRI + 'refines':
            tlinks = list(source.match(s, TYPE_REL))
            if tlinks:
                if tlinks[0][TARGET] == VERSA_BASEIRI + 'Resource':
                    p = I(RDFS_NAMESPACE + 'subClassOf')
                elif tlinks[0][TARGET] == VERSA_BASEIRI + 'Property':
                    p = I(RDFS_NAMESPACE + 'subPropertyOf')
        if p == VERSA_BASEIRI + 'properties':
            suri = I(iri.absolutize(s, base)) if base else s
            target.add((URIRef(o), URIRef(RDFS_NAMESPACE + 'domain'), URIRef(suri)))
            continue
        if p == VERSA_BASEIRI + 'value':
            if o not in ['Literal', 'IRI']:
                ouri = I(iri.absolutize(o, base)) if base else o
                target.add((URIRef(s), URIRef(RDFS_NAMESPACE + 'range'), URIRef(ouri)))
                continue
        s = URIRef(s)
        #Translate v:type to rdf:type
        p = RDF.type if p == TYPE_REL else URIRef(p)
        o = URIRef(o) if isinstance(o, I) else Literal(o)
        if not rdfsonly or p.startswith(RDF_NAMESPACE) or p.startswith(RDFS_NAMESPACE):
            target.add((s, p, o))
    return
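
All of these examples center on iri.absolutize from the amara3 library, which resolves a reference against a base IRI using standard RFC 3986 rules. A minimal sketch of the behavior the snippets rely on (output values follow directly from RFC 3986 reference resolution):

from amara3 import iri

# Resolve a relative reference against a base IRI
print(iri.absolutize('type', 'http://bibfra.me/purl/versa/'))
# -> http://bibfra.me/purl/versa/type

# Many of the examples first test whether a reference is already absolute
print(iri.is_absolute('http://bibfra.me/purl/versa/type'))  # -> True
print(iri.is_absolute('type'))                              # -> False
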
Example #2
    def _link(ctx):
        (origin, _, t, a) = ctx.current_link
        if derive_origin:
            #Have enough info to derive the origin from context. Ignore origin in current link
            origin = derive_origin(ctx)

        #If need be call the Versa action function to determine the relationship to the materialized resource
        rels = rel(ctx) if callable(rel) else rel
        if not isinstance(rels, list): rels = [rels]

        _value = value(ctx) if callable(value) else (
            t if value is None else value)
        #Just work with the first provided statement, for now
        if res and not (ignore_refs and not iri.is_absolute(_value)):
            try:
                _value = I(_value)
            except ValueError:
                ctx.extras['logger'].warn(
                    'Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'
                    .format(
                        repr(
                            (I(origin), I(iri.absolutize(rel,
                                                         ctx.base)), _value))))
                #XXX How do we really want to handle this error?
                return []
        for r in rels:
            ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)),
                                 _value, {})
        return
Example #3
    def handle_docheader(self, docheader_elem):
        # Special node to hold document header info for processing
        # FIXME: reconsider ID & type
        docheader_node = node(ONYA('docheader'), ONYA('docheader'))

        iris = {}

        # Gather document-level metadata from the @docheader section
        fields(docheader_elem, docheader_node, None)

        for prop in docheader_node.properties:
            # @iri section is where key IRI prefixes can be set
            if prop == '@iri':
                for (k, uri, typeindic) in subfield_list:
                    if k == '@base':
                        self.base = self.schemabase = self.rtbase = uri
                    # @property is legacy
                    elif k == '@schema' or k == '@property':
                        self.schemabase = uri
                    elif k == '@resource-type':
                        self.rtbase = uri
                    else:
                        iris[k] = uri
            # @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
            elif prop == '@interpretations':
                #Iterate over items from the @docheader/@interpretations section to set up for further parsing
                interp = {}
                for k, v, x in subfield_list:
                    interp[I(iri.absolutize(k, schemabase))] = v
                self.setup_interpretations(interp)
            # Setting an IRI for this very document being parsed
            elif prop == '@document':
                document_iri = val
            elif prop == '@language':
                default_lang = val
            # If we have a resource to which to attach them, just attach all other properties
            elif document_iri or base:
                rid = document_iri or base
                fullprop = I(iri.absolutize(prop, schemabase or base))
                if fullprop in self.interpretations:
                    val = self.interpretations[fullprop](val,
                                                         rid=rid,
                                                         fullprop=fullprop,
                                                         base=base,
                                                         model=model)
                    if val is not None: model.add(rid, fullprop, val)
                else:
                    model.add(rid, fullprop, val)

        # Default IRI prefixes if @iri/@base is set
        if not self.schemabase: self.schemabase = base
        if not self.rtbase: self.rtbase = base
        if not self.document_iri: self.document_iri = base

        schema = (base, schemabase, rtbase, document_iri, default_lang)
Example #4
def isbn_instancegen(params, loop, model):
    '''
    Default handling for splitting a MARC record into FRBR Work info plus Instances signalled by ISBNs

    According to Vicki, Instances can be signalled by 007, 020 or 3XX, but we stick to 020 for now
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['workid']
    ids = params['ids']
    plugins = params['plugins']

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))

    isbns = list(val for code, val in marc_lookup(input_model, '020$a'))
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))

    # sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    subscript = ord('a')
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance', ctx_params=params, loop=loop, model_to_update=params['output_model'], data=data)
            if entbase: instanceid = I(iri.absolutize(instanceid, entbase))

            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype: output_model.add(I(instanceid), ISBN_TYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        #If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance', ctx_params=params, loop=loop, model_to_update=params['output_model'], data=data)
        instanceid = I(iri.absolutize(instanceid, entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
Example #5
def expand_iri(iri_in, base):
    if iri_in.startswith('@'):
        return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
    iri_match = URI_EXPLICIT_PAT.match(iri_in)
    if iri_match:
        return I(iri.absolutize(iri_match.group(1), base))
    iri_match = URI_ABBR_PAT.match(iri_in)
    if iri_match:
        uri = iris[iri_match.group(1)]
        fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
    else:
        fulliri = I(iri.absolutize(iri_in, base))
    return fulliri
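
The regex patterns and the iris prefix mapping above are module-level definitions not shown in this excerpt. A hedged sketch of the dispatch, with assumed values for illustration (base, the iris mapping, and the exact pattern syntax are hypothetical):

# Assuming base = 'http://example.org/doc/' and iris = {'dc': 'http://purl.org/dc/terms/'}:
expand_iri('@type', base)     # '@'-prefixed: resolved against VERSA_BASEIRI
                              # -> 'http://bibfra.me/purl/versa/type'
expand_iri('<other>', base)   # explicit form (URI_EXPLICIT_PAT): resolved against base
                              # -> 'http://example.org/doc/other'
expand_iri('dc:title', base)  # abbreviated form (URI_ABBR_PAT): prefix expanded via iris
                              # -> 'http://purl.org/dc/terms/title'
expand_iri('plain', base)     # anything else: resolved against base
                              # -> 'http://example.org/doc/plain'
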
Example #6
    def _toiri(ctx):
        _arg = arg(ctx) if is_pipeline_action(arg) else arg
        _arg = [_arg] if not isinstance(_arg, list) else _arg
        ret = []
        for u in _arg:
            iu = u
            if not (ignore_refs and not iri.is_absolute(iu)):
                # coerce into an IRIref, but fallout as untyped text otherwise
                try:
                    iu = I(iu)
                except ValueError as e:
                    # attempt to recover by percent encoding
                    try:
                        iu = I(iri.percent_encode(iu))
                    except ValueError as e:
                        ctx.extras['logger'].warn(
                            'Unable to convert "{}" to IRI reference:\n{}'.
                            format(iu, e))

                if base is not None and isinstance(iu, I):
                    iu = I(iri.absolutize(iu, base))

            ret.append(iu)

        return ret
Example #7
    def handle_record_links(self, loop, model, params):
        '''
        Task coroutine of the main event loop for MARC conversion;
        in this case it updates a report of links encountered in the MARC/XML

        model -- raw Versa model with converted resource information from the MARC details from each MARC/XML record processed
        params -- parameters passed in from processing:
            params['workid']: ID of the work constructed from the MARC record
            params['instanceid']: list of IDs of instances constructed from the MARC record
        '''
        #print ('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
        #Get the configured default vocabulary base IRI
        vocabbase = params['vocabbase']
        for cls, prop in self._config['lookup'].items():
            for link in model.match(None, VTYPE_REL, I(iri.absolutize(cls, vocabbase))):
                #prop may be a list: the first item is a separator and the rest are properties whose values are joined into the label
                props = prop if isinstance(prop, list) else ['', prop]
                label = ''
                sep = props[0]
                def label_segments(props):
                    for p in props[1:]:
                        links = model.match(link[ORIGIN], I(iri.absolutize(p, vocabbase)))
                        s = [ link[TARGET] for link in links ]
                        if len(s) > 0:
                            yield ' | '.join(s)

                segments = list(label_segments(props))
                model.add(link[ORIGIN], I(RDFS_LABEL), sep.join(segments))
        return
Example #8
def test_relativize():
    for targetUri, againstUri, relativeUri, subPathUri in relativize_test_cases:
        res = iri.relativize(targetUri, againstUri)
        assert relativeUri == res, 'target=%r against=%r (subPathOnly=False)' % \
          (targetUri, againstUri)
        if res is not None:
            res = iri.absolutize(res, againstUri)
            assert res == targetUri, 'target=%r against=%r (subPathOnly=False, Absolutize)' % \
            (targetUri, againstUri)
        res = iri.relativize(targetUri, againstUri, True)
        assert subPathUri == res, 'target=%r against=%r (subPathOnly=True)' % \
          (targetUri, againstUri)
        if res is not None:
            res = iri.absolutize(res, againstUri)
            assert res == targetUri, 'target=%r against=%r (subPathOnly=True, Absolutize)' % \
            (targetUri, againstUri)
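
The fixture's shape is implied by the unpacking: 4-tuples of (targetUri, againstUri, relativeUri, subPathUri), where subPathUri is the expected result when relativize is restricted to sub-path references (the third argument True) and may be None when no such form exists. Hypothetical entries for illustration only; the real fixture is defined elsewhere and amara3's exact relativize output may differ:

relativize_test_cases = [
    # (targetUri, againstUri, relativeUri, subPathUri)
    ('http://a/b/c/g', 'http://a/b/c/d', 'g', 'g'),
    ('http://a/g', 'http://a/b/c/d', '../../g', None),
]
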
Example #9
def idgen(idbase, tint=None, bits=64):
    '''
    Generate an IRI as a hash of the given information, or make one up if none is given
    idbase -- base URI for generating links
    tint -- string mixed into the sequence of IDs generated when no payload is sent (i.e. when advanced with next())

    >>> from bibframe.contrib.datachefids import idgen
    >>> g = idgen(None)
    >>> next(g) #Or g.send(None)
    'gKNG1b7eySo'
    >>> next(g)
    'cXx7iv67-3E'
    >>> g.send('spam')
    'OZxOEos8e-k'
    >>> next(g)
    'mCFhsaWQ1_0'
    >>> g.send('spam')
    'OZxOEos8e-k'
    >>> g.send('eggs')
    'xQAd4Guk040'
    >>> g.send('')
    'AAAAAAAAAAA'
    '''
    counter = -1
    to_hash = None
    while True:
        if to_hash is None:
            to_hash = str(counter)
            if tint: to_hash += tint
        to_hash = simple_hashstring(to_hash, bits=bits)
        to_hash = yield iri.absolutize(to_hash, idbase) if idbase else to_hash
        counter += 1
Example #10
    def _link(ctx):
        (origin, _, t, a) = ctx.current_link
        if derive_origin:
            #Have enough info to derive the origin from context. Ignore origin in current link
            origin = derive_origin(ctx)

        #If need be call the Versa action function to determine the relationship to the materialized resource
        rels = rel(ctx) if callable(rel) else rel
        if not isinstance(rels, list): rels = [rels]

        values = value(ctx) if callable(value) else (t if value is None else value)
        if not isinstance(values, list): values = [values]

        def recurse_values(vs):
            for v in vs:
                if callable(v):
                    yield from recurse_values(v(ctx))
                else:
                    yield v

        for _value in recurse_values(values):
            #If asked to convert value to resource, do so as long as it is absolute and ignore_refs is false
            if res and not (ignore_refs and not iri.is_absolute(_value)):
                try:
                    _value = I(_value)
                except ValueError:
                    ctx.extras['logger'].warn('Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'.format(repr((I(origin), I(iri.absolutize(rel, ctx.base)), _value))))
                    #XXX How do we really want to handle this error?
                    #return []
                    continue

            for r in rels:
                ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)), _value, {})

        return
Example #11
def materialize_entity(ctx, etype, unique=None):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data mapping
    according to the resource type. Implements the Libhub Resource Hash Convention.
    As a convenience, if a vocabulary base is provided in the context, concatenate it to etype and the data keys

    ctx - context information governing creation of the new entity
    etype - type IRI for the new entity
    unique - scalar or ordered dict of data to use in generating its unique ID, or None in which case one is just randomly generated
    '''
    params = {}
    if ctx.base:
        etype = ctx.base + etype
    unique_full = unique
    if isinstance(unique, OrderedDict):
        unique_full = OrderedDict()
        for (k, v) in unique.items():
            unique_full[ k if iri.is_absolute(k) else iri.absolutize(k, ctx.base) ] = v

    if unique_full:
        plaintext = json.dumps([etype, unique_full], cls=OrderedJsonEncoder)
        eid = ctx.idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(ctx.idgen)
    return eid
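
A hedged usage sketch. The real pipeline context carries much more state; MiniCtx below is a hypothetical stand-in with only the attributes this routine touches, and it assumes an already-primed ID generator such as the idgen coroutines shown in other examples:

from collections import OrderedDict, namedtuple

MiniCtx = namedtuple('MiniCtx', 'base idgen')
idg = idgen(None)
next(idg)  # prime the generator so that send() works, per the idgen doctests
ctx = MiniCtx(base='http://bibfra.me/vocab/lite/', idgen=idg)

eid = materialize_entity(ctx, 'Person',
                         unique=OrderedDict([('name', 'Jonathan Bruce Postel')]))
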
Example #12
def idgen(idbase, tint=None):
    '''
    Generate an IRI as a hash of the given information, or make one up if none is given
    idbase -- base URI for generating links
    tint -- string mixed into the sequence of IDs generated when no payload is sent (i.e. when advanced with next())

    >>> from datachef.ids import idgen
    >>> g = idgen(None)
    >>> next(g) #Or g.send(None)
    'RtW-3skq'
    >>> next(g)
    'e4r-u_tx'
    >>> g.send('spam')
    'ThKLPHvp'
    >>> next(g)
    'YbGlkNf9'
    >>> g.send('spam')
    'ThKLPHvp'
    >>> g.send('eggs')
    'HeBrpNON'
    >>> g.send('')
    'AAAAAAAA'
    '''
    counter = -1
    to_hash = None
    while True:
        if to_hash is None:
            to_hash = str(counter)
            if tint: to_hash += tint
        to_hash = simple_hashstring(to_hash)
        to_hash = yield iri.absolutize(to_hash, idbase) if idbase else to_hash
        counter += 1
Example #13
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low level routine for generating an ID value using the hash algorithm
    outlined by the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash Convention).
    Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID, or None in which case one is just randomly generated
    vocabbase - provided for convenience; used to resolve relative etype & data keys
    '''
    params = {}
    #XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase: etype = vocabbase + etype

    unique_computed = []
    for k, v in unique:
        if vocabbase:
            #XXX OK absolutize used here. Go figure
            k = k if iri.is_absolute(k) else iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        # XXX Is OrderedJsonEncoder needed now that we're using a list of tuples rather than an ordered dict?
        plaintext = json.dumps([etype, unique_computed], cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return eid
Example #14
def test_absolutize():
    for uriRef, baseUri, expectedUri in absolutize_test_cases:
        res = iri.absolutize(uriRef, baseUri)
        # in a couple cases, there's more than one correct result
        if isinstance(expectedUri, tuple):
            assert res in expectedUri, 'base=%r ref=%r' % (baseUri, uriRef)
        else:
            assert expectedUri == res, 'base=%r ref=%r' % (baseUri, uriRef)
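
The fixture's entries are 3-tuples of (uriRef, baseUri, expectedUri), with expectedUri allowed to be a tuple of acceptable results. Hypothetical entries for illustration, with values following RFC 3986 section 5.4 reference resolution (the actual fixture is defined elsewhere):

absolutize_test_cases = [
    # (uriRef, baseUri, expectedUri)
    ('g', 'http://a/b/c/d;p?q', 'http://a/b/c/g'),
    ('../g', 'http://a/b/c/d;p?q', 'http://a/b/g'),
    ('#s', 'http://a/b/c/d;p?q', 'http://a/b/c/d;p?q#s'),
]
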
Example #15
    def _rename(ctx):
        (o, r, t, a) = ctx.current_link
        if res:
            try:
                t = I(t)
            except ValueError:
                return []

        out_attrs = {}
        for k, v in attributes.items():
            k = k(ctx) if callable(k) else k
            #If k is a list of contexts use it to dynamically execute functions
            if isinstance(k, list):
                if k and isinstance(k[0], context):
                    for newctx in k:
                        #Presumably the function in question will generate any needed links in the output model
                        v(newctx)
                    continue

            #import traceback; traceback.print_stack() #For looking up the call stack e.g. to debug nested materialize
            #Check that the attributes key is not None, which is a signal not to
            #generate the item. For example if the key is an ifexists and the
            #test expression result is False, it will come back as None,
            #and we don't want to run the v function
            if k:
                new_current_link = (o, k, ctx.current_link[TARGET], ctx.current_link[ATTRIBUTES])
                newctx = ctx.copy(current_link=new_current_link)
                v = v(newctx) if callable(v) else v

                #If k or v come from pipeline functions as None it signals to skip generating anything else for this link item
                if v is not None:
                    v = v(newctx) if callable(v) else v
                    #FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                    if k.isdigit(): k = '_' + k
                    if isinstance(v, list):
                        for valitems in v:
                            if valitems:
                                #out_attrs[k] = valitems
                                out_attrs[I(iri.absolutize(k, newctx.base))] = valitems
                    else:
                        #out_attrs[I(iri.absolutize(k, newctx.base))] = v
                        out_attrs[k] = v

        ctx.output_model.add(I(o), I(iri.absolutize(rel, ctx.base)), t, out_attrs)
        return
Example #16
def idgen(idbase):
    '''
    Generate an IRI
    '''
    #Simple tumbler for now, possibly switch to random number, with some sort of sequence override for unit testing
    ix = 0
    while True:
        yield iri.absolutize(str(ix), idbase) if idbase else str(ix)
        ix += 1
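
A quick usage sketch; the values follow directly from the tumbler loop above:

g = idgen('http://example.org/entity/')
next(g)  # -> 'http://example.org/entity/0'
next(g)  # -> 'http://example.org/entity/1'

g = idgen(None)
next(g)  # -> '0'
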
Example #17
def relabel(ctx, new_rel=None, res=False):
    '''
    Update the label of the relationship to be added to the link space
    '''
    #Just work with the first provided statement, for now
    (o, r, t) = ctx.current_link
    if res: t = I(t)
    ctx.output_model.add(I(o), I(iri.absolutize(new_rel, ctx.base)), t, {})
    return None
Example #18
def _rename(ctx):
    (o, r, t, a) = ctx.current_link
    if res:
        try:
            t = I(t)
        except ValueError:
            return []
    ctx.output_model.add(I(o), I(iri.absolutize(rel, ctx.base)), t, {})
    return
Example #19
def setup_interpretations(interp):
    #Map the interpretation IRIs to functions to do the data prep
    for prop, interp_key in interp.items():
        if interp_key.startswith('@'):
            interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
        if interp_key in PREP_METHODS:
            interpretations[prop] = PREP_METHODS[interp_key]
        else:
            #just use the identity, i.e. no-op
            interpretations[prop] = lambda x, **kwargs: x
Example #20
    def process_annotation(anntype, subfields, extra_annotation_props):
        #Separate annotation subfields from object subfields
        object_subfields = subfields.copy()
        annotation_subfields = {}
        for k, v in subfields.items():
            if code + k in ANNOTATIONS_FIELDS:
                annotation_subfields[k] = v
                del object_subfields[k]
            params['transforms'].append((code + k, code + k))

        #objectid = next(idg)
        #object_props.update(object_subfields)

        annotationid = next(ids)
        relsink.add(I(annotationid), TYPE_REL, I(iri.absolutize(anntype, BFZ)))
        for k, v in itertools.chain(annotation_subfields.items(), extra_annotation_props.items()):
            relsink.add(I(annotationid), I(iri.absolutize(k, BFZ)), v)

        #Return enough info to generate the main subject/object relationship. The annotation is taken care of at this point
        return annotationid, object_subfields
Example #21
def __init__(self, baseurl=None):
    if baseurl:
        model, _ = load_rdfa_page(baseurl)
        if not model:
            raise RuntimeError(baseurl, 'doesn\'t appear to be a Library.Link site')
        #<dd property="dcterms:modified">2018-04-17T04:17:32Z</dd>

        self.lastmod = next(versautil.lookup(model, None, 'http://purl.org/dc/terms/modified'), None)
        self.sitemap = iri.absolutize('/harvest/sitemap.xml', baseurl)
        self.url = baseurl
        protocol, self.host, path, query, fragment = iri.split_uri_ref(baseurl)
Example #22
    def handle_record_links(self, loop, model, params):
        '''
        Task coroutine of the main event loop for MARC conversion;
        in this case it updates a report of links encountered in the MARC/XML

        model -- raw Versa model with converted resource information from the MARC details from each MARC/XML record processed
        params -- parameters passed in from processing:
            params['workid']: ID of the work constructed from the MARC record
            params['instanceid']: list of IDs of instances constructed from the MARC record
        '''
        #print ('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
        #Get the configured vocabulary base IRI
        vocabbase = params['vocabbase']
        for cls, prop in self._config['lookup'].items():
            for link in model.match(None, TYPE_REL, I(iri.absolutize(cls, vocabbase))):
                #simple_lookup() is a little helper for getting a property from a resource
                val = simple_lookup(model, link[ORIGIN], I(iri.absolutize(prop, vocabbase)))
                if val:
                    model.add(link[ORIGIN], I(iri.absolutize('label', vocabbase)), val)
        return
Example #23
def inverse_materialize(ctx, hashidgen=None, existing_ids=None, unique=None, typ=None, new_rel=None, properties=None):
    '''
    Create a new resource related to the origin
    '''
    properties = properties or {}
    #Just work with the first provided statement, for now
    (o, r, t) = ctx.current_link
    if unique:
        objid = hashidgen.send(unique(ctx))
    else:
        objid = next(hashidgen)
    if objid != I(iri.absolutize(FROM_EMPTY_HASH, ctx.base)):
        ctx.output_model.add(I(objid), I(iri.absolutize(new_rel, ctx.base)), I(o), {})
        if objid not in existing_ids:
            if typ: ctx.output_model.add(I(objid), VTYPE_REL, I(iri.absolutize(typ, ctx.base)), {})
            for k, v in properties.items():
                if callable(v):
                    v = v(ctx)
                ctx.output_model.add(I(objid), I(iri.absolutize(k, ctx.base)), v, {})
    return objid
Example #24
def handle_resourcelist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
Example #25
def handle_resourcelist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    base=kwargs.get('base', VERSA_BASEIRI)
    model=kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
Example #26
def handle_resourceset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    fullprop=kwargs.get('fullprop')
    rid=kwargs.get('rid')
    base=kwargs.get('base', VERSA_BASEIRI)
    model=kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
Example #27
def _rename(ctx):
    workid, iid = ctx.extras[WORKID], ctx.extras[IID]
    new_o = {origin_class.work: workid, origin_class.instance: iid}[self._use_origin]
    #Just work with the first provided statement, for now
    (o, r, t, a) = ctx.current_link
    if res:
        try:
            t = I(t)
        except ValueError:
            return []
    ctx.output_model.add(I(new_o), I(iri.absolutize(rel, ctx.base)), t, {})
    return
Example #28
def handle_resourceset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
Example #29
def handle_resourceset(ltext, **kwargs):
    '''
    Helper converts lists of resources from text (i.e. Markdown),
    including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', ONYA)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
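
A hedged usage sketch of the helper, assuming a Versa in-memory model (the URLs and property names are made up for illustration):

from versa.driver import memory

model = memory.connection()
handle_resourceset('spam eggs',
                   rid='http://example.org/doc',
                   fullprop='http://example.org/voc/item',
                   base='http://example.org/voc/',
                   model=model)
# model now holds links from rid to http://example.org/voc/spam and http://example.org/voc/eggs
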
Example #30
def links_from_html(root, baseurl, look_for=HTML_LINKS):
    '''
    Yield absolutized (and defragmented) link values from link-bearing attributes in an HTML tree
    '''
    for e in select_elements(descendants(root)):
        if e.xml_name in HTML_LINKS:
            for k, v in e.xml_attributes.items():
                if k in HTML_LINKS[e.xml_name]:
                    try:
                        link = iri.absolutize(v, baseurl, limit_schemes=('http', 'https'))
                    except ValueError: #Ignore scheme
                        continue
                    link, frag = iri.split_fragment(link)
                    yield link
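
HTML_LINKS is a module-level table not shown in the excerpt; the lookups above imply it maps element names to the attribute names that carry links. A plausible sketch (hypothetical contents):

# Hypothetical mapping of link-bearing HTML elements to their link attributes
HTML_LINKS = {
    'a': ('href',),
    'link': ('href',),
    'img': ('src',),
    'script': ('src',),
}
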
Example #31
    def process_materialization(lookup, subfields, code=None):
        materializedid = hashid(idbase, tuple(subfields.items()))
        #The extra_props are parameters inherent to a particular MARC field/subfield for purposes of linked data representation
        if code is None: code = lookup
        (subst, extra_props) = MATERIALIZE[lookup]
        if RESOURCE_TYPE in extra_props:
            relsink.add(I(materializedid), TYPE_REL, I(iri.absolutize(extra_props[RESOURCE_TYPE], BFZ)))
        #logger.debug((lookup, subfields, extra_props))

        if materializedid not in T_prior_materializedids:
            #Just bundle in the subfields as they are, to avoid throwing out data. They can be otherwise used or just stripped later on
            #for k, v in itertools.chain((('marccode', code),), subfields.items(), extra_props.items()):
            for k, v in itertools.chain(subfields.items(), extra_props.items()):
                if k == RESOURCE_TYPE: continue
                fieldname = 'subfield-' + k
                if code + k in FIELD_RENAMINGS:
                    fieldname = FIELD_RENAMINGS[code + k]
                    if len(k) == 1: params['transforms'].append((code + k, fieldname)) #Only if proper MARC subfield
                    #params['transforms'].append((code + k, FIELD_RENAMINGS.get(sflookup, sflookup)))
                relsink.add(I(materializedid), iri.absolutize(fieldname, BFZ), v)
            T_prior_materializedids.add(materializedid)

        return materializedid, subst
Example #32
def process(source, target, rdfsonly, base=None, logger=logging):
    '''
    Convert Versa statements into triples and add them to an rdflib graph
    '''
    for link in source.match():
        s, p, o = link[:3]
        #Skip docheader statements
        if s == (base or '') + '@docheader': continue
        if p in RESOURCE_MAPPING: p = RESOURCE_MAPPING[p]
        if o in RESOURCE_MAPPING: o = RESOURCE_MAPPING[o]
        if p == VERSA_BASEIRI + 'refines':
            tlinks = list(source.match(s, TYPE_REL))
            if tlinks:
                if tlinks[0][TARGET] == VERSA_BASEIRI + 'Resource':
                    p = I(RDFS_NAMESPACE + 'subClassOf')
                elif tlinks[0][TARGET] == VERSA_BASEIRI + 'Property':
                    p = I(RDFS_NAMESPACE + 'subPropertyOf')
        if p == VERSA_BASEIRI + 'properties':
            suri = I(iri.absolutize(s, base)) if base else s
            target.add(
                (URIRef(o), URIRef(RDFS_NAMESPACE + 'domain'), URIRef(suri)))
            continue
        if p == VERSA_BASEIRI + 'value':
            if o not in ['Literal', 'IRI']:
                ouri = I(iri.absolutize(o, base)) if base else o
                target.add((URIRef(s), URIRef(RDFS_NAMESPACE + 'range'),
                            URIRef(ouri)))
                continue
        s = URIRef(s)
        #Translate v:type to rdf:type
        p = RDF.type if p == TYPE_REL else URIRef(p)
        o = URIRef(o) if isinstance(o, I) else Literal(o)
        if not rdfsonly or p.startswith(RDF_NAMESPACE) or p.startswith(
                RDFS_NAMESPACE):
            target.add((s, p, o))
    return
Example #33
def links_from_html(root, baseurl, look_for=HTML_LINKS):
    '''
    Yield absolutized (and defragmented) link values from link-bearing attributes in an HTML tree
    '''
    for e in select_elements(descendants(root)):
        if e.xml_name in HTML_LINKS:
            for k, v in e.xml_attributes.items():
                if k in HTML_LINKS[e.xml_name]:
                    try:
                        link = iri.absolutize(v,
                                              baseurl,
                                              limit_schemes=('http', 'https'))
                    except ValueError:  #Ignore scheme
                        continue
                    link, frag = iri.split_fragment(link)
                    yield link
Example #34
def instance_postprocess(params, skip_relationships=None):
    skip_relationships = list(skip_relationships or [])
    instanceids = params['instanceids']
    model = params['output_model']
    vocabbase = params['vocabbase']
    skip_relationships.extend([ISBN_REL, ISBN_TYPE_REL, I(iri.absolutize('instantiates', vocabbase))])
    def dupe_filter(o, r, t, a):
        #Filter out ISBN relationships
        return (r, t) != (TYPE_REL, I(iri.absolutize('Instance', vocabbase))) \
            and r not in skip_relationships
    if len(instanceids) > 1:
        base_instance_id = instanceids[0]
        for instanceid in instanceids[1:]:
            duplicate_statements(model, base_instance_id, instanceid, rfilter=dupe_filter)
    return
Example #35
def _link(ctx):
    (o, r, t, a) = ctx.current_link
    _value = value(ctx) if callable(value) else (t if value is None else value)
    workid, iid = ctx.extras[WORKID], ctx.extras[IID]
    new_o = {origin_class.work: workid, origin_class.instance: iid}[self._use_origin]
    #Just work with the first provided statement, for now
    if res:
        try:
            _value = I(_value)
        except ValueError:
            ctx.extras['logger'].warn('Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'.format(repr((I(new_o), I(iri.absolutize(rel, ctx.base)), _value))))
            #XXX How do we really want to handle this error?
            return []
    ctx.output_model.add(I(new_o), I(iri.absolutize(rel, ctx.base)), _value, {})
    return
Example #36
def isbn_instancegen(params):
    '''
    Default handling for splitting a MARC record into FRBR Work info plus Instances signalled by ISBNs
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    model = params['model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    ids = params['ids']
    rec = params['rec']
    existing_ids = params['existing_ids']
    workid = params['workid']

    isbns = marc_lookup(rec, ['020$a'])
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))

    normalized_isbns = list(isbn_list(isbns))

    subscript = ord('a')
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for subix, (inum, itype) in enumerate(normalized_isbns):
            instanceid = ids.send(['Instance', workid, inum])
            if entbase: instanceid = I(iri.absolutize(instanceid, entbase))

            model.add(I(instanceid), I(iri.absolutize('isbn', vocabbase)), inum)
            #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '')
            if itype: model.add(I(instanceid), I(iri.absolutize('isbnType', vocabbase)), itype)
            instance_ids.append(instanceid)
    else:
        instanceid = ids.send(['Instance', workid])
        if entbase: instanceid = I(iri.absolutize(instanceid, entbase))
        model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    for instanceid in instance_ids:
        model.add(I(workid), I(iri.absolutize('hasInstance', vocabbase)), instanceid)
        model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
Example #37
    def _res(ctx):
        _arg = arg(ctx) if callable(arg) else arg
        _arg = [_arg] if not isinstance(_arg, list) else _arg
        ret = []
        for u in _arg:
            iu = None
            try:
                iu = I(u)
            except ValueError:
                # attempt to recover by percent encoding
                try:
                    iu = I(iri.percent_encode(u))
                except ValueError as e:
                    ctx.logger('Unable to convert "{}" to IRI reference:\n{}'.format(u, e))
                    continue

            if iu and not iri.is_absolute(iu) and base is not None:
                iu = I(iri.absolutize(iu, base))

            ret.append(iu)

        return ret
Example #38
def instance_postprocess(params, skip_relationships=None):
    skip_relationships = list(skip_relationships or [])
    instanceids = params['instanceids']
    model = params['output_model']
    vocabbase = params['vocabbase']
    skip_relationships.extend([
        ISBN_REL, ISBN_VTYPE_REL,
        I(iri.absolutize('instantiates', vocabbase))
    ])

    def dupe_filter(o, r, t, a):
        #Filter out ISBN relationships
        return (r, t) != (VTYPE_REL, I(iri.absolutize('Instance', vocabbase))) \
            and r not in skip_relationships

    if len(instanceids) > 1:
        base_instance_id = instanceids[0]
        for instanceid in instanceids[1:]:
            duplicate_statements(model,
                                 base_instance_id,
                                 instanceid,
                                 rfilter=dupe_filter)
    return
Example #39
    def _res(ctx):
        _arg = arg(ctx) if callable(arg) else arg
        _arg = [_arg] if not isinstance(_arg, list) else _arg
        ret = []
        for u in _arg:
            iu = u
            if not (ignore_refs and not iri.is_absolute(iu)):
                # coerce into an IRIref, but fallout as untyped text otherwise
                try:
                    iu = I(iu)
                except ValueError as e:
                    # attempt to recover by percent encoding
                    try:
                        iu = I(iri.percent_encode(iu))
                    except ValueError as e:
                        ctx.extras['logger'].warn('Unable to convert "{}" to IRI reference:\n{}'.format(iu, e))

                if base is not None and isinstance(iu, I):
                    iu = I(iri.absolutize(iu, base))

            ret.append(iu)

        return ret
Example #40
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low level routine for generating an ID value using the hash algorithm
    outlined by the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash Convention).
    https://github.com/zepheira/pybibframe/wiki/From-Records-to-Resources:-the-Library.Link-resource-ID-generation-algorithm
    Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID, or None in which case one is just randomly generated
    vocabbase - provided for convenience; used to resolve relative etype & data keys

    >>> from bibframe.util import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    params = {}
    #XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase: etype = vocabbase + etype

    unique_computed = []
    for k, v in unique:
        if vocabbase:
            #XXX OK absolutize used here. Go figure
            k = k if iri.is_absolute(k) else iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        unique_computed.insert(0, [VTYPE_REL, etype])
        plaintext = json.dumps(unique_computed, separators=(',', ':'))
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return eid
Example #41
    def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
        prefixes = prefixes or DEFAULT_PREFIXES.copy()
        vocab = elem.xml_attributes.get('vocab', vocab)
        #element_satisfied = False
        if vocab:
            prefix = elem.xml_attributes.get('prefix')
            if prefix:
                #logging.debug('{}'.format(prefix))
                prefix_bits = prefix.split()
                # a, b = tee(prefix.split())
                # next(b, None)
                # for p, ns in zip(a, b):
                #     p = p.strip().strip(':')
                #     ns = ns.strip()
                #     print((p, ns))
                #     #print(p, ns)
                #     prefixes[p] = ns
                for i, j in zip(range(0, len(prefix_bits), 2), range(1, len(prefix_bits), 2)):
                    p = prefix_bits[i].strip().strip(':')
                    ns = prefix_bits[j].strip()
                    #print(p, ns)
                    prefixes[p] = ns
            new_resource = elem.xml_attributes.get('resource')
            if new_resource:
                try:
                    resource = new_resource = I(iri.absolutize(new_resource, source_uri))
                except ValueError:
                    warnings.warn('Invalid URL or anchor {} found in {}. Ignored.'.format(new_resource, source_uri))
                    new_resource = None

            typeof_list = elem.xml_attributes.get('typeof')
            if typeof_list:
                if not new_resource: new_resource = mock_bnode('')
                for typeof in typeof_list.split():
                    try:
                        typeof = I(iri.absolutize(typeof, vocab))
                    except ValueError:
                        warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(typeof, source_uri))
                        continue
                    statement_sink.send((new_resource or resource, RDF_NS + 'type', typeof))

            new_prop_list = elem.xml_attributes.get('property')
            new_value = None
            if new_prop_list:
                if new_resource:
                    new_value = new_resource
                for new_prop in new_prop_list.split():
                    if new_prop == 'about':
                        continue
                    elif ':' in new_prop:
                        p, local = new_prop.split(':', 1)
                        if p not in prefixes:
                            #FIXME: Silent error for now
                            continue
                        try:
                            prop = I(iri.absolutize(local, prefixes[p]))
                        except ValueError:
                            warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(local, source_uri))
                            continue
                    else:
                        try:
                            prop = I(iri.absolutize(new_prop, vocab))
                        except ValueError:
                            warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(new_prop, source_uri))
                            continue
                    href_res = elem.xml_attributes.get('href')
                    if href_res:
                        try:
                            href_res = I(href_res)
                        except ValueError:
                            warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_res, source_uri))
                            continue
                    href_src = elem.xml_attributes.get('src')
                    if href_src:
                        try:
                            href_src = I(href_src)
                        except ValueError:
                            warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_src, source_uri))
                            continue
                    value = new_value or elem.xml_attributes.get('content') or href_res or href_src or elem.xml_value
                    statement_sink.send((resource, prop, value))
                    #logging.debug('{}'.format((resource, prop, value)))
                    #element_satisfied = True
            if new_value: resource = new_value
        for child in elem.xml_children:
            if isinstance(child, element):
                do_parse(child, resource, vocab=vocab, prop=prop, prefixes=prefixes)
Example #42
#LINKROLES = {0: link.origin, 1: link.relationship, 2: link.target, 3: link.attributes}


def init_localization():
    '''prepare l10n'''
    locale.setlocale(locale.LC_ALL, '') # User's preferred locale, according to environment

    # Use the first two characters of the locale name (the language code), defaulting to 'en' in the absence of a preference
    loc = locale.getlocale()
    lang = loc[0][0:2] if loc[0] else 'en'
    filename = "res/messages_%s.mo" % lang

    try:
        logging.debug( "Opening message file %s for locale %s", filename, loc[0] )
        trans = gettext.GNUTranslations(open( filename, "rb" ) )
    except IOError:
        logging.debug( "Locale not found. Using default messages" )
        trans = gettext.NullTranslations()

    trans.install()


#Intentionally after the localization setup
from amara3 import iri
from versa.iriref import iriref as I
VERSA_BASEIRI = I('http://bibfra.me/purl/versa/')

#Very common Versa:specific types. Analogous to rdf:type & rdfs:label
VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VLABEL_REL = I(iri.absolutize('label', VERSA_BASEIRI))
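
Given VERSA_BASEIRI above, the two constants resolve straightforwardly:

# VTYPE_REL  == I('http://bibfra.me/purl/versa/type')
# VLABEL_REL == I('http://bibfra.me/purl/versa/label')
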
Example #43
    def _materialize(ctx):
        '''
        Inserts at least two main links in the context's output_model, one or more for
        the relationship from the origin to the materialized resource, one for the
        type of the materialized resource, and links according to the links parameter

        :param ctx: Runtime Versa context used in processing (e.g. includes the prototype link)
        :return: None

        This function is intricate in its use and shifting of Versa context, but the
        intricacies are all designed to make the marcpatterns mini language more natural.
        '''
        # FIXME: Part of the datachef sorting out
        if not ctx.idgen: ctx.idgen = idgen
        if debug is None:
            def log_debug(msg): return
        elif not hasattr(debug, 'write'):
            raise TypeError('debug argument to materialize must be file-like object or None')
        else:
            def log_debug(msg):
                print(msg, file=debug)

        # Set up variables to be made available in any derived contexts
        vars_items = list((vars or {}).items())
        if vars_items:
            # First make sure we're not tainting the passed-in context
            ctx = ctx.copy(variables=ctx.variables.copy())
            for k, v in vars_items:
                if None in (k, v): continue
                #v = v if isinstance(v, list) else [v]
                v = v(ctx) if is_pipeline_action(v) else v
                if v:
                    v = v[0] if isinstance(v, list) else v
                    ctx.variables[k] = v

        (o, r, t, a) = ctx.current_link
        if isinstance(typ, COPY):
            object_copy = typ
            object_copy.id = o
            _typ = next(util.resourcetypes(ctx.input_model, o), None)
            object_copy.links = []
            for stmt in ctx.input_model.match(o):
                if object_copy.rels is None or stmt[RELATIONSHIP] in typ.rels:
                    # FIXME: Attributes?
                    object_copy.links.append((stmt[RELATIONSHIP], stmt[TARGET]))
        else:
            _typ = typ(ctx) if is_pipeline_action(typ) else typ
            object_copy = None
        _fprint = fprint(ctx) if is_pipeline_action(fprint) else fprint
        # FIXME: On redesign implement split using function composition instead
        targets = [ sub_t.strip() for sub_t in t.split(split) if sub_t.strip() ] if split else [t]

        # If the rel in the incoming context is null and there is no rel passed in, nothing to attach
        # Especially useful signal in a pipeline's fingerprinting stage
        attach_ = False if rel is None and r is None else attach

        if '@added-links' not in ctx.extras: ctx.extras['@added-links'] = set()

        # Make sure we end up with a list or None
        rels = rel if isinstance(rel, list) else ([rel] if rel else [r])
        log_debug(f'materialize action. Type: {_typ}. Anchoring rels: {rels} Initial context current link: {ctx.current_link}')
        log_debug(f'Variables (including from vars= arg): {ctx.variables}')
        objids = []

        # Botanical analogy: stem context is from the caller (e.g. connection point of newly materialized resource)
        # vein contexts derive from the stem
        for target in targets:
            ctx_stem = ctx.copy(current_link=(ctx.current_link[ORIGIN], ctx.current_link[RELATIONSHIP], target, ctx.current_link[ATTRIBUTES]))
            if origin:
                # Have been given enough info to derive the origin from context. Ignore origin in current link
                o = origin(ctx_stem)
            if not o: #Defensive coding
                continue

            computed_fprint = [] if _fprint else None
            rtypes = set([_typ])
            if _fprint:
                # strip None values from computed unique list, including pairs where v is None
                for k, v in _fprint:
                    if None in (k, v): continue
                    for subitem in (v if isinstance(v, list) else [v]):
                        subval = subitem(ctx_stem) if is_pipeline_action(subitem) else subitem
                        if subval:
                            subval = subval if isinstance(subval, list) else [subval]
                            if k == VTYPE_REL: rtypes.update(set(subval))
                            computed_fprint.extend([(k, s) for s in subval])
            log_debug(f'Provided fingerprinting info: {computed_fprint}')

            if object_copy:
                objid = object_copy.id
            else:
                objid = materialize_entity(ctx_stem, _typ, fprint=computed_fprint)
            objids.append(objid)
            log_debug(f'Newly materialized object: {objid}')
            # rels = [ ('_' + curr_rel if curr_rel.isdigit() else curr_rel) for curr_rel in rels if curr_rel ]
            computed_rels = []
            for curr_relobj in rels:
                # e.g. scenario if passed in rel=ifexists(...)
                curr_rels = curr_relobj(ctx_stem) if is_pipeline_action(curr_relobj) else curr_relobj
                curr_rels = curr_rels if isinstance(curr_rels, list) else [curr_rels]
                for curr_rel in curr_rels:
                    if not curr_rel: continue
                    # FIXME: Fix properly, by slugifying & making sure slugify handles all numeric case (prepend '_')
                    curr_rel = '_' + curr_rel if curr_rel.isdigit() else curr_rel
                    if attach_:
                        _smart_add(ctx_stem.output_model, I(o), I(iri.absolutize(curr_rel, ctx_stem.base)), I(objid), (), ctx.extras['@added-links'])
                    computed_rels.append(curr_rel)
            # print((objid, ctx_.existing_ids))
            # XXX: Means links are only processed on new objects! This needs some thought
            if objid not in ctx_stem.existing_ids:
                if _typ:
                    _smart_add(ctx_stem.output_model, I(objid), VTYPE_REL, I(iri.absolutize(_typ, ctx_stem.base)), (), ctx.extras['@added-links'])
                if preserve_fprint:
                    # Consolidate types
                    computed_fprint = [ (k, v) for (k, v) in computed_fprint if k != VTYPE_REL ]
                    # computed_fprint += 
                    attrs = tuple(computed_fprint + [(VTYPE_REL, r) for r in rtypes])
                    _smart_add(ctx_stem.output_model, I(objid), VFPRINT_REL, _typ, attrs, ctx.extras['@added-links'])

                # XXX: Use Nones to mark blanks, or should Versa define some sort of null resource?
                all_links = object_copy.links + links if object_copy else links
                for l in all_links:
                    if len(l) == 2:
                        lo = I(objid)
                        lr, lt = l
                    elif len(l) == 3:
                        lo, lr, lt = l
                    # This context is in effect 

                    # First of all, hold on to the inbound origin so that it can be accessed in embedded actions
                    vein_vars = ctx_stem.variables.copy()
                    vein_vars['@stem'] = ctx_stem.current_link[ORIGIN]

                    # Newly materialized resource is the origin. The overall context target for embedded actions
                    ctx_vein = ctx_stem.copy(current_link=(objid, ctx_stem.current_link[RELATIONSHIP], ctx_stem.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                    lo = lo or ctx_vein.current_link[ORIGIN]
                    lr = lr or ctx_vein.current_link[RELATIONSHIP]
                    lt = lt or ctx_vein.current_link[TARGET]

                    lo = lo(ctx_vein) if is_pipeline_action(lo) else lo
                    lo = lo if isinstance(lo, list) else [lo]
                    lr = lr(ctx_vein) if is_pipeline_action(lr) else lr

                    # Update lr
                    # XXX This needs cleaning up
                    ctx_vein = ctx_stem.copy(current_link=(ctx_vein.current_link[ORIGIN], lr, ctx_vein.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                    # If k is a list of contexts use it to dynamically execute functions
                    if isinstance(lr, list):
                        if lr and isinstance(lr[0], context):
                            for newctx in lr:
                                #The function in question will generate any needed links in the output model
                                lt(newctx)
                            continue

                    # import traceback; traceback.print_stack() #For looking up the call stack e.g. to debug nested materialize
                    # Check that the links key is not None, which is a signal not to
                    # generate the item. For example if the key is an ifexists and the
                    # test expression result is False, it will come back as None,
                    # and we don't want to run the v function
                    if lr:
                        lt = lt(ctx_vein) if is_pipeline_action(lt) else lt

                        # If k or v come from pipeline functions as None it signals to skip generating anything else for this link item
                        if lt is not None:
                            # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                            if lr.isdigit(): lr = '_' + lr
                            _lr = I(iri.absolutize(lr, ctx_vein.base))
                            log_debug(f'Generated link: {lo, _lr, lt}')
                            if isinstance(lt, list):
                                for valitems in lt:
                                    if valitems:
                                        for loi in lo:
                                            _smart_add(ctx_vein.output_model, loi, _lr, valitems, (), ctx.extras['@added-links'])
                            else:
                                for loi in lo:
                                    _smart_add(ctx_vein.output_model, loi, _lr, lt, (), ctx.extras['@added-links'])
                ctx_stem.existing_ids.add(objid)
                for func in ctx.extras.get('@new-entity-hook', []):
                    func(objid)
        log_debug('End materialize')
            
        return objids
Exemple #48
0

import os
import json
import itertools
import asyncio

from versa import I, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_FINAL_TASK

ISBN_REL = I(iri.absolutize('isbn', BFZ))
TITLE_REL = I(iri.absolutize('title', BFZ))

BFHOST = 'bibfra.me'

#A plug-in is a series of callables, each of which handles a phase of processing

#The only phase predefined for all plug-ins is BF_INIT_TASK


#One convenient way to organize the Plug-in is as a class
#In this case we want to create a separate instance for each full processing event loop
class linkreport(object):
    PLUGIN_ID = 'http://bibfra.me/tool/pybibframe#linkreport'
Exemple #49
0
def handle_resourceset(ltext, **kwargs):
    '''
    Prep helper which adds a set of resources given as space-separated IRIs to the model
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    VERSA_BASEIRI + 'text':
    lambda x, **kwargs: x,
    VERSA_BASEIRI + 'resource':
    lambda x, base=VERSA_BASEIRI, **kwargs: I(iri.absolutize(x, base)),
    VERSA_BASEIRI + 'resourceset':
    handle_resourceset,
}
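
#Illustrative sketch (hedged; the property value and base IRI are made up) of how
#an entry above is applied by parse(): for a property interpreted as 'resource',
#    PREP_METHODS[VERSA_BASEIRI + 'resource']('poem1', base='http://example.org/')
#returns I('http://example.org/poem1')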


def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None
    """
Exemple #50
0
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver.memory import newmodel
    >>> from versa.serial.literate import parse
    >>> m = newmodel()
    >>> parse(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        #Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                #just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)
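
    #For example (hedged; the property IRI is illustrative), a config of
    #    {'interpretations': {'http://example.org/v/tags': '@resourceset'}}
    #maps that property to handle_resourceset via PREP_METHODS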

    #Prep ID generator, in case needed
    idg = idgen(None)

    #Preprocess the Markdown to deal with IRI-valued property values
    def iri_ref_tool(m):
        body = m.group(1)
        lchar = '&lt;' if iri.matches_uri_ref_syntax(body) else '<'
        return lchar + m.group(1) + '>'

    md = IRIREF_CAND_PAT.sub(iri_ref_tool, md)
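    #e.g. (hedged illustration) "see <http://example.org/spam>" becomes
    #"see &lt;http://example.org/spam>", so that Markdown passes the IRI through as text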

    #Parse the Markdown
    #Alternately:
    #from xml.sax.saxutils import escape, unescape
    #h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    #Note: even using safe_mode this should not be presumed safe from tainted input
    #h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md,
                          safe_mode='escape',
                          output_format='html5',
                          extensions=[comments])

    #doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = html5.parse(h)
    #root = tb.parse(h)
    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    #top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    # Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(
        select_value(select_name(descendants(root), 'h1'), '@docheader'),
        element('empty', parent=root))  # //h1[.="@docheader"]
    sections = filter(
        lambda x: x.xml_value != '@docheader',
        select_name_pattern(descendants(root), HEADER_PAT)
    )  # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties
        Some properties have attributes, expressed in Markdown as a nested list. If present, these
        attributes are yielded as well; otherwise None is yielded
        '''
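        #Hedged example of the Markdown list shape this expects (values illustrative):
        #    * title: "Ndewo, Colorado"
        #    * link: <http://example.org/spam>
        #        * label: see also
        #The nested list under `link` yields that property's attribute pairs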
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        try:
            sect_body_items = itertools.takewhile(
                lambda x: HEADER_PAT.match(x.xml_name) is None,
                select_elements(following_siblings(sect)))
        except StopIteration:
            return
        #results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [
            li for elem in select_name(sect_body_items, 'ul')
            for li in select_name(elem, 'li')
        ]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(
                        _('Syntax error in relationship expression: {0}'.
                          format(pair)))
                if matched.group(3): prop = matched.group(3).strip()
                if matched.group(4): prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                #prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        def prep_li(li):
            '''
            Take care of Markdown parsing minutiae. Also, exclude child uls

            * a/href embedded in the li means it was specified as <link_text>.
            Restore the angle brackets as expected by the li parser
            * Similar for cases where e.g. prop: <abc> gets turned into prop: <abc></abc>
            '''
            prepped = ''
            for ch in itertools.takewhile(
                    lambda x: not (isinstance(x, element) and x.xml_name ==
                                   'ul'), li.xml_children):
                if isinstance(ch, text):
                    prepped += ch
                elif isinstance(ch, element):
                    if ch.xml_name == 'a':
                        prepped += '<' + ch.xml_value + '>'
                    else:
                        prepped += '<' + ch.xml_name + '>'
            return prepped

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                #main = ''.join([ node.xml_value
                #        for node in itertools.takewhile(
                #            lambda x: x.xml_name != 'ul', select_elements(li)
                #            )
                #    ])
                main = prep_li(li)
                prop, val, typeindic = parse_li(main)
                subfield_list = [
                    parse_li(prep_li(sli)) for e in select_name(li, 'ul')
                    for sli in (select_name(e, 'li'))
                ]
                subfield_list = [(p, v, t) for (p, v, t) in subfield_list
                                 if p is not None]
                #Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None: val = ''
                yield prop, val, typeindic, subfield_list
            #Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(prep_li(li))
                if prop: yield prop, val, typeindic, None

    iris = {}

    # Gather the document-level metadata from the @docheader section
    base = schemabase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        #The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = schemabase = rtbase = uri
                # @property is legacy
                elif k == '@schema' or k == '@property':
                    schemabase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        #The @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            #Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, schemabase))] = v
            setup_interpretations(interp)
        #Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        #If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, schemabase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val,
                                                rid=rid,
                                                fullprop=fullprop,
                                                base=base,
                                                model=model)
                if val is not None: model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    #Default IRI prefixes if @iri/@base is set
    if not schemabase: schemabase = base
    if not rtbase: rtbase = base
    if not document_iri: document_iri = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #if U(sect) == '@docheader': continue #Not needed: @docheader is already excluded by the sections filter above
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
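        #Illustrative header examples of the 4 forms (hedged): "spam", "spam [Book]", "[Book]", "[]"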
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(
                _('Syntax error in resource header: {0}'.format(
                    sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rtype:
            rtype = I(iri.absolutize(rtype, schemabase))

        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = next(idg)

        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_name)
        if rtype:
            model.add(rid, TYPE_REL, rtype)

        def expand_iri(iri_in, base):
            if iri_in.startswith('@'):
                return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
            iri_match = URI_EXPLICIT_PAT.match(iri_in)
            if iri_match:
                return I(iri.absolutize(iri_match.group(1), base))
            iri_match = URI_ABBR_PAT.match(iri_in)
            if iri_match:
                uri = iris[iri_match.group(1)]
                fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
            else:
                fulliri = I(iri.absolutize(iri_in, base))
            return fulliri
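
        #Hedged examples of the dispatch above (names illustrative):
        #    expand_iri('@type', base)    -> I(VERSA_BASEIRI + 'type')
        #    an explicit IRI matching URI_EXPLICIT_PAT is absolutized against base
        #    expand_iri('pfx:rest', base) -> expansion of 'pfx' via the iris prefix table, if URI_ABBR_PAT matches
        #    expand_iri('spam', base)     -> I(iri.absolutize('spam', base)) otherwise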

        #Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            for (aprop, aval, atype) in subfield_list or ():
                fullaprop = expand_iri(aprop, schemabase)
                if atype == RES_VAL:
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[fullaprop] = URI_ABBR_PAT.sub(
                            uri + '\\2\\3', aval)
                    else:
                        attrs[fullaprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[fullaprop] = aval
                elif atype == UNKNOWN_VAL:
                    val_iri_match = URI_EXPLICIT_PAT.match(aval)
                    if val_iri_match:
                        aval = expand_iri(aval, rtbase)
                    elif fullaprop in interpretations:
                        aval = interpretations[fullaprop](aval,
                                                          rid=rid,
                                                          fullprop=fullaprop,
                                                          base=base,
                                                          model=model)
                    if aval is not None:
                        attrs[fullaprop] = aval

            fullprop = expand_iri(prop, schemabase)
            if typeindic == RES_VAL:
                val = expand_iri(val, rtbase)
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                if '@lang' not in attrs: attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                val_iri_match = URI_EXPLICIT_PAT.match(val)
                if val_iri_match:
                    val = expand_iri(val, rtbase)
                elif fullprop in interpretations:
                    val = interpretations[fullprop](val,
                                                    rid=rid,
                                                    fullprop=fullprop,
                                                    base=base,
                                                    model=model)
                if val is not None:
                    model.add(rid, fullprop, val, attrs)

            #resinfo = AB_RESOURCE_PAT.match(val)
            #if resinfo:
            #    val = resinfo.group(1)
            #    valtype = resinfo.group(3)
            #    if not val: val = model.generate_resource()
            #    if valtype: attrs[TYPE_REL] = valtype

    return document_iri
Exemple #51
0
def bind(models, context=None, ignore_oftypes=None, logger=logging):
    if not isinstance(models, list): models = [models]
    ignore_oftypes = ignore_oftypes or []
    vocab = context.get('@vocab') if context else None
    non_top_ids = set()
    obj_pool = {} #Mapping from resource id to object and list of referring ids
    used_objects = set() #Track multiple instance of docs to prevent data structure recursion
    #typed_origins = set()
    for m in models:
        #Everything with a type
        for origin in all_origins(m):
            typ = next(lookup(m, origin, RDF_TYPE), None)
            #if p == VERSA_TYPE: p = RDF_TYPE
            obj, referents = obj_pool.setdefault(origin, ({}, []))
            if vocab and typ:
                typ_rel = iri.relativize(typ, vocab)
                if typ_rel: typ = typ_rel
            if typ: obj['@type'] = typ
            if not origin.startswith('__VERSABLANKNODE__'): obj['@id'] = origin
            for o, r, t, a in m.match(origin):
                if r == RDF_TYPE: continue
                if isinstance(t, I) and o != t:
                    if vocab:
                        t_rel = iri.relativize(t, vocab)
                        if t_rel: t = t_rel
                    valobj, referents = obj_pool.setdefault(t, ({}, []))
                    if t in used_objects:
                        val = t
                    else:
                        val = valobj
                        if not t.startswith('__VERSABLANKNODE__') and '@id' not in val: val['@id'] = t
                        used_objects.add(t)

                        non_top_ids.add(t) #If something has an object as a value it does not appear at the top
                    referents.append(o)
                else:
                    val = t
                if vocab:
                    r_rel = iri.relativize(r, vocab)
                    if r_rel: r = r_rel
                if r in obj and isinstance(obj[r], list):
                    obj[r].append(val)
                elif r in obj:
                    obj[r] = [obj[r], val]
                else:
                    obj[r] = val

    #Eliminate objects of types to be ignored
    to_remove = []
    for (oid, (obj, referents)) in obj_pool.items():
        typ = obj.get('@type')
        if vocab and typ: typ = iri.absolutize(typ, vocab)
        if typ in ignore_oftypes:
            to_remove.append(oid)
            for ref in referents:
                refobj, _ = obj_pool[ref]
                for k in list(refobj.keys()):
                    v = refobj[k]
                    if isinstance(v, list) and obj in v:
                        v.remove(obj)
                        if len(v) == 1:
                            refobj[k] = v[0]
                    elif v == obj:
                        del refobj[k]
                        
    for k in to_remove:
        del obj_pool[k]

    #Handle @id only
    for (oid, (obj, referents)) in obj_pool.items():
        for k, v in obj.items():
            if isinstance(v, dict) and len(v) == 1 and '@id' in v:
                obj[k] = v['@id']
    
    top_objs = [ obj for (k, (obj, refs)) in obj_pool.items() if k not in non_top_ids ]
    #Eliminate stranded top-level objects with no more than type
    to_remove = []
    #for ix, obj in enumerate(top_objs):
    for obj in top_objs:
        if len(obj) == 1 and '@type' in obj:
            to_remove.append(obj)
    for obj in to_remove:
        top_objs.remove(obj)
    #import pprint;pprint.pprint(top_objs)
    if context and context.get('@output', True):
        return {'@context': context, '@graph': top_objs}
    else:
        return top_objs
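
#A minimal usage sketch (hedged: model setup via versa's memory driver, as in the
#parse() doctest earlier; the IRIs are illustrative):
#
#    from versa.driver.memory import newmodel
#    from versa import I
#    m = newmodel()
#    m.add(I('http://example.org/spam'), I('http://example.org/v/name'), 'Spam', {})
#    bind([m], context={'@vocab': 'http://example.org/v/'})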
Exemple #52
0
SIMPLE_BOOK = {
    'id': 'http://example.org/book/catcher-in-the-rye',
    'title': 'The Catcher in the Rye',
    'type': 'http://ogp.me/ns/books#books.book',
    'link': 'https://en.wikipedia.org/wiki/The_Catcher_in_the_Rye',
    'author': 'J.D. Salinger',
    'cover': 'http://example.org/book/catcher-in-the-rye-book-cover.jpg',
}

BOOK_TYPE = 'http://schema.org/Book'
SCH = SCHEMA_ORG = 'http://schema.org/'
EXAMPLE_ORG = 'http://example.org/'

BOOK_ID = 'http://example.org/book/catcher-in-the-rye'
SCHEMA_NAME = I(iri.absolutize('name', SCHEMA_ORG))
SCHEMA_AUTHOR = I(iri.absolutize('author', SCHEMA_ORG))
XXX_WROTE = 'http://example.org/wrote'

BOOK_CASES = []

transforms = {
    'id':
    ignore(),
    'title':
    link(rel=SCH + 'name'),
    'author':
    materialize(SCH + 'Person',
                rel=SCH + 'author',
                unique=[(SCH + 'name', target())],
                links=[(SCH + 'name', target())]),
Exemple #53
0
import re
import os
import logging
import itertools

#from rdflib import Graph, BNode, Namespace
from rdflib import URIRef, Literal, RDF, RDFS

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET

from bibframe import BFZ, BFLC

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VLABEL_REL = I(iri.absolutize('label', VERSA_BASEIRI))

WORKCLASS = iri.absolutize('Work', BFZ)
INSTANCECLASS = iri.absolutize('Instance', BFZ)
INSTANCEREL = iri.absolutize('hasInstance', BFZ)

PROP_MAP = {
    VTYPE_REL: RDF.type,
    VLABEL_REL: RDFS.label,
}


def prep(stmt):
    '''
    Prepare a statement into a triple ready for rdflib
    '''
Exemple #54
0
def process_marcpatterns(params, transforms, input_model, phase_target):
    output_model = params['output_model']
    if phase_target == BOOTSTRAP_PHASE:
        input_model_iter = params['input_model']
    else:
        # Need to sort our way through the input model so that the materializations occur
        # at the same place each time, otherwise canonicalization fails due to the
        # addition of the subfield context (at the end of materialize())

        # XXX Is the int() cast necessary? If not we could do key=operator.itemgetter(0)
        input_model_iter = sorted(list(params['input_model']),
                                  key=lambda x: int(x[0]))
    params['to_postprocess'] = []
    for lid, marc_link in input_model_iter:
        origin, taglink, val, attribs = marc_link
        origin = params.get('default-origin', origin)
        #params['logger'].debug('PHASE {} ORIGIN: {}\n'.format(phase_target, origin))
        if taglink == MARCXML_NS + '/leader':
            params['leader'] = leader = val
            continue
        #Sort out attributes
        params['indicators'] = indicators = {
            k: v
            for k, v in attribs.items() if k.startswith('ind')
        }
        params['subfields'] = curr_subfields = subfields(attribs)
        curr_subfields_keys = [tup[0] for tup in curr_subfields]
        if taglink.startswith(MARCXML_NS + '/extra/') or 'tag' not in attribs:
            continue
        params['code'] = tag = attribs['tag']
        if taglink.startswith(MARCXML_NS + '/control'):
            #No indicators on control fields. Turn them off, in effect
            indicator_list = ('#', '#')
            key = 'tag-' + tag
            if tag == '006':
                params['fields006'].append(val)
            if tag == '007':
                params['fields007'].append(val)
            if tag == '008':
                params['field008'] = val
            if phase_target != BOOTSTRAP_PHASE:
                params['transform_log'].append((tag, key))
                params['fields_used'].append((tag, ))
        elif taglink.startswith(MARCXML_NS + '/data'):
            indicator_list = ((attribs.get('ind1')
                               or ' ')[0].replace(' ', '#'),
                              (attribs.get('ind2')
                               or ' ')[0].replace(' ', '#'))
            key = 'tag-' + tag
            #logger.debug('indicators: ', repr(indicators))
            #indicator_list = (indicators['ind1'], indicators['ind2'])
            if phase_target != BOOTSTRAP_PHASE:
                params['fields_used'].append(tuple([tag] +
                                                   curr_subfields_keys))

        #This is where we check each incoming MARC link to see if it matches a transform into an output link (e.g. renaming 001 to 'controlCode')
        to_process = []
        #Start with most specific matches, then to most general

        # "?" syntax in lookups is a single char wildcard
        #First with subfields, with & without indicators:
        for k, v in curr_subfields:
            #if indicator_list == ('#', '#'):
            lookups = [
                '{0}-{1}{2}${3}'.format(tag, indicator_list[0],
                                        indicator_list[1], k),
                '{0}-?{2}${3}'.format(tag, indicator_list[0],
                                      indicator_list[1], k),
                '{0}-{1}?${3}'.format(tag, indicator_list[0],
                                      indicator_list[1], k),
                '{0}${1}'.format(tag, k),
            ]
            for lookup in lookups:
                if lookup in transforms:
                    to_process.append((transforms[lookup], v, lookup))
                else:
                    # don't report on subfields for which a code-transform exists,
                    # disregard wildcards
                    if phase_target != BOOTSTRAP_PHASE and tag not in transforms and '?' not in lookup:

                        params['dropped_codes'].setdefault(lookup, 0)
                        params['dropped_codes'][lookup] += 1

        #Now just the tag, with & without indicators
        lookups = [
            '{0}-{1}{2}'.format(tag, indicator_list[0], indicator_list[1]),
            '{0}-?{2}'.format(tag, indicator_list[0], indicator_list[1]),
            '{0}-{1}?'.format(tag, indicator_list[0], indicator_list[1]),
            tag,
        ]
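        #Hedged illustration: for tag 245 with indicators "10" and subfield $a, the
        #subfield lookups above are '245-10$a', '245-?0$a', '245-1?$a', '245$a', and
        #these tag-level lookups are '245-10', '245-?0', '245-1?', '245'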

        #Remember how many lookups were successful based on subfields
        subfields_results_len = len(to_process)
        for lookup in lookups:
            if lookup in transforms:
                to_process.append((transforms[lookup], val, lookup))

        if phase_target != BOOTSTRAP_PHASE and subfields_results_len == len(
                to_process) and not curr_subfields:
            # Count as dropped if subfields were not processed and there were no matches on non-subfield lookups
            params['dropped_codes'].setdefault(tag, 0)
            params['dropped_codes'][tag] += 1

        mat_ent = functools.partial(materialize_entity,
                                    ctx_params=params,
                                    loop=params['loop'])

        #Apply all the handlers that were found
        for funcinfo, val, lookup in to_process:
            #Support multiple actions per lookup
            funcs = funcinfo if isinstance(funcinfo, tuple) else (funcinfo, )

            for func in funcs:
                extras = {
                    'origins': params['origins'],
                    'match-spec': lookup,
                    'indicators': indicators,
                    'logger': params['logger'],
                    'lookups': params['lookups'],
                    'postprocessing': [],
                    'inputns': MARC,
                    'abort-signal': False,
                }
                #Build Versa processing context
                #Should we include indicators?
                #Should we be passing in taglink rather than tag?
                ctx = bfcontext((origin, tag, val, attribs),
                                input_model,
                                output_model,
                                extras=extras,
                                base=params['vocabbase'],
                                idgen=mat_ent,
                                existing_ids=params['existing_ids'])
                func(ctx)
                params['to_postprocess'].extend(ctx.extras['postprocessing'])
                if ctx.extras['abort-signal']:
                    return False

        if phase_target != BOOTSTRAP_PHASE and not to_process:
            #Nothing else has handled this data field; go to the fallback
            fallback_rel_base = '../marcext/tag-' + tag
            if not curr_subfields:
                #Fallback for control field: Captures MARC tag & value
                output_model.add(
                    I(origin),
                    I(iri.absolutize(fallback_rel_base, params['vocabbase'])),
                    val)
            for k, v in curr_subfields:
                #Fallback for data field: Captures MARC tag, indicators, subfields & value
                fallback_rel = '../marcext/{0}-{1}{2}-{3}'.format(
                    fallback_rel_base, indicator_list[0].replace('#', 'X'),
                    indicator_list[1].replace('#', 'X'), k)
                #params['transform_log'].append((code, fallback_rel))
                try:
                    output_model.add(
                        I(origin),
                        I(iri.absolutize(fallback_rel, params['vocabbase'])),
                        v)
                except ValueError as e:
                    control_code = list(marc_lookup(
                        input_model, '001')) or ['NO 001 CONTROL CODE']
                    dumb_title = list(marc_lookup(
                        input_model, '245$a')) or ['NO 245$a TITLE']
                    params['logger'].warning(
                        '{}\nSkipping statement for {}: "{}"'.format(
                            e, control_code[0], dumb_title[0]))

    #For now do not run special transforms if in a custom phase
    #XXX: Needs discussion
    if phase_target in (BOOTSTRAP_PHASE, DEFAULT_MAIN_PHASE):
        #params['logger'].debug('PHASE {}\n'.format(phase_target))
        extra_stmts = set()  # prevent duplicate statements
        special_transforms = params['transforms'].specials
        for origin, k, v in itertools.chain(
                special_transforms.process_leader(params),
                special_transforms.process_006(params['fields006'], params),
                special_transforms.process_007(params['fields007'], params),
                special_transforms.process_008(params['field008'], params)):
            v = v if isinstance(v, tuple) else (v, )
            for item in v:
                o = origin or I(params['default-origin'])
                if o and (o, k, item) not in extra_stmts:
                    output_model.add(o, k, item)
                    extra_stmts.add((o, k, item))
    return True
Exemple #55
0
def isbn_instancegen(params, loop, model):
    '''
    Default handling of the idea of splitting a MARC record with FRBR Work info as well as instances signalled by ISBNs

    According to Vicki, instances can be signalled by 007, 020 or 3XX, but we stick to 020 for now
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['default-origin']
    ids = params['ids']
    plugins = params['plugins']

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))

    isbns = list((val for code, val in marc_lookup(input_model, '020$a')))
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))

    # sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    subscript = ord('a')
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance',
                                            ctx_params=params,
                                            model_to_update=output_model,
                                            data=data,
                                            loop=loop)
            if entbase: instanceid = I(iri.absolutize(instanceid, entbase))

            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype: output_model.add(I(instanceid), ISBN_VTYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        #If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance',
                                        ctx_params=params,
                                        model_to_update=output_model,
                                        data=data,
                                        loop=loop)
        instanceid = I(iri.absolutize(instanceid,
                                      entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), VTYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
Exemple #56
0
import itertools
import asyncio

from itertools import tee, zip_longest

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_MATRES_TASK, BF_FINAL_TASK

RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NAMESPACE = 'http://www.w3.org/2000/01/rdf-schema#'

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
RDFS_LABEL = RDFS_NAMESPACE + 'label'


def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip_longest(a, b)
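
#e.g. list(pairwise('abc')) == [('a', 'b'), ('b', 'c'), ('c', None)]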


#A plug-in is a series of callables, each of which handles a phase of processing

#The only phase predefined for all plug-ins is BF_INIT_TASK
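
#A hedged sketch of the plug-in shape implied by record_handler below, where phase
#callables are looked up by key and driven via `yield from` (the signatures mirror
#those call sites; the bodies are illustrative):
#
#    @asyncio.coroutine
#    def on_marc_record(loop, model, params):
#        params['logger'].debug('record processed')
#        yield from asyncio.sleep(0)
#
#    @asyncio.coroutine
#    def on_final(loop):
#        yield from asyncio.sleep(0)
#
#    my_plugin = {BF_MARCREC_TASK: on_marc_record, BF_FINAL_TASK: on_final}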

Exemple #57
0
def record_handler(loop,
                   model,
                   entbase=None,
                   vocabbase=BL,
                   limiting=None,
                   plugins=None,
                   ids=None,
                   postprocess=None,
                   out=None,
                   logger=logging,
                   transforms=TRANSFORMS,
                   special_transforms=unused_flag,
                   canonical=False,
                   model_factory=memory.connection,
                   lookups=None,
                   **kwargs):
    '''
    loop - asyncio event loop
    model - the Versa model for the record
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    #Deprecated legacy API support
    if isinstance(transforms, dict) or special_transforms is not unused_flag:
        warnings.warn('Please switch to using bibframe.transforms_set',
                      PendingDeprecationWarning)
        special_transforms = special_transforms or default_special_transforms()
        transforms = transform_set(transforms)
        transforms.specials = special_transforms

    _final_tasks = set()  #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {
                'input_model': input_model,
                'logger': logger,
                #'input_model': input_model, 'output_model': model, 'logger': logger,
                'entbase': entbase,
                'vocabbase': vocabbase,
                'ids': ids,
                'existing_ids': existing_ids,
                'plugins': plugins,
                'transforms': transforms,
                'materialize_entity': materialize_entity,
                'leader': leader,
                'lookups': lookups or {},
                'loop': loop
            }

            # Earliest plugin stage, with an unadulterated input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            #Prepare cross-references (i.e. 880s)
            #See the "$6 - Linkage" section of https://www.loc.gov/marc/bibliographic/ecbdcntf.html
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
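            #A $6 value is generally "<tag>-<occurrence>" plus optional script/orientation
            #suffix (e.g. "880-01" or "880-01/(2/r"); LINKAGE_PAT (hedged: defined elsewhere
            #in this module) picks those pieces apart below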
            xrefs = {}
            remove_links = set()
            add_links = []

            xref_link_tag_workaround = {}
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(
                        MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                #XXX Do other fields with a 9 digit (not just 9XX) also need to be skipped?
                if taglink.startswith(MARCXML_NS +
                                      '/extra/') or 'tag' not in attribs:
                    continue
                this_tag = attribs['tag']
                #if this_tag == '100': import pdb; pdb.set_trace()
                for xref in attribs.get('6', []):
                    matched = LINKAGE_PAT.match(xref)
                    this_taglink, this_occ, this_scriptid, this_rtl = matched.groups() if matched else (None, None, None, None)
                    if not this_taglink and this_occ:
                        control_code = list(marc_lookup(
                            input_model, '001')) or ['NO 001 CONTROL CODE']
                        dumb_title = list(marc_lookup(
                            input_model, '245$a')) or ['NO 245$a TITLE']
                        logger.warning(
                            'Skipping invalid $6: "{}" for {}: "{}"'.format(
                                xref, control_code[0], dumb_title[0]))
                        continue

                    if this_tag == this_taglink:
                        #Pretty sure this is an erroneous self-link, but we've seen this in the wild (e.g. QNL). Issue warning & do the best we can linking via occurrence
                        #Note: the resulting workaround (lookup table from occurrence code to the correct tag) will not work in cases of linking from any tag higher in ordinal value than 880 (if such a situation is even possible)
                        logger.warning(
                            'Invalid input: erroneous self-link $6: "{}" from "{}". Trying to work around.'
                            .format(xref, this_tag))
                        if this_tag != '880':
                            xref_link_tag_workaround[this_occ] = this_tag

                    #FIXME: Remove this debugging if statement at some point
                    if this_scriptid or this_rtl:
                        logger.debug(
                            'Language info specified in subfield 6, {}'.format(
                                xref))

                    #Locate the matching taglink
                    if this_tag == '880' and this_occ == '00':
                        #Special case, no actual xref, used to separate scripts in a record (re Multiscript Records)
                        #FIXME: Not really handled right now. Presume some sort of merge dynamics will need to be implemented
                        attribs['tag'] = this_taglink
                        add_links.append(
                            (origin, MARCXML_NS + '/data/' + this_taglink, val,
                             attribs))

                    if xref_link_tag_workaround:
                        if this_tag == '880':
                            this_taglink = xref_link_tag_workaround.get(
                                this_occ)

                    links = input_model.match(
                        None, MARCXML_NS + '/data/' + this_taglink)
                    for that_link in links:
                        #6 is the cross-reference subfield
                            for that_ref in that_link[ATTRIBUTES].get('6', []):
                            matched = LINKAGE_PAT.match(that_ref)
                            that_taglink, that_occ, that_scriptid, that_rtl = matched.groups() if matched else (None, None, None, None)
                            #if not that_tag and that_occ:
                            #    control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                            #    dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                            #    logger.warning('Skipping invalid $6: "{}" for {}: "{}"'.format(to_ref, control_code[0], dumb_title[0]))
                            #    continue
                            if ([that_taglink, that_occ] == [
                                    this_tag, this_occ
                            ]) or (xref_link_tag_workaround
                                   and that_occ == this_occ):
                                if this_tag == '880':
                                    #This is an 880, which we'll handle by integrating back into the input model using the correct tag, flagged to show the relationship
                                    remove_links.add(lid)

                                if that_taglink == '880':
                                    #Rule for 880s: duplicate but link more robustly
                                    copied_attribs = attribs.copy()
                                    for k, v in that_link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(
                                                k, []).extend(v)
                                    add_links.append(
                                        (origin,
                                         MARCXML_NS + '/data/' + this_tag, val,
                                         copied_attribs))

            input_model.remove(remove_links)
            input_model.add_many(add_links)

            # hook for plugins interested in the xref-resolved input model
            for plugin in plugins:
                if BF_INPUT_XREF_TASK in plugin:
                    yield from plugin[BF_INPUT_XREF_TASK](loop, input_model,
                                                          params)

            #Do one pass to establish work hash
            #XXX Should crossrefs precede this?
            bootstrap_dummy_id = next(params['input_model'].match())[ORIGIN]
            logger.debug('Entering bootstrap phase. Dummy ID: {}'.format(
                bootstrap_dummy_id))

            params['default-origin'] = bootstrap_dummy_id
            params['instanceids'] = [bootstrap_dummy_id + '-instance']
            params['output_model'] = model_factory()

            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []

            params['origins'] = {
                WORK_TYPE: bootstrap_dummy_id,
                INSTANCE_TYPE: params['instanceids'][0]
            }

            #First apply special patterns for determining the main target resources
            curr_transforms = transforms.compiled[BOOTSTRAP_PHASE]

            ok = process_marcpatterns(params, curr_transforms, input_model,
                                      BOOTSTRAP_PHASE)
            if not ok: continue  #Abort current record if signalled

            bootstrap_output = params['output_model']
            temp_main_target = main_type = None
            for o, r, t, a in bootstrap_output.match(
                    None, PYBF_BOOTSTRAP_TARGET_REL):
                #FIXME: We need a better designed way of determining fallback to bib
                if t is not None: temp_main_target, main_type = o, t

            #Switch to the main output model for processing
            params['output_model'] = model

            if temp_main_target is None:
                #If no target was set explicitly fall back to the transforms registered for the biblio phase
                #params['logger'].debug('WORK HASH ORIGIN {}\n'.format(bootstrap_dummy_id))
                #params['logger'].debug('WORK HASH MODEL {}\n'.format(repr(bootstrap_output)))
                workid_data = gather_workid_data(bootstrap_output,
                                                 bootstrap_dummy_id)
                workid = materialize_entity('Work',
                                            ctx_params=params,
                                            data=workid_data,
                                            loop=loop)
                logger.debug(
                    'Entering default main phase, Work ID: {0}'.format(workid))

                is_folded = workid in existing_ids
                existing_ids.add(workid)

                control_code = list(marc_lookup(
                    input_model, '001')) or ['NO 001 CONTROL CODE']
                dumb_title = list(marc_lookup(input_model,
                                              '245$a')) or ['NO 245$a TITLE']
                logger.debug('Work hash data: {0}'.format(repr(workid_data)))
                logger.debug('Control code: {0}'.format(control_code[0]))
                logger.debug('Uniform title: {0}'.format(dumb_title[0]))
                logger.debug('Work ID: {0}'.format(workid))

                workid = I(iri.absolutize(workid,
                                          entbase)) if entbase else I(workid)
                folded = [workid] if is_folded else []

                model.add(workid, VTYPE_REL,
                          I(iri.absolutize('Work', vocabbase)))

                params['default-origin'] = workid
                params['folded'] = folded

                #Figure out instances
                instanceids = instancegen(params, loop, model)
                params['instanceids'] = instanceids or [None]

                main_transforms = transforms.compiled[DEFAULT_MAIN_PHASE]
                params['origins'] = {
                    WORK_TYPE: workid,
                    INSTANCE_TYPE: params['instanceids'][0]
                }
                phase_target = DEFAULT_MAIN_PHASE
            else:
                targetid_data = gather_targetid_data(
                    bootstrap_output, temp_main_target,
                    transforms.orderings[main_type])
                #params['logger'].debug('Data for resource: {}\n'.format([main_type] + targetid_data))
                targetid = materialize_entity(main_type,
                                              ctx_params=params,
                                              data=targetid_data,
                                              loop=loop)
                logger.debug(
                    'Entering specialized phase, Target resource ID: {}, type: {}'
                    .format(targetid, main_type))

                is_folded = targetid in existing_ids
                existing_ids.add(targetid)
                #Determine next transform phase
                main_transforms = transforms.compiled[main_type]
                params['origins'] = {main_type: targetid}
                params['default-origin'] = targetid
                phase_target = main_type
                model.add(I(targetid), VTYPE_REL, I(main_type))

            params['transform_log'] = []  # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []

            ok = process_marcpatterns(params, main_transforms, input_model,
                                      phase_target)
            if not ok: continue  #Abort current record if signalled

            skipped_rels = set()
            for op, rels, rid in params['to_postprocess']:
                for rel in rels:
                    skipped_rels.add(rel)
                if op == POSTPROCESS_AS_INSTANCE:
                    if params['instanceids'] == [None]:
                        params['instanceids'] = [rid]
                    else:
                        params['instanceids'].append(rid)
            instance_postprocess(params, skip_relationships=skipped_rels)

            logger.debug('+')

            #XXX At this point there must be at least one record with a Versa type

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" %
                             asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to async task then immediately deferring to next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical:
                    first_record = False
                    last_chunk = None
                    #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                    #Then again, this builds a big list in memory, so optimization is still in progress here
                    for chunk in json.JSONEncoder().iterencode(
                        [link for link in model]):
                        if last_chunk is None:
                            last_chunk = chunk[1:]
                        else:
                            out.write(last_chunk)
                            last_chunk = chunk
                    if last_chunk: out.write(last_chunk[:-1])
            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(
            limiting[0], '' if limiting[0] == 1 else 's'))
        if out and not canonical: out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            func = plugin.get(BF_FINAL_TASK)
            if not func: continue
            task = asyncio.Task(func(loop), loop=loop)
            _final_tasks.add(task)

            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                #print("_final_tasks is empty, stopping loop.")
                #loop = asyncio.get_event_loop()
                #    loop.stop()

            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
        #print('DONE')
        #raise

    return
Exemple #58
0
import re
import os
import logging
import itertools

#from rdflib import Graph, BNode, Namespace
from rdflib import URIRef, Literal, RDF, RDFS

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET

from bibframe import BFZ, BFLC

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VLABEL_REL = I(iri.absolutize('label', VERSA_BASEIRI))

WORKCLASS = iri.absolutize('Work', BFZ)
INSTANCECLASS = iri.absolutize('Instance', BFZ)
INSTANCEREL = iri.absolutize('hasInstance', BFZ)

PROP_MAP = {
    VTYPE_REL: RDF.type,
    VLABEL_REL: RDFS.label,
}

def prep(stmt):
    '''
    Prepare a statement into a triple ready for rdflib
    '''
Exemple #59
0
def handle_resourceset(ltext, **kwargs):
    '''
    Prep helper which adds a set of resources given as space-separated IRIs to the model,
    including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', ONYA)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    ONYA('text'): lambda x, **kwargs: x,
    ONYA('resource'):
    lambda x, base=ONYA, **kwargs: I(iri.absolutize(x, base)),
    ONYA('resourceset'): handle_resourceset,
}


def get_block_text(block):
    '''
    Get simplified contents of a block

    a/href embedded in the block comes from Markdown such as `<link_text>`.
    Restore the angle brackets as expected by the li parser
    Also exclude child uls (to be processed separately)
    '''
    return ''.join([
        (ch if isinstance(ch, text) else
         ('<' + ch.xml_value +