Example #1
def _normalize_isbn(ctx):
    #isbn may be a literal value or a callable taking the context
    _isbn = isbn(ctx) if callable(isbn) else isbn
    #Coerce a single value to a list for uniform handling
    _isbn = [_isbn] if not isinstance(_isbn, list) else _isbn
    #isbn_list yields (number, type) pairs; keep only the normalized numbers
    return [
        compute_ean13_check(i)
        for i, _ in isbn_list([i for i in _isbn if i])
    ]
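
Example #1 calls compute_ean13_check without defining it. A minimal sketch, assuming the helper appends the standard EAN-13 check digit to the first twelve digits of an ISBN-13/EAN-13 (the real function's signature and normalization behavior may differ):

def compute_ean13_check(ean):
    #Sketch only: keep the first twelve digits and append the EAN-13 check digit
    digits = [int(c) for c in str(ean) if c.isdigit()][:12]
    #EAN-13 checksum: weight 1 at even indexes, 3 at odd indexes
    total = sum(d * (3 if ix % 2 else 1) for ix, d in enumerate(digits))
    return ''.join(map(str, digits)) + str((10 - total % 10) % 10)

#e.g. compute_ean13_check('978026251763') == '9780262517638'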
Example #2
def isbn_instancegen(params, loop, model):
    '''
    Default handling for splitting a MARC record into a FRBR Work plus Instances signalled by ISBNs

    According to Vicki, Instances can be signalled by 007, 020 or 3XX, but we stick to 020 for now
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['workid']
    ids = params['ids']
    plugins = params['plugins']

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))

    isbns = [val for code, val in marc_lookup(input_model, '020$a')]
    logger.debug('Raw ISBNs:\t{0}'.format(isbns))

    # sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    instance_ids = []
    logger.debug('Normalized ISBNs:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance', ctx_params=params,
                                            loop=loop, model_to_update=output_model,
                                            data=data)
            if entbase: instanceid = I(iri.absolutize(instanceid, entbase))

            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype: output_model.add(I(instanceid), ISBN_TYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        #If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance', ctx_params=params,
                                        loop=loop, model_to_update=output_model,
                                        data=data)
        instanceid = I(iri.absolutize(instanceid, entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
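
Every example consumes isbn_list as a generator of (number, type) pairs. A hypothetical minimal version, assuming raw 020$a values look like '9780262517638 (pbk.)'; the real implementation also validates ISBNs and converts ISBN-10 to ISBN-13, which this sketch omits:

import re

def isbn_list(isbns, logger=None):
    #Hypothetical sketch: split each raw 020$a value into a bare number and an
    #optional parenthesized qualifier, de-duping as we go
    seen = set()
    for raw in isbns:
        match = re.match(r'([0-9Xx-]+)\s*(?:\((.*)\))?', raw.strip())
        if not match:
            continue
        num, qualifier = match.group(1).replace('-', ''), match.group(2)
        if num and num not in seen:
            seen.add(num)
            yield num, qualifier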
Example #3
def instancegen(isbns):
    '''
    Default handling for splitting a MARC record into a FRBR Work plus Instances signalled by ISBNs
    '''
    #instance_item and new_instances are assumed to come from the enclosing scope
    base_instance_id = instance_item['id']
    instance_ids = []
    subscript = ord('a')
    for subix, (inum, itype) in enumerate(isbn_list(isbns)):
        #print >> sys.stderr, subix, inum, itype
        subitem = instance_item.copy()
        subitem['isbn'] = inum
        subitem['id'] = base_instance_id + (chr(subscript + subix) if subix else '')
        if itype: subitem['isbnType'] = itype
        instance_ids.append(subitem['id'])
        new_instances.append(subitem)
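
The subscript arithmetic above leaves the first instance ID bare and suffixes later ones with 'b', 'c', and so on. In isolation, with a hypothetical base ID:

base_instance_id = 'inst/123'  #hypothetical base ID
subscript = ord('a')
for subix in range(3):
    print(base_instance_id + (chr(subscript + subix) if subix else ''))
#inst/123
#inst/123b
#inst/123c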
Example #4
def isbn_instancegen(params):
    '''
    Default handling for splitting a MARC record into a FRBR Work plus Instances signalled by ISBNs
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    model = params['model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    ids = params['ids']
    rec = params['rec']
    existing_ids = params['existing_ids']
    workid = params['workid']

    isbns = marc_lookup(rec, ['020$a'])
    logger.debug('Raw ISBNs:\t{0}'.format(isbns))

    normalized_isbns = list(isbn_list(isbns))

    subscript = ord('a')
    instance_ids = []
    logger.debug('Normalized ISBNs:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for subix, (inum, itype) in enumerate(normalized_isbns):
            instanceid = ids.send(['Instance', workid, inum])
            if entbase: instanceid = I(iri.absolutize(instanceid, entbase))

            model.add(I(instanceid), I(iri.absolutize('isbn', vocabbase)), inum)
            #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '')
            if itype: model.add(I(instanceid), I(iri.absolutize('isbnType', vocabbase)), itype)
            instance_ids.append(instanceid)
    else:
        instanceid = ids.send(['Instance', workid])
        if entbase: instanceid = I(iri.absolutize(instanceid, entbase))
        model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    for instanceid in instance_ids:
        model.add(I(workid), I(iri.absolutize('hasInstance', vocabbase)), instanceid)
        model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
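
Example #4 mints IDs with ids.send(['Instance', workid, inum]), which implies ids is a primed coroutine that turns key material into an identifier. A hypothetical sketch of such a generator (the real idgen may mint sequential or differently-hashed IDs):

import hashlib

def idgen(idbase):
    #Hypothetical: yield a deterministic ID for each batch of key material sent in
    resp = None
    while True:
        key = yield resp
        digest = hashlib.sha1(repr(key).encode('utf-8')).hexdigest()[:12]
        resp = idbase.rstrip('/') + '/' + digest

ids = idgen('http://example.org/entity')  #hypothetical base IRI
next(ids)  #prime the coroutine
instanceid = ids.send(['Instance', 'work1', '9780262517638'])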
Example #5
def _normalize_isbn(ctx):
    _isbn = isbn(ctx) if callable(isbn) else isbn
    _isbn = [_isbn] if not isinstance(_isbn, list) else _isbn
    return [compute_ean13_check(i) for i, _ in isbn_list([i for i in _isbn if i])]
Example #6
def record_handler(relsink, idbase, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    idbase - base IRI used for IDs of generated resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    plugins = plugins or []
    if ids is None: ids = idgen(idbase)
    #FIXME: Use thread local storage rather than function attributes

    #A few code modularization functions pulled into local context as closures
    def process_materialization(lookup, subfields, code=None):
        materializedid = hashid(idbase, tuple(subfields.items()))
        #The extra_props are parameters inherent to a particular MARC field/subfield for purposes of linked data representation
        if code is None: code = lookup
        (subst, extra_props) = MATERIALIZE[lookup]
        if RESOURCE_TYPE in extra_props:
            relsink.add(I(materializedid), TYPE_REL, I(iri.absolutize(extra_props[RESOURCE_TYPE], BFZ)))
        #logger.debug((lookup, subfields, extra_props))

        #T_prior_materializedids: assumed module-level/function-attribute state (see FIXME above)
        if materializedid not in T_prior_materializedids:
            #Just bundle in the subfields as they are, to avoid throwing out data. They can be otherwise used or just stripped later on
            #for k, v in itertools.chain((('marccode', code),), subfields.items(), extra_props.items()):
            for k, v in itertools.chain(subfields.items(), extra_props.items()):
                if k == RESOURCE_TYPE: continue
                fieldname = 'subfield-' + k
                if code + k in FIELD_RENAMINGS:
                    fieldname = FIELD_RENAMINGS[code + k]
                    if len(k) == 1: params['transforms'].append((code + k, fieldname)) #Only if proper MARC subfield
                    #params['transforms'].append((code + k, FIELD_RENAMINGS.get(sflookup, sflookup)))
                relsink.add(I(materializedid), iri.absolutize(fieldname, BFZ), v)
            T_prior_materializedids.add(materializedid)

        return materializedid, subst


    #FIXME: test correct MARC transforms info for annotations
    #Note: code and params are read from the enclosing record loop at call time
    def process_annotation(anntype, subfields, extra_annotation_props):
        #Separate annotation subfields from object subfields
        object_subfields = subfields.copy()
        annotation_subfields = {}
        for k, v in subfields.items():
            if code + k in ANNOTATIONS_FIELDS:
                annotation_subfields[k] = v
                del object_subfields[k]
            params['transforms'].append((code + k, code + k))

        #objectid = next(idg)
        #object_props.update(object_subfields)

        annotationid = next(ids)
        relsink.add(I(annotationid), TYPE_REL, I(iri.absolutize(anntype, BFZ)))
        for k, v in itertools.chain(annotation_subfields.items(), extra_annotation_props.items()):
            relsink.add(I(annotationid), I(iri.absolutize(k, BFZ)), v)

        #Return enough info to generate the main subject/object relationship. The annotation is taken care of at this point
        return annotationid, object_subfields

    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True
    try:
        while True:
            rec = yield
            #for plugin in plugins:
            #    plugin.send(dict(rec=rec))
            leader = None
            field008 = None
            #Add work item record
            workid = next(ids)
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', BFZ)))
            instanceid = next(ids)
            #logger.debug((workid, instanceid))
            params = {'workid': workid, 'model': relsink}

            relsink.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', BFZ)))
            #relsink.add((instanceid, iri.absolutize('leader', PROPBASE), leader))
            #Instances are added below
            #relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))

            #for service in g_services: service.send(NEW_RECORD, relsink, workid, instanceid)

            params['transforms'] = [] # set()
            params['fields_used'] = []
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]
                elif row[0] == CONTROLFIELD:
                    code, val = row[1].strip(), row[2]
                    key = 'tag-' + code
                    if code == '008':
                        params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, BFZ)), val)
                    params['fields_used'].append((code,))
                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1].strip(), row[2], row[3]
                    key = 'tag-' + code

                    handled = False
                    subfields = dict(( (sf[0].strip(), sf[1]) for sf in subfields ))
                    params['subfields'] = subfields
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    if subfields:
                        lookup = code
                        #See if any of the field codes represents a reference to an object which can be materialized

                        if code in MATERIALIZE:
                            materializedid, subst = process_materialization(code, subfields)
                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            params['transforms'].append((code, subst))
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))
                            logger.debug('.')
                            handled = True

                        if code in MATERIALIZE_VIA_ANNOTATION:
                            #FIXME: code comments for extra_object_props & extra_annotation_props
                            (subst, anntype, extra_annotation_props) = MATERIALIZE_VIA_ANNOTATION[code]
                            annotationid, object_subfields = process_annotation(anntype, subfields, extra_annotation_props)

                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            objectid = next(ids)
                            params['transforms'].append((code, subst))
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(objectid), {I(iri.absolutize('annotation', BFZ)): I(annotationid)})

                            for k, v in itertools.chain((('marccode', code),), object_subfields.items()):
                                #for k, v in itertools.chain(('marccode', code), object_subfields.items(), extra_object_props.items()):
                                relsink.add(I(objectid), I(iri.absolutize(k, BFZ)), v)

                            logger.debug('.')
                            handled = True

                        #See if any of the field+subfield codes represents a reference to an object which can be materialized
                        if not handled:
                            for k, v in subfields.items():
                                lookup = code + k
                                if lookup in MATERIALIZE:
                                    #XXX At first glance you'd think you could always derive code from lookup (e.g. lookup[:3]), but what if someone trims the left zero fill on the codes in the serialization?
                                    materializedid, subst = process_materialization(lookup, subfields, code=code)
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    params['transforms'].append((lookup, subst))
                                    relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))

                                    #Is the MARC code part of the hash computation for the materialized object ID? Surely not!
                                    #materializedid = hashid((code,) + tuple(subfields.items()))
                                    logger.debug('.')
                                    handled = True

                                else:
                                    field_name = 'tag-' + lookup
                                    if lookup in FIELD_RENAMINGS:
                                        field_name = FIELD_RENAMINGS[lookup]
                                    #Handle the simple field_name substitution of a label name for a MARC code
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    #logger.debug(repr(I(iri.absolutize(field_name, BFZ))))
                                    params['transforms'].append((lookup, field_name))
                                    relsink.add(I(subject), I(iri.absolutize(field_name, BFZ)), v)

                    #print >> sys.stderr, lookup, key
                    #if val:
                    #    subject = instanceid if code in INSTANCE_FIELDS else workid
                    #    relsink.add(I(subject), I(iri.absolutize(key, BFZ)), val)

                params['code'] = code

            special_properties = {}
            #Guard against records that lack a leader or an 008 field
            if leader:
                for k, v in process_leader(leader):
                    special_properties.setdefault(k, set()).add(v)

            if field008:
                for k, v in process_008(field008):
                    special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #Leader & 008 processing can emit repeated values, so we de-dupe by
            #collecting into sets, then convert back to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, BFZ)), item)


            #reduce lists of just one item
            #for k, v in work_item.items():
            #    if type(v) is list and len(v) == 1:
            #        work_item[k] = v[0]
            #work_sink.send(work_item)


            #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
            ISBN_FIELD = 'tag-020'
            isbn_stmts = relsink.match(subj=instanceid, pred=iri.absolutize(ISBN_FIELD, BFZ))
            isbns = [ s[2] for s in isbn_stmts ]
            logger.debug('ISBNs: {0}'.format(list(isbn_list(isbns))))
            other_instance_ids = []
            subscript = ord('a')
            newid = None
            for subix, (inum, itype) in enumerate(isbn_list(isbns)):
                #print >> sys.stderr, subix, inum, itype
                newid = next(ids)
                duplicate_statements(relsink, instanceid, newid)
                relsink.add(I(newid), I(iri.absolutize('isbn', BFZ)), inum)
                #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '')
                if itype: relsink.add(I(newid), I(iri.absolutize('isbnType', BFZ)), itype)
                other_instance_ids.append(newid)

            if not other_instance_ids:
                #Make sure it's created as an instance even if it has no ISBN
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))
                params.setdefault('instanceids', []).append(instanceid)

            for iid in other_instance_ids:
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(iid))
                params.setdefault('instanceids', []).append(iid)

            #if newid is None: #No ISBN specified
            #    send_instance(ninst)

            #ix += 1
            logger.debug('+')

            for plugin in plugins:
                plugin.send(params)

            #Can't really use this because it includes the outer []
            #jsondump(relsink, out)

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #iterencode avoids building one big JSON string in memory and avoids file pointer seeking,
            #though it still builds a big list in memory, so this is still being optimized
            #Write chunks lagged by one so we can strip the encoder's opening '['
            #from the first chunk and its closing ']' from the last; the outer
            #JSON array delimiters are managed by this handler itself
            for chunk in json.JSONEncoder().iterencode([stmt for stmt in relsink]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])
            if postprocess: postprocess(rec)
            #Count every record; stop early only when a limit is set
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
    except GeneratorExit:
        out.write(']')
    return
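
record_handler is a consumer coroutine: a caller primes it with next(), sends in parsed records one at a time, and closes it so the GeneratorExit branch writes the closing ']'. A hypothetical driver, assuming records is an iterable of already-parsed MARC rows:

import sys

def drive(records, relsink, idbase):
    #Hypothetical driver for the record_handler coroutine above
    limiting = [0, None]  #mutable [count, limit] pair per the docstring; None means no limit
    handler = record_handler(relsink, idbase, limiting=limiting, out=sys.stdout)
    next(handler)  #advance to the first `rec = yield`
    try:
        for rec in records:
            handler.send(rec)
    finally:
        handler.close()  #raises GeneratorExit inside, which writes the closing ']'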
Example #7
def test_isbn_list(inputdata, expected):
    result = list(isbn_list(inputdata))
    assert result == expected, (result, expected)
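
The test takes (inputdata, expected) pairs, presumably supplied by a parametrized fixture. A hypothetical parametrization, assuming isbn_list yields (number, qualifier) tuples; the exact output shape depends on the real implementation:

import pytest

@pytest.mark.parametrize('inputdata,expected', [
    (['9780262517638 (pbk.)'], [('9780262517638', 'pbk.')]),  #hypothetical output shape
    ([], []),
])
def test_isbn_list(inputdata, expected):
    result = list(isbn_list(inputdata))
    assert result == expected, (result, expected)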
Example #8
def isbn_instancegen(params, loop, model):
    '''
    Default handling for splitting a MARC record into a FRBR Work plus Instances signalled by ISBNs

    According to Vicki, Instances can be signalled by 007, 020 or 3XX, but we stick to 020 for now
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['default-origin']
    ids = params['ids']
    plugins = params['plugins']

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))

    isbns = [val for code, val in marc_lookup(input_model, '020$a')]
    logger.debug('Raw ISBNs:\t{0}'.format(isbns))

    # sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    instance_ids = []
    logger.debug('Normalized ISBNs:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance',
                                            ctx_params=params,
                                            model_to_update=output_model,
                                            data=data,
                                            loop=loop)
            if entbase: instanceid = I(iri.absolutize(instanceid, entbase))

            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype: output_model.add(I(instanceid), ISBN_VTYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        #If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance',
                                        ctx_params=params,
                                        model_to_update=output_model,
                                        data=data,
                                        loop=loop)
        instanceid = I(iri.absolutize(instanceid,
                                      entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), VTYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
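
The "sorted to remove non-determinism" comment in Examples #2 and #8 makes sense if materialize_entity derives each Instance ID by hashing its data pairs, so the same record must always present the same pairs in the same order to yield the same ID. A hypothetical sketch of that idea (the real function also updates the model and consults ctx_params):

import hashlib
import json

def materialize_entity(etype, ctx_params=None, model_to_update=None,
                       data=None, loop=None):
    #Hypothetical: hash the entity type plus its data pairs into a stable ID
    key = json.dumps([etype, data])
    return 'i' + hashlib.sha1(key.encode('utf-8')).hexdigest()[:10]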