def _normalize_isbn(ctx):
    '''
    Resolve the enclosing-scope `isbn` value (calling it with *ctx* when it
    is callable), coerce it to a list, drop falsy entries, and return the
    EAN-13-checked form of each ISBN/type pair that isbn_list yields.

    NOTE(review): `isbn`, `isbn_list` and `compute_ean13_check` are free
    names from the enclosing scope/module -- confirm their contracts there.
    '''
    raw = isbn(ctx) if callable(isbn) else isbn
    if not isinstance(raw, list):
        raw = [raw]
    candidates = [value for value in raw if value]
    # isbn_list yields (number, qualifier) pairs; only the number is used
    return [compute_ean13_check(number) for number, qualifier in isbn_list(candidates)]
def isbn_instancegen(params, loop, model):
    '''
    Default handling of the idea of splitting a MARC record with FRBR Work
    info as well as instances signalled by ISBNs

    According to Vicki Instances can be signalled by 007, 020 or 3XX, but we
    stick to 020 for now

    params - dict of processing state; the keys read here are unpacked below
    loop - passed through to materialize_entity (presumably an event loop --
           TODO confirm with materialize_entity's signature)
    model - not referenced in this body; retained for interface compatibility

    Returns the list of Instance resource IDs created (one per normalized
    ISBN, or a single default Instance when the record has no ISBNs).
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['workid']
    ids = params['ids']  # NOTE(review): unused in this body
    plugins = params['plugins']  # NOTE(review): unused in this body

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))

    # marc_lookup yields (code, value) pairs; only the 020$a values are kept
    isbns = list(( val for code, val in marc_lookup(input_model, '020$a')))
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))

    # sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    subscript = ord('a')  # NOTE(review): unused leftover from an older version
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        # isbn_list yields (isbn, type-qualifier) pairs; one Instance per ISBN
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            # The EAN-13 form is part of the materialization data, so each
            # distinct ISBN hashes to a distinct Instance resource ID
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance', ctx_params=params, loop=loop, model_to_update=params['output_model'], data=data)
            if entbase:
                instanceid = I(iri.absolutize(instanceid, entbase))
            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype:
                output_model.add(I(instanceid), ISBN_TYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        #If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance', ctx_params=params, loop=loop, model_to_update=params['output_model'], data=data)
        instanceid = I(iri.absolutize(instanceid, entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))
    return instance_ids
def instancegen(isbns):
    '''
    Default handling of the idea of splitting a MARC record with FRBR Work
    info as well as instances signalled by ISBNs

    isbns - raw ISBN values to be normalized via isbn_list

    NOTE(review): `instance_item` and `new_instances` are free names not
    defined in this function -- presumably module/enclosing-scope state that
    this legacy version mutates; confirm before reuse. `unichr` means this
    version is Python 2 only.
    '''
    base_instance_id = instance_item['id']
    instance_ids = []  # NOTE(review): collected but never returned -- results flow out via new_instances
    subscript = ord('a')
    # One copied instance item per (isbn, type) pair; subsequent instances
    # get an 'a', 'b', ... suffix appended to the base ID (first keeps it bare)
    for subix, (inum, itype) in enumerate(isbn_list(isbns)):
        #print >> sys.stderr, subix, inum, itype
        subitem = instance_item.copy()
        subitem['isbn'] = inum
        subitem['id'] = base_instance_id + (unichr(subscript + subix) if subix else '')
        if itype:
            subitem['isbnType'] = itype
        instance_ids.append(subitem['id'])
        new_instances.append(subitem)
def isbn_instancegen(params): ''' Default handling of the idea of splitting a MARC record with FRBR Work info as well as instances signalled by ISBNs ''' #Handle ISBNs re: https://foundry.zepheira.com/issues/1976 entbase = params['entbase'] model = params['model'] vocabbase = params['vocabbase'] logger = params['logger'] ids = params['ids'] rec = params['rec'] existing_ids = params['existing_ids'] workid = params['workid'] isbns = marc_lookup(rec, ['020$a']) logger.debug('Raw ISBNS:\t{0}'.format(isbns)) normalized_isbns = list(isbn_list(isbns)) subscript = ord('a') instance_ids = [] logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns)) if normalized_isbns: for subix, (inum, itype) in enumerate(normalized_isbns): instanceid = ids.send(['Instance', workid, inum]) if entbase: instanceid = I(iri.absolutize(instanceid, entbase)) model.add(I(instanceid), I(iri.absolutize('isbn', vocabbase)), inum) #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '') if itype: model.add(I(instanceid), I(iri.absolutize('isbnType', vocabbase)), itype) instance_ids.append(instanceid) else: instanceid = ids.send(['Instance', workid]) if entbase: instanceid = I(iri.absolutize(instanceid, entbase)) model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase))) existing_ids.add(instanceid) instance_ids.append(instanceid) for instanceid in instance_ids: model.add(I(workid), I(iri.absolutize('hasInstance', vocabbase)), instanceid) model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase))) return instance_ids
def record_handler(relsink, idbase, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    Coroutine (driven via .send(rec)) that converts parsed MARC records into
    linked-data statements in `relsink` and streams a JSON array of all
    statements to `out`.

    relsink - Versa-style statement model (has .add and .match) -- TODO confirm API
    idbase - base IRI used for IDs of generated resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    plugins - coroutines sent the per-record `params` dict after processing
    ids - resource ID generator; defaults to idgen(idbase)
    postprocess - optional callable invoked with each raw record
    out - writable stream receiving the JSON output; '[' is written up front
          and ']' on GeneratorExit
    logger - logging-like object
    kwargs - accepted but unused in this body
    '''
    plugins = plugins or []
    if ids is None:
        ids = idgen(idbase)

    #FIXME: Use thread local storage rather than function attributes

    #A few code modularization functions pulled into local context as closures
    def process_materialization(lookup, subfields, code=None):
        # NOTE(review): `params` and `T_prior_materializedids` are free names
        # bound outside this closure (params is rebound per record below;
        # T_prior_materializedids is presumably the function-attribute cache
        # referenced by the FIXME above -- confirm).
        materializedid = hashid(idbase, tuple(subfields.items()))
        #The extra_props are parameters inherent to a particular MARC field/subfield for purposes of linked data representation
        if code is None:
            code = lookup
        (subst, extra_props) = MATERIALIZE[lookup]
        if RESOURCE_TYPE in extra_props:
            relsink.add(I(materializedid), TYPE_REL, I(iri.absolutize(extra_props[RESOURCE_TYPE], BFZ)))
        #logger.debug((lookup, subfields, extra_props))

        # Only emit the materialized object's own properties once per
        # distinct ID (the same subfield set hashes to the same ID)
        if materializedid not in T_prior_materializedids:
            #Just bundle in the subfields as they are, to avoid throwing out data. They can be otherwise used or just stripped later on
            #for k, v in itertools.chain((('marccode', code),), subfields.items(), extra_props.items()):
            for k, v in itertools.chain(subfields.items(), extra_props.items()):
                if k == RESOURCE_TYPE:
                    continue
                fieldname = 'subfield-' + k
                if code + k in FIELD_RENAMINGS:
                    fieldname = FIELD_RENAMINGS[code + k]
                if len(k) == 1:
                    params['transforms'].append((code + k, fieldname)) #Only if proper MARC subfield
                #params['transforms'].append((code + k, FIELD_RENAMINGS.get(sflookup, sflookup)))
                relsink.add(I(materializedid), iri.absolutize(fieldname, BFZ), v)
            T_prior_materializedids.add(materializedid)

        return materializedid, subst

    #FIXME: test correct MARC transforms info for annotations
    def process_annotation(anntype, subfields, extra_annotation_props):
        # NOTE(review): `code` and `params` are free names read from the
        # record loop below -- this closure is only valid mid-iteration.
        #Separate annotation subfields from object subfields
        object_subfields = subfields.copy()
        annotation_subfields = {}
        for k, v in subfields.items():
            if code + k in ANNOTATIONS_FIELDS:
                annotation_subfields[k] = v
                del object_subfields[k]
                params['transforms'].append((code + k, code + k))

        #objectid = next(idg)
        #object_props.update(object_subfields)

        annotationid = next(ids)
        relsink.add(I(annotationid), TYPE_REL, I(iri.absolutize(anntype, BFZ)))
        for k, v in itertools.chain(annotation_subfields.items(), extra_annotation_props.items()):
            relsink.add(I(annotationid), I(iri.absolutize(k, BFZ)), v)

        #Return enough info to generate the main subject/object relationship. The annotation is taken care of at this point
        return annotationid, object_subfields

    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True

    try:
        while True:
            rec = yield
            #for plugin in plugins:
            #    plugin.send(dict(rec=rec))
            leader = None
            #Add work item record
            workid = next(ids)
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', BFZ)))
            instanceid = next(ids)
            #logger.debug((workid, instanceid))

            params = {'workid': workid, 'model': relsink}
            relsink.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', BFZ)))
            #relsink.add((instanceid, iri.absolutize('leader', PROPBASE), leader))
            #Instances are added below
            #relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))
            #for service in g_services: service.send(NEW_RECORD, relsink, workid, instanceid)
            params['transforms'] = [] # set()
            params['fields_used'] = []
            for row in rec:
                code = None
                if row[0] == LEADER:
                    params['leader'] = leader = row[1]
                elif row[0] == CONTROLFIELD:
                    code, val = row[1].strip(), row[2]
                    key = 'tag-' + code
                    if code == '008':
                        # NOTE(review): field008 is only bound when an 008
                        # field is present; records lacking 008 would make
                        # the process_008 call below raise NameError --
                        # confirm upstream guarantees an 008 field.
                        params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, BFZ)), val)
                    params['fields_used'].append((code,))
                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1].strip(), row[2], row[3]  # xmlattrs unused here
                    key = 'tag-' + code
                    handled = False
                    subfields = dict(( (sf[0].strip(), sf[1]) for sf in subfields ))
                    params['subfields'] = subfields
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    if subfields:
                        lookup = code
                        #See if any of the field codes represents a reference to an object which can be materialized
                        if code in MATERIALIZE:
                            materializedid, subst = process_materialization(code, subfields)
                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            params['transforms'].append((code, subst))
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))
                            logger.debug('.')
                            handled = True

                        if code in MATERIALIZE_VIA_ANNOTATION:
                            #FIXME: code comments for extra_object_props & extra_annotation_props
                            (subst, anntype, extra_annotation_props) = MATERIALIZE_VIA_ANNOTATION[code]
                            annotationid, object_subfields = process_annotation(anntype, subfields, extra_annotation_props)
                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            objectid = next(ids)
                            params['transforms'].append((code, subst))
                            # The annotation ID rides along as a statement attribute
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(objectid), {I(iri.absolutize('annotation', BFZ)): I(annotationid)})

                            for k, v in itertools.chain((('marccode', code),), object_subfields.items()):
                            #for k, v in itertools.chain(('marccode', code), object_subfields.items(), extra_object_props.items()):
                                relsink.add(I(objectid), I(iri.absolutize(k, BFZ)), v)
                            logger.debug('.')
                            handled = True

                        #See if any of the field+subfield codes represents a reference to an object which can be materialized
                        if not handled:
                            for k, v in subfields.items():
                                lookup = code + k
                                if lookup in MATERIALIZE:
                                    #XXX At first glance you'd think you can always derive code from lookup (e.g. lookup[:3] but what if e.g. someone trims the left zero fill on the codes in the serialization?
                                    materializedid, subst = process_materialization(lookup, subfields, code=code)
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    params['transforms'].append((lookup, subst))
                                    relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))

                                    #Is the MARC code part of the hash computation for the materiaalized object ID? Surely not!
                                    #materializedid = hashid((code,) + tuple(subfields.items()))
                                    logger.debug('.')
                                    handled = True
                                else:
                                    field_name = 'tag-' + lookup
                                    if lookup in FIELD_RENAMINGS:
                                        field_name = FIELD_RENAMINGS[lookup]
                                    #Handle the simple field_name substitution of a label name for a MARC code
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    #logger.debug(repr(I(iri.absolutize(field_name, BFZ))))
                                    params['transforms'].append((lookup, field_name))
                                    relsink.add(I(subject), I(iri.absolutize(field_name, BFZ)), v)

                    #print >> sys.stderr, lookup, key
                    #if val:
                    #    subject = instanceid if code in INSTANCE_FIELDS else workid
                    #    relsink.add(I(subject), I(iri.absolutize(key, BFZ)), val)
                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            for k, v in process_008(field008):
                special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #We get some repeated values out of leader & 008 processing, and we want to
            #Remove dupes so we did so by working with sets then converting to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, BFZ)), item)

            #reduce lists of just one item
            #for k, v in work_item.items():
            #    if type(v) is list and len(v) == 1:
            #        work_item[k] = v[0]
            #work_sink.send(work_item)

            #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
            ISBN_FIELD = 'tag-020'
            isbn_stmts = relsink.match(subj=instanceid, pred=iri.absolutize(ISBN_FIELD, BFZ))
            isbns = [ s[2] for s in isbn_stmts ]
            logger.debug('ISBNS: {0}'.format(list(isbn_list(isbns))))
            other_instance_ids = []
            subscript = ord('a')  # NOTE(review): unused except in the commented-out suffix logic below
            newid = None
            # One cloned Instance per (isbn, type) pair from isbn_list
            for subix, (inum, itype) in enumerate(isbn_list(isbns)):
                #print >> sys.stderr, subix, inum, itype
                newid = next(ids)
                duplicate_statements(relsink, instanceid, newid)
                relsink.add(I(newid), I(iri.absolutize('isbn', BFZ)), inum)
                #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '')
                if itype:
                    relsink.add(I(newid), I(iri.absolutize('isbnType', BFZ)), itype)
                other_instance_ids.append(newid)

            if not other_instance_ids:
                #Make sure it's created as an instance even if it has no ISBN
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))
                params.setdefault('instanceids', []).append(instanceid)

            for iid in other_instance_ids:
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(iid))
                params.setdefault('instanceids', []).append(iid)

            #if newid is None: #No ISBN specified
            #    send_instance(ninst)
            #ix += 1
            logger.debug('+')
            for plugin in plugins:
                plugin.send(params)

            #Can't really use this because it include outer []
            #jsondump(relsink, out)
            if not first_record:
                out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
            #Then again builds a big list in memory, so still working on opt here
            # Lag the output by one chunk so the leading '[' of the first
            # chunk and the trailing ']' of the last can be stripped, letting
            # per-record arrays concatenate into one outer JSON array
            for chunk in json.JSONEncoder().iterencode([ stmt for stmt in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk:
                out.write(last_chunk[:-1])
            if postprocess:
                postprocess(rec)
            if limiting[1] is not None:
                limiting[0] += 1
                if limiting[0] >= limiting[1]:
                    break
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
    except GeneratorExit:
        # Caller closed the coroutine: terminate the JSON array
        out.write(']')
    return
def test_isbn_list(inputdata, expected):
    '''Parameterized check: isbn_list(inputdata) must yield exactly expected.'''
    observed = [pair for pair in isbn_list(inputdata)]
    assert observed == expected, (observed, expected)
def isbn_instancegen(params, loop, model):
    '''
    Default handling of the idea of splitting a MARC record with FRBR Work
    info as well as instances signalled by ISBNs

    According to Vicki Instances can be signalled by 007, 020 or 3XX, but we
    stick to 020 for now

    params - dict of processing state; note this variant reads the Work ID
             from params['default-origin'] rather than params['workid']
    loop - passed through to materialize_entity (presumably an event loop --
           TODO confirm with materialize_entity's signature)
    model - not referenced in this body; retained for interface compatibility

    Returns the list of Instance resource IDs created (one per normalized
    ISBN, or a single default Instance when the record has no ISBNs).
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['default-origin']
    ids = params['ids']  # NOTE(review): unused in this body
    plugins = params['plugins']  # NOTE(review): unused in this body

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))

    # marc_lookup yields (code, value) pairs; only the 020$a values are kept
    isbns = list((val for code, val in marc_lookup(input_model, '020$a')))
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))

    # sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    subscript = ord('a')  # NOTE(review): unused leftover from an older version
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        # isbn_list yields (isbn, type-qualifier) pairs; one Instance per ISBN
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            # The EAN-13 form is part of the materialization data, so each
            # distinct ISBN hashes to a distinct Instance resource ID
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance', ctx_params=params, model_to_update=output_model, data=data, loop=loop)
            if entbase:
                instanceid = I(iri.absolutize(instanceid, entbase))
            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype:
                output_model.add(I(instanceid), ISBN_VTYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        #If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance', ctx_params=params, model_to_update=output_model, data=data, loop=loop)
        instanceid = I(iri.absolutize(instanceid, entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), VTYPE_REL, I(iri.absolutize('Instance', vocabbase)))
    return instance_ids