def record_handler(loop, relsink, entbase=None, vocabbase=BFZ, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    loop - asyncio event loop
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    _final_tasks = set()  #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)
    logger.debug('GRIPPO: {0}'.format(repr(entbase)))

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    initialize(hashidgen=ids, existing_ids=existing_ids)

    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True
    try:
        while True:
            rec = yield
            leader = None
            field008 = None  #Initialized so the leader/008 postprocessing below can cope with records lacking an 008
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            workhash = record_hash_key(rec)
            workid = ids.send('Work:' + workhash)
            existing_ids.add(workid)
            logger.debug('Uniform title from 245$a: {0}'.format(marc_lookup(rec, ['245$a'])))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work:' + workhash))

            if entbase: workid = I(iri.absolutize(workid, entbase))
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params = {'workid': workid, 'rec': rec, 'logger': logger, 'model': relsink, 'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids, 'existing_ids': existing_ids}

            #Figure out instances
            instanceids = instancegen(params)
            if instanceids:
                instanceid = instanceids[0]

            params['instance_ids'] = instanceids
            params['transforms'] = []  # set()
            params['fields_used'] = []
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]

                elif row[0] == CONTROLFIELD:
                    code, val = row[1], row[2]
                    key = 'tag-' + code
                    if code == '008':
                        params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, vocabbase)), val)
                    params['fields_used'].append((code,))

                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1], row[2], row[3]
                    #The XML attributes carry the indicators
                    indicators = ((xmlattrs.get('ind1') or ' ')[0].replace(' ', '#'),
                                  (xmlattrs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + code

                    handled = False
                    params['subfields'] = subfields
                    params['indicators'] = indicators
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    #Build Versa processing context
                    to_process = []
                    #logger.debug(repr(indicators))
                    if indicators == ('#', '#'):
                        #No indicators set
                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS:
                                to_process.append((TRANSFORMS[lookup], v))
                        lookup = code
                        if lookup in TRANSFORMS:
                            to_process.append((TRANSFORMS[lookup], ''))
                    else:
                        #One or the other indicator is set, so check the transforms against those
                        lookup = '{0}-{1}{2}'.format(*((code,) + indicators))
                        if lookup in TRANSFORMS:
                            to_process.append((TRANSFORMS[lookup], ''))
                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS:
                                to_process.append((TRANSFORMS[lookup], v))

                    #Apply all the handlers that were found
                    for func, val in to_process:
                        ctx = context(workid, [(workid, code, val, subfields)], relsink, base=vocabbase)
                        new_stmts = func(ctx, workid, instanceid)
                        #FIXME: Use add
                        for s in new_stmts: relsink.add(*s)
                        #logger.debug('.')

                    if not to_process:
                        #Nothing else has handled this data field; go to the fallback
                        fallback_rel_base = 'tag-' + code
                        for k, v in subfields.items():
                            fallback_rel = fallback_rel_base + k
                            #params['transforms'].append((code, fallback_rel))
                            relsink.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), v)

                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            if field008:  #Guard against records with no 008 control field
                for k, v in process_008(field008):
                    special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #We get some repeated values out of leader & 008 processing, and we want to
            #remove dupes, so we work with sets and then convert to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, vocabbase)), item)

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                yield from plugin[BF_MARCREC_TASK](loop, relsink, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting it up as an async task and then immediately deferring to the next task via yield from sleep
                #leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01)  #Basically yield to next task

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
            #Then again it builds a big list in memory, so still working on optimization here
            for chunk in json.JSONEncoder().iterencode([ link for link in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess(rec)

            #limiting: running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        out.write(']')

        if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            task = asyncio.Task(plugin[BF_FINAL_TASK](loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                if len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                    loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)

    #print('DONE')
    #raise

    return
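#A minimal usage sketch (illustrative only, not part of the original module): record_handler
#above is a coroutine driven by .send(). Prime it with next(), push each parsed MARC record in,
#then close() it to raise GeneratorExit, which writes the closing ']' and schedules any plug-ins'
#final tasks on the event loop. The names _example_drive_handler, model, records, outfile, and
#the entbase URL below are assumptions standing in for a Versa model, an iterable of parsed
#records, and a writable file object.
def _example_drive_handler(loop, model, records, outfile):
    handler = record_handler(loop, model, entbase='http://example.org/',
                             limiting=[0, None], out=outfile)
    next(handler)              #Prime the coroutine so it advances to the first `rec = yield`
    for rec in records:
        handler.send(rec)      #Each send resumes the loop body for one record
    handler.close()            #Triggers the GeneratorExit cleanup path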
def record_handler(relsink, idbase, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    idbase - base IRI used for IDs of generated resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    plugins = plugins or []
    if ids is None: ids = idgen(idbase)
    #FIXME: Use thread-local storage rather than function attributes

    #A few code modularization functions pulled into local context as closures
    def process_materialization(lookup, subfields, code=None):
        materializedid = hashid(idbase, tuple(subfields.items()))
        #The extra_props are parameters inherent to a particular MARC field/subfield for purposes of linked data representation
        if code is None: code = lookup
        (subst, extra_props) = MATERIALIZE[lookup]
        if RESOURCE_TYPE in extra_props:
            relsink.add(I(materializedid), TYPE_REL, I(iri.absolutize(extra_props[RESOURCE_TYPE], BFZ)))
        #logger.debug((lookup, subfields, extra_props))

        if materializedid not in T_prior_materializedids:
            #Just bundle in the subfields as they are, to avoid throwing out data. They can be otherwise used or just stripped later on
            #for k, v in itertools.chain((('marccode', code),), subfields.items(), extra_props.items()):
            for k, v in itertools.chain(subfields.items(), extra_props.items()):
                if k == RESOURCE_TYPE: continue
                fieldname = 'subfield-' + k
                if code + k in FIELD_RENAMINGS:
                    fieldname = FIELD_RENAMINGS[code + k]
                if len(k) == 1: params['transforms'].append((code + k, fieldname))  #Only if proper MARC subfield
                #params['transforms'].append((code + k, FIELD_RENAMINGS.get(sflookup, sflookup)))
                relsink.add(I(materializedid), iri.absolutize(fieldname, BFZ), v)
            T_prior_materializedids.add(materializedid)

        return materializedid, subst

    #FIXME: test correct MARC transforms info for annotations
    def process_annotation(anntype, subfields, extra_annotation_props):
        #Separate annotation subfields from object subfields
        object_subfields = subfields.copy()
        annotation_subfields = {}
        for k, v in subfields.items():
            if code + k in ANNOTATIONS_FIELDS:
                annotation_subfields[k] = v
                del object_subfields[k]
            params['transforms'].append((code + k, code + k))

        #objectid = next(idg)
        #object_props.update(object_subfields)

        annotationid = next(ids)
        relsink.add(I(annotationid), TYPE_REL, I(iri.absolutize(anntype, BFZ)))
        for k, v in itertools.chain(annotation_subfields.items(), extra_annotation_props.items()):
            relsink.add(I(annotationid), I(iri.absolutize(k, BFZ)), v)

        #Return enough info to generate the main subject/object relationship. The annotation is taken care of at this point
        return annotationid, object_subfields

    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True

    try:
        while True:
            rec = yield
            #for plugin in plugins:
            #    plugin.send(dict(rec=rec))
            leader = None
            field008 = None  #Initialized so the leader/008 postprocessing below can cope with records lacking an 008
            #Add work item record
            workid = next(ids)
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', BFZ)))
            instanceid = next(ids)
            #logger.debug((workid, instanceid))

            params = {'workid': workid, 'model': relsink}
            relsink.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', BFZ)))
            #relsink.add((instanceid, iri.absolutize('leader', PROPBASE), leader))
            #Instances are added below
            #relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))
            #for service in g_services: service.send(NEW_RECORD, relsink, workid, instanceid)
            params['transforms'] = []  # set()
            params['fields_used'] = []
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]

                elif row[0] == CONTROLFIELD:
                    code, val = row[1].strip(), row[2]
                    key = 'tag-' + code
                    if code == '008':
                        params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, BFZ)), val)
                    params['fields_used'].append((code,))

                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1].strip(), row[2], row[3]
                    key = 'tag-' + code

                    handled = False
                    subfields = dict(( (sf[0].strip(), sf[1]) for sf in subfields ))
                    params['subfields'] = subfields
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    if subfields:
                        lookup = code
                        #See if any of the field codes represents a reference to an object which can be materialized
                        if code in MATERIALIZE:
                            materializedid, subst = process_materialization(code, subfields)
                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            params['transforms'].append((code, subst))
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))
                            logger.debug('.')
                            handled = True

                        if code in MATERIALIZE_VIA_ANNOTATION:
                            #FIXME: code comments for extra_object_props & extra_annotation_props
                            (subst, anntype, extra_annotation_props) = MATERIALIZE_VIA_ANNOTATION[code]
                            annotationid, object_subfields = process_annotation(anntype, subfields, extra_annotation_props)
                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            objectid = next(ids)
                            params['transforms'].append((code, subst))
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(objectid), {I(iri.absolutize('annotation', BFZ)): I(annotationid)})
                            for k, v in itertools.chain((('marccode', code),), object_subfields.items()):
                            #for k, v in itertools.chain(('marccode', code), object_subfields.items(), extra_object_props.items()):
                                relsink.add(I(objectid), I(iri.absolutize(k, BFZ)), v)
                            logger.debug('.')
                            handled = True

                        #See if any of the field+subfield codes represents a reference to an object which can be materialized
                        if not handled:
                            for k, v in subfields.items():
                                lookup = code + k
                                if lookup in MATERIALIZE:
                                    #XXX At first glance you'd think you can always derive code from lookup (e.g. lookup[:3]), but what if e.g. someone trims the left zero fill on the codes in the serialization?
                                    materializedid, subst = process_materialization(lookup, subfields, code=code)
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    params['transforms'].append((lookup, subst))
                                    relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))
                                    #Is the MARC code part of the hash computation for the materialized object ID? Surely not!
                                    #materializedid = hashid((code,) + tuple(subfields.items()))
                                    logger.debug('.')
                                    handled = True
                                else:
                                    field_name = 'tag-' + lookup
                                    if lookup in FIELD_RENAMINGS:
                                        field_name = FIELD_RENAMINGS[lookup]
                                    #Handle the simple field_name substitution of a label name for a MARC code
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    #logger.debug(repr(I(iri.absolutize(field_name, BFZ))))
                                    params['transforms'].append((lookup, field_name))
                                    relsink.add(I(subject), I(iri.absolutize(field_name, BFZ)), v)

                    #print >> sys.stderr, lookup, key
                    #if val:
                    #    subject = instanceid if code in INSTANCE_FIELDS else workid
                    #    relsink.add(I(subject), I(iri.absolutize(key, BFZ)), val)

                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            if field008:  #Guard against records with no 008 control field
                for k, v in process_008(field008):
                    special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #We get some repeated values out of leader & 008 processing, and we want to
            #remove dupes, so we work with sets and then convert to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, BFZ)), item)

            #reduce lists of just one item
            #for k, v in work_item.items():
            #    if type(v) is list and len(v) == 1:
            #        work_item[k] = v[0]
            #work_sink.send(work_item)

            #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
            ISBN_FIELD = 'tag-020'
            isbn_stmts = relsink.match(subj=instanceid, pred=iri.absolutize(ISBN_FIELD, BFZ))
            isbns = [ s[2] for s in isbn_stmts ]
            logger.debug('ISBNS: {0}'.format(list(isbn_list(isbns))))
            other_instance_ids = []
            subscript = ord('a')
            newid = None
            for subix, (inum, itype) in enumerate(isbn_list(isbns)):
                #print >> sys.stderr, subix, inum, itype
                newid = next(ids)
                duplicate_statements(relsink, instanceid, newid)
                relsink.add(I(newid), I(iri.absolutize('isbn', BFZ)), inum)
                #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '')
                if itype: relsink.add(I(newid), I(iri.absolutize('isbnType', BFZ)), itype)
                other_instance_ids.append(newid)

            if not other_instance_ids:
                #Make sure it's created as an instance even if it has no ISBN
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))
                params.setdefault('instanceids', []).append(instanceid)

            for iid in other_instance_ids:
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(iid))
                params.setdefault('instanceids', []).append(iid)

            #if newid is None: #No ISBN specified
            #    send_instance(ninst)

            #ix += 1
            logger.debug('+')

            for plugin in plugins:
                plugin.send(params)

            #Can't really use this because it includes the outer []
            #jsondump(relsink, out)

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
            #Then again it builds a big list in memory, so still working on optimization here
            for chunk in json.JSONEncoder().iterencode([ stmt for stmt in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])

            if postprocess: postprocess(rec)
            if limiting[1] is not None:
                limiting[0] += 1
                if limiting[0] >= limiting[1]:
                    break
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
    except GeneratorExit:
        out.write(']')
    return
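#A small sketch of the iterencode chunk-trimming technique used in both handlers above
#(illustrative only; _example_write_inner_json, links, and out are assumed stand-ins for the
#Versa link list and the output file object). json.JSONEncoder().iterencode yields the array
#piecewise; slicing the leading '[' off the first chunk and the trailing ']' off the last lets
#each record's links be spliced into the single top-level JSON array that the handler opens
#with out.write('[') and closes with out.write(']').
import json

def _example_write_inner_json(links, out):
    last_chunk = None
    for chunk in json.JSONEncoder().iterencode(links):
        if last_chunk is None:
            last_chunk = chunk[1:]       #Drop the leading '[' of the first chunk
        else:
            out.write(last_chunk)
            last_chunk = chunk
    if last_chunk:
        out.write(last_chunk[:-1])       #Drop the trailing ']' of the last chunk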