def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None, addtype=True, loop=None, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource)
    type and a data mapping according to the resource type. Implements the
    Libhub Resource Hash Convention: the resource ID is derived from a
    canonical JSON serialization of the (type + data) key/value pairs.
    As a convenience, if a vocabulary base is provided (in ctx_params),
    concatenate it to etype and to any relative data keys.

    etype - entity (resource) type; made absolute against vocabbase if relative
    ctx_params - dict of context parameters: vocabbase, entbase, existing_ids,
        plugins, logger, output_model, ids
    model_to_update - optional Versa model; if given, a type statement for the
        new resource is added to it
    data - list of key/value pairs used to compute the hash. If empty the
        hash will be a default for the entity type
    addtype - if true, prepend the (TYPE_REL, etype) pair to the hashed data
    loop - asyncio event loop, passed through to BF_MATRES_TASK plugins
    logger - fallback logger, overridden by ctx_params['logger'] when present

    Returns the generated (hash-based) entity ID.

    Unlike earlier revisions, this function does NOT mutate the data arg;
    it works on a shallow copy.
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    #Fix: copy the incoming list so the caller's data arg is not mangled
    #(the previous implementation inserted the type pair in place)
    data = list(data) if data else []
    if addtype:
        data.insert(0, [TYPE_REL, etype])
    #Make relative keys absolute against the vocabulary base
    data_full = [ ((vocabbase + k if not iri.is_absolute(k) else k), v) for (k, v) in data ]
    #Compact, order-preserving JSON serialization is the hash input
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    #NOTE(review): despite the name, this is True when eid was ALREADY seen
    #— confirm intent with consumers of 'first_seen'
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from; drain each plugin generator synchronously
        if BF_MATRES_TASK in plugin:
            for p in plugin[BF_MATRES_TASK](loop, output_model, params):
                pass
        #logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
def record_handler( loop, model, entbase=None, vocabbase=BL, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, transforms=TRANSFORMS, extra_transforms=default_extra_transforms(), canonical=False, **kwargs):
    '''
    Coroutine (generator) that consumes one MARC input model per `send()`
    and converts it to BIBFRAME statements in the shared output model.

    loop - asyncio event loop
    model - the Versa model for the record (output)
    entbase - base IRI used for IDs of generated entity resources
    vocabbase - base IRI for vocabulary terms
    limiting - mutable pair of [count, limit] used to control the number of records processed
    plugins - list of plugin dicts keyed by task constants (BF_INPUT_TASK, etc.)
    ids - hash-based ID generator (created from entbase when None)
    postprocess - optional no-arg callable invoked after each record
    out - optional stream to which the Versa model is serialized as JSON
    transforms - lookup table mapping MARC field/indicator/subfield keys to handler functions
    extra_transforms - object handling leader/006/007/008 processing
    canonical - if true, suppress incremental JSON output bracketing
    '''
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing
    plugins = plugins or []
    if ids is None: ids = idgen(entbase)
    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()

    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True
    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {'input_model': input_model, 'output_model': model, 'logger': logger, 'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids, 'existing_ids': existing_ids, 'plugins': plugins}
            workhash = record_hash_key(input_model)
            #NOTE(review): passes hash= as a keyword; the materialize_entity
            #visible in this file takes no such parameter — confirm this calls
            #the revision of materialize_entity contemporary with this handler
            workid = materialize_entity('Work', ctx_params=params, loop=loop, hash=workhash)
            #A "folded" work is one whose hash-based ID was already emitted
            is_folded = workid in existing_ids
            existing_ids.add(workid)
            control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
            dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
            logger.debug('Control code: {0}'.format(control_code[0]))
            logger.debug('Uniform title: {0}'.format(dumb_title[0]))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work' + workhash))
            if entbase:
                workid = I(iri.absolutize(workid, entbase))
            else:
                workid = I(workid)
            folded = [workid] if is_folded else []

            model.add(workid, TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params['workid'] = workid
            params['folded'] = folded

            #Figure out instances
            params['materialize_entity'] = materialize_entity
            instanceids = instancegen(params, loop, model)
            #NOTE(review): if instancegen returns an empty result, instanceid
            #is never bound and the extras dict below would raise NameError —
            #confirm instancegen always yields at least one ID
            if instanceids:
                instanceid = instanceids[0]

            params['leader'] = None
            params['workid'] = workid
            params['instanceids'] = instanceids
            params['folded'] = folded
            params['transforms'] = [] # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            field008 = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []

            #Prepare cross-references (i.e. 880s)
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            xrefs = {}  #NOTE(review): populated nowhere in this revision — apparently vestigial
            remove_links = set()
            add_links = []
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                tag = attribs['tag']
                for xref in attribs.get('6', []):
                    #$6 linkage value has the form "TAG-OCCURRENCE[/script]"
                    xref_parts = xref.split('-')
                    if len(xref_parts) < 2:
                        logger.debug('Invalid $6: {}'.format(xref_parts))
                        continue
                    xreftag, xrefid = xref_parts
                    #Locate the matching taglink
                    if tag == '880' and xrefid.startswith('00'):
                        #Special case, no actual xref, just the non-roman text
                        #Rule for 880s: merge in & add language indicator
                        langinfo = xrefid.split('/')[-1]
                        #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                        attribs['tag'] = xreftag
                        add_links.append((origin, MARCXML_NS + '/data/' + xreftag, val, attribs))
                    links = input_model.match(None, MARCXML_NS + '/data/' + xreftag)
                    for link in links:
                        #6 is the cross-reference subfield
                        for dest in link[ATTRIBUTES].get('6', []):
                            if [tag, xrefid] == dest.split('/')[0].split('-'):
                                if tag == '880':
                                    #880s will be handled by merger via xref, so take out for main loop
                                    #XXX: This does, however, make input_model no longer a true representation of the input XML. Problem?
                                    remove_links.add(lid)
                                if xreftag == '880':
                                    #Rule for 880s: merge in & add language indicator
                                    langinfo = dest.split('/')[-1]
                                    #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                                    remove_links.add(lid)
                                    copied_attribs = attribs.copy()
                                    #Merge the linked field's subfields, excluding tag/indicator attributes
                                    for k, v in link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(k, []).extend(v)
                                    add_links.append((origin, taglink, val, copied_attribs))

            for lid in remove_links:
                input_model.remove(lid)

            for linfo in add_links:
                input_model.add(*linfo)

            # hook for plugins interested in the input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            # need to sort our way through the input model so that the materializations occur
            # at the same place each time, otherwise canonicalization fails due to the
            # addition of the subfield context (at the end of materialize())
            for lid, marc_link in sorted(list(input_model), key=lambda x: int(x[0])):
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader':
                    params['leader'] = leader = val
                    continue
                #Sort out attributes: indX keys are indicators, the rest (minus tag) are subfields
                params['indicators'] = indicators = { k: v for k, v in attribs.items() if k.startswith('ind') }
                params['subfields'] = subfields = { k: v for k, v in attribs.items() if k[:3] not in ('tag', 'ind') }
                params['code'] = tag = attribs['tag']
                if taglink.startswith(MARCXML_NS + '/control'):
                    #No indicators on control fields. Turn them off, in effect
                    indicator_list = ('#', '#')
                    key = 'tag-' + tag
                    if tag == '006': params['fields006'].append(val)
                    if tag == '007': params['fields007'].append(val)
                    if tag == '008': params['field008'] = field008 = val
                    params['transforms'].append((tag, key))
                    params['fields_used'].append((tag,))
                elif taglink.startswith(MARCXML_NS + '/data'):
                    #Blank indicator characters are normalized to '#'
                    indicator_list = ((attribs.get('ind1') or ' ')[0].replace(' ', '#'), (attribs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + tag
                    #logger.debug('indicators: ', repr(indicators))
                    #indicator_list = (indicators['ind1'], indicators['ind2'])
                    params['fields_used'].append(tuple([tag] + list(subfields.keys())))

                #This is where we check each incoming MARC link to see if it matches a transform into an output link (e.g. renaming 001 to 'controlCode')
                to_process = []
                #Start with most specific matches, then to most general
                # "?" syntax in lookups is a single char wildcard
                #First with subfields, with & without indicators:
                for k, v in subfields.items():
                    #if indicator_list == ('#', '#'):
                    lookups = [
                        '{0}-{1}{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}-?{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}-{1}?${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}${1}'.format(tag, k),
                    ]
                    for valitems in v:
                        for lookup in lookups:
                            if lookup in transforms:
                                to_process.append((transforms[lookup], valitems))
                            else:
                                # don't report on subfields for which a code-transform exists,
                                # disregard wildcards
                                if not tag in transforms and '?' not in lookup:
                                    params['dropped_codes'].setdefault(lookup,0)
                                    params['dropped_codes'][lookup] += 1

                #Now just the tag, with & without indicators
                lookups = [
                    '{0}-{1}{2}'.format(tag, indicator_list[0], indicator_list[1]),
                    '{0}-?{2}'.format(tag, indicator_list[0], indicator_list[1]),
                    '{0}-{1}?'.format(tag, indicator_list[0], indicator_list[1]),
                    tag,
                ]

                #Remember how many lookups were successful based on subfields
                subfields_results_len = len(to_process)
                for lookup in lookups:
                    if lookup in transforms:
                        to_process.append((transforms[lookup], val))

                if subfields_results_len == len(to_process) and not subfields:
                    # Count as dropped if subfields were not processed and there were no matches on non-subfield lookups
                    params['dropped_codes'].setdefault(tag,0)
                    params['dropped_codes'][tag] += 1

                mat_ent = functools.partial(materialize_entity, ctx_params=params, loop=loop)
                #Apply all the handlers that were found
                #(note: val here rebinds the loop's field value on purpose —
                #each queued transform carries its own value)
                for funcinfo, val in to_process:
                    #Support multiple actions per lookup
                    funcs = funcinfo if isinstance(funcinfo, tuple) else (funcinfo,)

                    for func in funcs:
                        extras = { WORKID: workid, IID: instanceid }
                        #Build Versa processing context
                        #Should we include indicators?
                        #Should we be passing in taglink rather than tag?
                        ctx = bfcontext((origin, tag, val, subfields), input_model, model, extras=extras, base=vocabbase, idgen=mat_ent, existing_ids=existing_ids)
                        func(ctx)

                if not to_process:
                    #Nothing else has handled this data field; go to the fallback
                    fallback_rel_base = '../marcext/tag-' + tag
                    if not subfields:
                        #Fallback for control field: Captures MARC tag & value
                        model.add(I(workid), I(iri.absolutize(fallback_rel_base, vocabbase)), val)
                    for k, v in subfields.items():
                        #Fallback for data field: Captures MARC tag, indicators, subfields & value
                        #NOTE(review): fallback_rel_base already starts with
                        #'../marcext/', so this format doubles the prefix —
                        #confirm the resulting IRI resolves as intended
                        fallback_rel = '../marcext/{0}-{1}{2}-{3}'.format(
                            fallback_rel_base,
                            indicator_list[0].replace('#', 'X'),
                            indicator_list[1].replace('#', 'X'), k)
                        #params['transforms'].append((code, fallback_rel))
                        for valitem in v:
                            model.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), valitem)

            extra_stmts = set() # prevent duplicate statements
            #Process leader and control fields through the extra_transforms hooks
            for origin, k, v in itertools.chain(
                        extra_transforms.process_leader(params),
                        extra_transforms.process_006(fields006, params),
                        extra_transforms.process_007(fields007, params),
                        extra_transforms.process_008(field008, params)):
                v = v if isinstance(v, tuple) else (v,)
                for item in v:
                    o = origin or I(workid)
                    if (o,k,item) not in extra_stmts:
                        model.add(o, k, item)
                        extra_stmts.add((o, k, item))

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason seting to async task then immediately deferring to next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01)  #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical: first_record = False
                last_chunk = None
                #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                #Then again builds a big list in memory, so still working on opt here
                #The chunk juggling below strips the leading '[' and trailing ']'
                #so successive records concatenate into one top-level JSON array
                for chunk in json.JSONEncoder().iterencode([ link for link in model ]):
                    if last_chunk is None:
                        last_chunk = chunk[1:]
                    else:
                        out.write(last_chunk)
                        last_chunk = chunk
                if last_chunk: out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))

    if out and not canonical: out.write(']')

    #if not plugins: loop.stop()
    for plugin in plugins:
        #Each plug-in is a task
        func = plugin.get(BF_FINAL_TASK)
        if not func: continue
        task = asyncio.Task(func(loop), loop=loop)
        _final_tasks.add(task)

        def task_done(task):
            #print('Task done: ', task)
            _final_tasks.remove(task)
            #logger.debug((plugins))
            #if plugins and len(_final_tasks) == 0:
                #print("_final_tasks is empty, stopping loop.")
                #loop = asyncio.get_event_loop()
            #    loop.stop()

        #Once all the plug-in tasks are done, all the work is done
        task.add_done_callback(task_done)
    #print('DONE')
    #raise

    return
def record_handler(loop, relsink, entbase=None, vocabbase=BFZ, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    Coroutine (generator) that consumes one parsed MARC record per `send()`
    and converts it to BIBFRAME statements in relsink.

    loop - asyncio event loop
    relsink - the Versa model receiving generated statements
    entbase - base IRI used for IDs of generated entity resources
    vocabbase - base IRI for vocabulary terms
    limiting - mutable pair of [count, limit] used to control the number of records processed
    plugins - list of plugin dicts keyed by task constants
    ids - hash-based ID generator (created from entbase when None)
    postprocess - optional callable invoked with the record after each iteration
    out - stream to which the Versa model is serialized as JSON (required:
          written unconditionally in this revision)
    '''
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing
    plugins = plugins or []
    if ids is None: ids = idgen(entbase)
    logger.debug('GRIPPO: {0}'.format(repr(entbase)))
    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    initialize(hashidgen=ids, existing_ids=existing_ids)

    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True
    try:
        while True:
            rec = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            workhash = record_hash_key(rec)
            workid = ids.send('Work:' + workhash)
            existing_ids.add(workid)
            logger.debug('Uniform title from 245$a: {0}'.format(marc_lookup(rec, ['245$a'])))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work' + workhash))

            if entbase: workid = I(iri.absolutize(workid, entbase))
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params = {'workid': workid, 'rec': rec, 'logger': logger, 'model': relsink, 'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids, 'existing_ids': existing_ids}
            #Figure out instances
            instanceids = instancegen(params)
            #NOTE(review): if instancegen returns an empty result, instanceid
            #is never bound and its later uses raise NameError — confirm
            #instancegen always yields at least one ID
            if instanceids:
                instanceid = instanceids[0]

            params['instance_ids'] = instanceids
            params['transforms'] = [] # set()
            params['fields_used'] = []
            #Each row of the record is (row-type, ...) — LEADER, CONTROLFIELD or DATAFIELD
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]
                elif row[0] == CONTROLFIELD:
                    code, val = row[1], row[2]
                    key = 'tag-' + code
                    if code == '008': params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, vocabbase)), val)
                    params['fields_used'].append((code,))
                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1], row[2], row[3]
                    #xmlattrs carry the indicators; blanks are normalized to '#'
                    indicators = ((xmlattrs.get('ind1') or ' ')[0].replace(' ', '#'), (xmlattrs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + code

                    handled = False
                    params['subfields'] = subfields
                    params['indicators'] = indicators
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    #Build Versa processing context
                    to_process = []
                    #logger.debug(repr(indicators))
                    if indicators == ('#', '#'):
                        #No indicators set
                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS:
                                to_process.append((TRANSFORMS[lookup], v))
                        lookup = code
                        if lookup in TRANSFORMS:
                            to_process.append((TRANSFORMS[lookup], ''))
                    else:
                        #One or other indicators is set, so let's check the transforms against those
                        lookup = '{0}-{1}{2}'.format(*((code,) + indicators))
                        if lookup in TRANSFORMS:
                            to_process.append((TRANSFORMS[lookup], ''))
                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS:
                                to_process.append((TRANSFORMS[lookup], v))

                    #Apply all the handlers that were found
                    for func, val in to_process:
                        ctx = context(workid, [(workid, code, val, subfields)], relsink, base=vocabbase)
                        new_stmts = func(ctx, workid, instanceid)
                        #FIXME: Use add
                        for s in new_stmts:
                            relsink.add(*s)
                        #logger.debug('.')

                    if not to_process:
                        #Nothing else has handled this data field; go to the fallback
                        fallback_rel_base = 'tag-' + code
                        for k, v in subfields.items():
                            fallback_rel = fallback_rel_base + k
                            #params['transforms'].append((code, fallback_rel))
                            relsink.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), v)

                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            #NOTE(review): field008 is only bound when an 008 control field was
            #present in the record — a record without one raises NameError here
            for k, v in process_008(field008):
                special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #We get some repeated values out of leader & 008 processing, and we want to
            #remove dupes, so we work with sets then convert to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, vocabbase)), item)

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                yield from plugin[BF_MARCREC_TASK](loop, relsink, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason seting to async task then immediately deferring to next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01)  #Basically yield to next task

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
            #Then again builds a big list in memory, so still working on opt here
            #The chunk juggling strips the leading '[' and trailing ']' so
            #successive records concatenate into one top-level JSON array
            for chunk in json.JSONEncoder().iterencode([ link for link in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess(rec)
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
    out.write(']')

    if not plugins: loop.stop()
    for plugin in plugins:
        #Each plug-in is a task
        task = asyncio.Task(plugin[BF_FINAL_TASK](loop), loop=loop)
        _final_tasks.add(task)

        def task_done(task):
            #print('Task done: ', task)
            _final_tasks.remove(task)
            if len(_final_tasks) == 0:
                #print("_final_tasks is empty, stopping loop.")
                #loop = asyncio.get_event_loop()
                loop.stop()

        #Once all the plug-in tasks are done, all the work is done
        task.add_done_callback(task_done)
    #print('DONE')
    #raise

    return
def record_handler( loop, model, entbase=None, vocabbase=BL, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, transforms=TRANSFORMS, extra_transforms=default_extra_transforms(), canonical=False, **kwargs):
    '''
    Coroutine (generator) that consumes one MARC input model per `send()`
    and converts it to BIBFRAME statements in the shared output model. This
    revision performs a preliminary pass (WORK_HASH_TRANSFORMS) into a
    scratch model to derive the work hash, then runs the main transform
    pass via process_marcpatterns.

    loop - asyncio event loop
    model - the Versa model for the record (output)
    entbase - base IRI used for IDs of generated entity resources
    vocabbase - base IRI for vocabulary terms
    limiting - mutable pair of [count, limit] used to control the number of records processed
    plugins - list of plugin dicts keyed by task constants
    ids - hash-based ID generator (created from entbase when None)
    postprocess - optional no-arg callable invoked after each record
    out - optional stream to which the Versa model is serialized as JSON
    transforms - lookup table of MARC transform handlers (main phase)
    extra_transforms - object handling leader/006/007/008 processing
    canonical - if true, suppress incremental JSON output bracketing
    kwargs - may carry model_factory (defaults to memory.connection), used
             to build the scratch model for the work-hash pass
    '''
    model_factory = kwargs.get('model_factory', memory.connection)
    main_transforms = transforms
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing
    plugins = plugins or []
    if ids is None: ids = idgen(entbase)
    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()

    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {
                'input_model': input_model, 'output_model': model,
                'logger': logger, 'entbase': entbase, 'vocabbase': vocabbase,
                'ids': ids, 'existing_ids': existing_ids, 'plugins': plugins,
                'materialize_entity': materialize_entity, 'leader': leader,
                'loop': loop, 'extra_transforms': extra_transforms
            }

            # Earliest plugin stage, with an unadulterated input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            #Prepare cross-references (i.e. 880s)
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            xrefs = {}  #NOTE(review): populated nowhere in this revision — apparently vestigial
            remove_links = set()
            add_links = []
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                tag = attribs['tag']
                for xref in attribs.get('6', []):
                    #$6 linkage value has the form "TAG-OCCURRENCE[/script]"
                    xref_parts = xref.split('-')
                    if len(xref_parts) != 2:
                        control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                        dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                        logger.warning('Skipping invalid $6: "{}" for {}: "{}"'.format(xref, control_code[0], dumb_title[0]))
                        continue
                    xreftag, xrefid = xref_parts
                    #Locate the matching taglink
                    if tag == '880' and xrefid.startswith('00'):
                        #Special case, no actual xref, just the non-roman text
                        #Rule for 880s: merge in & add language indicator
                        langinfo = xrefid.split('/')[-1]
                        #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                        attribs['tag'] = xreftag
                        add_links.append((origin, MARCXML_NS + '/data/' + xreftag, val, attribs))
                    links = input_model.match(None, MARCXML_NS + '/data/' + xreftag)
                    for link in links:
                        #6 is the cross-reference subfield
                        for dest in link[ATTRIBUTES].get('6', []):
                            if [tag, xrefid] == dest.split('/')[0].split('-'):
                                if tag == '880':
                                    #880s will be handled by merger via xref, so take out for main loop
                                    #XXX: This does, however, make input_model no longer a true representation of the input XML. Problem?
                                    remove_links.add(lid)
                                if xreftag == '880':
                                    #Rule for 880s: merge in & add language indicator
                                    langinfo = dest.split('/')[-1]
                                    #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                                    remove_links.add(lid)
                                    copied_attribs = attribs.copy()
                                    #Merge the linked field's subfields, excluding tag/indicator attributes
                                    for k, v in link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(k, []).extend(v)
                                    add_links.append((origin, taglink, val, copied_attribs))

            input_model.remove(remove_links)
            input_model.add_many(add_links)

            # hook for plugins interested in the xref-resolved input model
            for plugin in plugins:
                if BF_INPUT_XREF_TASK in plugin:
                    yield from plugin[BF_INPUT_XREF_TASK](loop, input_model, params)

            #Do one pass to establish work hash
            #XXX Should crossrefs precede this?
            temp_workhash = next(params['input_model'].match())[ORIGIN]
            logger.debug('Temp work hash: {0}'.format(temp_workhash))

            #Scratch state for the work-hash pass: results go to a throwaway model
            params['workid'] = temp_workhash
            params['instanceids'] = [temp_workhash + '-instance']
            params['output_model'] = model_factory()
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []
            process_marcpatterns(params, WORK_HASH_TRANSFORMS, input_model, main_phase=False)

            workid_data = gather_workid_data(params['output_model'], temp_workhash)
            workid = materialize_entity('Work', ctx_params=params, loop=loop, data=workid_data)
            #A "folded" work is one whose hash-based ID was already emitted
            is_folded = workid in existing_ids
            existing_ids.add(workid)

            control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
            dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
            logger.debug('Work hash data: {0}'.format(repr(workid_data)))
            logger.debug('Control code: {0}'.format(control_code[0]))
            logger.debug('Uniform title: {0}'.format(dumb_title[0]))
            logger.debug('Work ID: {0}'.format(workid))

            workid = I(iri.absolutize(workid, entbase)) if entbase else I(workid)
            folded = [workid] if is_folded else []

            model.add(workid, TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params['workid'] = workid
            params['folded'] = folded

            #Switch to the main output model for processing
            params['output_model'] = model

            #Figure out instances
            instanceids = instancegen(params, loop, model)
            params['instanceids'] = instanceids or [None]

            #Reset per-record bookkeeping for the main transform pass
            params['transform_log'] = [] # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []
            process_marcpatterns(params, main_transforms, input_model, main_phase=True)

            #Relationships queued for postprocessing are skipped by the
            #generic instance postprocessor below
            skipped_rels = set()
            for op, rels, rid in params['to_postprocess']:
                for rel in rels:
                    skipped_rels.add(rel)
                if op == POSTPROCESS_AS_INSTANCE:
                    if params['instanceids'] == [None]:
                        params['instanceids'] = [rid]
                    else:
                        params['instanceids'].append(rid)
            instance_postprocess(params, skip_relationships=skipped_rels)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason seting to async task then immediately deferring to next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01)  #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical: first_record = False
                last_chunk = None
                #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                #Then again builds a big list in memory, so still working on opt here
                #The chunk juggling below strips the leading '[' and trailing ']'
                #so successive records concatenate into one top-level JSON array
                for chunk in json.JSONEncoder().iterencode([ link for link in model ]):
                    if last_chunk is None:
                        last_chunk = chunk[1:]
                    else:
                        out.write(last_chunk)
                        last_chunk = chunk
                if last_chunk: out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))

    if out and not canonical: out.write(']')

    #if not plugins: loop.stop()
    for plugin in plugins:
        #Each plug-in is a task
        func = plugin.get(BF_FINAL_TASK)
        if not func: continue
        task = asyncio.Task(func(loop), loop=loop)
        _final_tasks.add(task)

        def task_done(task):
            #print('Task done: ', task)
            _final_tasks.remove(task)
            #logger.debug((plugins))
            #if plugins and len(_final_tasks) == 0:
                #print("_final_tasks is empty, stopping loop.")
                #loop = asyncio.get_event_loop()
            #    loop.stop()

        #Once all the plug-in tasks are done, all the work is done
        task.add_done_callback(task_done)
    #print('DONE')
    #raise

    return
def record_handler(loop, model, entbase=None, vocabbase=BL, limiting=None,
                   plugins=None, ids=None, postprocess=None, out=None,
                   logger=logging, transforms=TRANSFORMS,
                   special_transforms=unused_flag, canonical=False,
                   model_factory=memory.connection, lookups=None, **kwargs):
    '''
    Coroutine/generator that consumes MARC input models (via send()) and
    materializes BIBFRAME resources into the output Versa model.

    loop - asyncio event loop
    model - the Versa model for the record
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    transforms - transform set (or, deprecated, a plain dict of transforms)
    special_transforms - deprecated; use transforms_set instead
    canonical - if true, suppress the JSON array streaming to `out`
    model_factory - factory for scratch models used during the bootstrap phase
    lookups - optional lookup tables made available to transforms via params
    '''
    #Deprecated legacy API support
    if isinstance(transforms, dict) or special_transforms is not unused_flag:
        warnings.warn('Please switch to using bibframe.transforms_set',
                      PendingDeprecationWarning)
        #FIX: the original `special_transforms or default_special_transforms()`
        #let the (truthy) unused_flag sentinel leak through when only
        #`transforms` was passed as a dict
        if special_transforms is unused_flag or not special_transforms:
            special_transforms = default_special_transforms()
        transforms = transform_set(transforms)
        transforms.specials = special_transforms

    _final_tasks = set()  #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)
    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()

    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            #Shared context passed to plugins and transform phases
            params = {
                'input_model': input_model,
                'output_model': model,
                'logger': logger,
                'entbase': entbase,
                'vocabbase': vocabbase,
                'ids': ids,
                'existing_ids': existing_ids,
                'plugins': plugins,
                'transforms': transforms,
                'materialize_entity': materialize_entity,
                'leader': leader,
                'lookups': lookups or {},
                'loop': loop,
            }

            # Earliest plugin stage, with an unadulterated input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            #Prepare cross-references (i.e. 880s)
            #See the "$6 - Linkage" section of https://www.loc.gov/marc/bibliographic/ecbdcntf.html
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            remove_links = set()
            add_links = []
            xref_link_tag_workaround = {}
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    #NOTE(review): this also stores 9XX field values as the leader;
                    #presumably only the /leader branch should assign — confirm
                    params['leader'] = leader = val
                    continue
                #XXX Do other fields with a 9 digit (not just 9XX) also need to be skipped?
                if taglink.startswith(MARCXML_NS + '/extra/') or 'tag' not in attribs:
                    continue
                this_tag = attribs['tag']
                #if this_tag == '100': import pdb; pdb.set_trace()
                for xref in attribs.get('6', []):
                    matched = LINKAGE_PAT.match(xref)
                    this_taglink, this_occ, this_scriptid, this_rtl = (
                        matched.groups() if matched else (None, None, None, None))
                    #FIX: guard referenced undefined `occ` (NameError); an
                    #unmatched $6 leaves every group None and must be skipped
                    #before the '/data/' + this_taglink concatenation below
                    if not this_taglink:
                        control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                        dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                        logger.warning('Skipping invalid $6: "{}" for {}: "{}"'.format(
                            xref, control_code[0], dumb_title[0]))
                        continue
                    if this_tag == this_taglink:
                        #Pretty sure this is an erroneous self-link, but we've seen this in the wild (e.g. QNL). Issue warning & do the best we can linking via occurrence
                        #Note: the resulting workround (lookup table from occurence code to the correct tag) will not work in cases of linking from any tag higher in ordinal value than 880 (if such a situation is even possible)
                        logger.warning('Invalid input: erroneous self-link $6: "{}" from "{}". Trying to work around.'.format(xref, this_tag))
                        if this_tag != '880':
                            xref_link_tag_workaround[this_occ] = this_tag
                    #FIXME: Remove this debugging if statament at some point
                    #FIX: referenced undefined `scriptid`/`rtl` (NameError)
                    if this_scriptid or this_rtl:
                        logger.debug('Language info specified in subfield 6, {}'.format(xref))

                    #Locate the matching taglink
                    if this_tag == '880' and this_occ == '00':
                        #Special case, no actual xref, used to separate scripts in a record (re Multiscript Records)
                        #FIXME: Not really handled right now. Presume some sort of merge dynamics will need to be implemented
                        attribs['tag'] = this_taglink
                        add_links.append((origin, MARCXML_NS + '/data/' + this_taglink, val, attribs))
                    if xref_link_tag_workaround:
                        if this_tag == '880':
                            this_taglink = xref_link_tag_workaround.get(this_occ)
                    links = input_model.match(None, MARCXML_NS + '/data/' + this_taglink)
                    for that_link in links:
                        #6 is the cross-reference subfield
                        #FIX: referenced undefined `link` (NameError)
                        for that_ref in that_link[ATTRIBUTES].get('6', []):
                            matched = LINKAGE_PAT.match(that_ref)
                            that_taglink, that_occ, that_scriptid, that_rtl = (
                                matched.groups() if matched else (None, None, None, None))
                            if ([that_taglink, that_occ] == [this_tag, this_occ]) or (
                                    xref_link_tag_workaround and that_occ == this_occ):
                                if this_tag == '880':
                                    #This is an 880, which we'll handle by integrating back into the input model using the correct tag, flagged to show the relationship
                                    remove_links.add(lid)
                                if that_taglink == '880':
                                    #Rule for 880s: duplicate but link more robustly
                                    copied_attribs = attribs.copy()
                                    for k, v in that_link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(k, []).extend(v)
                                    add_links.append((origin, MARCXML_NS + '/data/' + this_tag, val, copied_attribs))
            input_model.remove(remove_links)
            input_model.add_many(add_links)

            # hook for plugins interested in the xref-resolved input model
            for plugin in plugins:
                if BF_INPUT_XREF_TASK in plugin:
                    yield from plugin[BF_INPUT_XREF_TASK](loop, input_model, params)

            #Do one pass to establish work hash
            #XXX Should crossrefs precede this?
            #NOTE(review): raises if the input model is empty — confirm upstream guarantees at least one link
            bootstrap_dummy_id = next(params['input_model'].match())[ORIGIN]
            logger.debug('Entering bootstrap phase. Dummy ID: {}'.format(bootstrap_dummy_id))
            params['default-origin'] = bootstrap_dummy_id
            params['instanceids'] = [bootstrap_dummy_id + '-instance']
            #Bootstrap runs against a scratch output model so it doesn't pollute the real one
            params['output_model'] = model_factory()
            #NOTE(review): this also resets the local `leader` parsed above — confirm intended
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []
            params['origins'] = {WORK_TYPE: bootstrap_dummy_id,
                                 INSTANCE_TYPE: params['instanceids'][0]}

            #First apply special patterns for determining the main target resources
            curr_transforms = transforms.compiled[BOOTSTRAP_PHASE]
            ok = process_marcpatterns(params, curr_transforms, input_model, BOOTSTRAP_PHASE)
            if not ok: continue #Abort current record if signalled

            bootstrap_output = params['output_model']
            temp_main_target = main_type = None
            for o, r, t, a in bootstrap_output.match(None, PYBF_BOOTSTRAP_TARGET_REL):
                #FIXME: We need a better designed way of determining fallback to bib
                if t is not None:
                    temp_main_target, main_type = o, t

            #Switch to the main output model for processing
            params['output_model'] = model

            if temp_main_target is None:
                #If no target was set explicitly fall back to the transforms registered for the biblio phase
                workid_data = gather_workid_data(bootstrap_output, bootstrap_dummy_id)
                workid = materialize_entity('Work', ctx_params=params, data=workid_data, loop=loop)
                logger.debug('Entering default main phase, Work ID: {0}'.format(workid))
                is_folded = workid in existing_ids
                existing_ids.add(workid)

                control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                logger.debug('Work hash data: {0}'.format(repr(workid_data)))
                logger.debug('Control code: {0}'.format(control_code[0]))
                logger.debug('Uniform title: {0}'.format(dumb_title[0]))
                logger.debug('Work ID: {0}'.format(workid))

                workid = I(iri.absolutize(workid, entbase)) if entbase else I(workid)
                folded = [workid] if is_folded else []

                model.add(workid, VTYPE_REL, I(iri.absolutize('Work', vocabbase)))

                params['default-origin'] = workid
                params['folded'] = folded

                #Figure out instances
                instanceids = instancegen(params, loop, model)
                params['instanceids'] = instanceids or [None]
                main_transforms = transforms.compiled[DEFAULT_MAIN_PHASE]
                params['origins'] = {WORK_TYPE: workid,
                                     INSTANCE_TYPE: params['instanceids'][0]}
                phase_target = DEFAULT_MAIN_PHASE
            else:
                targetid_data = gather_targetid_data(bootstrap_output, temp_main_target,
                                                     transforms.orderings[main_type])
                targetid = materialize_entity(main_type, ctx_params=params,
                                              data=targetid_data, loop=loop)
                logger.debug('Entering specialized phase, Target resource ID: {}, type: {}'.format(targetid, main_type))
                is_folded = targetid in existing_ids
                existing_ids.add(targetid)
                #Determine next transform phase
                main_transforms = transforms.compiled[main_type]
                params['origins'] = {main_type: targetid}
                params['default-origin'] = targetid
                phase_target = main_type
                model.add(I(targetid), VTYPE_REL, I(main_type))

            params['transform_log'] = [] # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []
            ok = process_marcpatterns(params, main_transforms, input_model, phase_target)
            if not ok: continue #Abort current record if signalled

            #Apply deferred postprocessing ops gathered during the main phase
            skipped_rels = set()
            for op, rels, rid in params['to_postprocess']:
                for rel in rels:
                    skipped_rels.add(rel)
                if op == POSTPROCESS_AS_INSTANCE:
                    if params['instanceids'] == [None]:
                        params['instanceids'] = [rid]
                    else:
                        params['instanceids'].append(rid)
            instance_postprocess(params, skip_relationships=skipped_rels)

            logger.debug('+')

            #XXX At this point there must be at least one record with a Versa type
            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                #NOTE(review): asyncio.Task.all_tasks was removed in Python 3.9; use asyncio.all_tasks(loop)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason seting to async task then immediately deferring to next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical:
                    first_record = False
                    last_chunk = None
                    #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                    #Then again builds a big list in memory, so still working on opt here
                    for chunk in json.JSONEncoder().iterencode([link for link in model]):
                        if last_chunk is None:
                            #First chunk: hold it back with the leading '[' stripped
                            last_chunk = chunk[1:]
                        else:
                            out.write(last_chunk)
                            last_chunk = chunk
                    if last_chunk:
                        #Strip the trailing ']' so successive records join into one array
                        out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        #Driver closed the generator: finish the JSON array and launch final plugin tasks
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        if out and not canonical: out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            func = plugin.get(BF_FINAL_TASK)
            if not func: continue
            task = asyncio.Task(func(loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                #    loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
    #print('DONE')
    #raise
    return