def process_marcpatterns(params, transforms, input_model, main_phase=False):
    if main_phase:
        # Need to sort our way through the input model so that the materializations occur
        # at the same place each time, otherwise canonicalization fails due to the
        # addition of the subfield context (at the end of materialize())
        # XXX Is the int() cast necessary? If not we could do key=operator.itemgetter(0)
        input_model_iter = sorted(list(params['input_model']), key=lambda x: int(x[0]))
    else:
        input_model_iter = params['input_model']
    params['to_postprocess'] = []
    for lid, marc_link in input_model_iter:
        origin, taglink, val, attribs = marc_link
        if taglink == MARCXML_NS + '/leader':
            params['leader'] = leader = val
            continue

        #Sort out attributes
        params['indicators'] = indicators = {k: v for k, v in attribs.items() if k.startswith('ind')}
        params['subfields'] = subfields = attribs.copy() # preserve class
        for k in list(subfields.keys()):
            if k[:3] in ('tag', 'ind'):
                del subfields[k]
        params['code'] = tag = attribs['tag']

        if taglink.startswith(MARCXML_NS + '/control'):
            #No indicators on control fields. Turn them off, in effect
            indicator_list = ('#', '#')
            key = 'tag-' + tag
            if tag == '006': params['fields006'].append(val)
            if tag == '007': params['fields007'].append(val)
            if tag == '008': params['field008'] = val
            if main_phase:
                params['transform_log'].append((tag, key))
                params['fields_used'].append((tag,))
        elif taglink.startswith(MARCXML_NS + '/data'):
            indicator_list = ((attribs.get('ind1') or ' ')[0].replace(' ', '#'),
                              (attribs.get('ind2') or ' ')[0].replace(' ', '#'))
            key = 'tag-' + tag
            #logger.debug('indicators: ', repr(indicators))
            #indicator_list = (indicators['ind1'], indicators['ind2'])
            if main_phase:
                params['fields_used'].append(tuple([tag] + list(subfields.keys())))

        #This is where we check each incoming MARC link to see if it matches a
        #transform into an output link (e.g. renaming 001 to 'controlCode')
        to_process = []

        #Start with the most specific matches, then move to the most general
        # "?" syntax in lookups is a single-char wildcard
        #First with subfields, with & without indicators:
        for k, v in subfields.items():
            #if indicator_list == ('#', '#'):
            lookups = [
                '{0}-{1}{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                '{0}-?{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                '{0}-{1}?${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                '{0}${1}'.format(tag, k),
            ]
            for valitems in v:
                for lookup in lookups:
                    if lookup in transforms:
                        to_process.append((transforms[lookup], valitems))
                    else:
                        # don't report on subfields for which a code-transform exists,
                        # disregard wildcards
                        if main_phase and tag not in transforms and '?' not in lookup:
                            params['dropped_codes'].setdefault(lookup, 0)
                            params['dropped_codes'][lookup] += 1

        #Now just the tag, with & without indicators
        lookups = [
            '{0}-{1}{2}'.format(tag, indicator_list[0], indicator_list[1]),
            '{0}-?{2}'.format(tag, indicator_list[0], indicator_list[1]),
            '{0}-{1}?'.format(tag, indicator_list[0], indicator_list[1]),
            tag,
        ]

        #Remember how many lookups were successful based on subfields
        subfields_results_len = len(to_process)
        for lookup in lookups:
            if lookup in transforms:
                to_process.append((transforms[lookup], val))

        if main_phase and subfields_results_len == len(to_process) and not subfields:
            # Count as dropped if subfields were not processed and there were
            # no matches on non-subfield lookups
            params['dropped_codes'].setdefault(tag, 0)
            params['dropped_codes'][tag] += 1

        mat_ent = functools.partial(materialize_entity, ctx_params=params, loop=params['loop'])

        #Apply all the handlers that were found
        for funcinfo, val in to_process:
            #Support multiple actions per lookup
            funcs = funcinfo if isinstance(funcinfo, tuple) else (funcinfo,)
            for func in funcs:
                extras = {
                    WORKID: params['workid'],
                    IID: params['instanceids'][0],
                    'indicators': indicators,
                    'logger': params['logger'],
                    'postprocessing': [],
                    'inputns': MARC,
                }
                #Build Versa processing context
                #Should we include indicators?
                #Should we be passing in taglink rather than tag?
                ctx = bfcontext((origin, tag, val, subfields), input_model,
                                params['output_model'], extras=extras,
                                base=params['vocabbase'], idgen=mat_ent,
                                existing_ids=params['existing_ids'])
                func(ctx)
                params['to_postprocess'].extend(ctx.extras['postprocessing'])

        if main_phase and not to_process:
            #Nothing else has handled this data field; go to the fallback
            fallback_rel_base = '../marcext/tag-' + tag
            if not subfields:
                #Fallback for control field: Captures MARC tag & value
                params['output_model'].add(I(params['workid']),
                                           I(iri.absolutize(fallback_rel_base, params['vocabbase'])),
                                           val)
            for k, v in subfields.items():
                #Fallback for data field: Captures MARC tag, indicators, subfields & value
                fallback_rel = '../marcext/{0}-{1}{2}-{3}'.format(
                    fallback_rel_base, indicator_list[0].replace('#', 'X'),
                    indicator_list[1].replace('#', 'X'), k)
                #params['transform_log'].append((code, fallback_rel))
                for valitem in v:
                    try:
                        params['output_model'].add(I(params['workid']),
                                                   I(iri.absolutize(fallback_rel, params['vocabbase'])),
                                                   valitem)
                    except ValueError as e:
                        control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                        dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                        params['logger'].warning('{}\nSkipping statement for {}: "{}"'.format(
                            e, control_code[0], dumb_title[0]))

    extra_stmts = set() # prevent duplicate statements
    extra_transforms = params['extra_transforms']
    for origin, k, v in itertools.chain(
            extra_transforms.process_leader(params),
            extra_transforms.process_006(params['fields006'], params),
            extra_transforms.process_007(params['fields007'], params),
            extra_transforms.process_008(params['field008'], params)):
        v = v if isinstance(v, tuple) else (v,)
        for item in v:
            o = origin or I(params['workid'])
            if o and (o, k, item) not in extra_stmts:
                params['output_model'].add(o, k, item)
                extra_stmts.add((o, k, item))
    return
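
# Illustrative sketch (not part of the original module): the shape of a `transforms`
# mapping as consumed by process_marcpatterns() above. Lookup keys are tried from
# most specific to most general: 'TAG-I1I2$SUB' (both indicators plus subfield, with
# '#' standing in for a blank indicator), then the '?' single-character indicator
# wildcard forms, then 'TAG$SUB', and finally the tag-level forms down to bare 'TAG'.
# The handler signature is assumed from the `func(ctx)` call above; `_noop_transform`
# and `_DEMO_TRANSFORMS` are hypothetical names for illustration only.
def _noop_transform(ctx):
    '''Do-nothing handler; a real transform would add links to the output model via ctx.'''
    pass

_DEMO_TRANSFORMS = {
    '245$a': _noop_transform,        # any 245 $a, regardless of indicators
    '100-1#': _noop_transform,       # 100 field with ind1='1' and ind2 blank
    '600-?0$a': (_noop_transform,),  # ind1 wildcard; a tuple applies several handlers
}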
def record_handler(loop, model, entbase=None, vocabbase=BL, limiting=None,
                   plugins=None, ids=None, postprocess=None, out=None,
                   logger=logging, transforms=TRANSFORMS,
                   extra_transforms=default_extra_transforms(),
                   canonical=False, **kwargs):
    '''
    loop - asyncio event loop
    model - the Versa model for the record
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()

    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None

            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {'input_model': input_model, 'output_model': model,
                      'logger': logger, 'entbase': entbase, 'vocabbase': vocabbase,
                      'ids': ids, 'existing_ids': existing_ids, 'plugins': plugins}
            workhash = record_hash_key(input_model)
            workid = materialize_entity('Work', ctx_params=params, loop=loop, hash=workhash)
            is_folded = workid in existing_ids
            existing_ids.add(workid)

            control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
            dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
            logger.debug('Control code: {0}'.format(control_code[0]))
            logger.debug('Uniform title: {0}'.format(dumb_title[0]))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work' + workhash))

            if entbase:
                workid = I(iri.absolutize(workid, entbase))
            else:
                workid = I(workid)
            folded = [workid] if is_folded else []

            model.add(workid, TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params['workid'] = workid
            params['folded'] = folded

            #Figure out instances
            params['materialize_entity'] = materialize_entity
            instanceids = instancegen(params, loop, model)
            if instanceids:
                instanceid = instanceids[0]

            params['leader'] = None
            params['workid'] = workid
            params['instanceids'] = instanceids
            params['folded'] = folded
            params['transforms'] = [] # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}

            #Defensive coding against missing leader or 008
            field008 = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []

            #Prepare cross-references (i.e. 880s)
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            xrefs = {}
            remove_links = set()
            add_links = []
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #9XX fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                tag = attribs['tag']
                for xref in attribs.get('6', []):
                    xref_parts = xref.split('-')
                    if len(xref_parts) < 2:
                        logger.debug('Invalid $6: {}'.format(xref_parts))
                        continue

                    xreftag, xrefid = xref_parts
                    #Locate the matching taglink
                    if tag == '880' and xrefid.startswith('00'):
                        #Special case, no actual xref, just the non-roman text
                        #Rule for 880s: merge in & add language indicator
                        langinfo = xrefid.split('/')[-1]
                        #Not using langinfo, really, at present because it seems near useless.
                        #Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                        attribs['tag'] = xreftag
                        add_links.append((origin, MARCXML_NS + '/data/' + xreftag, val, attribs))

                    links = input_model.match(None, MARCXML_NS + '/data/' + xreftag)
                    for link in links:
                        #6 is the cross-reference subfield
                        for dest in link[ATTRIBUTES].get('6', []):
                            if [tag, xrefid] == dest.split('/')[0].split('-'):
                                if tag == '880':
                                    #880s will be handled by merger via xref, so take out for main loop
                                    #XXX: This does, however, make input_model no longer a true representation of the input XML. Problem?
                                    remove_links.add(lid)

                                if xreftag == '880':
                                    #Rule for 880s: merge in & add language indicator
                                    langinfo = dest.split('/')[-1]
                                    #Not using langinfo, really, at present because it seems near useless.
                                    #Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                                    remove_links.add(lid)
                                    copied_attribs = attribs.copy()
                                    for k, v in link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(k, []).extend(v)
                                    add_links.append((origin, taglink, val, copied_attribs))

            for lid in remove_links:
                input_model.remove(lid)

            for linfo in add_links:
                input_model.add(*linfo)

            # hook for plugins interested in the input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            # need to sort our way through the input model so that the materializations occur
            # at the same place each time, otherwise canonicalization fails due to the
            # addition of the subfield context (at the end of materialize())
            for lid, marc_link in sorted(list(input_model), key=lambda x: int(x[0])):
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader':
                    params['leader'] = leader = val
                    continue

                #Sort out attributes
                params['indicators'] = indicators = {k: v for k, v in attribs.items() if k.startswith('ind')}
                params['subfields'] = subfields = {k: v for k, v in attribs.items() if k[:3] not in ('tag', 'ind')}
                params['code'] = tag = attribs['tag']

                if taglink.startswith(MARCXML_NS + '/control'):
                    #No indicators on control fields. Turn them off, in effect
                    indicator_list = ('#', '#')
                    key = 'tag-' + tag
                    if tag == '006': params['fields006'].append(val)
                    if tag == '007': params['fields007'].append(val)
                    if tag == '008': params['field008'] = field008 = val
                    params['transforms'].append((tag, key))
                    params['fields_used'].append((tag,))
                elif taglink.startswith(MARCXML_NS + '/data'):
                    indicator_list = ((attribs.get('ind1') or ' ')[0].replace(' ', '#'),
                                      (attribs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + tag
                    #logger.debug('indicators: ', repr(indicators))
                    #indicator_list = (indicators['ind1'], indicators['ind2'])
                    params['fields_used'].append(tuple([tag] + list(subfields.keys())))

                #This is where we check each incoming MARC link to see if it matches a
                #transform into an output link (e.g. renaming 001 to 'controlCode')
                to_process = []

                #Start with the most specific matches, then move to the most general
                # "?" syntax in lookups is a single-char wildcard
                #First with subfields, with & without indicators:
                for k, v in subfields.items():
                    #if indicator_list == ('#', '#'):
                    lookups = [
                        '{0}-{1}{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}-?{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}-{1}?${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}${1}'.format(tag, k),
                    ]
                    for valitems in v:
                        for lookup in lookups:
                            if lookup in transforms:
                                to_process.append((transforms[lookup], valitems))
                            else:
                                # don't report on subfields for which a code-transform exists,
                                # disregard wildcards
                                if tag not in transforms and '?' not in lookup:
                                    params['dropped_codes'].setdefault(lookup, 0)
                                    params['dropped_codes'][lookup] += 1

                #Now just the tag, with & without indicators
                lookups = [
                    '{0}-{1}{2}'.format(tag, indicator_list[0], indicator_list[1]),
                    '{0}-?{2}'.format(tag, indicator_list[0], indicator_list[1]),
                    '{0}-{1}?'.format(tag, indicator_list[0], indicator_list[1]),
                    tag,
                ]

                #Remember how many lookups were successful based on subfields
                subfields_results_len = len(to_process)
                for lookup in lookups:
                    if lookup in transforms:
                        to_process.append((transforms[lookup], val))

                if subfields_results_len == len(to_process) and not subfields:
                    # Count as dropped if subfields were not processed and there were
                    # no matches on non-subfield lookups
                    params['dropped_codes'].setdefault(tag, 0)
                    params['dropped_codes'][tag] += 1

                mat_ent = functools.partial(materialize_entity, ctx_params=params, loop=loop)

                #Apply all the handlers that were found
                for funcinfo, val in to_process:
                    #Support multiple actions per lookup
                    funcs = funcinfo if isinstance(funcinfo, tuple) else (funcinfo,)
                    for func in funcs:
                        extras = {WORKID: workid, IID: instanceid}
                        #Build Versa processing context
                        #Should we include indicators?
                        #Should we be passing in taglink rather than tag?
                        ctx = bfcontext((origin, tag, val, subfields), input_model, model,
                                        extras=extras, base=vocabbase, idgen=mat_ent,
                                        existing_ids=existing_ids)
                        func(ctx)

                if not to_process:
                    #Nothing else has handled this data field; go to the fallback
                    fallback_rel_base = '../marcext/tag-' + tag
                    if not subfields:
                        #Fallback for control field: Captures MARC tag & value
                        model.add(I(workid), I(iri.absolutize(fallback_rel_base, vocabbase)), val)
                    for k, v in subfields.items():
                        #Fallback for data field: Captures MARC tag, indicators, subfields & value
                        fallback_rel = '../marcext/{0}-{1}{2}-{3}'.format(
                            fallback_rel_base, indicator_list[0].replace('#', 'X'),
                            indicator_list[1].replace('#', 'X'), k)
                        #params['transforms'].append((code, fallback_rel))
                        for valitem in v:
                            model.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), valitem)

            extra_stmts = set() # prevent duplicate statements
            for origin, k, v in itertools.chain(
                    extra_transforms.process_leader(params),
                    extra_transforms.process_006(fields006, params),
                    extra_transforms.process_007(fields007, params),
                    extra_transforms.process_008(field008, params)):
                v = v if isinstance(v, tuple) else (v,)
                for item in v:
                    o = origin or I(workid)
                    if (o, k, item) not in extra_stmts:
                        model.add(o, k, item)
                        extra_stmts.add((o, k, item))

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to async task then immediately deferring to next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01)  #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical: first_record = False
                last_chunk = None
                #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                #Then again it builds a big list in memory, so still working on optimization here
                for chunk in json.JSONEncoder().iterencode([ link for link in model ]):
                    if last_chunk is None:
                        last_chunk = chunk[1:]
                    else:
                        out.write(last_chunk)
                        last_chunk = chunk
                if last_chunk: out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        if out and not canonical: out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            func = plugin.get(BF_FINAL_TASK)
            if not func: continue
            task = asyncio.Task(func(loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                    #loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
    #print('DONE')
    #raise

    return
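
# A minimal driving sketch (illustrative, not from the original source): record_handler
# above is a plain generator coroutine, so the caller primes it with next(), send()s one
# input model per MARC record, and close()s it to trigger the GeneratorExit bookkeeping.
# `record_models` and `versa_model` are hypothetical stand-ins for whatever produces the
# per-record input models and the output Versa model in the real pipeline.
import sys
import asyncio

def demo_drive(record_models, versa_model):
    loop = asyncio.new_event_loop()
    limiting = [0, 10]  # mutable [count, limit] pair, per the docstring above
    handler = record_handler(loop, versa_model, limiting=limiting, out=sys.stdout)
    next(handler)  # advance to the first `input_model = yield`
    for input_model in record_models:
        try:
            handler.send(input_model)
        except StopIteration:
            break  # the record limit was reached and the generator returned
    handler.close()  # raises GeneratorExit inside the handler, closing the JSON output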
def record_handler(loop, relsink, entbase=None, vocabbase=BFZ, limiting=None,
                   plugins=None, ids=None, postprocess=None, out=None,
                   logger=logging, **kwargs):
    '''
    loop - asyncio event loop
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    initialize(hashidgen=ids, existing_ids=existing_ids)

    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True

    try:
        while True:
            rec = yield
            leader = None

            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            workhash = record_hash_key(rec)
            workid = ids.send('Work:' + workhash)
            existing_ids.add(workid)
            logger.debug('Uniform title from 245$a: {0}'.format(marc_lookup(rec, ['245$a'])))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work' + workhash))

            if entbase: workid = I(iri.absolutize(workid, entbase))
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params = {'workid': workid, 'rec': rec, 'logger': logger, 'model': relsink,
                      'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids,
                      'existing_ids': existing_ids}

            #Figure out instances
            instanceids = instancegen(params)
            if instanceids:
                instanceid = instanceids[0]

            params['instanceids'] = instanceids
            params['transforms'] = [] # set()
            params['fields_used'] = []
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]
                elif row[0] == CONTROLFIELD:
                    code, val = row[1], row[2]
                    key = 'tag-' + code
                    if code == '008': params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, vocabbase)), val)
                    params['fields_used'].append((code,))
                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1], row[2], row[3]
                    #xmlattrs include the indicators
                    indicators = ((xmlattrs.get('ind1') or ' ')[0].replace(' ', '#'),
                                  (xmlattrs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + code

                    handled = False
                    params['subfields'] = subfields
                    params['indicators'] = indicators
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    to_process = []
                    #logger.debug(repr(indicators))
                    if indicators != ('#', '#'):
                        #One or other indicator is set, so let's check the transforms against those
                        lookup = '{0}-{1}{2}'.format(*((code,) + indicators))

                    for k, v in subfields.items():
                        lookup = '{0}${1}'.format(code, k)
                        if lookup in TRANSFORMS:
                            to_process.append((TRANSFORMS[lookup], v))

                    if code in TRANSFORMS:
                        to_process.append((TRANSFORMS[code], ''))

                    #if code == '100':
                    #    logger.debug(to_process)
                    #Apply all the handlers that were found
                    for func, val in to_process:
                        #Build Versa processing context
                        ctx = bfcontext(workid, code, [(workid, code, val, subfields)],
                                        relsink, base=vocabbase, hashidgen=ids,
                                        existing_ids=existing_ids)
                        new_stmts = func(ctx, workid, instanceid)
                        #FIXME: Use add
                        for s in new_stmts:
                            relsink.add(*s)
                        #logger.debug('.')

                    if not to_process:
                        #Nothing else has handled this data field; go to the fallback
                        fallback_rel_base = 'tag-' + code
                        for k, v in subfields.items():
                            fallback_rel = fallback_rel_base + k
                            #params['transforms'].append((code, fallback_rel))
                            relsink.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), v)

                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            for k, v in process_008(field008):
                special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #We get some repeated values out of leader & 008 processing, and we want to
            #remove dupes, so we work with sets then convert to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, vocabbase)), item)

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                yield from plugin[BF_MARCREC_TASK](loop, relsink, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to async task then immediately deferring to next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01)  #Basically yield to next task

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
            #Then again it builds a big list in memory, so still working on optimization here
            for chunk in json.JSONEncoder().iterencode([ link for link in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess(rec)
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            task = asyncio.Task(plugin[BF_FINAL_TASK](loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                    #loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
    #print('DONE')
    #raise

    return
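
# A hypothetical sketch (not the real idgen) of the generator protocol the legacy
# handler above relies on: idgen(entbase) returns a generator, and each
# ids.send('Work:' + workhash) style call yields back a new resource ID. Here the
# ID is just a truncated hash of the key; the real implementation differs.
import hashlib

def demo_idgen(entbase):
    response = None
    while True:
        key = yield response  # receives e.g. 'Work:<hash>' from ids.send(...)
        response = hashlib.sha1(key.encode('utf-8')).hexdigest()[:16]

# Usage: ids = demo_idgen(entbase); next(ids); workid = ids.send('Work:' + workhash)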