Code Example #1
File: util.py Project: HexaPlant/pybibframe
def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None, addtype=True, loop=None, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data mapping
    according to the resource type. Implements the Libhub Resource Hash Convention.
    As a convenience, if a vocabulary base is provided, it is concatenated to etype and to the data keys

    data - list of key/value pairs used to compute the hash. If empty the hash will be a default for the entity type
            WARNING: THIS FUNCTION MANGLES THE data ARG
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    if addtype: data.insert(0, [TYPE_REL, etype])
    data_full =  [ ((vocabbase + k if not iri.is_absolute(k) else k), v) for (k, v) in data ]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            for p in plugin[BF_MATRES_TASK](loop, output_model, params): pass
        #logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
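
Note on the hashing step: the heart of the Libhub Resource Hash Convention above is that an entity's ID is a deterministic hash of its type plus its ordered key/value data, serialized to canonical JSON and pushed into an ID-generator coroutine via ids.send(plaintext). Here is a minimal sketch of that idea, using a toy simple_idgen coroutine and placeholder IRIs rather than pybibframe's actual idgen:

import hashlib
import json

def simple_idgen(base):
    #Toy stand-in for pybibframe's idgen: a primed coroutine mapping
    #canonical-JSON plaintext to a short, repeatable hash-based ID
    eid = None
    while True:
        plaintext = yield eid
        digest = hashlib.sha1(plaintext.encode('utf-8')).hexdigest()[:12]
        eid = (base or '') + digest

ids = simple_idgen('http://example.org/')
next(ids)  #Prime the coroutine so it can accept .send()

#Mirror materialize_entity: the type pair first, then the data pairs
data = [['http://example.org/vocab/type', 'Work'],
        ['http://example.org/vocab/title', 'Leaves of Grass']]
plaintext = json.dumps(data, separators=(',', ':'))
print(ids.send(plaintext))  #Same data in, same ID out, every run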
Code Example #2
File: marc.py Project: catwizard/pybibframe
def record_handler( loop, model, entbase=None, vocabbase=BL, limiting=None,
                    plugins=None, ids=None, postprocess=None, out=None,
                    logger=logging, transforms=TRANSFORMS,
                    extra_transforms=default_extra_transforms(),
                    canonical=False, **kwargs):
    '''
    loop - asyncio event loop
    model - the Versa model for the record
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {'input_model': input_model, 'output_model': model, 'logger': logger, 'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids, 'existing_ids': existing_ids, 'plugins': plugins}
            workhash = record_hash_key(input_model)
            workid = materialize_entity('Work', ctx_params=params, loop=loop, hash=workhash)
            is_folded = workid in existing_ids
            existing_ids.add(workid)
            control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
            dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
            logger.debug('Control code: {0}'.format(control_code[0]))
            logger.debug('Uniform title: {0}'.format(dumb_title[0]))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work' + workhash))

            if entbase:
                workid = I(iri.absolutize(workid, entbase))
            else:
                workid = I(workid)

            folded = [workid] if is_folded else []

            model.add(workid, TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params['workid'] = workid
            params['folded'] = folded

            #Figure out instances
            params['materialize_entity'] = materialize_entity
            instanceids = instancegen(params, loop, model)
            #Guard: instancegen may return nothing; keep instanceid defined
            instanceid = instanceids[0] if instanceids else None

            params['leader'] = None
            params['workid'] = workid
            params['instanceids'] = instanceids
            params['folded'] = folded
            params['transforms'] = [] # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            field008 = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            #Prepare cross-references (i.e. 880s)
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            xrefs = {}
            remove_links = set()
            add_links = []
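            #e.g. a romanized 245 carrying $6 '880-01' pairs with the 880 whose
            #own $6 points back at '245-01...': the pair is resolved by re-adding
            #the 245 with the 880's subfields merged in (add_links), while the
            #superseded links are queued for deletion (remove_links)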
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                tag = attribs['tag']
                for xref in attribs.get('6', []):
                    xref_parts = xref.split('-')
                    if len(xref_parts) < 2:
                        logger.debug('Invalid $6: {}'.format(xref_parts))
                        continue

                    xreftag, xrefid = xref_parts
                    #Locate the matching taglink
                    if tag == '880' and xrefid.startswith('00'):
                        #Special case, no actual xref, just the non-roman text
                        #Rule for 880s: merge in & add language indicator
                        langinfo = xrefid.split('/')[-1]
                        #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                        attribs['tag'] = xreftag
                        add_links.append((origin, MARCXML_NS + '/data/' + xreftag, val, attribs))

                    links = input_model.match(None, MARCXML_NS + '/data/' + xreftag)
                    for link in links:
                        #6 is the cross-reference subfield
                        for dest in link[ATTRIBUTES].get('6', []):
                            if [tag, xrefid] == dest.split('/')[0].split('-'):
                                if tag == '880':
                                    #880s will be handled by merger via xref, so take out for main loop
                                    #XXX: This does, however, make input_model no longer a true representation of the input XML. Problem?
                                    remove_links.add(lid)

                                if xreftag == '880':
                                    #Rule for 880s: merge in & add language indicator
                                    langinfo = dest.split('/')[-1]
                                    #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                                    remove_links.add(lid)
                                    copied_attribs = attribs.copy()
                                    for k, v in link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(k, []).extend(v)
                                    add_links.append((origin, taglink, val, copied_attribs))

            for lid in remove_links:
                input_model.remove(lid)

            for linfo in add_links:
                input_model.add(*linfo)

            # hook for plugins interested in the input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            # need to sort our way through the input model so that the materializations occur
            # at the same place each time, otherwise canonicalization fails due to the
            # addition of the subfield context (at the end of materialize())
            for lid, marc_link in sorted(list(input_model), key=lambda x: int(x[0])):
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader':
                    params['leader'] = leader = val
                    continue
                #Sort out attributes
                params['indicators'] = indicators = { k: v for k, v in attribs.items() if k.startswith('ind') }
                params['subfields'] = subfields = { k: v for k, v in attribs.items() if k[:3] not in ('tag', 'ind') }
                params['code'] = tag = attribs['tag']
                if taglink.startswith(MARCXML_NS + '/control'):
                    #No indicators on control fields. Turn them off, in effect
                    indicator_list = ('#', '#')
                    key = 'tag-' + tag
                    if tag == '006':
                        params['fields006'].append(val)
                    if tag == '007':
                        params['fields007'].append(val)
                    if tag == '008':
                        params['field008'] = field008 = val
                    params['transforms'].append((tag, key))
                    params['fields_used'].append((tag,))
                elif taglink.startswith(MARCXML_NS + '/data'):
                    indicator_list = ((attribs.get('ind1') or ' ')[0].replace(' ', '#'), (attribs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + tag
                    #logger.debug('indicators: ', repr(indicators))
                    #indicator_list = (indicators['ind1'], indicators['ind2'])
                    params['fields_used'].append(tuple([tag] + list(subfields.keys())))

                #This is where we check each incoming MARC link to see if it matches a transform into an output link (e.g. renaming 001 to 'controlCode')
                to_process = []
                #Start with most specific matches, then to most general

                # "?" syntax in lookups is a single char wildcard
                #First with subfields, with & without indicators:
                for k, v in subfields.items():
                    #if indicator_list == ('#', '#'):
                    lookups = [
                        '{0}-{1}{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}-?{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}-{1}?${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                        '{0}${1}'.format(tag, k),
                    ]
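                    #e.g. tag '245', indicators ('1', '0'), subfield 'a' tries,
                    #most specific first:
                    #  '245-10$a', '245-?0$a', '245-1?$a', '245$a'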
                    for valitems in v:
                        for lookup in lookups:
                            if lookup in transforms:
                                to_process.append((transforms[lookup], valitems))
                            else:
                                # don't report on subfields for which a code-transform exists,
                                # disregard wildcards
                                if tag not in transforms and '?' not in lookup:

                                    params['dropped_codes'].setdefault(lookup,0)
                                    params['dropped_codes'][lookup] += 1

                #Now just the tag, with & without indicators
                lookups = [
                    '{0}-{1}{2}'.format(tag, indicator_list[0], indicator_list[1]),
                    '{0}-?{2}'.format(tag, indicator_list[0], indicator_list[1]),
                    '{0}-{1}?'.format(tag, indicator_list[0], indicator_list[1]),
                    tag,
                ]

                #Remember how many lookups were successful based on subfields
                subfields_results_len = len(to_process)
                for lookup in lookups:
                    if lookup in transforms:
                        to_process.append((transforms[lookup], val))

                if subfields_results_len == len(to_process) and not subfields:
                    # Count as dropped if subfields were not processed and there were no matches on non-subfield lookups
                    params['dropped_codes'].setdefault(tag,0)
                    params['dropped_codes'][tag] += 1

                mat_ent = functools.partial(materialize_entity, ctx_params=params, loop=loop)
                #Apply all the handlers that were found
                for funcinfo, val in to_process:
                    #Support multiple actions per lookup
                    funcs = funcinfo if isinstance(funcinfo, tuple) else (funcinfo,)

                    for func in funcs:
                        extras = { WORKID: workid, IID: instanceid }
                        #Build Versa processing context
                        #Should we include indicators?
                        #Should we be passing in taglink rather than tag?
                        ctx = bfcontext((origin, tag, val, subfields), input_model, model, extras=extras, base=vocabbase, idgen=mat_ent, existing_ids=existing_ids)
                        func(ctx)

                if not to_process:
                    #Nothing else has handled this data field; go to the fallback
                    fallback_rel_base = '../marcext/tag-' + tag
                    if not subfields:
                        #Fallback for control field: Captures MARC tag & value
                        model.add(I(workid), I(iri.absolutize(fallback_rel_base, vocabbase)), val)
                    for k, v in subfields.items():
                        #Fallback for data field: Captures MARC tag, indicators, subfields & value
                        fallback_rel = '{0}-{1}{2}-{3}'.format(
                            fallback_rel_base, indicator_list[0].replace('#', 'X'),
                            indicator_list[1].replace('#', 'X'), k)
                        #params['transforms'].append((code, fallback_rel))
                        for valitem in v:
                            model.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), valitem)

            extra_stmts = set() # prevent duplicate statements
            for origin, k, v in itertools.chain(
                        extra_transforms.process_leader(params),
                        extra_transforms.process_006(fields006, params),
                        extra_transforms.process_007(fields007, params),
                        extra_transforms.process_008(field008, params)):
                v = v if isinstance(v, tuple) else (v,)
                for item in v:
                    o = origin or I(workid)
                    if (o,k,item) not in extra_stmts:
                        model.add(o, k, item)
                        extra_stmts.add((o, k, item))

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to an async task then immediately deferring to the next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical:
                    first_record = False
                    last_chunk = None
                    #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                    #Then again builds a big list in memory, so still working on opt here
                    for chunk in json.JSONEncoder().iterencode([ link for link in model ]):
                        if last_chunk is None:
                            last_chunk = chunk[1:]
                        else:
                            out.write(last_chunk)
                            last_chunk = chunk
                    if last_chunk: out.write(last_chunk[:-1])
            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        if out and not canonical: out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            func = plugin.get(BF_FINAL_TASK)
            if not func: continue
            task = asyncio.Task(func(loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                #    loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
        #print('DONE')
        #raise

    return
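
Usage note: record_handler is a push-style coroutine. The caller primes it with next(), feeds it one Versa input model per MARC record via send(), and finally calls close(), which raises GeneratorExit inside the handler and triggers the cleanup that writes the closing ']' and schedules the plug-ins' final tasks. A driver sketch under those assumptions (parse_records and the argument choices are placeholders, not pybibframe's public API):

import io

def drive(loop, model, parse_records, source):
    out = io.StringIO()
    limiting = [0, None]  #[records processed so far, max or None]
    handler = record_handler(loop, model, out=out, limiting=limiting)
    next(handler)  #Prime: run the coroutine up to its first `yield`
    try:
        for input_model in parse_records(source):
            try:
                handler.send(input_model)  #Push one record's model
            except StopIteration:
                break  #Handler returned, e.g. after hitting a record limit
    finally:
        handler.close()  #GeneratorExit -> handler writes ']' and finishes
    return out.getvalue()  #One JSON array covering all records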
Code Example #3
File: marc.py Project: dfeeney/pybibframe
def record_handler(loop, relsink, entbase=None, vocabbase=BFZ, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    loop - asyncio event loop
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing
    
    plugins = plugins or []
    if ids is None: ids = idgen(entbase)
    logger.debug('GRIPPO: {0}'.format(repr(entbase)))

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    initialize(hashidgen=ids, existing_ids=existing_ids)
    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True
    try:
        while True:
            rec = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            workhash = record_hash_key(rec)
            workid = ids.send('Work:' + workhash)
            existing_ids.add(workid)
            logger.debug('Uniform title from 245$a: {0}'.format(marc_lookup(rec, ['245$a'])))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work' + workhash))

            if entbase: workid = I(iri.absolutize(workid, entbase))
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params = {'workid': workid, 'rec': rec, 'logger': logger, 'model': relsink, 'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids, 'existing_ids': existing_ids}

            #Figure out instances
            instanceids = instancegen(params)
            if instanceids:
                instanceid = instanceids[0]

            params['instance_ids'] = instanceids
            params['transforms'] = [] # set()
            params['fields_used'] = []
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]
                elif row[0] == CONTROLFIELD:
                    code, val = row[1], row[2]
                    key = 'tag-' + code
                    if code == '008':
                        params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, vocabbase)), val)
                    params['fields_used'].append((code,))
                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1], row[2], row[3]
                    #xmlattrs include the indicators
                    indicators = ((xmlattrs.get('ind1') or ' ')[0].replace(' ', '#'), (xmlattrs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + code

                    handled = False
                    params['subfields'] = subfields
                    params['indicators'] = indicators
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    #Build Versa processing context

                    to_process = []
                    #logger.debug(repr(indicators))
                    if indicators == ('#', '#'):
                        #No indicators set
                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], v))

                        lookup = code
                        if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], ''))
                    else:
                        #One or other indicators is set, so let's check the transforms against those
                        lookup = '{0}-{1}{2}'.format(*((code,) + indicators))
                        if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], ''))

                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], v))

                    #Apply all the handlers that were found
                    for func, val in to_process:
                        ctx = context(workid, [(workid, code, val, subfields)], relsink, base=vocabbase)
                        new_stmts = func(ctx, workid, instanceid)
                        #FIXME: Use add
                        for s in new_stmts: relsink.add(*s)
                        #logger.debug('.')

                    if not to_process:
                        #Nothing else has handled this data field; go to the fallback
                        fallback_rel_base = 'tag-' + code
                        for k, v in subfields.items():
                            fallback_rel = fallback_rel_base + k
                            #params['transforms'].append((code, fallback_rel))
                            relsink.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), v)

                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            for k, v in process_008(field008):
                special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #We get some repeated values out of leader & 008 processing, so we
            #remove dupes by working with sets, then converting to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, vocabbase)), item)

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                yield from plugin[BF_MARCREC_TASK](loop, relsink, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to an async task then immediately deferring to the next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
            #Then again builds a big list in memory, so still working on opt here
            for chunk in json.JSONEncoder().iterencode([ link for link in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])
            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess(rec)
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        out.write(']')

        if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            task = asyncio.Task(plugin[BF_FINAL_TASK](loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                if len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                    loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
        #print('DONE')
        #raise

    return
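
A detail shared by all of these handlers is the last_chunk buffering around json.JSONEncoder().iterencode(): it streams the JSON for one record's links while trimming the opening '[' off the first chunk and withholding the closing ']' entirely, so each record's output can be spliced into the single top-level array that the handler opens and closes itself. The trick in isolation, as a self-contained sketch:

import io
import json

def write_json_array_body(items, out):
    #Stream json.dumps(items) minus its surrounding '[' and ']'
    last_chunk = None
    for chunk in json.JSONEncoder().iterencode(items):
        if last_chunk is None:
            last_chunk = chunk[1:]  #Drop the leading '['
        else:
            out.write(last_chunk)
            last_chunk = chunk
    if last_chunk:
        out.write(last_chunk[:-1])  #Drop the trailing ']'

out = io.StringIO()
out.write('[')
write_json_array_body([1, 2], out)
out.write(',\n')  #Separator between records, as in the handlers above
write_json_array_body([3, 4], out)
out.write(']')
print(out.getvalue())  #A single valid JSON array spanning both "records"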
Code Example #4
File: marc.py Project: anukat2015/pybibframe
def record_handler( loop, model, entbase=None, vocabbase=BL, limiting=None,
                    plugins=None, ids=None, postprocess=None, out=None,
                    logger=logging, transforms=TRANSFORMS,
                    extra_transforms=default_extra_transforms(),
                    canonical=False, **kwargs):
    '''
    loop - asyncio event loop
    model - the Versa model for the record
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    model_factory = kwargs.get('model_factory', memory.connection)
    main_transforms = transforms

    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {
                'input_model': input_model, 'output_model': model, 'logger': logger,
                'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids,
                'existing_ids': existing_ids, 'plugins': plugins,
                'materialize_entity': materialize_entity, 'leader': leader,
                'loop': loop, 'extra_transforms': extra_transforms
            }

            # Earliest plugin stage, with an unadulterated input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            #Prepare cross-references (i.e. 880s)
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            xrefs = {}
            remove_links = set()
            add_links = []

            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                tag = attribs['tag']
                for xref in attribs.get('6', []):
                    xref_parts = xref.split('-')
                    if len(xref_parts) != 2:
                        control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                        dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                        logger.warning('Skipping invalid $6: "{}" for {}: "{}"'.format(xref, control_code[0], dumb_title[0]))
                        continue

                    xreftag, xrefid = xref_parts
                    #Locate the matching taglink
                    if tag == '880' and xrefid.startswith('00'):
                        #Special case, no actual xref, just the non-roman text
                        #Rule for 880s: merge in & add language indicator
                        langinfo = xrefid.split('/')[-1]
                        #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                        attribs['tag'] = xreftag
                        add_links.append((origin, MARCXML_NS + '/data/' + xreftag, val, attribs))

                    links = input_model.match(None, MARCXML_NS + '/data/' + xreftag)
                    for link in links:
                        #6 is the cross-reference subfield
                        for dest in link[ATTRIBUTES].get('6', []):
                            if [tag, xrefid] == dest.split('/')[0].split('-'):
                                if tag == '880':
                                    #880s will be handled by merger via xref, so take out for main loop
                                    #XXX: This does, however, make input_model no longer a true representation of the input XML. Problem?
                                    remove_links.add(lid)

                                if xreftag == '880':
                                    #Rule for 880s: merge in & add language indicator
                                    langinfo = dest.split('/')[-1]
                                    #Not using langinfo, really, at present because it seems near useless. Eventually we can handle by embedding a lang indicator token into attr values for later postprocessing
                                    remove_links.add(lid)
                                    copied_attribs = attribs.copy()
                                    for k, v in link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(k, []).extend(v)
                                    add_links.append((origin, taglink, val, copied_attribs))

            input_model.remove(remove_links)
            input_model.add_many(add_links)

            # hook for plugins interested in the xref-resolved input model
            for plugin in plugins:
                if BF_INPUT_XREF_TASK in plugin:
                    yield from plugin[BF_INPUT_XREF_TASK](loop, input_model, params)

            #Do one pass to establish work hash
            #XXX Should crossrefs precede this?
            temp_workhash = next(params['input_model'].match())[ORIGIN]
            logger.debug('Temp work hash: {0}'.format(temp_workhash))

            params['workid'] = temp_workhash
            params['instanceids'] = [temp_workhash + '-instance']
            params['output_model'] = model_factory()

            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []

            process_marcpatterns(params, WORK_HASH_TRANSFORMS, input_model, main_phase=False)

            workid_data = gather_workid_data(params['output_model'], temp_workhash)
            workid = materialize_entity('Work', ctx_params=params, loop=loop, data=workid_data)

            is_folded = workid in existing_ids
            existing_ids.add(workid)

            control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
            dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
            logger.debug('Work hash data: {0}'.format(repr(workid_data)))
            logger.debug('Control code: {0}'.format(control_code[0]))
            logger.debug('Uniform title: {0}'.format(dumb_title[0]))
            logger.debug('Work ID: {0}'.format(workid))

            workid = I(iri.absolutize(workid, entbase)) if entbase else I(workid)
            folded = [workid] if is_folded else []

            model.add(workid, TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params['workid'] = workid
            params['folded'] = folded

            #Switch to the main output model for processing
            params['output_model'] = model

            #Figure out instances
            instanceids = instancegen(params, loop, model)

            params['instanceids'] = instanceids or [None]
            params['transform_log'] = [] # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []

            process_marcpatterns(params, main_transforms, input_model, main_phase=True)

            skipped_rels = set()
            for op, rels, rid in params['to_postprocess']:
                for rel in rels: skipped_rels.add(rel)
                if op == POSTPROCESS_AS_INSTANCE:
                    if params['instanceids'] == [None]:
                        params['instanceids'] = [rid]
                    else:
                        params['instanceids'].append(rid)
            instance_postprocess(params, skip_relationships=skipped_rels)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to an async task then immediately deferring to the next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical:
                    first_record = False
                    last_chunk = None
                    #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                    #Then again builds a big list in memory, so still working on opt here
                    for chunk in json.JSONEncoder().iterencode([ link for link in model ]):
                        if last_chunk is None:
                            last_chunk = chunk[1:]
                        else:
                            out.write(last_chunk)
                            last_chunk = chunk
                    if last_chunk: out.write(last_chunk[:-1])
            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        if out and not canonical: out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            func = plugin.get(BF_FINAL_TASK)
            if not func: continue
            task = asyncio.Task(func(loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                #    loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
        #print('DONE')
        #raise

    return
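
Example #4's main structural change over #2 is a two-pass design: the record is first run through WORK_HASH_TRANSFORMS into a scratch model from model_factory(), the identity-bearing statements are gathered from that scratch model, and only then is the hash-based Work ID materialized and the full transform pass run against the real output model. A schematic sketch of that control flow, with every name a placeholder for the corresponding pybibframe piece:

def two_pass_work_id(input_model, hash_transforms, main_transforms,
                     model_factory, materialize, process):
    #Pass 1: only the hash-relevant transforms, into a throwaway model
    scratch = model_factory()
    process(input_model, hash_transforms, output_model=scratch)

    #Gather the (relationship, value) pairs that define work identity,
    #sorted so the same record always produces the same hash input
    workid_data = sorted((rel, val) for _origin, rel, val, _attrs in scratch)

    #Materialize the stable, hash-based Work ID from that data
    workid = materialize('Work', data=workid_data)

    #Pass 2: the full transforms, now attached to the real Work ID
    output = model_factory()
    process(input_model, main_transforms, output_model=output, origin=workid)
    return workid, output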
Code Example #5
File: marc.py Project: greeve/pybibframe
def record_handler(loop,
                   model,
                   entbase=None,
                   vocabbase=BL,
                   limiting=None,
                   plugins=None,
                   ids=None,
                   postprocess=None,
                   out=None,
                   logger=logging,
                   transforms=TRANSFORMS,
                   special_transforms=unused_flag,
                   canonical=False,
                   model_factory=memory.connection,
                   lookups=None,
                   **kwargs):
    '''
    loop - asyncio event loop
    model - the Versa model for the record
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    #Deprecated legacy API support
    if isinstance(transforms, dict) or special_transforms is not unused_flag:
        warnings.warn('Please switch to using bibframe.transforms_set',
                      PendingDeprecationWarning)
        special_transforms = special_transforms or default_special_transforms()
        transforms = transform_set(transforms)
        transforms.specials = special_transforms

    _final_tasks = set()  #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None: ids = idgen(entbase)

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical: out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {
                'input_model': input_model,
                'logger': logger,
                #'input_model': input_model, 'output_model': model, 'logger': logger,
                'entbase': entbase,
                'vocabbase': vocabbase,
                'ids': ids,
                'existing_ids': existing_ids,
                'plugins': plugins,
                'transforms': transforms,
                'materialize_entity': materialize_entity,
                'leader': leader,
                'lookups': lookups or {},
                'loop': loop
            }

            # Earliest plugin stage, with an unadulterated input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            #Prepare cross-references (i.e. 880s)
            #See the "$6 - Linkage" section of https://www.loc.gov/marc/bibliographic/ecbdcntf.html
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            xrefs = {}
            remove_links = set()
            add_links = []

            xref_link_tag_workaround = {}
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                #XXX Do other fields with a 9 digit (not just 9XX) also need to be skipped?
                if taglink.startswith(MARCXML_NS + '/extra/') or 'tag' not in attribs:
                    continue
                this_tag = attribs['tag']
                #if this_tag == '100': import pdb; pdb.set_trace()
                for xref in attribs.get('6', []):
                    matched = LINKAGE_PAT.match(xref)
                    this_taglink, this_occ, this_scriptid, this_rtl = \
                        matched.groups() if matched else (None, None, None, None)
                    if not this_taglink and this_occ:
                        control_code = list(marc_lookup(
                            input_model, '001')) or ['NO 001 CONTROL CODE']
                        dumb_title = list(marc_lookup(
                            input_model, '245$a')) or ['NO 245$a TITLE']
                        logger.warning(
                            'Skipping invalid $6: "{}" for {}: "{}"'.format(
                                xref, control_code[0], dumb_title[0]))
                        continue

                    if this_tag == this_taglink:
                        #Pretty sure this is an erroneous self-link, but we've seen it in the wild (e.g. QNL). Issue a warning & do the best we can, linking via occurrence
                        #Note: the resulting workaround (a lookup table from occurrence code to the correct tag) will not work in cases of linking from any tag higher in ordinal value than 880 (if such a situation is even possible)
                        logger.warning(
                            'Invalid input: erroneous self-link $6: "{}" from "{}". Trying to work around.'
                            .format(xref, this_tag))
                        if this_tag != '880':
                            xref_link_tag_workaround[this_occ] = this_tag

                    #FIXME: Remove this debugging if statement at some point
                    if this_scriptid or this_rtl:
                        logger.debug(
                            'Language info specified in subfield 6, {}'.format(
                                xref))

                    #Locate the matching taglink
                    if this_tag == '880' and this_occ == '00':
                        #Special case, no actual xref, used to separate scripts in a record (re Multiscript Records)
                        #FIXME: Not really handled right now. Presume some sort of merge dynamics will need to be implemented
                        attribs['tag'] = this_taglink
                        add_links.append(
                            (origin, MARCXML_NS + '/data/' + this_taglink, val,
                             attribs))

                    if xref_link_tag_workaround:
                        if this_tag == '880':
                            this_taglink = xref_link_tag_workaround.get(
                                this_occ)

                    links = input_model.match(
                        None, MARCXML_NS + '/data/' + this_taglink)
                    for that_link in links:
                        #6 is the cross-reference subfield
                        for that_ref in that_link[ATTRIBUTES].get('6', []):
                            matched = LINKAGE_PAT.match(that_ref)
                            that_taglink, that_occ, that_scriptid, that_rtl = \
                                matched.groups() if matched else (None, None, None, None)
                            #if not that_tag and that_occ:
                            #    control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                            #    dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                            #    logger.warning('Skipping invalid $6: "{}" for {}: "{}"'.format(to_ref, control_code[0], dumb_title[0]))
                            #    continue
                            if ([that_taglink, that_occ] == [
                                    this_tag, this_occ
                            ]) or (xref_link_tag_workaround
                                   and that_occ == this_occ):
                                if this_tag == '880':
                                    #This is an 880, which we'll handle by integrating back into the input model using the correct tag, flagged to show the relationship
                                    remove_links.add(lid)

                                if that_taglink == '880':
                                    #Rule for 880s: duplicate but link more robustly
                                    copied_attribs = attribs.copy()
                                    for k, v in that_link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(
                                                k, []).extend(v)
                                    add_links.append(
                                        (origin,
                                         MARCXML_NS + '/data/' + this_tag, val,
                                         copied_attribs))

            input_model.remove(remove_links)
            input_model.add_many(add_links)

            # hook for plugins interested in the xref-resolved input model
            for plugin in plugins:
                if BF_INPUT_XREF_TASK in plugin:
                    yield from plugin[BF_INPUT_XREF_TASK](loop, input_model,
                                                          params)

            #Do one pass to establish work hash
            #XXX Should crossrefs precede this?
            bootstrap_dummy_id = next(params['input_model'].match())[ORIGIN]
            logger.debug('Entering bootstrap phase. Dummy ID: {}'.format(
                bootstrap_dummy_id))

            params['default-origin'] = bootstrap_dummy_id
            params['instanceids'] = [bootstrap_dummy_id + '-instance']
            params['output_model'] = model_factory()

            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []

            params['origins'] = {
                WORK_TYPE: bootstrap_dummy_id,
                INSTANCE_TYPE: params['instanceids'][0]
            }

            #First apply special patterns for determining the main target resources
            curr_transforms = transforms.compiled[BOOTSTRAP_PHASE]

            ok = process_marcpatterns(params, curr_transforms, input_model,
                                      BOOTSTRAP_PHASE)
            if not ok: continue  #Abort current record if signalled
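            #The bootstrap pass only decides what the main target resource is:
            #a transform can assert a PYBF_BOOTSTRAP_TARGET_REL statement naming
            #a specialized resource type; if none does, we fall back below to
            #the classic Work/Instance main phase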

            bootstrap_output = params['output_model']
            temp_main_target = main_type = None
            for o, r, t, a in bootstrap_output.match(
                    None, PYBF_BOOTSTRAP_TARGET_REL):
                #FIXME: We need a better designed way of determining fallback to bib
                if t is not None: temp_main_target, main_type = o, t

            #Switch to the main output model for processing
            params['output_model'] = model

            if temp_main_target is None:
                #If no target was set explicitly fall back to the transforms registered for the biblio phase
                #params['logger'].debug('WORK HASH ORIGIN {}\n'.format(bootstrap_dummy_id))
                #params['logger'].debug('WORK HASH MODEL {}\n'.format(repr(bootstrap_output)))
                workid_data = gather_workid_data(bootstrap_output,
                                                 bootstrap_dummy_id)
                workid = materialize_entity('Work',
                                            ctx_params=params,
                                            data=workid_data,
                                            loop=loop)
                logger.debug(
                    'Entering default main phase, Work ID: {0}'.format(workid))

                is_folded = workid in existing_ids
                existing_ids.add(workid)

                control_code = list(marc_lookup(
                    input_model, '001')) or ['NO 001 CONTROL CODE']
                dumb_title = list(marc_lookup(input_model,
                                              '245$a')) or ['NO 245$a TITLE']
                logger.debug('Work hash data: {0}'.format(repr(workid_data)))
                logger.debug('Control code: {0}'.format(control_code[0]))
                logger.debug('Uniform title: {0}'.format(dumb_title[0]))
                logger.debug('Work ID: {0}'.format(workid))

                workid = I(iri.absolutize(workid,
                                          entbase)) if entbase else I(workid)
                folded = [workid] if is_folded else []

                model.add(workid, VTYPE_REL,
                          I(iri.absolutize('Work', vocabbase)))

                params['default-origin'] = workid
                params['folded'] = folded

                #Figure out instances
                instanceids = instancegen(params, loop, model)
                params['instanceids'] = instanceids or [None]

                main_transforms = transforms.compiled[DEFAULT_MAIN_PHASE]
                params['origins'] = {
                    WORK_TYPE: workid,
                    INSTANCE_TYPE: params['instanceids'][0]
                }
                phase_target = DEFAULT_MAIN_PHASE
            else:
                targetid_data = gather_targetid_data(
                    bootstrap_output, temp_main_target,
                    transforms.orderings[main_type])
                #params['logger'].debug('Data for resource: {}\n'.format([main_type] + targetid_data))
                targetid = materialize_entity(main_type,
                                              ctx_params=params,
                                              data=targetid_data,
                                              loop=loop)
                logger.debug(
                    'Entering specialized phase, Target resource ID: {}, type: {}'
                    .format(targetid, main_type))

                is_folded = targetid in existing_ids
                existing_ids.add(targetid)
                #Determine next transform phase
                main_transforms = transforms.compiled[main_type]
                params['origins'] = {main_type: targetid}
                params['default-origin'] = targetid
                phase_target = main_type
                model.add(I(targetid), VTYPE_REL, I(main_type))

            params['transform_log'] = []  # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []

            ok = process_marcpatterns(params, main_transforms, input_model,
                                      phase_target)
            if not ok: continue  #Abort current record if signalled

            skipped_rels = set()
            for op, rels, rid in params['to_postprocess']:
                for rel in rels:
                    skipped_rels.add(rel)
                if op == POSTPROCESS_AS_INSTANCE:
                    if params['instanceids'] == [None]:
                        params['instanceids'] = [rid]
                    else:
                        params['instanceids'].append(rid)
            instance_postprocess(params, skip_relationships=skipped_rels)

            logger.debug('+')

            #XXX At this point there must be at least one record with a Versa type

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" %
                             asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to an async task then immediately deferring to the next task via yield from sleep leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record: out.write(',\n')
            if out:
                if not canonical:
                    first_record = False
                    last_chunk = None
                    #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                    #Then again builds a big list in memory, so still working on opt here
                    for chunk in json.JSONEncoder().iterencode(
                        [link for link in model]):
                        if last_chunk is None:
                            last_chunk = chunk[1:]
                        else:
                            out.write(last_chunk)
                            last_chunk = chunk
                    if last_chunk: out.write(last_chunk[:-1])
            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess()
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(
            limiting[0], '' if limiting[0] == 1 else 's'))
        if out and not canonical: out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            func = plugin.get(BF_FINAL_TASK)
            if not func: continue
            task = asyncio.Task(func(loop), loop=loop)
            _final_tasks.add(task)

            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                #print("_final_tasks is empty, stopping loop.")
                #loop = asyncio.get_event_loop()
                #    loop.stop()

            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
        #print('DONE')
        #raise

    return
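
Example #5's LINKAGE_PAT, used above to parse $6 values, is not shown in this listing, but per the LOC "Linkage" spec cited in the comments a $6 value is a three-digit linking tag, a hyphen, a two-digit occurrence number, then optionally /script-identification and /r for right-to-left fields. A sketch of a pattern with those four capture groups, assumed here for illustration rather than copied from pybibframe:

import re

#tag (3 digits) - occurrence (2 digits) [/script-id] [/r]
LINKAGE_PAT = re.compile(r'^(\d{3})-(\d{2})(?:/([^/]+))?(?:/(r))?$')

for raw in ('880-01', '100-01/(3', '245-02/$1/r', 'garbled'):
    matched = LINKAGE_PAT.match(raw)
    taglink, occ, scriptid, rtl = matched.groups() if matched else (None,) * 4
    print(raw, '->', taglink, occ, scriptid, rtl)
#880-01      -> 880 01 None None
#100-01/(3   -> 100 01 (3 None
#245-02/$1/r -> 245 02 $1 r
#garbled     -> None None None None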