Example #1
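Both examples below use names the snippet never imports (json, asyncio, itertools, I, iri, plus project tables such as BFZ, TYPE_REL, TRANSFORMS, and MATERIALIZE). A hedged sketch of the likely imports follows; the third-party module paths are educated guesses from the identifiers used, and the project-local names would come from the surrounding package.

import json
import logging
import asyncio
import itertools

#Assumed third-party imports; module paths are guesses from the identifiers used
from versa import I          #IRI-typed string used to mark resource IDs
from amara3 import iri       #provides iri.absolutize(relative, base)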
def record_handler(loop, relsink, entbase=None, vocabbase=BFZ, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    loop - asyncio event loop
    relsink - Versa model/sink that accumulates the output statements
    entbase - base IRI used for IDs of generated entity resources
    vocabbase - base IRI used for vocabulary terms (defaults to BFZ)
    limiting - mutable pair of [count, limit] used to control the number of records processed
    plugins - sequence of plugin descriptors providing per-record and final task entry points
    ids - ID generator coroutine; defaults to idgen(entbase)
    postprocess - optional callable invoked with each record after processing
    out - file-like object to which the JSON representation of the model is written
    logger - logger object for diagnostics
    '''
    _final_tasks = set() #Tasks for the event loop contributing to the MARC processing
    
    plugins = plugins or []
    if ids is None: ids = idgen(entbase)
    if limiting is None: limiting = [0, None] #Track the running count even when no limit is set
    logger.debug('Entity base: {0}'.format(repr(entbase)))

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()
    initialize(hashidgen=ids, existing_ids=existing_ids)
    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True
    try:
        while True:
            rec = yield
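            #Records are pushed into this coroutine via send(); see the driver sketch after this example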
            leader = None
            field008 = None #Guard: a record might lack an 008 control field
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            workhash = record_hash_key(rec)
            workid = ids.send('Work:' + workhash)
            existing_ids.add(workid)
            logger.debug('Uniform title from 245$a: {0}'.format(marc_lookup(rec, ['245$a'])))
            logger.debug('Work hash result: {0} from \'{1}\''.format(workid, 'Work:' + workhash))

            if entbase: workid = I(iri.absolutize(workid, entbase))
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', vocabbase)))

            params = {'workid': workid, 'rec': rec, 'logger': logger, 'model': relsink, 'entbase': entbase, 'vocabbase': vocabbase, 'ids': ids, 'existing_ids': existing_ids}

            #Figure out instances
            instanceids = instancegen(params)
            #isbn_instancegen is expected to yield at least one instance ID; guard just in case
            instanceid = instanceids[0] if instanceids else None

            params['instance_ids'] = instanceids
            params['transforms'] = [] # set()
            params['fields_used'] = []
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]
                elif row[0] == CONTROLFIELD:
                    code, val = row[1], row[2]
                    key = 'tag-' + code
                    if code == '008':
                        params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, vocabbase)), val)
                    params['fields_used'].append((code,))
                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1], row[2], row[3]
                    #xmlattrs includes the indicators
                    #Normalize blank indicators to '#' (the conventional MARC notation for blank)
                    indicators = ((xmlattrs.get('ind1') or ' ')[0].replace(' ', '#'), (xmlattrs.get('ind2') or ' ')[0].replace(' ', '#'))
                    key = 'tag-' + code

                    handled = False #Vestigial in this variant; superseded by the to_process check below
                    params['subfields'] = subfields
                    params['indicators'] = indicators
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    #Build Versa processing context

                    to_process = []
                    #logger.debug(repr(indicators))
                    if indicators == ('#', '#'):
                        #No indicators set
                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], v))

                        lookup = code
                        if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], ''))
                    else:
                        #One or other indicators is set, so let's check the transforms against those
                        lookup = '{0}-{1}{2}'.format(*((code,) + indicators))
                        if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], ''))

                        for k, v in subfields.items():
                            lookup = '{0}${1}'.format(code, k)
                            if lookup in TRANSFORMS: to_process.append((TRANSFORMS[lookup], v))

                    #Apply all the handlers that were found
                    for func, val in to_process:
                        ctx = context(workid, [(workid, code, val, subfields)], relsink, base=vocabbase)
                        new_stmts = func(ctx, workid, instanceid)
                        #FIXME: Use add
                        for s in new_stmts: relsink.add(*s)
                        #logger.debug('.')

                    if not to_process:
                        #Nothing else has handled this data field; go to the fallback
                        fallback_rel_base = 'tag-' + code
                        for k, v in subfields.items():
                            fallback_rel = fallback_rel_base + k
                            #params['transforms'].append((code, fallback_rel))
                            relsink.add(I(workid), I(iri.absolutize(fallback_rel, vocabbase)), v)

                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            if field008:
                for k, v in process_008(field008):
                    special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #Leader & 008 processing emit some repeated values; de-dupe by collecting
            #into sets, then convert to lists for JSON serialization
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, vocabbase)), item)

            instance_postprocess(params)

            logger.debug('+')

            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                yield from plugin[BF_MARCREC_TASK](loop, relsink, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired cooperative scheduling approach
                #For some reason setting it up as an async task and immediately deferring to the next task via yield from sleep leads to the "yield from wasn't used with future" error (not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building one big JSON string in memory, though the
            #list built from relsink below is still fully materialized; serialization is still being optimized
            #The leading '[' of the first chunk and the trailing ']' of the last chunk are
            #stripped (chunk[1:] / last_chunk[:-1]) so each record's links land inside the
            #single outer array opened at the top of this coroutine
            for chunk in json.JSONEncoder().iterencode([ link for link in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])
            #FIXME: Postprocessing should probably be a task too
            if postprocess: postprocess(rec)
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
        out.write(']')

        if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            task = asyncio.Task(plugin[BF_FINAL_TASK](loop), loop=loop)
            _final_tasks.add(task)
            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                if len(_final_tasks) == 0:
                    #print("_final_tasks is empty, stopping loop.")
                    #loop = asyncio.get_event_loop()
                    loop.stop()
            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)
        #print('DONE')
        #raise

    return
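
A minimal driver sketch for the coroutine above. Everything here other than record_handler itself is illustrative: make_model and parsed_records stand in for whatever Versa model factory and MARC parser feed the handler in the real project.

#Hypothetical driver for Example #1; make_model and parsed_records are placeholders
import sys
import asyncio

loop = asyncio.get_event_loop()
relsink = make_model()                    #placeholder: Versa model to accumulate statements
handler = record_handler(loop, relsink, entbase='http://example.org/',
                         limiting=[0, 10], out=sys.stdout)
next(handler)                             #prime the coroutine to its first yield
for rec in parsed_records:                #placeholder: iterable of parsed MARC row lists
    handler.send(rec)
handler.close()                           #raises GeneratorExit inside the handler, which writes the closing ']'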
Example #2
def record_handler(relsink, idbase, limiting=None, plugins=None, ids=None, postprocess=None, out=None, logger=logging, **kwargs):
    '''
    relsink - Versa model/sink that accumulates the output statements
    idbase - base IRI used for IDs of generated resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    plugins - sequence of plugin coroutines, each receiving the params dict per record
    ids - ID generator; defaults to idgen(idbase)
    postprocess - optional callable invoked with each record after processing
    out - file-like object to which the JSON representation of the model is written
    logger - logger object for diagnostics
    '''
    plugins = plugins or []
    if ids is None: ids = idgen(idbase)
    if limiting is None: limiting = [0, None] #Track the running count even when no limit is set
    #FIXME: Use thread local storage rather than function attributes
    T_prior_materializedids = set() #Materialized resource IDs already seen (presumably a function attribute in the original)

    #A few code modularization functions pulled into local context as closures
    def process_materialization(lookup, subfields, code=None):
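        #Materialized resources get hash-based IDs derived from their subfields, so
        #identical field content across records converges on the same resource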
        materializedid = hashid(idbase, tuple(subfields.items()))
        #The extra_props are parameters inherent to a particular MARC field/subfield for purposes of linked data representation
        if code is None: code = lookup
        (subst, extra_props) = MATERIALIZE[lookup]
        if RESOURCE_TYPE in extra_props:
            relsink.add(I(materializedid), TYPE_REL, I(iri.absolutize(extra_props[RESOURCE_TYPE], BFZ)))
        #logger.debug((lookup, subfields, extra_props))

        if materializedid not in T_prior_materializedids:
            #Just bundle in the subfields as they are, to avoid throwing out data. They can be otherwise used or just stripped later on
            #for k, v in itertools.chain((('marccode', code),), subfields.items(), extra_props.items()):
            for k, v in itertools.chain(subfields.items(), extra_props.items()):
                if k == RESOURCE_TYPE: continue
                fieldname = 'subfield-' + k
                if code + k in FIELD_RENAMINGS:
                    fieldname = FIELD_RENAMINGS[code + k]
                    if len(k) == 1: params['transforms'].append((code + k, fieldname)) #Only if proper MARC subfield
                    #params['transforms'].append((code + k, FIELD_RENAMINGS.get(sflookup, sflookup)))
                relsink.add(I(materializedid), iri.absolutize(fieldname, BFZ), v)
            T_prior_materializedids.add(materializedid)

        return materializedid, subst


    #FIXME: test correct MARC transforms info for annotations
    def process_annotation(anntype, subfields, extra_annotation_props):
        #Separate annotation subfields from object subfields
        object_subfields = subfields.copy()
        annotation_subfields = {}
        for k, v in subfields.items():
            if code + k in ANNOTATIONS_FIELDS:
                annotation_subfields[k] = v
                del object_subfields[k]
            params['transforms'].append((code + k, code + k))

        #objectid = next(idg)
        #object_props.update(object_subfields)

        annotationid = next(ids)
        relsink.add(I(annotationid), TYPE_REL, I(iri.absolutize(anntype, BFZ)))
        for k, v in itertools.chain(annotation_subfields.items(), extra_annotation_props.items()):
            relsink.add(I(annotationid), I(iri.absolutize(k, BFZ)), v)

        #Return enough info to generate the main subject/object relationship. The annotation is taken care of at this point
        return annotationid, object_subfields

    #Start the process of writing out the JSON representation of the resulting Versa
    out.write('[')
    first_record = True
    try:
        while True:
            rec = yield
            #for plugin in plugins:
            #    plugin.send(dict(rec=rec))
            leader = None
            field008 = None #Guard: a record might lack an 008 control field
            #Add work item record
            workid = next(ids)
            relsink.add(I(workid), TYPE_REL, I(iri.absolutize('Work', BFZ)))
            instanceid = next(ids)
            #logger.debug((workid, instanceid))
            params = {'workid': workid, 'model': relsink}

            relsink.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', BFZ)))
            #relsink.add((instanceid, iri.absolutize('leader', PROPBASE), leader))
            #Instances are added below
            #relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))

            #for service in g_services: service.send(NEW_RECORD, relsink, workid, instanceid)

            params['transforms'] = [] # set()
            params['fields_used'] = []
            for row in rec:
                code = None

                if row[0] == LEADER:
                    params['leader'] = leader = row[1]
                elif row[0] == CONTROLFIELD:
                    code, val = row[1].strip(), row[2]
                    key = 'tag-' + code
                    if code == '008':
                        params['field008'] = field008 = val
                    params['transforms'].append((code, key))
                    relsink.add(I(instanceid), I(iri.absolutize(key, BFZ)), val)
                    params['fields_used'].append((code,))
                elif row[0] == DATAFIELD:
                    code, xmlattrs, subfields = row[1].strip(), row[2], row[3]
                    key = 'tag-' + code

                    handled = False
                    subfields = dict(( (sf[0].strip(), sf[1]) for sf in subfields ))
                    params['subfields'] = subfields
                    params['fields_used'].append(tuple([code] + list(subfields.keys())))

                    if subfields:
                        lookup = code
                        #See if any of the field codes represents a reference to an object which can be materialized

                        if code in MATERIALIZE:
                            materializedid, subst = process_materialization(code, subfields)
                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            params['transforms'].append((code, subst))
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))
                            logger.debug('.')
                            handled = True

                        if code in MATERIALIZE_VIA_ANNOTATION:
                            #FIXME: code comments for extra_object_props & extra_annotation_props
                            (subst, anntype, extra_annotation_props) = MATERIALIZE_VIA_ANNOTATION[code]
                            annotationid, object_subfields = process_annotation(anntype, subfields, extra_annotation_props)

                            subject = instanceid if code in INSTANCE_FIELDS else workid
                            objectid = next(ids)
                            params['transforms'].append((code, subst))
                            relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(objectid), {I(iri.absolutize('annotation', BFZ)): I(annotationid)})

                            for k, v in itertools.chain((('marccode', code),), object_subfields.items()):
                            #for k, v in itertools.chain(('marccode', code), object_subfields.items(), extra_object_props.items()):
                                relsink.add(I(objectid), I(iri.absolutize(k, BFZ)), v)

                            logger.debug('.')
                            handled = True

                        #See if any of the field+subfield codes represents a reference to an object which can be materialized
                        if not handled:
                            for k, v in subfields.items():
                                lookup = code + k
                                if lookup in MATERIALIZE:
                                    #XXX At first glance you'd think you can always derive code from lookup (e.g. lookup[:3]), but what if e.g. someone trims the left zero fill on the codes in the serialization?
                                    materializedid, subst = process_materialization(lookup, subfields, code=code)
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    params['transforms'].append((lookup, subst))
                                    relsink.add(I(subject), I(iri.absolutize(subst, BFZ)), I(materializedid))

                                    #Is the MARC code part of the hash computation for the materialized object ID? Surely not!
                                    #materializedid = hashid((code,) + tuple(subfields.items()))
                                    logger.debug('.')
                                    handled = True

                                else:
                                    field_name = 'tag-' + lookup
                                    if lookup in FIELD_RENAMINGS:
                                        field_name = FIELD_RENAMINGS[lookup]
                                    #Handle the simple field_name substitution of a label name for a MARC code
                                    subject = instanceid if code in INSTANCE_FIELDS else workid
                                    #logger.debug(repr(I(iri.absolutize(field_name, BFZ))))
                                    params['transforms'].append((lookup, field_name))
                                    relsink.add(I(subject), I(iri.absolutize(field_name, BFZ)), v)

                    #print >> sys.stderr, lookup, key
                    #if val:
                    #    subject = instanceid if code in INSTANCE_FIELDS else workid
                    #    relsink.add(I(subject), I(iri.absolutize(key, BFZ)), val)

                params['code'] = code

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            if field008:
                for k, v in process_008(field008):
                    special_properties.setdefault(k, set()).add(v)
            params['special_properties'] = special_properties

            #Leader & 008 processing emit some repeated values; de-dupe by collecting
            #into sets, then convert to lists for JSON serialization
            for k, v in special_properties.items():
                special_properties[k] = list(v)
                for item in v:
                    #logger.debug(v)
                    relsink.add(I(instanceid), I(iri.absolutize(k, BFZ)), item)


            #reduce lists of just one item
            #for k, v in work_item.items():
            #    if type(v) is list and len(v) == 1:
            #        work_item[k] = v[0]
            #work_sink.send(work_item)


            #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
            ISBN_FIELD = 'tag-020'
            isbn_stmts = relsink.match(subj=instanceid, pred=iri.absolutize(ISBN_FIELD, BFZ))
            isbns = [ s[2] for s in isbn_stmts ]
            logger.debug('ISBNS: {0}'.format(list(isbn_list(isbns))))
            other_instance_ids = []
            subscript = ord('a') #Only referenced by the commented-out id-suffixing below
            newid = None
            for subix, (inum, itype) in enumerate(isbn_list(isbns)):
                #print >> sys.stderr, subix, inum, itype
                newid = next(ids)
                duplicate_statements(relsink, instanceid, newid)
                relsink.add(I(newid), I(iri.absolutize('isbn', BFZ)), inum)
                #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '')
                if itype: relsink.add(I(newid), I(iri.absolutize('isbnType', BFZ)), itype)
                other_instance_ids.append(newid)

            if not other_instance_ids:
                #Make sure it's created as an instance even if it has no ISBN
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(instanceid))
                params.setdefault('instanceids', []).append(instanceid)

            for iid in other_instance_ids:
                relsink.add(I(workid), I(iri.absolutize('hasInstance', BFZ)), I(iid))
                params.setdefault('instanceids', []).append(iid)

            #if newid is None: #No ISBN specified
            #    send_instance(ninst)

            #ix += 1
            logger.debug('+')

            for plugin in plugins:
                plugin.send(params)

            #Can't really use this because it includes the outer []
            #jsondump(relsink, out)

            if not first_record: out.write(',\n')
            first_record = False
            last_chunk = None
            #Using iterencode avoids building one big JSON string in memory, though the
            #list built from relsink below is still fully materialized; serialization is still being optimized
            #chunk[1:] / last_chunk[:-1] strip the inner '[' and ']' so each record's
            #statements land inside the single outer array opened at the top of this coroutine
            for chunk in json.JSONEncoder().iterencode([ stmt for stmt in relsink ]):
                if last_chunk is None:
                    last_chunk = chunk[1:]
                else:
                    out.write(last_chunk)
                    last_chunk = chunk
            if last_chunk: out.write(last_chunk[:-1])
            if postprocess: postprocess(rec)
            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
        logger.debug('Completed processing {0} record{1}.'.format(limiting[0], '' if limiting[0] == 1 else 's'))
    except GeneratorExit:
        out.write(']')
    return
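
The second variant is driven the same way, minus the event loop. Again, only record_handler comes from the example; the other names are placeholders.

#Hypothetical driver for Example #2; make_model and parsed_records are placeholders
import sys

relsink = make_model()                    #placeholder: Versa model to accumulate statements
handler = record_handler(relsink, 'http://example.org/', limiting=[0, None],
                         out=sys.stdout)
next(handler)                             #prime the coroutine
for rec in parsed_records:                #placeholder: iterable of parsed MARC rows
    handler.send(rec)
handler.close()                           #GeneratorExit triggers the closing ']'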