def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, logger=logging):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    config - configuration information
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}  #Guard against the None default before config.get below
    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None:
            rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            ))
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]  #Mutable pair: [records processed so far, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting,
                                   plugins=plugins, ids=ids, postprocess=postprocess,
                                   out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None:
        rdfttl.write(g.serialize(format="turtle"))

    for plugin in plugins:
        plugin.close()
    return
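
#A minimal usage sketch for the version above (an illustration, not part of the
#original module). The file names 'records.mrx' and 'records.ttl' and the base
#IRI are hypothetical; the Turtle stream is opened in binary mode on the
#assumption that the rdflib of this code's era returns bytes from Graph.serialize().
with open('records.mrx', 'rb') as marcxml, open('records.ttl', 'wb') as ttl:
    bfconvert([marcxml], base='http://example.org/', rdfttl=ttl, limit=10)
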
def bfconvert(inputs=None, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, mods=None):
    '''
    inputs - One or more MARC/XML files to be parsed and converted to BIBFRAME RDF
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - file where RDF Turtle output should be written
    config - file containing config in JSON format
    stats - file where statistics output should be written in JSON format
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    mods - Python modules to be imported in order to register plugins.
    verbose - If true show additional messages and information (default: False)
    '''
    if config is None:
        config = {}
    else:
        config = json.load(config)

    logger = logging.getLogger('marc2bfrdf')
    if verbose:
        logger.setLevel(logging.DEBUG)

    for mod in mods or []:
        __import__(mod, globals(), locals(), [])

    from bibframe import g_services

    #if stats:
    #    register_service(statsgen.statshandler)

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None:
            rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            ))
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting,
                                   plugins=plugins, ids=ids, postprocess=postprocess,
                                   out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None:
        rdfttl.write(g.serialize(format="turtle"))

    for plugin in plugins:
        plugin.close()
    return
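
#A hedged sketch of driving the version above, which reads its config as JSON
#from an open file and imports the plugin-registration modules named in mods.
#The file names and the module name 'myplugins' are hypothetical; a real plugin
#id must match a key registered in bibframe.g_services. plugin-config.json
#might contain: {"plugins": [{"id": "http://example.org/myplugin"}]}
with open('plugin-config.json') as conf, \
        open('records.mrx', 'rb') as marcxml, \
        open('records.ttl', 'wb') as ttl:
    bfconvert(inputs=[marcxml], base='http://example.org/', rdfttl=ttl,
              config=conf, mods=['myplugins'], verbose=True)
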
def bfconvert(inputs, handle_marc_source=handle_marcxml_source, entbase=None, model=None,
              out=None, limit=None, rdfttl=None, rdfxml=None, xml=None, config=None,
              verbose=False, logger=logging, loop=None, canonical=False, lax=False,
              defaultsourcetype=inputsourcetype.unknown):
    '''
    inputs - One or more open file-like objects, strings with MARC content, filenames, or IRIs. If a filename or IRI, it's a good idea to indicate this via the defaultsourcetype parameter
    handle_marc_source - Function to turn a source of MARC data (e.g. XML or JSON) into the internal format for processing
    entbase - Base IRI to be used for creating resources.
    model - model instance for internal use
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML output should be written
    xml - stream to where XML output (a 'bibframe' document) should be written
    config - configuration information
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    loop - optional asyncio event loop to use
    canonical - output Versa's canonical form?
    lax - If True signal to the handle_marc_source function that relaxed syntax rules should be applied (e.g. accept XML with namespace problems)
    defaultsourcetype - Signal indicating how best to interpret inputs to create an inputsource
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    def resolve_class(fullname):
        '''
        Given a full name for a Python class, return the class object
        '''
        import importlib
        modpath, name = fullname.rsplit('.', 1)
        module = importlib.import_module(modpath)
        cls = getattr(module, name)
        return cls

    attr_cls = resolve_class(config.get('versa-attr-cls', 'builtins.dict'))
    attr_list_cls = resolve_class(config.get('versa-attr-list-cls', 'builtins.list'))
    #attr_ordered_cls = resolve_class(config.get('versa-attr-cls', 'collections.OrderedDict'))

    model_factory = functools.partial(memory.connection, attr_cls=attr_cls) #,logger=logger)
    model_factory.attr_list_cls = attr_list_cls
    #model_odict_factory = functools.partial(memory.connection, attr_cls=attr_ordered_cls) #,logger=logger)
    #model_odict_factory.attr_list_cls = attr_list_cls

    if 'marc_record_handler' in config:
        handle_marc_source = AVAILABLE_MARC_HANDLERS[config['marc_record_handler']]

    readmode = handle_marc_source.readmode
    #inputs = ( inputsource(open(i, readmode)) for i in inputs )
    #if not isinstance(inputs[0], inputsource):
    #    inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )
    if handle_marc_source.makeinputsource:
        inputs = factory(inputs, defaultsourcetype=defaultsourcetype, streamopenmode=readmode)
    #inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )

    ids = marc.idgen(entbase)
    if model is None:
        model = model_factory()
    g = rdflib.Graph()
    #Intentionally not using either factory
    if canonical:
        global_model = memory.connection() #logger=logger)
    if xml is not None:
        xmlw = writer.raw(xml, indent=' ')
        xmlw.start_element('bibframe')

    extant_resources = None
    #extant_resources = set()
    def postprocess():
        #No need to bother with Versa -> RDF translation if we were not asked to generate RDF
        if any((rdfttl, rdfxml)):
            rdf.process(model, g, to_ignore=extant_resources, logger=logger)
        if canonical:
            global_model.add_many([(o, r, t, a) for (rid, (o, r, t, a)) in model])
        if xml is not None:
            microxml.process(model, xmlw, to_ignore=extant_resources, logger=logger)
        model.create_space()

    #Set up event loop if not provided
    if not loop:
        loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get('vocab-base-uri', BL)

    transform_iris = config.get('transforms', {})
    if transform_iris:
        transforms = {}
        for tiri in transform_iris:
            try:
                transforms.update(AVAILABLE_TRANSFORMS[tiri])
            except KeyError:
                raise Exception('Unknown transforms set {0}'.format(tiri))
    else:
        transforms = TRANSFORMS

    marcextras_vocab = config.get('marcextras-vocab')

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get('plugins', []):
        try:
            pinfo = g_services[pc['id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception('Unknown plugin {0}'.format(pc['id']))

    limiting = [0, limit]
    #logger=logger,

    #raise(Exception(repr(inputs)))
    for source in inputs:
        #Wrap the parse operation to make it a task in the event loop
        @asyncio.coroutine
        def wrap_task(source=source):  #Bind source now; the tasks run after the loop advances
            sink = marc.record_handler(
                loop, model, entbase=entbase, vocabbase=vb, limiting=limiting,
                plugins=plugins, ids=ids, postprocess=postprocess, out=out,
                logger=logger, transforms=transforms,
                extra_transforms=extra_transforms(marcextras_vocab),
                canonical=canonical, model_factory=model_factory)
            args = dict(lax=lax)
            handle_marc_source(source, sink, args, logger, model_factory)
            sink.close()
            yield
        task = asyncio.async(wrap_task(), loop=loop)

    #Run after all tasks are scheduled; the loop can only be closed once
    try:
        loop.run_until_complete(task)
    except Exception as ex:
        raise ex
    finally:
        loop.close()

    if canonical:
        out.write(repr(global_model))

    if vb == BFZ:
        g.bind('bf', BFNS)
        g.bind('bfc', BFCNS)
        g.bind('bfd', BFDNS)
    else:
        g.bind('vb', rdflib.Namespace(vb))
    if entbase:
        g.bind('ent', entbase)

    if rdfttl is not None:
        logger.debug('Converting to RDF (Turtle).')
        rdfttl.write(g.serialize(format="turtle"))
    if rdfxml is not None:
        logger.debug('Converting to RDF (XML).')
        rdfxml.write(g.serialize(format="pretty-xml"))
    if xml is not None:
        logger.debug('Converting to XML.')
        xmlw.end_element('bibframe')
    return
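
#A sketch of the handler contract implied above: bfconvert consults the
#handler's readmode and makeinputsource attributes, then calls it as
#handle_marc_source(source, sink, args, logger, model_factory). The body below
#is assumption-laden: parse_records is hypothetical, and the sink is assumed to
#be a coroutine accepting records via send(), consistent with the sink.close()
#call above. A real handler would be registered in AVAILABLE_MARC_HANDLERS so
#it can be selected through config['marc_record_handler'].
def handle_my_marc_source(source, sink, args, logger, model_factory):
    lax = args.get('lax', False)  #args carries the lax flag, per the code above
    for rec in parse_records(source, lax=lax):  #hypothetical record iterator
        sink.send(rec)

handle_my_marc_source.readmode = 'rb'         #inputs should be opened in binary mode
handle_my_marc_source.makeinputsource = True  #let bfconvert wrap inputs via factory()
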
def bfconvert(inputs, entbase=None, model=None, out=None, limit=None, rdfttl=None,
              rdfxml=None, config=None, verbose=False, logger=logging, loop=None,
              canonical=False, lax=False, zipcheck=False):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    entbase - Base IRI to be used for creating resources.
    model - model instance for internal use
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML output should be written
    config - configuration information
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    loop - optional asyncio event loop to use
    canonical - output Versa's canonical form?
    lax - If True use an expat parser without namespace handling (e.g. to accept XML with namespace problems)
    zipcheck - whether to check for zip files among the inputs
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(entbase)
    if model is None:
        model = memory.connection(logger=logger)
    g = rdflib.Graph()
    if canonical:
        global_model = memory.connection(attr_cls=OrderedDict)

    extant_resources = None
    #extant_resources = set()
    def postprocess():
        #No need to bother with Versa -> RDF translation if we were not asked to generate RDF
        if any((rdfttl, rdfxml)):
            rdf.process(model, g, to_ignore=extant_resources, logger=logger)
        if canonical:
            global_model.add_many([(o, r, t, a) for (rid, (o, r, t, a)) in model])
        model.create_space()

    #Set up event loop if not provided
    if not loop:
        loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get('vocab-base-uri', BL)

    transform_iris = config.get('transforms', {})
    if transform_iris:
        transforms = {}
        for tiri in transform_iris:
            try:
                transforms.update(AVAILABLE_TRANSFORMS[tiri])
            except KeyError:
                raise Exception('Unknown transforms set {0}'.format(tiri))
    else:
        transforms = TRANSFORMS

    marcextras_vocab = config.get('marcextras-vocab')

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get('plugins', []):
        try:
            pinfo = g_services[pc['id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception('Unknown plugin {0}'.format(pc['id']))

    limiting = [0, limit]
    #logger=logger,

    if zipcheck:
        warnings.warn("The zipcheck option is not working yet.", RuntimeWarning)

    for source_file in inputs:
        #Note:
        def input_fileset(sf):
            if zipcheck and zipfile.is_zipfile(sf):
                zf = zipfile.ZipFile(sf, 'r')
                for info in list(zf.infolist()):
                    #From the doc: Note If the ZipFile was created by passing in a
                    #file-like object as the first argument to the constructor, then
                    #the object returned by open() shares the ZipFile's file pointer.
                    #Under these circumstances, the object returned by open() should
                    #not be used after any additional operations are performed on the
                    #ZipFile object.
                    sf.seek(0, 0)
                    zf = zipfile.ZipFile(sf, 'r')
                    yield zf.open(info, mode='r')
            else:
                if zipcheck:
                    #Because zipfile.is_zipfile fast forwards to EOF
                    sf.seek(0, 0)
                yield sf

        for inf in input_fileset(source_file):
            #Wrap the parse operation to make it a task in the event loop
            @asyncio.coroutine
            def wrap_task(inf=inf):
                #Cannot reuse a pyexpat parser, so must create a new one for each input file
                sink = marc.record_handler(
                    loop, model, entbase=entbase, vocabbase=vb, limiting=limiting,
                    plugins=plugins, ids=ids, postprocess=postprocess, out=out,
                    logger=logger, transforms=transforms,
                    extra_transforms=extra_transforms(marcextras_vocab),
                    canonical=canonical)
                if lax:
                    parser = xml.parsers.expat.ParserCreate()
                else:
                    parser = xml.parsers.expat.ParserCreate(namespace_separator=NSSEP)
                handler = expat_callbacks(sink, parser, lax)
                parser.StartElementHandler = handler.start_element
                parser.EndElementHandler = handler.end_element
                parser.CharacterDataHandler = handler.char_data
                parser.buffer_text = True

                parser.ParseFile(inf)
                if handler.no_records:
                    warnings.warn("No records found in this file. Possibly an XML namespace problem (try using the 'lax' flag).", RuntimeWarning)
                sink.close()
                yield
            task = asyncio.async(wrap_task(), loop=loop)

    #Run after all tasks are scheduled; the loop can only be closed once
    try:
        loop.run_until_complete(task)
    except Exception as ex:
        raise ex
    finally:
        loop.close()

    if canonical:
        out.write(repr(global_model))

    if vb == BFZ:
        g.bind('bf', BFNS)
        g.bind('bfc', BFCNS)
        g.bind('bfd', BFDNS)
    else:
        g.bind('vb', rdflib.Namespace(vb))
    if entbase:
        g.bind('ent', entbase)

    if rdfttl is not None:
        logger.debug('Converting to RDF (Turtle).')
        rdfttl.write(g.serialize(format="turtle"))
    if rdfxml is not None:
        logger.debug('Converting to RDF (XML).')
        rdfxml.write(g.serialize(format="pretty-xml"))
    return
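
#A usage sketch for the expat-based version above. With namespace-separated
#parsing (the default), an input missing the expected namespace declarations
#yields no records and triggers the warning; lax=True creates the parser
#without a namespace separator so such files still parse. The file names and
#entbase IRI are hypothetical.
with open('no-namespace.mrx', 'rb') as marcxml, open('records.ttl', 'wb') as ttl:
    bfconvert([marcxml], entbase='http://example.org/', rdfttl=ttl, lax=True)
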
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, rdfxml=None, config=None, verbose=False, logger=logging):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML output should be written
    config - configuration information
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)
    if base:
        g.bind('ent', base)

    extant_resources = None
    #extant_resources = set()
    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate RDF
        if any((rdfttl, rdfxml)):
            rdf.process(m, g, to_ignore=extant_resources, logger=logger)
        m.create_space()

    #Set up event loop
    loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get(u'vocab-base-uri', BFZ)

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            pinfo = g_services[pc[u'id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]
    #logger=logger,
    for inf in inputs:
        sink = marc.record_handler(loop, m, entbase=base, vocabbase=vb,
                                   limiting=limiting, plugins=plugins, ids=ids,
                                   postprocess=postprocess, out=out, logger=logger)
        parser = sax.make_parser()
        #parser.setContentHandler(marcxmlhandler(receive_recs()))
        parser.setContentHandler(marcxmlhandler(sink))
        parser.setFeature(sax.handler.feature_namespaces, 1)

        #Wrap the parse operation to make it a task in the event loop
        @asyncio.coroutine
        def wrap_task(inf=inf, parser=parser):  #Bind now; the tasks run after the loop advances
            parser.parse(inf)
            yield
        task = asyncio.Task(wrap_task())
        #parse_marcxml(inf, sink)

    #Run after all tasks are scheduled; the loop can only be closed once
    try:
        loop.run_until_complete(task)
    except Exception as ex:
        raise ex
    finally:
        loop.close()

    if rdfttl is not None:
        logger.debug('Converting to RDF (Turtle).')
        rdfttl.write(g.serialize(format="turtle"))
    if rdfxml is not None:
        logger.debug('Converting to RDF (XML).')
        rdfxml.write(g.serialize(format="pretty-xml"))
    return
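
#A sketch of requesting both serializations from the SAX-based version above.
#File names are hypothetical; both output streams are opened in binary mode on
#the assumption that Graph.serialize() returns bytes in this rdflib era.
with open('records.mrx', 'rb') as marcxml, \
        open('records.ttl', 'wb') as ttl, \
        open('records.rdf', 'wb') as rdfx:
    bfconvert([marcxml], base='http://example.org/', rdfttl=ttl, rdfxml=rdfx)
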