def disabled_test_xml_quiet(self):
    """Tests the 'quiet' parameter of the MARC8ToUnicode class,
    passed in via the pymarc.record_to_xml() method.
    """
    outfile = 'test/dummy_stderr.txt'
    # truncate outfile in case someone's fiddled with it
    open(outfile, 'wb').close()
    # redirect stderr
    sys.stderr = open(outfile, 'wb')
    # reload pymarc so it picks up the new sys.stderr
    reload(pymarc)
    # get problematic record
    record = next(pymarc.reader.MARCReader(open('test/utf8_errors.dat', 'rb')))
    # record_to_xml() with quiet set to False should generate errors
    # and write them to sys.stderr
    xml = pymarc.record_to_xml(record, quiet=False)
    # close dummy stderr so we can accurately get its size
    sys.stderr.close()
    # file size should be greater than 0
    self.assertNotEqual(getsize(outfile), 0)
    # truncate file again
    open(outfile, 'wb').close()
    # be sure it's truncated
    self.assertEqual(getsize(outfile), 0)
    # redirect stderr again
    sys.stderr = open(outfile, 'wb')
    reload(pymarc)
    # record_to_xml() with quiet set to True should not generate errors
    xml = pymarc.record_to_xml(record, quiet=True)
    # close dummy stderr
    sys.stderr.close()
    # no errors should have been written
    self.assertEqual(getsize(outfile), 0)
def test_xml_namespaces(self):
    """Tests the 'namespace' parameter of the record_to_xml() method."""
    # get a test record
    record = pymarc.reader.MARCReader(open('test/test.dat')).next()
    # record_to_xml() with namespace=False should leave out the xmlns
    xml = pymarc.record_to_xml(record, namespace=False)
    # look for the xmlns in the written xml, should be -1
    self.assertEqual(xml.find('xmlns="http://www.loc.gov/MARC21/slim"'), -1)
    # record_to_xml() with namespace=True should include the xmlns
    xml = pymarc.record_to_xml(record, namespace=True)
    # look for the xmlns in the written xml, should be >= 0
    self.assertNotEqual(xml.find('xmlns="http://www.loc.gov/MARC21/slim"'), -1)
def to_html(self):
    """Return an HTML representation of any MARC records."""
    records = [pymarc.Record(data=r) for r in self.record_data]
    xmllist = [pymarc.record_to_xml(r) for r in records]
    xslt = 'marcxml-to-html.xsl'
    html_list = [self._transform(xml, xslt) for xml in xmllist]
    return "".join(html_list)
def test_xml(self):
    # read in xml to a record
    fh = gzip.open('test/batch.xml.gz', 'rb')
    record1 = pymarc.parse_xml_to_array(fh)[0]
    # generate xml
    xml = pymarc.record_to_xml(record1)
    # parse generated xml
    record2 = pymarc.parse_xml_to_array(six.BytesIO(xml))[0]
    # compare original and resulting record
    self.assertEqual(record1.leader, record2.leader)
    field1 = record1.get_fields()
    field2 = record2.get_fields()
    self.assertEqual(len(field1), len(field2))
    pos = 0
    while pos < len(field1):
        self.assertEqual(field1[pos].tag, field2[pos].tag)
        if field1[pos].is_control_field():
            self.assertEqual(field1[pos].data, field2[pos].data)
        else:
            self.assertEqual(field1[pos].get_subfields(),
                             field2[pos].get_subfields())
            self.assertEqual(field1[pos].indicators, field2[pos].indicators)
        pos += 1
def test_xml(self):
    # read in xml to a record
    record1 = pymarc.parse_xml_to_array("test/batch.xml")[0]
    # generate xml
    xml = pymarc.record_to_xml(record1)
    # parse generated xml
    record2 = pymarc.parse_xml_to_array(BytesIO(xml))[0]
    # compare original and resulting record
    self.assertEqual(record1.leader, record2.leader)
    field1 = record1.get_fields()
    field2 = record2.get_fields()
    self.assertEqual(len(field1), len(field2))
    pos = 0
    while pos < len(field1):
        self.assertEqual(field1[pos].tag, field2[pos].tag)
        if field1[pos].is_control_field():
            self.assertEqual(field1[pos].data, field2[pos].data)
        else:
            self.assertEqual(
                field1[pos].get_subfields(), field2[pos].get_subfields()
            )
            self.assertEqual(field1[pos].indicators, field2[pos].indicators)
        pos += 1
def test_xml_sort(self):
    # read in xml to a record
    record1 = pymarc.parse_xml_to_array('test/order.xml')[0]
    # generate xml
    xml = pymarc.record_to_xml(record1)
    # parse generated xml
    record1 = pymarc.parse_xml_to_array(StringIO(xml))[0]
    # parse xml in order
    record2 = pymarc.parse_xml_to_array('test/order_ok.xml')[0]
    # compare original and resulting record
    self.assertEqual(record1.leader, record2.leader)
    field1 = record1.get_fields()
    field2 = record2.get_fields()
    self.assertEqual(len(field1), len(field2))
    pos = 0
    while pos < len(field1):
        self.assertEqual(field1[pos].tag, field2[pos].tag)
        if field1[pos].is_control_field():
            self.assertEqual(field1[pos].data, field2[pos].data)
        else:
            self.assertEqual(field1[pos].get_subfields(),
                             field2[pos].get_subfields())
            self.assertEqual(field1[pos].indicators, field2[pos].indicators)
        pos += 1
def test_xml_namespaces(self):
    """Tests the 'namespace' parameter of the record_to_xml() method."""
    # get a test record
    fh = open("test/test.dat", "rb")
    record = next(pymarc.reader.MARCReader(fh))
    # record_to_xml() with namespace=False should leave the xmlns
    # declaration out of the serialized record
    xml = pymarc.record_to_xml(record, namespace=False)
    self.assertFalse(b'xmlns="http://www.loc.gov/MARC21/slim"' in xml)
    # record_to_xml() with namespace=True should include it
    xml = pymarc.record_to_xml(record, namespace=True)
    self.assertTrue(b'xmlns="http://www.loc.gov/MARC21/slim"' in xml)
    fh.close()
def test_xml_namespaces(self):
    """Tests the 'namespace' parameter of the record_to_xml() method."""
    # get a test record
    fh = open('test/test.dat', 'rb')
    record = next(pymarc.reader.MARCReader(fh))
    # record_to_xml() with namespace=False should leave the xmlns
    # declaration out of the serialized record
    xml = pymarc.record_to_xml(record, namespace=False)
    self.assertNotIn(b'xmlns="http://www.loc.gov/MARC21/slim"', xml)
    # record_to_xml() with namespace=True should include it
    xml = pymarc.record_to_xml(record, namespace=True)
    self.assertIn(b'xmlns="http://www.loc.gov/MARC21/slim"', xml)
    fh.close()
def to_marcxml(self):
    """Return a MARCXML representation of any MARC records."""
    records = [pymarc.Record(data=r) for r in self.record_data]
    xmllist = [pymarc.record_to_xml(r) for r in records]
    xmlstr = "".join(xmllist)
    xmldoc = """<?xml version="1.0" encoding="utf-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
{0}
</collection>""".format(xmlstr)
    return self._transform(xmldoc, 'format-xml.xsl')
def __rec2ld__(rec):
    marc_xml = pymarc.record_to_xml(rec, namespace=True)
    bf_rdf_xml = MARC2BF(lxml.etree.XML(marc_xml),
                         baseuri='"http://catalog.coloradocollege.edu/"')
    bf_rdf = rdflib.Graph()
    bf_rdf.parse(data=lxml.etree.tostring(bf_rdf_xml), format='xml')
    bibcat.clean_uris(bf_rdf)
    try:
        raw_turtle = bf_rdf.serialize(format='turtle')
    except Exception:
        raw_turtle = None
    return raw_turtle
def test_marc(self):
    record = marc.stub(self.pandata)
    open(TESTDATA_MARCFILENAME, "w+").write(pymarc.record_to_xml(record))
    for field in record.get_fields('650'):
        self.assertEqual(field.get_subfields('a')[0], 'Science fiction')
        break
    for field in record.get_fields('100'):
        self.assertEqual(field.get_subfields('a')[0], 'Piper, H. Beam')
        break
    for field in record.get_fields('700'):
        self.assertEqual(field.get_subfields('4')[0], 'ill')
        break
def addLBD(config, oclcnumber, note):
    oauth_session = config.get('oauth-session')
    # create the LBD
    record = Record(leader='00000n a2200000 4500')
    record.add_field(Field(tag='004', data=oclcnumber))
    record.add_field(
        Field(indicators=[' ', ' '], tag='500', subfields=['a', note]),
        Field(indicators=[' ', ' '], tag='935', subfields=['a', str(time.time())]),
        Field(indicators=[' ', ' '], tag='940', subfields=['a', config.get('oclcSymbol')]))
    input = pymarc.record_to_xml(record).decode("utf-8")
    try:
        r = oauth_session.post(
            config.get('metadata_service_url') + "/lbd/data",
            data=input,
            headers={
                "Accept": 'application/atom+xml;content="application/vnd.oclc.marc21+xml"',
                "Content-Type": "application/vnd.oclc.marc21+xml"
            })
        r.raise_for_status()
        try:
            result = ElementTree.fromstring(r.content)
            ns = {
                'atom': 'http://www.w3.org/2005/Atom',
                'wc': 'http://worldcat.org/rb'
            }
            marcNode = result.findall('atom:content/wc:response',
                                      ns)[0].getchildren()[0]
            # need to get this XML section out as a string and into a
            # file-like object
            marcData = StringIO(
                ElementTree.tostring(marcNode, encoding='unicode', method='xml'))
            marcRecords = pymarc.parse_xml_to_array(marcData)
            # pull out the LBD accession number
            print(marcRecords)
            accessionNumber = marcRecords[0]['001'].value()
            status = "success"
        except xml.etree.ElementTree.ParseError as err:
            accessionNumber = ""
            status = "failed XML parsing issue"
            print(err)
    except requests.exceptions.HTTPError as err:
        accessionNumber = ""
        status = "failed"
    return pd.Series([oclcnumber, accessionNumber, status])
def GET(self, query_string):
    try:
        marc = urllib2.urlopen('http://localhost:8080/marc/%s' % query_string).read()
        xml = pymarc.record_to_xml(pymarc.Record(data=marc))
        if xml != '':
            xml = '<?xml version="1.0" encoding="UTF-8"?>' \
                  '<collection xmlns="http://www.loc.gov/MARC21/slim" ' \
                  'xsi:schemaLocation="http://www.loc.gov/MARC21/slim ' \
                  'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" ' \
                  'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">' \
                  '%s</collection>' % xml
            web.header('Content-Type', 'application/xml')
            print xml
        else:
            web.notfound()
    except urllib2.HTTPError, e:
        if e.code == 404:
            web.notfound()
        else:
            raise
def update_holding(holding, holding_id, full_holding_string, five_forty_one, mms_id):
    holding.add_field(five_forty_one)
    print("Holding with new field: \n" + str(holding) + "\n\n\n")
    updated_holding = pym.record_to_xml(holding).decode('utf-8')
    full_holding_string = full_holding_string.decode('utf-8')
    full_updated_holding = re.sub(r'<record>(.+)</record>', updated_holding,
                                  full_holding_string)
    print("Updated XML Holding Record: \n" + full_updated_holding + "\n")
    full_updated_holding = full_updated_holding.replace(
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>', '')
    response = requests.put(
        "https://api-na.hosted.exlibrisgroup.com/almaws/v1/bibs/" + str(mms_id) +
        "/holdings/" + str(holding_id) +
        "?apikey=l8xx6d87f17c700040dba6a300a2d59e98bc",
        data=full_updated_holding,
        headers=headers)
    time.sleep(2)
    print(response.content)
    # response.content is bytes, so decode before matching a str pattern
    if re.search('<errorsExist>true</errorsExist>',
                 response.content.decode('utf-8')):
        print("Couldn't write back to Alma for MMS ID: " + mms_id + "\n")
        error_file.write("Couldn't write back to Alma for MMS ID: " + mms_id + "\n")
        success = False
    else:
        output_file.write("<MMS_ID_" + mms_id + ">" + full_updated_holding +
                          "</MMS_ID_" + mms_id + ">")
        success = True
    return success
def writeChangesToRecord(record, correct_headings, changed_headings,
                         heading_type_settings):
    for heading_type in changed_headings:
        if len(correct_headings[heading_type]) > 0 or len(
                changed_headings[heading_type]) > 0:
            print heading_type
            print correct_headings[heading_type]
            print changed_headings[heading_type]
            print heading_type_settings
            record = writeHeadingsToRecord(
                record,
                correct_headings[heading_type] + changed_headings[heading_type],
                heading_type_settings[heading_type]['heading_type'],
                heading_type_settings[heading_type]['bib_fields'])
    if record is not None:
        return record_to_xml(record)
total = 0
marcset = MARCReader(marcfile)
for record in marcset:
    # if we've reached the maximum records per transform, close this file
    # and start a new one
    if (recs != 0 and
            recs % int(config.get('system', 'max_records_per_transform')) == 0):
        current_batch_f.write(marcxmlfooter)
        current_batch_f.close()
        filecount += 1
        current_batch_f = open('batch_%05d.tmp' % filecount, 'w')
        current_batch_f.write(marcxmlheader)
        batchfiles.append('batch_%05d.tmp' % filecount)
    if (filter.passStage(record) and not filter.rejectStage(record)):
        recs += 1
        current_batch_f.write(record_to_xml(record))
        if (output_marcxml_flag == 1):
            marc_xml_f.write(record_to_xml(record))
    total += 1
current_batch_f.write(marcxmlfooter)
current_batch_f.close()

# loop over batch files created, and run them through the XSLT parser
for f in batchfiles:
    print "transforming %s" % f
    utils.apply_stylesheets(f, config)
    olac_rec = utils.getstringfromfile(f, '<olac:olac', '</olac:olac>')
    print 'olac_rec = ', olac_rec
def __init__(self):
    setup()
    # start by assuming something will go wrong:
    status = CLI.EX_SOMETHING_ELSE
    desc = "Adds id.loc.gov URIs to subject and " + \
        "name headings when established forms can be found. " + \
        "Works with EAD or MaRCXML files."
    # note: defaults in config file can be overridden by args on commandline
    # argparse...
    epi = """Exit statuses:
0 = All good
9 = Something unanticipated went wrong
64 = The command was used incorrectly, e.g., with the wrong number of
     arguments, a bad flag, a bad syntax in a parameter, or whatever.
65 = The input data was incorrect in some way.
66 = Input file (not a system file) did not exist or was not readable.
70 = An internal software (not OS) error has been detected.
73 = User specified output file cannot be created.
74 = An error occurred while doing I/O on some file.
"""
    rHelp = "The input file."
    mHelp = "The input file is MaRCXML rather than EAD. Found URIs are put into $0."
    oHelp = "Path to the output file. Writes to stdout if no option " + \
        "is supplied."
    nHelp = "Try to find URIs for names."
    sHelp = "Try to find URIs for subjects."
    aHelp = "Annotate the record. When multiple matches are found XML " + \
        "comments containing the matches and their URIs will be added " + \
        "to the record."
    vHelp = "Print messages to stdout (one-hit headings) and stderr " + \
        "(zero or more than one hit headings)."
    cHelp = "Does just what it says.\n"
    lHelp = "Log alternatives.\n"
    cfgHelp = "Specify the config file. Defaults can be overridden. " + \
        "At minimum, run e.g.: python addauths.py myfile.marc.xml"

    conf_parser = ArgumentParser(add_help=False, description=desc)
    conf_parser.add_argument("-c", "--conf_file", default=CONFIG,
                             required=False, dest="conf_file", help=cfgHelp)
    args, remaining_argv = conf_parser.parse_known_args()

    defaults = {
        "marc": False,
        "outpath": None,
        "names": False,
        "subjects": False,
        "annotate": False,
        "verbose": False,
        "ignore_cache": False,
        "record": None,
        "log": False
    }

    # if -c or --conf_file, override the defaults above
    if args.conf_file:
        config = ConfigParser.SafeConfigParser()
        config.read([args.conf_file])
        cfgdict = dict(config.items('Paths'))      # Paths section of config file
        booldict = dict(config.items('Booleans'))  # Booleans section of config file
        for k, v in booldict.iteritems():
            # need to get the booleans as booleans, not as 'strings'
            boo = config.getboolean('Booleans', k)
            cfgdict[k] = boo
        defaults = cfgdict

    parser = ArgumentParser(parents=[conf_parser], description=desc,
                            formatter_class=RawDescriptionHelpFormatter,
                            epilog=epi)
    parser.set_defaults(**defaults)
    parser.add_argument("-m", "--marc", required=False, dest="mrx",
                        action="store_true", help=mHelp)
    parser.add_argument("-o", "--output", required=False, dest="outpath",
                        help=oHelp)
    parser.add_argument("-n", "--names", required=False, dest="names",
                        action="store_true", help=nHelp)
    parser.add_argument("-s", "--subjects", required=False, dest="subjects",
                        action="store_true", help=sHelp)
    parser.add_argument("-a", "--annotate", required=False, dest="annotate",
                        action="store_true", help=aHelp)
    parser.add_argument("-v", "--verbose", required=False, dest="verbose",
                        action="store_true", help=vHelp)
    parser.add_argument("-C", "--ignore-cache", required=False,
                        dest="ignore_cache", action="store_true", help=cHelp)
    parser.add_argument("-l", "--log", required=False, dest="log",
                        action="store_true", help=lHelp)
    parser.add_argument("-f", "--file", required=True, dest="record",
                        help=rHelp)
    args = parser.parse_args(remaining_argv)
    # TODO: args to log (along with batch no.) -pmg
    print(args)

    #=======================================================================
    # Checks on our args and options. We can exit before we do any work.
    #=======================================================================
    if not os.path.exists(args.record):
        os.sys.stderr.write("File " + args.record + " does not exist\n")
        exit(CLI.EX_NO_INPUT)
    if args.record == None:
        os.sys.stderr.write("No input file supplied. See --help for usage\n")
        exit(CLI.EX_WRONG_USAGE)
    if not args.names and not args.subjects:
        msg = "Supply -n and or -s to link headings. Use --help " + \
            "for more details.\n"
        os.sys.stderr.write(msg)
        exit(CLI.EX_WRONG_USAGE)
    if args.mrx == True:
        marc_path = args.record
        # a quick and dirty test...
        reader = pymarc.marcxml.parse_xml_to_array(marc_path)
        if not reader:
            msg = "-m flag used but input file isn't MaRCXML.\n"
            os.sys.stderr.write(msg)
            exit(CLI.EX_WRONG_USAGE)
    if args.outpath:
        outdir = os.path.dirname(args.outpath)
        if not os.path.exists(outdir):
            msg = "Directory " + outdir + " does not exist\n"
            os.sys.stderr.write(msg)
            exit(CLI.EX_CANT_CREATE)
        if not os.access(outdir, os.W_OK):
            msg = "Output directory " + outdir + " not writable\n"
            os.sys.stderr.write(msg)
            exit(CLI.EX_CANT_CREATE)

    #=======================================================================
    # The work...
    #=======================================================================
    shelf = shelve.open(SHELF_FILE, protocol=pickle.HIGHEST_PROTOCOL)
    ctxt = None
    mrx_subs = []
    h = ""
    mrxheader = """<?xml version="1.0" encoding="UTF-8" ?>
<collection xmlns="http://www.loc.gov/MARC21/slim"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.loc.gov/MARC21/slim
    http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">"""
    try:
        reader = pymarc.marcxml.parse_xml_to_array(args.record)
        #writer = codecs.open("test_out.marc.xml", 'w', 'utf-8')
        options = {'annotate': args.annotate, 'verbose': args.verbose,
                   'mrx': args.mrx, 'log': args.log,
                   'ignore_cache': args.ignore_cache}
        fh = open(OUTDIR + 'tmp.xml', 'wb+')
        fh.write(mrxheader)
        for rec in reader:
            f001 = rec.get_fields('001')
            for b in f001:
                bbid = b.value()
            if args.names:
                #=======================
                # NAMES
                #=======================
                # get names data from these subfields
                namesubf = ['a', 'c', 'd', 'q']
                names = ['100', '110', '130', '700', '710', '730']
                for n in rec.get_fields(*names):
                    for s in n.get_subfields(*namesubf):
                        s = s.encode('utf8')
                        mrx_subs.append(s)
                    h = "--".join(mrx_subs)
                    tag = n.tag
                    _update_headings(bbid, 'nam', h, n, shelf, tag, **options)
                    mrx_subs = []
            if args.subjects:
                #=======================
                # SUBJECTS
                #=======================
                # get subjects data from these subfields (all but 0,2,3,6,8)
                subs = ['600', '610', '611', '630', '650', '651']
                subsubf = ['a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l',
                           'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
                           'x', 'y', 'z', '4']
                for f in rec.get_fields(*subs):
                    for s in f.get_subfields(*subsubf):
                        s = s.encode('utf8')
                        mrx_subs.append(s)
                    h = "--".join(mrx_subs)
                    tag = f.tag
                    _update_headings(bbid, 'sub', h, f, shelf, tag, **options)
                    mrx_subs = []
            out = "%s" % (pymarc.record_to_xml(rec))
            fh.write(out)
        fh.write("</collection>")
        fh.close()
        if args.outpath == None:
            os.sys.stdout.write(out)
        # if we got here...
        status = CLI.EX_OK
    #=======================================================================
    # Problems while doing "the work" are handled w/ Exceptions
    #=======================================================================
    except libxml2.parserError, e:  # TODO: pymarc exceptions
        os.sys.stderr.write(str(e.message) + "\n")
        status = CLI.EX_DATA_ERR
def ingest(self, record, workspace=None):
    """Method runs entire tool-chain to ingest a single MARC record
    into Datastore.

    Args:
        record (pymarc.Record): MARC21 record
        workspace (str): Fedora4 workspace to ingest records into

    Returns:
        list: MongoID
    """
    self.graph_ids = {}
    # MARC record must have a 001 for BIBFRAME xquery to function properly
    if not record['001']:
        unique_id = uuid.uuid1()
        field001 = pymarc.Field('001')
        field001.data = str(unique_id).split("-")[0]
        record.add_field(field001)
    marc_xml = pymarc.record_to_xml(record, namespace=True)
    marc_uri = rdflib.URIRef(self.__process_marc__(record))
    derived_from = rdflib.URIRef('http://bibframe.org/vocab/derivedFrom')
    bibframe_graph = self.__xquery_chain__(marc_xml)
    for subject, obj in bibframe_graph.subject_objects(
            predicate=derived_from):
        bibframe_graph.set((subject, derived_from, marc_uri))
    all_graphs = self.__decompose_bf_graph__(bibframe_graph, workspace)
    for graph in all_graphs:
        self.__process_bibframe__(graph)
    bibframe_graph.close()
def load_bib(self, record):
    title = None
    # we must have an lccn, but it's not an error if we don't find one
    lccn_orig = _extract(record, "010", "a")
    lccn = _normal_lccn(lccn_orig)
    if not lccn:
        # LOGGER.info("###### LCCN in OCLC pull, \
        #     but not in database. Missing LCCN. ######")
        # LOGGER.info(record)
        self.missing_lccns += 1
        return

    # newer marc xml sets pulled from OCLC do not have the 005 control
    # field. 005 is the date and time of the last transaction.
    try:
        s = _extract(record, "005")
        parts = s.split(".")
        dt = datetime.datetime(*strptime(parts[0], "%Y%m%d%H%M%S")[0:6])
    except AttributeError:
        dt = datetime.datetime.now()
    # dt.replace(microsecond=int(parts[1]))

    # it's remotely possible that a title with the LCCN already exists
    try:
        title = models.Title.objects.get(lccn=lccn)
        LOGGER.debug("Found another record for lccn: %s", lccn)
        if title.version == dt:
            LOGGER.debug("  with the same timestamp: %s", title.version)
            return  # skip over this record with same timestamp
        elif title.version < dt:
            LOGGER.debug("  with newer timestamp: %s vs %s", title.version, dt)
            title.version = dt
            self.records_updated += 1
        elif title.version > dt:
            LOGGER.debug("  with older timestamp: %s vs %s", title.version, dt)
            return  # skip over older record
        else:
            raise ValueError("It should not be possible to have a version fail <, =, and > checks")
    except models.Title.DoesNotExist:
        self.records_created += 1
        title = models.Title(lccn=lccn)
        title.version = dt

    # clear m2m relationships; these will come from the extraction
    title.subjects.clear()
    title.languages.clear()
    title.places.clear()

    # delete fk relationships; these will come from the extraction
    title.publication_dates.all().delete()
    title.notes.all().delete()
    title.alt_titles.all().delete()
    title.succeeding_title_links.all().delete()
    title.preceeding_title_links.all().delete()
    title.related_title_links.all().delete()
    title.urls.all().delete()

    # update title fields
    self._set_name(record, title)
    title.lccn_orig = lccn_orig
    title.oclc = self._extract_oclc(record)
    title.edition = _extract(record, "250", "a")
    title.publisher = _extract(record, "260", "b")
    title.frequency = _extract(record, "310", "a")
    title.frequency_date = _extract(record, "310", "b")
    title.uri = _extract(record, "856", "u")

    # rda records use 264$a, fallback to 260$a
    title.place_of_publication = _extract(record, "264", "a")
    if not title.place_of_publication:
        title.place_of_publication = _extract(record, "260", "a")

    # rda records use 338$a, fallback to 245$h
    title.medium = _extract(record, "338", "a")
    if not title.medium:
        title.medium = _extract(record, "245", "h")

    title.issn = _extract(record, "022", "a")
    f008 = record["008"].data
    title.start_year = _normal_year(f008[7:11])
    title.end_year = _normal_year(f008[11:15])

    # check to make sure start and end years are not blank
    if not title.start_year:
        LOGGER.error("lccn %s title has blank start year! Defaulting to 0",
                     title.lccn)
        title.start_year = "0"
    if not title.end_year:
        LOGGER.error("lccn %s title has blank end year! Defaulting to 9999",
                     title.lccn)
        title.end_year = "9999"

    title.country = self._extract_country(record)
    title.save()

    # update fk relationships with new values
    self._extract_languages(record, title)
    self._extract_places(record, title)
    self._extract_publication_dates(record, title)
    self._extract_subjects(record, title)
    self._extract_notes(record, title)
    self._extract_preceeding_titles(record, title)
    self._extract_succeeding_titles(record, title)
    self._extract_related_titles(record, title)
    self._extract_alt_titles(record, title)
    self._extract_urls(record, title)
    title.save()

    marc, marc_created = models.MARC.objects.get_or_create(title=title)
    marc.xml = record_to_xml(record)
    marc.save()

    # for context see: https://rdc.lctl.gov/trac/ndnp/ticket/375
    if _is_chronam_electronic_resource(title, record):
        LOGGER.info("deleting title record for chronam electronic resource: %s",
                    title)
        title.delete()

    # this is for long running processes so the query cache
    # doesn't bloat memory
    reset_queries()
    return title
def convert_marc_to_xml(hostenv):
    """Proquest delivers MARC-formatted files to CDL on behalf of campuses,
    generally 6-8 weeks after ETD was delivered. We transform these using
    campus XSLT customizations, adding both eScholarship and Proquest links
    in 856 fields. Proquest files have file extension '.UNX'"""
    marc_tmpfile = os.path.join(app_configs[hostenv]['tmp_dir'], 'marctmpfile.xml')
    xmlmarcnamespace = '<collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ' \
                       'xsi:schemaLocation="http://www.loc.gov/MARC21/slim ' \
                       'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">\n'
    xmlcloser = '</collection>'
    for marcfilename in os.listdir(app_configs[hostenv]['marc_dir']):
        campuscode = None
        # find *.UNX or .unx file in /marc subdirectory
        if marcfilename[-3:].upper() == 'UNX':
            # determine campus abbr
            campusabbr = re.search(r'UC\s(.*?)\sMARC.*?', marcfilename)
            if campusabbr is not None:
                campuscode = constants.PQ_CAMPUS_NAMES.get(campusabbr.group(1))
            # convert to XML
            marcpathname = os.path.join(app_configs[hostenv]['marc_dir'], marcfilename)
            try:
                reader = pymarc.MARCReader(open(marcpathname, 'rb'),
                                           to_unicode=True)  # pylint: disable=maybe-no-member
            except pymarc.exceptions.PymarcException as err:
                logging.exception("ERROR opening PQ MARC file %s: %s",
                                  marcpathname, err.message)
            writer = codecs.open(marc_tmpfile, 'w', 'utf-8')
            writer.write(constants.XML_PROLOG)
            writer.write(xmlmarcnamespace)
            for record in reader:
                record.leader = record.leader[:9] + 'a' + record.leader[10:]
                writer.write(
                    pymarc.record_to_xml(record, namespace=False) + "\n")
            writer.write(xmlcloser)
            writer.close()
            # need to add namespaces using XSLT
            marc_tmpfilestr = open(marc_tmpfile, 'r')
            marc_tmpfileread = marc_tmpfilestr.read()
            namespace_xmlstr = xml_saxon_transform(marc_tmpfileread,
                                                   constants.NAMESPACE_XSLT)
            # test if all ISBNs are available
            test_str = xml_saxon_transform(namespace_xmlstr, constants.TEST_XSLT)
            # convert using campus customizations using XSLT
            if "ERROR" not in test_str:
                if campuscode is not None:
                    campus_stylesheet = os.path.join(
                        app_configs[hostenv]['xsl_dir'],
                        campus_configs[campuscode]['pqmarcxslt'])
                    campus_xml_str = xml_saxon_transform(namespace_xmlstr,
                                                         campus_stylesheet)
                    outfilename = campuscode + time.strftime("%Y%m%d") + 'PQ-orig.xml'
                    outfullpath = os.path.join(app_configs[hostenv]['marc_dir'],
                                               outfilename)
                    campus_xml_file = codecs.open(outfullpath, 'wb')
                    campus_xml_file.write(campus_xml_str)
                    campus_xml_file.close()
                else:
                    logging.error("ERROR: campus code not found %s", marcfilename)
            else:
                logging.error("ERROR: UNX file %s not converted; missing %s",
                              marcfilename, test_str)
        response = requests.get('http://id.loc.gov/authorities/names/label/%s'
                                % subject)
        link = response.url
        pattern = re.compile(r'(.+)\.html$')
        value = pattern.findall(link)[0].encode('utf-8')
        return ('http://id.loc.gov/authorities/names', value)
    except:
        return (False, False)

with codecs.open('G://Metadata Projects/sheet_music/sheet_music.bib', 'rb') as fh:
    regexpNS = 'http://exslt.org/regular-expressions'
    modsNS = 'http://www.loc.gov/mods/v3'
    nameDict = defaultdict(int)
    reader = pymarc.MARCReader(fh, to_unicode=True)
    for record in reader:
        root = etree.XML(pymarc.record_to_xml(record, namespace=True))
        xslt_root = etree.parse(
            open('G:/Metadata Projects/sheet_music/sheet.xsl', 'r'), parser)
        transform = etree.XSLT(xslt_root)
        root = transform(root)
        title = root.xpath('/mods:mods/mods:titleInfo[not(@*)]/mods:title',
                           namespaces={'mods': modsNS})[0].text
        bib = root.xpath('/mods:mods/mods:recordInfo/mods:recordIdentifier',
                         namespaces={'mods': modsNS})[0].text
        print "----------------------------------------\nTransformation started for '%s.'\n" % title
        try:
            ## Get name URIs
            print "... Retrieving URIs for names and titles ...\n"
            pattern = re.compile("\s[A-Za-z]{2,}\.$")
import codecs

import pymarc

input = 'records_in.mrc'
output = 'records_out.xml'

reader = pymarc.MARCReader(open(input, 'rb'), to_unicode=True)
writer = codecs.open(output, 'w', 'utf-8')
for record in reader:
    # set leader position 9 to 'a' so the record is flagged as UTF-8
    record.leader = record.leader[:9] + 'a' + record.leader[10:]
    writer.write(bytes.decode(pymarc.record_to_xml(record)) + '\n')
def load_bib(self, record):
    title = None
    # we must have an lccn, but it's not an error if we don't find one
    lccn_orig = _extract(record, '010', 'a')
    lccn = _normal_lccn(lccn_orig)
    if not lccn:
        #_logger.info("###### LCCN in OCLC pull, \
        #    but not in database. Missing LCCN. ######")
        #_logger.info(record)
        self.missing_lccns += 1
        return

    # newer marc xml sets pulled from OCLC do not have the 005 control
    # field. 005 is the date and time of the last transaction.
    try:
        s = _extract(record, '005')
        parts = s.split(".")
        dt = datetime.datetime(*strptime(parts[0], '%Y%m%d%H%M%S')[0:6])
    except AttributeError:
        dt = datetime.datetime.now()
    #dt.replace(microsecond=int(parts[1]))

    # it's remotely possible that a title with the LCCN already exists
    try:
        title = models.Title.objects.get(lccn=lccn)
        _logger.debug("Found another record for lccn: %s" % lccn)
        if title.version == dt:
            _logger.debug("  with the same timestamp: %s" % title.version)
            return  # skip over this record with same timestamp
        elif title.version < dt:
            _logger.debug("  with newer timestamp: %s vs %s" % (title.version, dt))
            title.version = dt
            self.records_updated += 1
        elif title.version > dt:
            _logger.debug("  with older timestamp: %s vs %s" % (title.version, dt))
            return  # skip over older record
        else:
            _logger.error("Logic error... this should be unreachable.")
    except models.Title.DoesNotExist:
        self.records_created += 1
        title = models.Title(lccn=lccn)
        title.version = dt

    # clear m2m relationships; these will come from the extraction
    title.subjects.clear()
    title.languages.clear()
    title.places.clear()
    # TODO: Add a check to the title load that deletes all m2m in subjects
    # and places, to keep from lonely m2m records hanging out

    # delete fk relationships; these will come from the extraction
    title.publication_dates.all().delete()
    title.notes.all().delete()
    title.alt_titles.all().delete()
    title.succeeding_title_links.all().delete()
    title.preceeding_title_links.all().delete()
    title.related_title_links.all().delete()
    title.urls.all().delete()

    # update title fields
    self._set_name(record, title)
    title.lccn_orig = lccn_orig
    title.oclc = self._extract_oclc(record)
    title.edition = _extract(record, '250', 'a')
    title.place_of_publication = _extract(record, '260', 'a')
    title.publisher = _extract(record, '260', 'b')
    title.frequency = _extract(record, '310', 'a')
    title.frequency_date = _extract(record, '310', 'b')
    # the main purpose of this is to look for records
    # with 245 $h [microform] or [microfilm], but we save everything
    title.medium = _extract(record, '245', 'h')
    title.issn = _extract(record, '022', 'a')
    f008 = record['008'].data
    title.start_year = _normal_year(f008[7:11])
    title.end_year = _normal_year(f008[11:15])
    title.country = self._extract_country(record)
    title.save()

    # update fk relationships with new values
    self._extract_languages(record, title)
    self._extract_places(record, title)
    self._extract_publication_dates(record, title)
    self._extract_subjects(record, title)
    self._extract_notes(record, title)
    self._extract_preceeding_titles(record, title)
    self._extract_succeeding_titles(record, title)
    self._extract_related_titles(record, title)
    self._extract_alt_titles(record, title)
    self._extract_urls(record, title)
    title.save()

    marc, marc_created = models.MARC.objects.get_or_create(title=title)
    marc.xml = record_to_xml(record)
    marc.save()

    # for context see: https://rdc.lctl.gov/trac/ndnp/ticket/375
    if _is_chronam_electronic_resource(title, record):
        _logger.info("deleting title record for chronam electronic resource: %s" % title)
        title.delete()

    # this is for long running processes so the query cache
    # doesn't bloat memory
    reset_queries()
    return title
from pymarc import Record, Field, record_to_xml
import MySQLdb

record = Record()
print dir(record)

dbLo = MySQLdb.connect("localhost", "shailesh", "123", "shailesh")
dbKoha = MySQLdb.connect("localhost", "root", "", "koha1")
curaKoha = dbKoha.cursor()
curaLocl = dbLo.cursor()

curaLocl.execute("select BookNo,BookName,authorName from book_info group by BookNo;")
dat = curaLocl.fetchall()
curaLocl.execute("select accession,BookNo,callNo from book_info;")
datIte = curaLocl.fetchall()

for i in dat:
    record = Record()
    record.add_field(Field(tag='040', indicators=['0', '1'],
                           subfields=['c', 'LIBRARY OF CONGRESS']))
    record.add_field(Field(tag='245', indicators=['0', '1'],
                           subfields=['a', i[1]]))
    record.add_field(Field(tag='942', indicators=['0', '1'],
                           subfields=['2', 'book of parag', 'c', 'BOOK']))
    record.add_field(Field(tag='100', indicators=['0', '1'],
                           subfields=['a', i[2]]))
    record.add_field(Field(tag='999', indicators=['0', '1'],
                           subfields=['c', '8', 'd', '8']))
    marcI = record_to_xml(record)
    #print i[0], i[1], i[2]
    curaKoha.execute("insert into biblio(biblionumber,title,author) values(%s,%s,%s);",
                     (i[0], i[1], i[2]))
    curaKoha.execute("insert into biblioitems(biblionumber,biblioitemnumber,marcxml) values(%s,%s,%s);",
                     (i[0], i[0], marcI))

for i in datIte:
    barcode = '1111' + str(i[0])
    curaKoha.execute("insert into items(itemnumber,biblionumber,biblioitemnumber,barcode,itemcallnumber) values(%s,%s,%s,%s,%s);",
                     (i[0], i[1], i[1], barcode, i[2]))

dbKoha.commit()
dbKoha.close()
def convert2bibframe(record):
    return xquery_socket(pymarc.record_to_xml(record, namespace=True))
def read_mrx(mrcrec, names, subjects):
    '''
    Read through a given MARCXML file and optionally copy it,
    inserting $0 as appropriate
    '''
    enhanced = []  # will be True or False, from check_heading()
    recs = []      # will be a pymarc object or None, from check_heading()
    mrxheader = """<?xml version="1.0" encoding="UTF-8" ?>
<collection xmlns="http://www.loc.gov/MARC21/slim"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.loc.gov/MARC21/slim
    http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">"""
    try:
        reader = pymarc.marcxml.parse_xml_to_array(INDIR + mrcrec)
        for rec in reader:
            f001 = rec.get_fields('001')
            f035 = rec.get_fields('035')
            try:
                ocn = rec['035']['a']
            except:
                ocn = None
            for b in f001:
                bbid = b.value()
            if names:
                en, r = check_heading(bbid, rec, 'nam')
                enhanced.append(en)
                if en == True:
                    recs.append(r)
            if subjects:
                # if just searching subjects, or if a rec only has
                # subjects, no names
                en, r = check_heading(bbid, rec, 'sub')
                enhanced.append(en)
                if en == True and r not in recs:
                    recs.append(r)
        if nomarc == False:
            outfile = str.replace(mrcrec, '.xml', '')
            fh = open(TMPDIR + outfile + '_tmp.xml', 'wb+')
            fh.write(mrxheader)
            for record in recs:
                if record is not None:
                    try:
                        out = "%s" % (pymarc.record_to_xml(record))
                        fh.write(out)
                    except Exception as e:
                        raise
        if nomarc == False and ((enhanced_only == True and (True in enhanced))
                                or (enhanced_only == False)):
            fh.write("</collection>")
            fh.close()
    except AttributeError as e:
        return
    except:
        if names:
            scheme = 'nam'
        elif subjects:
            scheme = 'sub'
        etype, evalue, etraceback = sys.exc_info()
        flag = "read_mrx problem: %s %s %s line %s" % (etype, evalue,
                                                       etraceback,
                                                       etraceback.tb_lineno)
        print(flag)
        if csvout or nomarc:
            # idea here is to report something out even when mrx has issues
            write_csv(bbid, flag, '', scheme, '', '')
    if not nomarc and ((enhanced_only == True and (True in enhanced))
                       or (enhanced_only == False)):
        try:
            subprocess.Popen(['xmllint', '--format', '-o', OUTDIR + outfile,
                              TMPDIR + outfile + '_tmp.xml']).wait()
            mrx2mrc(OUTDIR + outfile)
        except:
            etype, evalue, etraceback = sys.exc_info()
            print("xmllint problem: %s" % evalue)
    if (keep == False):
        os.remove(INDIR + mrcrec)
# write OAI header
# TODO: handle exceptions for template variable substitution
# is this exception handling working???
if (count == 0):
    # first iteration
    oaiheader = Template(utils.file2string(config.get('system', 'oai_header_file')))
    uservars['sample_id'] = 'oai:' + \
        uservars['repository_id'] + ':' + record['001'].value()
    try:
        olac_xml_f.write(oaiheader.substitute(uservars))
    except KeyError:
        pass

# construct a proper marcxml document
xmlrec = record_to_xml(record)
if (output_marcxml_flag == 1):
    marc_xml_f.write(xmlrec)
xmlrec = marcxmlheader + xmlrec + marcxmlfooter

# write out xml rec to a temp file
xml_input_f = open(xml_input, 'w')
xml_input_f.write(xmlrec)
xml_input_f.close()

# apply stylesheets
print "start stylesheet"
utils.apply_stylesheets(xml_input, config)
print "end stylesheet"
import sys

import pymarc

import utils

# get params from command line
try:
    input = sys.argv.pop(1)
    output = sys.argv.pop(1)
except:
    print "you need two arguments: input_file output_file"
    sys.exit(2)

f = open(output, 'w')
ctr = 0
marcset = pymarc.MARCReader(open(input))
f.write('<?xml version="1.0" encoding="UTF-8" ?>\n'
        '<collection xmlns="http://www.loc.gov/MARC21/slim">')
for rec in marcset:
    xmlrec = pymarc.record_to_xml(rec)
    #xmlrec = libxml2.parseDoc(xmlrec)
    if (rec['695'] and (rec['695'].value().lower().find('language') != -1
                        or rec['695'].value().lower().find('music') != -1)):
        print rec['695']
        f.write(xmlrec + '\n')
        ctr += 1
        if ctr % 500 == 0:
            print "writing %sth record..." % ctr
        #if ctr == 100: break
f.write('</collection>')
f.close()

#print "formatting xml..."
#xmlrec = libxml2.parseDoc(utils.file2string(output))
#f = open(output,'w')
#f.write(xmlrec.serialize(None,1))
#f.write(xmlrec.serialize(None,2))
    sleep(1)

if __name__ == "__main__":
    mrxheader = """<?xml version="1.0" encoding="UTF-8" ?>
<collection xmlns="http://www.loc.gov/MARC21/slim"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.loc.gov/MARC21/slim
    http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">"""
    fh = open('out/owi_tmp.xml', 'w+')
    fh.write(mrxheader)
    reader = pymarc.marcxml.parse_xml_to_array(infile)
    for rec in reader:
        for n in rec.get_fields('035'):
            for s in n.get_subfields('a'):
                if 'OCoLC' in s:
                    num = s.replace('(OCoLC)', '')
                    workid = check_shelf(str(num))
                    if workid != None and workid != '':
                        field = pymarc.Field(
                            tag='787',
                            indicators=['0', ' '],
                            subfields=['o', str(workid)])
                        rec.add_field(field)
                    workid = ""
        out = "%s" % (pymarc.record_to_xml(rec))
        fh.write(out)
    fh.write("</collection>")
    fh.close()
    # format output for readability
    subprocess.Popen(['xmllint', '--format', '-o', outfile, 'owi_tmp.xml'])
filename_xml = 'examplerecord_%s.xml' % i
filename_out = 'examplerecord_%s.out' % i
records = MARCReader(open(filename), to_unicode=True, force_utf8=True,
                     utf8_handling='ignore')
writer_dat = MARCWriter(file(filename_out, 'a'))
writer_xml = open(filename_xml, 'a')
for marc in records:
    isbn_list = marc.get_fields('020')
    try:
        isbn_field = isbn_list[0]
    except Exception, e:
        j = i - 10
        marc.add_ordered_field(
            Field(
                tag='020',
                indicators=[' ', ' '],
                subfields=['a', isbns[j]]))
    writer_dat.write(marc)
    writer_xml.write(record_to_xml(marc) + "\n")
writer_dat.close()
writer_xml.close()
def load_bib(self, record):
    title = None
    # we must have an lccn, but it's not an error if we don't find one
    lccn_orig = _extract(record, '010', 'a')
    lccn = _normal_lccn(lccn_orig)
    if not lccn:
        #_logger.info("###### LCCN in OCLC pull, \
        #    but not in database. Missing LCCN. ######")
        #_logger.info(record)
        self.missing_lccns += 1
        return

    # newer marc xml sets pulled from OCLC do not have the 005 control
    # field. 005 is the date and time of the last transaction.
    try:
        s = _extract(record, '005')
        parts = s.split(".")
        dt = datetime.datetime(*strptime(parts[0], '%Y%m%d%H%M%S')[0:6])
    except AttributeError:
        dt = datetime.datetime.now()
    #dt.replace(microsecond=int(parts[1]))

    # it's remotely possible that a title with the LCCN already exists
    try:
        title = models.Title.objects.get(lccn=lccn)
        _logger.debug("Found another record for lccn: %s" % lccn)
        if title.version == dt:
            _logger.debug("  with the same timestamp: %s" % title.version)
            return  # skip over this record with same timestamp
        elif title.version < dt:
            _logger.debug("  with newer timestamp: %s vs %s" % (title.version, dt))
            title.version = dt
            self.records_updated += 1
        elif title.version > dt:
            _logger.debug("  with older timestamp: %s vs %s" % (title.version, dt))
            return  # skip over older record
        else:
            _logger.error("Logic error... this should be unreachable.")
    except models.Title.DoesNotExist:
        self.records_created += 1
        title = models.Title(lccn=lccn)
        title.version = dt

    # clear m2m relationships; these will come from the extraction
    title.subjects.clear()
    title.languages.clear()
    title.places.clear()

    # delete fk relationships; these will come from the extraction
    title.publication_dates.all().delete()
    title.notes.all().delete()
    title.alt_titles.all().delete()
    title.succeeding_title_links.all().delete()
    title.preceeding_title_links.all().delete()
    title.related_title_links.all().delete()
    title.urls.all().delete()

    # update title fields
    self._set_name(record, title)
    title.lccn_orig = lccn_orig
    title.oclc = self._extract_oclc(record)
    title.edition = _extract(record, '250', 'a')
    title.publisher = _extract(record, '260', 'b')
    title.frequency = _extract(record, '310', 'a')
    title.frequency_date = _extract(record, '310', 'b')
    title.uri = _extract(record, '856', 'u')

    # rda records use 264$a, fallback to 260$a
    title.place_of_publication = _extract(record, '264', 'a')
    if not title.place_of_publication:
        title.place_of_publication = _extract(record, '260', 'a')

    # rda records use 338$a, fallback to 245$h
    title.medium = _extract(record, '338', 'a')
    if not title.medium:
        title.medium = _extract(record, '245', 'h')

    title.issn = _extract(record, '022', 'a')
    f008 = record['008'].data
    title.start_year = _normal_year(f008[7:11])
    title.end_year = _normal_year(f008[11:15])
    title.country = self._extract_country(record)
    title.save()

    # update fk relationships with new values
    self._extract_languages(record, title)
    self._extract_places(record, title)
    self._extract_publication_dates(record, title)
    self._extract_subjects(record, title)
    self._extract_notes(record, title)
    self._extract_preceeding_titles(record, title)
    self._extract_succeeding_titles(record, title)
    self._extract_related_titles(record, title)
    self._extract_alt_titles(record, title)
    self._extract_urls(record, title)
    title.save()

    marc, marc_created = models.MARC.objects.get_or_create(title=title)
    marc.xml = record_to_xml(record)
    marc.save()

    # for context see: https://rdc.lctl.gov/trac/ndnp/ticket/375
    if _is_chronam_electronic_resource(title, record):
        _logger.info(
            "deleting title record for chronam electronic resource: %s" % title)
        title.delete()

    # this is for long running processes so the query cache
    # doesn't bloat memory
    reset_queries()
    return title
def process_rec(rec, type):
    rec_orig = deepcopy(rec)
    dup_num = False
    no_880_rec = False
    missing_key_880_rec = False
    unlinked_880_rec = False
    # string variable to collect individual analysis messages for each record
    indiv_rec_analysis_msg = ''
    # either 'OCLC' or the partner's institution code from the 003 field
    rec_003_value = rec.get_fields('003')[0].value()
    # either the OCLC number or the inst_id from the 001 field
    rec_001_value = rec.get_fields('001')[0].value()

    if type == 'oclc':
        ################################################
        # Check for duplicate OCLC record in batch
        for num in aco_globals.oclc_nums_processed:
            if rec_001_value == num:
                dup_num = True
        if not dup_num:
            aco_globals.oclc_nums_processed.add(rec_001_value)

    if type == 'orig' or not dup_num:
        ################################################
        # Add institutional ID and OCLC number to 999 field
        rec_orig, rec, oclc_id, inst_id, oclc_match, msg_1 = process_001_003_fields(
            rec_orig, rec, aco_globals.oclc_nums_bsns_all)
        indiv_rec_analysis_msg += msg_1
        if not oclc_match:
            aco_globals.recs_no_oclc_match_count += 1

        ################################################
        # Check if record is missing any 880 script fields
        script_rec, missing_key_880s, msg_2 = check_for_missing_880s(rec)
        indiv_rec_analysis_msg += msg_2
        if not script_rec:
            no_880_rec = True
        else:
            aco_globals.recs_880s_count += 1
        if missing_key_880s:
            missing_key_880_rec = True

        ################################################
        # Check if record has any unlinked 880 fields
        # (having "00" in the 880 $6 numbering)
        unlinked_exist, msg_3 = check_for_unlinked_880s(rec)
        indiv_rec_analysis_msg += msg_3
        if unlinked_exist:
            unlinked_880_rec = True

        ################################################
        # Check if record has any untraced 490 fields without
        # corresponding 8XX fields
        msg_4 = check_series_hdgs(rec)
        indiv_rec_analysis_msg += msg_4

        ################################################
        # Check if record contains RDA fields
        rda_rec, msg_5 = check_if_rda(rec)
        indiv_rec_analysis_msg += msg_5
        if rda_rec:
            aco_globals.recs_rda_count += 1

        ################################################
        # Check if record contains bad encoding script character
        # (black diamond around question-mark)
        # Evidenced by presence of Python source code u"\uFFFD"
        # (See: http://www.fileformat.info/info/unicode/char/0fffd/index.htm)
        repl_char, msg_6 = check_repl_char(rec)
        indiv_rec_analysis_msg += msg_6
        if repl_char:
            aco_globals.recs_repl_char_count += 1

        ################################################
        # Add/Delete/Modify MARC fields in print record to convert to an
        # e-resource record
        rec, msg_7 = convert_2_eres_rec(rec, rda_rec)
        indiv_rec_analysis_msg += msg_7

        ################################################
        # Sort any $6 subfields that do not appear first in the field
        rec = sort_6_subs(rec)
        rec, msg_8 = second_sort_6_check(rec)
        indiv_rec_analysis_msg += msg_8

        ################################################
        # Match the 001/003 fields and insert the corresponding URL handle
        # in an 856 field
        rec, msg_9 = insert_url(rec, aco_globals.handles_lines)
        indiv_rec_analysis_msg += msg_9

        ################################################
        # Match the BSNs and insert the corresponding SE (source entity)
        # book IDs into the 999 field
        rec, msg_10 = insert_src_entities(rec, aco_globals.bsn_se_lines)
        indiv_rec_analysis_msg += msg_10

        ################################################
        # Change LDR values
        ldr = list(rec.leader)
        ldr[5] = 'n'
        ldr[6] = 'a'
        ldr[7] = 'm'
        #ldr[9] = 'a'
        rec.leader = ''.join(ldr)

        ################################################
        # Remove any existing 999 $e subfields and Add new 999 $e subfield
        # with error type codes
        # -- NOTE: adding the field to the rec_orig (deep copy of rec)
        #    seems to also add to rec...??
        add_999e = False
        rec_orig_999s = rec_orig.get_fields('999')
        if len(rec_orig_999s) == 0:
            indiv_rec_analysis_msg += 'ERROR-MISC: The 999 field did not get added to the original record during processing\n'
        elif len(rec_orig_999s) > 1:
            indiv_rec_analysis_msg += 'ERROR-MISC: Original record contains multiple 999 fields\n'
        elif len(rec_orig_999s) == 1:
            if len(rec_orig.get_fields('999')[0].get_subfields('e')) > 0:
                for rec_orig_999e in rec_orig.get_fields('999')[0].get_subfields('e'):
                    rec_orig.get_fields('999')[0].delete_subfield('e')
            add_999e = True

        # rec_999s = rec.get_fields('999')
        # if len(rec_999s) == 0:
        #     indiv_rec_analysis_msg += 'ERROR-MISC: The 999 field did not get added to the converted record during processing\n'
        # elif len(rec_999s) > 1:
        #     indiv_rec_analysis_msg += 'ERROR-MISC: Converted record contains multiple 999 fields\n'
        # elif len(rec_999s) == 1:
        #     if len(rec.get_fields('999')[0].get_subfields('e')) > 0:
        #         for rec_999e in rec.get_fields('999')[0].get_subfields('e'):
        #             rec.get_fields('999')[0].delete_subfield('e')
        #     add_999e = True

        if add_999e:
            error_types = ''
            if 'ERROR-880' in indiv_rec_analysis_msg:
                error_types += '(ERROR-880)'
            if 'ERROR-SERIES' in indiv_rec_analysis_msg:
                error_types += '(ERROR-SERIES)'
            if 'ERROR-MISC' in indiv_rec_analysis_msg:
                error_types += '(ERROR-MISC)'
            if not error_types == '':
                rec_orig.get_fields('999')[0].add_subfield('e', error_types)

        indiv_rec_analysis_msg += '---------------------------------------------------------------------\n'

        ################################################
        # Write out ERROR message and MARC records depending on status
        if no_880_rec:
            aco_globals.recs_no_880s_count += 1
            aco_globals.marcRecsOut_no_880s.write(rec_orig)
            aco_globals.recs_no_880s_txt.write(indiv_rec_analysis_msg)
        if missing_key_880_rec:
            aco_globals.recs_missing_key_880s_count += 1
            aco_globals.marcRecsOut_missing_key_880s.write(rec_orig)
            aco_globals.recs_missing_key_880s_txt.write(indiv_rec_analysis_msg)
        if unlinked_880_rec:
            aco_globals.recs_unlinked_880s_count += 1
            aco_globals.marcRecsOut_unlinked_880s.write(rec_orig)
            aco_globals.recs_unlinked_880s_txt.write(indiv_rec_analysis_msg)
        if 'ERROR-SERIES' in indiv_rec_analysis_msg:
            aco_globals.recs_series_errors_count += 1
            aco_globals.marcRecsOut_series_errors.write(rec_orig)
            aco_globals.recs_series_errors_txt.write(indiv_rec_analysis_msg)
        if 'ERROR-MISC' in indiv_rec_analysis_msg:
            aco_globals.recs_misc_errors_count += 1
            aco_globals.marcRecsOut_misc_errors.write(rec_orig)
            aco_globals.recs_misc_errors_txt.write(indiv_rec_analysis_msg)
        if 'ERROR' in indiv_rec_analysis_msg:
            aco_globals.recs_errors_all_count += 1
            aco_globals.marcRecsOut_errors_all.write(rec_orig)
            aco_globals.recs_errors_all_txt.write(indiv_rec_analysis_msg)
        else:
            aco_globals.marcRecsOut_final_subset.write(rec)
            aco_globals.recs_final_this_subset_count += 1
            aco_globals.marcRecsOut_final_all.write(rec)

        ################################################
        # Write out individual .mrc record
        try:
            os.makedirs(aco_globals.batch_folder + '/mrc_out')
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        indiv_marcRecOut = pymarc.MARCWriter(
            file(aco_globals.batch_folder + '/mrc_out/' + inst_id + '_mrc.mrc', 'w'))
        indiv_marcRecOut.write(rec)
        indiv_marcRecOut.close()

        ################################################
        # Convert MARC to MARCXML and write out individual MARCXML record
        # creates marcxml but all tags in a single line with no line breaks
        rec_xml = pymarc.record_to_xml(rec, namespace=True)
        # parses the single line of xml
        pretty_rec_xml = xml.dom.minidom.parseString(rec_xml)
        # creates the correct indentations and line breaks, but adds extra
        # line breaks within content tags
        pretty_rec_xml = pretty_rec_xml.toprettyxml(encoding='utf-8')
        # regular expression for removing extra line breaks in content
        pretty_xml_re = re.compile('>\n\s+([^<>\s].*?)\n\s+</', re.DOTALL)
        # applying the reg ex to remove extra line breaks in content
        pretty_rec_xml = pretty_xml_re.sub('>\g<1></', pretty_rec_xml)
        try:
            os.makedirs(aco_globals.batch_folder + '/marcxml_out')
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        indiv_marcRecOut_xml = codecs.open(
            aco_globals.batch_folder + '/marcxml_out/' + inst_id + '_marcxml.xml', 'w')
        indiv_marcRecOut_xml.write(pretty_rec_xml)
        indiv_marcRecOut_xml.close()

        aco_globals.indiv_rec_analysis_msgs += indiv_rec_analysis_msg
    return dup_num
def _xml(record):
    return pymarc.record_to_xml(record)
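# Note: under Python 3, pymarc 3.x's record_to_xml() returns bytes, which is
# why several snippets above call .decode('utf-8') on its result. Below is a
# minimal sketch of a wrapper that always returns text; the helper name
# _xml_text is illustrative, not part of pymarc.
import pymarc

def _xml_text(record):
    xml = pymarc.record_to_xml(record)
    if isinstance(xml, bytes):
        # pymarc 3+ serializes to bytes; decode for str contexts
        xml = xml.decode('utf-8')
    return xml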
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs

import pymarc

input = 'records_in.mrc'
output = 'records_out.xml'

reader = pymarc.MARCReader(open(input, 'rb'), to_unicode=True)
writer = codecs.open(output, 'w', 'utf-8')
for record in reader:
    # set leader position 9 to 'a' so the record is flagged as UTF-8
    record.leader = record.leader[:9] + 'a' + record.leader[10:]
    writer.write(pymarc.record_to_xml(record) + "\n")
""" Base script for DLF Forum 2014 Listening-Based Python workshop. Modified from files at https://github.com/LibraryCodeYearIG/MARC-record-edit . """ import os from pymarc import Field, MARCReader, MARCWriter, record_to_xml records = MARCReader(open('../../exampledump.mrc'), to_unicode=True, force_utf8=True, utf8_handling='ignore') index = 1 for marc in records: filename_dat = 'examplerecord_%s.dat' % index filename_xml = 'examplerecord_%s.xml' % index writer_dat = MARCWriter(file(filename_dat,'a')) writer_xml = open(filename_xml,'a') writer_dat.write(marc) writer_xml.write(record_to_xml(marc) + "\n") writer_dat.close() writer_xml.close() index += 1
def extract(marc_filepath):
    """Takes a MARC21 file, iterates through each MARC record and
    yields MARC XML"""
    reader = pymarc.MARCReader(open(marc_filepath, "rb"), to_unicode=True)
    for record in reader:
        yield pymarc.record_to_xml(record, namespace=True)
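# A possible driver for the extract() generator above, wrapping the yielded
# records in a <collection> element. The 'batch.mrc' and 'batch.xml' paths
# are placeholders, not files from the original project; record_to_xml()
# yields bytes under Python 3, hence the binary file mode.
with open("batch.xml", "wb") as out:
    out.write(b'<collection xmlns="http://www.loc.gov/MARC21/slim">')
    for xml in extract("batch.mrc"):
        out.write(xml)
    out.write(b'</collection>')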
import xml.etree.ElementTree as et

from pymarc import record_to_xml

def convert_mrc_record_to_xml(my_marc):
    """Wrap a single pymarc record in a <collection> element."""
    root = et.Element("collection")
    xml_record = record_to_xml(my_marc)
    root.append(et.fromstring(xml_record))
    return root
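# Hypothetical usage of the helper above: read one record from a placeholder
# file (borrowed from the earlier snippets) and print the wrapped collection.
import pymarc

with open('records_in.mrc', 'rb') as fh:
    rec = next(pymarc.MARCReader(fh))
collection = convert_mrc_record_to_xml(rec)
print(et.tostring(collection, encoding='unicode'))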