def break_up_record( start_record=0, end_record=0 ):
    """ Splits big marc file into smaller files.

        Copies records `start_record` through `end_record` (1-based, inclusive)
        from settings.INPUT_FILEPATH to settings.OUTPUT_FILEPATH.
        An `end_record` of 0 means "no upper limit" -- previously the default of
        0 made the loop break right after the first written record, which was
        almost certainly unintended.
        This can successfully re-write the whole errant `rec_19.mrc` file. """
    log.debug( 'start_record, `{st}`; end_record, `{en}`'.format( st=start_record, en=end_record ) )
    BIG_MARC_FILEPATH = settings.INPUT_FILEPATH
    SMALLER_OUTPUT_FILEPATH = settings.OUTPUT_FILEPATH
    # (fixed previously unbalanced backticks in the two log templates below)
    log.debug( 'processing file, ``{}``'.format(BIG_MARC_FILEPATH) )
    log.debug( 'output file, ``{}``'.format(SMALLER_OUTPUT_FILEPATH) )
    start_time = datetime.datetime.now()
    count = 0
    with open( BIG_MARC_FILEPATH, 'rb' ) as input_fh:
        # `utf8_handling='ignore'` skips undecodable bytes instead of raising.
        reader = pymarc.MARCReader( input_fh, to_unicode=True, utf8_handling='ignore' )  # works!
        with open( SMALLER_OUTPUT_FILEPATH, 'wb' ) as output_fh:
            writer = pymarc.MARCWriter( output_fh )
            for record in reader:
                count += 1
                if count % 10000 == 0:
                    # progress indicator for long runs
                    print( '`{}` records processed'.format(count) )
                if count >= start_record:
                    writer.write( record )
                if end_record and count >= end_record:  # 0 == no upper limit
                    break
    end_time = datetime.datetime.now()
    log.debug( 'records processed, `{}`'.format(count) )
    log.debug( 'time_taken, `{}`'.format(end_time - start_time) )
def file_create(record_type, parametres):
    """Create the output file: tab-separated text (3), MARCXML (2), or ISO 2709.

    Returns an open text handle for formats 2 and 3, or a MARCWriter otherwise.
    """
    stem = "-".join([parametres["outputID"], record_type])
    fmt = parametres["format_file"]
    if fmt == 3:
        # Tab-separated report, seeded with a header row.
        out = open(stem + ".txt", "w", encoding="utf-8")
        headers = ["Numéro de notice", "Type de notice"] + parametres["select_fields"].split(";")
        funcs.line2report(headers, out, display=False)
    elif fmt == 2:
        # MARCXML, honouring an optional caller-supplied encoding.
        output_encoding = parametres.get("xml_encoding_option", "utf-8")
        out = open(stem + ".xml", "w", encoding=output_encoding)
        out.write(f"<?xml version='1.0' encoding='{output_encoding}'?>\n")
        out.write("<collection>")
    else:
        # Binary ISO 2709 via pymarc's writer.
        out = mc.MARCWriter(open(stem + ".iso2709", "wb"))
    return out
def break_up_record( start_record=0, end_record=0 ):
    """ Splits big marc file into smaller files.

        Reads the file named by the PYMARC_EXP__BIG_MARC_FILEPATH env-var and
        writes records `start_record` through `end_record` to the file named by
        PYMARC_EXP__SMALLER_OUTPUT_MARC_FILEPATH, logging (and skipping) any
        record that fails to parse.
        This can successfully re-write the whole errant `rec_19.mrc` file. """
    log.debug( 'start_record, `{st}`; end_record, `{en}`'.format( st=start_record, en=end_record ) )
    BIG_MARC_FILEPATH = os.environ['PYMARC_EXP__BIG_MARC_FILEPATH']
    SMALLER_OUTPUT_FILEPATH = os.environ['PYMARC_EXP__SMALLER_OUTPUT_MARC_FILEPATH']
    log.debug( 'processing file, ``{}``'.format(BIG_MARC_FILEPATH) )
    log.debug( 'output file, ``{}``'.format(SMALLER_OUTPUT_FILEPATH) )
    start_time = datetime.datetime.now()
    count = 0
    last_record = 'init'  # kept for post-mortem debugging of a bad record
    with open( BIG_MARC_FILEPATH, 'rb' ) as input_fh:
        reader = pymarc.MARCReader( input_fh, to_unicode=True, force_utf8=True, utf8_handling='ignore' )
        with open( SMALLER_OUTPUT_FILEPATH, 'wb' ) as output_fh:
            writer = pymarc.MARCWriter( output_fh )
            processing_flag = True
            while processing_flag is True:
                try:
                    record = next(reader)
                except StopIteration:
                    # BUGFIX: StopIteration previously fell into the generic
                    # handler below, so an `end_record` beyond the file's actual
                    # record count looped forever logging errors.  A drained
                    # reader now simply ends the run.
                    break
                except Exception as e:
                    record = None
                    # BUGFIX: `unicode(...)` is python2-only and would raise a
                    # NameError inside this handler on python3; repr() suffices.
                    log.error( 'exception looping through records; ```{}```'.format( repr(e) ) )
                    log.error( 'e info, ```{}```'.format(e) )
                    e_type, e_value, e_traceback = sys.exc_info()  # <http://stackoverflow.com/a/15890953>
                    log.error( 'e_type, ```{}``'.format(e_type) )
                    log.error( 'e_value, ```{}``'.format(e_value) )
                    log.error( 'e_traceback, ```{}```'.format(e_traceback) )
                    log.error( 'traceback info, ```{}```'.format( traceback.format_exc() ) )
                    log.error( 'current count, `{}`'.format(count) )
                last_record = record
                count += 1
                if count % 10000 == 0:
                    print( '`{}` records processed'.format(count) )
                if count >= start_record:
                    log.debug( 'count, `{}`'.format(count) )
                    if record:  # skip records the reader failed to parse
                        log.debug( 'count is, `{cnt}`, so will write record.as_json()[0:100], ```{rcd}```'.format( cnt=count, rcd=record.as_json()[0:100] ) )
                        writer.write( record )
                if count >= end_record:
                    processing_flag = False
    end_time = datetime.datetime.now()
    log.debug( 'records processed, `{}`'.format(count) )
    log.debug( 'time_taken, `{}`'.format(end_time - start_time) )
def get_resource():
    """Write each row's JSON-encoded MARC record to <out-path>/marc.dat and
    yield the row unchanged for downstream consumers.

    Reads from the module-level `resources` iterable and `parameters` dict.
    The output file is now closed when the generator finishes or is abandoned
    -- previously the handle (and any buffered data) leaked.
    """
    os.makedirs(parameters["out-path"], exist_ok=True)
    dat_file = open(parameters["out-path"] + "/{}.dat".format('marc'), 'wb')
    marc_writer = pymarc.MARCWriter(dat_file)
    try:
        for row in next(resources):
            # Each row carries one record as a JSON string; JSONReader expects
            # a JSON array, so wrap the (re-serialized) object in brackets.
            json_records_string = '[' + json.dumps(json.loads(row["json"])) + ']'
            for record in pymarc.JSONReader(json_records_string):
                marc_writer.write(record)
            yield row
    finally:
        # BUGFIX: close the writer (and underlying file) even if the consumer
        # stops iterating early.
        marc_writer.close()
def test_close_true(self):
    """By default (close_fh true) closing the writer closes the handle too."""
    handle = BytesIO()
    self.assertFalse(handle.closed, "The file handle should be open")
    marc_writer = pymarc.MARCWriter(handle)
    self.assertFalse(handle.closed, "The file handle should still be open")
    marc_writer.close()
    self.assertTrue(handle.closed, "The file handle should close when the writer closes")
def update_marc_file(infile, outfile, cdlpath):
    """Add cdlpath info (a 941 field) to all the MARC records in a file.

    Reads `infile`, appends a 941$a field with `cdlpath` to every record, and
    writes the result to `outfile`.  Records that raise UnicodeDecodeError on
    write are reported and skipped.
    """
    # open MARC file for reading -- binary mode, since MARC is a binary format
    # (BUGFIX: was the python2-only ``file(infile)`` in text mode)
    reader = pymarc.MARCReader( open(infile, 'rb'), to_unicode=True, force_utf8=True, utf8_handling='ignore' )
    # keep the new file in memory
    string = StringIO.StringIO()
    writer = pymarc.MARCWriter(string)
    # main loop through all the records
    count = 0
    for record in reader:
        count += 1
        # create new MARC field and add it to the record
        field = pymarc.Field(
            tag='941',
            indicators=['0', '1'],
            subfields=['a', cdlpath])
        record.add_field(field)
        try:
            # try to write the record
            writer.write(record)
        except UnicodeDecodeError as inst:
            # catch Unicode errors and report which record failed
            title = ''
            recordId = ''
            if record['245'] is not None:
                title = record['245']
            if record['001'] is not None:
                recordId = record['001']
            print("--- error with record %s %s" % (count, recordId))
            print("leader9 = %s" % record.leader[9])
            print(title)
            print(inst)
    # BUGFIX: write the buffer straight to the output file instead of
    # redirecting sys.stdout and print-ing it -- the old approach left
    # sys.stdout pointing at the file for the rest of the process, appended a
    # spurious trailing newline (corrupting the binary MARC data), and never
    # closed `out`.  Binary mode keeps the records byte-exact.
    out = open(outfile, mode="wb")
    try:
        out.write(string.getvalue())
    finally:
        out.close()
    string.close()
def test_close_false(self):
    """Passing close_fh=False keeps the underlying handle open."""
    handle = BytesIO()
    self.assertFalse(handle.closed, "The file handle should be open")
    marc_writer = pymarc.MARCWriter(handle)
    self.assertFalse(handle.closed, "The file handle should still be open")
    marc_writer.close(close_fh=False)
    self.assertFalse(
        handle.closed,
        "The file handle should NOT close when the writer closes",
    )
def get_resource(resource):
    """Split rows into per-query binary MARC files.

    Rows are grouped by their 'first_ccl_query' value: whenever it changes, the
    current .dat file is closed and a new one (named by get_dat_file_name) is
    started.  Each row's 'json' payload is converted to a MARC record and
    written; rows are yielded unchanged for downstream use.
    """
    last_ccl_query, dat_file, marc_writer = None, None, None
    try:
        for row in resource:
            if row['first_ccl_query'] != last_ccl_query:
                if dat_file:
                    dat_file.close()
                dat_file = open(get_dat_file_name(row), 'wb')
                marc_writer = pymarc.MARCWriter(dat_file)
                last_ccl_query = row['first_ccl_query']
            # a writer must exist before any record is written
            # (pre-initialized to None above, so this is a clean AssertionError
            # rather than the NameError the original could raise)
            assert marc_writer
            for record in pymarc.JSONReader(json.dumps([json.loads(row['json'])])):
                marc_writer.write(record)
            yield row
    finally:
        # BUGFIX: the final (or only) output file was never closed.
        if dat_file:
            dat_file.close()
def convert_xml_to_marc(hostenv):
    """Convert MARC XML to MARC formatted .mrc file.

    Scans the configured marc_dir for *.xml files, converting each
    "<name>-orig.xml" into a binary "<name>-marc.mrc" alongside it.
    """
    marc_dir = app_configs[hostenv]['marc_dir']
    for marcfilename in os.listdir(marc_dir):
        if marcfilename.endswith('xml'):
            newfilename = re.sub("-orig.xml", "-marc.mrc", marcfilename)
            logging.info("Converting to MARC %s", marcfilename)
            # BUGFIX: build both paths with os.path.join -- the input path was
            # previously bare concatenation (no separator) and only worked when
            # marc_dir happened to end with a slash.
            marc_recs_out = pymarc.MARCWriter(
                open(os.path.join(marc_dir, newfilename), 'wb'))
            marc_xml_array = pymarc.parse_xml_to_array(
                os.path.join(marc_dir, marcfilename))
            for rec in marc_xml_array:
                marc_recs_out.write(rec)
            marc_recs_out.close()
def file_create(record_type, parametres):
    """Open the output file for `record_type`: MARCXML (format 2) or ISO 2709.

    For MARCXML the root element is opened with every namespace registered in
    main.ns declared on it; the caller is responsible for closing the element.
    """
    stem = "-".join([parametres["outputID"], record_type])
    if parametres["format_file"] == 2:
        out = open(stem + ".xml", "w", encoding="utf-8")
        out.write("<?xml version='1.0'?>\n")
        out.write("<mxc:collection ")
        # declare each registered namespace prefix on the root element
        for prefix in main.ns:
            out.write(' xmlns:' + prefix + '="' + main.ns[prefix] + '"')
        out.write(">\n")
    else:
        out = mc.MARCWriter(open(stem + ".iso2709", "wb"))
    return out
def test_write(self):
    """Write a record off to a file, read it back in, then clean up."""
    # BUGFIX: open with open() in binary mode -- MARC is a binary format, and
    # the python2-only ``file(...)`` in text mode corrupts data on Windows and
    # fails outright on python3.
    writer = pymarc.MARCWriter(open('test/writer-test.dat', 'wb'))
    record = pymarc.Record()
    field = pymarc.Field('245', ['0', '0'], ['a', 'foo'])
    record.add_field(field)
    writer.write(record)
    writer.close()
    # read it back in (builtin next() instead of the py2-only .next())
    reader = pymarc.MARCReader(open('test/writer-test.dat', 'rb'))
    record = next(reader)
    # remove it
    os.remove('test/writer-test.dat')
def test_copy_utf8(self):
    """Copy a UTF-8 MARC-XML record field-by-field into a new Record and
    write it out as binary MARC."""
    writer = pymarc.MARCWriter(open('test/write-utf8-test.dat', 'wb'))
    new_record = pymarc.Record(to_unicode=True, force_utf8=True)

    def process_xml(record):
        # copy leader and every field from the parsed record into new_record
        new_record.leader = record.leader
        for field in record.get_fields():
            new_record.add_field(field)

    pymarc.map_xml(process_xml, 'test/utf8.xml')
    try:
        writer.write(new_record)
    finally:
        # BUGFIX: close in the cleanup path too -- previously a failed write()
        # skipped close() and leaked the file handle.
        writer.close()
        # remove the scratch file
        os.remove('test/write-utf8-test.dat')
def test_write(self):
    """Round-trip a single record through a file on disk."""
    out_path = "test/writer-test.dat"
    file_handle = open(out_path, "wb")
    writer = pymarc.MARCWriter(file_handle)
    record = pymarc.Record()
    record.add_field(pymarc.Field("245", ["0", "0"], ["a", "foo"]))
    writer.write(record)
    writer.close()
    self.assertTrue(file_handle.closed, "The file handle should close when the writer closes")
    # read it back in, then delete the scratch file
    reader = pymarc.MARCReader(open(out_path, "rb"))
    next(reader)
    reader.close()
    os.remove(out_path)
def test_write(self):
    """Write one record out, verify the handle closes, and read it back."""
    path = 'test/writer-test.dat'
    handle = open(path, 'wb')
    writer = pymarc.MARCWriter(handle)
    record = pymarc.Record()
    record.add_field(pymarc.Field('245', ['0', '0'], ['a', 'foo']))
    writer.write(record)
    writer.close()
    self.assertTrue(handle.closed, 'The file handle should close when the writer closes')
    # read it back in
    reader = pymarc.MARCReader(open(path, 'rb'))
    next(reader)
    reader.close()
    # remove the scratch file
    os.remove(path)
def imslp_tarball_to_marc(tarball, outputfile=None, legacy_mapping=None, max_failures=30):
    """
    Convert an IMSLP tarball to MARC binary output file without extracting it.
    If outputfile is not given, write to a temporary location.

    Returns the location of the resulting MARC file.

    A maximum number of failed conversions can be specified with
    `max_failures`, as of 2018-04-25, there were 30 records w/o title.
    """
    if outputfile is None:
        _, outputfile = tempfile.mkstemp(prefix="siskin-")

    stats = collections.Counter()

    with open(outputfile, "wb") as output:
        writer = pymarc.MARCWriter(output)
        with tarfile.open(tarball) as tar:
            for member in tar.getmembers():
                fobj = tar.extractfile(member)
                if fobj is None:
                    # BUGFIX: extractfile() returns None for directories and
                    # special members; previously that crashed on .read() (and
                    # again on fobj.close() in the finally clause).
                    continue
                try:
                    record = imslp_xml_to_marc(fobj.read(), legacy_mapping=legacy_mapping)
                    writer.write(record)
                except ValueError as exc:
                    logger.warn("conversion failed: %s", exc)
                    stats["failed"] += 1
                finally:
                    fobj.close()
                stats["processed"] += 1
        writer.close()

    if stats["failed"] > max_failures:
        logger.warn("%d records failed, only %d failures allowed", stats["failed"], max_failures)
        # BUGFIX: RuntimeError does not support logging-style lazy arguments;
        # the old call raised with an unformatted (template, arg) tuple.
        raise RuntimeError("more than %d records failed" % max_failures)

    logger.debug("%d/%d records failed/processed", stats["failed"], stats["processed"])
    return outputfile
def run(self):
    """ Iterate over all zipfiles in reverse, convert and concat binary marc
    into tempfile, then deduplicate.

    Newest records win: files are processed newest-first and only the first
    occurrence of each 001 identifier is kept; identifiers on the deletion
    list are dropped entirely.
    """
    # Load all deletions into set.
    deleted = set()
    deldir = os.path.dirname(self.input().get('deletions').path)
    for path in sorted(iterfiles(deldir), reverse=True):
        with open(path) as handle:
            for i, line in enumerate(handle, start=1):
                line = line.strip()
                if len(line) > 20:
                    # identifiers are expected to be short; flag outliers
                    self.logger.warn("suspicious id: %s", line)
                deleted.add(line)

    # Load updates.
    pattern = re.compile(r'^date-[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}.zip$')
    datadir = os.path.dirname(self.input().get('data').path)

    # Combine all binary MARC records in this file.
    _, combined = tempfile.mkstemp(prefix='siskin-')
    for path in sorted(iterfiles(datadir), reverse=True):
        filename = os.path.basename(path)
        if not pattern.match(filename):
            self.logger.warn("ignoring invalid filename: %s", path)
            continue
        if os.stat(path).st_size < 22:
            # smaller than an empty zip's end-of-central-directory record
            self.logger.warn("ignoring possibly empty zip file: %s", path)
            continue
        with zipfile.ZipFile(path) as zf:
            for name in zf.namelist():
                with zf.open(name) as handle:
                    # copy the member to a real file so yaz-marcdump can read it
                    with tempfile.NamedTemporaryFile(delete=False) as dst:
                        shutil.copyfileobj(handle, dst)
                    shellout("yaz-marcdump -i marcxml -o marc {input} >> {output}",
                             input=dst.name,
                             output=combined,
                             ignoremap={5: 'expected error from yaz'})
                    os.remove(dst.name)

    # Finally, concatenate initial dump.
    shellout("cat {input} >> {output}", input=self.input().get('dump').path, output=combined)

    # Already seen identifiers.
    seen = set()
    with self.output().open('wb') as output:
        writer = pymarc.MARCWriter(output)
        # Iterate over MARC records (which are newest to oldest, keep track of
        # seen identifiers).
        # BUGFIX: open the combined MARC file in binary mode -- MARC is a
        # binary format; text mode breaks on python3 and on Windows.
        with open(combined, 'rb') as handle:
            reader = pymarc.MARCReader(handle, force_utf8=True, to_unicode=True)
            for record in reader:
                field = record["001"]
                if not field:
                    self.logger.debug("missing identifier")
                    continue
                record_id = field.value()  # renamed from builtin-shadowing `id`
                if record_id in seen:
                    self.logger.debug("skipping duplicate: %s", record_id)
                    continue
                if record_id in deleted:
                    self.logger.debug("skipping deleted: %s", record_id)
                    continue
                self.logger.debug("adding %s", record_id)
                writer.write(record)
                seen.add(record_id)

    self.logger.debug(
        "found %s unique records (deletion list contained %s ids)",
        len(seen), len(deleted))
    os.remove(combined)
# NOTE(review): truncated fragment -- the opening ``try:`` for the first block
# precedes this excerpt; indentation below is reconstructed to match.
# Best-effort load of the handles CSV; an empty string marks "not available".
    handles_csv = open(aco_globals.batch_folder+'/handles.csv', 'r')
    aco_globals.handles_lines = handles_csv.readlines()
    handles_csv.close()
except:
    handles_csv = ''
# retrieve the CSV file containing the BSNs and source entity (SE) book numbers
try:
    bsn_se_csv = open(aco_globals.batch_folder+'/bsn-se-map.csv', 'r')
    aco_globals.bsn_se_lines = bsn_se_csv.readlines()
    bsn_se_csv.close()
except:
    bsn_se_csv = ''
# OUTPUT FILES
# NOTE(review): ``file(...)`` is python2-only, and the MARC outputs are opened
# in text mode ('w') -- fine on py2/Unix, unsafe elsewhere.
output_folder = aco_globals.batch_folder+'/'+batch_name+'_3'
aco_globals.marcRecsOut_errors_all = pymarc.MARCWriter(file(output_folder+'/'+batch_name+'_3_errors_all.mrc', 'w'))
aco_globals.recs_errors_all_txt = codecs.open(output_folder+'/'+batch_name+'_3_errors_all.txt', 'w', encoding='utf8')
# Seed the error report with an explanatory header.
aco_globals.recs_errors_all_txt.write('ALL Records containing any type of error - batch '+batch_name+'\n')
aco_globals.recs_errors_all_txt.write('-- Each of these records have one or more of the following errors:\n')
aco_globals.recs_errors_all_txt.write(' -- no 880 fields\n')
aco_globals.recs_errors_all_txt.write(' -- missing a key 880 field\n')
aco_globals.recs_errors_all_txt.write(' -- have an unlinked 880 field\n')
aco_globals.recs_errors_all_txt.write(' -- have a series heading error in the 490/800/810/811/830 fields\n')
aco_globals.recs_errors_all_txt.write(' -- have one of the various miscellaneous errors, marked with ERROR-MISC\n')
aco_globals.recs_errors_all_txt.write('Report produced: '+aco_globals.curr_time+'\n')
all_recs_analysis_txt = codecs.open(output_folder+'/'+batch_name+'_3_all_recs_analysis.txt', 'w', encoding='utf8')
# Writers for the per-batch and combined "final records" MARC outputs.
aco_globals.marcRecsOut_final_subset = pymarc.MARCWriter(file(output_folder+'/'+batch_name+'_3_final_recs.mrc', 'w'))
aco_globals.marcRecsOut_final_all = pymarc.MARCWriter(file(aco_globals.batch_folder+'/'+batch_name+'_4_final_recs.mrc', 'w'))
import marcx
import pandas
import pymarc

# Suppress noisy numpy binary-compatibility warnings triggered by pandas.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# Default filenames; overridden when exactly two CLI arguments are given.
inputfilename = "160_input.csv"
outputfilename = "160_output.mrc"

if len(sys.argv) == 3:
    inputfilename, outputfilename = sys.argv[1:]

outputfile = io.open(outputfilename, "wb")
writer = pymarc.MARCWriter(outputfile)

# Source data is semicolon-separated latin-1 CSV.
csv_records = pandas.read_csv(inputfilename, encoding="latin-1", sep=";")

# NOTE(review): excerpt is truncated -- the loop body continues past this chunk.
for csv_record in csv_records.iterrows():
    csv_record = csv_record[1]  # iterrows() yields (index, row); keep the row

    marc_record = marcx.Record(force_utf8=True)
    # NOTE(review): the leader's spacing looks collapsed in this excerpt --
    # confirm the intended fixed-width MARC leader against the original file.
    marc_record.leader = " nam 22 4500"

    # record id, prefixed with the source id
    f001 = "finc-160-" + str(csv_record["001"])
    marc_record.add("001", data=f001)

    # access format ("Zugangsformat"): "tu" = text, unmediated
    marc_record.add("007", data="tu")
import pymarc
from pymarc import Record, Field
from copy import deepcopy

############################################
# Read in a file of MARC records
############################################
# NOTE(review): ``file(...)`` and the bare ``print`` statements below are
# python2-only.
my_records_in = pymarc.MARCReader(file('my_marc_recs_in.mrc'), to_unicode=True, force_utf8=True)

############################################
# Write out a file of MARC records
############################################
my_records_out = pymarc.MARCWriter(file('my_marc_recs_out.mrc', 'w'))

############################################
# Iterate through the input file of MARC records
############################################
rec_num = 1
for my_record in my_records_in:  # iterate through each of the records in the file
    print 'Record #: '+str(rec_num)
    my_orig_record = deepcopy(my_record)  # keep an untouched copy for comparison
    print_record = False
    ############################################
    # DEMO 1 - Get 650 fields from MARC records using record.get_fields() function
    # my_650s = my_record.get_fields('650')
    # print 'List of field objects returned: '
    # print my_650s
    #
    # NOTE(review): excerpt truncated -- the loop body continues beyond this chunk.
import xmltodict #our includes from getbib import * #open the file of ISBNs/Standard numbers (see sample.txt) bibs = open(sys.argv[1], "r").readlines() #extract file name for writing to output files file_name = os.path.splitext(os.path.basename(sys.argv[1]))[0] #strip spaces bibs = [x.strip() for x in bibs] #open up a new MARC file writer = pymarc.MARCWriter(open('output/' + file_name + '.mrc', 'wb')) #start three lists to create a report at the end missing_list = [] brief_list = [] retrieved_list = [] #write the full output from the Worldcat endpoint to an xml file full_xml = open('output/' + file_name + '_full.xml', 'wb') #iterate through each identifier and retrieve a record from the API for b in bibs: print(b) #check for empty lines if len(b) == 0:
#!/usr/bin/python import os import errno import sys import time import shutil import codecs import pymarc from pymarc import Record, Field import aco_globals import aco_functions aco_mrc_all = pymarc.MARCWriter( file(aco_globals.work_folder + '/' + 'mrc_out_all-3.mrc', 'w')) # Retrieve individual final MARC files from each mrc_out batch folder for root, folders, files in os.walk(aco_globals.work_folder): for folder in folders: if folder == "mrc_out": mrc_out_path = os.path.join(root, folder) for root, folders, files in os.walk(mrc_out_path): for mrc_file in files: mrc_file_path = os.path.join(root, mrc_file) this_mrc_files = pymarc.MARCReader(file(mrc_file_path), to_unicode=True, force_utf8=True) for this_mrc_file in this_mrc_files: this_mrc_003 = this_mrc_file.get_fields( '003')[0].value() this_mrc_001 = this_mrc_file.get_fields(
def _mrc(record):
    """Serialize a single pymarc record to a binary MARC (ISO 2709) string."""
    buf = StringIO()
    marc_writer = pymarc.MARCWriter(buf)
    marc_writer.write(record)
    buf.seek(0)
    return buf.read()
# Best-effort load of the BSN -> source-entity (SE) map; '' marks "absent".
try:
    bsn_se_csv = open(aco_globals.batch_folder + '/bsn-se-map.csv', 'r')
    aco_globals.bsn_se_lines = bsn_se_csv.readlines()
    bsn_se_csv.close()
except:
    bsn_se_csv = ''

# OUTPUT FILES
# Create the batch output folder, tolerating a pre-existing directory.
try:
    os.makedirs(aco_globals.batch_folder + '/' + batch_name + '_1/')
except OSError as exception:
    if exception.errno != errno.EEXIST:
        raise

# NOTE(review): ``file(...)`` is python2-only; MARC output opened in text mode.
marcRecsOut_orig_no_oclc_nums = pymarc.MARCWriter(
    file(
        aco_globals.batch_folder + '/' + batch_name + '_1/' + batch_name +
        '_1_orig_no_oclc_nums.mrc', 'w'))
orig_no_oclc_nums_txt = codecs.open(aco_globals.batch_folder + '/' +
                                    batch_name + '_1/' + batch_name +
                                    '_1_orig_no_oclc_nums.txt',
                                    'w',
                                    encoding='utf-8')
# CSV-style header for the "no OCLC number" report.
orig_no_oclc_nums_txt.write('003/Inst,001/BSN,OCLC number(s),245a/Title\n')
marcRecsOut_orig_with_oclc_nums = pymarc.MARCWriter(
    file(
        aco_globals.batch_folder + '/' + batch_name + '_1/' + batch_name +
        '_1_orig_with_oclc_nums.mrc', 'w'))
# NOTE(review): excerpt truncated mid-call below.
orig_with_oclc_nums_txt = codecs.open(aco_globals.batch_folder + '/' +
                                      batch_name + '_1/' + batch_name +
                                      '_1_orig_with_oclc_nums.txt',
# Strip local 599/910 fields from every .mrc file alongside this script,
# rewriting each file in place and logging failed files to errors.log.
# (Windows-specific: the trailing '\\' path separator.)
errorlist = ""
path = os.path.dirname(os.path.abspath(__file__)) + '\\'
for infile in glob.glob(os.path.join(path, '*.mrc')):
    # BUGFIX: finish reading (and close) the input before reopening the same
    # path for writing -- previously the 'wb' open happened while the read
    # handle was still inside the ``with`` block, which truncates the file
    # being read and can fail outright on Windows.
    with open(infile, 'rb') as fh:
        reader = pymarc.MARCReader(fh, force_utf8=True)
        record = next(reader)  # each file is expected to hold one record
    new_record = pymarc.Record(to_unicode=True, force_utf8=True)
    new_record.leader = record.leader
    for field in record.get_fields():
        new_record.add_field(field)
    for f in new_record.get_fields('599'):
        new_record.remove_field(
            new_record.get_fields('599')[0]
        )  # only grabs first instance of 599 but only rarely will there be more than one
        print("deleted 599 from " + infile)
    for f in new_record.get_fields('910'):
        new_record.remove_field(new_record.get_fields('910')
                                [0])  # some old records have 910 for stats
        print("deleted 910 from " + infile)
    out = pymarc.MARCWriter(open(infile, 'wb'))
    # the MARCWriter part above can be modified to create xml, json, and mnemonic mrk formats instead
    try:
        out.write(new_record)
    except Exception:
        errorlist += infile + '\n'
    out.close()
if errorlist != "":
    with open(path + 'errors.log', 'w+') as fh:
        fh.write(errorlist)
def xml_to_mrc(path_in, path_out):
    """Convert a MARCXML file at `path_in` to binary MARC at `path_out`."""
    writer = pymarc.MARCWriter(open(path_out, 'wb'))
    try:
        # map_xml() invokes writer.write for each parsed record; it returns
        # None, so the previous ``records = ...`` assignment was dead code.
        pymarc.map_xml(writer.write, path_in)
    finally:
        # BUGFIX: always release the output handle, even if parsing raises.
        writer.close()
# OUTPUT FILES output_folder = aco_globals.batch_folder + '/' + batch_name + '_3' try: os.makedirs(output_folder + '/' + batch_name + '_3_errors_no_880s/') os.makedirs(output_folder + '/' + batch_name + '_3_errors_missing_key_880s/') os.makedirs(output_folder + '/' + batch_name + '_3_errors_unlinked_880s/') os.makedirs(output_folder + '/' + batch_name + '_3_errors_series/') os.makedirs(output_folder + '/' + batch_name + '_3_errors_misc/') except OSError as exception: if exception.errno != errno.EEXIST: raise aco_globals.marcRecsOut_no_880s = pymarc.MARCWriter( file( output_folder + '/' + batch_name + '_3_errors_no_880s/' + batch_name + '_3_no_880s.mrc', 'w')) aco_globals.recs_no_880s_txt = codecs.open(output_folder + '/' + batch_name + '_3_errors_no_880s/' + batch_name + '_3_no_880s.txt', 'w', encoding='utf8') aco_globals.recs_no_880s_txt.write( 'Records with NO 880 script fields - batch ' + batch_name + '\n') aco_globals.recs_no_880s_txt.write( '-- These records do NOT contain ANY 880 script fields\n') aco_globals.recs_no_880s_txt.write('Report produced: ' + aco_globals.curr_time + '\n') aco_globals.marcRecsOut_missing_key_880s = pymarc.MARCWriter( file(