def html_compare(file1, file2, outfile, include_missing=False, brief=True):
    """Compare 2 files of MARC records and write an HTML diff."""
    differ = difflib.HtmlDiff(wrapcolumn=85)
    with open(file1, "rb") as f1, open(file2, "rb") as f2, \
            open(outfile, "w", encoding="utf-8") as out:
        out.write(FILE_START)
        r1 = pymarc.MARCReader(f1)
        r2 = pymarc.MARCReader(f2)
        rec2 = next(r2)
        for rec1 in r1:
            rec_id = rec1["001"].data
            if rec2["001"].data != rec_id:
                if include_missing:
                    out.write(f"<strong>record {rec_id} missing</strong>")
                    if not brief:
                        out.write(
                            differ.make_table(str(rec1).splitlines(), [""]))
                    out.write("<hr />")
            else:
                out.write(
                    differ.make_table(str(rec1).splitlines(),
                                      str(rec2).splitlines(),
                                      context=brief))
                try:
                    rec2 = next(r2)
                except StopIteration:
                    pass
        out.write(FILE_END)
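A hedged usage sketch, not from the original source: the file paths are hypothetical, and FILE_START / FILE_END are assumed to be HTML boilerplate constants defined elsewhere in the same module.

# Hypothetical call; "old.mrc", "new.mrc" and "diff.html" are made-up paths.
html_compare("old.mrc", "new.mrc", "diff.html", include_missing=True, brief=False)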
def setUp(self): self.reader = pymarc.MARCReader(open("test/test.dat", "rb")) self._record = pymarc.Record() field = pymarc.Field(tag="245", indicators=["1", "0"], subfields=["a", "Python", "c", "Guido"]) self._record.add_field(field)
def iso2tables(master, entry_filename, file_format, rec_format, id_traitement):
    # input_file_test = open(entry_filename, 'rb').read()
    # print(chardet.detect(input_file_test).read())
    encoding = "iso-8859-1"
    if file_format == 1:
        encoding = "utf-8"
    (test_file, input_file) = test_encoding_file(master, entry_filename,
                                                 encoding, file_format)
    assert test_file
    temp_list = [el + u'\u001D' for el in input_file]
    i = 0
    for rec in temp_list:
        i += 1
        outputfilename = "temp_record.txt"
        outputfile = open(outputfilename, "w", encoding="utf-8")
        outputfile.write(rec)
        outputfile.close()
        with open(outputfilename, 'rb') as fh:
            collection = mc.MARCReader(fh)
            if file_format == 1:
                collection.force_utf8 = True
            (test, record) = detect_errors_encoding_iso(collection)
            if test:
                record2listemetas(id_traitement, record, rec_format)
    try:
        os.remove("temp_record.txt")
    except FileNotFoundError as err:
        print(err)
    stats["Nombre total de notices traitées"] = i
def iso2tables(master, entry_filename, rec_format, id_traitement):
    input_file = open(entry_filename, 'r',
                      encoding="utf-8").read().split(u'\u001D')[0:-1]
    temp_list = [el + u'\u001D' for el in input_file]
    for rec in temp_list:
        outputfilename = "temp_record.txt"
        outputfile = open(outputfilename, "w", encoding="utf-8")
        outputfile.write(rec)
        outputfile.close()
        with open(outputfilename, 'rb') as fh:
            collection = mc.MARCReader(fh)
            collection.force_utf8 = True
            try:
                for record in collection:
                    print(record2meta(record, ["001"]))
                    record2listemetas(record, rec_format)
            except mc.exceptions.RecordLengthInvalid as err:
                NumNot = record2meta(record, ["001"])
                liste_notices_pb_encodage.append(NumNot)
            except UnicodeDecodeError as err:
                NumNot = record2meta(record, ["001"])
                liste_notices_pb_encodage.append(NumNot)
    try:
        os.remove("temp_record.txt")
    except FileNotFoundError as err:
        main.popup_errors(master, main.errors["format_fichier_en_entree"])
def marc_recs_received(marcfile, hostenv):
    """Determines how many MARC records are to be delivered and updates the
    marc_records table in the ETD db."""
    count = 0
    try:
        reader = pymarc.MARCReader(open(marcfile, 'rb'))
        for record in reader:
            fields856 = []
            count += 1
            # only records delivered from Proquest include an 020 field
            if record['020'] is not None:
                isbn = record['020']['a']
                for mfield in record.get_fields('856'):
                    fields856.append(mfield['u'])
                # this updates the table for MARC records rec'd from PQ;
                # the table is also updated in create_marc_xml for MARC
                # records generated from PQ XML metadata.
                update_marc_table(isbn, fields856[0], fields856[1], hostenv)
    #pylint: disable=maybe-no-member
    except pymarc.exceptions.PymarcException as err:
        logging.exception("ERROR opening MARC records %s: %s", marcfile, err)
    return count
def break_up_record(start_record=0, end_record=0):
    """ Splits big marc file into smaller files.
        This can successfully re-write the whole errant `rec_19.mrc` file. """
    log.debug('start_record, `{st}`; end_record, `{en}`'.format(st=start_record, en=end_record))
    BIG_MARC_FILEPATH = settings.INPUT_FILEPATH
    SMALLER_OUTPUT_FILEPATH = settings.OUTPUT_FILEPATH
    log.debug('processing file, ``{}``'.format(BIG_MARC_FILEPATH))
    log.debug('output file, ``{}``'.format(SMALLER_OUTPUT_FILEPATH))
    start_time = datetime.datetime.now()
    count = 0
    with open(BIG_MARC_FILEPATH, 'rb') as input_fh:
        # reader = pymarc.MARCReader( input_fh, force_utf8=True, utf8_handling='ignore' )
        # reader = pymarc.MARCReader( input_fh )
        # reader = pymarc.MARCReader( input_fh, to_unicode=True )
        reader = pymarc.MARCReader(input_fh, to_unicode=True, utf8_handling='ignore')  # works!
        with open(SMALLER_OUTPUT_FILEPATH, 'wb') as output_fh:
            writer = pymarc.MARCWriter(output_fh)
            for record in reader:
                count += 1
                if count % 10000 == 0:
                    print('`{}` records processed'.format(count))
                if count >= start_record:
                    writer.write(record)
                if count >= end_record:
                    break
    end_time = datetime.datetime.now()
    log.debug('records processed, `{}`'.format(count))
    log.debug('time_taken, `{}`'.format(end_time - start_time))
def getMRCHeader(marc_file):
    out = set()
    marcrange = ["%03d" % i for i in range(999)]
    for record in pymarc.MARCReader(open(marc_file, 'rb')):
        for num in marcrange:
            for field in record.get_fields(num):
                out.add('leader')
                if int(field.tag) < 10:
                    out.add(field.tag)
                else:
                    if field.tag in fieldsNonFile1:
                        ind1_norm = field.indicator1
                        ind2_norm = '*'
                    elif field.tag in fieldsNonFile2:
                        ind1_norm = '*'
                        ind2_norm = field.indicator2
                    else:
                        ind1 = field.indicator1
                        ind2 = field.indicator2
                        ind1_norm = ind1.replace(' ', '#')
                        ind2_norm = ind2.replace(' ', '#')
                    if field.tag in fieldsHeadings:
                        out.add(field.tag)
                    else:
                        subfrange = list(string.ascii_lowercase)
                        for digit in string.digits:
                            subfrange.append(digit)
                        for subfield in subfrange:
                            if field.get_subfields(subfield):
                                tag_concat = (field.tag + '_' + ind1_norm
                                              + ind2_norm + '$' + subfield)
                                out.add(tag_concat)
    out_sorted = sorted(out)
    return out_sorted
def test_reader():
    reader = pymarc.MARCReader(open('rusmarc_ebsco.mrc', 'rb'),
                               to_unicode=True, encoding='utf-8')
    for i, record in enumerate(reader):
        if i == 100:
            break
        print(record_to_xml(record))
def setUp(self):
    self.reader = pymarc.MARCReader(open('test/test.dat', 'rb'))
    self._record = pymarc.Record()
    field = pymarc.Field(
        tag='245',
        indicators=['1', '0'],
        subfields=['a', 'Python', 'c', 'Guido'])
    self._record.add_field(field)
def disabled_test_codecs(self):
    import codecs
    with codecs.open('test/test.dat', encoding='utf-8') as fh:
        reader = pymarc.MARCReader(fh)
        record = next(reader)
        self.assertEqual(record['245']['a'], u'ActivePerl with ASP and ADO /')
def generateMARCXML(MARC21input, MARCXMLoutput):
    """Write MARCXML file for given MARC21 binary file."""
    reader = pymarc.MARCReader(open(MARC21input, 'rb'))
    # open the writer once so every record goes into the same output file
    writer = pymarc.XMLWriter(open(MARCXMLoutput, 'wb'))
    for record in reader:
        writer.write(record)
    writer.close()
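A hedged usage sketch with hypothetical file names, converting a binary MARC file to MARCXML:

# Hypothetical paths for illustration only.
generateMARCXML('records.mrc', 'records.xml')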
def load_training_marc(self, marc_filename, marc_labels):
    """Method loads a training set of MARC records for a Creative Work

    Parameters:
    marc_filename -- Full path to marc filename
    marc_labels -- A list of booleans, True is Good, False is Bad
    """
    marc_reader = pymarc.MARCReader(open(marc_filename, 'rb'))
    words_re = re.compile(r"(\w+)")
    self.records = []
    for record in marc_reader:
        self.records.append(record)
    if len(self.records) != len(marc_labels):
        error_msg = "Number of records {0} must match MARC Labels {1}".format(
            len(self.records), len(marc_labels))
        raise WorkClassifierError(error_msg)
    count = 0
    good_tokens, bad_tokens = [], []
    for record in self.records:
        tokens = self.__tokenize_marc21__(record)
        if marc_labels[count] is True:
            good_tokens.extend(tokens)
        elif marc_labels[count] is False:
            bad_tokens.extend(tokens)
        else:
            raise WorkClassifierError(
                "Unknown value for rec #{0} {1} {2}".format(
                    count, marc_labels[count], tokens))
        count += 1
    self.rb.train('good', ' '.join(good_tokens))
    self.rb.train('bad', ' '.join(bad_tokens))
def to_dict(cls, bin_marc):
    """Takes binary marc or marcxml and parses it into a human readable
    dict by first creating a pymarc object which enables easy access to
    the marc's fields and data. Pymarc doesn't have a convenient way to
    cast itself to a dict (keyed) with meaningful values (e.g. title,
    isbn, etc) so this method converts pymarc's Record
    (pymarc.record.Record) to a human readable dict.
    """
    reader = pymarc.MARCReader(bin_marc, hide_utf8_warnings=True,
                               force_utf8=True, utf8_handling='ignore')
    record = next(reader)
    keyed_record = MARCRecord(record)
    data = {
        'identifiers': {},
        'authors': [keyed_record.author],
    }
    isbn = record.isbn()
    if isbn:
        data['isbn_%s' % len(isbn)] = [isbn]
    data.update(keyed_record.publisher)
    data.update(keyed_record.title)
    return data
def iso2tables_old(master, entry_filename, rec_format, id_traitement):
    with open(entry_filename, 'rb') as fh:
        collection = mc.MARCReader(fh)
        collection.force_utf8 = True
        try:
            for record in collection:
                record2listemetas(record, rec_format)
        except mc.exceptions.RecordLengthInvalid as err:
            print("\n\n/*---------------------------------------------*\n\n")
            print(main.errors["pb_input_utf8"])
            print(err)
            print("\n\n*------------------------------------------------*/")
            main.popup_errors(
                master,
                main.errors["pb_input_utf8_marcEdit"],
                "Aide en ligne : conversion iso2709 > XML",
                "https://github.com/Transition-bibliographique/bibliostratus/wiki/1-%5BBleu%5D-Pr%C3%A9parer-ses-donn%C3%A9es-pour-l'alignement-%C3%A0-partir-d'un-export-catalogue#un-probl%C3%A8me-dencodage--passez-en-xml-avec-marcedit"
            )
        except UnicodeDecodeError as err:
            print("\n\n/*---------------------------------------------*\n\n")
            print(main.errors["pb_input_utf8"])
            print(err)
            print("\n\n*------------------------------------------------*/")
            main.popup_errors(
                master,
                main.errors["pb_input_utf8_marcEdit"],
                "Aide en ligne : conversion iso2709 > XML",
                "https://github.com/Transition-bibliographique/bibliostratus/wiki/1-%5BBleu%5D-Pr%C3%A9parer-ses-donn%C3%A9es-pour-l'alignement-%C3%A0-partir-d'un-export-catalogue#un-probl%C3%A8me-dencodage--passez-en-xml-avec-marcedit"
            )
def fixRecord(record="", record_id=0, validation=False, replaceMethod='decimal'): replaceMethods = { 'decimal': (('#29;', '#30;', '#31;'), ("\x1D", "\x1E", "\x1F")), 'unicode': (('\u001d', '\u001e', '\u001f'), ("\x1D", "\x1E", "\x1F")), 'hex': (('\x1D', '\x1E', '\x1F'), ("\x1D", "\x1E", "\x1F")) } marcFullRecordFixed = record for i in range(0, 3): marcFullRecordFixed = marcFullRecordFixed.replace( replaceMethods.get(replaceMethod)[0][i], replaceMethods.get(replaceMethod)[1][i]) if validation: try: reader = pymarc.MARCReader(marcFullRecordFixed.encode('utf8'), utf8_handling='replace') marcrecord = next(reader) except (RecordLengthInvalid, RecordLeaderInvalid, BaseAddressNotFound, BaseAddressInvalid, RecordDirectoryInvalid, NoFieldsFound, UnicodeDecodeError) as e: eprint("record id {0}:".format(record_id) + str(e)) with open('invalid_records.txt', 'a') as error: eprint(marcFullRecordFixed, file=error) return None return marcFullRecordFixed
def break_up_record(start_record=0, end_record=0):
    """ Splits big marc file into smaller files.
        This can successfully re-write the whole errant `rec_19.mrc` file. """
    log.debug('start_record, `{st}`; end_record, `{en}`'.format(st=start_record, en=end_record))
    BIG_MARC_FILEPATH = os.environ['PYMARC_EXP__BIG_MARC_FILEPATH']
    SMALLER_OUTPUT_FILEPATH = os.environ['PYMARC_EXP__SMALLER_OUTPUT_MARC_FILEPATH']
    log.debug('processing file, ``{}``'.format(BIG_MARC_FILEPATH))
    log.debug('output file, ``{}``'.format(SMALLER_OUTPUT_FILEPATH))
    start_time = datetime.datetime.now()
    count = 0
    last_record = 'init'
    with open(BIG_MARC_FILEPATH, 'rb') as input_fh:
        # reader = pymarc.MARCReader( input_fh, force_utf8=True, utf8_handling='ignore' )
        # reader = pymarc.MARCReader( input_fh )
        # reader = pymarc.MARCReader( input_fh, to_unicode=True )
        # reader = pymarc.MARCReader( input_fh, to_unicode=True, utf8_handling='ignore' )  # works!
        reader = pymarc.MARCReader(input_fh, to_unicode=True, force_utf8=True, utf8_handling='ignore')
        # reader = pymarc.MARCReader( input_fh, to_unicode=False, utf8_handling='ignore' )
        with open(SMALLER_OUTPUT_FILEPATH, 'wb') as output_fh:
            writer = pymarc.MARCWriter(output_fh)
            processing_flag = True
            while processing_flag is True:
                try:
                    record = next(reader)
                except StopIteration:
                    # end of input file; stop cleanly rather than looping on the exception
                    break
                except Exception as e:
                    record = None
                    log.error('exception looping through records; ```{}```'.format(repr(e)))
                    log.error('e info, ```{}```'.format(e))
                    e_type, e_value, e_traceback = sys.exc_info()  # <http://stackoverflow.com/a/15890953>
                    log.error('e_type, ```{}```'.format(e_type))
                    log.error('e_value, ```{}```'.format(e_value))
                    log.error('e_traceback, ```{}```'.format(e_traceback))
                    log.error('traceback info, ```{}```'.format(traceback.format_exc()))
                    # log.error( 'current record, ```{}```'.format( record ) )
                    log.error('current count, `{}`'.format(count))
                    # log.error( 'last_record, ```{}```'.format(last_record) )
                last_record = record
                count += 1
                if count % 10000 == 0:
                    print('`{}` records processed'.format(count))
                if count >= start_record:
                    log.debug('count, `{}`'.format(count))
                    if record:
                        log.debug('count is, `{cnt}`, so will write record.as_json()[0:100], ```{rcd}```'.format(
                            cnt=count, rcd=record.as_json()[0:100]))
                        writer.write(record)
                if count >= end_record:
                    processing_flag = False
    end_time = datetime.datetime.now()
    log.debug('records processed, `{}`'.format(count))
    log.debug('time_taken, `{}`'.format(end_time - start_time))
def iso2tables(master, entry_filename, file_format, rec_format, id_traitement):
    # input_file_test = open(entry_filename, 'rb').read()
    # print(chardet.detect(input_file_test).read())
    encoding = "iso-8859-1"
    if file_format == 1:
        encoding = "utf-8"
    (test_file, input_file) = test_encoding_file(master, entry_filename, encoding)
    assert test_file
    temp_list = [el + u'\u001D' for el in input_file]
    i = 0
    for rec in temp_list:
        i += 1
        outputfilename = "temp_record.txt"
        outputfile = open(outputfilename, "w", encoding="utf-8")
        outputfile.write(rec)
        outputfile.close()
        with open(outputfilename, 'rb') as fh:
            collection = mc.MARCReader(fh)
            if file_format == 1:
                collection.force_utf8 = True
            (test, record) = detect_errors_encoding_iso(collection)
            if test:
                record2listemetas(id_traitement, record, rec_format)
    try:
        os.remove("temp_record.txt")
    except FileNotFoundError as err:
        print(err)
    stats["Nombre total de notices traitées"] = i
def test_regression_45(self):
    # https://github.com/edsu/pymarc/issues/45
    with open("test/regression45.dat", "rb") as fh:
        reader = pymarc.MARCReader(fh)
        record = next(reader)
        self.assertEqual(record["752"]["a"], "Russian Federation")
        self.assertEqual(record["752"]["b"], "Kostroma Oblast")
        self.assertEqual(record["752"]["d"], "Kostroma")
def disabled_test_codecs(self):
    import codecs
    with codecs.open("test/test.dat", encoding="utf-8") as fh:
        reader = pymarc.MARCReader(fh)
        record = next(reader)
        self.assertEqual(record["245"]["a"], "ActivePerl with ASP and ADO /")
def main():
    marc_file = open('hello_marc.dat', 'rb')
    reader = pymarc.MARCReader(marc_file, force_utf8=True)
    # record = next(reader)  # returns a record object
    for record in reader:
        print(type(record))
        print(type(record.fields))
        print(record.leader)
def mrc_to_mrk(path_in, path_out):
    reader = pymarc.MARCReader(open(path_in, 'rb'), to_unicode=True, force_utf8=True)
    writer = pymarc.TextWriter(io.open(path_out, 'wt', encoding="UTF-8"))
    for record in reader:
        writer.write(record)
    writer.close()
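A hedged usage sketch with hypothetical paths, converting binary MARC (.mrc) to the human-readable .mrk text form:

# Hypothetical file names.
mrc_to_mrk('export.mrc', 'export.mrk')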
def test_regression_45(self):
    # https://github.com/edsu/pymarc/issues/45
    with open('test/regression45.dat', 'rb') as fh:
        reader = pymarc.MARCReader(fh)
        record = next(reader)
        self.assertEqual(record['752']['a'], 'Russian Federation')
        self.assertEqual(record['752']['b'], 'Kostroma Oblast')
        self.assertEqual(record['752']['d'], 'Kostroma')
def __init__(self, file_name, exceptions=False):
    """Open `file_name` with a tolerant pymarc.MARCReader."""
    self.error = None
    self._marc_reader = pymarc.MARCReader(open(file_name, 'rb'),
                                          to_unicode=True,
                                          force_utf8=True,
                                          utf8_handling='ignore')
    self.error = ''
    self.exceptions = exceptions
def write_csv(jangle_feed, csv_file_handle, ils=None):
    """
    Convert a MARC dump file to a CSV file.
    """
    # This doctest commented out until field names are stable.
    #>>> write_csv('test/marc.dat', 'test/records.csv')
    #>>> csv_records = open('test/records.csv').read()
    #>>> csv_measure = open('test/measure.csv').read()
    #>>> csv_records == csv_measure
    #True
    #>>> os.remove('test/records.csv')
    import elementtree.ElementTree as ET  # Python 2 modules
    import urlparse
    import StringIO
    feed = ET.fromstring(jangle_feed.read())
    continue_processing = "true"  # note: nothing in this excerpt updates this flag
    fieldname_dict = {}
    for fieldname in FIELDNAMES:
        fieldname_dict[fieldname] = fieldname
    #for record in reader
    atom_ns = "http://www.w3.org/2005/Atom"
    count = 0
    writer = csv.DictWriter(csv_file_handle, FIELDNAMES)
    writer.writerow(fieldname_dict)
    content_record = ''
    while continue_processing == "true":
        for entry in feed.findall("./{%s}entry" % atom_ns):
            try:
                id = 'djo' + urlparse.urlparse(entry.find("{%s}id" % atom_ns).text).path.replace("/", ":")
            except Exception, e:
                print(str(e))
            content = entry.find("{%s}content" % atom_ns)
            content_record = base64.b64decode(content.text)
            try:
                reader = pymarc.MARCReader(StringIO.StringIO(content_record))
            except Exception, e:
                print(str(e))
            try:
                for marc_record in reader:
                    count += 1
                    try:
                        record = get_record(marc_record, id, ils=ils)
                        if record:  # skip when get_record returns None
                            row = get_row(record)
                            writer.writerow(row)
                    except:
                        sys.stderr.write("\nError in MARC record #%s (%s):\n"
                                         % (count, marc_record.title()))
                        raise
                    else:
                        if count % 1000:
                            sys.stderr.write(".")
                        else:
                            sys.stderr.write(str(count))
            except Exception, e:
                print(str(e))
def main(arguments):
    marc_file = open(arguments.filename, 'rb')
    reader = pymarc.MARCReader(marc_file, force_utf8=True)
    record = next(reader)  # returns a record object
    if arguments.field:
        print(record[arguments.field])
    else:
        print(record)  # print the MARC-text representation of the record
    return record
def test_MARCRecord(self):
    with open(os.path.join(EXAMPLES_PATH, 'line_marc.txt')) as line_marc:
        bin_marc = MARC.convert(line_marc.read())
        reader = pymarc.MARCReader(bin_marc, hide_utf8_warnings=True,
                                   force_utf8=True, utf8_handling='ignore')
        keyed_record = MARCRecord(next(reader))
        self.assertTrue(keyed_record.author.name,
                        "Failed to retrieve author name")
def load_training_marc(self, marc_filename):
    """Method loads a training set of MARC records for a Creative Work

    Parameters:
    marc_filename -- Full path to marc filename
    """
    marc_reader = pymarc.MARCReader(open(marc_filename, 'rb'))
    words_re = re.compile(r"(\w+)")
    for record in marc_reader:
        self.training_data.append(self.__tokenize_marc21__(record))
def run_dedup_experiment(pp_filepath, md_filepath,
                         cache_datastore=redis.StrictRedis(),
                         search_index=Elasticsearch()):
    """Runs experiment for de-duplicating BIBFRAME Person RDF graphs using
    MARC records from two samples representing Pride and Prejudice and
    Moby Dick records.
    """
    pride_prejudice_records = [
        check001(r) for r in pymarc.MARCReader(open(pp_filepath, "br+"),
                                               to_unicode=True)
    ]
    moby_dick_records = [
        check001(r) for r in pymarc.MARCReader(open(md_filepath, "br+"),
                                               to_unicode=True)
    ]
    for recs in [pride_prejudice_records, moby_dick_records]:
        for record in recs:
            process_record(record, cache_datastore, search_index)
def make_record(self, raw_bytes):
    """ Not a test; preps record object. """
    # 'replace' and 'backslashreplace' generate errors on ```record = next( reader )```
    s = raw_bytes.decode('utf-8', errors='ignore')
    handled_b = s.encode('utf-8')
    fh = io.BytesIO(handled_b)
    reader = pymarc.MARCReader(fh)
    record = next(reader)
    return record
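A hedged usage sketch, assuming a call from inside the same test class; the path is hypothetical.

# Hypothetical: read one record's worth of raw bytes and build a pymarc Record from it.
with open('test/one_record.mrc', 'rb') as f:
    raw_bytes = f.read()
record = self.make_record(raw_bytes)
print(record.title())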
def extract_info():
    """ Prints/logs certain record elements.
        The ```utf8_handling='ignore'``` is required to avoid a unicode-error. """
    big_marc_filepath = settings.INPUT_FILEPATH
    log.debug('processing file, ``{}``'.format(big_marc_filepath))
    with open(big_marc_filepath, 'rb') as fh:
        # w/o 'ignore', this line generates a unicode-error
        reader = pymarc.MARCReader(fh, force_utf8=True, utf8_handling='ignore')
        start = datetime.datetime.now()
        count = 0
        for record in reader:
            record.force_utf8 = True
            record_dct = record.as_dict()
            fields = record_dct['fields']
            ## title = record.title()
            ## bib_id = 'not_available'
            item_id = 'not_available'
            record_dct_logged = False
            for field_dct in fields:
                for (k, val_dct) in field_dct.items():
                    if k == '907':
                        try:
                            bib_id = val_dct['subfields'][0]['a'][0:9]
                        except Exception as e:
                            log.debug('exception getting bib_id, ``{}``'.format(e))
                            log.debug('record_dct, ```{}```'.format(pprint.pformat(record_dct)))
                            record_dct_logged = True
                    if k == '945':
                        try:
                            subfields = val_dct['subfields']
                            for subfield_dct in subfields:
                                for (k2, val2) in subfield_dct.items():
                                    if k2 == 'y':
                                        item_id = val2
                        except Exception as f:
                            log.debug('exception getting item_id, ``{}``'.format(f))
            if record_dct_logged is False:
                log.debug('record_dct, ```{}```'.format(pprint.pformat(record_dct)))
            basic_info = {
                'title': record.title(),
                'bib_id': bib_id,
                'item_id': item_id
            }
            # print( 'basic_info, ```{}```'.format( pprint.pformat(basic_info) ) )
            log.info('basic_info, ```{}```'.format(pprint.pformat(basic_info)))
            try:
                count += 1
                if count % 10000 == 0:
                    print('`{}` records processed'.format(count))
                # if count > 100000:
                #     break
            except Exception as e:
                log.debug('exception on record ```{rec}```; error, ```{err}```'.format(rec=record, err=e))
    end = datetime.datetime.now()
    log.info('count of records in file, `{}`'.format(count))
    log.info('time_taken, `{}`'.format(end - start))