def test_marc8_reader_to_unicode(self):
    # Decode a MARC8 file straight to unicode and verify the 240 $a title.
    reader = MARCReader(file('test/marc8.dat'), to_unicode=True)
    record = reader.next()
    self.assertEquals(type(record), Record)
    title = record['240']['a']
    self.assertEquals(type(title), unicode)
    self.assertEquals(title, u'De la solitude \xe0 la communaut\xe9.')
def test_marc8_reader_to_unicode_bad_eacc_sequence(self):
    # An invalid EACC byte sequence must surface as UnicodeDecodeError.
    reader = MARCReader(file('test/bad_eacc_encoding.dat'),
                        to_unicode=True, hide_utf8_warnings=True)
    try:
        reader.next()
        self.assertFalse("Was able to decode invalid MARC8")
    except UnicodeDecodeError:
        self.assertTrue("Caught UnicodeDecodeError as expected")
def main(fd, store_type=None, store_id=None, graph_id=None, gzipped=False): """ Converts MARC21 data stored in fd to a RDFlib graph. """ from rdflib import plugin if store_type: msg = "Need a {} identifier for a disk-based store." assert store_id, msg.format('store') assert graph_id, msg.format('graph') store = plugin.get(store_type, Store)(store_id) else: store = 'default' graph = Graph(store=store, identifier=graph_id) if gzipped: import gzip open = gzip.open try: records = MARCReader(open(fd)) for i, triple in enumerate(process_records(records)): graph.add(triple) if i % 100 == 0: graph.commit() if i % 10000 == 0: print i finally: records.close() return graph
def test_marc8_reader_to_unicode_bad_escape(self):
    # A dangling MARC8 escape byte is carried through rather than dropped.
    reader = MARCReader(file('test/bad_marc8_escape.dat'), to_unicode=True)
    record = reader.next()
    self.assertEquals(type(record), Record)
    publisher = record['260']['b']
    self.assertEquals(type(publisher), unicode)
    self.assertEquals(publisher, u'La Soci\xe9t\x1b,')
def test_marc8_reader(self):
    # Without to_unicode the MARC8 bytes come back verbatim as a str.
    reader = MARCReader(file('test/marc8.dat'))
    record = reader.next()
    self.assertEquals(type(record), Record)
    title = record['240']['a']
    self.assertEquals(type(title), str)
    self.assertEquals(title, 'De la solitude \xe1a la communaut\xe2e.')
def test_reading_utf8_without_flag(self):
    path = 'test/utf8_without_leader_flag.dat'

    # Raw bytes when unicode conversion is off.
    with open(path, 'rb') as marc_fh:
        rec = next(MARCReader(marc_fh, to_unicode=False))
        self.assertEqual(type(rec), Record)
        title = rec['240']['a']
        self.assertEqual(type(title), binary_type)
        self.assertEqual(
            title, b'De la solitude a\xcc\x80 la communaute\xcc\x81.')

    with open(path, 'rb') as marc_fh:
        rec = next(MARCReader(marc_fh, to_unicode=True,
                              hide_utf8_warnings=True))
        self.assertEqual(type(rec), Record)
        title = rec['240']['a']
        self.assertEqual(type(title), text_type)
        # unless you force utf-8 characters will get lost and
        # warnings will appear in the terminal
        self.assertEqual(title, 'De la solitude a la communaute .')

    # force reading as utf-8
    with open(path, 'rb') as marc_fh:
        rec = next(MARCReader(marc_fh, to_unicode=True, force_utf8=True,
                              hide_utf8_warnings=True))
        self.assertEqual(type(rec), Record)
        title = rec['240']['a']
        self.assertEqual(type(title), text_type)
        self.assertEqual(
            title,
            u'De la solitude a' + unichr(0x0300) +
            ' la communaute' + unichr(0x0301) + '.')
def test_reading_utf8_without_flag(self):
    path = "test/utf8_without_leader_flag.dat"

    # Without conversion the field stays raw UTF-8 bytes.
    with open(path, "rb") as fh:
        rec = next(MARCReader(fh, to_unicode=False))
        self.assertEqual(type(rec), Record)
        title = rec["240"]["a"]
        self.assertEqual(type(title), bytes)
        self.assertEqual(
            title, b"De la solitude a\xcc\x80 la communaute\xcc\x81.")

    with open(path, "rb") as fh:
        rec = next(MARCReader(fh, to_unicode=True, hide_utf8_warnings=True))
        self.assertEqual(type(rec), Record)
        title = rec["240"]["a"]
        self.assertEqual(type(title), str)
        # unless you force utf-8 characters will get lost and
        # warnings will appear in the terminal
        self.assertEqual(title, "De la solitude a la communaute .")

    # force reading as utf-8
    with open(path, "rb") as fh:
        rec = next(MARCReader(fh, to_unicode=True, force_utf8=True,
                              hide_utf8_warnings=True))
        self.assertEqual(type(rec), Record)
        title = rec["240"]["a"]
        self.assertEqual(type(title), str)
        self.assertEqual(
            title,
            u"De la solitude a" + chr(0x0300)
            + " la communaute" + chr(0x0301) + ".",
        )
def __init__(self, url_harvest, extra_data, **kwargs):
    '''Grab file and copy to local temp file'''
    super(MARCFetcher, self).__init__(url_harvest, extra_data, **kwargs)
    self.url_marc_file = url_harvest
    # Buffer the remote MARC file in a seekable local temp file before
    # handing it to pymarc.
    self.marc_file = tempfile.TemporaryFile()
    response = urllib.urlopen(self.url_marc_file)
    self.marc_file.write(response.read())
    self.marc_file.seek(0)
    self.marc_reader = MARCReader(self.marc_file,
                                  to_unicode=True,
                                  utf8_handling='replace')
def test_encode_decode(self):
    # Round-trip: a record parsed from disk must serialize back to the
    # exact original bytes.
    original = file('test/one.dat').read()
    reader = MARCReader(file('test/one.dat'))
    record = reader.next()
    self.assertEqual(original, record.as_marc())
def test_unicode(self):
    # Round-trip a non-ASCII character through MARCWriter/MARCReader.
    record = Record()
    record.add_field(Field(245, ['1', '0'], ['a', unichr(0x1234)]))
    writer = MARCWriter(open('test/foo', 'w'))
    writer.write(record)
    writer.close()
    reader = MARCReader(open('test/foo'))
    record = reader.next()
    self.assertEqual(record['245']['a'], unichr(0x1234))
    # Fix: remove the scratch file like the sibling writer tests do;
    # previously 'test/foo' was left behind.
    import os
    os.remove('test/foo')
def test_writing_unicode(self):
    # Write a record carrying a non-ASCII 245 $a, read it back with
    # unicode conversion, and confirm the character survives.
    rec = Record()
    rec.add_field(Field(245, ['1', '0'], ['a', unichr(0x1234)]))
    rec.leader = ' a '
    writer = MARCWriter(open('test/foo', 'w'))
    writer.write(rec)
    writer.close()
    reader = MARCReader(open('test/foo'), to_unicode=True)
    rec = reader.next()
    self.assertEqual(rec['245']['a'], unichr(0x1234))
    os.remove('test/foo')
def test_writing_unicode(self):
    # Round-trip a non-ASCII 245 $a through a binary write/read cycle.
    rec = Record()
    rec.add_field(Field(245, ["1", "0"], ["a", chr(0x1234)]))
    rec.leader = " a "
    writer = MARCWriter(open("test/foo", "wb"))
    writer.write(rec)
    writer.close()
    reader = MARCReader(open("test/foo", "rb"), to_unicode=True)
    rec = next(reader)
    self.assertEqual(rec["245"]["a"], chr(0x1234))
    reader.close()
    os.remove("test/foo")
def test_writing_unicode(self):
    # Verify a non-ASCII subfield written in binary mode reads back
    # intact under to_unicode.
    marc_record = Record()
    marc_record.add_field(Field(245, ['1', '0'], ['a', unichr(0x1234)]))
    marc_record.leader = ' a '
    marc_writer = MARCWriter(open('test/foo', 'wb'))
    marc_writer.write(marc_record)
    marc_writer.close()
    marc_reader = MARCReader(open('test/foo', 'rb'), to_unicode=True)
    marc_record = next(marc_reader)
    self.assertEqual(marc_record['245']['a'], unichr(0x1234))
    marc_reader.close()
    os.remove('test/foo')
def test_reading_utf8_with_flag(self):
    # File whose leader flags UTF-8 (per its name). Raw read first:
    # bytes come back as str.
    reader = MARCReader(open('test/utf8_with_leader_flag.dat'))
    record = reader.next()
    self.assertEquals(type(record), Record)
    title = record['240']['a']
    self.assertEquals(type(title), str)
    self.assertEquals(
        title, 'De la solitude a\xcc\x80 la communaute\xcc\x81.')

    # With to_unicode the content decodes to a unicode string.
    reader = MARCReader(open('test/utf8_with_leader_flag.dat'),
                        to_unicode=True)
    record = reader.next()
    self.assertEquals(type(record), Record)
    title = record['240']['a']
    self.assertEquals(type(title), unicode)
    self.assertEquals(
        title,
        u'De la solitude a' + unichr(0x0300) +
        ' la communaute' + unichr(0x0301) + '.')
def read_005(bibid):
    """Return the YYYYMMDD portion of the 005 timestamp for the first
    record of ``bibid`` that has an 005 field, or None if none does.

    Fix: ``record['005']`` is None when the field is absent, and the
    old code called ``.value()`` on it unconditionally, raising
    AttributeError; such records are now skipped.
    """
    marc = util.get_clio_marc(bibid)
    reader = MARCReader(marc)
    for record in reader:
        field_005 = record['005']
        if field_005 is None:
            continue
        return field_005.value()[0:8]
    return None
def process_file(f):
    # Tally LC classification codes found in 050 $a, one count per record.
    counts = {}
    with open(f, 'rb') as fh:
        for record in MARCReader(fh):
            if '050' not in record or 'a' not in record['050']:
                continue
            # might want to change this stuff,
            # take the first set of codes before a "."
            code = record['050']['a'].split('.')[0]
            # take whatever is first if there is a space
            code = code.split(' ')[0]
            counts[code] = counts.get(code, 0) + 1
    return counts
def main_loop(marc_in):
    """Count records whose 034 coordinates pass check_defg_034.

    :param marc_in: path of the MARC file to scan.
    :return: (valid_count, invalid_count, invalid/valid ratio). The
        ratio is 0.0 when no valid record was seen — previously this
        raised ZeroDivisionError.
    """
    valid_geo_count = 0
    invalid_geo_count = 0
    with open(marc_in, 'rb') as fp:
        rdr = MARCReader(fp, to_unicode=True, force_utf8=True,
                         utf8_handling='ignore', permissive=True)
        for rcd in tqdm(rdr):
            id_001 = rcd.get_fields('001')[0].value() if rcd.get_fields(
                '001') else None
            id_009 = rcd.get_fields('009')[0].value() if rcd.get_fields(
                '009') else None
            rcd_name = rcd.get_fields('151')[0].value() if rcd.get_fields(
                '151') else None
            coords_034 = rcd.get_fields('034')[0] if rcd.get_fields(
                '034') else None
            # Only records with every identifying piece can be checked.
            if coords_034 and id_001 and id_009 and rcd_name:
                if check_defg_034(id_001, id_009, rcd_name, coords_034):
                    valid_geo_count += 1
                else:
                    invalid_geo_count += 1
    ratio = invalid_geo_count / valid_geo_count if valid_geo_count else 0.0
    return valid_geo_count, invalid_geo_count, ratio
def main(argv):
    # Exactly one argument is expected: the input MARC file.
    if len(argv) != 2:
        usage(sys.stderr)
        sys.exit(1)
    inMarcFile = argv[1]
    fileCheck(inMarcFile)

    # Dated output file for the extracted IDs.
    outFile = 'extracted-mms-ids.' + time.strftime("%Y%m%d") + '.txt'
    writer = codecs.open(outFile, 'wb', 'utf-8')

    #------------------------------------------------------------------#
    # Parse the MARC file for the MMS IDs
    #------------------------------------------------------------------#
    print('Extracting MMS IDs...')
    marcReader = MARCReader(file(inMarcFile), to_unicode=True,
                            force_utf8=True)
    count = 0
    for record in marcReader:
        count += 1
        # Progress marker every 250 records.
        if count % 250 == 0:
            print(' Extracting MMS ID #' + unicode(count) + '...')
        writer.write(record['001'].value() + u'\n')
    print('Finished. ' + unicode(count) + ' MMS ID(s) extracted.')
def multifile_iter_records(files):
    """Yield MARC records from each item of ``files``.

    Items may be open file-like objects (anything with ``read``) or
    path strings; paths are opened here.

    Fix: handles opened inside this function are now closed once their
    records are exhausted — previously they leaked, one descriptor per
    path, for the life of the process.
    """
    for f in files:
        opened_here = not hasattr(f, 'read')
        if opened_here:
            f = open(f)
        try:
            for record in MARCReader(f, to_unicode=True):
                yield record
        finally:
            if opened_here:
                f.close()
def importMARCRecords(self, full=False, startTimestamp=None, recID=None):
    # Refresh the local copy of the MUSE MARC dump, then parse it.
    self.downloadRecordUpdates()
    museFile = self.downloadMARCRecords()

    # For incremental (non-full) runs, establish the cutoff timestamp:
    # either the supplied date or 24 hours ago.
    startDateTime = None
    if full is False:
        if not startTimestamp:
            startDateTime = datetime.utcnow() - timedelta(hours=24)
        else:
            startDateTime = datetime.strptime(startTimestamp, '%Y-%m-%d')

    for record in MARCReader(museFile):
        # Skip records the update filter explicitly rejects.
        if (startDateTime or recID) \
                and self.recordToBeUpdated(record, startDateTime, recID) \
                is False:
            continue
        try:
            self.parseMuseRecord(record)
        except MUSEError as e:
            logger.warning('Unable to parse MUSE record')
            logger.debug(e)
def read_marc():
    digitized_oclc = []
    records = []

    # OCLC numbers of already-digitized maps, from the working CSV.
    with open('pclmaps_working_georefed.csv', 'r') as read_file:
        for row in csv.DictReader(read_file):
            if row['OCLC']:
                digitized_oclc.append(row['OCLC'])

    # Collect OCLC numbers from the MARC file; keep records matching a
    # digitized map.
    oclc_list = []
    with open('mapsgis.mrc', 'rb') as read_file:
        for record in MARCReader(read_file):
            oclc = ''
            try:
                if record['001']:
                    oclc = record['001']['a']
                    oclc = oclc.replace('ocm', '')
                    oclc = oclc.replace('ocn', '')
                    oclc_list.append(oclc)
                    if oclc in digitized_oclc:
                        records.append(record)
            except TypeError:
                # Subscripting the 001 field can raise TypeError;
                # such records are skipped, as before.
                pass

    set_digitized_oclc = set(digitized_oclc)
    set_marc_records = set(oclc_list)
    in_both = set_digitized_oclc.intersection(set_marc_records)
    print('digitized_oclc: {0}'.format(set_digitized_oclc))
    print('marc_records: {0}'.format(set_marc_records))
    print('in both: {0}'.format(in_both))

    # Dump the matched records as text.
    with open('mapsgis.txt', 'w') as write_file:
        for record in records:
            write_file.write(str(record))
def campus_split():
    '''
    Finds the master format files created by fmt_split(). then writes
    the records in each format file to separate files for holding
    campuses based on coding in MARC 049 subfield a. Outputs one file
    per campus per format.
    '''
    campuses = ['MNGE', 'MNXN']
    # The set of master format files does not change between campuses,
    # so compute it once instead of once per campus.
    fmt_files = [
        f for f in os.listdir()
        if re.match(r'.+(bks|ser|maps|vis|other)\.mrc', f)
    ]
    for campus in campuses:
        for fmt_file in fmt_files:
            fpref, fsuf = str(fmt_file).split('.')
            # Fix: manage the output handle with ``with`` so it is
            # closed even when reading a record raises; previously the
            # writer's handle leaked on error. Also drops a dead
            # ``else: continue`` branch.
            with open(fmt_file, 'rb') as f, \
                    open(fpref + '_' + campus + '.mrc', 'wb') as out:
                writer = MARCWriter(out)
                for rec in MARCReader(f):
                    # Write once per 049 $a naming this campus
                    # (matches the original behavior, including
                    # duplicates for repeated subfields).
                    for field in rec.get_fields("049"):
                        for suba in field.get_subfields("a"):
                            if campus in suba:
                                writer.write(rec)
def check_mdt(i, n1): ret = ['Failed.', ''] # create config, bin with open('z3950.cfg', 'w') as f: f.write('open tcp:aleph.nkp.cz:9991/AUT-UTF\n') f.write('set_marcdump rec.bin\n') f.write('find @attr 1=12 "' + i + '"\n') # sys. number http://aleph.nkp.cz/web/Z39_NK_cze.htm f.write('show 1\n') f.write('close\n') f.write('quit\n') # call client data = subprocess.check_output(['yaz-client', '-f', 'z3950.cfg']) # paprse output reader = MARCReader(open('rec.bin', 'rb')) for rec in reader: if '100' in rec and 'a' in rec['100']: if n1.strip(',').decode('utf-8') != rec['100']['a'].strip(','): ret = ['Failed.', rec['100']['a']] for F in rec.get_fields('400'): if 'a' in F: if n1.strip(',').decode('utf-8') == F['a'].strip(','): ret = ['400', rec['100']['a']] for F in rec.get_fields('500'): if 'a' in F: if n1.strip(',').decode('utf-8') == F['a'].strip(','): ret = ['500', rec['100']['a']] else: ret = ['100', rec['100']['a']] # cleanup os.remove('z3950.cfg') os.remove('rec.bin') return ret
def marc2list(src, dst):
    """Append [bib number, ISBNs] rows to ``dst`` for each bib in
    ``src`` that passes is_valid_bib_type and lacks a research call
    number.

    :raises ValueError: when a qualifying record has no 907 field.
    """
    with open(src, "rb") as f:
        reader = MARCReader(f)
        n = 0
        for bib in reader:
            n += 1
            rec_type = bib.leader[6]
            blvl = bib.leader[7]
            item_form = bib["008"].data[23]
            valid = is_valid_bib_type(rec_type, blvl, item_form)
            if valid and not has_research_callnum(bib):
                try:
                    bibNo = bib["907"].value()
                    bibNo = parse_bibNo(bibNo)
                except AttributeError:
                    # Fix: ``raise (f"...")`` raised a plain string,
                    # which itself raises "exceptions must derive from
                    # BaseException" and loses the message; raise a
                    # real exception instead.
                    raise ValueError(f"record {n} has no sierra bib number")
                isbns = ""
                isbns_data = []
                for field in bib.get_fields("020"):
                    isbns_data.append(field.value())
                isbns = extract_isbns(isbns_data)
                save2csv(dst, [bibNo, isbns])
def read_mc(sys_no):
    """
    Loads marc data from aleph.unibas.ch for one single system number.

    :param sys_no: System number to which the marc entry is to be loaded.
    :return: marc binary for said system number.
    """
    try:
        connection = zoom.Connection('aleph.unibas.ch', 9909)
        connection.databaseName = 'dsv05'
        connection.preferredRecordSyntax = 'USMARC'
        query = zoom.Query('PQF', '@attr 1=1032 ' + sys_no)
        hits = connection.search(query)
        data = bytes(hits[0].data)
    except zoom.ConnectionError:
        print("\n!!! Error: could not connect to aleph !!!\n")
        return
    # Keep a local copy of the raw blob before parsing it.
    __write_to_cache(data, sys_no)
    reader = MARCReader(bytes(data), force_utf8=True, to_unicode=True)
    return next(reader)
def process_marc_files():
    # Walk every MARC file, printing per-record titles and catalog URLs
    # plus per-file and overall timings.
    run_start = datetime.datetime.now()
    marc_file_list: list = create_marc_file_list()
    for marc_path in marc_file_list:
        t_file = datetime.datetime.now()
        print('\nnew file...')
        print(f'marc_file_path, ``{marc_path}``')
        with open(marc_path, 'rb') as fh:
            for record in MARCReader(fh):
                print('\nnew record...')
                print(f'full_title, ``{record.title()}``')
                # 907 $a holds the Sierra bib number; strip the leading
                # '.' and trailing check digit for the catalog URL.
                bib = record['907']['a']
                print(
                    f'bib_url, ``https://search.library.brown.edu/catalog/{bib[1:-1]}``'
                )
        t_done = datetime.datetime.now()
        print(f'\nfile-elapsed-time, ``{t_done - t_file}``')
    run_end = datetime.datetime.now()
    print(
        f'\nall-files-elapsed-time, ``{run_end - run_start}``\n'
    )
def main():
    # Fetch the raw MARC for one bib and print each record's title.
    marc_data = util.get_clio_marc('11256832')
    for record in MARCReader(marc_data):
        print(record.title())
def get_records(filename: str) -> List:
    """Read every MARC record from ``filename``, decoded as UTF-8."""
    with open(filename, 'rb') as fh:
        reader = MARCReader(fh, to_unicode=True, force_utf8=True)
        return [record for record in reader]
def __read_mc_from_cache(no):
    """Load the cached MARC blob for system number ``no`` and parse it."""
    with open("data/tmp/marc/" + no + ".marc", "rb") as f:
        raw = f.read()
    reader = MARCReader(bytes(raw), force_utf8=True, to_unicode=True)
    return next(reader)
def read_code(ff): reader = MARCReader(ff) num = 0 for r in reader: try: num += 1 pubnum = str(r['001']).replace("=001 ", '') subjects = '' subjectcodes = '' for rr in str(r).decode('utf-8', 'ignore').split("\n"): if rr.startswith('=650'): if subjects == '': subjects += re.sub('\=650.*?(?=[A-Z])', '', rr) else: subjects += ' ' + re.sub('\=650.*?(?=[A-Z])', '', rr) if rr.startswith('=690'): if subjectcodes == '': subjectcodes += re.sub('\=690.*?(?=[0-9])', '', rr) else: subjectcodes += ' ' + re.sub('\=690.*?(?=[0-9])', '', rr) outp.writerow([pubnum, subjects, subjectcodes]) except: print num print num
def main():
    """Convert each MARC file under data/mrc/ into a CSV of selected
    bibliographic fields under data/csv/ (Python 2 code: ``file()``,
    ``print`` statement)."""
    for filename in os.listdir('data/mrc/'):
        # Skip subdirectories and hidden files.
        if os.path.isdir('data/mrc/' + filename) or filename[0] == '.':
            continue
        reader = MARCReader(file('data/mrc/' + filename))
        with open('data/csv/' + os.path.splitext(filename)[0] + '.csv',
                  'wb') as f:
            writer = csv.writer(f)
            writer.writerow(['isbn', 'title', 'author', 'publisher',
                             'pubplace', 'pubyear', 'extent', 'dimensions',
                             'subject', 'inclusiondate', 'source', 'library',
                             'notes'])
            for i, record in enumerate(reader):
                # Optional fields: read each only when its tag is present.
                pubplace = clean(record['260']['a']) if '260' in record else None
                extent = clean(record['300']['a'], True) if '300' in record else None
                dimensions = record['300']['c'] if '300' in record else None
                subject = record['650']['a'] if '650' in record else None
                inclusiondate = record['988']['a'] if '988' in record else None
                source = record['906']['a'] if '906' in record else None
                library = record['690']['5'] if '690' in record else None
                # Concatenate the $a of every note field.
                notes = " ".join([field['a'] for field in record.notes() if 'a' in field])
                writer.writerow([record.isbn(), get_title(record),
                                 clean(record.author(), True),
                                 clean(record.publisher()), pubplace,
                                 clean(record.pubyear()), extent, dimensions,
                                 subject, inclusiondate, source, library,
                                 notes])
                # Progress marker every 100 records.
                if i % 100 == 0:
                    print filename + ": " + str(i) + " documents processed"
def read_iso(file_name: str) -> list:
    """Read an ISO2709 file one text line at a time, parsing each line
    as a single MARC record; returns the records in file order
    (placeholder lines become empty Records)."""
    result = []
    temp_name = "临时.iso"
    fp = open(file_name, 'r', encoding='utf-8')
    for index, data in enumerate(fp):
        # Copy the current line into a scratch file so pymarc can read
        # it in binary mode.
        fp_temp = open(temp_name, 'w', encoding='utf-8')
        fp_temp.write(data)
        fp_temp.close()
        fh = open(temp_name, 'rb')
        try:
            record = next(MARCReader(fh))
        except (NoFieldsFound, UnicodeDecodeError):
            # Some rows are content-free placeholders (never crawled);
            # keep an empty Record so positions line up.
            record = Record()
        except RecordLengthInvalid:
            # A trailing newline yields a bogus record — stop reading.
            break
        finally:
            fh.close()
        result.append(record)
    fp.close()
    os.remove(temp_name)
    return result
def new_marc(self, marc_file):
    """Build a fixture list (bib dicts, item fixtures, cat-edit
    fixtures) from a MARC file (Python 2 code: ``file()`` and the
    octal-legal day literal ``01``)."""
    from tech_services_reports.utility_code import convert_date, CatStat
    from tech_services_reports.models import Bib, Item
    from datetime import date
    from pymarc import MARCReader
    fixture = []
    cedit_count = 0
    for record in MARCReader(file(marc_file)):
        # Bib number lives in 907 $a, minus its leading character.
        bib_number = record['907']['a'][1:]
        bib_level = record['998']['c']
        cat_date = convert_date(record['998']['b'])
        cat_stat = CatStat(record)
        no_items = False
        #Create Bib fixture
        bdict = {}
        bdict['pk'] = bib_number
        bdict['model'] = 'tech_services_reports.bib'
        _fields = {}
        _fields['level'] = bib_level
        _fields['cat_type'] = cat_stat.cattype
        _fields['mat_type'] = cat_stat.mattype
        if cat_date:
            _fields['cat_date'] = cat_date.strftime("%Y-%m-%d")
        else:
            cat_date = None
        items = record.get_fields('945')
        #Need to do a check to see if any attached items
        #were created after the check d
        #Prep item fixture and append to main fixture
        _i = self.item_fixture(items)
        item_fixture = _i['fixture']
        valid_items = _i['valid_items']
        if len(_i['valid_items']) == 0:
            no_items = True
        #Checks to see if this bib, items pair is within reporting range.
        #The item_fixture function will skip items that aren't created
        #within the reporting range.
        #Skip pairs without a cat date and no items.
        if not cat_date:
            if no_items:
                continue
        #Allow pairs with no cat date but with items to be added to the fixture.
        #Skip pairs with an old cat date and no items.
        if cat_date:
            if cat_date < date(2010, 10, 01):
                if no_items:
                    continue
        #Item fixture is a list so we need to add it to main list.
        fixture += item_fixture
        _fields['items'] = valid_items
        bdict['fields'] = _fields
        #This is just a dict so append.
        fixture.append(bdict)
        #Get cat edits
        marc_995 = record.get_fields('995')
        cedit = self.cat_edit_fixture(marc_995, bib_number)
        fixture += cedit['fixture']
        cedit_count += len(cedit['fixture'])
    # NOTE(review): ``fixture`` and ``cedit_count`` are built but no
    # return statement is visible here — confirm against the original
    # source whether this block is truncated.
def main():
    """Stream MARC from stdin and emit one JSON line per record.

    On a decode failure, report the error and the last successfully
    read record to stderr.
    """
    # Fix: ``record`` was unbound in the except handler when the very
    # first record failed to decode, turning the error report into a
    # NameError.
    record = None
    try:
        for record in MARCReader(sys.stdin.buffer.read(), to_unicode=True):
            sys.stdout.write(json.dumps(transpose_to_ldj(record)) + "\n")
            sys.stdout.flush()
    except UnicodeDecodeError as e:
        eprint("unicode decode error: {}".format(e))
        eprint(record)
def __init__(self, url_harvest, extra_data, **kwargs):
    '''Grab file and copy to local temp file'''
    super(MARCFetcher, self).__init__(url_harvest, extra_data, **kwargs)
    self.url_marc_file = url_harvest
    # Spool the remote file into a seekable temp file, then point a
    # unicode-converting reader at it.
    self.marc_file = tempfile.TemporaryFile()
    payload = urllib.urlopen(self.url_marc_file).read()
    self.marc_file.write(payload)
    self.marc_file.seek(0)
    self.marc_reader = MARCReader(
        self.marc_file, to_unicode=True, utf8_handling='replace')
def test_reading_utf8_without_flag(self):
    # Raw read: the UTF-8 bytes come back as a plain str.
    reader = MARCReader(open('test/utf8_without_leader_flag.dat'))
    record = reader.next()
    self.assertEquals(type(record), Record)
    title = record['240']['a']
    self.assertEquals(type(title), str)
    self.assertEquals(
        title, 'De la solitude a\xcc\x80 la communaute\xcc\x81.')

    # Decoded as MARC8 the combining accents are dropped.
    reader = MARCReader(open('test/utf8_without_leader_flag.dat'),
                        to_unicode=True, hide_utf8_warnings=True)
    record = reader.next()
    self.assertEquals(type(record), Record)
    title = record['240']['a']
    self.assertEquals(type(title), unicode)
    # unless you force utf-8 characters will get lost and
    # warnings will appear in the terminal
    self.assertEquals(title, 'De la solitude a la communaute .')

    # force reading as utf-8
    reader = MARCReader(open('test/utf8_without_leader_flag.dat'),
                        to_unicode=True, force_utf8=True,
                        hide_utf8_warnings=True)
    record = reader.next()
    self.assertEquals(type(record), Record)
    title = record['240']['a']
    self.assertEquals(type(title), unicode)
    self.assertEquals(
        title,
        u'De la solitude a' + unichr(0x0300) +
        ' la communaute' + unichr(0x0301) + '.')
class MARCFetcher(Fetcher):
    '''Harvest a MARC FILE. Can be local or at a URL'''

    def __init__(self, url_harvest, extra_data):
        '''Grab file and copy to local temp file'''
        super(MARCFetcher, self).__init__(url_harvest, extra_data)
        self.url_marc_file = url_harvest
        # Buffer the whole remote file in a seekable local temp file
        # before handing it to pymarc.
        self.marc_file = tempfile.TemporaryFile()
        payload = urllib.urlopen(self.url_marc_file).read()
        self.marc_file.write(payload)
        self.marc_file.seek(0)
        self.marc_reader = MARCReader(self.marc_file,
                                      to_unicode=True,
                                      utf8_handling='replace')

    def next(self):
        '''Return MARC record by record to the controller'''
        record = self.marc_reader.next()
        return record.as_dict()