def load_part(archive_id, part, start_pos=0):
    """Load MARC records from one part of an archive item and try to merge
    each record with an existing edition.

    Yields (loc, data) for every record that could NOT be matched to an
    existing edition (either no candidate pool, or no candidate merged),
    so the caller can create a new edition from it.

    :param archive_id: archive item identifier (directory under marc_path)
    :param part: filename of the MARC part within the item
    :param start_pos: byte offset to resume from within the part file

    Fixes vs. original: py2 `print` statements -> print() (matching the
    sibling load_part variant), `iteritems()` -> `items()`, and the part
    file is now closed via a context manager instead of leaking.
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # Ensure the part file is closed even if parsing raises.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)
            if is_loaded(loc):
                continue
            # Control fields/identifiers used to build the candidate pool.
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                # Dump the offending record's title field before re-raising.
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            if not index_fields or 'title' not in index_fields:
                continue
            edition_pool = pool.build(index_fields)
            if not edition_pool:
                # No existing candidates at all: hand the record back.
                yield loc, data
                continue
            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)
            match = False
            seen = set()
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow /type/redirect chains until a real thing is found.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break
            if not match:
                yield loc, data
def iter_marc():
    """Iterate every MARC record across all source items.

    Yields (rec_no, pos, loc, data) tuples, where rec_no is a running
    count across all files.

    Fix vs. original: each part file is now opened with a context manager
    so handles are closed as soon as a part is consumed, instead of
    leaking one open file per part.
    """
    rec_no = 0
    for source_id, ia, name in sources():
        for part, size in files(ia):
            full_part = ia + "/" + part
            filename = rc['marc_path'] + full_part
            assert os.path.exists(filename)
            with open(filename) as f:
                for pos, loc, data in read_marc_file(full_part, f):
                    rec_no += 1
                    yield rec_no, pos, loc, data
def load_part(archive_id, part, start_pos=0):
    """Load MARC records from one part of an archive item and try to merge
    each record with an existing edition.

    Yields (loc, data) only for records whose candidate pool was non-empty
    but produced no merge (records with an empty pool are skipped here —
    unlike the other load_part variant, which yields them too).

    :param archive_id: archive item identifier (directory under marc_path)
    :param part: filename of the MARC part within the item
    :param start_pos: byte offset to resume from within the part file

    Fixes vs. original: `iteritems()` -> `items()` (the block already uses
    py3 print(), so iteritems would raise AttributeError on py3), and the
    part file is closed via a context manager instead of leaking.
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # Ensure the part file is closed even if parsing raises.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)
            if is_loaded(loc):
                continue
            # Control fields/identifiers used to build the candidate pool.
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            if not index_fields or 'title' not in index_fields:
                continue
            edition_pool = pool.build(index_fields)
            if not edition_pool:
                continue
            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)
            match = False
            seen = set()
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    # NOTE(review): no break after a successful merge, so
                    # later candidates are still tried and may also attach
                    # source records — preserved as-is; confirm intended.
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing)
                        match = True
            if not match:
                yield loc, data
# Driver script: walk every MARC file, record file ids in db_file, and
# process each record, reporting progress every `chunk` records.
#
# Fixes vs. original: py2-only `print >> db_file` and `print` statements
# converted to print(); removed the dead `f = open(name)` at the top of
# the loop (its handle was leaked and immediately shadowed by the real
# open below); the per-part file is closed via a context manager;
# dropped the commented-out `# continue` dead code.
chunk = 10000
total = 32856039  # expected total record count, used for progress reporting

for name, part, size in all_files():
    print(part)
    file_id += 1
    print(file_id, part, size)
    # Persist the file-id -> part mapping.
    print('\t'.join([str(file_id), part]), file=db_file)
    filename = rc['marc_path'] + '/' + part
    if not os.path.exists(filename):
        print(filename, 'missing')
    assert os.path.exists(filename)
    with open(filename) as f:
        for pos, loc, data in read_marc_file(part, f):
            rec_no += 1
            if rec_no % chunk == 0:
                t = time() - t_prev
                progress_update(rec_no, t)
                t_prev = time()
            process_record(pos, loc, data, file_id)

db_file.close()
db_rec.close()
print("closing files")
for v in out.values():
    v.close()
print("finished")
for tag in '100', '700': line = get_first_tag(data, set([tag])) if line: fields = list(get_all_subfields(line)) if any(k == 'c' for k, v in fields): print((loc, fields)) def files(ia): endings = ['.mrc', '.marc', '.out', '.dat', '.records.utf8'] def good(filename): return any(filename.endswith(e) for e in endings) dir = rc['marc_path'] + ia dir_len = len(dir) + 1 files = [] for dirpath, dirnames, filenames in os.walk(dir): files.extend(dirpath + "/" + f for f in sorted(filenames)) return [(i[dir_len:], os.path.getsize(i)) for i in files if good(i)] rec_no = 0 for source_id, ia, name in sources(): for part, size in files(ia): full_part = ia + "/" + part filename = rc['marc_path'] + full_part assert os.path.exists(filename) f = open(filename) for pos, loc, data in read_marc_file(full_part, f): rec_no +=1 process_record(pos, loc, data)