def load_part(archive_id, part, start_pos=0): print 'load_part:', archive_id, part global rec_no, t_prev, load_count full_part = archive_id + "/" + part f = open(rc['marc_path'] + "/" + full_part) if start_pos: f.seek(start_pos) for pos, loc, data in read_marc_file(full_part, f, pos=start_pos): rec_no += 1 if rec_no % chunk == 0: progress(archive_id, rec_no, start_pos, pos) if is_loaded(loc): continue want = ['001', '003', '010', '020', '035', '245'] try: index_fields = fast_parse.index_fields(data, want) except KeyError: print loc print fast_parse.get_tag_lines(data, ['245']) raise except AssertionError: print loc raise if not index_fields or 'title' not in index_fields: continue edition_pool = pool.build(index_fields) if not edition_pool: yield loc, data continue rec = fast_parse.read_edition(data) e1 = build_marc(rec) match = False seen = set() for k, v in edition_pool.iteritems(): for edition_key in v: if edition_key in seen: continue thing = None while not thing or thing['type']['key'] == '/type/redirect': seen.add(edition_key) thing = withKey(edition_key) assert thing if thing['type']['key'] == '/type/redirect': print 'following redirect %s => %s' % (edition_key, thing['location']) edition_key = thing['location'] if try_merge(e1, edition_key, thing): add_source_records(edition_key, loc, thing, data) match = True break if match: break if not match: yield loc, data
def process_record(file_id, pos, length, data):
    """Insert one MARC record into the 'rec' table and its identifier tables.

    Indexes the record's control/identifier fields, stores a 'rec' row
    (file, position, length plus any title/lccn/call_number), then one row
    per isbn/oclc value pointing back at that record.
    """
    fields = index_fields(data, ['001', '010', '020', '035', '245'], check_author=False)
    if not fields:
        return
    # single-valued columns stored directly on the 'rec' row
    extra = {}
    for name in ('title', 'lccn', 'call_number'):
        if name in fields:
            extra[name] = fields[name][0]
    rec_id = web.insert('rec', marc_file=file_id, pos=pos, len=length, **extra)
    # multi-valued identifiers go into their own tables, one row per value
    for table in ('isbn', 'oclc'):
        if table not in fields:
            continue
        for value in fields[table]:
            web.insert(table, seqname=False, rec=rec_id, value=value)
def load_part(archive_id, part, start_pos=0):
    """Stream records from one MARC archive part, yielding the unmatched ones.

    Reads MARC records from rc['marc_path']/<archive_id>/<part>, skips
    records that are already loaded, merge matched records into existing
    editions via try_merge/add_source_records, and yields (loc, data) for
    records that found no match. Records with no candidate edition pool
    are skipped entirely (not yielded).

    :param archive_id: archive identifier (first path component)
    :param part: file name of the MARC part inside the archive
    :param start_pos: byte offset to resume from (0 = start of file)
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # 'with' guarantees the MARC file is closed even if the consumer
    # abandons this generator early (the original leaked the handle).
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)
            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                # dump the failing location and its 245 (title) lines
                # before re-raising, so the bad record can be inspected
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            if not index_fields or 'title' not in index_fields:
                continue
            edition_pool = pool.build(index_fields)
            if not edition_pool:
                continue
            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)
            match = False
            seen = set()
            for k, v in edition_pool.iteritems():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing)
                        match = True
                        # NOTE(review): no break here, so a record that
                        # matches several candidate editions gets source
                        # records added to each of them — confirm this is
                        # intended (a sibling variant of this loader breaks
                        # after the first match).
            if not match:
                yield loc, data
def process_record(pos, loc, data):
    """Add one MARC record's 'oclc' and 'title' values to their lookup DBs.

    Each value is stored under the record's location. Inserts are
    best-effort: any failure other than KeyboardInterrupt/NameError is
    silently skipped so one bad value cannot stop the run.
    """
    parsed = index_fields(data, ['010'])
    if not parsed:
        return
    for field, target_db in (('oclc', oclc_db), ('title', title_db)):
        for value in parsed.get(field, []):
            try:
                add_to_db(target_db, str(value), loc)
            except (KeyboardInterrupt, NameError):
                # interrupts and coding errors must not be swallowed
                raise
            except:
                pass
def process_record(pos, loc, data):
    """Validate the ISBN fields of one MARC record.

    Debug/QA pass: prints the record location and parsed fields, then
    fails an assert, whenever an ISBN contains ';' or exceeds 16 chars.
    Records without ISBNs are ignored.
    """
    global rec_id
    want = [
        # '006',  # Material Characteristics
        # '010',  # LCCN
        '020',    # ISBN
        # '035',  # OCLC
        # '130', '240',  # work title
        # '245',  # title
        # '246', '730', '740',  # other titles
    ]
    rec = index_fields(data, want, check_author=False)
    field_size = {'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16}
    if not rec or 'isbn' not in rec:
        return
    for isbn in rec['isbn']:
        if ';' in isbn:
            # print context before the assert so the bad record is findable
            print(loc)
            print(rec)
            assert ';' not in isbn
    too_long = any(len(i) > 16 for i in rec['isbn'])
    if not too_long:
        return
    print(loc)
    print(rec)
    assert not too_long
    # Generic per-field length check. Effectively unreachable (we return
    # or assert above), but the original was also broken: it iterated the
    # dict without .items() (unpacking keys) and referenced an undefined
    # 'size' variable — fixed so it works if ever re-enabled.
    for field, limit in field_size.items():
        if field not in rec:
            continue
        if any(len(v) > limit for v in rec[field]):
            print(loc)
            print(rec)
            assert False
    # rec = list(get_tag_lines(data, want))
    return
def process_record(pos, loc, data, file_id): global rec_id want = [ # '006', # Material Characteristics '010', # LCCN '020', # ISBN '035', # OCLC # '130', '240', # work title '245', # title # '246', '730', '740' # other titles ] try: rec = index_fields(data, want, check_author=False) except: print loc raise if not rec: return field_size = {'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16} if 'isbn' in rec: rec['isbn'] = [i for i in rec['isbn'] if len(i) <= 16] if 'oclc' in rec: rec['oclc'] = [i for i in rec['oclc'] if len(i) <= 16] if 'lccn' in rec: rec['lccn'] = [i for i in rec['lccn'] if len(i) <= 16] for k, v in rec.iteritems(): if 'isbn' != k and any(len(i) > field_size[k] for i in v): print loc print rec assert False rec_id += 1 (f, p, l) = loc[5:].split(':') print >> db_rec, '\t'.join([str(rec_id), str(file_id), p, l]) for k, v in rec.iteritems(): if not v: continue for i in v: add_to_index(out[k], i, str(rec_id))
def process_record(pos, loc, data, file_id): global rec_id want = [ # '006', # Material Characteristics '010', # LCCN '020', # ISBN '035', # OCLC # '130', '240', # work title '245', # title # '246', '730', '740' # other titles ] try: rec = index_fields(data, want, check_author = False) except: print loc raise if not rec: return field_size = { 'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16 } if 'isbn' in rec: rec['isbn'] = [i for i in rec['isbn'] if len(i) <= 16] if 'oclc' in rec: rec['oclc'] = [i for i in rec['oclc'] if len(i) <= 16] if 'lccn' in rec: rec['lccn'] = [i for i in rec['lccn'] if len(i) <= 16] for k, v in rec.iteritems(): if 'isbn' != k and any(len(i) > field_size[k] for i in v): print loc print rec assert False rec_id += 1 (f, p, l) = loc.split(':') print >> db_rec, '\t'.join([str(rec_id), str(file_id), p, l]) for k, v in rec.iteritems(): if not v: continue for i in v: add_to_index(out[k], i, str(rec_id))
def process_record(pos, loc, data):
    """Validate the ISBN fields of one MARC record.

    Debug/QA pass: prints the record location and parsed fields, then
    fails an assert, whenever an ISBN contains ';' or exceeds 16 chars.
    Records without ISBNs are ignored.
    """
    global rec_id
    want = [
        # '006',  # Material Characteristics
        # '010',  # LCCN
        '020',    # ISBN
        # '035',  # OCLC
        # '130', '240',  # work title
        # '245',  # title
        # '246', '730', '740',  # other titles
    ]
    rec = index_fields(data, want, check_author=False)
    field_size = {'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16}
    if not rec or 'isbn' not in rec:
        return
    for isbn in rec['isbn']:
        if ';' in isbn:
            # print context before the assert so the bad record is findable
            print(loc)
            print(rec)
            assert ';' not in isbn
    too_long = any(len(i) > 16 for i in rec['isbn'])
    if not too_long:
        return
    print(loc)
    print(rec)
    assert not too_long
    # Generic per-field length check. Effectively unreachable (we return
    # or assert above), but the original was also broken: it iterated the
    # dict without .items() (unpacking keys) and referenced an undefined
    # 'size' variable — fixed so it works if ever re-enabled.
    for field, limit in field_size.items():
        if field not in rec:
            continue
        if any(len(v) > limit for v in rec[field]):
            print(loc)
            print(rec)
            assert False
    # rec = list(get_tag_lines(data, want))
    return