def load_part(archive_id, part, start_pos=0): print 'load_part:', archive_id, part global rec_no, t_prev, load_count full_part = archive_id + "/" + part f = open(rc['marc_path'] + "/" + full_part) if start_pos: f.seek(start_pos) for pos, loc, data in read_marc_file(full_part, f, pos=start_pos): rec_no += 1 if rec_no % chunk == 0: progress(archive_id, rec_no, start_pos, pos) if is_loaded(loc): continue want = ['001', '003', '010', '020', '035', '245'] try: index_fields = fast_parse.index_fields(data, want) except KeyError: print loc print fast_parse.get_tag_lines(data, ['245']) raise except AssertionError: print loc raise if not index_fields or 'title' not in index_fields: continue edition_pool = pool.build(index_fields) if not edition_pool: yield loc, data continue rec = fast_parse.read_edition(data) e1 = build_marc(rec) match = False seen = set() for k, v in edition_pool.iteritems(): for edition_key in v: if edition_key in seen: continue thing = None while not thing or thing['type']['key'] == '/type/redirect': seen.add(edition_key) thing = withKey(edition_key) assert thing if thing['type']['key'] == '/type/redirect': print 'following redirect %s => %s' % (edition_key, thing['location']) edition_key = thing['location'] if try_merge(e1, edition_key, thing): add_source_records(edition_key, loc, thing, data) match = True break if match: break if not match: yield loc, data
def load_part(archive_id, part, start_pos=0): print 'load_part:', archive_id, part global rec_no, t_prev, load_count full_part = archive_id + "/" + part f = open(rc['marc_path'] + "/" + full_part) if start_pos: f.seek(start_pos) for pos, loc, data in read_marc_file(full_part, f, pos=start_pos): rec_no += 1 if rec_no % chunk == 0: progress(archive_id, rec_no, start_pos, pos) if is_loaded(loc): continue want = ['001', '003', '010', '020', '035', '245'] try: index_fields = fast_parse.index_fields(data, want) except KeyError: print loc print fast_parse.get_tag_lines(data, ['245']) raise except AssertionError: print loc raise if not index_fields or 'title' not in index_fields: continue edition_pool = pool.build(index_fields) if not edition_pool: continue rec = fast_parse.read_edition(data) e1 = build_marc(rec) match = False seen = set() for k, v in edition_pool.iteritems(): for edition_key in v: if edition_key in seen: continue seen.add(edition_key) thing = withKey(edition_key) assert thing if try_merge(e1, edition_key, thing): add_source_records(edition_key, loc, thing) match = True if not match: yield loc, data
def get_651(key):
    """Collect the subfield lists of every MARC 651 field found in the
    sources for *key*.

    :param key: record key passed to get_src
    :return: list of subfield lists, one per 651 line, in source order
    """
    return [
        list(get_all_subfields(marc_line))
        for src in get_src(key)
        for tag, marc_line in get_tag_lines(get_from_local(src), ['651'])
    ]
def data_from_marc(locs, name):
    """Group MARC source locations by the author line that matches *name*.

    For every location the first 100 field and all 700 fields are filtered
    through read_line; each non-empty result maps back to the location.

    :param locs: iterable of MARC record locations
    :param name: name passed to read_line for matching
    :return: defaultdict mapping matched line -> list of locations
    """
    by_line = defaultdict(list)
    for loc in locs:
        record = marc_data(loc)
        # First 100 field, then every 700 field, in record order.
        candidates = [read_line(get_first_tag(record, set(['100'])), name)]
        for tag, raw in get_tag_lines(record, set(['700'])):
            candidates.append(read_line(raw, name))
        for matched in candidates:
            if matched:
                by_line[matched].append(loc)
    return by_line
def read_edition(loc, data): fields = {} for tag, line in handle_wrapped_lines(get_tag_lines(data, want)): fields.setdefault(tag, []).append(line) edition = {} if len(fields['008']) != 1: warn("There should be a single '008' field, %s has %d." % (loc, len(fields['008']))) return {} f = fields['008'][0] publish_date = str(f)[7:11] if publish_date.isdigit() and publish_date != '0000': edition["publish_date"] = publish_date if str(f)[6] == 't': edition["copyright_date"] = str(f)[11:15] publish_country = str(f)[15:18] if publish_country not in ('|||', ' '): edition["publish_country"] = publish_country lang = str(f)[35:38] if lang not in (' ', '|||'): edition["languages"] = [{ 'key': '/l/' + lang }] edition.update(read_lccn(fields)) try: edition.update(read_isbn(fields)) except: print loc raise edition.update(read_oclc(fields)) edition.update(read_lc_classification(fields)) edition.update(read_dewey(fields)) edition.update(read_authors(fields)) edition.update(read_title(fields)) edition.update(read_genres(fields)) edition.update(read_subjects(fields)) edition.update(read_pagination(fields)) edition.update(read_series(fields)) edition.update(read_work_titles(fields)) edition.update(read_other_titles(fields)) edition.update(read_edition_name(fields)) edition.update(read_publisher(fields)) edition.update(read_contributions(fields)) edition.update(read_location(fields)) edition.update(read_url(fields)) edition.update(read_toc(fields)) edition.update(read_notes(fields)) edition.update(read_description(fields)) return edition
def __init__(self, data):
    """Index the wanted MARC tag lines of *data*, grouped by tag, into
    self.fields (dict mapping tag -> list of raw lines in record order)."""
    grouped = {}
    for tag, raw_line in get_tag_lines(data, want):
        if tag not in grouped:
            grouped[tag] = []
        grouped[tag].append(raw_line)
    self.fields = grouped