def follow_redirects(key):
    keys = []
    thing = None
    while not thing or thing['type']['key'] == '/type/redirect':
        keys.append(key)
        thing = withKey(key)
        assert thing
        if thing['type']['key'] == '/type/redirect':
            print('following redirect {} => {}'.format(key, thing['location']))
            key = thing['location']
    return (keys, thing)

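
# Usage sketch (the author key below is hypothetical): follow a redirect
# chain to the live record, collecting every key visited along the way.
#
#   keys, thing = follow_redirects('/a/OL1234567A')
#   # keys  -> the starting key plus any intermediate redirect keys
#   # thing -> the final record, whose type is not /type/redirect
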
def merge_authors(ol, keys, debug=False):
    # print 'merge author %s:"%s" and %s:"%s"' % (author['key'], author['name'], merge_with['key'], merge_with['name'])
    # print 'becomes: "%s"' % repr(new_name)
    authors = [
        a for a in (withKey(k) for k in keys)
        if a['type']['key'] != '/type/redirect'
    ]
    not_redirect = set(a['key'] for a in authors)
    if debug:
        for a in authors:
            print(a)
    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    for a in authors:
        print(repr((a['key'], a['name'])))
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])
    best_key = pick_best_author(authors)['key']
    imgs = [
        a['key'] for a in authors
        if a['key'] != '/a/OL2688880A' and has_image(a['key'])
    ]
    if len(imgs) == 1:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)
        # Molière and O. J. O. Ferreira
        if len(imgs) != 0:
            print('imgs:', imgs)
            return  # skip
        if not (imgs == [u'/a/OL21848A', u'/a/OL4280680A']
                or imgs == [u'/a/OL325189A', u'/a/OL266422A']
                or imgs == [u'/a/OL5160945A', u'/a/OL5776228A']):
            print(imgs)
            assert len(imgs) == 0
    print(new_key)
    print(best_key)
    do_normalize(new_key, best_key, authors)
    old_keys = set(k for k in keys if k != new_key)
    print('old keys:', old_keys)
    for old in old_keys:
        # /b/OL21291659M
        switch_author(ol, old, new_key, old_keys, debug=True)
        if old in not_redirect:
            make_redirect(ol, old, new_key)
        q = {
            'authors': old,
            'type': '/type/edition',
        }
        if list(get_things(q)) != []:
            switch_author(ol, old, new_key, old_keys, debug=True)

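
# Usage sketch (hypothetical keys; 'ol' is the catalog connection): merge
# duplicate author records, normalizing onto the surviving key and leaving
# redirects behind for the rest.
#
#   merge_authors(ol, ['/a/OL12345A', '/a/OL67890A'], debug=True)
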
def try_amazon(thing):
    if 'isbn_10' not in thing:
        return None
    if 'authors' in thing:
        authors = []
        for a in thing['authors']:
            # this is a hack
            # the items in thing['authors'] should all be the same type
            if isinstance(a, dict):
                akey = a['key']
            else:
                assert isinstance(a, six.string_types)
                akey = a
            author_thing = withKey(akey)
            if 'name' in author_thing:
                authors.append(author_thing['name'])
    else:
        authors = []
    return amazon.build_amazon(thing, authors)

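
# Usage sketch (hypothetical edition key): try_amazon expects an edition
# record with an 'isbn_10' field and returns None when one is absent.
#
#   rec = try_amazon(withKey('/b/OL21291659M'))
#   if rec:
#       ...  # compare against the parsed MARC edition
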
def try_merge(e1, edition_key, thing):
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
        if thing_type == Reference('/type/delete'):
            return False
    assert thing_type == Reference('/type/edition')
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
        except urllib2.HTTPError as error:  # urllib2 is Python 2 only; on Python 3 this would be urllib.error.HTTPError
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            return True
    if not rec2:
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)

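
# Note on try_merge's return value, as implemented above: True appears to mean
# "treat e1 as matching this edition", which includes the cases where no
# comparable record could be fetched (missing MARCXML, empty archive data).
# False means the records look distinct and the caller should keep searching.
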
def load_part(archive_id, part, start_pos=0):
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        except fast_parse.NotBook:
            continue
        if not index_fields or 'title' not in index_fields:
            continue
        print(loc)
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                # inline version of follow_redirects(), also tracking seen keys
                thing = None
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' % (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, data

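
# Driver sketch (hypothetical archive id, file name, and handler): load_part
# is a generator that yields (loc, data) for each MARC record that matched
# no existing edition, so a caller might collect those for a fresh import.
#
#   for loc, data in load_part('marc_records_scriblio_net', 'part01.dat'):
#       handle_unmatched(loc, data)  # handle_unmatched is hypothetical
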