def update_edition(e):
    if not is_single_core():
        return []

    ekey = e['key']
    logger.info("updating edition %s", ekey)

    wkey = e.get('works') and e['works'][0]['key']
    w = wkey and withKey(wkey)
    authors = []

    if w:
        authors = [withKey(a['author']['key']) for a in w.get("authors", []) if 'author' in a]

    request_set = SolrRequestSet()
    request_set.delete(ekey)

    q = {'type': '/type/redirect', 'location': ekey}
    redirect_keys = [r['key'] for r in query_iter(q)]
    for k in redirect_keys:
        request_set.delete(k)

    doc = EditionBuilder(e, w, authors).build()
    request_set.add(doc)
    return request_set.get_requests()
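# Usage sketch for update_edition (hypothetical key; assumes the single-core
# solr schema is enabled so is_single_core() returns True). The returned
# request strings can be posted with solr_update, as the later update_keys
# variants do:
#
#     e = withKey('/books/OL1M')
#     requests = update_edition(e)    # delete + re-add requests for the edition
#     solr_update(requests + ['<commit/>'], index='editions', debug=True)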
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        logger.info("processing edition %s", k)
        edition = withKey(k)

        if edition and edition['type']['key'] == '/type/redirect':
            logger.warn("Found redirect to %s", edition['location'])
            edition = withKey(edition['location'])

        if not edition:
            logger.warn("No edition found for key %r. Ignoring...", k)
            continue
        elif edition['type']['key'] != '/type/edition':
            logger.warn("Found a document of type %r. Ignoring...", edition['type']['key'])
            continue

        if edition.get("works"):
            wkeys.add(edition["works"][0]['key'])
        else:
            # index the edition as it does not belong to any work
            wkeys.add(k)

    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))

    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = withKey(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)

    logger.info("END update_keys")
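# Usage sketch for update_keys (hypothetical keys). Edition keys are first
# resolved to their work, then works and authors are re-indexed in that order:
#
#     update_keys(['/books/OL1M', '/works/OL1W', '/authors/OL1A'])
#     update_keys(['/works/OL2W'], commit=False)    # defer the solr commit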
def try_amazon(key):
    thing = withKey(key)
    if 'isbn_10' not in thing:
        return None
    if 'authors' in thing:
        authors = []
        for a in thing['authors']:
            author_thing = withKey(a['key'])
            if 'name' in author_thing:
                authors.append(author_thing['name'])
    else:
        authors = []
    return amazon.build_amazon(thing, authors)
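# Usage sketch for try_amazon (hypothetical key): the result is None unless
# the edition carries an isbn_10; otherwise amazon.build_amazon assembles a
# record from the edition and whatever author names could be resolved:
#
#     rec = try_amazon('/books/OL1M')    # None or an Amazon record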
def hide_books(start):
    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by updated",
        {'start': start})
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if 'printdisabled' in collections:
                continue
        print `ia`, row.updated
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print e['key'], `e.get('title', None)`
            del e['ocaid']
            mend.append(e)
    print 'removing links from %d editions' % len(mend)
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        print "processing edition", k
        edition = withKey(k)
        if edition.get("works"):
            wkeys.add(edition["works"][0]['key'])
        else:
            # index the edition as it does not belong to any work
            wkeys.add(k)

    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))

    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = withKey(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)

    logger.info("END update_keys")
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    m = re_author_key.match(akey)
    if not m:
        print 'bad key:', akey
        return
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect', '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id]
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        print a['type']['key']
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    url = 'http://' + get_solr('works') + '/solr/works/select?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)
    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    add_field(doc, 'key', author_id)
    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        redirects = ''.join('<id>%s</id>' % re_author_key.match(r['key']).group(1) for r in query_iter(q))
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')

    requests.append(tostring(add).encode('utf-8'))
    return requests
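# Sketch of the request strings update_author returns (keys and values
# hypothetical). A live author yields stale-redirect deletes, if any, followed
# by one <add> document built from the works-core query; a redirect or deleted
# author yields a single delete-by-query:
#
#     update_author('/authors/OL22098A')
#     # -> ['<delete><id>OL123A</id></delete>',
#     #     '<add><doc>...key, name, top_work, work_count, top_subjects...</doc></add>']
#     update_author('/authors/OL999A')    # points at a redirect
#     # -> ['<delete><query>key:OL999A</query></delete>']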
def fix_up_authors(w, akey, editions):
    print 'looking for author:', akey
    #print (w, akey, editions)
    seen_akey = False
    need_save = False
    for a in w.get('authors', []):
        print 'work:', w['key']
        obj = withKey(a['author']['key'])
        if obj['type']['key'] == '/type/redirect':
            a['author']['key'] = obj['location']
            print obj['key'], 'redirects to', obj['location']
            #a['author']['key'] = '/authors/' + re_author_key.match(a['author']['key']).group(1)
            assert a['author']['key'].startswith('/authors/')
            obj = withKey(a['author']['key'])
            assert obj['type']['key'] == '/type/author'
            need_save = True
        if akey == a['author']['key']:
            seen_akey = True
    if seen_akey:
        if need_save:
            print 'need save:', a
        return need_save
    try:
        ekey = editions[0]['key']
    except:
        print 'editions:', editions
        raise
    #print 'author %s missing. copying from first edition %s' % (akey, ekey)
    #print 'before:'
    for a in w.get('authors', []):
        print a
    e = withKey(ekey)
    #print e
    if not e.get('authors', None):
        print 'no authors in edition'
        return
    print 'authors from first edition', e['authors']
    w['authors'] = [{'type': '/type/author_role', 'author': a} for a in e['authors']]
    #print 'after:'
    #for a in w['authors']:
    #    print a
    return True
def fix_up_authors(w, akey, editions):
    print('looking for author:', akey)
    #print (w, akey, editions)
    seen_akey = False
    need_save = False
    for a in w.get('authors', []):
        print('work:', w['key'])
        obj = withKey(a['author']['key'])
        if obj['type']['key'] == '/type/redirect':
            a['author']['key'] = obj['location']
            print(obj['key'], 'redirects to', obj['location'])
            #a['author']['key'] = '/authors/' + re_author_key.match(a['author']['key']).group(1)
            assert a['author']['key'].startswith('/authors/')
            obj = withKey(a['author']['key'])
            assert obj['type']['key'] == '/type/author'
            need_save = True
        if akey == a['author']['key']:
            seen_akey = True
    if seen_akey:
        if need_save:
            print('need save:', a)
        return need_save
    try:
        ekey = editions[0]['key']
    except:
        print('editions:', editions)
        raise
    #print 'author %s missing. copying from first edition %s' % (akey, ekey)
    #print 'before:'
    for a in w.get('authors', []):
        print(a)
    e = withKey(ekey)
    #print e
    if not e.get('authors', None):
        print('no authors in edition')
        return
    print('authors from first edition', e['authors'])
    w['authors'] = [{'type': '/type/author_role', 'author': a} for a in e['authors']]
    #print 'after:'
    #for a in w['authors']:
    #    print a
    return True
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot'])
                    ol.save(w['key'], w, 'avoid author redirect')
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
        last_update = time()
        print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
        last_update = time()
        print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, "w"), offset
def fix_up_authors(w, akey, editions):
    seen_akey = False
    need_save = False
    print 'fix_up_authors'
    for a in w.get('authors', []):
        print a
        obj = withKey(a['author']['key'])
        if obj['type']['key'] == '/type/redirect':
            a['author']['key'] = obj['location']
            a['author']['key'] = '/authors/' + re_author_key.match(a['author']['key']).group(1)
            print 'getting:', a['author']['key']
            obj = withKey(a['author']['key'])
            print 'found:', obj
            assert obj['type']['key'] == '/type/author'
            need_save = True
        if akey == a['author']['key']:
            seen_akey = True
    if seen_akey:
        return need_save
    try:
        ekey = editions[0]['key']
    except:
        print 'editions:', editions
        raise
    print 'author %s missing. copying from first edition %s' % (akey, ekey)
    print 'before:'
    for a in w.get('authors', []):
        print a
    e = withKey(ekey)
    print e
    if not e.get('authors', None):
        print 'no authors in edition'
        return
    w['authors'] = [{'type': '/type/author_role', 'author': a} for a in e['authors']]
    print 'after:'
    for a in w['authors']:
        print a
    return True
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        print "processing edition", k
        edition = withKey(k)
        if edition.get("works"):
            wkeys.add(edition["works"][0]['key'])

    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))

    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        w = withKey(k)
        requests += update_work(w, debug=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        requests += update_author(k)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)

    logger.info("END update_keys")
def update_keys(keys):
    # update works
    requests = []
    wkeys = [k for k in keys if k.startswith("/works/")]
    print "updating", wkeys
    for k in wkeys:
        w = withKey(k)
        requests += update_work(w, debug=True)
    if requests:
        requests += ['<commit />']
        solr_update(requests, debug=True)

    # update authors
    requests = []
    akeys = [k for k in keys if k.startswith("/authors/")]
    print "updating", akeys
    for k in akeys:
        requests += update_author(k)
    if requests:
        requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)
def get_author(self, a):
    """Returns the author dict from author entry in the work.

        get_author({"author": {"key": "/authors/OL1A"}})
    """
    if 'author' not in a: # OL Web UI bug
        return # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
    author = a['author']
    if 'type' in author:
        # means it is already the whole object.
        # It'll be like this when doing re-indexing of solr.
        return author
    key = a['author']['key']
    m = re_author_key.match(key)
    if not m:
        print 'invalid author key:', key
        return
    return withKey(key)
def get_author(self, a):
    """Returns the author dict from author entry in the work.

        get_author({"author": {"key": "/authors/OL1A"}})
    """
    if 'author' not in a: # OL Web UI bug
        return # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
    author = a['author']
    if 'type' in author:
        # means it is already the whole object.
        # It'll be like this when doing re-indexing of solr.
        return author
    key = a['author']['key']
    m = re_author_key.match(key)
    if not m:
        logger.error('invalid author key: %s', key)
        return
    return withKey(key)
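# A minimal sketch of the key pattern get_author relies on, assuming
# re_author_key matches Open Library author keys of the form /authors/OL<n>A:
#
#     import re
#     re_author_key = re.compile(r'^/authors/(OL\d+A)$')
#     assert re_author_key.match('/authors/OL1A').group(1) == 'OL1A'
#     assert re_author_key.match('/works/OL1W') is None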
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print "hide start:", hide_start
    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by scandate_dt",
        {"start": hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(";"))
            if "printdisabled" in collections or "lendinglibrary" in collections:
                continue
        print `ia`, row.updated
        for eq in query({"type": "/type/edition", "ocaid": ia}):
            print eq["key"]
            e = ol.get(eq["key"])
            if "ocaid" not in e:
                continue
            if "works" in e:
                fix_works.update(e["works"])
            print e["key"], `e.get("title", None)`
            del e["ocaid"]
            mend.append(e)
        last_updated = row.updated
    print "removing links from %d editions" % len(mend)
    if not mend:
        return
    print ol.save_many(mend, "remove link")
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ["<commit/>"], debug=True)
    print >> open(hide_state_file, "w"), last_updated
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print('hide start:', hide_start)
    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start",
        {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if ignore_noindex & collections:
                continue
        print((repr(ia), row.updated))
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print((e['key'], repr(e.get('title', None))))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print('removing links from %d editions' % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    print(last_updated, file=open(hide_state_file, 'w'))
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print 'hide start:', hide_start
    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start",
        {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if ignore_noindex & collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print(e['key'], repr(e.get('title', None)))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print 'removing links from %d editions' % len(mend)
    if not mend:
        return
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    print >> open(hide_state_file, 'w'), last_updated
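# hide_books keeps its own cursor: hide_state_file holds one timestamp line,
# read on entry and rewritten with the last processed row's `updated` value on
# exit, so successive runs resume where the previous one stopped. A sketch of
# the file's contents (value hypothetical):
#
#     2011-12-01T00:00:44.241604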
            assert first_redirect['type']['key'] == '/type/redirect'
            akey = first_redirect['location']
            if akey.startswith('/authors/'):
                akey = '/a/' + akey[len('/authors/'):]
            title_redirects = find_title_redirects(akey)
            works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects)
            updated = update_works(akey, works, do_updates=True)
            works_to_update.update(w['key'] for w in updated)
        for query in i['data']['query']:
            key = query.pop('key')
            process_save(key, query)
    # store.put gets called when any document is updated in the store.
    # Borrowing/Returning a book triggers one.
    elif action == 'store.put':
        # A sample record looks like this:
        # {
        #     "action": "store.put",
        #     "timestamp": "2011-12-01T00:00:44.241604",
        #     "data": {
        #         "data": {"borrowed": "false", "_key": "ebooks/books/OL5854888M", "_rev": "975708", "type": "ebook", "book_key": "/books/OL5854888M"},
        #         "key": "ebooks/books/OL5854888M"
        #     },
        #     "site": "openlibrary.org"
        # }
        data = i.get('data', {}).get("data")
        if data.get("type") == "ebook" and data.get("_key", "").startswith("ebooks/books/"):
            edition_key = data['book_key']
            process_save(edition_key, withKey(edition_key))

    since_last_update = time() - last_update
    if len(works_to_update) > work_limit or len(authors_to_update) > author_limit or since_last_update > time_limit:
        run_update()
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
        last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', `akey`
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', `akey`
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], index='authors', debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)
        if not subject_need_update(key, count):
            print 'no update needed:', (subject_type, subject_name, count)
            continue
        print 'update needed:', (subject_type, subject_name, count)
        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)
    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True, index='subjects')
    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

for num, line in enumerate(open('/1/edward/imagepdf/possible_match2')):
    doc = eval(line)
    if 'publisher' not in doc:
        continue
    item_id = doc['item_id']
    if query({'type': '/type/edition', 'source_records': 'ia:' + item_id}):
        continue
    e = withKey(doc['ol'])
    if 'publishers' not in e:
        continue
    title_match = False
    if doc['title'] == e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title'] + e.get('subtitle', ''):
        title_match = True
    elif doc['title'] == e['title'] + e.get('subtitle', ''):
        title_match = True
    if not title_match:
        continue
    if doc['publisher'] != e['publishers'][0]:
        continue
    print 'match:', item_id, doc['ol']
    add_source_records(doc['ol'], item_id)
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])

    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
        #print 'editions_by_work:', editions_by_work
        work = obj_by_key[wkey]
        work['editions'] = editions_by_work[wkey]
        subjects = get_work_subjects(work)
        for subject_type, values in subjects.iteritems():
            subjects_to_update.update((subject_type, v) for v in values)
        try:
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
        except AuthorRedirect:
            work = withKey(wkey)
            work['editions'] = editions_by_work[wkey]
            ret = update_work(work, debug=True, resolve_redirects=True)
        work_updates += ret
    if work_updates:
        solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({
        'redirects': dup_keys,
        'master_key': master_key,
        'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
        print 'editions:', [e['key'] for e in w['editions']]

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))
    #print len(w['editions']), 'editions found'

    #print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a: # OL Web UI bug
            continue # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print w['key']
            print
            raise AuthorRedirect
    for a in authors:
        print 'author:', a
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)
    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e['ia_collection']:
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
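# Rough shape of the <doc> element build_doc returns (field names from the
# add_field calls above; values hypothetical), as it would appear once
# serialized into an <add> request:
#
#     <doc>
#       <field name="key">OL1W</field>
#       <field name="title">Example Title</field>
#       <field name="has_fulltext">True</field>
#       <field name="edition_count">3</field>
#       <field name="ia">exampletitle00smit</field>
#       <field name="author_key">OL1A</field>
#       <field name="author_name">Jane Smith</field>
#       ...
#     </doc>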
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
        last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print('update author:', repr(akey))
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print('akey:', repr(akey))
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)
        if not subject_need_update(key, count):
            print 'no update needed:', (subject_type, subject_name, count)
            continue
        print 'update needed:', (subject_type, subject_name, count)
        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)
    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True)
    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)
    print >> fh_log, akey
    print >> fh_log, 'works:'
    pprint(works, fh_log)

    while True: # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print >> fh_log, 'redirect found', w['key'], '->', wkey, editions
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print >> fh_log, 'no redirects left'
            break
        print >> fh_log, 'save redirects'
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print r
            raise

    all_existing = set()
    work_keys = []
    print >> fh_log, 'edition_to_work:'
    print >> fh_log, `dict(edition_to_work)`
    print >> fh_log
    print >> fh_log, 'work_to_edition'
    print >> fh_log, `dict(work_to_edition)`
    print >> fh_log

#    open('edition_to_work', 'w').write(`dict(edition_to_work)`)
#    open('work_to_edition', 'w').write(`dict(work_to_edition)`)
#    open('work_by_key', 'w').write(`dict(work_by_key)`)

    work_title_match = {}
    works_by_title = {}
    for w in works: # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print 'bad work:', wkey
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)

    for w in works: # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)

    for w in works: # 3rd pass
        for wkey, v in w['existing_works'].iteritems():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.iteritems():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()

    for w in works: # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = w['existing_works'].keys()[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print >> fh_log, wkey, 'already updated!'
                    print wkey, 'already updated!'
            works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
            works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'].iterkeys())
        best_match = max(w['existing_works'].iteritems(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
do_updates = False

while True: # until redirects repaired
    q = {'type': '/type/edition', 'authors': akey, 'works': None}
    work_to_edition = defaultdict(set)
    edition_to_work = defaultdict(set)
    for e in query_iter(q):
        if e.get('works', None):
            for w in e['works']:
                work_to_edition[w['key']].add(e['key'])
                edition_to_work[e['key']].add(w['key'])

    work_title = {}
    fix_redirects = []
    for k, editions in work_to_edition.items():
        w = withKey(k)
        if w['type']['key'] == '/type/redirect':
            print 'redirect found'
            wkey = w['location']
            assert re_work_key.match(wkey)
            for ekey in editions:
                e = withKey(ekey)
                e['works'] = [Reference(wkey)]
                fix_redirects.append(e)
            continue
        work_title[k] = w['title']
    if not fix_redirects:
        print 'no redirects left'
        break
    print 'save redirects'
    ol.save_many(fix_redirects, "merge works")
def withKey_cached(key, obj_cache={}):
    if key not in obj_cache:
        obj_cache[key] = withKey(key)
    return obj_cache[key]
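# withKey_cached leans on the mutable-default-argument idiom: the same
# obj_cache dict persists across calls, so repeated lookups of one key hit the
# backend only once. Usage sketch (hypothetical key):
#
#     a = withKey_cached('/authors/OL1A')                 # fetches and caches
#     a = withKey_cached('/authors/OL1A')                 # served from the cache
#     b = withKey_cached('/authors/OL1A', obj_cache={})   # fresh cache, refetches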
def resolve(a):
    if a['type']['key'] == '/type/redirect':
        a = withKey(a['location'])
    return a
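# resolve follows a single redirect hop; a chain of redirects would need a
# loop. A minimal sketch (documents hypothetical):
#
#     a = {'type': {'key': '/type/redirect'}, 'location': '/authors/OL2A'}
#     a = resolve(a)    # now the document fetched from /authors/OL2A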
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    if akey == '/authors/':
        return
    m = re_author_key.match(akey)
    if not m:
        print 'bad key:', akey
    assert m
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect', '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id]
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        print a['type']['key']
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    url = 'http://' + get_solr('works') + '/solr/works/select?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)
    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    add_field(doc, 'key', author_id)
    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        try:
            redirects = ''.join('<id>%s</id>' % re_author_key.match(r['key']).group(1) for r in query_iter(q))
        except AttributeError:
            print 'redirects:', [r['key'] for r in query_iter(q)]
            raise
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')

    requests.append(tostring(add).encode('utf-8'))
    return requests
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print d['changeset']
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])

    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
        #print 'editions_by_work:', editions_by_work
        work = obj_by_key[wkey]
        work['editions'] = editions_by_work[wkey]
        subjects = get_work_subjects(work)
        for subject_type, values in subjects.iteritems():
            subjects_to_update.update((subject_type, v) for v in values)
        try:
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
        except AuthorRedirect:
            work = withKey(wkey)
            work['editions'] = editions_by_work[wkey]
            ret = update_work(work, debug=True, resolve_redirects=True)
        work_updates += ret
    solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({
        'redirects': dup_keys,
        'master_key': master_key,
        'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
def pick_cover(editions):
    for pub_year, ekey in sorted(get_covers(editions), reverse=True):
        e = withKey(ekey)
        if e['type']['key'] == '/type/edition':
            return ekey
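# pick_cover scans candidate covers newest-first (sorted by pub_year,
# descending) and returns the first key that still resolves to a live edition,
# skipping redirects and deletes. Sketch, assuming get_covers yields
# (pub_year, edition_key) pairs:
#
#     ekey = pick_cover(editions)    # e.g. '/books/OL1M', or None if none match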
print item, edition_pool
e1 = build_marc(rec)
print e1
match = False
seen = set()
for k, v in edition_pool.iteritems():
    for edition_key in v:
#        edition_key = '/books/' + re_edition_key.match(edition_key).match(1)
        if edition_key in seen:
            continue
        thing = None
        while not thing or thing['type']['key'] == '/type/redirect':
            seen.add(edition_key)
            thing = withKey(edition_key)
            assert thing
            if thing['type']['key'] == '/type/redirect':
                print 'following redirect %s => %s' % (edition_key, thing['location'])
                edition_key = thing['location']
        if try_merge(e1, edition_key, thing):
            print 'add source records:', edition_key, item
            print (edition_key, item)
            print >> add_src_rec, (edition_key, item)
            #add_source_records(edition_key, ia)
            #write_log(ia, when, "found match: " + edition_key)
            match = True
            break
if not match:
    print full_rec
    print >> new_book, full_rec
    except KeyboardInterrupt:
        raise
    except:
        pass
    sleep(2)

skip = True
skip = False
for line in open('/1/edward/jsondump/2009-07-29/has_ocaid'):
    key = line[:-1]
    if key == '/b/OL6539962M': # the end
        break
    if skip:
        if key == '/b/OL6539962M':
            skip = False
        else:
            continue
    if not has_cover_retry(key):
        print 'no cover'
        continue
    print key
    e = withKey(key)
    if not e.get('ocaid', None):
        print 'no ocaid'
        continue
    find_img(e['ocaid'].strip())

fh_log.close()
print 'finished'
def build_doc(w):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
        print 'editions:', [e['key'] for e in w['editions']]

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))
    print len(w['editions']), 'editions found'

    print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:
            continue
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    #subjects = four_types(find_subjects(get_marc_subjects(w)))
    subjects = {}
    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)
    print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
#        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
            write_log(ia, when, "loaded")
            continue
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                found = True
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if 'type' not in thing:
                        print(thing)
                    if thing.get('error') == 'notfound':
                        found = False
                        break
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' % (edition_key, thing['location']))
                        edition_key = thing['location']
                if not found:
                    continue
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, ia)
                    write_log(ia, when, "found match: " + edition_key)
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        logger.info("processing edition %s", k)
        edition = get_document(k)

        if edition and edition['type']['key'] == '/type/redirect':
            logger.warn("Found redirect to %s", edition['location'])
            edition = withKey(edition['location'])

        if not edition:
            logger.warn("No edition found for key %r. Ignoring...", k)
            continue
        elif edition['type']['key'] != '/type/edition':
            logger.info("%r is a document of type %r. Checking if any work has it as edition in solr...", k, edition['type']['key'])
            wkey = solr_select_work(k)
            if wkey:
                logger.info("found %r, updating it...", wkey)
                wkeys.add(wkey)
            logger.warn("Found a document of type %r. Ignoring...", edition['type']['key'])
        else:
            if edition.get("works"):
                wkeys.add(edition["works"][0]['key'])
            else:
                # index the edition as it does not belong to any work
                wkeys.add(k)

    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))

    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = get_document(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True, commitWithin=1000)

    logger.info("END update_keys")
            works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects)
            updated = update_works(akey, works, do_updates=True)
            works_to_update.update(w['key'] for w in updated)
        for query in i['data']['query']:
            key = query.pop('key')
            process_save(key, query)
    # store.put gets called when any document is updated in the store.
    # Borrowing/Returning a book triggers one.
    elif action == 'store.put':
        # A sample record looks like this:
        # {
        #     "action": "store.put",
        #     "timestamp": "2011-12-01T00:00:44.241604",
        #     "data": {
        #         "data": {"borrowed": "false", "_key": "ebooks/books/OL5854888M", "_rev": "975708", "type": "ebook", "book_key": "/books/OL5854888M"},
        #         "key": "ebooks/books/OL5854888M"
        #     },
        #     "site": "openlibrary.org"
        # }
        data = i.get('data', {}).get("data")
        if data.get("type") == "ebook" and data.get("_key", "").startswith("ebooks/books/"):
            edition_key = data['book_key']
            process_save(edition_key, withKey(edition_key))

    since_last_update = time() - last_update
    if len(works_to_update) > work_limit or len(authors_to_update) > author_limit or since_last_update > time_limit:
        run_update()
def update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log):
    work_updated = []
    best = w['best_match']['key']
    update = []
    subjects_from_existing_works = defaultdict(set)
    for wkey in w['existing_works'].iterkeys():
        if wkey == best:
            continue
        existing = get_with_retry(wkey)
        for k in 'subjects', 'subject_places', 'subject_times', 'subject_people':
            if existing.get(k):
                subjects_from_existing_works[k].update(existing[k])

        update.append({'type': '/type/redirect', 'location': best, 'key': wkey})
        work_updated.append(wkey)

    for wkey in w['existing_works'].iterkeys():
        editions = set(work_to_edition[wkey])
        editions.update(e['key'] for e in w['editions'])
        for ekey in editions:
            e = get_with_retry(ekey)
            e['works'] = [{'key': best}]
            authors = []
            for akey in e['authors']:
                a = get_with_retry(akey)
                if a['type'] == '/type/redirect':
                    m = re_author_key.match(a['location'])
                    akey = '/authors/' + m.group(1)
                authors.append({'key': str(akey)})
            e['authors'] = authors
            new_toc = fix_toc(e)
            if new_toc:
                e['table_of_contents'] = new_toc
            update.append(e)

    cur_work = w['best_match']
    need_save = fix_up_authors(cur_work, akey, w['editions'])
    if any(subjects_from_existing_works.values()):
        need_save = True
    if need_save or cur_work['title'] != w['title'] \
            or ('subtitle' in w and 'subtitle' not in cur_work) \
            or ('subjects' in w and 'subjects' not in cur_work):
        if cur_work['title'] != w['title']:
            print('update work title:', best, repr(cur_work['title']), '->', repr(w['title']))
        existing_work = get_with_retry(best)
        if existing_work['type'] != '/type/work':
            pprint(existing_work)
        assert existing_work['type'] == '/type/work'
        existing_work['title'] = w['title']
        for k, v in subjects_from_existing_works.items():
            existing_subjects = set(existing_work.get(k, []))
            existing_work.setdefault(k, []).extend(
                s for s in v if s not in existing_subjects)
        add_detail_to_work(w, existing_work)
        for a in existing_work.get('authors', []):
            obj = withKey(a['author'])
            if obj['type']['key'] != '/type/redirect':
                continue
            new_akey = obj['location']
            a['author'] = {'key': new_akey}
            assert new_akey.startswith('/authors/')
            obj = withKey(new_akey)
            assert obj['type']['key'] == '/type/author'
        print('existing:', existing_work, file=fh_log)
        print('subtitle:', repr(existing_work['subtitle']) if 'subtitle' in existing_work else 'n/a', file=fh_log)
        update.append(existing_work)
        work_updated.append(best)

    if do_updates:
        try:
            print(ol.save_many(update, 'merge works'), file=fh_log)
        except:
            for page in update:
                print(page)
            raise
    return work_updated
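# get_with_retry is not defined here; a minimal sketch, assuming it simply
# retries ol.get() a few times to ride out transient API failures (the
# attempt count and delay are assumptions):
from time import sleep

def get_with_retry(key, attempts=5, delay=5):
    for i in range(attempts):
        try:
            return ol.get(key)
        except:
            if i == attempts - 1:
                raise
            sleep(delay)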
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        logger.info("processing edition %s", k)
        edition = get_document(k)

        if edition and edition['type']['key'] == '/type/redirect':
            logger.warn("Found redirect to %s", edition['location'])
            edition = withKey(edition['location'])

        if not edition:
            logger.warn("No edition found for key %r. Ignoring...", k)
            continue
        elif edition['type']['key'] != '/type/edition':
            logger.info(
                "%r is a document of type %r. Checking if any work has it as edition in solr...",
                k, edition['type']['key'])
            wkey = solr_select_work(k)
            if wkey:
                logger.info("found %r, updating it...", wkey)
                wkeys.add(wkey)

            if edition['type']['key'] == '/type/delete':
                logger.info(
                    "Found a document of type %r. Queueing it for deletion from solr...",
                    edition['type']['key'])
                # Also remove if there is any work with that key in solr.
                wkeys.add(k)
            else:
                logger.warn("Found a document of type %r. Ignoring...", edition['type']['key'])
        else:
            if edition.get("works"):
                wkeys.add(edition["works"][0]['key'])
            else:
                # index the edition as it does not belong to any work
                wkeys.add(k)

    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))

    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = get_document(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update editions
    requests = []
    for k in ekeys:
        try:
            e = withKey(k)
            requests += update_edition(e)
        except:
            logger.error("Failed to update edition %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit/>']
        solr_update(requests, index="editions", debug=True)

    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True, commitWithin=1000)

    # update subjects
    skeys = set(k for k in keys if k.startswith("/subjects/"))
    requests = []
    for k in skeys:
        logger.info("updating %s", k)
        try:
            requests += update_subject(k)
        except:
            logger.error("Failed to update subject %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="subjects", debug=True)

    logger.info("END update_keys")
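# solr_update is not defined in this file; a plausible sketch, assuming each
# request is an XML string POSTed to the named core's /update handler (the
# host, port, and default index are assumptions):
import urllib2

def solr_update(requests, index='works', debug=False, commitWithin=None):
    url = 'http://localhost:8983/solr/%s/update' % index
    if commitWithin is not None:
        url += '?commitWithin=%d' % commitWithin
    for body in requests:
        if debug:
            logger.info("POST %s: %s", url, body[:100])
        req = urllib2.Request(url, body, {'Content-Type': 'text/xml'})
        urllib2.urlopen(req).read()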
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)

    print(akey, file=fh_log)
    print('works:', file=fh_log)
    pprint(works, fh_log)

    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print('redirect found', w['key'], '->', wkey, editions, file=fh_log)
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w

        if not fix_redirects:
            print('no redirects left', file=fh_log)
            break
        print('save redirects', file=fh_log)
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print(r)
            raise

    all_existing = set()
    work_keys = []
    print('edition_to_work:', file=fh_log)
    print(repr(dict(edition_to_work)), file=fh_log)
    print(file=fh_log)
    print('work_to_edition', file=fh_log)
    print(repr(dict(work_to_edition)), file=fh_log)
    print(file=fh_log)

#    open('edition_to_work', 'w').write(repr(dict(edition_to_work)))
#    open('work_to_edition', 'w').write(repr(dict(work_to_edition)))
#    open('work_by_key', 'w').write(repr(dict(work_by_key)))

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print('bad work:', wkey)
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)

    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)

    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].iteritems():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.iteritems():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey]
                if title != w['title']]

    works_updated_this_session = set()

    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = w['existing_works'].keys()[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print(wkey, 'already updated!', file=fh_log)
                    print(wkey, 'already updated!')
            works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
            works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'].iterkeys())
        best_match = max(w['existing_works'].iteritems(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
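# Usage sketch, mirroring the monitor-loop call earlier in this document (the
# author key is hypothetical, and find_works is assumed to default its
# `existing` argument when no title redirects are supplied):
akey = '/authors/OL1A'
works = find_works(akey, get_books(akey, books_query(akey)))
updated = update_works(akey, works, do_updates=True)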
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

for num, line in enumerate(open('/1/edward/imagepdf/possible_match2')):
    doc = eval(line)
    if 'publisher' not in doc:
        continue
    item_id = doc['item_id']
    if query({'type': '/type/edition', 'source_records': 'ia:' + item_id}):
        continue
    e = withKey(doc['ol'])
    if 'publishers' not in e:
        continue
    title_match = False
    if doc['title'] == e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title'] + e.get('subtitle', ''):
        title_match = True
    elif doc['title'] == e['title'] + e.get('subtitle', ''):
        title_match = True
    if not title_match:
        continue
    if doc['publisher'] != e['publishers'][0]:
        continue
    print 'match:', item_id, doc['ol']
    add_source_records(doc['ol'], item_id)
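# The four title comparisons above could be collapsed into one membership
# test; a behavior-preserving refactor sketch (title_variants is a
# hypothetical helper):
def title_variants(e):
    prefix = e.get('title_prefix', '')
    subtitle = e.get('subtitle', '')
    yield e['title']
    yield prefix + e['title']
    yield prefix + e['title'] + subtitle
    yield e['title'] + subtitle

title_match = doc['title'] in set(title_variants(e))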
def update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log):
    work_updated = []
    best = w['best_match']['key']
    update = []
    subjects_from_existing_works = defaultdict(set)
    for wkey in w['existing_works'].iterkeys():
        if wkey == best:
            continue
        existing = get_with_retry(wkey)
        for k in 'subjects', 'subject_places', 'subject_times', 'subject_people':
            if existing.get(k):
                subjects_from_existing_works[k].update(existing[k])

        update.append({'type': '/type/redirect', 'location': best, 'key': wkey})
        work_updated.append(wkey)

    for wkey in w['existing_works'].iterkeys():
        editions = set(work_to_edition[wkey])
        editions.update(e['key'] for e in w['editions'])
        for ekey in editions:
            e = get_with_retry(ekey)
            e['works'] = [{'key': best}]
            authors = []
            for akey in e['authors']:
                a = get_with_retry(akey)
                if a['type'] == '/type/redirect':
                    m = re_author_key.match(a['location'])
                    akey = '/authors/' + m.group(1)
                authors.append({'key': str(akey)})
            e['authors'] = authors
            new_toc = fix_toc(e)
            if new_toc:
                e['table_of_contents'] = new_toc
            update.append(e)

    cur_work = w['best_match']
    need_save = fix_up_authors(cur_work, akey, w['editions'])
    if any(subjects_from_existing_works.values()):
        need_save = True
    if need_save or cur_work['title'] != w['title'] \
            or ('subtitle' in w and 'subtitle' not in cur_work) \
            or ('subjects' in w and 'subjects' not in cur_work):
        if cur_work['title'] != w['title']:
            print 'update work title:', best, `cur_work['title']`, '->', `w['title']`
        existing_work = get_with_retry(best)
        if existing_work['type'] != '/type/work':
            pprint(existing_work)
        assert existing_work['type'] == '/type/work'
        existing_work['title'] = w['title']
        for k, v in subjects_from_existing_works.items():
            existing_subjects = set(existing_work.get(k, []))
            existing_work.setdefault(k, []).extend(
                s for s in v if s not in existing_subjects)
        add_detail_to_work(w, existing_work)
        for a in existing_work.get('authors', []):
            obj = withKey(a['author'])
            if obj['type']['key'] != '/type/redirect':
                continue
            new_akey = obj['location']
            a['author'] = {'key': new_akey}
            assert new_akey.startswith('/authors/')
            obj = withKey(new_akey)
            assert obj['type']['key'] == '/type/author'
        print >> fh_log, 'existing:', existing_work
        print >> fh_log, 'subtitle:', `existing_work['subtitle']` if 'subtitle' in existing_work else 'n/a'
        update.append(existing_work)
        work_updated.append(best)
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
        #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        ia = None
        if 'ocaid' in e:
            ia = e['ocaid']
        elif 'ia_loaded_id' in e:
            loaded = e['ia_loaded_id']
            ia = loaded if isinstance(loaded, basestring) else loaded[0]
        if ia:
            ia_meta_fields = get_ia_collection_and_box_id(ia)
            collection = ia_meta_fields['collection']
            if 'ia_box_id' in e and isinstance(e['ia_box_id'], basestring):
                e['ia_box_id'] = [e['ia_box_id']]
            if ia_meta_fields.get('boxid'):
                box_id = list(ia_meta_fields['boxid'])[0]
                e.setdefault('ia_box_id', [])
                if box_id.lower() not in [x.lower() for x in e['ia_box_id']]:
                    e['ia_box_id'].append(box_id)
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        if 'identifiers' in e:
            for k, id_list in e['identifiers'].iteritems():
                k_orig = k
                k = k.replace('.', '_').replace(',', '_').replace('(', '').replace(')', '').replace(':', '_').replace('/', '').replace('#', '').lower()
                m = re_solr_field.match(k)
                if not m:
                    print (k_orig, k)
                assert m
                for v in id_list:
                    v = v.strip()
                    if v not in identifiers[k]:
                        identifiers[k].append(v)
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))

    #print len(w['editions']), 'editions found'
    #print w['key']

    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:  # OL Web UI bug
            continue  # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print w['key']
            print
            raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
    #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(
        e['subtitle'] for e in editions
        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                v = v.replace('-', '')
                isbn.add(v)
                alt = opposite_isbn(v)
                if alt:
                    isbn.add(alt)
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()

    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], basestring):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(e['ia_loaded_id'][0], basestring)
                except AssertionError:
                    print e.get('ia')
                    print e['ia_loaded_id']
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], basestring):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(e['ia_box_id'][0], basestring)
                except AssertionError:
                    print e['key']
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set()  # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []):
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e.get('ia_collection', []):
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e.get('ia_collection', []):
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e.get('ia_collection', []))
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e.get('public_scan'):
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))
    if lending_edition or in_library_edition:
        add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)
    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)

    return doc
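# add_field and add_field_list are not shown here; plausible sketches,
# assuming doc is an xml.etree Element and each Solr field becomes a
# <field name="...">value</field> child of <doc>:
from xml.etree.ElementTree import SubElement, tostring

def add_field(doc, name, value):
    # one <field> element per value; Solr's XML update format repeats the
    # element for multi-valued fields
    field = SubElement(doc, 'field', name=name)
    field.text = unicode(value)

def add_field_list(doc, name, field_list):
    for value in field_list:
        add_field(doc, name, value)

# The finished <doc> would then be serialized (e.g. tostring(doc)) and
# wrapped in an <add>...</add> request body for solr_update.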