def test_no_title(self):
    """Records without a title are indexed with the '__None__' placeholder."""
    for key, type_key in [('/books/OL1M', '/type/edition'),
                          ('/works/OL23W', '/type/work')]:
        requests = update_work.update_work({'key': key, 'type': {'key': type_key}})
        assert len(requests) == 1
        assert '<field name="title">__None__</field>' in requests[0].toxml()
def test_delete_work(self):
    """Deleting a work (or a work-keyed edition) yields exactly one DeleteRequest."""
    del_work = update_work.update_work(
        {'key': '/works/OL23W', 'type': {'key': '/type/delete'}})
    del_edition = update_work.update_work(
        {'key': '/works/OL23M', 'type': {'key': '/type/delete'}})
    assert isinstance(del_work, list)
    for result in (del_work, del_edition):
        assert len(result) == 1
        assert isinstance(result[0], update_work.DeleteRequest)
    assert del_work[0].toxml() == '<delete><query>key:/works/OL23W</query></delete>'
    assert del_edition[0].toxml() == '<delete><query>key:/works/OL23M</query></delete>'
def hide_books(start):
    """Remove 'ocaid' links from editions whose IA item is dark or noindex.

    Scans archive.org metadata rows updated after `start`, drops the
    'ocaid' field from the matching Open Library editions, saves them via
    ol.save_many, and reindexes the affected works in solr.  (Python 2.)
    """
    mend = []  # editions to save back with their 'ocaid' removed
    fix_works = set()  # keys of works that need a solr reindex
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by updated", {'start': start})
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            # printdisabled items keep their links even when dark/noindex
            if 'printdisabled' in collections:
                continue
        print `ia`, row.updated
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue  # link already removed
            if 'works' in e:
                fix_works.update(e['works'])
            print e['key'], `e.get('title', None)`
            del e['ocaid']
            mend.append(e)
    print 'removing links from %d editions' % len(mend)
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
def solr_updates(i):
    """Apply one author-merge changeset `i` to the works solr index.

    Expects i['data']['changeset']['data'] to hold exactly a 'master' and a
    'duplicates' entry.  Re-derives subjects for each affected work, pushes
    the work updates to solr, queues the merge in `authors_to_update`, and
    records timing in `update_times`.  (Python 2.)
    """
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        # merge changesets must contain exactly 'master' + 'duplicates'
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        # bare except is deliberate: dump the offending changeset, then re-raise
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print d['changeset']
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])
    master = None
    obj_by_key = {}  # every object in the changeset query, keyed for obj_cache
    works = []  # keys of works touched by this changeset
    editions_by_work = defaultdict(list)  # work key -> its editions
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue  # orphan edition: nothing to reindex
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master
    if len(d['result']) == 0:
        print i
    work_updates = []
    for wkey in works:
        #print 'editions_by_work:', editions_by_work
        work = obj_by_key[wkey]
        work['editions'] = editions_by_work[wkey]
        subjects = get_work_subjects(work)
        for subject_type, values in subjects.iteritems():
            subjects_to_update.update((subject_type, v) for v in values)
        ret = update_work(work, obj_cache=obj_by_key, debug=True)
        work_updates += ret
    solr_update(work_updates, debug=False, index='works')
    # queue the author merge for the authors index
    authors_to_update.append({
        'redirects': dup_keys,
        'master_key': master_key,
        'master': master})
    print 'authors to update:', len(authors_to_update)
    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
def test_no_title(self):
    """Solr docs for title-less records get the '__None__' placeholder title."""
    for key, type_key in (('/books/OL1M', '/type/edition'),
                          ('/works/OL23W', '/type/work')):
        requests = update_work.update_work({'key': key, 'type': {'key': type_key}})
        assert len(requests) == 1
        assert requests[0].doc['title'] == "__None__"
def test_work_no_title(self):
    """A title-less work is indexed with the title of one of its editions."""
    work = {'key': '/works/OL23W', 'type': {'key': '/type/work'}}
    edition = make_edition(work)
    edition['title'] = 'Some Title!'
    update_work.data_provider = FakeDataProvider([work, edition])
    result = update_work.update_work(work)
    assert len(result) == 1
    solr_doc = result[0].doc
    assert solr_doc['title'] == "Some Title!"
def test_work_no_title(self):
    """A title-less work picks up an edition title in its solr XML."""
    work = {'key': '/works/OL23W', 'type': {'key': '/type/work'}}
    edition = make_edition(work)
    edition['title'] = 'Some Title!'
    update_work.data_provider = FakeDataProvider([work, edition])
    result = update_work.update_work(work)
    assert len(result) == 1
    assert '<field name="title">Some Title!</field>' in result[0].toxml()
def test_delete_editions(self):
    """Deleting an edition record produces a single JSON delete command."""
    doc = {'key': '/works/OL23M', 'type': {'key': '/type/delete'}}
    requests = update_work.update_work(doc)
    assert len(requests) == 1
    assert requests[0].to_json_command() == '"delete": ["/works/OL23M"]'
def test_redirects(self):
    """A redirect record produces a single JSON delete command."""
    doc = {'key': '/works/OL23W', 'type': {'key': '/type/redirect'}}
    requests = update_work.update_work(doc)
    assert len(requests) == 1
    assert requests[0].to_json_command() == '"delete": ["/works/OL23W"]'
def test_redirects(self):
    """Redirected works are removed from solr via an id DeleteRequest."""
    doc = {'key': '/works/OL23W', 'type': {'key': '/type/redirect'}}
    requests = update_work.update_work(doc)
    assert len(requests) == 1
    delete_req = requests[0]
    assert isinstance(delete_req, update_work.DeleteRequest)
    assert delete_req.toxml() == '<delete><id>/works/OL23W</id></delete>'
def test_delete_editions(self):
    """Deleted editions are removed from solr via a key-query DeleteRequest."""
    doc = {'key': '/works/OL23M', 'type': {'key': '/type/delete'}}
    requests = update_work.update_work(doc)
    assert len(requests) == 1
    delete_req = requests[0]
    assert isinstance(delete_req, update_work.DeleteRequest)
    assert delete_req.toxml() == '<delete><query>key:/works/OL23M</query></delete>'
def run_update():
    """Flush the queued work and author updates to solr.  (Python 2.)

    Indexes every key in `works_to_update` (retrying up to 5 times,
    repairing author redirects in place on AuthorRedirect), batching solr
    requests in groups of 100; then indexes `authors_to_update`; finally
    resets both queues and checkpoints `offset` to `state_file`.
    """
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            # keys with an extra '/' past the '/works/' prefix are malformed
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            # retry loop: each AuthorRedirect is repaired, then retried
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot'])
                    ol.save(w['key'], w, 'avoid author redirect')
            # flush to solr in batches of 100 requests
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
            # if num % 1000 == 0:
            #     solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
        last_update = time()
        print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    # reset the queues and checkpoint the feed offset
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset
def run_update():
    """Flush the queued work and author updates to solr.  (Python 2.)

    Same logic as the single-quoted run_update variant in this file:
    index queued works (retrying through author redirects), then queued
    authors, then reset the queues and checkpoint `offset`.
    """
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            # keys with an extra '/' past the '/works/' prefix are malformed
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            # retry loop: each AuthorRedirect is repaired, then retried
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            # flush to solr in batches of 100 requests
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
            # if num % 1000 == 0:
            #     solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
        last_update = time()
        print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    # reset the queues and checkpoint the feed offset
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, "w"), offset
def test_delete_work(self):
    """Deleting work and edition records each yields one key-query DeleteRequest."""
    del_work = update_work.update_work(
        {'key': '/works/OL23W', 'type': {'key': '/type/delete'}})
    del_edition = update_work.update_work(
        {'key': '/works/OL23M', 'type': {'key': '/type/delete'}})
    assert isinstance(del_work, list)
    for reqs in (del_work, del_edition):
        assert len(reqs) == 1
        assert isinstance(reqs[0], update_work.DeleteRequest)
    assert del_work[0].toxml() == '<delete><query>key:/works/OL23W</query></delete>'
    assert del_edition[0].toxml() == '<delete><query>key:/works/OL23M</query></delete>'
def hide_books(start):
    """Remove 'ocaid' links from editions whose IA item is dark or noindex.

    Resumes from the timestamp stored in `hide_state_file` (the `start`
    argument is not read here -- TODO confirm intent), scrubs matching
    editions, reindexes the affected works in solr, then writes the new
    resume timestamp back to `hide_state_file`.  (Python 2.)
    """
    hide_start = open(hide_state_file).readline()[:-1]
    print "hide start:", hide_start
    mend = []  # editions to save back with their 'ocaid' removed
    fix_works = set()  # keys of works that need a solr reindex
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by scandate_dt",
        {"start": hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(";"))
            # printdisabled/lendinglibrary items keep their links
            if "printdisabled" in collections or "lendinglibrary" in collections:
                continue
        print `ia`, row.updated
        for eq in query({"type": "/type/edition", "ocaid": ia}):
            print eq["key"]
            e = ol.get(eq["key"])
            if "ocaid" not in e:
                continue  # link already removed
            if "works" in e:
                fix_works.update(e["works"])
            print e["key"], `e.get("title", None)`
            del e["ocaid"]
            mend.append(e)
        # NOTE(review): placement at row level inferred from the collapsed
        # source -- confirm it should advance even for rows with no matches
        last_updated = row.updated
    print "removing links from %d editions" % len(mend)
    if not mend:
        return
    print ol.save_many(mend, "remove link")
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ["<commit/>"], debug=True)
    print >> open(hide_state_file, "w"), last_updated
def hide_books(start):
    """Remove 'ocaid' links from editions whose IA item went dark or noindex.

    Resumes from the timestamp stored in `hide_state_file` (the `start`
    argument is not read here -- presumably superseded; confirm), scrubs
    matching editions via ol.save_many, reindexes the affected works in
    solr, and writes the new resume timestamp back to `hide_state_file`.
    """
    hide_start = open(hide_state_file).readline()[:-1]
    print('hide start:', hide_start)
    mend = []          # editions to save back with their 'ocaid' removed
    fix_works = set()  # keys of works that need a solr reindex
    rows = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start",
        {'start': hide_start})
    last_updated = None
    for row in rows:
        identifier = row.identifier
        if row.collection:
            row_collections = set(c.lower().strip() for c in row.collection.split(';'))
            if ignore_noindex & row_collections:
                continue  # collection is exempt from hiding
        print((repr(identifier), row.updated))
        for found in query({'type': '/type/edition', 'ocaid': identifier}):
            print(found['key'])
            edition = ol.get(found['key'])
            if 'ocaid' not in edition:
                continue  # link already removed
            if 'works' in edition:
                fix_works.update(edition['works'])
            print((edition['key'], repr(edition.get('title', None))))
            del edition['ocaid']
            mend.append(edition)
        last_updated = row.updated
    print('removing links from %d editions' % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for work_key in fix_works:
        requests.extend(update_work(withKey(work_key)))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    print(last_updated, file=open(hide_state_file, 'w'))
def hide_books(start): hide_start = open(hide_state_file).readline()[:-1] print 'hide start:', hide_start mend = [] fix_works = set() db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start", {'start': hide_start}) last_updated = None for row in db_iter: ia = row.identifier if row.collection: collections = set(i.lower().strip() for i in row.collection.split(';')) if ignore_noindex & collections: continue print(repr(ia), row.updated) for eq in query({'type': '/type/edition', 'ocaid': ia}): print eq['key'] e = ol.get(eq['key']) if 'ocaid' not in e: continue if 'works' in e: fix_works.update(e['works']) print(e['key'], repr(e.get('title', None))) del e['ocaid'] mend.append(e) last_updated = row.updated print 'removing links from %d editions' % len(mend) if not mend: return print ol.save_many(mend, 'remove link') requests = [] for wkey in fix_works: requests += update_work(withKey(wkey)) if fix_works: solr_update(requests + ['<commit/>'], debug=True) print >> open(hide_state_file, 'w'), last_updated
# NOTE(review): fragment of a larger merge-works script (Python 2) -- the
# enclosing function/loop begins before this chunk, so the code is kept
# verbatim rather than risk mis-reconstructing its indentation.  It appears
# to repoint editions at a surviving work, sync the work title, save via
# ol.save_many, then reindex works (adding covers) and commit to solr --
# confirm against the full script.
e = ol.get(ekey) e['works'] = [Reference(use_key)] update.append(e) if work_title[use_key] != w['title']: print 'update work title', `work_title[use_key]`, '->', `w['title']` existing_work = ol.get(use_key) existing_work['title'] = w['title'] update.append(existing_work) if do_updates: ol.save_many(update, 'merge works') all_existing.update(existing) for wkey in existing: cur = work_title[wkey] print ' ', wkey, cur == w['title'], `cur` print len(work_to_edition), len(all_existing) assert len(work_to_edition) == len(all_existing) if not do_updates: sys.exit(0) for key in work_keys: w = ol.get(key) add_cover_to_work(w) if 'cover_edition' not in w: print 'no cover found' update_work(withKey(key), debug=True) requests = ['<commit />'] solr_update(requests, debug=True)
def solr_updates(i):
    """Apply one author-merge changeset `i` to the works solr index.

    Expects i['data']['changeset']['data'] to hold exactly a 'master' and a
    'duplicates' entry.  Re-derives subjects for each affected work,
    pushes the work updates to solr (resolving author redirects on the
    fly), queues the merge in `authors_to_update`, and records timing in
    `update_times`.  (Python 2.)
    """
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        # merge changesets must contain exactly 'master' + 'duplicates'
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        # bare except is deliberate: dump the offending changeset, then re-raise
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])
    master = None
    obj_by_key = {}  # every object in the changeset query, keyed for obj_cache
    works = []  # keys of works touched by this changeset
    editions_by_work = defaultdict(list)  # work key -> its editions
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue  # orphan edition: nothing to reindex
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master
    if len(d['result']) == 0:
        print i
    work_updates = []
    for wkey in works:
        #print 'editions_by_work:', editions_by_work
        work = obj_by_key[wkey]
        work['editions'] = editions_by_work[wkey]
        subjects = get_work_subjects(work)
        for subject_type, values in subjects.iteritems():
            subjects_to_update.update((subject_type, v) for v in values)
        try:
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
        except AuthorRedirect:
            # cached objects are stale: refetch and resolve redirects explicitly
            work = withKey(wkey)
            work['editions'] = editions_by_work[wkey]
            ret = update_work(work, debug=True, resolve_redirects=True)
        work_updates += ret
    if work_updates:
        solr_update(work_updates, debug=False, index='works')
    # queue the author merge for the authors index
    authors_to_update.append({
        'redirects': dup_keys,
        'master_key': master_key,
        'master': master})
    print 'authors to update:', len(authors_to_update)
    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print 'update author:', `akey` try: request = update_author(akey) if request: requests += request except AttributeError: print 'akey:', `akey` raise if not args.no_commit: solr_update(requests + ['<commit/>'], index='authors', debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', 
(subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True, index='subjects') authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
# NOTE(review): fragment of a merge-works helper plus the script's __main__
# entry point -- the enclosing function begins before this chunk, so the
# code is kept verbatim rather than risk mis-reconstructing its indentation.
# It mixes py3-style print(..., file=...) with py2 `iteritems` -- confirm
# the target interpreter.  The tail picks the best-matching existing work,
# applies the merge, then (under __main__) reindexes the author's works and
# the author record in solr.
best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0] w['best_match'] = work_by_key[best_match] updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log) for wkey in updated: if wkey in works_updated_this_session: print(wkey, 'already updated!', file=fh_log) print(wkey, 'already updated!') works_updated_this_session.update(updated) #if not do_updates: # return [] return [withKey(key) for key in works_updated_this_session] if __name__ == '__main__': akey = '/authors/' + sys.argv[1] title_redirects = find_title_redirects(akey) works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) to_update = update_works(akey, works, do_updates=True) requests = [] for w in to_update: requests += update_work(w) if to_update: solr_update(requests + ['<commit />'], debug=True) requests = update_author(akey) solr_update(requests + ['<commit/>'], debug=True)
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type'][ 'key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update( (subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print('update author:', repr(akey)) try: request = update_author(akey) if request: requests += request except AttributeError: print('akey:', repr(akey)) raise if not args.no_commit: solr_update(requests + ['<commit/>'], debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', 
(subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True) authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
# NOTE(review): fragment (Python 2) of a merge-works helper plus the
# script's __main__ entry point -- the enclosing function begins before
# this chunk, so the code is kept verbatim rather than risk
# mis-reconstructing its indentation.  It logs already-updated works,
# returns the session's updated works, and (under __main__) reindexes the
# author's works and the author record in solr.
for wkey in updated: if wkey in works_updated_this_session: print >> fh_log, wkey, 'already updated!' print wkey, 'already updated!' works_updated_this_session.update(updated) #if not do_updates: # return [] return [withKey(key) for key in works_updated_this_session] if __name__ == '__main__': akey = '/authors/' + sys.argv[1] title_redirects = find_title_redirects(akey) works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) to_update = update_works(akey, works, do_updates=True) requests = [] for w in to_update: requests += update_work(w) if to_update: solr_update(requests + ['<commit />'], debug=True) requests = update_author(akey) solr_update(requests + ['<commit/>'], debug=True)