def solr_update_subjects():
    # Flush the pending (subject_type, subject_name) pairs to the
    # "subjects" Solr index as one <add> batch, then commit and reset
    # the module-level queue.
    global subjects_to_update
    print subjects_to_update
    subject_add = Element("add")
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)
        # skip subjects whose stored count already matches
        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)
        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)
    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml], debug=False, index='subjects')
        solr_update(['<commit />'], debug=True, index='subjects')
        # NOTE(review): the queue is only cleared after a successful post;
        # if nothing needed updating, entries remain queued — confirm
        # whether that re-check on the next pass is intended.
        subjects_to_update = set()
def hide_books(start):
    # Remove the 'ocaid' link from editions whose archive.org item has
    # been marked noindex or dark since `start`, then reindex the
    # affected works in Solr.
    mend = []          # editions to save with 'ocaid' removed
    fix_works = set()  # work keys needing a Solr refresh
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by updated", {'start': start})
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            # print-disabled items stay linked even when dark
            if 'printdisabled' in collections:
                continue
        print `ia`, row.updated
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print e['key'], `e.get('title', None)`
            del e['ocaid']
            mend.append(e)
    print 'removing links from %d editions' % len(mend)
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
def solr_updates(i): global subjects_to_update, authors_to_update t0 = time() d = i['data'] changeset = d['changeset'] print 'author:', d['author'] try: assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data'] except: print d['changeset'] raise master_key = changeset['data']['master'] dup_keys = changeset['data']['duplicates'] assert dup_keys print d['changeset'] print 'timestamp:', i['timestamp'] print 'dups:', dup_keys print 'records to update:', len(d['result']) master = None obj_by_key = {} works = [] editions_by_work = defaultdict(list) for obj in d['query']: obj_type = obj['type']['key'] k = obj['key'] if obj_type == '/type/work': works.append(obj['key']) elif obj_type == '/type/edition': if 'works' not in obj: continue for w in obj['works']: editions_by_work[w['key']].append(obj) obj_by_key[k] = obj master = obj_by_key.get(master_key) #print 'master:', master if len(d['result']) == 0: print i work_updates = [] for wkey in works: #print 'editions_by_work:', editions_by_work work = obj_by_key[wkey] work['editions'] = editions_by_work[wkey] subjects = get_work_subjects(work) for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) ret = update_work(work, obj_cache=obj_by_key, debug=True) work_updates += ret solr_update(work_updates, debug=False, index='works') authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master}) print 'authors to update:', len(authors_to_update) t1 = time() - t0 update_times.append(t1) print 'update takes: %d seconds' % t1 print
def solr_update_authors():
    """Push every queued author merge to the authors Solr core.

    For each pending merge: delete the duplicate author documents,
    re-post the surviving master record, then commit once at the end
    and clear the module-level queue.
    """
    global authors_to_update
    for pending in authors_to_update:
        id_tags = ''.join('<id>%s</id>' % re_author_key.match(akey).group(1)
                          for akey in pending['redirects'])
        requests = ['<delete>' + id_tags + '</delete>']
        requests.extend(update_author(pending['master_key'],
                                      a=pending['master'],
                                      handle_redirects=False))
        solr_update(requests, index='authors', debug=False)
    solr_update(['<commit/>'], index='authors', debug=True)
    authors_to_update = []
def test_bad_apple_in_solr_request(self, monkeypatch, monkeytime):
    """A per-document error in an otherwise OK response must not retry."""
    post_stub = MagicMock(return_value=self.sample_individual_error())
    monkeypatch.setattr(httpx, "post", post_stub)
    solr_update(
        [CommitRequest()],
        solr_base_url="http://localhost:8983/solr/foobar",
    )
    # exactly one POST: individual document errors are not retried
    assert post_stub.call_count == 1
def test_solr_offline(self, monkeypatch, monkeytime):
    """Connection failures should be retried (more than one POST)."""
    failing_post = MagicMock(side_effect=ConnectError('', request=None))
    monkeypatch.setattr(httpx, "post", failing_post)
    solr_update(
        [CommitRequest()],
        solr_base_url="http://localhost:8983/solr/foobar",
    )
    assert failing_post.call_count > 1
def test_non_json_solr_503(self, monkeypatch, monkeytime):
    """A non-JSON 503 response should be retried (more than one POST)."""
    post_stub = MagicMock(return_value=self.sample_response_503())
    monkeypatch.setattr(httpx, "post", post_stub)
    solr_update(
        [CommitRequest()],
        solr_base_url="http://localhost:8983/solr/foobar",
    )
    assert post_stub.call_count > 1
def test_other_non_ok_status(self, monkeypatch, monkeytime):
    """Any other non-OK status (here a 500) should also be retried."""
    error_response = Response(500, request=MagicMock(), content="{}")
    post_stub = MagicMock(return_value=error_response)
    monkeypatch.setattr(httpx, "post", post_stub)
    solr_update(
        [CommitRequest()],
        solr_base_url="http://localhost:8983/solr/foobar",
    )
    assert post_stub.call_count > 1
def solr_update_authors(authors_to_update):
    # For each pending merge in `authors_to_update`, delete the duplicate
    # author docs from the authors core and re-post the master record,
    # committing once at the end.
    for a in authors_to_update:
        try:
            author_updates = ['<delete>' + ''.join('<id>%s</id>' % re_author_key.match(akey).group(1) for akey in a['redirects']) + '</delete>']
        except:
            # NOTE(review): bare except — presumably an AttributeError when
            # re_author_key fails to match a key; dumps the offending
            # redirect list for debugging, then re-raises.
            print 'redirects'
            print a['redirects']
            raise
        author_updates += update_author(a['master_key'], a=a['master'], handle_redirects=False)
        solr_update(author_updates, index='authors', debug=False)
    solr_update(['<commit/>'], index='authors', debug=True)
def run_update():
    # Drain the pending work/author queues: rebuild each work's Solr doc
    # (repairing author redirects on the way), batch-post to Solr, update
    # authors, and persist the stream offset to the state file.
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            # malformed key: extra slash after the '/works/' prefix
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            # retry: each failed pass repairs one layer of author redirects
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    # update_work raised AuthorRedirect, so a redirect must exist
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot'])
                    ol.save(w['key'], w, 'avoid author redirect')
            # flush in batches of 100 to bound memory / request size
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
            # if num % 1000 == 0:
            #     solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
        last_update = time()
        print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    # persist the stream offset so a restart resumes from here
    print >> open(state_file, 'w'), offset
def run_update():
    # Flush queued work and author updates to Solr and record the stream
    # offset.  Works that raise AuthorRedirect get their author references
    # rewritten (and saved back) before being retried, up to 5 attempts.
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            # reject keys with a slash after the '/works/' prefix
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    # lazy bot login before the first save of the run
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            # batch POSTs to keep each request small
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
            # if num % 1000 == 0:
            #     solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
        last_update = time()
        print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    authors_to_update = set()
    works_to_update = set()
    # checkpoint the offset for restart
    print >> open(state_file, "w"), offset
def hide_books(start):
    """Unlink dark/noindex archive.org items from their editions.

    Reads the last-processed timestamp from ``hide_state_file``, removes
    the 'ocaid' field from every matching edition, reindexes the
    affected works in Solr, and writes the new high-water mark back to
    the state file.  The ``start`` argument is unused; the state file
    wins — kept for caller compatibility.
    """
    # FIX: use context managers so the state-file handles are closed
    # deterministically instead of leaking until garbage collection.
    with open(hide_state_file) as state:
        hide_start = state.readline()[:-1]
    print('hide start:', hide_start)
    mend = []          # editions to save with 'ocaid' removed
    fix_works = set()  # work keys needing a Solr refresh
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start",
        {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            # collections in ignore_noindex keep their links
            if ignore_noindex & collections:
                continue
        print((repr(ia), row.updated))
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print((e['key'], repr(e.get('title', None))))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print('removing links from %d editions' % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    with open(hide_state_file, 'w') as state:
        print(last_updated, file=state)
def hide_books(start):
    # Unlink dark/noindex archive.org texts from their Open Library
    # editions and reindex the affected works.  Progress is tracked via
    # hide_state_file; the `start` argument is unused here.
    hide_start = open(hide_state_file).readline()[:-1]
    print "hide start:", hide_start
    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by scandate_dt",
        {"start": hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(";"))
            # print-disabled / lending-library items stay linked
            if "printdisabled" in collections or "lendinglibrary" in collections:
                continue
        print `ia`, row.updated
        for eq in query({"type": "/type/edition", "ocaid": ia}):
            print eq["key"]
            e = ol.get(eq["key"])
            if "ocaid" not in e:
                continue
            if "works" in e:
                fix_works.update(e["works"])
            print e["key"], `e.get("title", None)`
            del e["ocaid"]
            mend.append(e)
        last_updated = row.updated
    print "removing links from %d editions" % len(mend)
    if not mend:
        return
    print ol.save_many(mend, "remove link")
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ["<commit/>"], debug=True)
    # persist the high-water mark for the next run
    print >> open(hide_state_file, "w"), last_updated
def hide_books(start):
    # Remove 'ocaid' links from editions of newly darkened/noindexed
    # archive.org items, reindex the affected works, and checkpoint the
    # last-seen timestamp in hide_state_file (`start` is unused).
    hide_start = open(hide_state_file).readline()[:-1]
    print 'hide start:', hide_start
    mend = []
    fix_works = set()
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start", {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            # items in an ignore_noindex collection keep their links
            if ignore_noindex & collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print(e['key'], repr(e.get('title', None)))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print 'removing links from %d editions' % len(mend)
    if not mend:
        return
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    # checkpoint for the next run
    print >> open(hide_state_file, 'w'), last_updated
print url sys.exit(0) print 'url:', url raise try: ret = simplejson.loads(data) except: open('bad_data.json', 'w').write(data) raise offset = ret['offset'] data_list = ret['data'] if len(data_list) == 0: if authors_to_update: print 'commit' solr_update(['<commit/>'], debug=True) solr_update_authors(authors_to_update) authors_to_update = [] solr_update_subjects() print 'waiting' sleep(10) continue else: print for i in data_list: action = i.pop('action') if action != 'save_many': continue if i['data']['comment'] != 'merge authors': continue
def _solr_commit(self):
    """Send an explicit commit to Solr, logging start and end."""
    logger.info("BEGIN commit")
    commit_request = '<commit/>'
    update_work.solr_update([commit_request])
    logger.info("END commit")
best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0] w['best_match'] = work_by_key[best_match] updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log) for wkey in updated: if wkey in works_updated_this_session: print >> fh_log, wkey, 'already updated!' print wkey, 'already updated!' works_updated_this_session.update(updated) #if not do_updates: # return [] return [withKey(key) for key in works_updated_this_session] if __name__ == '__main__': akey = '/authors/' + sys.argv[1] title_redirects = find_title_redirects(akey) works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) to_update = update_works(akey, works, do_updates=True) requests = [] for w in to_update: requests += update_work(w) if to_update: solr_update(requests + ['<commit />'], debug=True) requests = update_author(akey) solr_update(requests + ['<commit/>'], index='authors', debug=True)
print url sys.exit(0) print 'url:', url raise try: ret = simplejson.loads(data) except: open('bad_data.json', 'w').write(data) raise offset = ret['offset'] data_list = ret['data'] if len(data_list) == 0: if authors_to_update: print 'commit' solr_update(['<commit/>'], debug=True, index='works') solr_update_authors() solr_update_subjects() print 'waiting' sleep(10) continue else: print for i in data_list: action = i.pop('action') if action != 'save_many': continue if i['data']['comment'] != 'merge authors': continue if i['timestamp'] == '2010-08-05T14:37:25.139418':
def _solr_commit(self):
    """Commit pending changes on the works Solr index, with log markers."""
    logger.info("BEGIN commit")
    commit_request = '<commit/>'
    update_work.solr_update([commit_request], index="works")
    logger.info("END commit")
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print 'update author:', `akey` try: request = update_author(akey) if request: requests += request except AttributeError: print 'akey:', `akey` raise if not args.no_commit: solr_update(requests + ['<commit/>'], index='authors', debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', 
(subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True, index='subjects') authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0] w['best_match'] = work_by_key[best_match] updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log) for wkey in updated: if wkey in works_updated_this_session: print(wkey, 'already updated!', file=fh_log) print(wkey, 'already updated!') works_updated_this_session.update(updated) #if not do_updates: # return [] return [withKey(key) for key in works_updated_this_session] if __name__ == '__main__': akey = '/authors/' + sys.argv[1] title_redirects = find_title_redirects(akey) works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) to_update = update_works(akey, works, do_updates=True) requests = [] for w in to_update: requests += update_work(w) if to_update: solr_update(requests + ['<commit />'], debug=True) requests = update_author(akey) solr_update(requests + ['<commit/>'], debug=True)
def _solr_commit(self):
    """Flush pending documents to Solr via an explicit commit request."""
    logger.info("BEGIN commit")
    request = CommitRequest()
    update_work.solr_update([request])
    logger.info("END commit")
for wkey in updated: if wkey in works_updated_this_session: print >> fh_log, wkey, 'already updated!' print wkey, 'already updated!' works_updated_this_session.update(updated) #if not do_updates: # return [] return [withKey(key) for key in works_updated_this_session] if __name__ == '__main__': akey = '/authors/' + sys.argv[1] title_redirects = find_title_redirects(akey) works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) to_update = update_works(akey, works, do_updates=True) requests = [] for w in to_update: requests += update_work(w) if to_update: solr_update(requests + ['<commit />'], debug=True) requests = update_author(akey) solr_update(requests + ['<commit/>'], debug=True)
e = ol.get(ekey) e['works'] = [Reference(use_key)] update.append(e) if work_title[use_key] != w['title']: print 'update work title', `work_title[use_key]`, '->', `w['title']` existing_work = ol.get(use_key) existing_work['title'] = w['title'] update.append(existing_work) if do_updates: ol.save_many(update, 'merge works') all_existing.update(existing) for wkey in existing: cur = work_title[wkey] print ' ', wkey, cur == w['title'], `cur` print len(work_to_edition), len(all_existing) assert len(work_to_edition) == len(all_existing) if not do_updates: sys.exit(0) for key in work_keys: w = ol.get(key) add_cover_to_work(w) if 'cover_edition' not in w: print 'no cover found' update_work(withKey(key), debug=True) requests = ['<commit />'] solr_update(requests, debug=True)
print url sys.exit(0) print 'url:', url raise try: ret = simplejson.loads(data) except: open('bad_data.json', 'w').write(data) raise offset = ret['offset'] data_list = ret['data'] if len(data_list) == 0: if authors_to_update: print 'commit' solr_update(['<commit/>'], debug=True, index='works') solr_update_authors(authors_to_update) authors_to_update = [] solr_update_subjects() print 'waiting' sleep(10) continue else: print for i in data_list: action = i.pop('action') if action != 'save_many': continue if i['data']['comment'] != 'merge authors': continue
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type'][ 'key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update( (subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print('update author:', repr(akey)) try: request = update_author(akey) if request: requests += request except AttributeError: print('akey:', repr(akey)) raise if not args.no_commit: solr_update(requests + ['<commit/>'], debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', 
(subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True) authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
def solr_updates(i):
    # Handle one 'merge authors' changeset: reindex each affected work
    # (collecting its subjects) and queue the author merge for the next
    # solr_update_authors() pass.
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        # a merge-authors changeset names exactly one master plus duplicates
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])
    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    # partition the changeset's query results into works and editions
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master
    if len(d['result']) == 0:
        print i
    work_updates = []
    for wkey in works:
        #print 'editions_by_work:', editions_by_work
        work = obj_by_key[wkey]
        work['editions'] = editions_by_work[wkey]
        subjects = get_work_subjects(work)
        for subject_type, values in subjects.iteritems():
            subjects_to_update.update((subject_type, v) for v in values)
        try:
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
        except AuthorRedirect:
            # cached copy points at a redirected author: refetch the work
            # and let update_work chase the redirects itself
            work = withKey(wkey)
            work['editions'] = editions_by_work[wkey]
            ret = update_work(work, debug=True, resolve_redirects=True)
        work_updates += ret
    if work_updates:
        solr_update(work_updates, debug=False, index='works')
    authors_to_update.append({
        'redirects': dup_keys,
        'master_key': master_key,
        'master': master})
    print 'authors to update:', len(authors_to_update)
    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print