def solr_updates(i): global subjects_to_update, authors_to_update t0 = time() d = i['data'] changeset = d['changeset'] print 'author:', d['author'] try: assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data'] except: print d['changeset'] raise master_key = changeset['data']['master'] dup_keys = changeset['data']['duplicates'] assert dup_keys print d['changeset'] print 'timestamp:', i['timestamp'] print 'dups:', dup_keys print 'records to update:', len(d['result']) master = None obj_by_key = {} works = [] editions_by_work = defaultdict(list) for obj in d['query']: obj_type = obj['type']['key'] k = obj['key'] if obj_type == '/type/work': works.append(obj['key']) elif obj_type == '/type/edition': if 'works' not in obj: continue for w in obj['works']: editions_by_work[w['key']].append(obj) obj_by_key[k] = obj master = obj_by_key.get(master_key) #print 'master:', master if len(d['result']) == 0: print i work_updates = [] for wkey in works: #print 'editions_by_work:', editions_by_work work = obj_by_key[wkey] work['editions'] = editions_by_work[wkey] subjects = get_work_subjects(work) for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) ret = update_work(work, obj_cache=obj_by_key, debug=True) work_updates += ret solr_update(work_updates, debug=False, index='works') authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master}) print 'authors to update:', len(authors_to_update) t1 = time() - t0 update_times.append(t1) print 'update takes: %d seconds' % t1 print
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type'][ 'key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update( (subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print('update author:', repr(akey)) try: request = update_author(akey) if request: requests += request except AttributeError: print('akey:', repr(akey)) raise if not args.no_commit: solr_update(requests + ['<commit/>'], debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', (subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True) authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print 'update author:', `akey` try: request = update_author(akey) if request: requests += request except AttributeError: print 'akey:', `akey` raise if not args.no_commit: solr_update(requests + ['<commit/>'], index='authors', debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', (subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True, index='subjects') authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
def solr_updates(i): global subjects_to_update, authors_to_update t0 = time() d = i['data'] changeset = d['changeset'] print 'author:', d['author'] try: assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data'] except: print d['changeset'] raise master_key = changeset['data']['master'] dup_keys = changeset['data']['duplicates'] assert dup_keys print 'timestamp:', i['timestamp'] print 'dups:', dup_keys print 'records to update:', len(d['result']) master = None obj_by_key = {} works = [] editions_by_work = defaultdict(list) for obj in d['query']: obj_type = obj['type']['key'] k = obj['key'] if obj_type == '/type/work': works.append(obj['key']) elif obj_type == '/type/edition': if 'works' not in obj: continue for w in obj['works']: editions_by_work[w['key']].append(obj) obj_by_key[k] = obj master = obj_by_key.get(master_key) #print 'master:', master if len(d['result']) == 0: print i work_updates = [] for wkey in works: #print 'editions_by_work:', editions_by_work work = obj_by_key[wkey] work['editions'] = editions_by_work[wkey] subjects = get_work_subjects(work) for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) try: ret = update_work(work, obj_cache=obj_by_key, debug=True) except AuthorRedirect: work = withKey(wkey) work['editions'] = editions_by_work[wkey] ret = update_work(work, debug=True, resolve_redirects=True) work_updates += ret if work_updates: solr_update(work_updates, debug=False, index='works') authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master}) print 'authors to update:', len(authors_to_update) t1 = time() - t0 update_times.append(t1) print 'update takes: %d seconds' % t1 print