Ejemplo n.º 1
0
def solr_update_subjects():
    global subjects_to_update
    print subjects_to_update

    subject_add = Element("add")
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml], debug=False, index='subjects')
        solr_update(['<commit />'], debug=True, index='subjects')

    subjects_to_update = set()
Ejemplo n.º 2
0
def hide_books(start):
    mend = []
    fix_works = set()
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by updated", {'start': start})
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if 'printdisabled' in collections:
                continue
        print `ia`, row.updated
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print e['key'], `e.get('title', None)`
            del e['ocaid']
            mend.append(e)
    print 'removing links from %d editions' % len(mend)
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
Ejemplo n.º 3
0
def solr_update_subjects():
    global subjects_to_update
    print subjects_to_update

    subject_add = Element("add")
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml], debug=False, index='subjects')
        solr_update(['<commit />'], debug=True, index='subjects')

    subjects_to_update = set()
Ejemplo n.º 4
0
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print d['changeset']
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])
     
    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
            #print 'editions_by_work:', editions_by_work
            work = obj_by_key[wkey]
            work['editions'] = editions_by_work[wkey]
            subjects = get_work_subjects(work)
            for subject_type, values in subjects.iteritems():
                subjects_to_update.update((subject_type, v) for v in values)
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
            work_updates += ret
    solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
Ejemplo n.º 5
0
def solr_update_authors():
    """Flush the global ``authors_to_update`` queue to the 'authors' index.

    For every queued merge a ``<delete>`` request is built for the
    redirected author ids, followed by the update requests for the master
    author. All requests are posted in one call and then committed, and the
    global queue is reset to an empty list.

    Requests are accumulated across the whole queue; nothing is sent when
    the queue produces no requests.
    """
    global authors_to_update
    author_updates = []
    for a in authors_to_update:
        redirect_ids = ''.join(
            '<id>%s</id>' % re_author_key.match(akey).group(1)
            for akey in a['redirects']
        )
        author_updates.append('<delete>' + redirect_ids + '</delete>')
        author_updates += update_author(a['master_key'], a=a['master'], handle_redirects=False)
    if author_updates:
        solr_update(author_updates, index='authors', debug=False)
        solr_update(['<commit/>'], index='authors', debug=True)
    authors_to_update = []
Ejemplo n.º 6
0
    def test_bad_apple_in_solr_request(self, monkeypatch, monkeytime):
        """solr_update posts exactly once when the response is the sample individual error."""
        fake_post = MagicMock()
        fake_post.return_value = self.sample_individual_error()
        monkeypatch.setattr(httpx, "post", fake_post)

        commit_only = [CommitRequest()]
        solr_update(commit_only, solr_base_url="http://localhost:8983/solr/foobar")

        assert fake_post.call_count == 1
Ejemplo n.º 7
0
    def test_solr_offline(self, monkeypatch, monkeytime):
        """solr_update posts more than once when every post raises ConnectError."""
        fake_post = MagicMock()
        fake_post.side_effect = ConnectError('', request=None)
        monkeypatch.setattr(httpx, "post", fake_post)

        commit_only = [CommitRequest()]
        solr_update(commit_only, solr_base_url="http://localhost:8983/solr/foobar")

        assert fake_post.call_count > 1
Ejemplo n.º 8
0
    def test_non_json_solr_503(self, monkeypatch, monkeytime):
        """solr_update posts more than once when the response is the sample 503."""
        fake_post = MagicMock()
        fake_post.return_value = self.sample_response_503()
        monkeypatch.setattr(httpx, "post", fake_post)

        commit_only = [CommitRequest()]
        solr_update(commit_only, solr_base_url="http://localhost:8983/solr/foobar")

        assert fake_post.call_count > 1
Ejemplo n.º 9
0
    def test_other_non_ok_status(self, monkeypatch, monkeytime):
        """solr_update posts more than once when the response status is 500."""
        server_error = Response(500, request=MagicMock(), content="{}")
        fake_post = MagicMock()
        fake_post.return_value = server_error
        monkeypatch.setattr(httpx, "post", fake_post)

        commit_only = [CommitRequest()]
        solr_update(commit_only, solr_base_url="http://localhost:8983/solr/foobar")

        assert fake_post.call_count > 1
Ejemplo n.º 10
0
def solr_update_authors(authors_to_update):
    for a in authors_to_update:
        try:
            author_updates = ['<delete>' + ''.join('<id>%s</id>' % re_author_key.match(akey).group(1) for akey in a['redirects']) + '</delete>']
        except:
            print 'redirects'
            print a['redirects']
            raise
        author_updates += update_author(a['master_key'], a=a['master'], handle_redirects=False)
    solr_update(author_updates, index='authors', debug=False)
    solr_update(['<commit/>'], index='authors', debug=True)
Ejemplo n.º 11
0
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot']) 
                    ol.save(w['key'], w, 'avoid author redirect')
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
    last_update = time()
    print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 12
0
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
        #            if num % 1000 == 0:
        #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
    last_update = time()
    print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, "w"), offset
Ejemplo n.º 13
0
def hide_books(start):
    """Unlink editions from archive.org items that have been hidden.

    The cursor is read from the first line of ``hide_state_file``; the
    ``start`` argument is accepted but not used. Text items updated after
    the cursor that are flagged ``noindex`` or ``curatestate='dark'`` are
    processed, except those belonging to a collection in
    ``ignore_noindex``. The ``ocaid`` field is removed from every matching
    edition, the editions are saved, affected works are reindexed, and the
    ``updated`` value of the last processed row is written back to
    ``hide_state_file``.

    Returns early, leaving the state file untouched, when no edition needed
    changing.
    """
    # Close the state file deterministically, and strip only a real newline
    # so a file without a trailing newline does not lose its last character.
    with open(hide_state_file) as state:
        hide_start = state.readline().rstrip('\n')
    print('hide start:', hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start",
        {'start': hide_start})
    # NOTE(review): the query has no ORDER BY, so the last row seen is not
    # necessarily the most recently updated one - confirm this is intended.
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip()
                              for i in row.collection.split(';'))
            if ignore_noindex & collections:
                continue
        print((repr(ia), row.updated))
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print((e['key'], repr(e.get('title', None))))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print('removing links from %d editions' % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    with open(hide_state_file, 'w') as state:
        print(last_updated, file=state)
Ejemplo n.º 14
0
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print "hide start:", hide_start

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by scandate_dt",
        {"start": hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(";"))
            if "printdisabled" in collections or "lendinglibrary" in collections:
                continue
        print ` ia `, row.updated
        for eq in query({"type": "/type/edition", "ocaid": ia}):
            print eq["key"]
            e = ol.get(eq["key"])
            if "ocaid" not in e:
                continue
            if "works" in e:
                fix_works.update(e["works"])
            print e["key"], ` e.get("title", None) `
            del e["ocaid"]
            mend.append(e)
        last_updated = row.updated
    print "removing links from %d editions" % len(mend)
    if not mend:
        return
    print ol.save_many(mend, "remove link")
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ["<commit/>"], debug=True)
    print >> open(hide_state_file, "w"), last_updated
Ejemplo n.º 15
0
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print 'hide start:', hide_start

    mend = []
    fix_works = set()
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start", {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if ignore_noindex & collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print(e['key'], repr(e.get('title', None)))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print 'removing links from %d editions' % len(mend)
    if not mend:
        return
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    print >> open(hide_state_file, 'w'), last_updated
Ejemplo n.º 16
0
            print url
            sys.exit(0)
        print 'url:', url
        raise
    try:
        ret = simplejson.loads(data)
    except:
        open('bad_data.json', 'w').write(data)
        raise

    offset = ret['offset']
    data_list = ret['data']
    if len(data_list) == 0:
        if authors_to_update:
            print 'commit'
            solr_update(['<commit/>'], debug=True)
            solr_update_authors(authors_to_update)
            authors_to_update = []
            solr_update_subjects()

        print 'waiting'
        sleep(10)
        continue
    else:
        print
    for i in data_list:
        action = i.pop('action')
        if action != 'save_many':
            continue
        if i['data']['comment'] != 'merge authors':
            continue
Ejemplo n.º 17
0
 def _solr_commit(self):
     """Send a ``<commit/>`` request via ``update_work.solr_update``, logging before and after."""
     logger.info("BEGIN commit")
     update_work.solr_update(['<commit/>'])
     logger.info("END commit")
Ejemplo n.º 18
0
        best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]

# Script entry point: rebuild the works of the author whose id is given as
# the first command-line argument, then reindex those works and the author.
if __name__ == '__main__':
    akey = '/authors/' + sys.argv[1]

    title_redirects = find_title_redirects(akey)
    works = find_works(
        akey,
        get_books(akey, books_query(akey)),
        existing=title_redirects,
    )
    to_update = update_works(akey, works, do_updates=True)

    # Collect the Solr requests for every work that was touched.
    requests = []
    for w in to_update:
        requests.extend(update_work(w))

    if to_update:
        solr_update(requests + ['<commit />'], debug=True)

    # The author record itself goes to the separate 'authors' index.
    requests = update_author(akey)
    solr_update(requests + ['<commit/>'], index='authors', debug=True)
Ejemplo n.º 19
0
            print url
            sys.exit(0)
        print 'url:', url
        raise
    try:
        ret = simplejson.loads(data)
    except:
        open('bad_data.json', 'w').write(data)
        raise

    offset = ret['offset']
    data_list = ret['data']
    if len(data_list) == 0:
        if authors_to_update:
            print 'commit'
            solr_update(['<commit/>'], debug=True)
            solr_update_authors(authors_to_update)
            authors_to_update = []
            solr_update_subjects()

        print 'waiting'
        sleep(10)
        continue
    else:
        print
    for i in data_list:
        action = i.pop('action')
        if action != 'save_many':
            continue
        if i['data']['comment'] != 'merge authors':
            continue
Ejemplo n.º 20
0
            print url
            sys.exit(0)
        print 'url:', url
        raise
    try:
        ret = simplejson.loads(data)
    except:
        open('bad_data.json', 'w').write(data)
        raise

    offset = ret['offset']
    data_list = ret['data']
    if len(data_list) == 0:
        if authors_to_update:
            print 'commit'
            solr_update(['<commit/>'], debug=True, index='works')
            solr_update_authors()
            solr_update_subjects()
        
        print 'waiting'
        sleep(10)
        continue
    else:
        print
    for i in data_list:
        action = i.pop('action')
        if action != 'save_many':
            continue
        if i['data']['comment'] != 'merge authors':
            continue
        if i['timestamp'] == '2010-08-05T14:37:25.139418':
Ejemplo n.º 21
0
 def _solr_commit(self):
     """Send a ``<commit/>`` request for the "works" index via ``update_work.solr_update``, logging before and after."""
     logger.info("BEGIN commit")
     update_work.solr_update(['<commit/>'], index="works")
     logger.info("END commit")
Ejemplo n.º 22
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', `akey`
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', `akey`
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], index='authors', debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True, index='subjects')

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 23
0
        best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]

# Script entry point: rebuild the works of the author whose id is given as
# the first command-line argument, then reindex those works and the author.
if __name__ == '__main__':
    akey = '/authors/' + sys.argv[1]

    title_redirects = find_title_redirects(akey)
    works = find_works(
        akey,
        get_books(akey, books_query(akey)),
        existing=title_redirects,
    )
    to_update = update_works(akey, works, do_updates=True)

    # Collect the Solr requests for every work that was touched.
    requests = []
    for w in to_update:
        requests.extend(update_work(w))

    if to_update:
        solr_update(requests + ['<commit />'], debug=True)

    # Finally refresh the author record itself.
    requests = update_author(akey)
    solr_update(requests + ['<commit/>'], debug=True)
Ejemplo n.º 24
0
 def _solr_commit(self):
     """Send a single ``CommitRequest`` via ``update_work.solr_update``, logging before and after."""
     logger.info("BEGIN commit")
     update_work.solr_update([CommitRequest()])
     logger.info("END commit")
Ejemplo n.º 25
0
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]


# Script entry point: rebuild the works of the author whose id is given as
# the first command-line argument, then reindex those works and the author.
if __name__ == '__main__':
    akey = '/authors/' + sys.argv[1]

    title_redirects = find_title_redirects(akey)
    works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects)
    to_update = update_works(akey, works, do_updates=True)

    # Collect the Solr requests for every work that was touched.
    requests = []
    for w in to_update:
        requests.extend(update_work(w))

    if to_update:
        solr_update(requests + ['<commit />'], debug=True)

    # Finally refresh the author record itself.
    requests = update_author(akey)
    solr_update(requests + ['<commit/>'], debug=True)
Ejemplo n.º 26
0
                    e = ol.get(ekey)
                    e['works'] = [Reference(use_key)]
                    update.append(e)
            if work_title[use_key] != w['title']:
                print 'update work title', `work_title[use_key]`, '->', `w['title']`
                existing_work = ol.get(use_key)
                existing_work['title'] = w['title']
                update.append(existing_work)
            if do_updates:
                ol.save_many(update, 'merge works')
        all_existing.update(existing)
        for wkey in existing:
            cur = work_title[wkey]
            print '  ', wkey, cur == w['title'], `cur`

    print len(work_to_edition), len(all_existing)
    assert len(work_to_edition) == len(all_existing)

    if not do_updates:
        sys.exit(0)

    for key in work_keys:
        w = ol.get(key)
        add_cover_to_work(w)
        if 'cover_edition' not in w:
            print 'no cover found'
        update_work(withKey(key), debug=True)

    requests = ['<commit />']
    solr_update(requests, debug=True)
Ejemplo n.º 27
0
            print url
            sys.exit(0)
        print 'url:', url
        raise
    try:
        ret = simplejson.loads(data)
    except:
        open('bad_data.json', 'w').write(data)
        raise

    offset = ret['offset']
    data_list = ret['data']
    if len(data_list) == 0:
        if authors_to_update:
            print 'commit'
            solr_update(['<commit/>'], debug=True, index='works')
            solr_update_authors(authors_to_update)
            authors_to_update = []
            solr_update_subjects()

        print 'waiting'
        sleep(10)
        continue
    else:
        print
    for i in data_list:
        action = i.pop('action')
        if action != 'save_many':
            continue
        if i['data']['comment'] != 'merge authors':
            continue
Ejemplo n.º 28
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update),
                                                   len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type'][
                    'key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update(
                        (subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print('update author:', repr(akey))
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print('akey:', repr(akey))
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True)

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 29
0
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])

    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
            #print 'editions_by_work:', editions_by_work
            work = obj_by_key[wkey]
            work['editions'] = editions_by_work[wkey]
            subjects = get_work_subjects(work)
            for subject_type, values in subjects.iteritems():
                subjects_to_update.update((subject_type, v) for v in values)
            try:
                ret = update_work(work, obj_cache=obj_by_key, debug=True)
            except AuthorRedirect:
                work = withKey(wkey)
                work['editions'] = editions_by_work[wkey]
                ret = update_work(work, debug=True, resolve_redirects=True)
            work_updates += ret
    if work_updates:
        solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print