Ejemplo n.º 1
0
    def test_update_author(self, monkeypatch):
        """An author with zero Solr hits should yield a single ``<add>`` request."""
        somebody = make_author(key='/authors/OL25A', name='Somebody')
        update_work.data_provider = FakeDataProvider([somebody])

        # Canned Solr reply: every facet is empty and no document matches.
        facet_fields = {
            "place_facet": [],
            "person_facet": [],
            "subject_facet": [],
            "time_facet": [],
        }
        canned_response = MockResponse({
            "facet_counts": {"facet_fields": facet_fields},
            "response": {"numFound": 0},
        })

        def fake_get(url, **kwargs):
            # Whatever is requested, hand back the same canned reply.
            return canned_response

        monkeypatch.setattr(update_work.requests, 'get', fake_get)

        result = update_work.update_author('/authors/OL25A')

        assert len(result) == 1
        assert isinstance(result, list)
        head = result[0]
        assert isinstance(head, update_work.UpdateRequest)
        assert head.toxml().startswith('<add>')
        assert '<field name="key">/authors/OL25A</field>' in head.toxml()
Ejemplo n.º 2
0
 def test_update_author(self):
     """An author with zero Solr hits should yield a single ``<add>`` request."""
     somebody = make_author(key='/authors/OL25A', name='Somebody')
     update_work.data_provider = FakeDataProvider([somebody])

     # Canned Solr reply: all four facets empty, no matching documents.
     solr_payload = {
         "facet_counts": {
             "facet_fields": {
                 "place_facet": [],
                 "person_facet": [],
                 "subject_facet": [],
                 "time_facet": [],
             },
         },
         "response": {"numFound": 0},
     }
     canned_response = MockResponse(solr_payload)

     # urlopen is replaced only for the duration of the update_author call.
     urlopen_patch = mock.patch('openlibrary.solr.update_work.urlopen',
                                return_value=canned_response)
     with urlopen_patch:
         result = update_work.update_author('/authors/OL25A')

     assert len(result) == 1
     assert isinstance(result, list)
     head = result[0]
     assert isinstance(head, update_work.UpdateRequest)
     assert head.toxml().startswith('<add>')
     assert '<field name="key">/authors/OL25A</field>' in head.toxml()
Ejemplo n.º 3
0
def solr_update_authors():
    """Post every queued author merge to the ``authors`` Solr index.

    Each entry of the module-level ``authors_to_update`` contributes one
    ``<delete>`` message listing the ids extracted from its ``'redirects'``
    keys, followed by whatever ``update_author`` returns for its
    ``'master_key'`` / ``'master'`` record.  The accumulated requests are
    posted in one call and then committed, after which the queue is reset to
    an empty list.  Nothing is posted or committed when the queue is empty.
    """
    global authors_to_update
    # Accumulate across ALL queued authors; assigning a fresh list inside the
    # loop would discard every author but the last one.
    author_updates = []
    for a in authors_to_update:
        redirect_ids = ''.join(
            '<id>%s</id>' % re_author_key.match(akey).group(1)
            for akey in a['redirects'])
        author_updates.append('<delete>' + redirect_ids + '</delete>')
        author_updates += update_author(a['master_key'], a=a['master'], handle_redirects=False)
    if author_updates:
        solr_update(author_updates, index='authors', debug=False)
        solr_update(['<commit/>'], index='authors', debug=True)
    authors_to_update = []
Ejemplo n.º 4
0
 def test_delete_author(self):
     """A ``/type/delete`` author should produce a Solr delete-by-query request."""
     deleted_author = make_author(key='/authors/OL23A',
                                  type={'key': '/type/delete'})
     update_work.data_provider = FakeDataProvider([deleted_author])

     result = update_work.update_author('/authors/OL23A')

     assert isinstance(result, list)
     head = result[0]
     assert isinstance(head, update_work.DeleteRequest)
     expected_xml = '<delete><query>key:/authors/OL23A</query></delete>'
     assert head.toxml() == expected_xml
Ejemplo n.º 5
0
 def test_redirect_author(self):
     """A ``/type/redirect`` author should produce a Solr delete-by-query request."""
     redirected = make_author(key='/authors/OL24A',
                              type={'key': '/type/redirect'})
     update_work.data_provider = FakeDataProvider([redirected])

     result = update_work.update_author('/authors/OL24A')

     assert isinstance(result, list)
     head = result[0]
     assert isinstance(head, update_work.DeleteRequest)
     expected_xml = '<delete><query>key:/authors/OL24A</query></delete>'
     assert head.toxml() == expected_xml
Ejemplo n.º 6
0
 def test_redirect_author(self):
     """A ``/type/redirect`` author should produce a Solr delete-by-id request."""
     redirected = make_author(key='/authors/OL24A',
                              type={'key': '/type/redirect'})
     update_work.data_provider = FakeDataProvider([redirected])

     result = update_work.update_author('/authors/OL24A')

     assert isinstance(result, list)
     head = result[0]
     assert isinstance(head, update_work.DeleteRequest)
     expected_xml = '<delete><id>/authors/OL24A</id></delete>'
     assert head.toxml() == expected_xml
Ejemplo n.º 7
0
def solr_update_authors(authors_to_update):
    """Post a batch of author merges to the ``authors`` Solr index.

    Args:
        authors_to_update: iterable of mappings, each read for the keys
            ``'redirects'`` (author keys whose ids are deleted),
            ``'master_key'`` and ``'master'`` (passed to ``update_author``).

    One ``<delete>`` message plus the ``update_author`` requests are collected
    for every entry, posted in a single call and then committed.  Nothing is
    posted or committed when there are no entries.

    Raises:
        Whatever building the delete message raises; the entry's redirect
        list is printed first to help diagnose the failure.
    """
    # Accumulate across ALL authors; assigning a fresh list inside the loop
    # would discard every author but the last one.
    author_updates = []
    for a in authors_to_update:
        try:
            redirect_ids = ''.join(
                '<id>%s</id>' % re_author_key.match(akey).group(1)
                for akey in a['redirects'])
        except Exception:
            # Show the offending redirect list, then let the error propagate.
            print('redirects')
            print(a['redirects'])
            raise
        author_updates.append('<delete>' + redirect_ids + '</delete>')
        author_updates += update_author(a['master_key'], a=a['master'], handle_redirects=False)
    if not author_updates:
        return
    solr_update(author_updates, index='authors', debug=False)
    solr_update(['<commit/>'], index='authors', debug=True)
Ejemplo n.º 8
0
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
        #            if num % 1000 == 0:
        #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
    last_update = time()
    print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, "w"), offset
Ejemplo n.º 9
0
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot']) 
                    ol.save(w['key'], w, 'avoid author redirect')
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
    last_update = time()
    print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 10
0
 def test_update_author(self, monkeypatch):
     """An author with zero Solr hits should yield a single ``<add>`` request."""
     somebody = make_author(key='/authors/OL25A', name='Somebody')
     update_work.data_provider = FakeDataProvider([somebody])

     # Smallest Solr reply the code accepts: empty facets and no documents.
     canned_json = """{
         "facet_counts": {
             "facet_fields": {
                 "place_facet": [], "person_facet": [], "subject_facet": [], "time_facet": []
             }
         },
         "response": {"numFound": 0}
     }"""

     def fake_urlopen(url):
         # A fresh stream per call so each caller can read it from the start.
         return StringIO(canned_json)

     monkeypatch.setattr(update_work, 'urlopen', fake_urlopen)

     result = update_work.update_author('/authors/OL25A')

     assert len(result) == 1
     assert isinstance(result, list)
     head = result[0]
     assert isinstance(head, update_work.UpdateRequest)
     assert head.toxml().startswith('<add>')
     assert '<field name="key">/authors/OL25A</field>' in head.toxml()
Ejemplo n.º 11
0
 def test_update_author(self, monkeypatch):
     """Indexing an author that Solr does not know yet emits one ``<add>`` request."""
     fake_author = make_author(key='/authors/OL25A', name='Somebody')
     update_work.data_provider = FakeDataProvider([fake_author])

     # Bare-bones Solr answer: no facet values and a hit count of zero.
     empty_reply = """{
         "facet_counts": {
             "facet_fields": {
                 "place_facet": [], "person_facet": [], "subject_facet": [], "time_facet": []
             }
         },
         "response": {"numFound": 0}
     }"""

     def stub_urlopen(url):
         # Every call gets its own readable stream over the same text.
         return StringIO(empty_reply)

     monkeypatch.setattr(update_work, 'urlopen', stub_urlopen)

     produced = update_work.update_author('/authors/OL25A')

     assert len(produced) == 1
     assert isinstance(produced, list)
     only_request = produced[0]
     assert isinstance(only_request, update_work.UpdateRequest)
     assert only_request.toxml().startswith('<add>')
     assert '<field name="key">/authors/OL25A</field>' in only_request.toxml()
Ejemplo n.º 12
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update),
                                                   len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type'][
                    'key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update(
                        (subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print('update author:', repr(akey))
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print('akey:', repr(akey))
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True)

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 13
0
 def test_redirect_author(self):
     """A ``/type/redirect`` author should produce a JSON delete command for its key."""
     redirected = make_author(key='/authors/OL24A',
                              type={'key': '/type/redirect'})
     update_work.data_provider = FakeDataProvider([redirected])

     result = update_work.update_author('/authors/OL24A')

     expected_command = '"delete": ["/authors/OL24A"]'
     assert result[0].to_json_command() == expected_command
Ejemplo n.º 14
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', `akey`
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', `akey`
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], index='authors', debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True, index='subjects')

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 15
0
        best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]

if __name__ == '__main__':
    # Script entry point: the first command-line argument is appended to
    # '/authors/' to form the author key to process.
    akey = '/authors/' + sys.argv[1]

    title_redirects = find_title_redirects(akey)
    books = get_books(akey, books_query(akey))
    works = find_works(akey, books, existing=title_redirects)
    to_update = update_works(akey, works, do_updates=True)

    # Gather the Solr requests for every work returned by update_works.
    requests = []
    for w in to_update:
        requests.extend(update_work(w))

    # Work requests (plus a commit) are posted only if any work came back.
    if to_update:
        work_payload = requests + ['<commit />']
        solr_update(work_payload, debug=True)

    # The author's own requests are posted and committed unconditionally.
    requests = update_author(akey)
    solr_update(requests + ['<commit/>'], debug=True)
Ejemplo n.º 16
0
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]


if __name__ == '__main__':
    # Command-line entry: argv[1] is turned into an '/authors/...' key.
    akey = '/authors/' + sys.argv[1]

    title_redirects = find_title_redirects(akey)
    author_books = get_books(akey, books_query(akey))
    works = find_works(akey, author_books, existing=title_redirects)
    to_update = update_works(akey, works, do_updates=True)

    # One batch of Solr requests covering every work update_works returned.
    requests = []
    for w in to_update:
        requests.extend(update_work(w))

    # Skip the work batch entirely when update_works returned nothing.
    if to_update:
        solr_update(requests + ['<commit />'], debug=True)

    # Post and commit the requests for the author record in every case.
    requests = update_author(akey)
    author_payload = requests + ['<commit/>']
    solr_update(author_payload, debug=True)