def run_work_finder(i):
    t0 = time()
    d = i['data']
    print 'timestamp:', i['timestamp']
    print 'author:', d['author']
    print '%d records updated:' % len(d['result'])
    if 'changeset' not in d:
        print 'no changeset in author merge'
        print
        return
    changeset = d['changeset']

    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    akey = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    print d['changeset']
    print 'dups:', dup_keys

    title_redirects = find_title_redirects(akey)
    works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects)
    print 'author:', akey
    print 'works:', works
    updated = update_works(akey, works, do_updates=True)
    print '%d records updated' % len(updated)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
def run_work_finder(i):
    """Re-run the work finder for the master author of an author merge.

    ``i`` is a mapping; this function reads ``i['timestamp']`` and
    ``i['data']`` (its ``author`` and ``result`` entries and, when
    present, ``changeset``). A record without a changeset is reported and
    skipped. Otherwise the works for the ``master`` key are recomputed
    with ``find_works`` and passed to ``update_works(..., do_updates=True)``,
    and the elapsed time is appended to the externally defined
    ``update_times``.

    Raises AssertionError when the changeset data is not exactly a
    ``master``/``duplicates`` pair; the changeset is printed before the
    error propagates.
    """
    t0 = time()
    d = i['data']
    print('timestamp:', i['timestamp'])
    print('author:', d['author'])
    print('%d records updated:' % len(d['result']))
    if 'changeset' not in d:
        print('no changeset in author merge')
        print()
        return
    changeset = d['changeset']

    # Validate with an explicit raise instead of ``assert`` so the check
    # still runs when Python is started with -O.
    try:
        merge = changeset['data']
        if not (len(merge) == 2 and 'master' in merge
                and 'duplicates' in merge):
            raise AssertionError(
                'changeset data must hold exactly master and duplicates')
    except Exception:
        # Show the offending changeset, then let the error propagate.
        print(d['changeset'])
        raise
    akey = merge['master']
    dup_keys = merge['duplicates']
    print('dups:', dup_keys)

    title_redirects = find_title_redirects(akey)
    works = find_works(get_books(akey, books_query(akey)),
                       existing=title_redirects)
    print('author:', akey)
    print('works:', works)
    updated = update_works(akey, works, do_updates=True)
    print('%d records updated' % len(updated))

    t1 = time() - t0
    update_times.append(t1)
    print('update takes: %d seconds' % t1)
    print()
def run_work_finder(i):
    t0 = time()
    d = i["data"]
    print "timestamp:", i["timestamp"]
    print "author:", d["author"]
    print "%d records updated:" % len(d["result"])
    if "changeset" not in d:
        print "no changeset in author merge"
        print
        return
    changeset = d["changeset"]

    try:
        assert len(changeset["data"]) == 2 and "master" in changeset["data"] and "duplicates" in changeset["data"]
    except:
        print d["changeset"]
        raise
    akey = changeset["data"]["master"]
    dup_keys = changeset["data"]["duplicates"]
    # print d['changeset']
    print "dups:", dup_keys

    title_redirects = find_title_redirects(akey)
    works = find_works(get_books(akey, books_query(akey)), existing=title_redirects)
    print "author:", akey
    print "works:", works
    updated = update_works(akey, works, do_updates=True)
    print "%d records updated" % len(updated)

    t1 = time() - t0
    update_times.append(t1)
    print "update takes: %d seconds" % t1
    print
Example #4
0
#!/usr/bin/python

from __future__ import print_function
from openlibrary.catalog.works.find_works import find_title_redirects, find_works, get_books, books_query, update_works
import sys
from pprint import pprint

# Command-line driver: look up title redirects for one author key, build
# the works from that author's books, then call update_works with
# do_updates=True, pretty-printing each intermediate result.
if len(sys.argv) < 2:
    # Exit with a usage line rather than an IndexError traceback.
    sys.exit('usage: %s AUTHOR_KEY' % sys.argv[0])

akey = sys.argv[1]
title_redirects = find_title_redirects(akey)
print('title_redirects:')
pprint(title_redirects)
print()

# Materialise the result so it can be printed and then passed on intact.
works = list(
    find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects)
)
print('works:')
pprint(works)
print()

updated = update_works(akey, works, do_updates=True)
print('updated works:')
pprint(updated)
Example #5
0
def test_find_works():
    """Check the work-finding helpers on small in-memory edition lists.

    Covers, in order: empty input, a single edition, two editions sharing
    a title, and four editions where an accented title carries a
    ``work_title`` that matches the plain title of the others.
    """
    # No editions in, no works out.
    works = list(find_works([]))
    assert works == []

    # --- One edition -----------------------------------------------------
    books = [{'title': 'Magic', 'key': '/books/OL1M'}]
    book_iter = get_books('', books, do_get_mc=False)

    books2 = list(book_iter)
    assert books2 == [{
        'key': '/books/OL1M',
        'norm_title': 'magic',
        'title': 'Magic'
    }]

    var = find_works2(books2)
    assert var['equiv'] == {}
    assert var['norm_titles'] == {'magic': 1}
    assert var['books_by_key'] == {'/books/OL1M': books2[0]}
    assert var['books'] == books2
    assert var['rev_wt'] == {}

    assert build_work_title_map({}, {'magic': 1}) == {}
    assert build_work_title_map({}, {'magic': 2, 'test': 0}) == {}

    works = list(find_works(books2, do_get_mc=False))
    expect = [{
        'title':
        'Magic',
        'editions': [{
            'key': '/books/OL1M',
            'norm_title': 'magic',
            'title': 'Magic'
        }],
    }]
    assert works == expect

    # --- Two editions with the same title --------------------------------
    books = [
        {
            'title': 'Magic',
            'key': '/books/OL1M'
        },
        {
            'title': 'Magic',
            'key': '/books/OL2M'
        },
    ]
    book_iter = get_books('', books, do_get_mc=False)
    books2 = list(book_iter)

    var = find_works2(books2)
    assert var['equiv'] == {}
    assert var['norm_titles'] == {'magic': 2}
    assert var['books_by_key'] == {
        '/books/OL1M': books2[0],
        '/books/OL2M': books2[1]
    }
    assert var['books'] == books2
    assert var['rev_wt'] == {}

    works = list(find_works(books2, do_get_mc=False))
    expect = [{
        'title':
        'Magic',
        'editions': [
            {
                'key': '/books/OL1M',
                'norm_title': 'magic',
                'title': 'Magic'
            },
            {
                'key': '/books/OL2M',
                'norm_title': 'magic',
                'title': 'Magic'
            },
        ],
    }]
    assert works == expect

    # --- Accented title linked to a plain one through work_title ---------
    magico = u'm\xe1gico'

    assert normalize(magico) == magico

    books = [
        {
            'title': magico,
            'work_title': ['magic'],
            'key': '/books/OL1M'
        },
        {
            'title': 'magic',
            'key': '/books/OL2M'
        },
        {
            'title': magico,
            'work_title': ['magic'],
            'key': '/books/OL3M'
        },
        {
            'title': 'magic',
            'key': '/books/OL4M'
        },
    ]
    expect_keys = sorted(e['key'] for e in books)
    book_iter = get_books('', books, do_get_mc=False)
    books2 = list(book_iter)

    expect = [
        {
            'key': '/books/OL1M',
            'norm_title': magico,
            'work_title': 'magic',
            'norm_wt': 'magic',
            'title': magico
        },
        {
            'key': '/books/OL2M',
            'norm_title': 'magic',
            'title': 'magic'
        },
        {
            'key': '/books/OL3M',
            'norm_title': magico,
            'work_title': 'magic',
            'norm_wt': 'magic',
            'title': magico
        },
        {
            'key': '/books/OL4M',
            'norm_title': 'magic',
            'title': 'magic'
        },
    ]

    # Compare element by element so a failure points at one edition.
    assert len(books2) == 4
    for actual, wanted in zip(books2, expect):
        assert actual == wanted

    var = find_works2(books2)
    assert var['equiv'] == {(magico, 'magic'): 2}
    assert var['norm_titles'] == {magico: 2, 'magic': 2}
    assert len(var['books_by_key']) == 4
    bk = var['books_by_key']
    assert bk['/books/OL1M'] == books2[0]
    assert bk['/books/OL2M'] == books2[1]
    assert bk['/books/OL3M'] == books2[2]
    assert bk['/books/OL4M'] == books2[3]
    assert var['books'] == books2
    assert var['rev_wt'] == {'magic': {'magic': 2}}

    title_map = build_work_title_map(var['equiv'], var['norm_titles'])

    assert title_map == {magico: 'magic'}

    find_works3(var)
    assert var['works'] == {'magic': {'magic': expect_keys}}
    assert var['work_titles'] == {'magic': ['/books/OL1M', '/books/OL3M']}

    sorted_works = find_work_sort(var)
    assert sorted_works == [(6, 'magic', {'magic': expect_keys})]

    works = list(find_works(books2, do_get_mc=False))

    # Only the grouping is pinned down here: one work titled 'magic' that
    # holds all four edition keys. The edition dicts are not compared.
    assert len(works) == 1
    editions = works[0]['editions']
    assert sorted(e['key'] for e in editions) == expect_keys
    assert len(editions) == 4
    del works[0]['editions']
    assert works[0] == {'title': 'magic'}
Example #6
0
 elif action == 'save_many':
     author_merge = i['data']['comment'] == 'merge authors'
     if author_merge and skip_author_merge:
         continue
     if author_merge and only_author_merge:
         continue
     if handle_author_merge and not i['data']['author'].endswith(
             'Bot') and author_merge:
         first_redirect = i['data']['query'][0]
         assert first_redirect['type']['key'] == '/type/redirect'
         akey = first_redirect['location']
         if akey.startswith('/authors/'):
             akey = '/a/' + akey[len('/authors/'):]
         title_redirects = find_title_redirects(akey)
         works = find_works(akey,
                            get_books(akey, books_query(akey)),
                            existing=title_redirects)
         updated = update_works(akey, works, do_updates=True)
         works_to_update.update(w['key'] for w in updated)
     for query in i['data']['query']:
         key = query.pop('key')
         process_save(key, query)
 # store.put gets called when any document is updated in the store. Borrowing/Returning a book triggers one.
 elif action == 'store.put':
     # A sample record looks like this:
     # {
     #   "action": "store.put",
     #   "timestamp": "2011-12-01T00:00:44.241604",
     #   "data": {
     #       "data": {"borrowed": "false", "_key": "ebooks/books/OL5854888M", "_rev": "975708", "type": "ebook", "book_key": "/books/OL5854888M"},
     #       "key": "ebooks/books/OL5854888M"