Esempio n. 1
0
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot']) 
                    ol.save(w['key'], w, 'avoid author redirect')
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
    last_update = time()
    print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset
Esempio n. 2
0
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
        #            if num % 1000 == 0:
        #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
    last_update = time()
    print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, "w"), offset
Esempio n. 3
0
def add_cover_to_work(w):
    """Pick a cover edition for work ``w`` and save it, unless one is set.

    Editions of the work are first queried restricted to language
    ``/l/eng``; if ``pick_cover`` finds nothing there, all editions of the
    work are tried. When no cover is found the work is left untouched.
    Otherwise ``w['cover_edition']`` is set and the work is saved through the
    module-level ``ol`` client, which is created and logged in as WorkBot on
    first use. The result of the save is printed.

    Args:
        w: work document (dict-like) with at least a ``'key'`` entry.
    """
    # ``ol`` is assigned below, so it must be declared global; without this
    # declaration the ``ol is None`` test raised UnboundLocalError.
    global ol
    if 'cover_edition' in w:
        return
    q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None, 'languages': '/l/eng'}
    cover_edition = pick_cover(query_iter(q))
    if not cover_edition:
        # No usable cover among the '/l/eng' editions: retry with any language.
        q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None}
        cover_edition = pick_cover(query_iter(q))
        if not cover_edition:
            return
    w['cover_edition'] = Reference(cover_edition)
    if ol is None:
        rc = read_rc()
        ol = OpenLibrary("http://openlibrary.org")
        ol.login('WorkBot', rc['WorkBot'])

    print(ol.save(w['key'], w, 'added cover to work'))
Esempio n. 4
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update),
                                                   len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type'][
                    'key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update(
                        (subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print('update author:', repr(akey))
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print('akey:', repr(akey))
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True)

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Esempio n. 5
0
import re, web, sys
import simplejson as json
from urllib2 import urlopen, URLError
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.importer.db_read import get_mc
from time import sleep
from openlibrary.catalog.title_page_img.load import add_cover_image
from openlibrary.api import OpenLibrary, unmarshal, marshal
from pprint import pprint

# Import-time setup: read the rc settings and log the shared ``ol`` client in
# as ImportBot (rc['ImportBot'] is presumably that bot's password -- TODO
# confirm). make_redirect() below saves through this client.
rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

# Matches "<name>_meta.mrc:0:<digits>" and captures <name>, which may not
# contain '/'. Raw string so '\d' is a regex escape, and the dot is escaped
# so it only matches a literal '.'.
re_meta_mrc = re.compile(r'^([^/]*)_meta\.mrc:0:\d+$')

def make_redirect(old, new, msg='replace with redirect'):
    """Overwrite the document at key ``old`` with a redirect to ``new``.

    The redirect document is saved through the module-level ``ol`` client,
    using ``msg`` as the save comment.
    """
    redirect_type = {'key': '/type/redirect'}
    ol.save(old, {'type': redirect_type, 'location': new}, msg)

def fix_toc(e):
    """Return a rebuilt table of contents for edition ``e``, or None.

    None is returned when ``e`` has no (or an empty) ``table_of_contents``,
    or when its first entry is a dict that either has a ``'pagenum'`` key or
    has ``'type'`` equal to ``'/type/toc_item'``. Otherwise every entry that
    is not the empty string is converted to a
    ``{'title': ..., 'type': '/type/toc_item'}`` dict. The edition key and
    the current table of contents are printed before the check.
    """
    toc = e.get('table_of_contents', None)
    if not toc:
        return
    print(e['key'])
    pprint(toc)
    first = toc[0]
    # http://openlibrary.org/books/OL789133M - /type/toc_item missing from table_of_contents
    if isinstance(first, dict):
        if 'pagenum' in first or first['type'] == '/type/toc_item':
            return
    rebuilt = []
    for entry in toc:
        if entry != u'':
            rebuilt.append({'title': unicode(entry), 'type': '/type/toc_item'})
    return rebuilt
Esempio n. 6
0
from __future__ import print_function
from openlibrary.catalog.merge.merge_marc import *
from openlibrary.catalog.read_rc import read_rc
import openlibrary.catalog.merge.amazon as amazon
from openlibrary.catalog.get_ia import *
from openlibrary.catalog.importer.db_read import withKey, get_mc
from openlibrary.api import OpenLibrary, Reference
import openlibrary.catalog.marc.fast_parse as fast_parse
import xml.parsers.expat
import web, sys
from time import sleep

import six

# Import-time setup. ``rc`` supplies the ImportBot login value and the
# connection settings for the database below.
rc = read_rc()

# Shared Open Library client, logged in as ImportBot.
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

# web.py handle for the MySQL database named 'archive'; user, password and
# host all come from the rc settings.
ia_db = web.database(dbn='mysql',
                     db='archive',
                     user=rc['ia_db_user'],
                     pw=rc['ia_db_pass'],
                     host=rc['ia_db_host'])
# presumably turns off web.py's query echo -- TODO confirm
ia_db.printing = False

# Matches "<name>_meta" or "<name>_marc" followed by ".mrc" or ".xml";
# groups are (name, "meta"|"marc", "mrc"|"xml"). Raw string so that '\.' is
# not an invalid string escape.
re_meta_marc = re.compile(r'([^/]+)_(meta|marc)\.(mrc|xml)')

# presumably the minimum match score used by the merge code -- TODO confirm
threshold = 875
# Configure the amazon merge module's ISBN-match value at import time
# (presumably a score weight -- TODO confirm).
amazon.set_isbn_match(225)
Esempio n. 7
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', `akey`
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', `akey`
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], index='authors', debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True, index='subjects')

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Esempio n. 8
0
def _repair_work_redirects(akey, fh_log):
    """Map the author's editions to works, repairing redirected works first.

    Repeats the edition query until no edition points at a work of type
    ``/type/redirect``; editions that do are re-pointed at the redirect
    location and saved. Returns ``(work_to_edition, edition_to_work,
    work_by_key)``.
    """
    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            for w in e.get('works', None) or []:
                work_to_edition[w['key']].add(e['key'])
                edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print('redirect found', w['key'], '->', wkey, editions, file=fh_log)
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print('no redirects left', file=fh_log)
            return work_to_edition, edition_to_work, work_by_key
        print('save redirects', file=fh_log)
        try:
            ol.save_many(fix_redirects, "merge works")
        except Exception:
            # Dump what we tried to save to help debugging, then re-raise.
            for r in fix_redirects:
                print(r)
            raise


def _update_works_logged(akey, works, do_updates, fh_log):
    """Match ``works`` against existing works and apply the updates.

    Does the real work of ``update_works``; ``fh_log`` is the open log file.
    """
    print(akey, file=fh_log)
    print('works:', file=fh_log)
    pprint(works, fh_log)

    work_to_edition, edition_to_work, work_by_key = _repair_work_redirects(akey, fh_log)

    print('edition_to_work:', file=fh_log)
    print(repr(dict(edition_to_work)), file=fh_log)
    print(file=fh_log)
    print('work_to_edition', file=fh_log)
    print(repr(dict(work_to_edition)), file=fh_log)
    print(file=fh_log)

    def edition_key(e):
        # Editions are given either as dicts with a 'key' or as bare keys.
        return e['key'] if isinstance(e, dict) else e

    # 1st pass: existing works whose title equals the title of a new work
    # sharing one of its editions.
    work_title_match = {}
    for w in works:
        for e in w['editions']:
            for wkey in edition_to_work.get(edition_key(e), []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except Exception:
                    print('bad work:', wkey)
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    # 2nd pass: per new work, count shared editions for each existing work,
    # ignoring existing works that title-matched a different new work.
    works_by_title = {}
    wkey_to_new_title = defaultdict(set)
    for w in works:
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            for wkey in edition_to_work.get(edition_key(e), []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    # 3rd pass: flag new works that share an existing work with a new work
    # of a different title.
    existing_work_with_conflict = defaultdict(set)
    for w in works:
        for wkey in w['existing_works']:
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    # Resolve each conflict in favour of the title carried by the most
    # editions of the existing work; the other titles lose their matches.
    for wkey, titles in existing_work_with_conflict.items():
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(titles, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in titles:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()

    def log_repeats(updated):
        # Report keys that an earlier work in this session already updated.
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')

    for w in works:  # 4th pass: apply the updates
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = next(iter(w['existing_works']))
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            log_repeats(updated)
            # Record the keys only after checking all of them; doing it
            # inside the loop made every key after the first look like a
            # repeat.
            works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
            # As above: updating inside the loop tripped the assertion as
            # soon as ``updated`` held more than one key.
            works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'])
        best_match = max(w['existing_works'].items(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        log_repeats(updated)
        works_updated_this_session.update(updated)

    return [withKey(key) for key in works_updated_this_session]


def update_works(akey, works, do_updates=False):
    """Merge newly found ``works`` of author ``akey`` into existing works.

    Logs in as WorkBot, opens a timestamped log file under
    ``/1/var/log/openlibrary/work_finder/``, repairs editions that point at
    redirected works, then matches each new work against the existing works
    sharing its editions and either updates the best match or creates a new
    work.

    Args:
        akey: author key used to query the author's editions.
        works: iterable of work dicts with ``'title'`` and ``'editions'``.
        do_updates: must be true; the function asserts it, so dry runs are
            currently refused.

    Returns:
        A list with ``withKey(key)`` for every work key updated here.

    Raises:
        AssertionError: ``do_updates`` is false, or a consistency check on
            the matching fails.
    """
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    log_path = '/1/var/log/openlibrary/work_finder/' + strftime('%F_%T')
    # Context manager so the log is flushed and closed even on errors.
    with open(log_path, 'w') as fh_log:
        return _update_works_logged(akey, list(works), do_updates, fh_log)
Esempio n. 9
0
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'),
                  'w')
    works = list(works)
    print >> fh_log, akey
    print >> fh_log, 'works:'
    pprint(works, fh_log)

    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print >> fh_log, 'redirect found', w[
                    'key'], '->', wkey, editions
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print >> fh_log, 'no redirects left'
            break
        print >> fh_log, 'save redirects'
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print r
            raise

    all_existing = set()
    work_keys = []
    print >> fh_log, 'edition_to_work:'
    print >> fh_log, ` dict(edition_to_work) `
    print >> fh_log
    print >> fh_log, 'work_to_edition'
    print >> fh_log, ` dict(work_to_edition) `
    print >> fh_log

    #    open('edition_to_work', 'w').write(`dict(edition_to_work)`)
    #    open('work_to_edition', 'w').write(`dict(work_to_edition)`)
    #    open('work_by_key', 'w').write(`dict(work_by_key)`)

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print 'bad work:', wkey
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)

    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w[
                        'title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)

    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].iteritems():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.iteritems():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [
            title for title in wkey_to_new_title[existing_wkey]
            if title != w['title']
        ]

    works_updated_this_session = set()

    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = w['existing_works'].keys()[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition,
                                                  do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print >> fh_log, wkey, 'already updated!'
                    print wkey, 'already updated!'
                works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
                works_updated_this_session.update(updated)
            continue

        assert not any(
            other_matches(w, wkey) for wkey in w['existing_works'].iterkeys())
        best_match = max(w['existing_works'].iteritems(),
                         key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition,
                                              do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]