Example 1
def update_edition(e):
    if not is_single_core():
        return []

    ekey = e['key']
    logger.info("updating edition %s", ekey)

    wkey = e.get('works') and e['works'][0]['key']
    w = wkey and withKey(wkey)
    authors = []

    if w:
        authors = [withKey(a['author']['key']) for a in w.get("authors", []) if 'author' in a]

    request_set = SolrRequestSet()
    request_set.delete(ekey)

    q = {'type': '/type/redirect', 'location': ekey}
    redirect_keys = [r['key'] for r in query_iter(q)]
    for k in redirect_keys:
        request_set.delete(k)

    doc = EditionBuilder(e, w, authors).build()
    request_set.add(doc)
    return request_set.get_requests()
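A minimal usage sketch for update_edition. The edition key below is illustrative, and it assumes the strings returned by SolrRequestSet.get_requests() are the raw request strings that solr_update() (used in the later examples) accepts:

# Illustrative key; assumes SolrRequestSet requests are raw strings
# compatible with solr_update() from the other examples.
e = withKey('/books/OL1M')
requests = update_edition(e)
if requests:
    solr_update(requests + ['<commit />'], debug=True)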
Example 2
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        logger.info("processing edition %s", k)
        edition = withKey(k)

        if edition and edition['type']['key'] == '/type/redirect':
            logger.warn("Found redirect to %s", edition['location'])
            edition = withKey(edition['location'])

        if not edition:
            logger.warn("No edition found for key %r. Ignoring...", k)
            continue
        elif edition['type']['key'] != '/type/edition':
            logger.warn("Found a document of type %r. Ignoring...", edition['type']['key'])
            continue

        if edition.get("works"):
            wkeys.add(edition["works"][0]['key'])
        else:
            # index the edition as it does not belong to any work
            wkeys.add(k)
        
    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))
    
    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = withKey(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:    
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)
    
    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)
    if requests:  
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)
    logger.info("END update_keys")
Example 3
def try_amazon(key):
    thing = withKey(key)
    if 'isbn_10' not in thing:
        return None
    if 'authors' in thing:
        authors = []
        for a in thing['authors']:
            author_thing = withKey(a['key'])
            if 'name' in author_thing:
                authors.append(author_thing['name'])
    else:
        authors = []
    return amazon.build_amazon(thing, authors)
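A usage sketch for try_amazon (key illustrative); it returns None when the edition has no isbn_10, so callers should check the result:

record = try_amazon('/books/OL1M')  # illustrative key; None without isbn_10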
Example 4
def hide_books(start):
    mend = []
    fix_works = set()
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by updated", {'start': start})
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if 'printdisabled' in collections:
                continue
        print `ia`, row.updated
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print e['key'], `e.get('title', None)`
            del e['ocaid']
            mend.append(e)
    print 'removing links from %d editions' % len(mend)
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
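A usage sketch for hide_books: start is interpolated into the SQL comparison against metadata.updated, so a timestamp string in the database's format is expected (the value below is illustrative):

hide_books('2011-12-01 00:00:00')  # illustrative timestamp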
Example 5
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        print "processing edition", k
        edition = withKey(k)
        if edition.get("works"):
            wkeys.add(edition["works"][0]['key'])
        else:
            # index the edition as it does not belong to any work
            wkeys.add(k)
        
    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))
    
    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = withKey(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:    
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)
    
    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)
    if requests:  
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)
    logger.info("END update_keys")
Example 6
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    m = re_author_key.match(akey)
    if not m:
        print 'bad key:', akey
        return
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect', '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id] 
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        print a['type']['key']
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    url = 'http://' + get_solr('works') + '/solr/works/select?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)
    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    add_field(doc, 'key', author_id)
    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        redirects = ''.join('<id>%s</id>' % re_author_key.match(r['key']).group(1) for r in query_iter(q))
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')

    requests.append(tostring(add).encode('utf-8'))
    return requests
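The returned requests are raw Solr XML strings: an optional <delete> listing redirected author ids, then the serialized <add> document. A usage sketch (author key illustrative), following the authors-index update pattern from the other examples:

requests = update_author('/authors/OL22098A')  # illustrative key
if requests:
    solr_update(requests + ['<commit/>'], index='authors', debug=True)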
Example 7
def fix_up_authors(w, akey, editions):
    print 'looking for author:', akey
    #print (w, akey, editions)
    seen_akey = False
    need_save = False
    for a in w.get('authors', []):
        print 'work:', w['key']
        obj = withKey(a['author']['key'])
        if obj['type']['key'] == '/type/redirect':
            a['author']['key'] = obj['location']
            print obj['key'], 'redirects to', obj['location']
            #a['author']['key'] = '/authors/' + re_author_key.match(a['author']['key']).group(1)
            assert a['author']['key'].startswith('/authors/')
            obj = withKey(a['author']['key'])
            assert obj['type']['key'] == '/type/author'
            need_save = True
        if akey == a['author']['key']:
            seen_akey = True
    if seen_akey:
        if need_save:
            print 'need save:', a
        return need_save
    try:
        ekey = editions[0]['key']
    except:
        print 'editions:', editions
        raise
    #print 'author %s missing. copying from first edition %s' % (akey, ekey)
    #print 'before:'
    for a in w.get('authors', []):
        print a
    e = withKey(ekey)
    #print e
    if not e.get('authors', None):
        print 'no authors in edition'
        return
    print 'authors from first edition', e['authors']
    w['authors'] = [{
        'type': '/type/author_role',
        'author': a
    } for a in e['authors']]
    #print 'after:'
    #for a in w['authors']:
    #    print a
    return True
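A usage sketch for fix_up_authors: given a work dict w, an author key akey, and the work's edition dicts, a truthy return means w was modified (redirects resolved, or authors copied from the first edition) and should be saved, e.g. with the ol.save pattern from Example 9:

if fix_up_authors(w, akey, editions):
    ol.save(w['key'], w, 'fix author redirect')  # commit message illustrative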
Example 8
def fix_up_authors(w, akey, editions):
    print('looking for author:', akey)
    #print (w, akey, editions)
    seen_akey = False
    need_save = False
    for a in w.get('authors', []):
        print('work:', w['key'])
        obj = withKey(a['author']['key'])
        if obj['type']['key'] == '/type/redirect':
            a['author']['key'] = obj['location']
            print(obj['key'], 'redirects to', obj['location'])
            #a['author']['key'] = '/authors/' + re_author_key.match(a['author']['key']).group(1)
            assert a['author']['key'].startswith('/authors/')
            obj = withKey(a['author']['key'])
            assert obj['type']['key'] == '/type/author'
            need_save = True
        if akey == a['author']['key']:
            seen_akey = True
    if seen_akey:
        if need_save:
            print('need save:', a)
        return need_save
    try:
        ekey = editions[0]['key']
    except:
        print('editions:', editions)
        raise
    #print 'author %s missing. copying from first edition %s' % (akey, ekey)
    #print 'before:'
    for a in w.get('authors', []):
        print(a)
    e = withKey(ekey)
    #print e
    if not e.get('authors', None):
        print('no authors in edition')
        return
    print('authors from first edition', e['authors'])
    w['authors'] = [{'type':'/type/author_role', 'author':a} for a in e['authors']]
    #print 'after:'
    #for a in w['authors']:
    #    print a
    return True
Example 9
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot']) 
                    ol.save(w['key'], w, 'avoid author redirect')
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
    last_update = time()
    print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset
Example 10
def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
        #            if num % 1000 == 0:
        #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
    last_update = time()
    print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, "w"), offset
Example 11
def fix_up_authors(w, akey, editions):
    seen_akey = False
    need_save = False
    print 'fix_up_authors'
    for a in w.get('authors', []):
        print a
        obj = withKey(a['author']['key'])
        if obj['type']['key'] == '/type/redirect':
            a['author']['key'] = obj['location']
            a['author']['key'] = '/authors/' + re_author_key.match(a['author']['key']).group(1)
            print 'getting:', a['author']['key']
            obj = withKey(a['author']['key'])
            print 'found:', obj
            assert obj['type']['key'] == '/type/author'
            need_save = True
        if akey == a['author']['key']:
            seen_akey = True
    if seen_akey:
        return need_save
    try:
        ekey = editions[0]['key']
    except:
        print 'editions:', editions
        raise
    print 'author %s missing. copying from first edition %s' % (akey, ekey)
    print 'before:'
    for a in w.get('authors', []):
        print a
    e = withKey(ekey)
    print e
    if not e.get('authors', None):
        print 'no authors in edition'
        return
    w['authors'] = [{'type':'/type/author_role', 'author':a} for a in e['authors']]
    print 'after:'
    for a in w['authors']:
        print a
    return True
Example 12
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        print "processing edition", k
        edition = withKey(k)
        if edition.get("works"):
            wkeys.add(edition["works"][0]['key'])
        
    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))
    
    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        w = withKey(k)
        requests += update_work(w, debug=True)
    if requests:    
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)
    
    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        requests += update_author(k)
    if requests:  
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)
    logger.info("END update_keys")
Example 13
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        print "processing edition", k
        edition = withKey(k)
        if edition.get("works"):
            wkeys.add(edition["works"][0]['key'])

    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))

    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        w = withKey(k)
        requests += update_work(w, debug=True)
    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        requests += update_author(k)
    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)
    logger.info("END update_keys")
Example 14
def update_keys(keys):
    # update works
    requests = []
    wkeys = [k for k in keys if k.startswith("/works/")]
    print "updating", wkeys
    for k in wkeys:
        w = withKey(k)
        requests += update_work(w, debug=True)
    if requests:    
        requests += ['<commit />']
        solr_update(requests, debug=True)
    
    # update authors
    requests = []    
    akeys = [k for k in keys if k.startswith("/authors/")]
    print "updating", akeys
    for k in akeys:
        requests += update_author(k)
    if requests:    
        requests += ['<commit />']
        solr_update(requests, index="authors", debug=True)        
Example 15
    def get_author(self, a):
        """Returns the author dict from author entry in the work.

            get_author({"author": {"key": "/authors/OL1A"}})
        """
        if 'author' not in a:  # OL Web UI bug
            return  # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1

        author = a['author']

        if 'type' in author:
            # means it is already the whole object.
            # It'll be like this when doing re-indexing of solr.
            return author

        key = a['author']['key']
        m = re_author_key.match(key)
        if not m:
            print 'invalid author key:', key
            return
        return withKey(key)
Example 16
    def get_author(self, a):
        """Returns the author dict from author entry in the work.

            get_author({"author": {"key": "/authors/OL1A"}})
        """
        if 'author' not in a: # OL Web UI bug
            return # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1

        author = a['author']

        if 'type' in author:
            # means it is already the whole object. 
            # It'll be like this when doing re-indexing of solr.
            return author
        
        key = a['author']['key']
        m = re_author_key.match(key)
        if not m:
            logger.error('invalid author key: %s', key)
            return
        return withKey(key)
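A usage sketch for get_author; builder stands in for whatever class defines this method (not shown in these excerpts). The entry may hold a bare reference or an already-loaded author object; both resolve to an author dict, and the malformed entries noted in the docstring return None:

author = builder.get_author({'author': {'key': '/authors/OL1A'}})  # illustrative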
Example 17
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print "hide start:", hide_start

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by scandate_dt",
        {"start": hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(";"))
            if "printdisabled" in collections or "lendinglibrary" in collections:
                continue
        print `ia`, row.updated
        for eq in query({"type": "/type/edition", "ocaid": ia}):
            print eq["key"]
            e = ol.get(eq["key"])
            if "ocaid" not in e:
                continue
            if "works" in e:
                fix_works.update(e["works"])
            print e["key"], ` e.get("title", None) `
            del e["ocaid"]
            mend.append(e)
        last_updated = row.updated
    print "removing links from %d editions" % len(mend)
    if not mend:
        return
    print ol.save_many(mend, "remove link")
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ["<commit/>"], debug=True)
    print >> open(hide_state_file, "w"), last_updated
Example 18
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print('hide start:', hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start",
        {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip()
                              for i in row.collection.split(';'))
            if ignore_noindex & collections:
                continue
        print((repr(ia), row.updated))
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print((e['key'], repr(e.get('title', None))))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print('removing links from %d editions' % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    print(last_updated, file=open(hide_state_file, 'w'))
Example 19
def hide_books(start):
    hide_start = open(hide_state_file).readline()[:-1]
    print 'hide start:', hide_start

    mend = []
    fix_works = set()
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start", {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if ignore_noindex & collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print eq['key']
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print(e['key'], repr(e.get('title', None)))
            del e['ocaid']
            mend.append(e)
        last_updated = row.updated
    print 'removing links from %d editions' % len(mend)
    if not mend:
        return
    print ol.save_many(mend, 'remove link')
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    print >> open(hide_state_file, 'w'), last_updated
Example 20
                assert first_redirect['type']['key'] == '/type/redirect'
                akey = first_redirect['location']
                if akey.startswith('/authors/'):
                    akey = '/a/' + akey[len('/authors/'):]
                title_redirects = find_title_redirects(akey)
                works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects)
                updated = update_works(akey, works, do_updates=True)
                works_to_update.update(w['key'] for w in updated)
            for query in i['data']['query']:
                key = query.pop('key')
                process_save(key, query)
        # store.put gets called when any document is updated in the store. Borrowing/Returning a book triggers one.
        elif action == 'store.put':
            # A sample record looks like this:
            # {
            #   "action": "store.put",
            #   "timestamp": "2011-12-01T00:00:44.241604",
            #   "data": {
            #       "data": {"borrowed": "false", "_key": "ebooks/books/OL5854888M", "_rev": "975708", "type": "ebook", "book_key": "/books/OL5854888M"},
            #       "key": "ebooks/books/OL5854888M"
            #   },
            #   "site": "openlibrary.org"
            # }
            data = i.get('data', {}).get("data")
            if data.get("type") == "ebook" and data.get("_key", "").startswith("ebooks/books/"):
                edition_key = data['book_key']
                process_save(edition_key, withKey(edition_key))
    since_last_update = time() - last_update
    if len(works_to_update) > work_limit or len(authors_to_update) > author_limit or since_last_update > time_limit:
        run_update()
Example 21
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                    break  # success; stop retrying
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', `akey`
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', `akey`
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], index='authors', debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no update needed:', (subject_type, subject_name, count)
            continue
        print 'update needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True, index='subjects')

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Example 22
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

for num, line in enumerate(open('/1/edward/imagepdf/possible_match2')):
    doc = eval(line)
    if 'publisher' not in doc:
        continue
    item_id = doc['item_id']
    if query({'type': '/type/edition', 'source_records': 'ia:' + item_id}):
        continue
    e = withKey(doc['ol'])
    if 'publishers' not in e:
        continue
    title_match = False
    if doc['title'] == e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title'] + e.get(
            'subtitle', ''):
        title_match = True
    elif doc['title'] == e['title'] + e.get('subtitle', ''):
        title_match = True
    if not title_match:
        continue
    if doc['publisher'] != e['publishers'][0]:
        continue
    print 'match:', item_id, doc['ol']
    add_source_records(doc['ol'], item_id)
Example 23
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])

    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
        #print 'editions_by_work:', editions_by_work
        work = obj_by_key[wkey]
        work['editions'] = editions_by_work[wkey]
        subjects = get_work_subjects(work)
        for subject_type, values in subjects.iteritems():
            subjects_to_update.update((subject_type, v) for v in values)
        try:
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
        except AuthorRedirect:
            work = withKey(wkey)
            work['editions'] = editions_by_work[wkey]
            ret = update_work(work, debug=True, resolve_redirects=True)
        work_updates += ret
    if work_updates:
        solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
Example 24
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
        print 'editions:', [e['key'] for e in w['editions']]

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))

    #print len(w['editions']), 'editions found'

    #print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:  # OL Web UI bug
            continue  # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:

            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a

            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print w['key']
            print
            raise AuthorRedirect
    for a in authors:
        print 'author:', a
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(
        e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get(
            'Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get(
                'Protected DAISY', 0) + 1
        #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
        #add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(e['subtitle'] for e in editions
                        if e.get('subtitle', None)
                        and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key',
                  re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(
        m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
    #    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set()  # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e['ia_collection']:
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(
        nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet',
                   (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
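A usage sketch for build_doc: it returns an ElementTree <doc> element (or None for a work without a title). The caller is assumed to wrap it in an <add> element and serialize it for solr_update, mirroring the XML-building pattern of the other examples (work key illustrative):

w = withKey('/works/OL1W')  # illustrative key
doc = build_doc(w)
if doc is not None:
    add = Element('add')
    add.append(doc)
    solr_update([tostring(add).encode('utf-8'), '<commit />'], debug=True)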
Example 25
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update),
                                                   len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                    break  # success; stop retrying
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update(
                        (subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print('update author:', repr(akey))
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print('akey:', repr(akey))
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no update needed:', (subject_type, subject_name, count)
            continue
        print 'update needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True)

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Example 26
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'),
                  'w')
    works = list(works)
    print >> fh_log, akey
    print >> fh_log, 'works:'
    pprint(works, fh_log)

    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print >> fh_log, 'redirect found', w['key'], '->', wkey, editions
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print >> fh_log, 'no redirects left'
            break
        print >> fh_log, 'save redirects'
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print r
            raise

    all_existing = set()
    work_keys = []
    print >> fh_log, 'edition_to_work:'
    print >> fh_log, `dict(edition_to_work)`
    print >> fh_log
    print >> fh_log, 'work_to_edition'
    print >> fh_log, `dict(work_to_edition)`
    print >> fh_log

    #    open('edition_to_work', 'w').write(`dict(edition_to_work)`)
    #    open('work_to_edition', 'w').write(`dict(work_to_edition)`)
    #    open('work_by_key', 'w').write(`dict(work_by_key)`)

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print 'bad work:', wkey
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)

    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)

    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].iteritems():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.iteritems():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [
            title for title in wkey_to_new_title[existing_wkey]
            if title != w['title']
        ]

    works_updated_this_session = set()

    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = w['existing_works'].keys()[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition,
                                                  do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print >> fh_log, wkey, 'already updated!'
                    print wkey, 'already updated!'
                works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
                works_updated_this_session.update(updated)
            continue

        assert not any(
            other_matches(w, wkey) for wkey in w['existing_works'].iterkeys())
        best_match = max(w['existing_works'].iteritems(),
                         key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition,
                                              do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
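A usage sketch for update_works, matching the call site visible in Example 20:

title_redirects = find_title_redirects(akey)
works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects)
updated = update_works(akey, works, do_updates=True)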
Example 27
    do_updates = False

    while True: # until redirects repaired
        q = {'type':'/type/edition', 'authors':akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_title = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                print 'redirect found'
                wkey = w['location']
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = withKey(ekey)
                    e['works'] = [Reference(wkey)]
                    fix_redirects.append(e)
                continue
            work_title[k] = w['title']
        if not fix_redirects:
            print 'no redirects left'
            break
        print 'save redirects'
        ol.save_many(fix_redirects, "merge works")
Example 28
def withKey_cached(key, obj_cache={}):
    if key not in obj_cache:
        obj_cache[key] = withKey(key)
    return obj_cache[key]
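The mutable default argument here is deliberate: obj_cache persists across calls, making this a process-wide memo cache for withKey lookups. A usage sketch (key illustrative):

withKey_cached('/works/OL1W')  # first call fetches via withKey()
withKey_cached('/works/OL1W')  # second call is served from obj_cache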
Example 29
 def resolve(a):
     if a['type']['key'] == '/type/redirect':
         a = withKey(a['location'])
     return a
Example 30
def withKey_cached(key, obj_cache={}):
    if key not in obj_cache:
        obj_cache[key] = withKey(key)
    return obj_cache[key]
Example 31
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    if akey == '/authors/':
        return
    m = re_author_key.match(akey)
    if not m:
        print 'bad key:', akey
    assert m
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect',
                            '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id]
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        print a['type']['key']
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    url = 'http://' + get_solr('works') + '/solr/works/select?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)
    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    add_field(doc, 'key', author_id)
    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        try:
            redirects = ''.join('<id>%s</id>' %
                                re_author_key.match(r['key']).group(1)
                                for r in query_iter(q))
        except AttributeError:
            print 'redirects:', [r['key'] for r in query_iter(q)]
            raise
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')

    requests.append(tostring(add).encode('utf-8'))
    return requests
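
The return value is a list of raw Solr XML strings: an optional <delete> covering author IDs that now redirect to this one, followed by one <add> holding the rebuilt document. A usage sketch mirroring the update_keys examples elsewhere on this page (the author key echoes the comment above):

requests = update_author('/authors/OL22098A')
if requests:
    requests += ['<commit />']
    solr_update(requests, index='authors', debug=True)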
Example n. 32
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print d['changeset']
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])
     
    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
        #print 'editions_by_work:', editions_by_work
        work = obj_by_key[wkey]
        work['editions'] = editions_by_work[wkey]
        subjects = get_work_subjects(work)
        for subject_type, values in subjects.iteritems():
            subjects_to_update.update((subject_type, v) for v in values)
        try:
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
        except AuthorRedirect:
            work = withKey(wkey)
            work['editions'] = editions_by_work[wkey]
            ret = update_work(work, debug=True, resolve_redirects=True)
        work_updates += ret
    solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
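
solr_updates expects a merge-authors changeset message. A hedged sketch of the minimal shape it reads, inferred from the accesses above (all values hypothetical):

i = {
    'timestamp': '2011-12-01T00:00:44',
    'data': {
        'author': '/people/some_bot',
        'changeset': {'data': {'master': '/authors/OL1A',
                               'duplicates': ['/authors/OL2A']}},
        'result': [],   # keys touched by the merge
        'query': [],    # full documents for those keys
    },
}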
Example n. 33
def pick_cover(editions):
    for pub_year, ekey in sorted(get_covers(editions), reverse=True):
        e = withKey(ekey)
        if e['type']['key'] == '/type/edition':
            return ekey
Example n. 34
    print item, edition_pool
    e1 = build_marc(rec)
    print e1

    match = False
    seen = set()
    for k, v in edition_pool.iteritems():
        for edition_key in v:
#            edition_key = '/books/' + re_edition_key.match(edition_key).match(1)
            if edition_key in seen:
                continue
            thing = None
            while not thing or thing['type']['key'] == '/type/redirect':
                seen.add(edition_key)
                thing = withKey(edition_key)
                assert thing
                if thing['type']['key'] == '/type/redirect':
                    print 'following redirect %s => %s' % (edition_key, thing['location'])
                    edition_key = thing['location']
            if try_merge(e1, edition_key, thing):
                print 'add source records:', edition_key, item
                print (edition_key, item)
                print >> add_src_rec, (edition_key, item)
                #add_source_records(edition_key, ia)
                #write_log(ia, when, "found match: " + edition_key)
                match = True
                break
    if not match:
        print full_rec
        print >> new_book, full_rec
Example n. 35
        except KeyboardInterrupt:
            raise
        except:
            pass
        sleep(2)

skip = True
skip = False
for line in open('/1/edward/jsondump/2009-07-29/has_ocaid'):
    key = line[:-1]
    if key == '/b/OL6539962M': # the end
        break
    if skip:
        if key == '/b/OL6539962M':
            skip = False
        else:
            continue
    if not has_cover_retry(key):
        print 'no cover'
        continue
    print key
    e = withKey(key)
    if not e.get('ocaid', None):
        print 'no ocaid'
        continue
    find_img(e['ocaid'].strip())

fh_log.close()

print 'finished'
Example n. 36
def build_doc(w):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        w['editions'] = list(query_iter(q))
        print 'editions:', [e['key'] for e in w['editions']]

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))

    print len(w['editions']), 'editions found'

    print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:
            continue
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    #subjects = four_types(find_subjects(get_marc_subjects(w)))
    subjects = {}
    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set( e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set( e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
#        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
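
build_doc returns a bare <doc> element, or None when the work has no title; callers are expected to wrap it in an <add> request before posting, as update_author does above. A minimal sketch with a hypothetical work key, assuming the same Element/tostring helpers:

w = withKey('/works/OL15365167W')   # hypothetical key
doc = build_doc(w)
if doc is not None:
    add = Element('add')
    add.append(doc)
    solr_update([tostring(add).encode('utf-8')], debug=True)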
Example n. 37
                write_log(ia, when, "loaded")
                continue

            e1 = build_marc(rec)

            match = False
            seen = set()
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
                    found = True
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if 'type' not in thing:
                            print(thing)
                        if thing.get('error') == 'notfound':
                            found = False
                            break
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' %
                                  (edition_key, thing['location']))
                            edition_key = thing['location']
                    if not found:
                        continue
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, ia)
                        write_log(ia, when, "found match: " + edition_key)
Example n. 38
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        logger.info("processing edition %s", k)
        edition = get_document(k)

        if edition and edition['type']['key'] == '/type/redirect':
            logger.warn("Found redirect to %s", edition['location'])
            edition = withKey(edition['location'])

        if not edition:
            logger.warn("No edition found for key %r. Ignoring...", k)
            continue
        elif edition['type']['key'] != '/type/edition':
            logger.info(
                "%r is a document of type %r. Checking if any work has it as edition in solr...",
                k, edition['type']['key'])
            wkey = solr_select_work(k)
            if wkey:
                logger.info("found %r, updating it...", wkey)
                wkeys.add(wkey)
            logger.warn("Found a document of type %r. Ignoring...",
                        edition['type']['key'])
        else:
            if edition.get("works"):
                wkeys.add(edition["works"][0]['key'])
            else:
                # index the edition as it does not belong to any work
                wkeys.add(k)

    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))

    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = get_document(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)
    if requests:
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True, commitWithin=1000)
    logger.info("END update_keys")
Example n. 39
                works = find_works(akey,
                                   get_books(akey, books_query(akey)),
                                   existing=title_redirects)
                updated = update_works(akey, works, do_updates=True)
                works_to_update.update(w['key'] for w in updated)
            for query in i['data']['query']:
                key = query.pop('key')
                process_save(key, query)
        # store.put gets called when any document is updated in the store. Borrowing/Returning a book triggers one.
        elif action == 'store.put':
            # A sample record looks like this:
            # {
            #   "action": "store.put",
            #   "timestamp": "2011-12-01T00:00:44.241604",
            #   "data": {
            #       "data": {"borrowed": "false", "_key": "ebooks/books/OL5854888M", "_rev": "975708", "type": "ebook", "book_key": "/books/OL5854888M"},
            #       "key": "ebooks/books/OL5854888M"
            #   },
            #   "site": "openlibrary.org"
            # }
            data = i.get('data', {}).get("data")
            if data.get("type") == "ebook" and data.get(
                    "_key", "").startswith("ebooks/books/"):
                edition_key = data['book_key']
                process_save(edition_key, withKey(edition_key))
    since_last_update = time() - last_update
    if (len(works_to_update) > work_limit
            or len(authors_to_update) > author_limit
            or since_last_update > time_limit):
        run_update()
Example n. 40
def update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log):
    work_updated = []
    best = w['best_match']['key']
    update = []
    subjects_from_existing_works = defaultdict(set)
    for wkey in w['existing_works']:
        if wkey == best:
            continue
        existing = get_with_retry(wkey)
        for k in 'subjects', 'subject_places', 'subject_times', 'subject_people':
            if existing.get(k):
                subjects_from_existing_works[k].update(existing[k])

        update.append({'type': '/type/redirect', 'location': best, 'key': wkey})
        work_updated.append(wkey)

    for wkey in w['existing_works']:
        editions = set(work_to_edition[wkey])
        editions.update(e['key'] for e in w['editions'])
        for ekey in editions:
            e = get_with_retry(ekey)
            e['works'] = [{'key': best}]
            authors = []
            for akey in e['authors']:
                a = get_with_retry(akey)
                if a['type'] == '/type/redirect':
                    m = re_author_key.match(a['location'])
                    akey = '/authors/' + m.group(1)
                authors.append({'key': str(akey)})
            e['authors'] = authors
            new_toc = fix_toc(e)
            if new_toc:
                e['table_of_contents'] = new_toc
            update.append(e)

    cur_work = w['best_match']
    need_save = fix_up_authors(cur_work, akey, w['editions'])
    if any(subjects_from_existing_works.values()):
        need_save = True
    if need_save or cur_work['title'] != w['title'] \
            or ('subtitle' in w and 'subtitle' not in cur_work) \
            or ('subjects' in w and 'subjects' not in cur_work):
        if cur_work['title'] != w['title']:
            print('update work title:', best, repr(cur_work['title']), '->', repr(w['title']))
        existing_work = get_with_retry(best)
        if existing_work['type'] != '/type/work':
            pprint(existing_work)
        assert existing_work['type'] == '/type/work'
        existing_work['title'] = w['title']
        for k, v in subjects_from_existing_works.items():
            existing_subjects = set(existing_work.get(k, []))
            existing_work.setdefault(k, []).extend(s for s in v if s not in existing_subjects)
        add_detail_to_work(w, existing_work)
        for a in existing_work.get('authors', []):
            obj = withKey(a['author'])
            if obj['type']['key'] != '/type/redirect':
                continue
            new_akey = obj['location']
            a['author'] = {'key': new_akey}
            assert new_akey.startswith('/authors/')
            obj = withKey(new_akey)
            assert obj['type']['key'] == '/type/author'
        print('existing:', existing_work, file=fh_log)
        print('subtitle:', repr(existing_work['subtitle']) if 'subtitle' in existing_work else 'n/a', file=fh_log)
        update.append(existing_work)
        work_updated.append(best)
    if do_updates:
        try:
            print(ol.save_many(update, 'merge works'), file=fh_log)
        except:
            for page in update:
                print(page)
            raise
    return work_updated
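
The update list saved above mixes three kinds of records: redirect stubs for the merged-away works, editions re-pointed at the surviving work, and the surviving work itself. A redirect stub is just a minimal document of /type/redirect (hypothetical keys):

redirect_stub = {
    'key': '/works/OL123W',        # merged-away work
    'type': '/type/redirect',
    'location': '/works/OL456W',   # the surviving best match
}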
Example n. 41
def update_keys(keys, commit=True):
    logger.info("BEGIN update_keys")
    wkeys = set()

    # Get works for all the editions
    ekeys = set(k for k in keys if k.startswith("/books/"))
    for k in ekeys:
        logger.info("processing edition %s", k)
        edition = get_document(k)

        if edition and edition['type']['key'] == '/type/redirect':
            logger.warn("Found redirect to %s", edition['location'])
            edition = withKey(edition['location'])

        if not edition:
            logger.warn("No edition found for key %r. Ignoring...", k)
            continue
        elif edition['type']['key'] != '/type/edition':
            logger.info("%r is a document of type %r. Checking if any work has it as edition in solr...", k, edition['type']['key'])
            wkey = solr_select_work(k)
            if wkey:
                logger.info("found %r, updating it...", wkey)
                wkeys.add(wkey)

            if edition['type']['key'] == '/type/delete':
                logger.info("Found a document of type %r. queuing for deleting it solr..", edition['type']['key'])
                # Also remove if there is any work with that key in solr.
                wkeys.add(k)
            else:
                logger.warn("Found a document of type %r. Ignoring...", edition['type']['key'])
        else:
            if edition.get("works"):
                wkeys.add(edition["works"][0]['key'])
            else:
                # index the edition as it does not belong to any work
                wkeys.add(k)
        
    # Add work keys
    wkeys.update(k for k in keys if k.startswith("/works/"))
    
    # update works
    requests = []
    for k in wkeys:
        logger.info("updating %s", k)
        try:
            w = get_document(k)
            requests += update_work(w, debug=True)
        except:
            logger.error("Failed to update work %s", k, exc_info=True)

    if requests:    
        if commit:
            requests += ['<commit />']
        solr_update(requests, debug=True)

    # update editions
    requests = []
    for k in ekeys:
        try:
            e = withKey(k)
            requests += update_edition(e)
        except:
            logger.error("Failed to update edition %s", k, exc_info=True)
    if requests:
        if commit:
            requests += ['<commit/>']
        solr_update(requests, index="editions", debug=True)
    
    # update authors
    requests = []
    akeys = set(k for k in keys if k.startswith("/authors/"))
    for k in akeys:
        logger.info("updating %s", k)
        try:
            requests += update_author(k)
        except:
            logger.error("Failed to update author %s", k, exc_info=True)

    if requests:  
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="authors", debug=True, commitWithin=1000)

    # update subjects
    skeys = set(k for k in keys if k.startswith("/subjects/"))
    requests = []
    for k in skeys:
        logger.info("updating %s", k)
        try:
            requests += update_subject(k)
        except:
            logger.error("Failed to update subject %s", k, exc_info=True)
    if requests:  
        if commit:
            requests += ['<commit />']
        solr_update(requests, index="subjects", debug=True)

    logger.info("END update_keys")
Example n. 42
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)
    print(akey, file=fh_log)
    print('works:', file=fh_log)
    pprint(works, fh_log)

    while True: # until redirects repaired
        q = {'type':'/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print('redirect found', w['key'], '->', wkey, editions, file=fh_log)
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print('no redirects left', file=fh_log)
            break
        print('save redirects', file=fh_log)
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print(r)
            raise

    all_existing = set()
    work_keys = []
    print('edition_to_work:', file=fh_log)
    print(repr(dict(edition_to_work)), file=fh_log)
    print(file=fh_log)
    print('work_to_edition', file=fh_log)
    print(repr(dict(work_to_edition)), file=fh_log)
    print(file=fh_log)

#    open('edition_to_work', 'w').write(repr(dict(edition_to_work)))
#    open('work_to_edition', 'w').write(repr(dict(work_to_edition)))
#    open('work_by_key', 'w').write(repr(dict(work_by_key)))

    work_title_match = {}
    works_by_title = {}
    for w in works: # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print('bad work:', wkey)
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)

    for w in works: # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)

    for w in works: # 3rd pass
        for wkey, v in w['existing_works'].items():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.items():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()

    for w in works: # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = list(w['existing_works'])[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print(wkey, 'already updated!', file=fh_log)
                    print(wkey, 'already updated!')
                works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
                works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'])
        best_match = max(w['existing_works'].items(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
Example n. 44
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

for num, line in enumerate(open('/1/edward/imagepdf/possible_match2')):
    doc = eval(line)
    if 'publisher' not in doc:
        continue
    item_id = doc['item_id']
    if query({'type':'/type/edition','source_records':'ia:' + item_id}):
        continue
    e = withKey(doc['ol'])
    if 'publishers' not in e:
        continue
    title_match = False
    if doc['title'] == e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title']:
        title_match = True
    elif doc['title'] == e.get('title_prefix', '') + e['title'] + e.get('subtitle', ''):
        title_match = True
    elif doc['title'] == e['title'] + e.get('subtitle', ''):
        title_match = True
    if not title_match:
        continue
    if doc['publisher'] != e['publishers'][0]:
        continue
    print 'match:', item_id, doc['ol']
    add_source_records(doc['ol'], item_id)
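
Each line of possible_match2 is parsed with eval, which is only defensible for a trusted local dump. If the lines are plain Python literals, ast.literal_eval is the safer drop-in:

import ast
doc = ast.literal_eval(line)   # accepts only literal dicts/lists/strings/numbers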

Example n. 45
def update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log):
    work_updated = []
    best = w['best_match']['key']
    update = []
    subjects_from_existing_works = defaultdict(set)
    for wkey in w['existing_works'].iterkeys():
        if wkey == best:
            continue
        existing = get_with_retry(wkey)
        for k in 'subjects', 'subject_places', 'subject_times', 'subject_people':
            if existing.get(k):
                subjects_from_existing_works[k].update(existing[k])

        update.append({'type': '/type/redirect', 'location': best, 'key': wkey})
        work_updated.append(wkey)

    for wkey in w['existing_works'].iterkeys():
        editions = set(work_to_edition[wkey])
        editions.update(e['key'] for e in w['editions'])
        for ekey in editions:
            e = get_with_retry(ekey)
            e['works'] = [{'key': best}]
            authors = []
            for akey in e['authors']:
                a = get_with_retry(akey)
                if a['type'] == '/type/redirect':
                    m = re_author_key.match(a['location'])
                    akey = '/authors/' + m.group(1)
                authors.append({'key': str(akey)})
            e['authors'] = authors
            new_toc = fix_toc(e)
            if new_toc:
                e['table_of_contents'] = new_toc
            update.append(e)

    cur_work = w['best_match']
    need_save = fix_up_authors(cur_work, akey, w['editions'])
    if any(subjects_from_existing_works.values()):
        need_save = True
    if need_save or cur_work['title'] != w['title'] \
            or ('subtitle' in w and 'subtitle' not in cur_work) \
            or ('subjects' in w and 'subjects' not in cur_work):
        if cur_work['title'] != w['title']:
            print 'update work title:', best, repr(cur_work['title']), '->', repr(w['title'])
        existing_work = get_with_retry(best)
        if existing_work['type'] != '/type/work':
            pprint(existing_work)
        assert existing_work['type'] == '/type/work'
        existing_work['title'] = w['title']
        for k, v in subjects_from_existing_works.items():
            existing_subjects = set(existing_work.get(k, []))
            existing_work.setdefault(k, []).extend(s for s in v if s not in existing_subjects)
        add_detail_to_work(w, existing_work)
        for a in existing_work.get('authors', []):
            obj = withKey(a['author'])
            if obj['type']['key'] != '/type/redirect':
                continue
            new_akey = obj['location']
            a['author'] = {'key': new_akey}
            assert new_akey.startswith('/authors/')
            obj = withKey(new_akey)
            assert obj['type']['key'] == '/type/author'
        print >> fh_log, 'existing:', existing_work
        print >> fh_log, 'subtitle:', repr(existing_work['subtitle']) if 'subtitle' in existing_work else 'n/a'
        update.append(existing_work)
        work_updated.append(best)
Example n. 46
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        w['editions'] = list(query_iter(q))
        #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        ia = None
        if 'ocaid' in e:
            ia = e['ocaid']
        elif 'ia_loaded_id' in e:
            loaded = e['ia_loaded_id']
            ia = loaded if isinstance(loaded, basestring) else loaded[0]
        if ia:
            ia_meta_fields = get_ia_collection_and_box_id(ia)
            collection = ia_meta_fields['collection']
            if 'ia_box_id' in e and isinstance(e['ia_box_id'], basestring):
                e['ia_box_id'] = [e['ia_box_id']]
            if ia_meta_fields.get('boxid'):
                box_id = list(ia_meta_fields['boxid'])[0]
                e.setdefault('ia_box_id', [])
                if box_id.lower() not in [x.lower() for x in e['ia_box_id']]:
                    e['ia_box_id'].append(box_id)
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        if 'identifiers' in e:
            for k, id_list in e['identifiers'].iteritems():
                k_orig = k
                k = k.replace('.', '_').replace(',', '_').replace('(', '').replace(')', '').replace(':', '_').replace('/', '').replace('#', '').lower()
                m = re_solr_field.match(k)
                if not m:
                    print (k_orig, k)
                assert m
                for v in id_list:
                    v = v.strip()
                    if v not in identifiers[k]:
                        identifiers[k].append(v)
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))

    #print len(w['editions']), 'editions found'

    #print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a: # OL Web UI bug
            continue # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print w['key']
            print
            raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set( e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'by_statement'
    add_field_list(doc, k, set( e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                v = v.replace('-', '')
                isbn.add(v)
                alt = opposite_isbn(v)
                if alt:
                    isbn.add(alt)
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()

    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], basestring):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(e['ia_loaded_id'][0], basestring)
                except AssertionError:
                    print e.get('ia')
                    print e['ia_loaded_id']
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], basestring):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(e['ia_box_id'][0], basestring)
                except AssertionError:
                    print e['key']
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []):
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e.get('ia_collection', []):
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e.get('ia_collection', []):
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e.get('ia_collection', []))
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e.get('public_scan'):
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))
        
    if lending_edition or in_library_edition:
        add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)

    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)
        
    return doc