Exemple #1
0
    def get_isbns(self, editions):
        isbns = set()

        isbns.update(v.replace("_", "").strip() for e in editions for v in e.get("isbn_10", []))
        isbns.update(v.replace("_", "").strip() for e in editions for v in e.get("isbn_13", []))
        
        # Get the isbn13 when isbn10 is present and vice-versa.
        alt_isbns = [opposite_isbn(v) for v in isbns]
        isbns.update(v for v in alt_isbns if v is not None)
        
        return isbns        
Exemple #2
0
 def isbn_redirect(self, isbn_param):
     isbn = normalize_isbn(isbn_param)
     if not isbn:
         return
     editions = []
     for isbn_len in (10, 13):
         qisbn = isbn if len(isbn) == isbn_len else opposite_isbn(isbn)
         q = {'type': '/type/edition', 'isbn_%d' % isbn_len: qisbn}
         editions += web.ctx.site.things(q)
     if len(editions):
         raise web.seeother(editions[0])
Exemple #3
0
    def get_isbns(self, editions):
        isbns = set()

        isbns.update(
            v.replace("_", "").strip() for e in editions
            for v in e.get("isbn_10", []))
        isbns.update(
            v.replace("_", "").strip() for e in editions
            for v in e.get("isbn_13", []))

        # Get the isbn13 when isbn10 is present and vice-versa.
        alt_isbns = [opposite_isbn(v) for v in isbns]
        isbns.update(v for v in alt_isbns if v is not None)

        return isbns
Exemple #4
0
def test_opposite_isbn():
    assert opposite_isbn('0-940787-08-3') == '9780940787087'
    assert opposite_isbn('978-0-940787-08-7') == '0940787083'
    assert opposite_isbn('BAD-ISBN') is None
Exemple #5
0
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        w['editions'] = list(query_iter(q))
        #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        ia = None
        if 'ocaid' in e:
            ia = e['ocaid']
        elif 'ia_loaded_id' in e:
            loaded = e['ia_loaded_id']
            ia = loaded if isinstance(loaded, basestring) else loaded[0]
        if ia:
            ia_meta_fields = get_ia_collection_and_box_id(ia)
            collection = ia_meta_fields['collection']
            if 'ia_box_id' in e and isinstance(e['ia_box_id'], basestring):
                e['ia_box_id'] = [e['ia_box_id']]
            if ia_meta_fields.get('boxid'):
                box_id = list(ia_meta_fields['boxid'])[0]
                e.setdefault('ia_box_id', [])
                if box_id.lower() not in [x.lower() for x in e['ia_box_id']]:
                    e['ia_box_id'].append(box_id)
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        if 'identifiers' in e:
            for k, id_list in e['identifiers'].iteritems():
                k_orig = k
                k = k.replace('.', '_').replace(',', '_').replace('(', '').replace(')', '').replace(':', '_').replace('/', '').replace('#', '').lower()
                m = re_solr_field.match(k)
                if not m:
                    print (k_orig, k)
                assert m
                for v in id_list:
                    v = v.strip()
                    if v not in identifiers[k]:
                        identifiers[k].append(v)
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))

    #print len(w['editions']), 'editions found'

    #print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a: # OL Web UI bug
            continue # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print w['key']
            print
            raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set( e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'by_statement'
    add_field_list(doc, k, set( e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                v = v.replace('-', '')
                isbn.add(v)
                alt = opposite_isbn(v)
                if alt:
                    isbn.add(alt)
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()

    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], basestring):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(e['ia_loaded_id'][0], basestring)
                except AssertionError:
                    print e.get('ia')
                    print e['ia_loaded_id']
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], basestring):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(e['ia_box_id'][0], basestring)
                except AssertionError:
                    print e['key']
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []):
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e.get('ia_collection', []):
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e.get('ia_collection', []):
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e.get('ia_collection', []))
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e.get('public_scan'):
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))
        
    if lending_edition or in_library_edition:
        add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)

    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)
        
    return doc