Example #1
0
def find_works(book_iter, existing={}, do_get_mc=True):

    var = find_works2(book_iter)
    find_works3(var, existing)

    works = find_work_sort(var)

    for work_count, norm, w in works:
        first = sorted(w.items(), reverse=True, key=lambda i: len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = var['books_by_key'][ekey]
                title = b['title']
                titles[title] += 1
        keys = var['work_titles'][norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i: titles[i])
        toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None))
                    for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        # sometimes keys contains duplicates
        editions = [var['books_by_key'][k] for k in set(keys)]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
            norm_subtitle = mk_norm(subtitle)
            if norm_subtitle != norm:
                subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        for k, v in subtitles.iteritems():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        try:
            subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc))
        except:
            print w
            raise
        if subjects:
            w['subjects'] = subjects
        yield w
Example #2
0
def find_works(book_iter, existing={}, do_get_mc=True):

    var = find_works2(book_iter)
    find_works3(var, existing)

    works = find_work_sort(var)

    for work_count, norm, w in works:
        first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = var['books_by_key'][ekey]
                title = b['title']
                titles[title] += 1
        keys = var['work_titles'][norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i:titles[i])
        toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None)) for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        # sometimes keys contains duplicates
        editions = [var['books_by_key'][k] for k in set(keys)]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
            norm_subtitle = mk_norm(subtitle)
            if norm_subtitle != norm:
                subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        for k, v in subtitles.iteritems():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        try:
            subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc))
        except:
            print(w)
            raise
        if subjects:
            w['subjects'] = subjects
        yield w
        rec = MarcBinary(data)
        assert read_subjects(rec) == expected


subjects = []
for item, expect in xml_samples:
    filename = os.path.dirname(
        __file__) + '/test_data/xml_input/' + item + '_marc.xml'
    element = etree.parse(filename).getroot()
    if element.tag != record_tag and element[0].tag == record_tag:
        element = element[0]
    rec = MarcXml(element)
    subjects.append(read_subjects(rec))

for item, expect in bin_samples:
    filename = os.path.dirname(__file__) + '/test_data/bin_input/' + item

    data = open(filename).read()
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    rec = MarcBinary(data)
    subjects.append(read_subjects(rec))

all_subjects = defaultdict(lambda: defaultdict(int))
for a in subjects:
    for b, c in a.items():
        for d, e in c.items():
            all_subjects[b][d] += e

print four_types(dict((k, dict(v)) for k, v in all_subjects.items()))
Example #4
0
 def test_four_types_event(self):
     subjects = {'event': {'Party': 1}}
     expect = {'subject': {'Party': 1}}
     assert four_types(subjects) == expect
Example #5
0
 def test_four_types_combine(self):
     subjects = {'subject': {'Science': 2}, 'event': {'Party': 1}}
     expect = {'subject': {'Science': 2, 'Party': 1}}
     assert four_types(subjects) == expect
        if len(data) != int(data[:5]):
            data = data.decode('utf-8').encode('raw_unicode_escape')
        rec = MarcBinary(data)
        assert read_subjects(rec) == expected

subjects = []
for item, expect in xml_samples:
    filename = os.path.dirname(__file__) + '/test_data/xml_input/' + item + '_marc.xml'
    element = etree.parse(filename).getroot()
    if element.tag != record_tag and element[0].tag == record_tag:
        element = element[0]
    rec = MarcXml(element)
    subjects.append(read_subjects(rec))

for item, expect in bin_samples:
    filename = os.path.dirname(__file__) + '/test_data/bin_input/' + item

    data = open(filename).read()
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    rec = MarcBinary(data)
    subjects.append(read_subjects(rec))

all_subjects = defaultdict(lambda: defaultdict(int))
for a in subjects:
    for b, c in a.items():
        for d, e in c.items():
            all_subjects[b][d] += e

print four_types(dict((k, dict(v)) for k, v in all_subjects.items()))
Example #7
0
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        w['editions'] = list(query_iter(q))
        print 'editions:', [e['key'] for e in w['editions']]

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))

    #print len(w['editions']), 'editions found'

    #print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a: # OL Web UI bug
            continue # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print
            raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set( e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set( e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
Example #8
0
def find_works(akey, book_iter, existing={}):
    equiv = defaultdict(int) # normalized title and work title pairs
    norm_titles = defaultdict(int) # frequency of titles
    books_by_key = {}
    books = []
    # normalized work title to regular title
    rev_wt = defaultdict(lambda: defaultdict(int))

    for book in book_iter:
        if 'norm_wt' in book:
            pair = (book['norm_title'], book['norm_wt'])
            equiv[pair] += 1
            rev_wt[book['norm_wt']][book['work_title']] +=1
        norm_titles[book['norm_title']] += 1
        books_by_key[book['key']] = book
        books.append(book)

    title_map = build_work_title_map(equiv, norm_titles)

    for a, b in existing.items():
        norm_a = mk_norm(a)
        norm_b = mk_norm(b)
        rev_wt[norm_b][norm_a] +=1
        title_map[norm_a] = norm_b

    works = defaultdict(lambda: defaultdict(list))
    work_titles = defaultdict(list)
    for b in books:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            work_titles[b['norm_wt']].append(b['key'])
        n = b['norm_title']
        title = b['title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(rev_wt[n])
        works[n][title].append(b['key'])

    works = sorted([(sum(map(len, w.values() + [work_titles[n]])), n, w) for n, w in works.items()])

    for work_count, norm, w in works:
        first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = books_by_key[ekey]
                title = b['title']
                titles[title] += 1
        keys = work_titles[norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i:titles[i])
        toc_iter = ((k, books_by_key[k].get('table_of_contents', None)) for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        editions = [books_by_key[k] for k in keys]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e['subtitle'] or ''
            if subtitle != '':
                with_subtitle_count += 1
            norm_subtitle = mk_norm(subtitle)
            if norm_subtitle != norm:
                subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        for k, v in subtitles.iteritems():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        subjects = four_types(get_work_subjects(w))
        if subjects:
            w['subjects'] = subjects
        yield w