Example 1
def find_title_redirects(akey):
    title_redirects = {}
    for w in get_existing_works(akey):
        try:
            norm_wt = mk_norm(w['title'])
        except:
            print(w['key'])
            raise
        q = {'type':'/type/redirect', 'location': str(w['key']), 'limit': 0}
        try:
            query_iter = ol.query(q)
        except:
            print(q)
            raise
        for r in map(get_first_version, query_iter):
            redirect_history = json.load(urlopen('http://openlibrary.org%s.json?m=history' % r['key']))
            if any(v['author'].endswith('/WorkBot') and v['comment'] == "merge works" for v in redirect_history):
                continue
            #print 'redirect:', r
            if mk_norm(r['title']) == norm_wt:
                continue
            if r['title'] in title_redirects:
                assert title_redirects[r['title']] == w['title']
            #print 'redirect:', r['key'], r['title'], 'work:', w['key'], w['title']
            title_redirects[r['title']] = w['title']
    return title_redirects
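The any(...) check above skips redirects that WorkBot itself left behind when it merged works, so the script does not re-apply its own history. A minimal, self-contained illustration with a made-up history payload (the exact shape of the ?m=history response is assumed here):

redirect_history = [
    {'author': '/people/ImportBot', 'comment': 'import new edition'},
    {'author': '/people/WorkBot', 'comment': 'merge works'},
]
made_by_workbot_merge = any(
    v['author'].endswith('/WorkBot') and v['comment'] == 'merge works'
    for v in redirect_history
)
print(made_by_workbot_merge)  # True, so this redirect would be skipped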
Example 2
def find_matching_work(e):
    norm_title = mk_norm(e['title'])

    seen = set()
    for akey in e['authors']:
        q = {
            'type':'/type/work',
            'authors': {'author': {'key': akey}},
            'limit': 0,
            'title': None,
        }
        t0 = time()
        work_keys = list(ol.query(q))
        t1 = time() - t0
        print('time to find books by author: %.1f seconds' % t1)
        for w in work_keys:
            wkey = w['key']
            if wkey in seen:
                continue
            seen.add(wkey)
            if not w.get('title'):
                continue
            if mk_norm(w['title']) == norm_title:
                assert ol.query({'key': wkey, 'type': None})[0]['type'] == '/type/work'
                return wkey
Example 3
def get_books(akey, query):
    for e in query:
        if not e.get('title', None):
            continue


#        if len(e.get('authors', [])) != 1:
#            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']

        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        if title.strip('. ') in ['Publications', 'Works', 'Report',
                'Letters', 'Calendar', 'Bulletin', 'Plays', 'Sermons', 'Correspondence']:
            continue

        m = re_parens.match(title)
        if m:
            title = m.group(1)

        n = mk_norm(title)

        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }

        lang = e.get('languages', [])
        if lang:
            book['lang'] = [l['key'][3:] for l in lang]

        if e.get('table_of_contents', None):
            if isinstance(e['table_of_contents'][0], six.string_types):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0]['type'] == '/type/text':
                    book['table_of_contents'] = [
                        i['value'] for i in e['table_of_contents']
                    ]

        wt = get_work_title(e)
        if not wt:
            yield book
            continue
        if wt in ('Works', 'Selections'):
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book
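re_parens is not defined in any of these snippets; judging from how m.group(1) replaces the title, it keeps the portion of a title that precedes a trailing parenthetical. A plausible stand-in (an assumption, not the project's actual pattern):

import re

re_parens = re.compile(r'^(.+?)\s*\([^)]*\)\s*$')

m = re_parens.match('Collected poems (large print edition)')
if m:
    print(m.group(1))  # Collected poems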
Example 4
def find_matching_work(e):
    norm_title = mk_norm(e['title'])

    seen = set()
    for akey in e['authors']:
        q = {
            'type': '/type/work',
            'authors': {
                'author': {
                    'key': akey
                }
            },
            'limit': 0,
            'title': None,
        }
        t0 = time()
        work_keys = list(ol.query(q))
        t1 = time() - t0
        print('time to find books by author: %.1f seconds' % t1)
        for w in work_keys:
            wkey = w['key']
            if wkey in seen:
                continue
            seen.add(wkey)
            if not w.get('title'):
                continue
            if mk_norm(w['title']) == norm_title:
                assert ol.query({
                    'key': wkey,
                    'type': None
                })[0]['type'] == '/type/work'
                return wkey
Example 5
def find_title_redirects(akey):
    title_redirects = {}
    for w in get_existing_works(akey):
        try:
            norm_wt = mk_norm(w['title'])
        except:
            print(w['key'])
            raise
        q = {'type': '/type/redirect', 'location': str(w['key']), 'limit': 0}
        try:
            query_iter = ol.query(q)
        except:
            print(q)
            raise
        for r in map(get_first_version, query_iter):
            redirect_history = json.load(
                urlopen('http://openlibrary.org%s.json?m=history' % r['key']))
            if any(v['author'].endswith('/WorkBot')
                   and v['comment'] == "merge works"
                   for v in redirect_history):
                continue
            #print 'redirect:', r
            if mk_norm(r['title']) == norm_wt:
                continue
            if r['title'] in title_redirects:
                assert title_redirects[r['title']] == w['title']
            #print 'redirect:', r['key'], r['title'], 'work:', w['key'], w['title']
            title_redirects[r['title']] = w['title']
    return title_redirects
Example 6
def find_matching_work(e):
    """
    Looks for an existing Work representing the new import edition by
    comparing normalized titles for every work by each author of the current edition.
    Returns the first match found, or None.

    :param dict e: An OL edition suitable for saving, has a key, and has full Authors with keys
                   but has not yet been saved.
    :rtype: None or str
    :return: the matched work key "/works/OL..W" if found
    """

    norm_title = mk_norm(get_title(e))
    seen = set()
    for a in e['authors']:
        q = {'type': '/type/work', 'authors': {'author': {'key': a['key']}}}
        work_keys = list(web.ctx.site.things(q))
        for wkey in work_keys:
            w = web.ctx.site.get(wkey)
            if wkey in seen:
                continue
            seen.add(wkey)
            if not w.get('title'):
                continue
            if mk_norm(w['title']) == norm_title:
                assert w.type.key == '/type/work'
                return wkey
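Per the docstring, the function returns a work key or None; a hypothetical call site (the variable names and follow-up are illustrative, not taken from the source) would link the unsaved edition to the matched work:

wkey = find_matching_work(incoming_edition)
if wkey:
    # attach the existing work instead of creating a new one
    incoming_edition['works'] = [{'key': wkey}]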
Example 7
def get_books(akey):
    for e in books_query(akey):
        if not e.get('title', None):
            continue
        if len(e.get('authors', [])) != 1:
            continue
        if 'works' in e:
            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']

        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        if title.strip('. ') in ['Publications', 'Works', 'Report',
                'Letters', 'Calendar', 'Bulletin', 'Plays', 'Sermons', 'Correspondence']:
            continue

        m = re_parens.match(title)
        if m:
            title = m.group(1)

        n = mk_norm(title)

        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }

        if 'languages' in e:
            book['lang'] = [l['key'][3:] for l in e['languages']]

        if e.get('table_of_contents', None):
            if isinstance(e['table_of_contents'][0], str):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0]['type'] == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in e['table_of_contents']]

        wt = get_work_title(e)
        if not wt:
            yield book
            continue
        if wt in ('Works', 'Selections'):
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book
Example 8
def find_title_redirects(akey):
    title_redirects = {}
    for w in get_existing_works(akey):
        norm_wt = mk_norm(w['title'])
        q = {'type':'/type/redirect', 'location': str(w['key']), 'limit': 500}
        for r in map(get_first_version, ol.query(q)):
            if mk_norm(r['title']) == norm_wt:
                continue
            if r['title'] in title_redirects:
                assert title_redirects[r['title']] == w['title']
            title_redirects[r['title']] = w['title']
    return title_redirects
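How these helpers fit together is not shown in the snippets; a hypothetical driver, using only functions that appear elsewhere on this page (books_query from Example 7, get_books, and find_works as defined in Examples 9-10), might feed the redirect titles in as the existing mapping:

title_redirects = find_title_redirects(akey)
books = get_books(akey, books_query(akey))
for work in find_works(books, existing=title_redirects):
    print(work['title'], len(work['editions']))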
Example 9
def find_works(book_iter, existing={}, do_get_mc=True):

    var = find_works2(book_iter)
    find_works3(var, existing)

    works = find_work_sort(var)

    for work_count, norm, w in works:
        first = sorted(w.items(), reverse=True, key=lambda i: len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = var['books_by_key'][ekey]
                title = b['title']
                titles[title] += 1
        keys = var['work_titles'][norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i: titles[i])
        toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None))
                    for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        # sometimes keys contains duplicates
        editions = [var['books_by_key'][k] for k in set(keys)]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
            norm_subtitle = mk_norm(subtitle)
            if norm_subtitle != norm:
                subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        try:
            subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc))
        except:
            print(w)
            raise
        if subjects:
            w['subjects'] = subjects
        yield w
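The subtitle selection above keeps a subtitle only when it is common enough: it must appear on more than 20% of all editions and on more than 50% of the editions that carry any subtitle at all. A small synthetic check of those thresholds:

edition_count = 10
with_subtitle_count = 4
subtitle_votes = {'a critical edition': 3, 'and other poems': 1}

for subtitle, num in subtitle_votes.items():
    overall = num / edition_count
    ratio = num / with_subtitle_count
    print(subtitle, overall > 0.2 and ratio > 0.5)
# a critical edition True   (0.3 of all editions, 0.75 of subtitled ones)
# and other poems False     (0.1 of all editions, 0.25 of subtitled ones)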
Example 10
def find_works(book_iter, existing={}, do_get_mc=True):

    var = find_works2(book_iter)
    find_works3(var, existing)

    works = find_work_sort(var)

    for work_count, norm, w in works:
        first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = var['books_by_key'][ekey]
                title = b['title']
                titles[title] += 1
        keys = var['work_titles'][norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i:titles[i])
        toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None)) for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        # sometimes keys contains duplicates
        editions = [var['books_by_key'][k] for k in set(keys)]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
            norm_subtitle = mk_norm(subtitle)
            if norm_subtitle != norm:
                subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        try:
            subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc))
        except:
            print(w)
            raise
        if subjects:
            w['subjects'] = subjects
        yield w
Example 11
def find_matching_work(e):
    norm_title = mk_norm(get_title(e))

    seen = set()
    for a in e['authors']:
        q = {
            'type': '/type/work',
            'authors.author': a['key'],
        }
        work_keys = list(web.ctx.site.things(q))
        for wkey in work_keys:
            w = web.ctx.site.get(wkey)
            if wkey in seen:
                continue
            seen.add(wkey)
            if not w.get('title'):
                continue
            if mk_norm(w['title']) == norm_title:
                assert w.type.key == '/type/work'
                return wkey
Example 12
def find_works3(var, existing={}):
    title_map = build_work_title_map(var['equiv'], var['norm_titles'])

    for a, b in existing.items():
        norm_a = mk_norm(a)
        norm_b = mk_norm(b)
        var['rev_wt'][norm_b][norm_a] += 1
        title_map[norm_a] = norm_b

    var['works'] = defaultdict(lambda: defaultdict(list))
    var['work_titles'] = defaultdict(list)
    for b in var['books']:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            var['work_titles'][b['norm_wt']].append(b['key'])
        n = b['norm_title']
        title = b['title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(var['rev_wt'][n])
        var['works'][n][title].append(b['key'])
Example 13
def find_works3(var, existing={}):
    title_map = build_work_title_map(var['equiv'], var['norm_titles'])

    for a, b in existing.items():
        norm_a = mk_norm(a)
        norm_b = mk_norm(b)
        var['rev_wt'][norm_b][norm_a] += 1
        title_map[norm_a] = norm_b

    var['works'] = defaultdict(lambda: defaultdict(list))
    var['work_titles'] = defaultdict(list)
    for b in var['books']:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            var['work_titles'][b['norm_wt']].append(b['key'])
        n = b['norm_title']
        title = b['title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(var['rev_wt'][n])
        var['works'][n][title].append(b['key'])
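find_works2 is not among these snippets, but the fields that find_works3 and find_works read and write imply roughly this shape for var (a sketch inferred from usage, not the actual constructor):

from collections import defaultdict

var = {
    'equiv': defaultdict(int),        # (norm_title, norm_work_title) pair counts
    'norm_titles': defaultdict(int),  # frequency of each normalized title
    'rev_wt': defaultdict(lambda: defaultdict(int)),  # norm work title -> raw work title counts
    'books': [],                      # book dicts produced by get_books
    'books_by_key': {},               # edition key -> book dict
    # find_works3 then adds 'works' and 'work_titles'
}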
Example 14
def find_matching_work(e):
    norm_title = mk_norm(get_title(e))

    seen = set()
    for a in e['authors']:
        q = {
            'type':'/type/work',
            'authors.author': a['key'],
        }
        work_keys = list(web.ctx.site.things(q))
        for wkey in work_keys:
            w = web.ctx.site.get(wkey)
            if wkey in seen:
                continue
            seen.add(wkey)
            if not w.get('title'):
                continue
            if mk_norm(w['title']) == norm_title:
                assert w.type.key == '/type/work'
                return wkey
Example 15
def get_books(akey, query):
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            print(e)
#        if len(e.get('authors', [])) != 1:
#            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']

        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        title_and_subtitle = title
        if e.get('subtitle', None):
            title_and_subtitle += ' ' + e['subtitle']
        #if title_and_subtitle in ['Publications', 'Works', 'Report', \
        #        'Letters', 'Calendar', 'Bulletin', 'Plays', \
        #        'Sermons', 'Correspondence', 'Bills']:
        #    continue

        m = re_parens.match(title)
        if m:
            title = m.group(1)

        n = mk_norm(title)

        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }

        lang = e.get('languages', [])
        if lang:
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]

        if e.get('table_of_contents', None):
            if isinstance(e['table_of_contents'][0], str):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in e['table_of_contents']]
        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']

        if 'source_records' in e:
            book['source_records'] = e['source_records']

        wt = get_work_title(e)
        if not wt:
            yield book
            continue
        if wt in bad_titles:
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book
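re_lang_key is also not shown. Example 3 slices language keys with l['key'][3:], which fits the short '/l/eng' form, while newer records use '/languages/eng'; a stand-in pattern covering both (an assumption about the key formats) could be:

import re

re_lang_key = re.compile(r'^/(?:l|languages)/([a-z]{3})$')

print(re_lang_key.match('/languages/eng').group(1))  # eng
print(re_lang_key.match('/l/fre').group(1))          # fre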
Example 16
def find_works(akey, book_iter, existing={}):
    equiv = defaultdict(int) # normalized title and work title pairs
    norm_titles = defaultdict(int) # frequency of titles
    books_by_key = {}
    books = []
    # normalized work title to regular title
    rev_wt = defaultdict(lambda: defaultdict(int))

    print('find_works')

    for book in book_iter:
        if 'norm_wt' in book:
            pair = (book['norm_title'], book['norm_wt'])
            equiv[pair] += 1
            rev_wt[book['norm_wt']][book['work_title']] += 1
        norm_titles[book['norm_title']] += 1
        books_by_key[book['key']] = book
        books.append(book)

    title_map = build_work_title_map(equiv, norm_titles)

    for a, b in existing.items():
        norm_a = mk_norm(a)
        norm_b = mk_norm(b)
        rev_wt[norm_b][norm_a] += 1
        title_map[norm_a] = norm_b

    works = defaultdict(lambda: defaultdict(list))
    work_titles = defaultdict(list)
    for b in books:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            work_titles[b['norm_wt']].append(b['key'])
            continue
        n = b['norm_title']
        title = b['title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(rev_wt[n])
        works[n][title].append(b['key'])

    works = sorted([(sum(map(len, list(w.values()) + [work_titles[n]])), n, w) for n, w in works.items()])

    for work_count, norm, w in works:
        first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = books_by_key[ekey]
                title = b['title']
                titles[title] += 1
        keys = work_titles[norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i:titles[i])
        toc_iter = ((k, books_by_key[k].get('table_of_contents', None)) for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        editions = [books_by_key[k] for k in keys]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
            norm_subtitle = mk_norm(subtitle)
            if norm_subtitle != norm:
                subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        subjects = four_types(find_subjects(get_marc_subjects(w)))
        if subjects:
            w['subjects'] = subjects
        yield w
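freq_dict_top and top_rev_wt are not included either; from how they are called on {value: count} dicts (and the parallel max(titles.keys(), key=...) in the same loop), they presumably return the most frequent entry. A minimal stand-in:

def freq_dict_top(counts):
    # return the key with the highest count
    return max(counts, key=counts.get)

print(freq_dict_top({'Hamlet': 5, 'Hamlet.': 2}))  # Hamlet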
Example 17
def cat_title(prefix, title):
    if not prefix:
        return mk_norm(title)
    if prefix[-1] != ' ':
        prefix += ' '
    return mk_norm(prefix + title)
Example 18
def test_mk_norm(title, expected):
    assert mk_norm(title) == expected
Example 19
def get_books(akey, query, do_get_mc=True):
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            print(e)
#        if len(e.get('authors', [])) != 1:
#            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']

        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]

        m = re_parens.match(title)
        if m:
            title = m.group(1)

        n = mk_norm(title)

        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }

        lang = e.get('languages', [])
        if lang:
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]

        if e.get('table_of_contents', None):
            if isinstance(e['table_of_contents'][0], six.string_types):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in e['table_of_contents']]
        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']

        if 'source_records' in e:
            book['source_records'] = e['source_records']

        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if not wt:
            yield book
            continue
        if wt in bad_titles:
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book
Example 20
def test_mk_norm_equality(a, b):
    assert mk_norm(a) == mk_norm(b)
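mk_norm itself is not part of these snippets. To make the intent of the two tests in Examples 18 and 20 concrete, here is a self-contained stand-in (not the real implementation, whose exact rules are assumed): normalization should make cosmetic variants of a title compare equal.

import re

def toy_norm(title):
    t = re.sub(r'[^\w\s]', ' ', title.lower())  # drop punctuation
    t = re.sub(r'^(the|a|an)\s+', '', t)        # drop a leading article
    return re.sub(r'\s+', ' ', t).strip()       # collapse whitespace

assert toy_norm('The Merchant of Venice.') == toy_norm('Merchant of Venice')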
Example 21
def cat_title(prefix, title):
    if not prefix:
        return mk_norm(title)
    if prefix[-1] != ' ':
        prefix += ' '
    return mk_norm(prefix + title)
Example 22
def get_books(akey, query, do_get_mc=True):
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            print(e)


#        if len(e.get('authors', [])) != 1:
#            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']

        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]

        m = re_parens.match(title)
        if m:
            title = m.group(1)

        n = mk_norm(title)

        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }

        lang = e.get('languages', [])
        if lang:
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]

        if e.get('table_of_contents', None):
            if isinstance(e['table_of_contents'][0], str):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [
                        i['value'] for i in e['table_of_contents']
                    ]
        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']

        if 'source_records' in e:
            book['source_records'] = e['source_records']

        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if not wt:
            yield book
            continue
        if wt in bad_titles:
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book