def find_title_redirects(akey):
    """Map redirect titles to the titles of the works they point at, for author *akey*.

    WorkBot-created merge redirects are skipped, as are redirects whose
    normalized title already matches the target work's title.
    """
    mapping = {}
    for work in get_existing_works(akey):
        try:
            work_norm = mk_norm(work['title'])
        except:
            print(work['key'])
            raise
        redirect_q = {'type': '/type/redirect', 'location': str(work['key']), 'limit': 0}
        try:
            redirects = ol.query(redirect_q)
        except:
            print(redirect_q)
            raise
        for redirect in map(get_first_version, redirects):
            history_url = 'http://openlibrary.org%s.json?m=history' % redirect['key']
            history = json.load(urlopen(history_url))
            # Ignore redirects that WorkBot produced while merging works.
            if any(v['author'].endswith('/WorkBot') and v['comment'] == "merge works"
                   for v in history):
                continue
            if mk_norm(redirect['title']) == work_norm:
                continue
            if redirect['title'] in mapping:
                assert mapping[redirect['title']] == work['title']
            mapping[redirect['title']] = work['title']
    return mapping
def find_matching_work(e):
    """Return the key of an existing work whose normalized title matches
    edition *e*, or None if no author has such a work.

    Queries works for every author key in e['authors'] and compares
    normalized titles; the first match is asserted to be a /type/work.
    """
    norm_title = mk_norm(e['title'])
    seen = set()
    for akey in e['authors']:
        q = {
            'type': '/type/work',
            'authors': {'author': {'key': akey}},
            'limit': 0,
            'title': None,
        }
        t0 = time()
        work_keys = list(ol.query(q))
        t1 = time() - t0
        # print() call form is valid on both Python 2 and 3
        # (the original py2 `print` statement breaks under py3).
        print('time to find books by author: %.1f seconds' % t1)
        for w in work_keys:
            wkey = w['key']
            if wkey in seen:
                continue
            seen.add(wkey)
            if not w.get('title'):
                continue
            if mk_norm(w['title']) == norm_title:
                assert ol.query({'key': wkey, 'type': None})[0]['type'] == '/type/work'
                return wkey
def get_books(akey, query):
    """Yield a simplified book dict for each edition in *query* that has a
    usable title; generic catalogue titles (Works, Letters, ...) are skipped.
    """
    generic_titles = ['Publications', 'Works', 'Report',
                      'Letters', 'Calendar', 'Bulletin', 'Plays',
                      'Sermons', 'Correspondence']
    for e in query:
        if not e.get('title', None):
            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if not prefix.endswith(' '):
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        if title.strip('. ') in generic_titles:
            continue
        m = re_parens.match(title)
        if m:
            title = m.group(1)
        book = {
            'title': title,
            'norm_title': mk_norm(title),
            'key': e['key'],
        }
        lang = e.get('languages', [])
        if lang:
            # '/l/eng' -> 'eng'
            book['lang'] = [l['key'][3:] for l in lang]
        toc = e.get('table_of_contents', None)
        if toc:
            if isinstance(toc[0], six.string_types):
                book['table_of_contents'] = toc
            else:
                assert isinstance(toc[0], dict)
                if toc[0]['type'] == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in toc]
        wt = get_work_title(e)
        # Only attach a work title when it exists and is not a generic one.
        if wt and wt not in ('Works', 'Selections'):
            book['work_title'] = wt
            book['norm_wt'] = mk_norm(wt)
        yield book
def find_matching_work(e):
    """Search works by each author of edition *e* for one whose normalized
    title equals the edition's; return its key, or None."""
    target = mk_norm(e['title'])
    checked = set()
    for akey in e['authors']:
        query = {
            'type': '/type/work',
            'authors': {'author': {'key': akey}},
            'limit': 0,
            'title': None,
        }
        start = time()
        candidates = list(ol.query(query))
        print('time to find books by author: %.1f seconds' % (time() - start))
        for cand in candidates:
            wkey = cand['key']
            if wkey in checked:
                continue
            checked.add(wkey)
            cand_title = cand.get('title')
            if not cand_title:
                continue
            if mk_norm(cand_title) == target:
                assert ol.query({'key': wkey, 'type': None})[0]['type'] == '/type/work'
                return wkey
def find_title_redirects(akey):
    """Map redirect titles to the titles of the works they target, for author
    *akey*, skipping redirects that were created by WorkBot merges or whose
    normalized title already matches the work title.
    """
    title_redirects = {}
    for w in get_existing_works(akey):
        try:
            norm_wt = mk_norm(w['title'])
        except:
            # print() form works on Python 2 and 3 (was a py2 print statement).
            print(w['key'])
            raise
        q = {'type': '/type/redirect', 'location': str(w['key']), 'limit': 0}
        try:
            query_iter = ol.query(q)
        except:
            print(q)
            raise
        for r in map(get_first_version, query_iter):
            redirect_history = json.load(
                urlopen('http://openlibrary.org%s.json?m=history' % r['key']))
            # Skip redirects produced by WorkBot "merge works" edits.
            if any(v['author'].endswith('/WorkBot') and v['comment'] == "merge works"
                   for v in redirect_history):
                continue
            if mk_norm(r['title']) == norm_wt:
                continue
            if r['title'] in title_redirects:
                assert title_redirects[r['title']] == w['title']
            title_redirects[r['title']] = w['title']
    return title_redirects
def find_matching_work(e):
    """Look for an existing Work that represents the new import edition.

    Normalized titles are compared across every work by each author of the
    edition; the first match wins.

    :param dict e: an OL edition dict suitable for saving — has a key and
        full Authors with keys, but has not yet been saved.
    :rtype: None or str
    :return: the matching work key "/works/OL..W", or None
    """
    wanted = mk_norm(get_title(e))
    visited = set()
    for author in e['authors']:
        q = {'type': '/type/work', 'authors': {'author': {'key': author['key']}}}
        for wkey in list(web.ctx.site.things(q)):
            # Fetch before the seen-check, mirroring the original access order.
            work = web.ctx.site.get(wkey)
            if wkey in visited:
                continue
            visited.add(wkey)
            if not work.get('title'):
                continue
            if mk_norm(work['title']) == wanted:
                assert work.type.key == '/type/work'
                return wkey
def get_books(akey):
    """Yield simplified book dicts for single-author editions of *akey* that
    are not yet attached to a work.

    Editions with no title, multiple authors, an existing 'works' link, or a
    generic catalogue title (Works, Letters, ...) are skipped.
    """
    for e in books_query(akey):
        if not e.get('title', None):
            continue
        if len(e.get('authors', [])) != 1:
            continue
        if 'works' in e:
            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        if title.strip('. ') in ['Publications', 'Works', 'Report',
                                 'Letters', 'Calendar', 'Bulletin', 'Plays',
                                 'Sermons', 'Correspondence']:
            continue
        m = re_parens.match(title)
        if m:
            title = m.group(1)
        n = mk_norm(title)
        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }
        if 'languages' in e:
            # '/l/eng' -> 'eng'
            book['lang'] = [l['key'][3:] for l in e['languages']]
        if e.get('table_of_contents', None):
            # six.string_types replaces Python-2-only `basestring`,
            # matching the other get_books variants in this file.
            if isinstance(e['table_of_contents'][0], six.string_types):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0]['type'] == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in e['table_of_contents']]
        wt = get_work_title(e)
        if not wt:
            yield book
            continue
        if wt in ('Works', 'Selections'):
            yield book
            continue
        book['work_title'] = wt
        book['norm_wt'] = mk_norm(wt)
        yield book
def find_title_redirects(akey):
    """Build a dict mapping redirect titles to the titles of the works they
    point at, across all existing works of author *akey*."""
    result = {}
    for work in get_existing_works(akey):
        work_norm = mk_norm(work['title'])
        q = {'type': '/type/redirect', 'location': str(work['key']), 'limit': 500}
        for redirect in map(get_first_version, ol.query(q)):
            rtitle = redirect['title']
            # A redirect that normalizes to the work title adds nothing.
            if mk_norm(rtitle) == work_norm:
                continue
            if rtitle in result:
                assert result[rtitle] == work['title']
            result[rtitle] = work['title']
    return result
def find_works(book_iter, existing={}, do_get_mc=True):
    """Group the editions from *book_iter* into works and yield one dict per
    work: {'title', 'editions', optional 'subtitle', 'toc', 'subjects'}.

    `existing` maps known redirect titles to work titles; `do_get_mc`
    controls whether MARC records are fetched for subject extraction.
    """
    var = find_works2(book_iter)
    find_works3(var, existing)
    works = find_work_sort(var)
    for work_count, norm, w in works:
        # Most frequent title variant becomes the work title.
        first = sorted(w.items(), reverse=True, key=lambda i: len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = var['books_by_key'][ekey]
                titles[b['title']] += 1
        keys = var['work_titles'][norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i: titles[i])
        toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None)) for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        # sometimes keys contains duplicates
        editions = [var['books_by_key'][k] for k in set(keys)]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
                norm_subtitle = mk_norm(subtitle)
                if norm_subtitle != norm:
                    subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        # items() instead of Python-2-only iteritems().
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            # Promote a subtitle only if it is common enough overall and
            # dominant among subtitled editions.
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        try:
            subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc))
        except:
            # print() form works on Python 2 and 3 (was a py2 print statement).
            print(w)
            raise
        if subjects:
            w['subjects'] = subjects
        yield w
def find_works(book_iter, existing={}, do_get_mc=True):
    """Group the editions from *book_iter* into works and yield one dict per
    work: {'title', 'editions', optional 'subtitle', 'toc', 'subjects'}.

    `existing` maps known redirect titles to work titles; `do_get_mc`
    controls whether MARC records are fetched for subject extraction.
    """
    var = find_works2(book_iter)
    find_works3(var, existing)
    works = find_work_sort(var)
    for work_count, norm, w in works:
        # Most frequent title variant becomes the work title.
        first = sorted(w.items(), reverse=True, key=lambda i: len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = var['books_by_key'][ekey]
                titles[b['title']] += 1
        keys = var['work_titles'][norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i: titles[i])
        toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None)) for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        # sometimes keys contains duplicates
        editions = [var['books_by_key'][k] for k in set(keys)]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
                norm_subtitle = mk_norm(subtitle)
                if norm_subtitle != norm:
                    subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        # items() instead of Python-2-only iteritems() — this function
        # otherwise already uses py3-style print().
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            # Promote a subtitle only if common overall and dominant among
            # subtitled editions.
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        try:
            subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc))
        except:
            print(w)
            raise
        if subjects:
            w['subjects'] = subjects
        yield w
def find_matching_work(e):
    """Return the key of an existing work with the same normalized title as
    edition *e*, or None when no author has one."""
    wanted = mk_norm(get_title(e))
    seen = set()
    for author in e['authors']:
        q = {
            'type': '/type/work',
            'authors.author': author['key'],
        }
        for wkey in list(web.ctx.site.things(q)):
            # Fetch first, then dedupe — keeps the original access order.
            w = web.ctx.site.get(wkey)
            if wkey in seen:
                continue
            seen.add(wkey)
            title = w.get('title')
            if title and mk_norm(title) == wanted:
                assert w.type.key == '/type/work'
                return wkey
def find_works3(var, existing={}):
    """Bucket books into works by normalized title, applying known
    work-title mappings; results go into var['works'] / var['work_titles']."""
    title_map = build_work_title_map(var['equiv'], var['norm_titles'])
    # Fold externally supplied title redirects into the map.
    for old_title, new_title in existing.items():
        old_norm = mk_norm(old_title)
        new_norm = mk_norm(new_title)
        var['rev_wt'][new_norm][old_norm] += 1
        title_map[old_norm] = new_norm
    var['works'] = defaultdict(lambda: defaultdict(list))
    var['work_titles'] = defaultdict(list)
    for book in var['books']:
        # Non-English books with a work title are tracked separately too.
        if 'norm_wt' in book and 'eng' not in book.get('lang', []):
            var['work_titles'][book['norm_wt']].append(book['key'])
        norm = book['norm_title']
        title = book['title']
        if norm in title_map:
            norm = title_map[norm]
            title = top_rev_wt(var['rev_wt'][norm])
        var['works'][norm][title].append(book['key'])
def find_works3(var, existing={}):
    """Assign each book to a work bucket keyed by normalized title,
    honoring the `existing` redirect-title mapping."""
    title_map = build_work_title_map(var['equiv'], var['norm_titles'])
    for src, dst in existing.items():
        src_norm = mk_norm(src)
        dst_norm = mk_norm(dst)
        var['rev_wt'][dst_norm][src_norm] += 1
        title_map[src_norm] = dst_norm
    works = defaultdict(lambda: defaultdict(list))
    work_titles = defaultdict(list)
    var['works'] = works
    var['work_titles'] = work_titles
    for b in var['books']:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            work_titles[b['norm_wt']].append(b['key'])
        n = b['norm_title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(var['rev_wt'][n])
        else:
            title = b['title']
        works[n][title].append(b['key'])
def find_matching_work(e):
    """Scan every work by each author of edition *e*; return the key of the
    first whose normalized title equals the edition's, else None."""
    target = mk_norm(get_title(e))
    visited = set()
    for a in e['authors']:
        matches = list(web.ctx.site.things({
            'type': '/type/work',
            'authors.author': a['key'],
        }))
        for wkey in matches:
            w = web.ctx.site.get(wkey)
            if wkey in visited:
                continue
            visited.add(wkey)
            if not w.get('title'):
                continue
            if mk_norm(w['title']) == target:
                assert w.type.key == '/type/work'
                return wkey
def get_books(akey, query):
    """Yield a cleaned-up book dict for each edition in *query*, carrying
    over subtitle/source_records and attaching work-title info when the
    work title is usable (i.e. not in `bad_titles`).
    """
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            # Log the malformed record and keep going — deliberate
            # best-effort behavior; print() form works on py2 and py3.
            print(e)
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        m = re_parens.match(title)
        if m:
            title = m.group(1)
        n = mk_norm(title)
        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }
        lang = e.get('languages', [])
        if lang:
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]
        if e.get('table_of_contents', None):
            # six.string_types replaces Python-2-only `basestring`,
            # matching the other get_books variants in this file.
            if isinstance(e['table_of_contents'][0], six.string_types):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in e['table_of_contents']]
        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']
        if 'source_records' in e:
            book['source_records'] = e['source_records']
        wt = get_work_title(e)
        if not wt:
            yield book
            continue
        if wt in bad_titles:
            yield book
            continue
        book['work_title'] = wt
        book['norm_wt'] = mk_norm(wt)
        yield book
def find_works(akey, book_iter, existing={}):
    """Group the editions in *book_iter* into works for author *akey* and
    yield one dict per work: {'title', 'editions', optional 'subtitle',
    'toc', 'subjects'}.

    `existing` maps known redirect titles to work titles.
    """
    equiv = defaultdict(int)        # (normalized title, normalized work title) pair counts
    norm_titles = defaultdict(int)  # frequency of normalized titles
    books_by_key = {}
    books = []
    # normalized work title -> regular work title -> count
    rev_wt = defaultdict(lambda: defaultdict(int))
    # print() form works on Python 2 and 3 (was a py2 print statement).
    print('find_works')
    for book in book_iter:
        if 'norm_wt' in book:
            pair = (book['norm_title'], book['norm_wt'])
            equiv[pair] += 1
            rev_wt[book['norm_wt']][book['work_title']] += 1
        norm_titles[book['norm_title']] += 1
        books_by_key[book['key']] = book
        books.append(book)
    title_map = build_work_title_map(equiv, norm_titles)
    for a, b in existing.items():
        norm_a = mk_norm(a)
        norm_b = mk_norm(b)
        rev_wt[norm_b][norm_a] += 1
        title_map[norm_a] = norm_b
    works = defaultdict(lambda: defaultdict(list))
    work_titles = defaultdict(list)
    for b in books:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            work_titles[b['norm_wt']].append(b['key'])
            continue
        n = b['norm_title']
        title = b['title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(rev_wt[n])
        works[n][title].append(b['key'])
    # list(w.values()) — dict views can't be concatenated with `+` on py3.
    works = sorted((sum(map(len, list(w.values()) + [work_titles[n]])), n, w)
                   for n, w in works.items())
    for work_count, norm, w in works:
        # Most frequent title variant becomes the work title.
        first = sorted(w.items(), reverse=True, key=lambda i: len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                titles[books_by_key[ekey]['title']] += 1
        keys = work_titles[norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i: titles[i])
        toc_iter = ((k, books_by_key[k].get('table_of_contents', None)) for k in keys)
        toc = dict((k, v) for k, v in toc_iter if v)
        editions = [books_by_key[k] for k in keys]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            # .get() — not every book dict carries a 'subtitle' key
            # (matches the other find_works variants in this file).
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
                norm_subtitle = mk_norm(subtitle)
                if norm_subtitle != norm:
                    subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        # items() instead of Python-2-only iteritems().
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            # Promote a subtitle only if common overall and dominant
            # among subtitled editions.
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        subjects = four_types(find_subjects(get_marc_subjects(w)))
        if subjects:
            w['subjects'] = subjects
        yield w
def cat_title(prefix, title):
    """Normalize *title*, prepending *prefix* (space-separated) when given."""
    if prefix:
        joined = prefix if prefix.endswith(' ') else prefix + ' '
        return mk_norm(joined + title)
    return mk_norm(title)
def test_mk_norm(title, expected):
    """Check that mk_norm produces the expected normalized form."""
    actual = mk_norm(title)
    assert actual == expected
def get_books(akey, query, do_get_mc=True):
    """Yield a cleaned-up book dict for each edition in *query*.

    When *do_get_mc* is true, a MARC record is fetched per edition to help
    resolve the work title; work-title fields are attached only when the
    title is usable (not in `bad_titles`).
    """
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            # Deliberate best-effort: log the malformed record and continue.
            print(e)
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if not prefix.endswith(' '):
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        m = re_parens.match(title)
        if m:
            title = m.group(1)
        book = {
            'title': title,
            'norm_title': mk_norm(title),
            'key': e['key'],
        }
        lang = e.get('languages', [])
        if lang:
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]
        toc = e.get('table_of_contents', None)
        if toc:
            if isinstance(toc[0], six.string_types):
                book['table_of_contents'] = toc
            else:
                assert isinstance(toc[0], dict)
                if toc[0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in toc]
        for field in ('subtitle', 'source_records'):
            if field in e:
                book[field] = e[field]
        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if wt and wt not in bad_titles:
            book['work_title'] = wt
            book['norm_wt'] = mk_norm(wt)
        yield book
def test_mk_norm_equality(a, b):
    """Assert that both inputs normalize to the same string."""
    norm_a = mk_norm(a)
    norm_b = mk_norm(b)
    assert norm_a == norm_b
def get_books(akey, query, do_get_mc=True):
    """Yield a cleaned-up book dict for each edition in *query*.

    When *do_get_mc* is true, a MARC record is fetched per edition to help
    resolve the work title; work-title fields are attached only when the
    title is usable (not in `bad_titles`).
    """
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            # Deliberate best-effort: log the malformed record and continue.
            # print() form works on Python 2 and 3 (was a py2 print statement).
            print(e)
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]
        m = re_parens.match(title)
        if m:
            title = m.group(1)
        n = mk_norm(title)
        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }
        lang = e.get('languages', [])
        if lang:
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]
        if e.get('table_of_contents', None):
            # six.string_types replaces Python-2-only `basestring`,
            # matching the other get_books variants in this file.
            if isinstance(e['table_of_contents'][0], six.string_types):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [
                        i['value'] for i in e['table_of_contents']
                    ]
        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']
        if 'source_records' in e:
            book['source_records'] = e['source_records']
        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if not wt:
            yield book
            continue
        if wt in bad_titles:
            yield book
            continue
        book['work_title'] = wt
        book['norm_wt'] = mk_norm(wt)
        yield book