def POST(self):
    i = web.input(key=[], master=None, merge_key=[])
    keys = uniq(i.key)
    selected = uniq(i.merge_key)

    # filter bad keys
    keys = self.filter_authors(keys)

    # it doesn't make sense to merge the master with itself
    if i.master in selected:
        selected.remove(i.master)

    formdata = web.storage(master=i.master, selected=selected)

    if not i.master or len(selected) == 0:
        return render_template(
            "merge/authors",
            keys,
            top_books_from_author=top_books_from_author,
            formdata=formdata,
        )
    else:
        # Redirect to the master. The master page displays a progress bar
        # and calls merge_authors_json to trigger the merge.
        raise web.seeother(
            "/authors/" + i.master + "/-/?merge=true&duplicates=" + ",".join(selected)
        )
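# The redirect above lands on the master author's page with query parameters
# that the client-side progress bar reads; e.g. (hypothetical author keys):
#
#     /authors/OL1A/-/?merge=true&duplicates=OL2A,OL3A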
def merge_property(self, a, b):
    if isinstance(a, list) and isinstance(b, list):
        return uniq(a + b, key=dicthash)
    elif not a:
        return b
    else:
        return a
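# A sketch (hypothetical values) of how merge_property combines fields during
# a merge, assuming uniq preserves order and dicthash passes non-dict values
# through unchanged, as in openlibrary.utils. Lists are unioned with
# duplicates removed; for scalars the master's value wins unless it is empty:
#
#     engine.merge_property(['OL1A'], ['OL1A', 'OL2A'])
#     # -> ['OL1A', 'OL2A']
#     engine.merge_property(None, 'J. Smith')
#     # -> 'J. Smith'
#     engine.merge_property('John Smith', 'J. Smith')
#     # -> 'John Smith'  (master value takes precedence)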
def GET(self):
    i = web.input(key=[])
    keys = uniq(i.key)

    # filter bad keys
    keys = self.filter_authors(keys)

    return render_template(
        'merge/authors', keys, top_books_from_author=top_books_from_author
    )
def get_ocaid(item):
    # Circular import otherwise
    from ..book_providers import is_non_ia_ocaid

    possible_fields = [
        'ocaid',  # In editions
        'identifier',  # In ?? not editions/works/solr
        'ia',  # In solr work records and worksearch get_docs
        'lending_identifier',  # In solr works records + worksearch get_doc
    ]

    # SOLR WORK RECORDS ONLY:
    # Open Library only has access to a list of archive.org IDs, and solr
    # isn't currently equipped with the information necessary to determine
    # which editions may be openly available. Using the public domain date
    # as a heuristic. The long-term solution is a full reindex, but this
    # hack will work in the vast majority of cases for now.
    # NOTE: there is still a risk pre-1923 books will get a print-disabled-only
    # or lendable edition.
    # Note: guaranteed to be int-able if not None
    US_PD_YEAR = 1923
    if float(item.get('first_publish_year') or '-inf') > US_PD_YEAR:
        # Prefer `lending_identifier` over `ia` (push `ia` to bottom)
        possible_fields.remove('ia')
        possible_fields.append('ia')

    ocaids = []
    for field in possible_fields:
        if item.get(field):
            ocaids += item[field] if isinstance(item[field], list) else [item[field]]
    ocaids = uniq(ocaids)
    return next((ocaid for ocaid in ocaids if not is_non_ia_ocaid(ocaid)), None)
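# A minimal sketch (hypothetical solr work record) of the field-priority
# behaviour above, assuming neither identifier is a non-IA ocaid: for a
# post-1923 work, `lending_identifier` is preferred and `ia` is demoted to
# the end of the candidate list.
#
#     doc = {
#         'first_publish_year': 1999,
#         'ia': ['somescan00unkn'],
#         'lending_identifier': 'lendablescan00auth',
#     }
#     get_ocaid(doc)  # -> 'lendablescan00auth'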
def merge_docs(self, master, dup):
    # avoid merging other types
    if dup['type']['key'] == '/type/author':
        master = BasicMergeEngine.merge_docs(self, master, dup)
        if dup.get('name') and not name_eq(dup['name'], master.get('name') or ''):
            master.setdefault('alternate_names', []).append(dup['name'])
        if 'alternate_names' in master:
            master['alternate_names'] = uniq(
                master['alternate_names'], key=space_squash_and_strip
            )
    return master
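# A sketch (hypothetical author records) of the name handling above, assuming
# BasicMergeEngine.merge_docs merges field-by-field and name_eq compares
# whitespace-squashed names: a duplicate's differing name is preserved as an
# alternate name, and alternate_names is then de-duplicated.
#
#     master = {'type': {'key': '/type/author'}, 'name': 'J. R. R. Tolkien'}
#     dup = {'type': {'key': '/type/author'}, 'name': 'John Ronald Reuel Tolkien'}
#     engine.merge_docs(master, dup)
#     # -> master keeps 'name': 'J. R. R. Tolkien' and gains
#     #    'alternate_names': ['John Ronald Reuel Tolkien']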
def POST(self):
    i = web.input(key=[], master=None, merge_key=[])
    keys = uniq(i.key)
    selected = uniq(i.merge_key)

    # filter bad keys
    keys = self.filter_authors(keys)

    # it doesn't make sense to merge the master with itself
    if i.master in selected:
        selected.remove(i.master)

    formdata = web.storage(master=i.master, selected=selected)

    if not i.master or len(selected) == 0:
        return render_template(
            "merge/authors",
            keys,
            top_books_from_author=top_books_from_author,
            formdata=formdata,
        )
    else:
        # Redirect to the master. The master page displays a progress bar
        # and calls merge_authors_json to trigger the merge.
        master = web.ctx.site.get("/authors/" + i.master)
        raise web.seeother(
            master.url() + "?merge=true&duplicates=" + ",".join(selected)
        )
def convert_doc(self, doc, master, duplicates):
    """Converts references to any of the duplicates in the given doc
    to the master.
    """
    if isinstance(doc, dict):
        # A bare {'key': ...} dict is a reference; repoint it if it targets
        # a duplicate. (Note: comparing doc.keys() to a list only worked on
        # Python 2; list(doc) works on both.)
        if list(doc) == ['key']:
            return {"key": master} if doc['key'] in duplicates else doc
        else:
            return {
                k: self.convert_doc(v, master, duplicates) for k, v in doc.items()
            }
    elif isinstance(doc, list):
        values = [self.convert_doc(v, master, duplicates) for v in doc]
        return uniq(values, key=dicthash)
    else:
        return doc
def update_references(self, doc, master, duplicates):
    """Converts references to any of the duplicates in the given doc
    to the master.

    :param doc: document (or fragment) to process
    :param str master: key of the master record
    :param list of str duplicates: keys of the duplicate records
    :rtype: Any
    """
    if isinstance(doc, dict):
        if list(doc) == ['key']:
            return {"key": master} if doc['key'] in duplicates else doc
        else:
            return {
                k: self.update_references(v, master, duplicates)
                for k, v in doc.items()
            }
    elif isinstance(doc, list):
        values = [self.update_references(v, master, duplicates) for v in doc]
        return uniq(values, key=dicthash)
    else:
        return doc
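# A small sketch (hypothetical keys) of the reference rewrite: any bare
# {'key': ...} dict pointing at a duplicate is repointed at the master,
# recursively, and lists are de-duplicated via dicthash afterwards, so two
# author links collapsing to the same master become a single entry.
#
#     doc = {'authors': [{'author': {'key': '/authors/OL2A'}},
#                        {'author': {'key': '/authors/OL3A'}}]}
#     engine.update_references(doc, '/authors/OL1A',
#                              ['/authors/OL2A', '/authors/OL3A'])
#     # -> {'authors': [{'author': {'key': '/authors/OL1A'}}]}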
def GET(self):
    i = web.input(key=[], merge_key=[])
    keys = uniq(i.key)
    merge_keys = uniq(i.merge_key)
    assert all(k is not None for k in merge_keys)
    if not merge_keys:
        return render_template('merge/editions', keys)

    full_keys = ['/books/' + k for k in merge_keys]
    editions = [web.ctx.site.get('/books/' + k) for k in merge_keys]
    master = None
    for e in editions:
        if e.key == '/books/' + i.master:
            master = e
            break

    # collect every key that has a non-empty value on at least one edition
    all_keys = set()
    for e in editions:
        for k in e.keys():
            if e[k] is not None and e[k] != {}:
                all_keys.add(k)

    merged = {}
    possible_values = defaultdict(lambda: defaultdict(int))

    k = 'publish_date'
    publish_dates = set(e[k] for e in editions if k in e and len(e[k]) != 4)

    k = 'pagination'
    all_pagination = set(e[k] for e in editions if e.get(k))

    one_item_lists = {}
    for k in 'lc_classifications', 'publishers', 'contributions', 'series':
        one_item_lists[k] = set(
            e[k][0].strip('.') for e in editions if e.get(k) and len(set(e[k])) == 1
        )

    for k in ['other_titles', 'isbn_10', 'series']:
        if k not in all_keys:
            continue
        merged[k] = []
        for e in editions:
            for v in e.get(k, []):
                if v not in merged[k]:
                    possible_values[k][v] += 1
                    merged[k].append(v)

    k = 'ocaid'
    for e in editions:
        v = e.get(k)
        if not v:
            continue
        possible_values[k][v] += 1
        if 'ia:' + v not in merged.get('source_records', []):
            merged.setdefault('source_records', []).append(v)

    k = 'identifiers'
    if k in all_keys:
        merged[k] = {}
        for e in editions:
            if k not in e:
                continue
            for a, b in e[k].items():
                for c in b:
                    if c in merged[k].setdefault(a, []):
                        continue
                    merged[k][a].append(c)

    any_publish_country = False
    k = 'publish_country'
    if k in all_keys:
        for e in editions:
            if e.get(k) and not e[k].strip().startswith('xx'):
                any_publish_country = True

    for k in 'source_records', 'ia_box_id':
        merged[k] = []
        for e in editions:
            if e.get(k) and isinstance(e[k], six.string_types):
                e[k] = [e[k]]
            if e.get(k):
                assert isinstance(e[k], list)
            for sr in e.get(k, []):
                if sr not in merged[k]:
                    merged[k].append(sr)

    for k in all_keys:
        if k in ('source_records', 'ia_box_id', 'identifiers', 'ocaid',
                 'other_titles', 'series'):
            continue
        # group editions by normalised value; if they all agree, take the
        # value directly
        uniq_values = defaultdict(list)
        for num, e in enumerate(editions):
            v = e.get(k)
            if v:
                if isinstance(v, list):
                    for lv in v:
                        possible_values[k][lv] += 1
                elif not isinstance(v, dict):
                    possible_values[k][v] += 1
                if (k == 'publish_date' and len(v) == 4 and v.isdigit()
                        and any(v in pd for pd in publish_dates)):
                    continue
                if k == 'pagination' and any(
                        len(i) > len(v) and v in i for i in all_pagination):
                    continue
                if (k in one_item_lists and len(set(e.get(k, []))) == 1
                        and any(len(i) > len(v[0].strip('.'))
                                and v[0].strip('.') in i
                                for i in one_item_lists[k])):
                    continue
                if (k == 'publish_country' and any_publish_country
                        and e.get(k, '').strip().startswith('xx')):
                    continue
                if k == 'edition_name' and v.endswith(' ed edition'):
                    v = v[:-len(' edition')]
                uniq_values[re_nonword.sub('', repr(v).lower())].append(num)
        if len(uniq_values) == 1:
            merged[k] = editions[list(uniq_values.values())[0][0]][k]
            continue

        if k == 'covers':
            assert all(isinstance(e[k], list) for e in editions if k in e)
            covers = set()
            for e in editions:
                if k in e:
                    covers.update(c for c in e[k] if c != -1)
            merged['covers'] = sorted(covers)
            continue

        if k == 'notes':
            merged['notes'] = ''
            for e in editions:
                if e.get('notes'):
                    merged['notes'] += e['notes'] + '\n'
            continue

        if k == 'ocaid':
            for e in editions:
                if e.get('ocaid'):
                    # assert not e['ocaid'].endswith('goog')
                    merged['ocaid'] = e['ocaid']
                    break
            assert merged['ocaid']
            continue

    return render_template(
        'merge/editions2', master, editions, all_keys, merged, possible_values
    )
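# For reference, the normalisation used above to group equivalent values
# collapses case and punctuation; a sketch assuming re_nonword matches
# non-word characters (e.g. re.compile(r'\W', re.UNICODE)):
#
#     re_nonword.sub('', repr('New York;').lower())  # -> 'newyork'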
def load(rec, account_key=None):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title: str
        * source_records: list

    :param dict rec: Edition record to add
    :rtype: dict
    :return: a dict to be converted into a JSON HTTP response,
        same as load_data()
    """
    required_fields = [
        'title',
        'source_records',
    ]  # ['authors', 'publishers', 'publish_date']
    for field in required_fields:
        if not rec.get(field):
            raise RequiredField(field)
    if not isinstance(rec['source_records'], list):
        rec['source_records'] = [rec['source_records']]

    # Split subtitle if required and not already present
    if ':' in rec.get('title') and not rec.get('subtitle'):
        title, subtitle = split_subtitle(rec.get('title'))
        if subtitle:
            rec['title'] = title
            rec['subtitle'] = subtitle

    rec = normalize_record_isbns(rec)

    edition_pool = build_pool(rec)
    # deduplicate authors
    rec['authors'] = uniq(rec.get('authors', []), dicthash)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec, account_key=account_key)

    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec, account_key=account_key)

    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # check for, and resolve, author redirects
    for a in e.authors:
        while is_redirect(a):
            if a in e.authors:
                e.authors.remove(a)
            a = web.ctx.site.get(a.location)
            if not is_redirect(a):
                e.authors.append(a)

    if e.get('works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        w = new_work(e.dict(), rec)
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition
    if 'cover' in rec and not e.get_covers():
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key, account_key=account_key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True

    # Add cover to work, if needed
    if not w.get('covers') and e.get_covers():
        w['covers'] = [e['covers'][0]]
        need_work_save = True

    # Add description to work, if needed
    if not w.get('description') and e.get('description'):
        w['description'] = e['description']
        need_work_save = True

    # Add authors to work, if needed
    if not w.get('authors'):
        authors = [import_author(a) for a in rec.get('authors', [])]
        w['authors'] = [
            {'type': {'key': '/type/author_role'}, 'author': a.key}
            for a in authors
            if a.get('key')
        ]
        if w.get('authors'):
            need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    # Add list fields to edition as needed
    edition_fields = [
        'local_id',
        'lccn',
        'lc_classifications',
        'source_records',
    ]
    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w['key'], 'status': 'matched'},
    }
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(
            edits, comment='import existing book', action='edit-book'
        )
    if 'ocaid' in rec:
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply
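# A hedged usage sketch for load() (hypothetical record and account key;
# assumes web.ctx is bound to an Open Library site, as in the import API
# handlers that call this function):
#
#     rec = {
#         'title': 'A Treatise on Probability',
#         'source_records': ['ia:treatiseonprobab00keyn'],
#         'authors': [{'name': 'John Maynard Keynes'}],
#     }
#     reply = load(rec, account_key='account/import-bot')  # hypothetical key
#     reply['edition']['status']  # 'created', 'matched', or 'modified'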