def test_author_contrib(self):
    """An author on one record matching a contrib on the other still scores as an exact author match."""
    colorado = 'University of Colorado (Boulder campus). Dept. of Psychology.'
    rec1 = {
        'authors': [{'db_name': 'Bruner, Jerome S.', 'name': 'Bruner, Jerome S.'}],
        'full_title': (
            'Contemporary approaches to cognition '
            'a symposium held at the University of Colorado.'
        ),
        'number_of_pages': 210,
        'publish_country': 'xxu',
        'publish_date': '1957',
        'publishers': ['Harvard U.P'],
    }
    rec2 = {
        'authors': [{'db_name': colorado, 'name': colorado}],
        'contribs': [{'db_name': 'Bruner, Jerome S.', 'name': 'Bruner, Jerome S.'}],
        'full_title': (
            'Contemporary approaches to cognition '
            'a symposium held at the University of Colorado'
        ),
        'lccn': ['57012963'],
        'number_of_pages': 210,
        'publish_country': 'mau',
        'publish_date': '1957',
        'publishers': ['Harvard University Press'],
    }
    marc1 = build_marc(rec1)
    marc2 = build_marc(rec2)
    assert compare_authors(marc1, marc2) == ('authors', 'exact match', 125)
    threshold = 875
    assert editions_match(marc1, marc2, threshold) is True
def test_merge2():
    """Amazon and MARC records for the same book should merge above threshold."""
    amazon = {
        'publishers': [u'Collins'],
        'isbn_10': ['0002167530'],
        'number_of_pages': 287,
        'short_title': u'sea birds britain ireland',
        'normalized_title': u'sea birds britain ireland',
        'full_title': u'Sea Birds Britain Ireland',
        'titles': [u'Sea Birds Britain Ireland', u'sea birds britain ireland'],
        'publish_date': u'1975',
        'authors': [{'name': 'Stanley Cramp', 'db_name': 'Cramp, Stanley'}],
    }
    marc = {
        'publisher': [u'Collins'],
        'isbn_10': [u'0002167530'],
        'short_title': u'seabirds of britain and i',
        'normalized_title': u'seabirds of britain and ireland',
        'full_title': u'seabirds of Britain and Ireland',
        'titles': [u'seabirds of Britain and Ireland', u'seabirds of britain and ireland'],
        'publish_date': '1974',
        'authors': [{
            'db_name': u'Cramp, Stanley.',
            'entity_type': 'person',
            'name': u'Cramp, Stanley.',
            'personal_name': u'Cramp, Stanley.',
        }],
        'source_record_loc': 'marc_records_scriblio_net/part08.dat:61449973:855',
    }
    # build_marc() will place all isbn_ types in the 'isbn' field.
    # compare_author_fields() expects all authors to have a db_name
    threshold = 875
    assert attempt_merge(build_marc(amazon), build_marc(marc), threshold, debug=True)
def test_match_low_threshold(self):
    # year is off by < 2 years, counts a little
    # build_marc() will place all isbn_ types in the 'isbn' field.
    rec_a = {
        'publishers': ['Collins'],
        'isbn_10': ['0002167530'],
        'number_of_pages': 287,
        'short_title': 'sea birds britain ireland',
        'normalized_title': 'sea birds britain ireland',
        'full_title': 'Sea Birds Britain Ireland',
        'titles': ['Sea Birds Britain Ireland', 'sea birds britain ireland'],
        'publish_date': '1975',
        'authors': [{'name': 'Stanley Cramp', 'db_name': 'Cramp, Stanley'}],
    }
    rec_b = {
        'publishers': ['Collins'],
        'isbn_10': ['0002167530'],
        'short_title': 'seabirds of britain and i',
        'normalized_title': 'seabirds of britain and ireland',
        'full_title': 'seabirds of Britain and Ireland',
        'titles': [
            'seabirds of Britain and Ireland',
            'seabirds of britain and ireland',
        ],
        'publish_date': '1974',
        'authors': [{
            'db_name': 'Cramp, Stanley.',
            'entity_type': 'person',
            'name': 'Cramp, Stanley.',
            'personal_name': 'Cramp, Stanley.',
        }],
        'source_record_loc': 'marc_records_scriblio_net/part08.dat:61449973:855',
    }
    e1 = build_marc(rec_a)
    e2 = build_marc(rec_b)
    threshold = 515
    # The score sits exactly at 515: match at the threshold, fail one above it.
    assert editions_match(e1, e2, threshold, debug=True)
    assert editions_match(e1, e2, threshold + 1) is False
def try_merge(e1, edition_key, existing):
    """Compare candidate e1 against an existing edition, resolving deleted or
    redirected author records first. Returns True on a sufficient match."""
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'
    rec2 = {'full_title': existing.title}
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    copy_fields = ('isbn', 'isbn_10', 'isbn_13', 'lccn', 'publish_country', 'publishers', 'publish_date')
    for field in copy_fields:
        if existing.get(field):
            rec2[field] = existing[field]
    if existing.authors:
        resolved = []
        for a in existing.authors:
            # Keep resolving until we land on a live, non-redirected record.
            while True:
                author_type = a.type.key
                if author_type == '/type/delete':
                    a = undelete_author(a)
                elif author_type == '/type/redirect':
                    a = web.ctx.site.get(a.location)
                else:
                    break
            assert author_type == '/type/author'
            assert a['name']
            resolved.append({'name': a['name'], 'db_name': db_name(a)})
        rec2['authors'] = resolved
    return attempt_merge(e1, build_marc(rec2), threshold, debug=False)
def try_merge(e1, edition_key, existing):
    """
    Compare candidate e1 against an existing edition by converting the
    existing edition into a comparable dict and running a thresholded merge.

    :param e1: output of build_marc() for the import candidate
    :param str edition_key: key of the existing edition (unused here)
    :param Thing existing: existing edition object
    :rtype: bool
    :return: Whether e1 sufficiently matches the existing edition
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'
    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    if existing.lccn:
        rec2['lccn'] = existing.lccn
    rec2['authors'] = [{'name': a.name, 'db_name': db_name(a)} for a in existing.authors]
    if existing.publishers:
        rec2['publishers'] = existing.publishers
    if existing.publish_date:
        # FIX: key was previously 'publisher_date', a typo that build_marc()
        # and the comparison code never read, silently dropping the publish
        # date from the match score.
        rec2['publish_date'] = existing.publish_date
    e2 = build_marc(rec2)
    # Debug output (modernized from Python 2 print statements).
    print()
    print('e1:', e1)
    print('e2:', e2)
    return attempt_merge(e1, e2, threshold, debug=True)
def try_merge(e1, edition_key, existing):
    """
    Converts the existing edition into a comparable dict and performs a
    thresholded comparison to decide whether they are the same.

    :param dict e1:
    :param str edition_key:
    :param Thing existing: Edition object that most likely matches e1,
        the object of edition_key
    :rtype: bool
    :return: Whether e1 is sufficiently the same as the 'existing' edition
    """
    if existing.type.key == '/type/delete':
        return False
    assert existing.type.key == '/type/edition'
    full_title = existing.title
    if existing.subtitle:
        full_title += ' ' + existing.subtitle
    rec2 = {'full_title': full_title}
    for field in (
        'isbn',
        'isbn_10',
        'isbn_13',
        'lccn',
        'publish_country',
        'publishers',
        'publish_date',
    ):
        if existing.get(field):
            rec2[field] = existing[field]
    if existing.authors:
        rec2['authors'] = []
        for author in existing.authors:
            # Follow redirects to the live author record.
            while author.type.key == '/type/redirect':
                author = web.ctx.site.get(author.location)
            if author.type.key == '/type/author':
                assert author['name']
                rec2['authors'].append({
                    'name': author['name'],
                    'db_name': db_name(author),
                })
    return attempt_merge(e1, build_marc(rec2), threshold)
def try_merge(edition, ekey, thing):
    """Decide whether an Amazon edition dict matches an existing edition thing.

    Checks source records first, then the machine-comment (mc) location,
    and finally falls back to a full thresholded MARC comparison.
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    authors = [i['name'] for i in edition['authors']] if 'authors' in edition else []
    a = amazon_merge.build_amazon(edition, authors)
    assert isinstance(asin, str)
    assert thing_type == '/type/edition'
    amazon_src = 'amazon:' + asin
    if 'source_records' in thing:
        if amazon_src in thing['source_records']:
            return True
        return source_records_match(a, thing)
    # No source records on the existing edition; fall back to its mc location.
    mc = get_mc(ekey)
    if mc == amazon_src:
        return True
    if not mc:
        return False
    data = get_from_local(mc)
    e1 = build_marc(fast_parse.read_edition(data))
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
def test_compare_authors_by_statement(self):
    # requires db_name to be present on both records.
    smith = {'name': 'Alistair Smith', 'db_name': 'Alistair Smith'}
    rec1 = {
        'full_title': 'Full Title, required',
        'authors': [smith],
    }
    rec2 = {
        'full_title': 'A different Full Title, only matching authors here.',
        'authors': [{
            'db_name': 'National Gallery (Great Britain)',
            'name': 'National Gallery (Great Britain)',
            'entity_type': 'org',
        }],
        'by_statement': 'Alistair Smith.',
    }
    # The author only matches via rec2's by_statement, not its authors list.
    assert compare_authors(build_marc(rec1), build_marc(rec2)) == ('main', 'exact match', 125)
def test_compare_authors_by_statement():
    # requires db_name to be present on both records.
    rec1 = {
        'full_title': 'Full Title, required',
        'authors': [{'name': 'Alistair Smith', 'db_name': 'Alistair Smith'}],
    }
    rec2 = {
        'full_title': 'A different Full Title, only matching authors here.',
        'authors': [{
            'db_name': u'National Gallery (Great Britain)',
            'name': u'National Gallery (Great Britain)',
            'entity_type': 'org',
        }],
        'by_statement': 'Alistair Smith.',
    }
    marc1 = build_marc(rec1)
    marc2 = build_marc(rec2)
    # This expected result taken from the amazon and merge versions of compare_author,
    # Current merge_marc.compare_authors() does not take by_statement into account.
    assert compare_authors(marc1, marc2) == ('main', 'exact match', 125)
def test_build_marc():
    # used in add_book.load() when trying to find an existing edition match
    edition = {
        'title': 'A test title (parens)',
        # full_title is required, and set by add_book.load()
        'full_title': 'A test full title : subtitle (parens).',
        'source_records': ['ia:test-source'],
    }
    marc = build_marc(edition)
    assert isinstance(marc['titles'], list)
    assert marc['isbn'] == []
    assert marc['normalized_title'] == 'a test full title subtitle (parens)'
    # short_title is the normalized title truncated to 25 characters.
    assert marc['short_title'] == 'a test full title subtitl'
def get_record(key, mc):
    """Fetch MARC data for mc from the archive and build a comparable dict.

    Returns False when the data cannot be parsed as a book edition.
    """
    raw = get_from_archive(mc)
    try:
        edition = fast_parse.read_edition(raw)
    except (fast_parse.SoundRecording, IndexError, AssertionError):
        print(mc)
        print(key)
        return False
    try:
        return marc.build_marc(edition)
    except TypeError:
        print(edition)
        raise
def ia_match(a, ia):
    """Compare an Amazon record against the Internet Archive item named by ia."""
    try:
        loc, ia_rec = get_ia(ia)
    except urllib.error.HTTPError:
        return False
    if ia_rec is None or 'full_title' not in ia_rec:
        return False
    try:
        marc_edition = build_marc(ia_rec)
    except TypeError:
        print(ia_rec)
        raise
    return amazon_merge.attempt_merge(a, marc_edition, threshold, debug=False)
def test_try_merge(mock_site):
    rec = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
        'source_records': ['ia:test_item'],
    }
    # Load the record, then verify it merges with the edition it created.
    ekey = load(rec)['edition']['key']
    existing = mock_site.get(ekey)
    rec['full_title'] = rec['title']
    candidate = build_marc(rec)
    add_db_name(candidate)
    assert try_merge(candidate, ekey, existing) is True
def test_try_merge(mock_site):
    rec = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
    }
    reply = load(rec)
    ekey = reply['edition']['key']
    existing = mock_site.get(ekey)
    # Reconstruct full_title the same way add_book.load() does.
    full_title = rec['title']
    if rec.get('subtitle'):
        full_title += ' ' + rec['subtitle']
    rec['full_title'] = full_title
    candidate = build_marc(rec)
    add_db_name(candidate)
    assert try_merge(candidate, ekey, existing)
def editions_match(candidate, existing):
    """
    Converts the existing edition into a comparable dict and performs a
    thresholded comparison to decide whether they are the same.

    Used by add_book.load() -> add_book.find_match() to check whether two
    editions match.

    :param dict candidate: Output of build_marc(import record candidate)
    :param Thing existing: Edition object to be tested against candidate
    :rtype: bool
    :return: Whether candidate is sufficiently the same as the 'existing' edition
    """
    if existing.type.key == '/type/delete':
        return False
    # FIXME: will fail if existing is a redirect.
    assert existing.type.key == '/type/edition'
    full_title = existing.title
    if existing.subtitle:
        full_title += ' ' + existing.subtitle
    rec2 = {'full_title': full_title}
    copy_fields = (
        'isbn',
        'isbn_10',
        'isbn_13',
        'lccn',
        'publish_country',
        'publishers',
        'publish_date',
    )
    for field in copy_fields:
        if existing.get(field):
            rec2[field] = existing[field]
    if existing.authors:
        resolved = []
        for author in existing.authors:
            # Follow redirects to the live author record before comparing.
            while author.type.key == '/type/redirect':
                author = web.ctx.site.get(author.location)
            if author.type.key == '/type/author':
                assert author['name']
                resolved.append({'name': author['name'], 'db_name': db_name(author)})
        rec2['authors'] = resolved
    return threshold_match(candidate, build_marc(rec2), threshold)
def test_try_merge(mock_site):
    imported = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
    }
    reply = load(imported)
    edition_key = reply['edition']['key']
    edition = mock_site.get(edition_key)
    imported['full_title'] = imported['title']
    if imported.get('subtitle'):
        imported['full_title'] += ' ' + imported['subtitle']
    e1 = build_marc(imported)
    add_db_name(e1)
    # The freshly loaded edition should match its own import record.
    assert try_merge(e1, edition_key, edition)
def try_merge(e1, edition_key, existing):
    """
    Compare candidate e1 against an existing edition by converting the
    existing edition into a comparable dict and running a thresholded merge.

    :param e1: output of build_marc() for the import candidate
    :param str edition_key: key of the existing edition (unused here)
    :param Thing existing: existing edition object
    :rtype: bool
    :return: Whether e1 sufficiently matches the existing edition
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'
    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    if existing.lccn:
        rec2['lccn'] = existing.lccn
    rec2['authors'] = [{'name': a.name, 'db_name': db_name(a)} for a in existing.authors]
    if existing.publishers:
        rec2['publishers'] = existing.publishers
    if existing.publish_date:
        # FIX: key was previously 'publisher_date', a typo that build_marc()
        # and the comparison code never read, silently dropping the publish
        # date from the match score.
        rec2['publish_date'] = existing.publish_date
    e2 = build_marc(rec2)
    # Debug output (modernized from Python 2 print statements).
    print()
    print('e1:', e1)
    print('e2:', e2)
    return attempt_merge(e1, e2, threshold, debug=True)
def load(rec): """Given a record, tries to add/match that edition in the system. Record is a dictionary containing all the metadata of the edition. The following fields are mandatory: * title * source_records """ if not rec.get('title'): raise RequiredField('title') if not rec.get('source_records'): raise RequiredField('source_records') if isinstance(rec['source_records'], basestring): rec['source_records'] = [rec['source_records']] edition_pool = build_pool(rec) if not edition_pool: # No match candidates found, add edition return load_data(rec) #matches = set(item for sublist in edition_pool.values() for item in sublist) #if len(matches) == 1: # return {'success': True, 'edition': {'key': list(matches)[0]}} match = early_exit(rec) if not match: match = find_exact_match(rec, edition_pool) if not match: rec['full_title'] = rec['title'] if rec.get('subtitle'): rec['full_title'] += ' ' + rec['subtitle'] e1 = build_marc(rec) add_db_name(e1) match = find_match(e1, edition_pool) if not match: # No match found, add edition return load_data(rec) # We have an edition match at this point need_work_save = False need_edition_save = False w = None e = web.ctx.site.get(match) if e.works: w = e.works[0].dict() work_created = False else: # Found an edition without a work work_created = True need_work_save = True need_edition_save = True w = { 'type': { 'key': '/type/work' }, 'title': get_title(rec), 'key': web.ctx.site.new_key('/type/work'), } #TODO: add edition covers and author to new work e.works = [{'key': w['key']}] # Add subjects to work, if not already present if 'subjects' in rec: work_subjects = list(w.get('subjects', [])) for s in rec['subjects']: if s not in work_subjects: work_subjects.append(s) need_work_save = True if need_work_save and work_subjects: w['subjects'] = work_subjects # Add cover to edition, and work, if needed if 'cover' in rec and not e.covers: cover_url = rec['cover'] cover_id = add_cover(cover_url, e.key) if cover_id: e['covers'] = [cover_id] 
need_edition_save = True if not w.get('covers'): w['covers'] = [cover_id] need_work_save = True # Add ocaid to edition (str), if needed if 'ocaid' in rec and not e.ocaid: e['ocaid'] = rec['ocaid'] need_edition_save = True # add values to edition lists for f in 'source_records', 'local_id', 'ia_box_id', 'ia_loaded_id': if f not in rec: continue # ensure values is a list values = rec[f] if isinstance(rec[f], list) else [rec[f]] if f in e: # get values from rec that are not currently on the edition to_add = [v for v in values if v not in e[f]] e[f] += to_add else: e[f] = to_add = values if to_add: need_edition_save = True edits = [] reply = { 'success': True, 'edition': { 'key': match, 'status': 'matched' }, 'work': { 'key': w['key'], 'status': 'matched' }, } if need_edition_save: reply['edition']['status'] = 'modified' edits.append(e.dict()) if need_work_save: reply['work']['status'] = 'created' if work_created else 'modified' edits.append(w) if edits: web.ctx.site.save_many(edits, 'import existing book') return reply
def marc_match(a, loc):
    """Compare an Amazon record with the MARC record stored at loc."""
    assert loc
    edition = fast_parse.read_edition(get_from_local(loc))
    marc_edition = build_marc(edition)
    return amazon_merge.attempt_merge(a, marc_edition, threshold, debug=False)
def load(rec, account_key=None):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title: str
        * source_records: list

    :param dict rec: Edition record to add
    :rtype: dict
    :return: a dict to be converted into a JSON HTTP response, same as load_data()
    """
    required_fields = ['title', 'source_records']  # ['authors', 'publishers', 'publish_date']
    for field in required_fields:
        if not rec.get(field):
            raise RequiredField(field)
    if not isinstance(rec['source_records'], list):
        rec['source_records'] = [rec['source_records']]
    # Split subtitle if required and not already present
    if ':' in rec.get('title') and not rec.get('subtitle'):
        title, subtitle = split_subtitle(rec.get('title'))
        if subtitle:
            rec['title'] = title
            rec['subtitle'] = subtitle
    rec = normalize_record_isbns(rec)
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec, account_key=account_key)
    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)
    if not match:
        # Fall back to a fuzzy MARC-style comparison against the pool.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)
    if not match:
        # No match found, add edition
        return load_data(rec, account_key=account_key)
    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # check for, and resolve, author redirects
    for a in e.authors:
        while is_redirect(a):
            if a in e.authors:
                e.authors.remove(a)
            a = web.ctx.site.get(a.location)
            if not is_redirect(a):
                e.authors.append(a)
    if e.get('works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        w = new_work(e.dict(), rec)
        e.works = [{'key': w['key']}]
    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects
    # Add cover to edition
    if 'cover' in rec and not e.get_covers():
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key, account_key=account_key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True
    # Add cover to work, if needed
    if not w.get('covers') and e.get_covers():
        w['covers'] = [e['covers'][0]]
        need_work_save = True
    # Add description to work, if needed
    if not w.get('description') and e.get('description'):
        w['description'] = e['description']
        need_work_save = True
    # Add authors to work, if needed
    if not w.get('authors'):
        authors = [import_author(a) for a in rec.get('authors', [])]
        w['authors'] = [
            {'type': {'key': '/type/author_role'}, 'author': a.key}
            for a in authors
            if a.get('key')
        ]
        if w.get('authors'):
            need_work_save = True
    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True
    # Add list fields to edition as needed
    edition_fields = [
        'local_id',
        'lccn',
        'lc_classifications',
        'source_records',
    ]
    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True
    edits = []
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w['key'], 'status': 'matched'},
    }
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, comment='import existing book', action='edit-book')
    # Keep IA metadata in sync when the record came from an archive item.
    if 'ocaid' in rec:
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply
write_log(ia, when, "error: full_title missing") continue index_fields = make_index_fields(rec) if not index_fields: print("no index_fields") write_log(ia, when, "error: no index fields") continue edition_pool = pool.build(index_fields) if not edition_pool: load(ia, use_binary=use_binary) write_log(ia, when, "loaded") continue e1 = build_marc(rec) match = False seen = set() for k, v in edition_pool.iteritems(): for edition_key in v: if edition_key in seen: continue thing = None found = True while not thing or thing['type']['key'] == '/type/redirect': seen.add(edition_key) thing = withKey(edition_key) assert thing if 'type' not in thing: print(thing)
def load(rec): """Given a record, tries to add/match that edition in the system. Record is a dictionary containing all the metadata of the edition. The following fields are mandatory: * title * source_records """ if not rec.get('title'): raise RequiredField('title') if not rec.get('source_records'): raise RequiredField('source_records') if isinstance(rec['source_records'], basestring): rec['source_records'] = [rec['source_records']] edition_pool = build_pool(rec) if not edition_pool: return load_data(rec) # 'no books in pool, loading' #matches = set(item for sublist in edition_pool.values() for item in sublist) #if len(matches) == 1: # return {'success': True, 'edition': {'key': list(matches)[0]}} match = early_exit(rec) if not match: match = find_exact_match(rec, edition_pool) if not match: rec['full_title'] = rec['title'] if rec.get('subtitle'): rec['full_title'] += ' ' + rec['subtitle'] e1 = build_marc(rec) add_db_name(e1) match = find_match(e1, edition_pool) if not match: # 'match found:', match, rec['ia'] return load_data(rec) need_work_save = False need_edition_save = False w = None e = web.ctx.site.get(match) if e.works: w = e.works[0].dict() work_created = False else: work_created = True need_work_save = True need_edition_save = True w = { 'type': { 'key': '/type/work' }, 'title': get_title(rec), 'key': web.ctx.site.new_key('/type/work'), } e.works = [{'key': w['key']}] reply = { 'success': True, 'edition': { 'key': match, 'status': 'matched' }, 'work': { 'key': w['key'], 'status': 'matched' }, } if not e.get('source_records'): e['source_records'] = [] existing_source_records = set(e['source_records']) for i in rec['source_records']: if i not in existing_source_records: e['source_records'].append(i) need_edition_save = True assert e['source_records'] edits = [] if False and rec.get('authors'): reply['authors'] = [] east = east_in_by_statement(rec) work_authors = list(w.get('authors', [])) edition_authors = list(e.authors) author_in = [import_author(a, 
eastern=east) for a in rec['authors']] for a in author_in: new_author = 'key' not in a add_to_work = False add_to_edition = False if new_author: a['key'] = web.ctx.site.new_key('/type/author') assert isinstance(a, dict) edits.append(a) add_to_work = True add_to_edition = True else: if not any(i['author'] == a for i in work_authors): add_to_work = True if all(i['key'] != a['key'] for i in edition_authors): add_to_edition = True if add_to_work: need_work_save = True work_authors.append({ 'type': { 'key': '/type/author_role' }, 'author': { 'key': a['key'] }, }) if add_to_edition: need_edition_save = True edition_authors.append({'key': a['key']}) reply['authors'].append({ 'key': a['key'], 'name': a['name'], 'status': ('created' if new_author else 'modified'), }) w['authors'] = work_authors e['authors'] = edition_authors if 'subjects' in rec: work_subjects = list(w.get('subjects', [])) for s in rec['subjects']: if s not in work_subjects: work_subjects.append(s) need_work_save = True if need_work_save and work_subjects: w['subjects'] = work_subjects if 'ocaid' in rec: new = 'ia:' + rec['ocaid'] if not e.ocaid: e['ocaid'] = rec['ocaid'] need_edition_save = True if 'cover' in rec and not e.covers: cover_url = rec['cover'] cover_id = add_cover(cover_url, e.key) if cover_id: e['covers'] = [cover_id] need_edition_save = True if not w.get('covers'): w['covers'] = [cover_id] need_work_save = True for f in 'ia_box_id', 'ia_loaded_id': if f not in rec: continue if e.get(f): assert not isinstance(e[f], basestring) assert isinstance(e[f], list) if isinstance(rec[f], basestring): if rec[f] not in e[f]: e[f].append(rec[f]) need_edition_save = True else: assert isinstance(rec[f], list) for x in rec[f]: if x not in e[f]: e[f].append(x) need_edition_save = True if isinstance(rec[f], basestring): e[f] = [rec[f]] need_edition_save = True else: assert isinstance(rec[f], list) e[f] = rec[f] need_edition_save = True assert not isinstance(e[f], basestring) assert isinstance(e[f], list) if 
need_edition_save: reply['edition']['status'] = 'modified' e_dict = e.dict() assert e_dict and isinstance(e_dict, dict) edits.append(e_dict) if need_work_save: reply['work']['status'] = 'created' if work_created else 'modified' edits.append(w) if edits: for i in edits: assert i assert isinstance(i, dict) web.ctx.site.save_many(edits, 'import new book') # update_ia_metadata_for_ol_edition(reply['edition']['key'].split('/')[2]) return reply
def load_part(archive_id, part, start_pos=0):
    """Stream MARC records from an archive part file, merging each into an
    existing edition when possible.

    Yields (loc, data) for every record that could NOT be matched/merged,
    so the caller can load it as a new edition.

    :param str archive_id: archive item identifier
    :param str part: part file name within the archive item
    :param int start_pos: byte offset to resume from
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        # Periodic progress reporting every `chunk` records.
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        # MARC control/identifier/title fields used to build the match pool.
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        except fast_parse.NotBook:
            continue
        if not index_fields or 'title' not in index_fields:
            continue
        print(loc)
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            # No candidates: hand the record back for loading as a new edition.
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # Follow redirects until we reach the real edition.
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' % (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, data
format = rec['physical_format'].lower() if format.startswith('[graphic') or format.startswith('[cartograph'): print item, format index_fields = make_index_fields(rec) if not index_fields: print "no index_fields" continue #print index_fields edition_pool = pool.build(index_fields) if not edition_pool or not any(v for v in edition_pool.itervalues()): print >> new_book, rec continue print item, edition_pool e1 = build_marc(rec) print e1 match = False seen = set() for k, v in edition_pool.iteritems(): for edition_key in v: # edition_key = '/books/' + re_edition_key.match(edition_key).match(1) if edition_key in seen: continue thing = None while not thing or thing['type']['key'] == '/type/redirect': seen.add(edition_key) thing = withKey(edition_key) assert thing if thing['type']['key'] == '/type/redirect':
def load(rec): """Given a record, tries to add/match that edition in the system. Record is a dictionary containing all the metadata of the edition. The following fields are mandatory: * title * source_records """ if not rec.get('title'): raise RequiredField('title') if not rec.get('source_records'): raise RequiredField('source_records') if isinstance(rec['source_records'], basestring): rec['source_records'] = [rec['source_records']] edition_pool = build_pool(rec) if not edition_pool: return load_data(rec) # 'no books in pool, loading' #matches = set(item for sublist in edition_pool.values() for item in sublist) #if len(matches) == 1: # return {'success': True, 'edition': {'key': list(matches)[0]}} match = early_exit(rec) if not match: match = find_exact_match(rec, edition_pool) if not match: rec['full_title'] = rec['title'] if rec.get('subtitle'): rec['full_title'] += ' ' + rec['subtitle'] e1 = build_marc(rec) add_db_name(e1) match = find_match(e1, edition_pool) if not match: # 'match found:', match, rec['ia'] return load_data(rec) need_work_save = False need_edition_save = False w = None e = web.ctx.site.get(match) if e.works: w = e.works[0].dict() work_created = False else: work_created = True need_work_save = True need_edition_save = True w = { 'type': {'key': '/type/work'}, 'title': get_title(rec), 'key': web.ctx.site.new_key('/type/work'), } e.works = [{'key': w['key']}] reply = { 'success': True, 'edition': {'key': match, 'status': 'matched'}, 'work': {'key': w['key'], 'status': 'matched'}, } if not e.get('source_records'): e['source_records'] = [] existing_source_records = set(e['source_records']) for i in rec['source_records']: if i not in existing_source_records: e['source_records'].append(i) need_edition_save = True assert e['source_records'] edits = [] if False and rec.get('authors'): reply['authors'] = [] east = east_in_by_statement(rec) work_authors = list(w.get('authors', [])) edition_authors = list(e.authors) author_in = [import_author(a, 
eastern=east) for a in rec['authors']] for a in author_in: new_author = 'key' not in a add_to_work = False add_to_edition = False if new_author: a['key'] = web.ctx.site.new_key('/type/author') assert isinstance(a, dict) edits.append(a) add_to_work = True add_to_edition = True else: if not any(i['author'] == a for i in work_authors): add_to_work = True if all(i['key'] != a['key'] for i in edition_authors): add_to_edition = True if add_to_work: need_work_save = True work_authors.append({ 'type': {'key': '/type/author_role'}, 'author': {'key': a['key'] }, }) if add_to_edition: need_edition_save = True edition_authors.append({'key': a['key'] }) reply['authors'].append({ 'key': a['key'], 'name': a['name'], 'status': ('created' if new_author else 'modified'), }) w['authors'] = work_authors e['authors'] = edition_authors if 'subjects' in rec: work_subjects = list(w.get('subjects', [])) for s in rec['subjects']: if s not in work_subjects: work_subjects.append(s) need_work_save = True if need_work_save and work_subjects: w['subjects'] = work_subjects if 'ocaid' in rec: new = 'ia:' + rec['ocaid'] if not e.ocaid: e['ocaid'] = rec['ocaid'] need_edition_save = True if 'cover' in rec and not e.covers: cover_url = rec['cover'] cover_id = add_cover(cover_url, e.key) if cover_id: e['covers'] = [cover_id] need_edition_save = True if not w.get('covers'): w['covers'] = [cover_id] need_work_save = True for f in 'ia_box_id', 'ia_loaded_id': if f not in rec: continue if e.get(f): assert not isinstance(e[f], basestring) assert isinstance(e[f], list) if isinstance(rec[f], basestring): if rec[f] not in e[f]: e[f].append(rec[f]) need_edition_save = True else: assert isinstance(rec[f], list) for x in rec[f]: if x not in e[f]: e[f].append(x) need_edition_save = True if isinstance(rec[f], basestring): e[f] = [rec[f]] need_edition_save = True else: assert isinstance(rec[f], list) e[f] = rec[f] need_edition_save = True assert not isinstance(e[f], basestring) assert isinstance(e[f], list) if 
need_edition_save: reply['edition']['status'] = 'modified' e_dict = e.dict() assert e_dict and isinstance(e_dict, dict) edits.append(e_dict) if need_work_save: reply['work']['status'] = 'created' if work_created else 'modified' edits.append(w) if edits: for i in edits: assert i assert isinstance(i, dict) web.ctx.site.save_many(edits, 'import new book') return reply
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:
        * title
        * source_records

    Returns a reply dict with 'success', 'edition' and 'work' status entries;
    raises RequiredField when a mandatory field is missing.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # Normalize source_records to a list (callers may pass a single string).
    if isinstance(rec['source_records'], basestring):
        rec['source_records'] = [rec['source_records']]
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec)
    #matches = set(item for sublist in edition_pool.values() for item in sublist)
    #if len(matches) == 1:
    #    return {'success': True, 'edition': {'key': list(matches)[0]}}

    # Cheap identifier-based match first, then exact-field match,
    # then the fuzzy MARC-based matcher.
    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec)

    # We have an edition match at this point
    need_work_save = False
    need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # NOTE(review): e.works with an empty list would make e.works[0] fail;
    # presumably matched editions always have a non-empty works list — confirm.
    if e.works:
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = True
        need_work_save = True
        need_edition_save = True
        w = {
            'type': {'key': '/type/work'},
            'title': get_title(rec),
            'key': web.ctx.site.new_key('/type/work'),
        }
        #TODO: add edition covers and author to new work
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition, and work, if needed
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True
            if not w.get('covers'):
                w['covers'] = [cover_id]
                need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    # add values to edition lists
    for f in 'source_records', 'local_id', 'ia_box_id', 'ia_loaded_id':
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w['key'], 'status': 'matched'},
    }
    # Only persist objects that actually changed; status strings in the
    # reply mirror what was saved.
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, 'import existing book')
    return reply
def load(rec):
    """Try to match `rec` against an existing edition; import as new otherwise.

    :param dict rec: edition metadata; must contain 'title'.
    :return: reply dict with 'success', 'edition' and 'work' status entries.
    :raises RequiredField: when 'title' is missing.

    Fixes over the previous revision:
      * subject de-duplication now checks the accumulating `work_subjects`
        list (previously it checked `w.subjects`, so a subject repeated
        within rec['subjects'] was appended twice);
      * the edition was saved twice when modified — once directly via
        site.save() and again through save_many(edits); the direct save
        is removed so each object is persisted exactly once.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No candidate editions in the pool: import as a new book.
        return load_data(rec)
    #matches = set(item for sublist in edition_pool.values() for item in sublist)
    #if len(matches) == 1:
    #    return {'success': True, 'edition': {'key': list(matches)[0]}}

    match = find_exact_match(rec, edition_pool)
    if not match:
        # Fall back to the fuzzy MARC-based matcher on the full title.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)
    if not match:
        # No match found: import as a new book.
        return load_data(rec)

    e = web.ctx.site.get(match)
    w = e['works'][0]
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w.key, 'status': 'matched'},
    }
    edits = []
    need_work_save = False
    need_edition_save = False
    if rec.get('authors'):
        reply['authors'] = []
        east = east_in_by_statement(rec)
        work_authors = list(w.authors)
        edition_authors = list(e.authors)
        author_in = [import_author(a, eastern=east) for a in rec['authors']]
        for a in author_in:
            new_author = 'key' not in a
            add_to_work = False
            add_to_edition = False
            if new_author:
                a['key'] = web.ctx.site.new_key('/type/author')
                edits.append(a)
                add_to_work = True
                add_to_edition = True
            else:
                # Existing author: link only if not already on work/edition.
                if not any(i.author.key == a['key'] for i in work_authors):
                    add_to_work = True
                if not any(i.key == a['key'] for i in edition_authors):
                    add_to_edition = True
            if add_to_work:
                need_work_save = True
                work_authors.append({
                    'type': {'key': '/type/author_role'},
                    'author': {'key': a['key']},
                })
            if add_to_edition:
                need_edition_save = True
                edition_authors.append({'key': a['key']})
            reply['authors'].append({
                'key': a['key'],
                'name': a['name'],
                'status': ('created' if new_author else 'modified'),
            })
        w.authors = work_authors
        e.authors = edition_authors

    if 'subjects' in rec:
        work_subjects = list(w.subjects)
        for s in rec['subjects']:
            # Check the accumulating list, not w.subjects, so a subject
            # repeated in rec['subjects'] is only added once.
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save:
            w.subjects = work_subjects

    if need_edition_save:
        reply['edition']['status'] = 'modified'
        # Persisted via save_many below; no direct save here, which
        # previously caused the edition to be saved twice.
        edits.append(e)
    if need_work_save:
        reply['work']['status'] = 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, 'import new book')
    return reply
    #add_source_records(match, ia)
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:
        * title
        * source_records

    Returns a reply dict with 'success', 'edition' and 'work' status entries;
    raises RequiredField when a mandatory field is missing.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # Normalize source_records to a list (callers may pass a single string).
    if isinstance(rec['source_records'], six.string_types):
        rec['source_records'] = [rec['source_records']]
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec)

    # Cheap identifier-based match first, then exact-field match,
    # then the fuzzy MARC-based matcher.
    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec)

    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # e.get('works') guards both a missing and an empty works list.
    if e.get('works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        w = new_work(e.dict(), rec)
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True

    # Add cover to work if needed
    if not w.get('covers') and e.get('covers'):
        w['covers'] = [e['covers'][0]]
        need_work_save = True

    # Add authors to work if needed
    # NOTE(review): assumes import_author() returns objects exposing both
    # .key and .get('key') — verify against import_author's return type.
    if not w.get('authors'):
        authors = [import_author(a) for a in rec.get('authors', [])]
        w['authors'] = [{'type':{'key': '/type/author_role'},
                         'author': a.key} for a in authors if a.get('key')]
        if w.get('authors'):
            need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    edition_fields = [
        'local_id', 'ia_box_id', 'ia_loaded_id', 'source_records']
    # XXX Todos:
    # only consider `source_records` for newly created work
    # or if field originally missing:
    #if work_created and not e.get('source_records'):
    #    edition_fields.append('source_records')

    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w['key'], 'status': 'matched'},
    }
    # Only persist objects that actually changed.
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, 'import existing book')
    if 'ocaid' in rec:
        # Keep the Internet Archive metadata in sync with this OL edition.
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:
        * title
        * source_records

    Returns a reply dict with 'success', 'edition' and 'work' status entries;
    raises RequiredField when a mandatory field is missing.

    Fix over the previous revision: the matched edition was tested with
    hasattr(e, 'works'), which is also true for an edition whose works
    list is empty, making e.works[0] raise IndexError. e.get('works')
    guards both the missing and the empty case (matching the sibling
    implementation of this function).
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # Normalize source_records to a list (callers may pass a single string).
    if isinstance(rec['source_records'], basestring):
        rec['source_records'] = [rec['source_records']]
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec)

    # Cheap identifier-based match first, then exact-field match,
    # then the fuzzy MARC-based matcher.
    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec)

    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # e.get('works') rather than hasattr(): an edition may carry an empty
    # works list, in which case e.works[0] would raise IndexError.
    if e.get('works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        w = new_work(e, rec)
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True

    # Add cover to work if needed
    if not w.get('covers') and e.get('covers'):
        w['covers'] = [e['covers'][0]]
        need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    edition_fields = [
        'local_id',
        'ia_box_id',
        'ia_loaded_id',
        'source_records',
    ]
    # XXX Todos:
    # only consider `source_records` for newly created work
    # or if field originally missing:
    #if work_created and not e.get('source_records'):
    #    edition_fields.append('source_records')

    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w['key'], 'status': 'matched'},
    }
    # Only persist objects that actually changed.
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, 'import existing book')
    if 'ocaid' in rec:
        # Keep the Internet Archive metadata in sync with this OL edition.
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply
def load(rec):
    """Try to match `rec` against an existing edition; import as new otherwise.

    :param dict rec: edition metadata; must contain 'title'.
    :return: reply dict with 'success', 'edition' and 'work' status entries.
    :raises RequiredField: when 'title' is missing.

    Fixes over the previous revision:
      * subject de-duplication now checks the accumulating `work_subjects`
        list (previously it checked `w.subjects`, so a subject repeated
        within rec['subjects'] was appended twice);
      * dropped the unused `aobj` binding on the new-author save.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No candidate editions in the pool: import as a new book.
        return load_data(rec)
    #matches = set(item for sublist in edition_pool.values() for item in sublist)
    #if len(matches) == 1:
    #    return {'success': True, 'edition': {'key': list(matches)[0]}}

    match = find_exact_match(rec, edition_pool)
    if not match:
        # Fall back to the fuzzy MARC-based matcher on the full title.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)
    if not match:
        # No match found: import as a new book.
        return load_data(rec)

    e = web.ctx.site.get(match)
    w = e['works'][0]
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w.key, 'status': 'matched'},
    }
    need_work_save = False
    need_edition_save = False
    if rec.get('authors'):
        reply['authors'] = []
        east = east_in_by_statement(rec)
        work_authors = list(w.authors)
        edition_authors = list(e.authors)
        author_in = [import_author(a, eastern=east) for a in rec['authors']]
        for a in author_in:
            new_author = 'key' not in a
            add_to_work = False
            add_to_edition = False
            if new_author:
                a['key'] = web.ctx.site.new_key('/type/author')
                # Persist the new author immediately; the return value
                # was previously bound to an unused local.
                web.ctx.site.save(a, comment='new author')
                add_to_work = True
                add_to_edition = True
            else:
                # Existing author: link only if not already on work/edition.
                if not any(i.author.key == a['key'] for i in work_authors):
                    add_to_work = True
                if not any(i.key == a['key'] for i in edition_authors):
                    add_to_edition = True
            if add_to_work:
                need_work_save = True
                work_authors.append({
                    'type': {'key': '/type/author_role'},
                    'author': {'key': a['key']},
                })
            if add_to_edition:
                need_edition_save = True
                edition_authors.append({'key': a['key']})
            reply['authors'].append({
                'key': a['key'],
                'name': a['name'],
                'status': ('created' if new_author else 'modified'),
            })
        w.authors = work_authors
        e.authors = edition_authors

    if 'subjects' in rec:
        work_subjects = list(w.subjects)
        for s in rec['subjects']:
            # Check the accumulating list, not w.subjects, so a subject
            # repeated in rec['subjects'] is only added once.
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save:
            w.subjects = work_subjects

    if need_edition_save:
        reply['edition']['status'] = 'modified'
        web.ctx.site.save(e, match, 'update edition')
    if need_work_save:
        reply['work']['status'] = 'modified'
        web.ctx.site.save(w, w.key, 'update work')
    return reply
    #add_source_records(match, ia)
def load_part(archive_id, part, start_pos=0):
    # Stream one MARC file from an archive item, try to merge each record
    # into an existing edition, and yield (loc, data) for records that
    # found no match (i.e. records still needing import).
    #
    # archive_id -- archive item identifier (directory under marc_path)
    # part       -- file name of the MARC part within the item
    # start_pos  -- byte offset to resume from within the file
    print 'load_part:', archive_id, part
    # Module-level counters shared with the progress reporting machinery.
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            # Periodic progress report every `chunk` records.
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            # Record already imported in a previous run.
            continue
        # Control/identifier/title fields needed for pool building.
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print loc
            print fast_parse.get_tag_lines(data, ['245'])
            raise
        except AssertionError:
            print loc
            raise
        except fast_parse.NotBook:
            # Not a book record; skip it.
            continue
        if not index_fields or 'title' not in index_fields:
            continue
        print loc
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            # No candidate editions: hand the raw record back for import.
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # Follow /type/redirect chains until we reach a real thing.
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print 'following redirect %s => %s' % (edition_key, thing['location'])
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    # Merged into an existing edition; record its source.
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            # No edition matched: yield for import as a new record.
            yield loc, data