def try_merge(e1, edition_key, existing):
    """
    Build a comparison record from ``existing`` and compare it against ``e1``.

    :param e1: record handed to attempt_merge() as the first operand
    :param edition_key: not read by this function
    :param existing: object exposing ``type.key``, ``title``, ``subtitle``,
        ``authors``, ``get()`` and item access
    :return: False when ``existing`` is of type '/type/delete', otherwise the
        result of attempt_merge()
    :raises AssertionError: if ``existing`` is neither deleted nor an edition,
        or an author does not resolve to a named '/type/author'
    """
    kind = existing.type.key
    if kind == '/type/delete':
        return False
    assert kind == '/type/edition'

    full_title = existing.title
    if existing.subtitle:
        full_title += ' ' + existing.subtitle
    candidate = {'full_title': full_title}

    copied_fields = (
        'isbn', 'isbn_10', 'isbn_13', 'lccn',
        'publish_country', 'publishers', 'publish_date',
    )
    # Only fields holding a truthy value are carried over.
    candidate.update(
        {name: existing[name] for name in copied_fields if existing.get(name)}
    )

    if existing.authors:
        resolved = []
        candidate['authors'] = resolved
        for author in existing.authors:
            # Keep undeleting / following redirects until the author record
            # is neither deleted nor a redirect.
            while True:
                author_kind = author.type.key
                if author_kind == '/type/delete':
                    author = undelete_author(author)
                elif author_kind == '/type/redirect':
                    author = web.ctx.site.get(author.location)
                else:
                    break
            assert author_kind == '/type/author'
            assert author['name']
            resolved.append({'name': author['name'], 'db_name': db_name(author)})

    # `threshold` is not defined in this function; it is read from the
    # enclosing (module) scope.
    e2 = build_marc(candidate)
    return attempt_merge(e1, e2, threshold, debug=False)
def test_merge():
    """
    Two records sharing the same author entry must compare as an exact author
    match (score 125) and must merge at a threshold of 875.
    """
    bpl_record = {
        'authors': [
            {
                'birth_date': u'1897',
                'db_name': u'Green, Constance McLaughlin 1897-',
                'entity_type': 'person',
                'name': u'Green, Constance McLaughlin',
                'personal_name': u'Green, Constance McLaughlin',
            },
        ],
        'full_title': u'Eli Whitney and the birth of American technology',
        'isbn': [u'188674632X'],
        'normalized_title': u'eli whitney and the birth of american technology',
        'number_of_pages': 215,
        'publish_date': '1956',
        'publishers': [u'HarperCollins', u'[distributed by Talman Pub.]'],
        'short_title': u'eli whitney and the birth',
        'source_record_loc': 'bpl101.mrc:0:1226',
        'titles': [
            u'Eli Whitney and the birth of American technology',
            u'eli whitney and the birth of american technology',
        ],
    }
    lc_record = {
        'authors': [
            {
                'birth_date': u'1897',
                'db_name': u'Green, Constance McLaughlin 1897-',
                'entity_type': 'person',
                'name': u'Green, Constance McLaughlin',
                'personal_name': u'Green, Constance McLaughlin',
            },
        ],
        'full_title': u'Eli Whitney and the birth of American technology.',
        'isbn': [],
        'normalized_title': u'eli whitney and the birth of american technology',
        'number_of_pages': 215,
        'publish_date': '1956',
        'publishers': ['Little, Brown'],
        'short_title': u'eli whitney and the birth',
        'source_record_loc': 'marc_records_scriblio_net/part04.dat:119539872:591',
        'titles': [
            u'Eli Whitney and the birth of American technology.',
            u'eli whitney and the birth of american technology',
        ],
    }

    author_result = compare_authors(bpl_record, lc_record)
    assert author_result == ('authors', 'exact match', 125)

    threshold = 875
    merged = attempt_merge(bpl_record, lc_record, threshold)
    assert merged is True
def try_merge(e1, edition_key, existing):
    """
    Build a comparison record from ``existing`` and compare it against ``e1``,
    printing both records for debugging.

    :param e1: record handed to attempt_merge() as the first operand
    :param edition_key: not read by this function
    :param existing: object exposing ``type.key``, ``title``, ``subtitle``,
        ``lccn``, ``authors``, ``publishers`` and ``publish_date``
    :return: False when ``existing`` is of type '/type/delete', otherwise the
        result of attempt_merge()
    :raises AssertionError: if ``existing`` is neither deleted nor an edition
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    if existing.lccn:
        rec2['lccn'] = existing.lccn
    rec2['authors'] = [
        {'name': a.name, 'db_name': db_name(a)} for a in existing.authors
    ]
    if existing.publishers:
        rec2['publishers'] = existing.publishers
    if existing.publish_date:
        # Fixed: this used to be stored under 'publisher_date', a key no other
        # record in this file uses ('publish_date' everywhere else), so the
        # date was stored where the comparison presumably never looks for it.
        rec2['publish_date'] = existing.publish_date

    e2 = build_marc(rec2)
    # Debug output, written so it parses and prints the same text under both
    # Python 2 and Python 3 (the original used Python 2 print statements).
    print('')
    print('e1: %s' % (e1,))
    print('e2: %s' % (e2,))
    # `threshold` is not defined in this function; it is read from the
    # enclosing (module) scope.
    return attempt_merge(e1, e2, threshold, debug=True)
def try_merge(e1, edition_key, existing):
    """
    Turn the existing edition into a comparable dict and run a thresholded
    comparison against e1 to decide whether both describe the same edition.

    :param dict e1: record compared against the existing edition
    :param str edition_key: key of the existing edition (not read here)
    :param Thing existing: Edition object that most likely matches e1
    :rtype: bool
    :return: Whether e1 is sufficiently the same as the 'existing' edition
    """
    kind = existing.type.key
    if kind == '/type/delete':
        return False
    assert kind == '/type/edition'

    full_title = existing.title
    if existing.subtitle:
        full_title += ' ' + existing.subtitle
    comparable = {'full_title': full_title}

    # Carry over only the fields that hold a truthy value.
    for field in ('isbn', 'isbn_10', 'isbn_13', 'lccn',
                  'publish_country', 'publishers', 'publish_date'):
        if existing.get(field):
            comparable[field] = existing[field]

    if existing.authors:
        authors = []
        comparable['authors'] = authors
        for author in existing.authors:
            # Follow redirects until a non-redirect record is reached.
            while author.type.key == '/type/redirect':
                author = web.ctx.site.get(author.location)
            if author.type.key != '/type/author':
                # Anything that is not an author record is left out.
                continue
            assert author['name']
            authors.append({
                'name': author['name'],
                'db_name': db_name(author),
            })

    # `threshold` is not defined in this function; it is read from the
    # enclosing (module) scope.
    e2 = build_marc(comparable)
    return attempt_merge(e1, e2, threshold)
def test_merge2():
    """
    An Amazon-style record and a MARC-style record, both passed through
    build_marc(), must merge at a threshold of 875.
    """
    amazon_record = {
        'publishers': [u'Collins'],
        'isbn_10': ['0002167530'],
        'number_of_pages': 287,
        'short_title': u'sea birds britain ireland',
        'normalized_title': u'sea birds britain ireland',
        'full_title': u'Sea Birds Britain Ireland',
        'titles': [u'Sea Birds Britain Ireland', u'sea birds britain ireland'],
        'publish_date': u'1975',
        'authors': [{'name': 'Stanley Cramp', 'db_name': 'Cramp, Stanley'}],
    }
    marc_record = {
        # NOTE(review): this key is 'publisher' while the other record uses
        # 'publishers' -- confirm whether that is intentional.
        'publisher': [u'Collins'],
        'isbn_10': [u'0002167530'],
        'short_title': u'seabirds of britain and i',
        'normalized_title': u'seabirds of britain and ireland',
        'full_title': u'seabirds of Britain and Ireland',
        'titles': [
            u'seabirds of Britain and Ireland',
            u'seabirds of britain and ireland',
        ],
        'publish_date': '1974',
        'authors': [
            {
                'db_name': u'Cramp, Stanley.',
                'entity_type': 'person',
                'name': u'Cramp, Stanley.',
                'personal_name': u'Cramp, Stanley.',
            },
        ],
        'source_record_loc': 'marc_records_scriblio_net/part08.dat:61449973:855',
    }
    threshold = 875

    # build_marc() will place all isbn_ types in the 'isbn' field.
    # compare_author_fields() expects all authors to have a db_name
    expanded_amazon = build_marc(amazon_record)
    expanded_marc = build_marc(marc_record)
    assert attempt_merge(expanded_amazon, expanded_marc, threshold, debug=True)
def try_merge(e1, edition_key, existing):
    """
    Express the existing edition as a comparable dict, then apply a
    thresholded comparison to decide whether it is the same edition as e1.

    Used by add_book.load() -> add_book.find_match() to check whether two
    editions match.

    :param dict e1: Output of build_marc(import record candidate)
    :param str edition_key: edition key of existing
    :param Thing existing: Edition object to be tested against e1, the object
        of edition_key
    :rtype: bool
    :return: Whether e1 is sufficiently the same as the 'existing' edition
    """
    existing_type = existing.type.key
    if existing_type == '/type/delete':
        return False
    # FIXME: will fail if existing is a redirect.
    assert existing_type == '/type/edition'

    title = existing.title
    if existing.subtitle:
        title += ' ' + existing.subtitle
    record = {'full_title': title}

    simple_fields = (
        'isbn',
        'isbn_10',
        'isbn_13',
        'lccn',
        'publish_country',
        'publishers',
        'publish_date',
    )
    # Fields without a truthy value on the existing edition are skipped.
    record.update(
        {key: existing[key] for key in simple_fields if existing.get(key)}
    )

    if existing.authors:
        record['authors'] = author_entries = []
        for candidate in existing.authors:
            # Resolve redirect chains to the record they finally point at.
            while candidate.type.key == '/type/redirect':
                candidate = web.ctx.site.get(candidate.location)
            if candidate.type.key == '/type/author':
                assert candidate['name']
                entry = {
                    'name': candidate['name'],
                    'db_name': db_name(candidate),
                }
                author_entries.append(entry)

    # `threshold` is not defined in this function; it is read from the
    # enclosing (module) scope.
    e2 = build_marc(record)
    return attempt_merge(e1, e2, threshold)
def test_author_contrib():
    """
    A record's author that appears only under 'contribs' in the other record
    must still compare as an exact author match (score 125), and the two
    records must merge at a threshold of 875.
    """
    author_only = {
        'authors': [
            {'db_name': u'Bruner, Jerome S.', 'name': u'Bruner, Jerome S.'},
        ],
        'full_title': u'Contemporary approaches to cognition a symposium held at the University of Colorado.',
        'number_of_pages': 210,
        'publish_country': 'xxu',
        'publish_date': '1957',
        'publishers': [u'Harvard U.P'],
    }
    with_contribs = {
        'authors': [
            {
                'db_name': u'University of Colorado (Boulder campus). Dept. of Psychology.',
                'name': u'University of Colorado (Boulder campus). Dept. of Psychology.',
            },
        ],
        'contribs': [
            {'db_name': u'Bruner, Jerome S.', 'name': u'Bruner, Jerome S.'},
        ],
        'full_title': u'Contemporary approaches to cognition a symposium held at the University of Colorado',
        'lccn': ['57012963'],
        'number_of_pages': 210,
        'publish_country': 'mau',
        'publish_date': '1957',
        'publishers': [u'Harvard University Press'],
    }

    e1 = build_marc(author_only)
    e2 = build_marc(with_contribs)

    author_result = compare_authors(e1, e2)
    assert author_result == ('authors', 'exact match', 125)

    threshold = 875
    merged = attempt_merge(e1, e2, threshold)
    assert merged is True
def try_merge(e1, edition_key, existing):
    """
    Build a comparison record from ``existing`` and compare it against ``e1``,
    printing both records for debugging.

    :param e1: record handed to attempt_merge() as the first operand
    :param edition_key: not read by this function
    :param existing: object exposing ``type.key``, ``title``, ``subtitle``,
        ``lccn``, ``authors``, ``publishers`` and ``publish_date``
    :return: False when ``existing`` is of type '/type/delete', otherwise the
        result of attempt_merge()
    :raises AssertionError: if ``existing`` is neither deleted nor an edition
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    if existing.lccn:
        rec2['lccn'] = existing.lccn
    rec2['authors'] = [
        {'name': a.name, 'db_name': db_name(a)} for a in existing.authors
    ]
    if existing.publishers:
        rec2['publishers'] = existing.publishers
    if existing.publish_date:
        # Fixed: this used to be stored under 'publisher_date', a key no other
        # record in this file uses ('publish_date' everywhere else), so the
        # date was stored where the comparison presumably never looks for it.
        rec2['publish_date'] = existing.publish_date

    e2 = build_marc(rec2)
    # Debug output, written so it parses and prints the same text under both
    # Python 2 and Python 3 (the original used Python 2 print statements).
    print('')
    print('e1: %s' % (e1,))
    print('e2: %s' % (e2,))
    # `threshold` is not defined in this function; it is read from the
    # enclosing (module) scope.
    return attempt_merge(e1, e2, threshold, debug=True)