def find_match(e1, edition_pool):
    """Find the first edition in edition_pool that merges with e1.

    :param dict e1: marc-style dict for the edition being matched
    :param dict edition_pool: maps index-field name -> list of edition keys
    :rtype: str|None
    :return: the key of the first edition that try_merge accepts, or None
    """
    seen = set()
    for field, edition_keys in edition_pool.items():  # was iteritems(): py2-only
        for edition_key in edition_keys:
            if edition_key in seen:
                continue
            thing = None
            found = True
            # Follow redirects until we reach a real edition or a dead end.
            while not thing or is_redirect(thing):
                seen.add(edition_key)
                thing = web.ctx.site.get(edition_key)
                if thing is None:
                    found = False
                    break
                if is_redirect(thing):
                    # was a py2 print statement (syntax error under py3)
                    print('following redirect %s => %s' % (edition_key, thing['location']))
                    edition_key = thing['location']
            if not found:
                continue
            if try_merge(e1, edition_key, thing):
                return edition_key
    return None
def find_match(e1, edition_pool):
    """
    Find the best match for e1 in edition_pool and return its key.

    :param dict e1: the new edition we are trying to match
    :param dict edition_pool: maps index-field name -> list of possible edition keys
    :rtype: str|None
    :return: None or the edition key '/books/OL...M' of the best edition match
        for e1 in edition_pool
    """
    seen = set()
    for k, v in edition_pool.items():  # was iteritems(): py2-only
        for edition_key in v:
            if edition_key in seen:
                continue
            thing = None
            found = True
            # Follow redirects until we reach a real edition or a dead end.
            while not thing or is_redirect(thing):
                seen.add(edition_key)
                thing = web.ctx.site.get(edition_key)
                if thing is None:
                    found = False
                    break
                if is_redirect(thing):
                    edition_key = thing['location']
            if not found:
                continue
            if try_merge(e1, edition_key, thing):
                return edition_key
    return None
def test_try_merge(mock_site):
    """A loaded record should merge with a marc dict built from the same data."""
    record = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
        'source_records': ['ia:test_item'],
    }
    edition_key = load(record)['edition']['key']
    existing = mock_site.get(edition_key)
    record['full_title'] = record['title']
    candidate = build_marc(record)
    add_db_name(candidate)
    assert try_merge(candidate, edition_key, existing) is True
def test_try_merge(mock_site):
    """A freshly loaded edition should merge with a marc dict built from its own record."""
    record = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
    }
    edition_key = load(record)['edition']['key']
    existing = mock_site.get(edition_key)
    subtitle = record.get('subtitle')
    record['full_title'] = record['title'] + (' ' + subtitle if subtitle else '')
    candidate = build_marc(record)
    add_db_name(candidate)
    assert try_merge(candidate, edition_key, existing)
def test_try_merge(mock_site):
    """Round-trip check: an edition loaded from a record merges with that record."""
    author = {'name': 'Smith, John', 'birth_date': '1980'}
    rec = {'title': 'Test item', 'lccn': ['123'], 'authors': [author]}
    reply = load(rec)
    key = reply['edition']['key']
    stored = mock_site.get(key)
    full_title = rec['title']
    if rec.get('subtitle'):
        full_title += ' ' + rec['subtitle']
    rec['full_title'] = full_title
    marc = build_marc(rec)
    add_db_name(marc)
    assert try_merge(marc, key, stored)
def find_match(e1, edition_pool):
    """Return the key of the first edition in edition_pool that merges with e1.

    :param dict e1: marc-style dict for the edition being matched
    :param dict edition_pool: maps index-field name -> list of edition keys
    :rtype: str|None
    """
    seen = set()
    for k, v in edition_pool.items():  # was iteritems(): py2-only
        for edition_key in v:
            if edition_key in seen:
                continue
            thing = None
            found = True
            # Follow redirects until we reach a real edition or a dead end.
            while not thing or is_redirect(thing):
                seen.add(edition_key)
                thing = web.ctx.site.get(edition_key)
                if thing is None:
                    found = False
                    break
                if is_redirect(thing):
                    # was a py2 print statement (syntax error under py3)
                    print('following redirect %s => %s' % (edition_key, thing['location']))
                    edition_key = thing['location']
            if not found:
                continue
            if try_merge(e1, edition_key, thing):
                return edition_key
    return None
def load():
    """Scan approved, non-Google IA text records and yield the unmatched ones.

    Yields:
        (loc, ia): marc location and IA identifier for each record that has
        no existing edition (by ocaid) and does not merge with any candidate
        in the edition pool.
    """
    global rec_no, t_prev
    cur.execute(
        "select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'"
    )
    for (ia,) in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            # Progress report every `chunk` records: current rate, overall
            # rate, and estimated hours remaining.
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)
        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        # NOTE(review): urllib2 is py2-only; port to urllib.error.HTTPError
        # along with the file's imports for py3.
        except urllib2.HTTPError:
            continue
        if loc is None:
            continue
        print(loc, rec)
        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue
        edition_pool = pool.build(index_fields)
        print(edition_pool)
        if not edition_pool:
            # No similar editions at all: definitely new.
            yield loc, ia
            continue
        e1 = build_marc(rec)
        match = False
        for k, v in edition_pool.items():  # was iteritems(): py2-only
            # Skip over-broad title matches.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # '\\/' (was invalid escape '\/') un-escapes slashes in keys.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
def test(self):
    """try_merge combines equal adjacent pairs, left to right, in one pass."""
    cases = [
        ([8, 8], [16]),
        ([2, 4], [2, 4]),
        ([2, 2, 4, 8], [4, 4, 8]),
    ]
    for given, expected in cases:
        self.assertEqual(try_merge(given), expected)
def load():
    """Scan approved, non-Google IA text records and yield the unmatched ones.

    Yields:
        (loc, ia): marc location and IA identifier for each record that has
        no existing edition (by ocaid) and does not merge with any candidate
        in the edition pool.
    """
    global rec_no, t_prev
    cur.execute(
        "select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'"
    )
    for (ia,) in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            # Progress report every `chunk` records: current rate, overall
            # rate, and estimated hours remaining.
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)
        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        # NOTE(review): urllib2 is py2-only; port to urllib.error.HTTPError
        # along with the file's imports for py3.
        except urllib2.HTTPError:
            continue
        if loc is None:
            continue
        print(loc, rec)
        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue
        edition_pool = pool.build(index_fields)
        print(edition_pool)
        if not edition_pool:
            # No similar editions at all: definitely new.
            yield loc, ia
            continue
        e1 = build_marc(rec)
        match = False
        for k, v in edition_pool.items():  # was iteritems(): py2-only
            # Skip over-broad title matches.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # '\\/' (was invalid escape '\/') un-escapes slashes in keys.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia