import urllib.error

# get_ia, build_marc, amazon_merge and threshold are assumed to come from the
# surrounding module's setup and imports.


def ia_match(a, ia):
    # Fetch the MARC record for the archive.org item and attempt to merge it
    # with the Amazon record `a`; any HTTP failure counts as "no match".
    try:
        loc, rec = get_ia(ia)
    except urllib.error.HTTPError:
        return False
    if rec is None or 'full_title' not in rec:
        return False
    try:
        e1 = build_marc(rec)
    except TypeError:
        print(rec)
        raise
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
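# Illustrative usage (an assumption, not part of the original module):
# `amazon_rec` would be a parsed Amazon record dict, and 'nybc200715' is an
# archive.org identifier that appears elsewhere in this file.
def demo_ia_match(amazon_rec):
    if ia_match(amazon_rec, 'nybc200715'):
        print('Amazon record matches the archive.org scan')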
def load(): global rec_no, t_prev skipping = False #for ia in ['nybc200715']: #cur.execute("select identifier from metadata where collection=%(c)s", {'c': collection}) cur.execute("select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'") # order by curatedate") for ia, in cur.fetchall(): rec_no += 1 if rec_no % chunk == 0: t = time() - t_prev t_prev = time() t1 = time() - t0 rec_per_sec = chunk / t rec_per_sec_total = rec_no / t1 remaining = total - rec_no sec = remaining / rec_per_sec_total print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ') hours = sec / 3600 print("%6.3f hours" % hours) print(ia) if get_things({'type': '/type/edition', 'ocaid': ia}): print('already loaded') continue try: loc, rec = get_ia(ia) except (KeyboardInterrupt, NameError): raise except urllib2.HTTPError: continue if loc is None: continue print(loc, rec) if not loc.endswith('.xml'): print("not XML") continue if 'full_title' not in rec: print("full_title missing") continue index_fields = make_index_fields(rec) if not index_fields: print("no index_fields") continue edition_pool = pool.build(index_fields) print(edition_pool) if not edition_pool: yield loc, ia continue e1 = build_marc(rec) match = False for k, v in edition_pool.iteritems(): if k == 'title' and len(v) > 50: continue for edition_key in v: if try_merge(e1, edition_key.replace('\/', '/')): match = True break if match: break if not match: yield loc, ia
def load(): global rec_no, t_prev skipping = False #for ia in ['nybc200715']: #cur.execute("select identifier from metadata where collection=%(c)s", {'c': collection}) cur.execute( "select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'" ) # order by curatedate") for ia, in cur.fetchall(): rec_no += 1 if rec_no % chunk == 0: t = time() - t_prev t_prev = time() t1 = time() - t0 rec_per_sec = chunk / t rec_per_sec_total = rec_no / t1 remaining = total - rec_no sec = remaining / rec_per_sec_total print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ') hours = sec / 3600 print("%6.3f hours" % hours) print(ia) if get_things({'type': '/type/edition', 'ocaid': ia}): print('already loaded') continue try: loc, rec = get_ia(ia) except (KeyboardInterrupt, NameError): raise except urllib2.HTTPError: continue if loc is None: continue print(loc, rec) if not loc.endswith('.xml'): print("not XML") continue if 'full_title' not in rec: print("full_title missing") continue index_fields = make_index_fields(rec) if not index_fields: print("no index_fields") continue edition_pool = pool.build(index_fields) print(edition_pool) if not edition_pool: yield loc, ia continue e1 = build_marc(rec) match = False for k, v in edition_pool.iteritems(): if k == 'title' and len(v) > 50: continue for edition_key in v: if try_merge(e1, edition_key.replace('\/', '/')): match = True break if match: break if not match: yield loc, ia
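# Sketch of a driver for load() (assumed; the original consumer is not shown
# here). load() yields (marc_location, identifier) pairs only for scans that
# matched no existing edition, so a caller would import exactly those records.
def import_unmatched():
    for loc, ia in load():
        print('no match found; would import', ia, 'from', loc)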
# Resume a previous run: skip identifiers until the last one processed.
# `iter`, `query` and the other helpers are assumed to come from the
# surrounding module (note that `iter` shadows the builtin).
skip = True
for i in iter:
    ia = i.identifier
    if skip:
        if ia == 'bostoncityhospit12hugh':
            skip = False
        else:
            continue
    print(ia)
    if query({'type': '/type/edition', 'ocaid': ia}):
        print('already loaded')
        continue
    try:
        loc, rec = get_ia(ia)
    except (KeyboardInterrupt, NameError):
        raise
    except urllib.error.HTTPError:
        continue
    if loc is None or rec is None:
        continue
    print(loc, rec)
    if not loc.endswith('.xml'):
        print("not XML")
        continue
    if 'full_title' not in rec:
        print("full_title missing")
        continue
    index_fields = make_index_fields(rec)
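# The skip flag above implements an inclusive resume: processing restarts at
# the marker identifier itself. A minimal equivalent sketch with
# itertools.dropwhile (an alternative formulation, not the original code):
import itertools


def resume_at(items, marker):
    # Yield items starting from the one whose .identifier equals `marker`,
    # matching the inclusive behaviour of the skip flag above.
    return itertools.dropwhile(lambda i: i.identifier != marker, items)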