def get_langs(limit=100):
    """Return the set of all ``/type/language`` keys.

    Pages through ``get_things`` ``limit`` records at a time; a short page
    (fewer than ``limit`` results) marks the end of the data.

    Args:
        limit: page size requested per ``get_things`` call (default 100,
            matching the original hard-coded value).

    Returns:
        set: every language key returned by the backend.
    """
    langs = []
    offset = 0
    while True:
        page = get_things({'type': '/type/language', 'limit': limit, 'offset': offset})
        langs += page
        # A partial page means there is nothing left to fetch.
        if len(page) != limit:
            break
        offset += limit
    return set(langs)
def load():
    """Yield ``(loc, ia)`` pairs for IA scans that need to be loaded.

    Scans approved, non-Google, indexable text items from the ``metadata``
    table, skips those already present as editions, and yields every record
    that either matches no edition pool at all or fails to merge with any
    candidate edition in its pool.

    Reads module globals ``cur``, ``chunk``, ``total``, ``t0`` and reads and
    updates ``rec_no`` / ``t_prev`` for periodic progress reporting.

    Yields:
        tuple(str, str): (MARC location, IA identifier) for unmatched records.
    """
    global rec_no, t_prev
    cur.execute(
        "select identifier from metadata "
        "where scanner is not null and scanner != 'google' "
        "and noindex is null and mediatype='texts' "
        "and curatestate='approved'"
    )
    for ia, in cur.fetchall():
        rec_no += 1
        # Every `chunk` records, report current/overall throughput and ETA.
        if rec_no % chunk == 0:
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)
        print(ia)
        # Skip items that already exist as editions.
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        except urllib2.HTTPError:
            # Best-effort: a fetch failure just skips this record.
            continue
        if loc is None:
            continue
        print(loc, rec)
        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue
        edition_pool = pool.build(index_fields)
        print(edition_pool)
        # No candidates at all: the record is new, yield it.
        if not edition_pool:
            yield loc, ia
            continue
        e1 = build_marc(rec)
        match = False
        # FIX: was .iteritems() (Python 2 only) — the file already uses the
        # Python 3 print() function, so this raised AttributeError.
        for k, v in edition_pool.items():
            # Oversized title pools are too noisy to be meaningful matches.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # Un-escape backslash-slash in keys; '\\/' spells the same
                # two characters the original invalid escape '\/' produced.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
# NOTE(review): this is a byte-for-byte duplicate of the load() defined
# earlier in this file; this second definition shadows the first. One of
# the two copies should be removed.
def load():
    """Yield ``(loc, ia)`` pairs for IA scans that need to be loaded.

    Scans approved, non-Google, indexable text items from the ``metadata``
    table, skips those already present as editions, and yields every record
    that either matches no edition pool at all or fails to merge with any
    candidate edition in its pool.

    Reads module globals ``cur``, ``chunk``, ``total``, ``t0`` and reads and
    updates ``rec_no`` / ``t_prev`` for periodic progress reporting.

    Yields:
        tuple(str, str): (MARC location, IA identifier) for unmatched records.
    """
    global rec_no, t_prev
    cur.execute(
        "select identifier from metadata "
        "where scanner is not null and scanner != 'google' "
        "and noindex is null and mediatype='texts' "
        "and curatestate='approved'"
    )
    for ia, in cur.fetchall():
        rec_no += 1
        # Every `chunk` records, report current/overall throughput and ETA.
        if rec_no % chunk == 0:
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)
        print(ia)
        # Skip items that already exist as editions.
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        except urllib2.HTTPError:
            # Best-effort: a fetch failure just skips this record.
            continue
        if loc is None:
            continue
        print(loc, rec)
        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue
        edition_pool = pool.build(index_fields)
        print(edition_pool)
        # No candidates at all: the record is new, yield it.
        if not edition_pool:
            yield loc, ia
            continue
        e1 = build_marc(rec)
        match = False
        # FIX: was .iteritems() (Python 2 only) — the file already uses the
        # Python 3 print() function, so this raised AttributeError.
        for k, v in edition_pool.items():
            # Oversized title pools are too noisy to be meaningful matches.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # Un-escape backslash-slash in keys; '\\/' spells the same
                # two characters the original invalid escape '\/' produced.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia