Example #1
0
def get_langs():
    """Return the set of all /type/language things, fetched in pages of 100."""
    page_size = 100
    offset = 0
    languages = set()
    while True:
        batch = get_things({'type': '/type/language', 'limit': page_size, 'offset': offset})
        languages.update(batch)
        # A short page means we have exhausted the results.
        if len(batch) != page_size:
            return languages
        offset += page_size
Example #2
0
def load():
    """Yield (loc, ia) pairs for archive.org items that still need loading.

    Iterates over approved, non-Google, indexable 'texts' records from the
    metadata table; skips items whose edition already exists, items whose
    MARC could not be fetched or parsed, and items that merge into an
    existing edition.  Everything left over is yielded as (MARC location,
    archive.org identifier).

    Relies on module-level state: cur, rec_no, t_prev, t0, chunk, total.
    """
    global rec_no, t_prev
    cur.execute(
        "select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'"
    )  # order by curatedate")
    for ia, in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            # Periodic progress report: per-chunk rate, overall rate, and an
            # estimate of hours remaining based on the overall rate.
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" %
                  (rec_no, rec_per_sec, rec_per_sec_total),
                  end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)

        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            # Never swallow user interrupts or programming errors.
            raise
        except urllib2.HTTPError:
            # Transient/remote fetch failure: skip this item.
            continue
        if loc is None:
            continue
        print(loc, rec)

        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue

        edition_pool = pool.build(index_fields)
        print(edition_pool)

        if not edition_pool:
            # No candidate editions at all: definitely a new record.
            yield loc, ia
            continue

        e1 = build_marc(rec)

        match = False
        # Bug fix: dict.iteritems() does not exist on Python 3 (this function
        # already uses Py3 print syntax); items() behaves the same here.
        for k, v in edition_pool.items():
            # A title shared by >50 candidates is too common to be a useful
            # match signal; skip it.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # Bug fix: '\/' is an invalid escape sequence; '\\/' is the
                # intended literal backslash-slash (JSON-escaped '/') and is
                # byte-identical at runtime, without the warning.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
def load():
    """Yield (loc, ia) pairs for archive.org items not yet loaded.

    Walks approved, non-Google, indexable 'texts' rows from the metadata
    table; skips already-loaded editions, unfetchable or non-XML MARC
    records, and records that merge into an existing edition.  The rest
    are yielded as (MARC location, archive.org identifier).

    Relies on module-level state: cur, rec_no, t_prev, t0, chunk, total.
    """
    global rec_no, t_prev
    cur.execute(
        "select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'"
    )  # order by curatedate")
    for ia, in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            # Progress report every `chunk` records: current and overall
            # throughput plus an ETA derived from the overall rate.
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" %
                  (rec_no, rec_per_sec, rec_per_sec_total),
                  end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)

        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            # Propagate user interrupts and programming errors unchanged.
            raise
        except urllib2.HTTPError:
            # Fetch failed for this item; move on to the next one.
            continue
        if loc is None:
            continue
        print(loc, rec)

        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue

        edition_pool = pool.build(index_fields)
        print(edition_pool)

        if not edition_pool:
            # Nothing to merge against: treat as a new record.
            yield loc, ia
            continue

        e1 = build_marc(rec)

        match = False
        # Bug fix: iteritems() is Python 2 only and this function already
        # uses Python 3 print(..., end=' '); items() is equivalent here.
        for k, v in edition_pool.items():
            # Titles matched by more than 50 candidates are too generic to
            # identify an edition; skip them.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # Bug fix: '\/' is an invalid escape sequence that only
                # happens to produce backslash-slash; spell it explicitly
                # as '\\/' (same runtime bytes, no warning).
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia