Example 1
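A matcher from an Internet Archive import script (the names suggest Open Library's catalog tooling): ia_match fetches the MARC record for an archive.org identifier ia, bails out on HTTP errors or on records without a full_title, and asks amazon_merge.attempt_merge whether the record merges with a. get_ia, build_marc, amazon_merge, and threshold are defined elsewhere in the surrounding module.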
def ia_match(a, ia):
    try:
        loc, rec = get_ia(ia)
    except urllib.error.HTTPError:  # urllib2.HTTPError on Python 2
        return False
    if rec is None or 'full_title' not in rec:
        return False
    try:
        e1 = build_marc(rec)
    except TypeError:
        print(rec)  # show the offending record before re-raising
        raise
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
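A minimal sketch of how such a matcher might be driven; the amazon_records iterable, the candidate_ocaids list, and the 'asin' key are hypothetical stand-ins, not names from the original script:

# Hypothetical driver: pair each Amazon record with the first archive.org
# identifier that ia_match accepts.
for a in amazon_records:            # hypothetical: parsed Amazon records
    for ocaid in candidate_ocaids:  # hypothetical: archive.org identifiers
        if ia_match(a, ocaid):
            print(a['asin'], '->', ocaid)  # hypothetical 'asin' key
            break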
Example 2
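A generator from the same family of import scripts: load walks every approved, non-Google scanned text out of a metadata table, prints a throughput report every chunk records, skips identifiers that are already loaded or that yield unusable records, tries to merge each remaining record against a pool of candidate editions, and yields the (loc, ia) pairs that found no match. cur, chunk, total, t0, t_prev, rec_no, pool, get_things, make_index_fields, build_marc, and try_merge come from the surrounding script.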
def load():
    global rec_no, t_prev
    skipping = False
    #for ia in ['nybc200715']:
    #cur.execute("select identifier from metadata where collection=%(c)s", {'c': collection})
    cur.execute("select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'") # order by curatedate")
    for ia, in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)

        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        except urllib.error.HTTPError:  # urllib2.HTTPError on Python 2
            continue
        if loc is None:
            continue
        print(loc, rec)

        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue

        edition_pool = pool.build(index_fields)
        print(edition_pool)

        if not edition_pool:
            yield loc, ia
            continue

        e1 = build_marc(rec)

        match = False
        for k, v in edition_pool.items():  # iteritems() is Python 2 only
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                if try_merge(e1, edition_key.replace('\\/', '/')):  # unescape JSON-style slashes
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
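The progress block at the top of the loop computes a per-chunk rate, an overall rate, and an estimated time to completion. The same arithmetic as a standalone sketch (the function and variable names here are illustrative, not from the original):

from time import time

def report_progress(rec_no, chunk, total, t0, t_prev):
    now = time()
    rec_per_sec = chunk / (now - t_prev)     # rate over the last chunk
    rec_per_sec_total = rec_no / (now - t0)  # rate since the run started
    hours_left = (total - rec_no) / rec_per_sec_total / 3600
    print('%8d current: %9.3f overall: %9.3f %6.3f hours'
          % (rec_no, rec_per_sec, rec_per_sec_total, hours_left))
    return now  # caller stores this as the new t_prev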
Example 3
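A resumable, top-level variant of the same loop: the skip flag fast-forwards the identifier stream until it reaches 'bostoncityhospit12hugh' (presumably where an earlier run stopped), and processing resumes from that record. The excerpt ends mid-loop, just after make_index_fields is called.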
skip = True

for i in iter:  # 'iter' is an iterator of metadata rows defined upstream (it shadows the builtin)
    ia = i.identifier
    if skip:
        # fast-forward past records handled by a previous run
        if ia == 'bostoncityhospit12hugh':
            skip = False
        else:
            continue
    print(ia)
    if query({'type': '/type/edition', 'ocaid': ia}):
        print('already loaded')
        continue
    try:
        loc, rec = get_ia(ia)
    except (KeyboardInterrupt, NameError):
        raise
    except urllib.error.HTTPError:  # urllib2.HTTPError on Python 2
        continue
    if loc is None or rec is None:
        continue
    print(loc, rec)

    if not loc.endswith('.xml'):
        print("not XML")
        continue
    if 'full_title' not in rec:
        print("full_title missing")
        continue
    index_fields = make_index_fields(rec)
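The skip flag is a hand-rolled resume-from-checkpoint. A sketch of the same fast-forward using itertools.dropwhile, which likewise starts yielding at the checkpoint identifier; the resumed name and the loop body are illustrative:

from itertools import dropwhile

# Drop rows until the checkpoint identifier appears, then yield from there on.
resumed = dropwhile(lambda row: row.identifier != 'bostoncityhospit12hugh', iter)
for i in resumed:
    ia = i.identifier
    # ... process as above ...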