Example #1
0
def try_merge(edition, ekey, thing):
    """Report whether the Amazon ``edition`` matches the existing record ``thing``.

    ``edition`` is a mapping that must carry either ``isbn_10`` or ``asin``;
    ``thing`` must be of type ``/type/edition`` (asserted).  ``ekey`` is only
    passed to ``get_mc`` when ``thing`` has no ``source_records``.

    Returns True when ``'amazon:' + asin`` is already recorded as a source,
    otherwise the result of the relevant matching helper; False when
    ``get_mc`` yields nothing to compare against.
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        # Debug aid: show editions that fall back to the 'asin' field.
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    authors = (
        [author['name'] for author in edition['authors']]
        if 'authors' in edition
        else []
    )
    amazon_rec = amazon_merge.build_amazon(edition, authors)
    assert isinstance(asin, basestring)
    assert thing_type == '/type/edition'
    amazon_source = 'amazon:' + asin

    if 'source_records' in thing:
        # Already linked to this Amazon record, or decide from the sources.
        return (
            amazon_source in thing['source_records']
            or source_records_match(amazon_rec, thing)
        )

    mc = get_mc(ekey)
    if mc == amazon_source:
        return True
    if not mc:
        return False
    # Compare against the record stored locally under ``mc``.
    marc_rec = build_marc(fast_parse.read_edition(get_from_local(mc)))
    return amazon_merge.attempt_merge(amazon_rec, marc_rec, threshold, debug=False)
Example #2
0
def try_merge(edition, ekey, thing):
    """Report whether the Amazon ``edition`` matches the existing record ``thing``.

    ``edition`` is a mapping that must carry either ``isbn_10`` or ``asin``;
    ``thing`` must be of type ``/type/edition`` (asserted).  ``ekey`` is only
    passed to ``get_mc`` when ``thing`` has no ``source_records``.

    Returns True when ``'amazon:' + asin`` is already recorded as a source,
    otherwise the result of the relevant matching helper; False when
    ``get_mc`` yields nothing to compare against.
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        # Debug aid: show editions that fall back to the 'asin' field.
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    authors = (
        [author['name'] for author in edition['authors']]
        if 'authors' in edition
        else []
    )
    amazon_rec = amazon_merge.build_amazon(edition, authors)
    assert isinstance(asin, six.string_types)
    assert thing_type == '/type/edition'
    amazon_source = 'amazon:' + asin

    if 'source_records' in thing:
        # Already linked to this Amazon record, or decide from the sources.
        return (
            amazon_source in thing['source_records']
            or source_records_match(amazon_rec, thing)
        )

    mc = get_mc(ekey)
    if mc == amazon_source:
        return True
    if not mc:
        return False
    # Compare against the record stored locally under ``mc``.
    marc_rec = build_marc(fast_parse.read_edition(get_from_local(mc)))
    return amazon_merge.attempt_merge(amazon_rec, marc_rec, threshold, debug=False)
Example #3
0
def load_part(archive_id, part, start_pos=0):
    """Generate the records of one archive part that matched no existing edition.

    Opens ``rc['marc_path']/archive_id/part``, optionally seeks to
    ``start_pos``, and walks the records produced by ``read_marc_file``.
    For each record it:

    * bumps the global ``rec_no`` and reports progress every ``chunk`` records;
    * skips the record when ``is_loaded(loc)`` is truthy, or when its index
      fields are empty or lack ``'title'``;
    * yields ``(loc, data)`` when the edition pool is empty or when no
      candidate edition merges; otherwise records the source on the first
      matching edition via ``add_source_records``.

    ``KeyError``/``AssertionError`` from ``fast_parse.index_fields`` are
    re-raised after printing the record location for debugging.
    """
    global rec_no, t_prev, load_count
    print('load_part: %s %s' % (archive_id, part))
    full_part = archive_id + "/" + part
    # The context manager closes the file when the generator is exhausted or
    # closed early; the handle used to be left open.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            if not index_fields or 'title' not in index_fields:
                continue

            edition_pool = pool.build(index_fields)

            if not edition_pool:
                # Nothing to compare against: hand the record to the caller.
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # Only the candidate key lists matter; the pool's keys are unused.
            for edition_keys in edition_pool.values():
                for edition_key in edition_keys:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirects until a non-redirect record is reached.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s'
                                  % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data
Example #4
0
def ia_match(a, ia):
    """Try to merge record ``a`` with the record fetched for identifier ``ia``.

    Returns False when ``get_ia`` raises ``urllib2.HTTPError``, or when the
    fetched record is None or has no ``'full_title'``.  Otherwise returns the
    result of ``amazon_merge.attempt_merge``.  A ``TypeError`` from
    ``build_marc`` is re-raised after printing the offending record.
    """
    try:
        _loc, rec = get_ia(ia)
    except urllib2.HTTPError:
        return False

    usable = rec is not None and 'full_title' in rec
    if not usable:
        return False

    try:
        marc_rec = build_marc(rec)
    except TypeError:
        # Show the record that could not be converted, then propagate.
        print(rec)
        raise
    else:
        return amazon_merge.attempt_merge(a, marc_rec, threshold, debug=False)
Example #5
0
def ia_match(a, ia):
    """Try to merge record ``a`` with the record fetched for identifier ``ia``.

    Returns False when ``get_ia`` raises ``urllib2.HTTPError``, or when the
    fetched record is None or has no ``'full_title'``.  Otherwise returns the
    result of ``amazon_merge.attempt_merge``.  A ``TypeError`` from
    ``build_marc`` is re-raised after printing the offending record.
    """
    try:
        _loc, rec = get_ia(ia)
    except urllib2.HTTPError:
        return False

    usable = rec is not None and 'full_title' in rec
    if not usable:
        return False

    try:
        marc_rec = build_marc(rec)
    except TypeError:
        # Show the record that could not be converted, then propagate.
        print(rec)
        raise
    else:
        return amazon_merge.attempt_merge(a, marc_rec, threshold, debug=False)
Example #6
0
def get_record(key, mc):
    """Fetch the archived record ``mc`` and convert it with ``marc.build_marc``.

    ``key`` is only used for diagnostics.  Returns False (after printing
    ``mc`` and ``key``) when ``fast_parse.read_edition`` raises
    ``SoundRecording``, ``IndexError`` or ``AssertionError``.  A ``TypeError``
    from ``marc.build_marc`` is re-raised after printing the parsed record.
    """
    data = get_from_archive(mc)
    try:
        rec = fast_parse.read_edition(data)
    except (fast_parse.SoundRecording, IndexError, AssertionError):
        print(mc)
        # Was ``edition_key``, an undefined name that turned this handler
        # into a NameError; the parameter is called ``key``.
        print(key)
        return False
    try:
        return marc.build_marc(rec)
    except TypeError:
        print(rec)
        raise
Example #7
0
def get_record(key, mc):
    """Fetch the archived record ``mc`` and convert it with ``marc.build_marc``.

    ``key`` is only used for diagnostics.  Returns False (after printing
    ``mc`` and ``key``) when ``fast_parse.read_edition`` raises
    ``SoundRecording``, ``IndexError`` or ``AssertionError``.  A ``TypeError``
    from ``marc.build_marc`` is re-raised after printing the parsed record.
    """
    raw = get_from_archive(mc)
    try:
        parsed = fast_parse.read_edition(raw)
    except (fast_parse.SoundRecording, IndexError, AssertionError):
        # Identify the record that could not be parsed.
        for item in (mc, key):
            print(item)
        return False

    try:
        built = marc.build_marc(parsed)
    except TypeError:
        print(parsed)
        raise
    return built
Example #8
0
def get_marc(loc):
    """Read the record located by ``loc`` and convert it with ``build_marc``.

    ``loc`` has the form ``"filename:offset:length"``; the file is looked up
    under the module-level ``marc_path`` prefix.

    Returns None when ``loc`` does not split into exactly three parts, when
    the file does not exist, or when ``fast_parse.read_edition`` returns a
    falsy value; otherwise the result of ``build_marc``.
    """
    try:
        filename, offset, length = loc.split(':')
    except ValueError:
        return None
    path = marc_path + filename
    if not os.path.exists(path):
        return None
    # ``with`` guarantees the handle is closed even if seek/read/int() raise.
    with open(path) as f:
        f.seek(int(offset))
        buf = f.read(int(length))
    rec = fast_parse.read_edition(buf)
    if rec:
        return build_marc(rec)
    return None
Example #9
0
def get_marc(loc):
    """Read the record located by ``loc`` and convert it with ``build_marc``.

    ``loc`` has the form ``"filename:offset:length"``; the file is looked up
    under the module-level ``marc_path`` prefix.

    Returns None when ``loc`` does not split into exactly three parts, when
    the file does not exist, or when ``fast_parse.read_edition`` returns a
    falsy value; otherwise the result of ``build_marc``.
    """
    try:
        filename, offset, length = loc.split(':')
    except ValueError:
        return None
    path = marc_path + filename
    if not os.path.exists(path):
        return None
    # ``with`` guarantees the handle is closed even if seek/read/int() raise.
    with open(path) as f:
        f.seek(int(offset))
        buf = f.read(int(length))
    rec = fast_parse.read_edition(buf)
    if rec:
        return build_marc(rec)
    return None
Example #10
0
def load_part(archive_id, part, start_pos=0):
    """Generate the records of one archive part that matched no existing edition.

    Opens ``rc['marc_path']/archive_id/part``, optionally seeks to
    ``start_pos``, and walks the records produced by ``read_marc_file``.
    For each record it:

    * bumps the global ``rec_no`` and reports progress every ``chunk`` records;
    * skips the record when ``is_loaded(loc)`` is truthy, when its index
      fields are empty or lack ``'title'``, or when the edition pool is empty;
    * calls ``add_source_records`` for every candidate edition that merges
      (all candidates are tried, not just the first match);
    * yields ``(loc, data)`` when no candidate merged.

    ``KeyError``/``AssertionError`` from ``fast_parse.index_fields`` are
    re-raised after printing the record location for debugging.
    """
    global rec_no, t_prev, load_count
    print('load_part:', archive_id, part)
    full_part = archive_id + "/" + part
    # The context manager closes the file when the generator is exhausted or
    # closed early; the handle used to be left open.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            if not index_fields or 'title' not in index_fields:
                continue

            edition_pool = pool.build(index_fields)

            if not edition_pool:
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # ``values()`` works on Python 2 and 3 (``iteritems`` is gone in
            # Python 3) and the pool's keys were never used here.
            for edition_keys in edition_pool.values():
                for edition_key in edition_keys:
                    if edition_key in seen:
                        continue
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing)
                        match = True

            if not match:
                yield loc, data
Example #11
0
def marc_match(a, loc):
    """Attempt to merge record ``a`` with the locally stored record at ``loc``.

    ``loc`` must be truthy (asserted).  Returns whatever
    ``amazon_merge.attempt_merge`` returns for the two records, using the
    module-level ``threshold``.
    """
    assert loc
    raw = get_from_local(loc)
    marc_rec = build_marc(fast_parse.read_edition(raw))
    return amazon_merge.attempt_merge(a, marc_rec, threshold, debug=False)
Example #12
0
def marc_match(a, loc):
    """Attempt to merge record ``a`` with the locally stored record at ``loc``.

    ``loc`` must be truthy (asserted).  Returns whatever
    ``amazon_merge.attempt_merge`` returns for the two records, using the
    module-level ``threshold``.
    """
    assert loc
    raw = get_from_local(loc)
    marc_rec = build_marc(fast_parse.read_edition(raw))
    return amazon_merge.attempt_merge(a, marc_rec, threshold, debug=False)
Example #13
0
def load():
    """Yield ``(loc, ia)`` for catalogued items that match no existing edition.

    Queries the module-level cursor ``cur`` for item identifiers and, for
    each identifier ``ia``:

    * bumps the global ``rec_no`` and prints throughput plus an estimate of
      the hours remaining every ``chunk`` records;
    * skips it when ``get_things`` already finds an edition with that
      ``ocaid``, when ``get_ia`` raises ``urllib2.HTTPError`` or returns no
      location, when the location is not ``.xml``, when the record lacks
      ``'full_title'``, or when no index fields can be built;
    * yields ``(loc, ia)`` when the edition pool is empty or when
      ``try_merge`` succeeds for none of the candidates.

    A ``'title'`` pool entry with more than 50 candidates is ignored.
    """
    global rec_no, t_prev
    cur.execute(
        "select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'"
    )
    for (ia,) in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" %
                  (rec_no, rec_per_sec, rec_per_sec_total),
                  end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)

        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        except urllib2.HTTPError:
            continue
        if loc is None:
            continue
        print(loc, rec)

        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue

        edition_pool = pool.build(index_fields)
        print(edition_pool)

        if not edition_pool:
            yield loc, ia
            continue

        e1 = build_marc(rec)

        match = False
        # ``items()`` works on Python 2 and 3; ``iteritems`` is gone in 3.
        for k, v in edition_pool.items():
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # '\\/' is the same two characters as the old '\/' literal,
                # spelled without the invalid escape sequence.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
def load():
    """Yield ``(loc, ia)`` for catalogued items that match no existing edition.

    Queries the module-level cursor ``cur`` for item identifiers and, for
    each identifier ``ia``:

    * bumps the global ``rec_no`` and prints throughput plus an estimate of
      the hours remaining every ``chunk`` records;
    * skips it when ``get_things`` already finds an edition with that
      ``ocaid``, when ``get_ia`` raises ``urllib2.HTTPError`` or returns no
      location, when the location is not ``.xml``, when the record lacks
      ``'full_title'``, or when no index fields can be built;
    * yields ``(loc, ia)`` when the edition pool is empty or when
      ``try_merge`` succeeds for none of the candidates.

    A ``'title'`` pool entry with more than 50 candidates is ignored.
    """
    global rec_no, t_prev
    cur.execute("select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'")
    for (ia,) in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)

        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        except urllib2.HTTPError:
            continue
        if loc is None:
            continue
        print(loc, rec)

        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue

        edition_pool = pool.build(index_fields)
        print(edition_pool)

        if not edition_pool:
            yield loc, ia
            continue

        e1 = build_marc(rec)

        match = False
        # ``items()`` works on Python 2 and 3; ``iteritems`` is gone in 3.
        for k, v in edition_pool.items():
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # '\\/' is the same two characters as the old '\/' literal,
                # spelled without the invalid escape sequence.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
Example #15
0
        continue
    if 'full_title' not in rec:
        print "full_title missing"
        continue
    index_fields = make_index_fields(rec)
    if not index_fields:
        print "no index_fields"
        continue

    edition_pool = pool.build(index_fields)

    if not edition_pool:
        load(loc, ia)
        continue

    e1 = build_marc(rec)

    match = False
    seen = set()
    for k, v in edition_pool.iteritems():
        for edition_key in v:
            if edition_key in seen:
                continue
            thing = None
            while not thing or thing['type']['key'] == '/type/redirect':
                seen.add(edition_key)
                thing = withKey(edition_key)
                assert thing
                if thing['type']['key'] == '/type/redirect':
                    print 'following redirect %s => %s' % (edition_key, thing['location'])
                    edition_key = thing['location']