Example #1
def load_part(archive_id, part, start_pos=0):
    # Stream one MARC file part, skipping already-loaded records and
    # yielding (loc, data) for records that match no existing edition.
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            continue
        # Control fields and identifiers used to build the match pool.
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        if not index_fields or 'title' not in index_fields:
            continue

        edition_pool = pool.build(index_fields)

        if not edition_pool:
            yield loc, data
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # Follow redirects until a real edition is reached.
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' % (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break

        if not match:
            yield loc, data
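
Each example consumes read_marc_file as a generator of (pos, loc, data) tuples. A minimal sketch of that interface, assuming the standard MARC21 binary layout where the first five bytes of the leader give the record length (the loc format shown is also an assumption, not the original implementation):

def read_marc_file(part, f, pos=0):
    # Sketch only: yield (pos, loc, data) for each raw MARC21 record in f.
    while True:
        leader = f.read(5)
        if len(leader) < 5:
            break
        length = int(leader)  # record length from the leader (assumed layout)
        data = leader + f.read(length - 5)
        loc = 'marc:%s:%d:%d' % (part, pos, length)  # identifier format assumed
        yield pos, loc, data
        pos += length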
Example #2
def iter_marc():
    # Yield (rec_no, pos, loc, data) for every record in every source archive.
    rec_no = 0
    for source_id, ia, name in sources():
        for part, size in files(ia):
            full_part = ia + "/" + part
            filename = rc['marc_path'] + full_part
            assert os.path.exists(filename)
            f = open(filename)
            for pos, loc, data in read_marc_file(full_part, f):
                rec_no += 1
                yield rec_no, pos, loc, data
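
A hypothetical driver for the generator above (the progress interval is arbitrary):

for rec_no, pos, loc, data in iter_marc():
    if rec_no % 10000 == 0:
        print(rec_no, loc)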
Example #3
def load_part(archive_id, part, start_pos=0):
    # Variant loader: skips records whose edition pool is empty instead of
    # yielding them, and does not follow redirects.
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        if not index_fields or 'title' not in index_fields:
            continue

        edition_pool = pool.build(index_fields)

        if not edition_pool:
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                seen.add(edition_key)
                thing = withKey(edition_key)
                assert thing
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing)
                    match = True
                    # No break here: source records are added to every matching edition.

        if not match:
            yield loc, data
Example #4
import os
from time import time

chunk = 10000
total = 32856039

# Counters assumed to start from zero; db_file, db_rec and out are opened elsewhere.
file_id = 0
rec_no = 0
t_prev = time()

for name, part, size in all_files():
    file_id += 1
    print(file_id, part, size)
    # Record the file_id -> part mapping.
    print('\t'.join([str(file_id), part]), file=db_file)
    filename = rc['marc_path'] + '/' + part
    if not os.path.exists(filename):
        print(filename, 'missing')
    assert os.path.exists(filename)
    f = open(filename)
    for pos, loc, data in read_marc_file(part, f):
        rec_no += 1
        if rec_no % chunk == 0:
            # Report throughput every `chunk` records.
            t = time() - t_prev
            progress_update(rec_no, t)
            t_prev = time()
        process_record(pos, loc, data, file_id)

db_file.close()
db_rec.close()

print("closing files")
for v in out.values():
    v.close()
print("finished")
Example #5
import os

def report_subfield_c(loc, data):
    # Enclosing function assumed: the original snippet begins mid-function.
    # Flag author fields (100/700) whose line carries a $c subfield.
    for tag in '100', '700':
        line = get_first_tag(data, set([tag]))
        if line:
            fields = list(get_all_subfields(line))
            if any(k == 'c' for k, v in fields):
                print((loc, fields))

def files(ia):
    # List (relative path, size) for MARC-like files under one archive directory.
    endings = ['.mrc', '.marc', '.out', '.dat', '.records.utf8']
    def good(filename):
        return any(filename.endswith(e) for e in endings)

    dir = rc['marc_path'] + ia
    dir_len = len(dir) + 1
    files = []
    for dirpath, dirnames, filenames in os.walk(dir):
        files.extend(dirpath + "/" + f for f in sorted(filenames))
    return [(i[dir_len:], os.path.getsize(i)) for i in files if good(i)]

rec_no = 0

for source_id, ia, name in sources():
    for part, size in files(ia):
        full_part = ia + "/" + part
        filename = rc['marc_path'] + full_part
        assert os.path.exists(filename)
        f = open(filename)
        for pos, loc, data in read_marc_file(full_part, f):
            rec_no += 1
            process_record(pos, loc, data)