# 예제 #1 (Example #1)
# 0
def load_part(archive_id, part, start_pos=0):
    """Scan one MARC archive part and yield records with no matching edition.

    Reads records from ``<marc_path>/<archive_id>/<part>`` starting at byte
    offset ``start_pos``.  Records that are already loaded or lack a title
    are skipped; each remaining record is matched against the candidate
    editions from ``pool.build``.  On a successful merge the source record
    is attached to the existing edition; otherwise ``(loc, data)`` is
    yielded for the caller to import.

    Updates the module-level counters ``rec_no``/``t_prev``/``load_count``
    and reports progress every ``chunk`` records.
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # 'with' closes the handle when the generator finishes (it was leaked).
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                # Print enough context to identify the bad record, then re-raise.
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            if not index_fields or 'title' not in index_fields:
                continue

            edition_pool = pool.build(index_fields)

            if not edition_pool:
                # No candidate editions at all: hand the record to the caller.
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # dict.iteritems() does not exist on Python 3; items() is equivalent.
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirect chains until we reach a real edition.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data
# 예제 #2 (Example #2)
# 0
def process_record(file_id, pos, length, data):
    """Index one MARC record: one row in ``rec`` plus per-identifier rows.

    Single-valued fields (title, lccn, call_number) are stored directly on
    the ``rec`` row; repeatable identifiers (isbn, oclc) get one row each
    in their own tables, keyed by the new record id.
    """
    rec = index_fields(data, ['001', '010', '020', '035', '245'], check_author=False)
    if not rec:
        return
    # Columns stored directly on the rec row (first value only).
    extra = {name: rec[name][0] for name in ('title', 'lccn', 'call_number') if name in rec}
    rec_id = web.insert('rec', marc_file=file_id, pos=pos, len=length, **extra)
    # Multi-valued identifiers go into their own tables, one row per value.
    for field in ('isbn', 'oclc'):
        if field not in rec:
            continue
        for value in rec[field]:
            web.insert(field, seqname=False, rec=rec_id, value=value)
# 예제 #3 (Example #3)
# 0
def process_record(file_id, pos, length, data):
    """Persist one MARC record's index fields to the database.

    Inserts a ``rec`` row carrying the record's location plus its
    single-valued fields, then one row per isbn/oclc value in the
    matching identifier table.
    """
    fields = index_fields(data, ['001', '010', '020', '035', '245'], check_author=False)
    if not fields:
        return
    single_valued = ('title', 'lccn', 'call_number')
    extra = {name: fields[name][0] for name in single_valued if name in fields}
    row_id = web.insert('rec', marc_file=file_id, pos=pos, len=length, **extra)
    for name in ('isbn', 'oclc'):
        for value in fields.get(name, []):
            web.insert(name, seqname=False, rec=row_id, value=value)
# 예제 #4 (Example #4)
# 0
def load_part(archive_id, part, start_pos=0):
    """Scan one MARC archive part and yield the records with no matching edition.

    Reads records from ``<marc_path>/<archive_id>/<part>`` starting at byte
    offset ``start_pos``.  Records that are already loaded, lack a title, or
    have no candidate editions are skipped; each remaining record is matched
    against the editions from ``pool.build``.  On a merge the source record
    is attached to the existing edition; otherwise ``(loc, data)`` is yielded.
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # 'with' ensures the handle is closed when the generator finishes
    # (the original leaked it).
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                # Print enough context to identify the bad record, then re-raise.
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            if not index_fields or 'title' not in index_fields:
                continue

            edition_pool = pool.build(index_fields)

            if not edition_pool:
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # dict.iteritems() does not exist on Python 3; items() is equivalent.
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing)
                        match = True
                        # Stop at the first match so source records are not
                        # attached to multiple editions (the original kept
                        # iterating after a match).
                        break
                if match:
                    break

            if not match:
                yield loc, data
# 예제 #5 (Example #5)
# 0
def process_record(pos, loc, data):
    """Best-effort indexing of one MARC record's OCLC numbers and titles.

    Adds each OCLC value to ``oclc_db`` and each title to ``title_db``,
    keyed back to ``loc``.  Failures on individual values are ignored,
    except KeyboardInterrupt and NameError, which must surface.
    """
    rec = index_fields(data, ['010'])
    if not rec:
        return
    # The original loop variable was misleadingly named 'isbn' in both loops.
    for oclc in rec.get('oclc', []):
        try:
            add_to_db(oclc_db, str(oclc), loc)
        except (KeyboardInterrupt, NameError):
            raise
        except Exception:
            # Best-effort: skip values that fail to index.  Narrowed from a
            # bare except, which also swallowed SystemExit/GeneratorExit.
            pass
    for title in rec.get('title', []):
        try:
            add_to_db(title_db, str(title), loc)
        except (KeyboardInterrupt, NameError):
            raise
        except Exception:
            pass
# 예제 #6 (Example #6)
# 0
def process_record(pos, loc, data):
    """Best-effort indexing of one MARC record into the oclc and title DBs.

    Each 'oclc' value goes into ``oclc_db`` and each 'title' value into
    ``title_db``, keyed back to ``loc``.  Individual failures are ignored;
    KeyboardInterrupt and NameError are re-raised.
    """
    rec = index_fields(data, ['010'])
    if not rec:
        return
    # (db, field) pairs; every value of the field is added to its db.
    # The original loop variable was misleadingly named 'isbn'.
    for db, field in ((oclc_db, 'oclc'), (title_db, 'title')):
        for value in rec.get(field, []):
            try:
                add_to_db(db, str(value), loc)
            except (KeyboardInterrupt, NameError):
                # Operator interrupts and programming errors must surface.
                raise
            except Exception:
                # Best-effort: skip values that fail to index.  Narrowed from
                # a bare except, which also swallowed SystemExit.
                pass
# 예제 #7 (Example #7)
# 0
def process_record(pos, loc, data):
    """Validate the ISBN values of one MARC record.

    Crashes (after printing the offending record's location and fields)
    when an ISBN contains ';' or is longer than 16 characters; otherwise
    returns silently.
    """
    global rec_id
    want = [
        '020',  # ISBN
        # Other candidate tags, currently disabled:
        # '006' material characteristics, '010' LCCN, '035' OCLC,
        # '130'/'240' work title, '245' title, '246'/'730'/'740' other titles
    ]
    rec = index_fields(data, want, check_author=False)
    field_size = {'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16}
    if not rec or 'isbn' not in rec:
        return
    for isbn in rec['isbn']:
        if ';' in isbn:
            print(loc)
            print(rec)
        assert ';' not in isbn
    too_long = any(len(i) > 16 for i in rec['isbn'])
    if not too_long:
        return
    print(loc)
    print(rec)
    # Only reached when too_long is True: report and abort on oversize ISBN.
    assert not too_long

    # NOTE(review): unreachable — every path above either returns or raises.
    # Kept, but with its bugs fixed: iterate .items() (unpacking over a bare
    # dict yields keys only and fails), use the unpacked 'length' (the
    # original referenced an undefined name 'size'), and assert the failing
    # polarity ('assert too_long' was trivially true at this point).
    for field, length in field_size.items():
        if field not in rec:
            continue
        too_long = any(len(i) > length for i in rec[field])
        if not too_long:
            continue
        print(loc)
        print(rec)
        assert not too_long
# 예제 #8 (Example #8)
# 0
def process_record(pos, loc, data, file_id):
    """Write index rows for one MARC record.

    Drops over-long isbn/oclc/lccn values, asserts on over-long values of
    any other field, appends a tab-separated row to ``db_rec``, and adds
    every remaining value to the per-field index in ``out``.  Increments
    the module-level ``rec_id`` counter.
    """
    global rec_id
    want = [
        '010',  # LCCN
        '020',  # ISBN
        '035',  # OCLC
        '245',  # title
        # Disabled: '006' material characteristics, '130'/'240' work title,
        # '246'/'730'/'740' other titles
    ]
    try:
        rec = index_fields(data, want, check_author=False)
    except Exception:
        # Identify the failing record before re-raising.
        print(loc)
        raise
    if not rec:
        return
    field_size = {'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16}
    # Silently drop identifier values that exceed the 16-char column width.
    for key in ('isbn', 'oclc', 'lccn'):
        if key in rec:
            rec[key] = [i for i in rec[key] if len(i) <= 16]
    # dict.iteritems() does not exist on Python 3; items() is equivalent.
    for k, v in rec.items():
        if 'isbn' != k and any(len(i) > field_size[k] for i in v):
            print(loc)
            print(rec)
            assert False
    rec_id += 1
    # Assumes loc carries a 5-char prefix before 'file:pos:len' — TODO confirm
    # (the sibling variant splits the whole loc without slicing).
    (f, p, l) = loc[5:].split(':')
    print('\t'.join([str(rec_id), str(file_id), p, l]), file=db_rec)

    for k, v in rec.items():
        if not v:
            continue
        for i in v:
            add_to_index(out[k], i, str(rec_id))
# 예제 #9 (Example #9)
# 0
def process_record(pos, loc, data, file_id):
    """Write index rows for one MARC record (loc split without a prefix).

    Drops over-long isbn/oclc/lccn values, asserts on over-long values of
    any other field, appends a tab-separated row to ``db_rec``, and adds
    every remaining value to the per-field index in ``out``.  Increments
    the module-level ``rec_id`` counter.
    """
    global rec_id
    want = [
        '010',  # LCCN
        '020',  # ISBN
        '035',  # OCLC
        '245',  # title
        # Disabled: '006' material characteristics, '130'/'240' work title,
        # '246'/'730'/'740' other titles
    ]
    try:
        rec = index_fields(data, want, check_author=False)
    except Exception:
        # Identify the failing record before re-raising.
        print(loc)
        raise
    if not rec:
        return
    field_size = {'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16}
    # Silently drop identifier values that exceed the 16-char column width.
    for key in ('isbn', 'oclc', 'lccn'):
        if key in rec:
            rec[key] = [i for i in rec[key] if len(i) <= 16]
    # dict.iteritems() does not exist on Python 3; items() is equivalent.
    for k, v in rec.items():
        if 'isbn' != k and any(len(i) > field_size[k] for i in v):
            print(loc)
            print(rec)
            assert False
    rec_id += 1
    # Presumably loc is 'file:pos:len' here (no prefix slice, unlike the
    # sibling variant that uses loc[5:]) — verify against the caller.
    (f, p, l) = loc.split(':')
    print('\t'.join([str(rec_id), str(file_id), p, l]), file=db_rec)

    for k, v in rec.items():
        if not v:
            continue
        for i in v:
            add_to_index(out[k], i, str(rec_id))
# 예제 #10 (Example #10)
# 0
def process_record(pos, loc, data):
    """Validate the ISBN values of one MARC record.

    Crashes (after printing the offending record's location and fields)
    when an ISBN contains ';' or is longer than 16 characters; otherwise
    returns silently.
    """
    global rec_id
    want = [
        '020',  # ISBN
        # Other candidate tags, currently disabled:
        # '006' material characteristics, '010' LCCN, '035' OCLC,
        # '130'/'240' work title, '245' title, '246'/'730'/'740' other titles
    ]
    rec = index_fields(data, want, check_author=False)
    field_size = {'isbn': 16, 'oclc': 16, 'title': 25, 'lccn': 16}
    if not rec or 'isbn' not in rec:
        return
    for isbn in rec['isbn']:
        if ';' in isbn:
            print(loc)
            print(rec)
        assert ';' not in isbn
    too_long = any(len(i) > 16 for i in rec['isbn'])
    if not too_long:
        return
    print(loc)
    print(rec)
    # Only reached when too_long is True: report and abort on oversize ISBN.
    assert not too_long

    # NOTE(review): unreachable — every path above either returns or raises.
    # Kept, but with its bugs fixed: iterate .items() (unpacking over a bare
    # dict yields keys only and fails), use the unpacked 'length' (the
    # original referenced an undefined name 'size'), and assert the failing
    # polarity ('assert too_long' was trivially true at this point).
    for field, length in field_size.items():
        if field not in rec:
            continue
        too_long = any(len(i) > length for i in rec[field])
        if not too_long:
            continue
        print(loc)
        print(rec)
        assert not too_long