예제 #1
0
#!/usr/bin/python2.5
from __future__ import print_function
from catalog.marc.fast_parse import *
from catalog.read_rc import read_rc
from catalog.get_ia import files
from sources import sources
import sys, os

rc = read_rc()
read_count = 10000  # maximum number of records sampled per source

# When True, print the first 24 bytes (presumably the MARC leader) and
# every 00x tag line of each sampled record that has no 001 line.
show_bad_records = False

# Find which sources include the '001' tag by sampling up to read_count
# records from each source's files.
# NOTE(review): has_001 / rec_no are not reported anywhere in the visible
# part of this script -- confirm whether the reporting code was cut off.
for ia, name in sources():
    has_001 = 0  # sampled records with at least one '001' tag line
    rec_no = 0   # records sampled so far for this source
    for part, size in files(ia):
        if rec_no >= read_count:
            # Sample is complete: skip the remaining parts instead of
            # opening each one just to break on its first record.
            break
        filename = rc['marc_path'] + ia + "/" + part
        if not os.path.exists(filename):
            continue
        # The context manager closes the file even when we stop early.
        with open(filename) as marc_file:
            for data, length in read_file(marc_file):
                if rec_no == read_count:
                    break
                rec_no += 1
                if list(get_tag_lines(data, ['001'])):
                    has_001 += 1
                elif show_bad_records:
                    print(data[:24])
                    for tag, line in get_all_tag_lines(data):
                        if tag.startswith('00'):
                            # line[:-1] drops the last character of the line
                            print(tag, line[:-1])
예제 #2
0
from catalog.get_ia import read_marc_file
from time import time
from catalog.marc.fast_parse import index_fields, get_tag_lines
import os, os.path, re
from catalog.marc.all import all_files
from catalog.read_rc import read_rc

rc = read_rc()

# Names of the per-field output files; each is opened in append mode below.
fields = ['title', 'oclc', 'isbn', 'lccn']

# Map each field name to an append-mode file handle of the same name.
out = {field: open(field, 'a') for field in fields}

# Running counters and append-mode output files for records and files.
rec_id = 0
db_rec = open('recs', 'a')
db_file = open('files', 'a')
file_id = 0

# Matches every character that has an entry in `trans`.
re_escape = re.compile(r'[\n\r\t\0\\]')

# Replacement text for each character matched by `re_escape`:
# newline, carriage return, tab and backslash become two-character
# backslash escapes; NUL is removed.
trans = {
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
    '\\': '\\\\',
    '\0': '',
}


def esc_group(m):
    """Return the `trans` replacement for the text matched by *m*.

    *m* is a regex match object; its whole match (group 0) is looked up
    in the module-level `trans` table. Raises KeyError if the matched
    text has no entry there.
    """
    matched = m.group(0)
    return trans[matched]