Example #1
def load_part(archive_id, part, start_pos=0):
    """Stream one MARC part file, yielding (loc, data) for records that
    still need an edition; matched records are handled via add_source_records."""
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part, 'rb')  # MARC records are raw bytes
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']  # control numbers, LCCN, ISBN, OCLC, title
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        if not index_fields or 'title' not in index_fields:
            continue

        edition_pool = pool.build(index_fields)

        if not edition_pool:
            yield loc, data
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # follow /type/redirect chains until we reach a real edition
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' % (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break

        if not match:
            yield loc, data
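
The while loop above is the delicate part: it chases /type/redirect chains until it lands on a real edition, recording every key it visits so a later pool entry can skip them. A minimal, self-contained sketch of that pattern, with a plain dict standing in for the withKey store (keys and values here are invented):

store = {
    '/b/OL1M': {'type': {'key': '/type/redirect'}, 'location': '/b/OL2M'},
    '/b/OL2M': {'type': {'key': '/type/edition'}, 'title': 'Example'},
}

def resolve(edition_key, seen):
    # follow redirects, remembering every key visited along the way
    thing = None
    while not thing or thing['type']['key'] == '/type/redirect':
        seen.add(edition_key)
        thing = store[edition_key]
        if thing['type']['key'] == '/type/redirect':
            print('following redirect %s => %s' % (edition_key, thing['location']))
            edition_key = thing['location']
    return edition_key, thing

print(resolve('/b/OL1M', set()))  # ('/b/OL2M', {... 'title': 'Example'})
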
Example #2
def load_part(archive_id, part, start_pos=0):
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part, 'rb')  # MARC records are raw bytes
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        if not index_fields or 'title' not in index_fields:
            continue

        edition_pool = pool.build(index_fields)

        if not edition_pool:
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                seen.add(edition_key)
                thing = withKey(edition_key)
                assert thing
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing)
                    match = True
                    break  # stop after the first successful merge
            if match:
                break

        if not match:
            yield loc, data
Example #3
def get_651(key):
    """Collect the 651 (geographic subject) subfield lists from every source record of key."""
    found = []
    for src in get_src(key):
        data = get_from_local(src)
        for tag, line in get_tag_lines(data, ['651']):
            found.append(list(get_all_subfields(line)))
    return found
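
get_all_subfields is not shown here, but MARC 21 separates subfields with the 0x1f delimiter, so a decoder along these lines is plausible (the 651 line below is fabricated):

def all_subfields(line):
    # skip the two indicator characters, then split on the 0x1f delimiter
    for chunk in line[2:].split('\x1f')[1:]:
        yield chunk[0], chunk[1:].rstrip('\x1e')  # (subfield code, value)

line = ' 0\x1faUnited States\x1fxHistory.\x1e'
print(list(all_subfields(line)))
# [('a', 'United States'), ('x', 'History.')]
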
Example #4
from collections import defaultdict

def data_from_marc(locs, name):
    """Group MARC locations by the 100/700 (author) lines that match name."""
    lines = defaultdict(list)
    for loc in locs:
        data = marc_data(loc)
        line = read_line(get_first_tag(data, set(['100'])), name)
        if line:
            lines[line].append(loc)
        for tag, line in get_tag_lines(data, set(['700'])):
            line = read_line(line, name)
            if line:
                lines[line].append(loc)
    return lines
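
The defaultdict(list) inverts the relation: every location whose 100 or 700 line normalizes to the same string is grouped under that one key. The grouping idiom in isolation (sample pairs are invented):

from collections import defaultdict

pairs = [
    ('marc_a:0:100', 'Twain, Mark.'),
    ('marc_b:0:100', 'Twain, Mark.'),
    ('marc_c:0:100', 'Clemens, Samuel.'),
]
lines = defaultdict(list)
for loc, line in pairs:
    lines[line].append(loc)
print(dict(lines))
# {'Twain, Mark.': ['marc_a:0:100', 'marc_b:0:100'], 'Clemens, Samuel.': ['marc_c:0:100']}
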
Example #5
def read_edition(loc, data):
    """Parse one MARC record into an edition dict; returns {} when the 008 field is unusable."""
    fields = {}
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        fields.setdefault(tag, []).append(line)

    edition = {}
    f008 = fields.get('008', [])
    if len(f008) != 1:
        warn("There should be a single '008' field, %s has %d." % (loc, len(f008)))
        return {}
    f = f008[0]
    publish_date = str(f)[7:11]   # 008 positions 07-10: date 1
    if publish_date.isdigit() and publish_date != '0000':
        edition["publish_date"] = publish_date
    if str(f)[6] == 't':          # type of date 't': publication plus copyright date
        edition["copyright_date"] = str(f)[11:15]
    publish_country = str(f)[15:18]
    if publish_country not in ('|||', '   '):  # '|||' and blanks mean unknown
        edition["publish_country"] = publish_country
    lang = str(f)[35:38]
    if lang not in ('   ', '|||'):
        edition["languages"] = [{'key': '/l/' + lang}]
    edition.update(read_lccn(fields))
    try:
        edition.update(read_isbn(fields))
    except Exception:
        print(loc)
        raise
    edition.update(read_oclc(fields))
    edition.update(read_lc_classification(fields))
    edition.update(read_dewey(fields))
    edition.update(read_authors(fields))
    edition.update(read_title(fields))
    edition.update(read_genres(fields))
    edition.update(read_subjects(fields))
    edition.update(read_pagination(fields))
    edition.update(read_series(fields))
    edition.update(read_work_titles(fields))
    edition.update(read_other_titles(fields))
    edition.update(read_edition_name(fields))
    edition.update(read_publisher(fields))
    edition.update(read_contributions(fields))
    edition.update(read_location(fields))
    edition.update(read_url(fields))
    edition.update(read_toc(fields))
    edition.update(read_notes(fields))
    edition.update(read_description(fields))
    return edition
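
The 008 slices follow the MARC 21 fixed-field layout: position 06 is the type of date, 07-10 the first date, 15-17 the place of publication, 35-37 the language. The same offsets applied to a fabricated 008 value:

f = '850101t19851984nyua          000 0 eng d'  # fabricated 40-char 008 field
edition = {}
publish_date = f[7:11]
if publish_date.isdigit() and publish_date != '0000':
    edition['publish_date'] = publish_date       # '1985'
if f[6] == 't':                                  # 't': publication and copyright dates
    edition['copyright_date'] = f[11:15]         # '1984'
if f[15:18] not in ('|||', '   '):
    edition['publish_country'] = f[15:18]        # 'nyu'
if f[35:38] not in ('   ', '|||'):
    edition['languages'] = [{'key': '/l/' + f[35:38]}]
print(edition)
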
Example #6
def __init__(self, data):
    """Index the record's raw MARC lines by tag."""
    fields = {}
    for tag, line in get_tag_lines(data, want):
        fields.setdefault(tag, []).append(line)
    self.fields = fields
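
A hypothetical usage sketch, with get_tag_lines stubbed out so the constructor runs standalone (the class name and sample data are guesses, not from the source):

want = {'245', '650'}

def get_tag_lines(data, want):
    # stand-in for the real directory walk over raw MARC bytes
    return [(tag, line) for tag, line in data if tag in want]

class MarcRecord:
    def __init__(self, data):
        fields = {}
        for tag, line in get_tag_lines(data, want):
            fields.setdefault(tag, []).append(line)
        self.fields = fields

rec = MarcRecord([('245', 'title line'), ('650', 'subject one'), ('650', 'subject two')])
print(rec.fields)  # {'245': ['title line'], '650': ['subject one', 'subject two']}
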