Example #1
0
def test_ia_charset():
    """A corrupted-unicode MARC record should come back corrected.

    Reads a known-bad record and checks the 100 (author) field's first
    subfield decodes to the expected accented name.
    """
    # Use a context manager so the file handle is closed even on failure
    # (the original leaked the handle from a bare open().read()).
    with open(test_data + 'histoirereligieu05cr_meta.mrc') as f:
        data = f.read()
    line = list(get_tag_lines(data, set(['100'])))[0][1]
    a = list(get_all_subfields(line))[0][1]
    expect = u'Crétineau-Joly, J.'
    assert a == expect
Example #2
0
def test_wrapped_lines():
    """handle_wrapped_lines() should join continuation lines of a 520 field.

    The fixture contains two 520 (summary) fields whose joined lengths are
    pinned below.
    """
    # Context manager closes the file handle (original left it open).
    with open(test_data + 'wrapped_lines') as f:
        data = f.read()
    ret = list(handle_wrapped_lines(get_tag_lines(data, ['520'])))
    assert len(ret) == 2
    a, b = ret
    assert a[0] == '520' and b[0] == '520'
    assert len(a[1]) == 2295
    assert len(b[1]) == 248
Example #3
0
def read_edition(loc, data):
    """Build an Open Library edition dict from a raw MARC record.

    loc:  source identifier, used only in warnings and diagnostics.
    data: the raw MARC record.

    Returns an edition dict, or {} when the record's '008' control field is
    missing, duplicated, or blank.
    """
    fields = {}
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        fields.setdefault(tag, []).append(line)

    edition = {}
    # Robustness: also handle a *missing* '008' field — the original raised
    # KeyError on fields['008'] when the tag was absent.
    f008 = fields.get('008', [])
    if len(f008) != 1:
        warn("There should be a single '008' field, %s has %d." % (loc, len(f008)))
        return {}
    f = f008[0]
    if not f:
        warn("'008' field must not be blank in %s" % (loc))
        return {}
    s = str(f)  # the '008' field is read by fixed character positions below
    publish_date = s[7:11]
    if publish_date.isdigit() and publish_date != '0000':
        edition["publish_date"] = publish_date
    try:
        # when position 6 is 't', positions 11-15 hold a copyright date
        if s[6] == 't':
            edition["copyright_date"] = s[11:15]
    except Exception:
        # report which record blew up, then re-raise for the caller
        print(loc)
        raise
    publish_country = s[15:18]
    if publish_country not in ('|||', '   '):
        edition["publish_country"] = publish_country
    lang = s[35:38]
    if lang not in ('   ', '|||'):
        edition["languages"] = [{'key': '/l/' + lang}]
    edition.update(read_lccn(fields))
    try:
        edition.update(read_isbn(fields))
    except Exception:
        print(loc)
        raise
    edition.update(read_oclc(fields))
    edition.update(read_lc_classification(fields))
    edition.update(read_dewey(fields))
    edition.update(read_authors(fields))
    edition.update(read_title(fields))
    edition.update(read_genres(fields))
    edition.update(read_subjects(fields))
    edition.update(read_pagination(fields))
    edition.update(read_series(fields))
    edition.update(read_work_titles(fields))
    edition.update(read_other_titles(fields))
    edition.update(read_edition_name(fields))
    edition.update(read_publisher(fields))
    edition.update(read_contributions(fields))
    edition.update(read_location(fields))
    edition.update(read_url(fields))
    edition.update(read_toc(fields))
    edition.update(read_notes(fields))
    edition.update(read_description(fields))
    return edition
Example #4
0
 def read_fields(self, want):
     """Yield (tag, field) pairs for each wanted tag found in self.data.

     Control fields ('00x') are yielded as raw strings with the trailing
     field terminator stripped; all other fields are wrapped in
     BinaryDataField.
     """
     want = set(want)
     tag_lines = handle_wrapped_lines(get_tag_lines(self.data, want))
     for tag, line in tag_lines:
         if tag not in want:
             continue
         if not tag.startswith('00'):
             yield tag, BinaryDataField(line)
             continue
         # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
         if tag == '008' and line == '':
             continue
         assert line[-1] == '\x1e'
         yield tag, line[:-1]
Example #5
0
 def read_fields(self, want):
     """Yield (tag, field) pairs for each wanted tag found in self.data.

     Control fields ('00x') are yielded as raw strings with the trailing
     field terminator stripped; other fields are wrapped in BinaryDataField.
     """
     want = set(want)
     # Removed dead code: the original computed
     #     marc8 = self.leader()[9] != 'a'
     # but never used it (leader() is assumed side-effect free — confirm).
     for tag, line in fast_parse.handle_wrapped_lines(fast_parse.get_tag_lines(self.data, want)):
         if tag not in want:
             continue
         if tag.startswith('00'):
             # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
             if tag == '008' and line == '':
                 continue
             assert line[-1] == '\x1e'
             yield tag, line[:-1]
         else:
             yield tag, BinaryDataField(self, line)
Example #6
0
def get_marc_subjects(w):
    """Yield non-empty lists of subject-field tag lines for work *w*.

    Fetches each MARC source record for the work; fetch and parse problems
    are logged and the source is skipped.

    Modernized from Python 2 `print` statements / `except X, e` syntax to
    the Python 3 form used by the other copy of this function in this file.
    """
    for src in get_marc_source(w):
        data = None
        try:
            data = get_data(src)
        except ValueError:
            print('bad record source:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        except urllib2.HTTPError as error:
            # no `continue` here: data stays None and the check below skips
            print('HTTP error:', error.code, error.msg)
            print('http://openlibrary.org' + w['key'])
        if not data:
            continue
        try:
            lines = list(get_tag_lines(data, subject_fields))
        except BadDictionary:
            print('bad dictionary:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        if lines:
            yield lines
Example #7
0
def get_marc_subjects(w):
    """Yield non-empty lists of subject-field tag lines for work *w*.

    Fetches each MARC source record for the work; fetch and parse problems
    are logged and the source is skipped.

    Modernized from Python 2 `print` statements / `except X, e` syntax to
    the Python 3 form used by the other copy of this function in this file.
    """
    for src in get_marc_source(w):
        data = None
        # local import kept as in the original (presumably avoids an
        # import cycle — TODO confirm)
        from openlibrary.catalog.get_ia import get_data
        try:
            data = get_data(src)
        except ValueError:
            print('bad record source:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        except urllib2.HTTPError as error:
            # no `continue` here: data stays None and the check below skips
            print('HTTP error:', error.code, error.msg)
            print('http://openlibrary.org' + w['key'])
        if not data:
            continue
        try:
            lines = list(get_tag_lines(data, subject_fields))
        except BadDictionary:
            print('bad dictionary:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        if lines:
            yield lines
Example #8
0
def get_marc_subjects(w):
    """Yield non-empty lists of subject-field tag lines for work *w*."""
    for src in get_marc_source(w):
        rec = None
        from openlibrary.catalog.get_ia import get_data
        try:
            rec = get_data(src)
        except ValueError:
            print('bad record source:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        except urllib2.HTTPError as err:
            # fall through: rec stays None and the guard below skips it
            print('HTTP error:', err.code, err.msg)
            print('http://openlibrary.org' + w['key'])
        if not rec:
            continue
        try:
            subject_lines = list(get_tag_lines(rec, subject_fields))
        except BadDictionary:
            print('bad dictionary:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        if not subject_lines:
            continue
        yield subject_lines
Example #9
0
 def __init__(self, data):
     """Index the record's wanted tag lines by tag into self.fields.

     self.fields maps each tag (from the module-level `want` set) to the
     list of raw lines carrying that tag, in record order.
     """
     self.fields = {}
     for tag, line in get_tag_lines(data, want):
         self.fields.setdefault(tag, []).append(line)
Example #10
0
def read_works():
    """Link works to the people they are about, creating person pages as needed.

    For each (work, marc) pair: collect '600' (personal-name subject) lines,
    group them into people via read_people(), then save any new person pages
    and the updated work record through `ol`.
    """
    i = 0
    pages = {}      # person page key -> person object created in this run
    page_marc = {}  # marc person tuple -> key of an already-created page

    for work, marc in work_and_marc():
        lines = []
        for loc in marc:
            data = get_data(loc)
            if not data:
                continue
            found = [v for k, v in get_tag_lines(data, set(['600']))]
            if found:
                lines.append((loc, found))
        if not lines:
            continue
        work['lines'] = lines
        i += 1
        print(i, work['key'], work['title'])

        try:
            people, marc_alt = read_people(j[1] for j in lines)
        except (AssertionError, KeyError):
            # both failure modes were handled identically: log and skip
            print(work['lines'])
            continue

        marc_alt_reverse = defaultdict(set)
        for k, v in marc_alt.items():
            marc_alt_reverse[v].add(k)

        w = ol.get(work['key'])
        w['subject_people'] = []
        # BUG FIX: dict.iteritems() is Python 2 only and raises
        # AttributeError under Python 3 (this block already uses print()).
        for p, num in people.items():
            print('  %2d %s' % (num, ' '.join("%s: %s" % (k, v) for k, v in p)))
            print('     ', p)
            if p in page_marc:
                w['subject_people'].append({'key': '/subjects/people/' + page_marc[p]})
                continue
            obj = build_person_object(p, marc_alt_reverse.get(p, []))
            key = obj['name'].replace(' ', '_')
            full_key = '/subjects/people/' + key
            w['subject_people'].append({'key': full_key})

            if key in pages:
                # page already created this run: just record the extra marc form
                print(key)
                pages[key]['marc'].append(p)
                continue

            for m in obj['marc']:
                page_marc[m] = key

            pages[key] = obj
            obj_for_db = obj.copy()
            del obj_for_db['marc']
            obj_for_db['key'] = full_key
            obj_for_db['type'] = '/type/person'
            print(ol.save(full_key.encode('utf-8'), obj_for_db, 'create a new person page'))

        print(w)
        print(ol.save(w['key'], w, 'add links to people that this work is about'))
            # NOTE(review): orphaned scrape fragment — these lines are indented
            # as if inside an enclosing function/loop that is not present in
            # this snippet; left byte-identical.
            f = open(filename)
            for pos, loc, data in read_marc_file(full_part, f):
                rec_no +=1
                yield rec_no, pos, loc, data

# source_record,oclc,accompanying_material,translated_from,title

# matches an OCLC identifier like "(OCoLC)0001234", capturing the digits
re_oclc = re.compile ('^\(OCoLC\).*?0*(\d+)')

# NOTE(review): script-level code; `out` is never closed in this snippet.
out = open('/3/edward/updates', 'w')
want = set(['001', '003', '035', '041', '245', '300'])
for rec_no, pos, loc, data in iter_marc():
    fields = {}
    rec = {}
    title_seen = False
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        if tag == '245':
            # only the first 245 (title) field of a record is used
            if title_seen:
                continue
            title_seen = True
            if line[1] == '0': # no prefix
                continue
            contents = get_contents(line, ['a', 'b'])
            if 'a' in contents:
                rec['title'] = ' '.join(x.strip(' /,;:') for x in contents['a'])
            elif 'b' in contents:
                rec['title'] = contents['b'][0].strip(' /,;:')
            if 'title' in rec and has_dot(rec['title']):
                rec['title'] = rec['title'][:-1]
            continue
        # NOTE(review): snippet is truncated here — the '300' branch body
        # is missing from this copy.
        if tag == '300':
Example #12
0
def load_part(archive_id, part, start_pos=0):
    """Load one MARC file part, yielding (loc, data) for unmatched records.

    archive_id: archive the part belongs to.
    part:       file name of the MARC part within the archive.
    start_pos:  byte offset to resume from (0 = start of file).

    Modernized from Python 2 `print` statements to the Python 3 form used by
    the other copy of this function in this file; also replaces the
    Python-2-only dict.iteritems() with items().
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # NOTE(review): file handle is consumed lazily across yields; left
    # unmanaged as in the original.
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        except fast_parse.NotBook:
            continue
        if not index_fields or 'title' not in index_fields:
            continue

        print(loc)
        edition_pool = pool.build(index_fields)

        if not edition_pool:
            # nothing similar exists yet: hand the record back for loading
            yield loc, data
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # follow redirects until we land on a real edition
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' % (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break

        if not match:
            yield loc, data
Example #13
0
def load_part(archive_id, part, start_pos=0):
    """Load one MARC file part, yielding (loc, data) for unmatched records.

    archive_id: archive the part belongs to.
    part:       file name of the MARC part within the archive.
    start_pos:  byte offset to resume from (0 = start of file).
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # NOTE(review): file handle is consumed lazily across yields; left
    # unmanaged as in the original.
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        except fast_parse.NotBook:
            continue
        if not index_fields or 'title' not in index_fields:
            continue

        print(loc)
        edition_pool = pool.build(index_fields)

        if not edition_pool:
            # nothing similar exists yet: hand the record back for loading
            yield loc, data
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        # BUG FIX: dict.iteritems() is Python 2 only and raises
        # AttributeError under Python 3 (this block already uses print()).
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # follow redirects until we land on a real edition
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' %
                              (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break

        if not match:
            yield loc, data
            # NOTE(review): orphaned scrape fragment — indented as if inside an
            # enclosing function/loop not present in this snippet; left
            # byte-identical.
            for pos, loc, data in read_marc_file(full_part, f):
                rec_no += 1
                yield rec_no, pos, loc, data


# source_record,oclc,accompanying_material,translated_from,title

# matches an OCLC identifier like "(OCoLC)0001234", capturing the digits
re_oclc = re.compile('^\(OCoLC\).*?0*(\d+)')

# NOTE(review): script-level code; `out` is never closed in this snippet.
out = open('/3/edward/updates', 'w')
want = set(['001', '003', '035', '041', '245', '300'])
for rec_no, pos, loc, data in iter_marc():
    fields = {}
    rec = {}
    title_seen = False
    # NOTE(review): snippet appears truncated — other copies of this script
    # also handle tag '300' after the '245' branch.
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        if tag == '245':
            # only the first 245 (title) field of a record is used
            if title_seen:
                continue
            title_seen = True
            if line[1] == '0':  # no prefix
                continue
            contents = get_contents(line, ['a', 'b'])
            if 'a' in contents:
                rec['title'] = ' '.join(
                    x.strip(' /,;:') for x in contents['a'])
            elif 'b' in contents:
                rec['title'] = contents['b'][0].strip(' /,;:')
            if 'title' in rec and has_dot(rec['title']):
                rec['title'] = rec['title'][:-1]
            continue
Example #15
0
def read_works():
    """Link works to the people they are about, creating person pages as needed.

    For each (work, marc) pair: collect '600' (personal-name subject) lines,
    group them into people via read_people(), then save any new person pages
    and the updated work record through `ol`.

    Modernized from Python 2 `print` statements and dict.iteritems() to the
    Python 3 form used by the other copy of this function in this file.
    """
    i = 0
    pages = {}      # person page key -> person object created in this run
    page_marc = {}  # marc person tuple -> key of an already-created page

    for work, marc in work_and_marc():
        lines = []
        for loc in marc:
            data = get_data(loc)
            if not data:
                continue
            found = [v for k, v in get_tag_lines(data, set(['600']))]
            if found:
                lines.append((loc, found))
        if not lines:
            continue
        work['lines'] = lines
        i += 1
        print(i, work['key'], work['title'])

        try:
            people, marc_alt = read_people(j[1] for j in lines)
        except (AssertionError, KeyError):
            # both failure modes were handled identically: log and skip
            print(work['lines'])
            continue

        marc_alt_reverse = defaultdict(set)
        for k, v in marc_alt.items():
            marc_alt_reverse[v].add(k)

        w = ol.get(work['key'])
        w['subject_people'] = []
        for p, num in people.items():
            print('  %2d %s' % (num, ' '.join("%s: %s" % (k, v) for k, v in p)))
            print('     ', p)
            if p in page_marc:
                w['subject_people'].append({'key': '/subjects/people/' + page_marc[p]})
                continue
            obj = build_person_object(p, marc_alt_reverse.get(p, []))
            key = obj['name'].replace(' ', '_')
            full_key = '/subjects/people/' + key
            w['subject_people'].append({'key': full_key})

            if key in pages:
                # page already created this run: just record the extra marc form
                print(key)
                pages[key]['marc'].append(p)
                continue

            for m in obj['marc']:
                page_marc[m] = key

            pages[key] = obj
            obj_for_db = obj.copy()
            del obj_for_db['marc']
            obj_for_db['key'] = full_key
            obj_for_db['type'] = '/type/person'
            print(ol.save(full_key.encode('utf-8'), obj_for_db, 'create a new person page'))

        print(w)
        print(ol.save(w['key'], w, 'add links to people that this work is about'))