Ejemplo n.º 1
0
def test_wrapped_lines():
    """Check handle_wrapped_lines() against the ``wrapped_lines`` fixture.

    The fixture is read from ``test_data``, its '520' tag lines are
    extracted and passed through ``handle_wrapped_lines``.  Exactly two
    (tag, line) pairs are expected, both tagged '520', whose lines are
    2295 and 248 characters long.
    """
    # Read inside a context manager so the fixture handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(test_data + 'wrapped_lines') as fixture:
        data = fixture.read()
    ret = list(handle_wrapped_lines(get_tag_lines(data, ['520'])))
    assert len(ret) == 2
    a, b = ret
    assert a[0] == '520' and b[0] == '520'
    assert len(a[1]) == 2295
    assert len(b[1]) == 248
Ejemplo n.º 2
0
 def all_fields(self):
     """Yield a (tag, value) pair for every field found in ``self.data``.

     For tags starting with '00' the value is the line with its final
     0x1e byte removed; a '008' field whose line is empty is skipped.
     For every other tag the value is ``BinaryDataField(self, line)``.
     """
     # Not used further down in this method; the leader is still read and
     # indexed here exactly as before.
     marc8 = self.leader()[9] != 'a'
     tag_lines = get_all_tag_lines(self.data)
     for field_tag, raw in handle_wrapped_lines(tag_lines):
         if not field_tag.startswith('00'):
             yield field_tag, BinaryDataField(self, raw)
             continue
         # Empty 008 seen in the wild:
         # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
         if field_tag == '008' and raw == '':
             continue
         # '00x' lines are expected to end with the 0x1e byte.
         assert raw[-1] == '\x1e'
         yield field_tag, raw[:-1]
Ejemplo n.º 3
0
 def all_fields(self):
     """Yield a (tag, value) pair for every field found in ``self.data``.

     Lines come from ``fast_parse.get_all_tag_lines`` passed through
     ``fast_parse.handle_wrapped_lines``.  For tags starting with '00' the
     value is the line with its final 0x1e byte removed; a '008' field
     whose line is empty is skipped.  For every other tag the value is
     ``BinaryDataField(self, line)``.
     """
     # Not used further down in this method; the leader is still read and
     # indexed here exactly as before.
     marc8 = self.leader()[9] != 'a'
     tag_lines = fast_parse.get_all_tag_lines(self.data)
     for field_tag, raw in fast_parse.handle_wrapped_lines(tag_lines):
         if not field_tag.startswith('00'):
             yield field_tag, BinaryDataField(self, raw)
             continue
         # Empty 008 seen in the wild:
         # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
         if field_tag == '008' and raw == '':
             continue
         # '00x' lines are expected to end with the 0x1e byte.
         assert raw[-1] == '\x1e'
         yield field_tag, raw[:-1]
Ejemplo n.º 4
0
def read_edition(loc, data):
    """Build an edition dict from the MARC record *data*.

    *loc* identifies the record and is used only in warnings and in the
    diagnostic print issued before an exception is re-raised.

    Returns ``{}`` after calling ``warn`` when the record does not contain
    exactly one '008' field (including none at all) or when that field is
    blank.  Otherwise returns a dict holding the values sliced out of the
    '008' field plus whatever each ``read_*`` helper contributes; helpers
    run in a fixed order and later ones overwrite keys set earlier.
    """
    fields = {}
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        fields.setdefault(tag, []).append(line)

    # A record with no '008' has no such key in fields; treat that as a
    # count of zero so it gets the warning below instead of a KeyError.
    fixed_fields = fields.get('008', [])
    if len(fixed_fields) != 1:
        warn("There should be a single '008' field, %s has %d."
             % (loc, len(fixed_fields)))
        return {}
    f = fixed_fields[0]
    if not f:
        warn("'008' field must not be blank in %s" % (loc))
        return {}

    fixed = str(f)
    edition = {}

    # Positions 7-10 are kept as the publish date only when fully numeric
    # and not the '0000' placeholder.
    publish_date = fixed[7:11]
    if publish_date.isdigit() and publish_date != '0000':
        edition["publish_date"] = publish_date
    try:
        if fixed[6] == 't':
            edition["copyright_date"] = fixed[11:15]
    except:
        # Report which record failed, then let the error propagate.
        # print(x) with one argument prints the same under Python 2.
        print(loc)
        raise
    publish_country = fixed[15:18]
    if publish_country not in ('|||', '   '):
        edition["publish_country"] = publish_country
    lang = fixed[35:38]
    if lang not in ('   ', '|||'):
        edition["languages"] = [{'key': '/l/' + lang}]

    edition.update(read_lccn(fields))
    try:
        edition.update(read_isbn(fields))
    except:
        # Report which record failed, then let the error propagate.
        print(loc)
        raise

    # Remaining readers, applied in the original order.
    readers = (
        read_oclc,
        read_lc_classification,
        read_dewey,
        read_authors,
        read_title,
        read_genres,
        read_subjects,
        read_pagination,
        read_series,
        read_work_titles,
        read_other_titles,
        read_edition_name,
        read_publisher,
        read_contributions,
        read_location,
        read_url,
        read_toc,
        read_notes,
        read_description,
    )
    for reader in readers:
        edition.update(reader(fields))
    return edition
Ejemplo n.º 5
0
 def read_fields(self, want):
     """Yield (tag, value) pairs for fields whose tag is in *want*.

     *want* is any iterable of tags; it is converted to a set first.
     For tags starting with '00' the value is the line with its final
     0x1e byte removed; a '008' field whose line is empty is skipped.
     For every other tag the value is ``BinaryDataField(line)``.
     """
     wanted = set(want)
     tag_lines = get_tag_lines(self.data, wanted)
     for field_tag, raw in handle_wrapped_lines(tag_lines):
         if field_tag not in wanted:
             continue
         if not field_tag.startswith('00'):
             yield field_tag, BinaryDataField(raw)
             continue
         # Empty 008 seen in the wild:
         # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
         if field_tag == '008' and raw == '':
             continue
         # '00x' lines are expected to end with the 0x1e byte.
         assert raw[-1] == '\x1e'
         yield field_tag, raw[:-1]
Ejemplo n.º 6
0
 def read_fields(self, want):
     """Yield (tag, value) pairs for fields whose tag is in *want*.

     *want* is any iterable of tags; it is converted to a set first.
     For tags starting with '00' the value is the line with its final
     0x1e byte removed; a '008' field whose line is empty is skipped.
     For every other tag the value is ``BinaryDataField(self, line)``.
     """
     wanted = set(want)
     # Not used further down in this method; the leader is still read and
     # indexed here exactly as before.
     marc8 = self.leader()[9] != 'a'
     tag_lines = get_tag_lines(self.data, wanted)
     for field_tag, raw in handle_wrapped_lines(tag_lines):
         if field_tag not in wanted:
             continue
         if not field_tag.startswith('00'):
             yield field_tag, BinaryDataField(self, raw)
             continue
         # Empty 008 seen in the wild:
         # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
         if field_tag == '008' and raw == '':
             continue
         # '00x' lines are expected to end with the 0x1e byte.
         assert raw[-1] == '\x1e'
         yield field_tag, raw[:-1]
Ejemplo n.º 7
0
            f = open(filename)
            for pos, loc, data in read_marc_file(full_part, f):
                rec_no +=1
                yield rec_no, pos, loc, data

# source_record,oclc,accompanying_material,translated_from,title

# Matches a value starting with the literal '(OCoLC)' and captures the first
# run of digits after it, leaving leading zeros out of the group.  Written as
# a raw string so the regex backslashes are not read as string escapes.
re_oclc = re.compile(r'^\(OCoLC\).*?0*(\d+)')

# Opened for writing at module level; no close() appears in the visible code.
# NOTE(review): presumably receives the per-record output -- confirm against
# the rest of the script.
out = open('/3/edward/updates', 'w')
# Tags passed to get_tag_lines() for each record in the loop below.
want = set(['001', '003', '035', '041', '245', '300'])
for rec_no, pos, loc, data in iter_marc():
    fields = {}
    rec = {}
    title_seen = False
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        if tag == '245':
            if title_seen:
                continue
            title_seen = True
            if line[1] == '0': # no prefix
                continue
            contents = get_contents(line, ['a', 'b'])
            if 'a' in contents:
                rec['title'] = ' '.join(x.strip(' /,;:') for x in contents['a'])
            elif 'b' in contents:
                rec['title'] = contents['b'][0].strip(' /,;:')
            if 'title' in rec and has_dot(rec['title']):
                rec['title'] = rec['title'][:-1]
            continue
        if tag == '300':
Ejemplo n.º 8
0
            for pos, loc, data in read_marc_file(full_part, f):
                rec_no += 1
                yield rec_no, pos, loc, data


# source_record,oclc,accompanying_material,translated_from,title

# Matches a value starting with the literal '(OCoLC)' and captures the first
# run of digits after it, leaving leading zeros out of the group.  Written as
# a raw string so the regex backslashes are not read as string escapes.
re_oclc = re.compile(r'^\(OCoLC\).*?0*(\d+)')

# Opened for writing at module level; no close() appears in the visible code.
# NOTE(review): presumably receives the per-record output -- confirm against
# the rest of the script.
out = open('/3/edward/updates', 'w')
# Tags passed to get_tag_lines() for each record in the loop below.
want = set(['001', '003', '035', '041', '245', '300'])
for rec_no, pos, loc, data in iter_marc():
    fields = {}
    rec = {}
    title_seen = False
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        if tag == '245':
            if title_seen:
                continue
            title_seen = True
            if line[1] == '0':  # no prefix
                continue
            contents = get_contents(line, ['a', 'b'])
            if 'a' in contents:
                rec['title'] = ' '.join(
                    x.strip(' /,;:') for x in contents['a'])
            elif 'b' in contents:
                rec['title'] = contents['b'][0].strip(' /,;:')
            if 'title' in rec and has_dot(rec['title']):
                rec['title'] = rec['title'][:-1]
            continue