def GET(self, ia):
    """Render an HTML page comparing the MARC records found for *ia*.

    The item's ``_metasource.xml`` on archive.org supplies the ``cclquery``,
    ``target`` and ``resultOffset`` values that are handed to ``get_marc``;
    the resulting record is shown under "From Z39.50".  The item's
    ``_marc.xml`` and ``_meta.mrc`` files are loaded on a best-effort basis
    and each gets its own section only when it could be fetched and parsed.

    Errors raised while fetching or parsing ``_metasource.xml`` are not
    caught here and propagate to the caller.

    :param ia: Internet Archive identifier supplied by the request.
    :return: the complete HTML document as a string.
    """
    # Function-scope import so this block does not rely on file-level imports.
    from html import escape

    def leader_line(record):
        # Use &nbsp; so the leader's spacing survives HTML whitespace
        # collapsing inside <code>.
        return 'leader: <code>' + record.leader.replace(' ', '&nbsp;') + '</code><br>'

    # ``ia`` comes from the request, so escape it before embedding it in markup.
    safe_ia = escape(ia, quote=True)
    page_head = (
        ' <html> <head>'
        ' <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
        ' <title>Marc lookup: %s</title> </head> <body>'
        ' <form action="/z39.50">Internet archive identifier:'
        ' <input name="ia" value="%s"><input value="go" type="submit"></form>'
        ' <h1>%s</h1> <ul>'
        ' <li><a href="http://www.archive.org/details/%s">Internet archive detail page</a>'
        ' <li><a href="http://openlibrary.org/show-records/ia:%s">View current IA MARC record</a>'
        ' </ul> '
    )
    parts = [page_head % ((safe_ia,) * 5)]

    download_prefix = 'http://www.archive.org/download/' + ia + '/' + ia
    marc_source = download_prefix + '_metasource.xml'
    marc_xml = download_prefix + '_marc.xml'
    marc_bin = download_prefix + '_meta.mrc'

    # Both archive.org copies are optional: any failure just hides the section.
    try:
        from_marc_xml = xml_to_html.html_record(urlopen(marc_xml).read())
    except Exception:
        from_marc_xml = None

    try:
        meta_mrc = urlopen(marc_bin).read()
        from_marc_bin = html_record(meta_mrc)
    except Exception:
        from_marc_bin = None

    root = etree.parse(urlopen(marc_source)).getroot()
    cclquery = root.find('cclquery').text
    target_name = root.find('target').text
    result_offset = root.find('resultOffset').text

    marc = get_marc(target_name, cclquery, result_offset)
    rec = html_record(marc)

    parts.append('<h2>From Z39.50</h2>')
    parts.append(leader_line(rec))
    parts.append(rec.html() + '<br>\n')

    if from_marc_xml:
        parts.append('<h2>From MARC XML on archive.org</h2>')
        parts.append(leader_line(from_marc_xml))
        parts.append(from_marc_xml.html() + '<br>\n')

    # Guard on the binary record itself (the original re-tested from_marc_xml,
    # which crashed or hid this section whenever only one of the two loaded).
    if from_marc_bin:
        parts.append('<h2>From MARC binary on archive.org</h2>')
        parts.append('record length: ' + repr(len(meta_mrc)) + ' bytes<br>')
        parts.append(leader_line(from_marc_bin))
        parts.append(from_marc_bin.html() + '<br>\n')

    parts.append('</body></html>')
    return ''.join(parts)
def GET(self, ia):
    """Render an HTML page showing the Z39.50 MARC record for *ia*.

    Parses the item's ``_metasource.xml`` from archive.org, reads the
    ``cclquery`` text and the ``id`` attribute of the ``target`` element,
    and passes both to ``get_marc``.  The returned record is wrapped in
    ``html_record`` and its leader and HTML rendering are appended to the page.

    Errors raised while fetching or parsing ``_metasource.xml`` are not
    caught here and propagate to the caller.

    :param ia: Internet Archive identifier supplied by the request.
    :return: the complete HTML document as a string.
    """
    # Function-scope import so this block does not rely on file-level imports.
    from html import escape

    # ``ia`` comes from the request, so escape it before embedding it in markup.
    safe_ia = escape(ia, quote=True)
    page_head = (
        ' <html> <head>'
        ' <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
        ' <title>Marc lookup: %s</title> </head> <body>'
        ' <form action="/z39.50">Internet archive identifier:'
        ' <input name="ia" value="%s"><input value="go" type="submit"></form>'
        ' <h1>%s</h1> <ul>'
        ' <li><a href="http://www.archive.org/details/%s">Internet archive detail page</a>'
        ' <li><a href="http://openlibrary.org/show-records/ia:%s">View current IA MARC record</a>'
        ' </ul> '
    )

    marc_source = 'http://www.archive.org/download/' + ia + '/' + ia + '_metasource.xml'
    root = etree.parse(marc_source).getroot()
    cclquery = root.find('cclquery').text
    target_id = root.find('target').attrib['id']

    rec = html_record(get_marc(target_id, cclquery))

    return ''.join([
        page_head % ((safe_ia,) * 5),
        'leader: ' + rec.leader + '<br>',
        rec.html(),
        '</body></html>',
    ])
class show_marc(app.view):
    """View that displays one MARC record stored inside a file on archive.org.

    The URL carries the file name followed by the record's byte offset and
    length, separated by colons.
    """

    # Raw string: the pattern contains regex escapes such as ``\d``.
    path = r"/show-records/(.*):(\d+):(\d+)"

    def GET(self, filename, offset, length):
        """Fetch the record at *offset* in *filename* and render it.

        Known bad or wrongly-cased file names are redirected to their
        canonical ``/show-records/`` URL first.  If the length stored in the
        first five bytes of the fetched record differs from *length*, the
        request is redirected to the URL carrying the stored length.

        :param filename: path of the MARC file below archive.org/download/.
        :param offset: byte offset of the record, as a string of digits.
        :param length: expected record length, as a string of digits.
        :return: the rendered "showmarc" template, or an ``ERROR...`` string
            when the download fails or the data does not start with a length.
        :raises web.seeother: for any of the redirects described above.
        """
        bad_meta = re_bad_meta_mrc.match(filename)
        if bad_meta:
            raise web.seeother('/show-records/ia:' + bad_meta.group(1))

        sanfranpl = re_lc_sanfranpl.match(filename)
        if sanfranpl:
            # archive.org is case-sensitive, so redirect to the mixed-case name.
            mixed_case = 'SanFranPL%s/SanFranPL%s.out:%s:%s' % (
                sanfranpl.group(1), sanfranpl.group(2), offset, length)
            raise web.seeother('/show-records/' + mixed_case)

        if filename == 'collingswoodlibrarymarcdump10-27-2008/collingswood.out':
            loc = 'CollingswoodLibraryMarcDump10-27-2008/Collingswood.out:%s:%s' % (
                offset, length)
            raise web.seeother('/show-records/' + loc)

        # offset and length are still strings here, as the join requires.
        loc = ':'.join(['marc', filename, offset, length])
        books = web.ctx.site.things({
            'type': '/type/edition',
            'source_records': loc,
        })

        offset = int(offset)
        length = int(length)

        r0, r1 = offset, offset + 100000
        url = 'http://www.archive.org/download/%s' % filename
        ureq = urllib2.Request(
            url,
            None,
            {'Range': 'bytes=%d-%d' % (r0, r1)},
        )
        try:
            result = urllib2.urlopen(ureq).read(100000)
        except urllib2.HTTPError as e:
            return "ERROR:" + str(e)

        # The response may be empty or not MARC at all; report that instead
        # of failing with an unhandled ValueError.
        try:
            len_in_rec = int(result[:5])
        except ValueError:
            return "ERROR reading MARC for " + filename

        if len_in_rec != length:
            raise web.seeother(
                '/show-records/%s:%d:%d' % (filename, offset, len_in_rec))

        from openlibrary.catalog.marc import html

        try:
            record = html.html_record(result[0:length])
        except ValueError:
            record = None

        return app.render_template(
            "showmarc", record, filename, offset, length, books)
def test_html_subfields():
    """Check ``html_record.html_subfields`` against a few raw field bodies.

    Each case pairs the raw bytes of a field with the markup the method is
    expected to return for it.
    """
    record = html_record("00053This is the leader.Now we are beyond the leader.")

    cases = (
        (b' \x1fa0123456789\x1e', '<b>$a</b>0123456789'),
        (b' end of wrapped\x1e', 'end of wrapped'),
        (b' \x1fa<whatever>\x1e', '<b>$a</b><whatever>'),
    )

    for raw_field, rendered in cases:
        actual = record.html_subfields(raw_field)
        assert actual == rendered
def test_html_line_marc8():
    """Check ``html_record.html_line`` for two plain-ASCII fields.

    The expected line wraps the tag in ``<large>`` and the rendered field
    body in ``<code>``.
    """
    record = html_record(b'00053This is the leader.Now we are beyond the leader.')

    cases = (
        ('020', b' \x1fa0123456789\x1e', ' <b>$a</b>0123456789'),
        ('520', b' end of wrapped\x1e', ' end of wrapped'),
    )

    for tag, raw_field, rendered in cases:
        wanted = '<large>' + tag + '</large> <code>' + rendered + '</code>'
        actual = record.html_line(tag, raw_field)
        assert actual == wanted
def GET(self, filename, offset, length):
    """Fetch the MARC record at *offset* in *filename* and render it.

    Known bad or wrongly-cased file names are redirected to their canonical
    ``/show-records/`` URL first.  If the length stored in the first five
    bytes of the fetched record differs from *length*, the request is
    redirected to the URL carrying the stored length.

    :param filename: path of the MARC file below archive.org/download/.
    :param offset: byte offset of the record, as a string of digits.
    :param length: expected record length, as a string of digits.
    :return: the rendered "showmarc" template (with ``v2`` set to True), or
        an ``ERROR...`` string when the download fails or the data does not
        start with a length.
    :raises web.seeother: for any of the redirects described above.
    """
    bad_meta = re_bad_meta_mrc.match(filename)
    if bad_meta:
        raise web.seeother('/show-records/ia:' + bad_meta.group(1))

    sanfranpl = re_lc_sanfranpl.match(filename)
    if sanfranpl:
        # archive.org is case-sensitive, so redirect to the mixed-case name.
        mixed_case = 'SanFranPL%s/SanFranPL%s.out:%s:%s' % (
            sanfranpl.group(1), sanfranpl.group(2), offset, length)
        raise web.seeother('/show-records/' + mixed_case)

    if filename == 'collingswoodlibrarymarcdump10-27-2008/collingswood.out':
        loc = 'CollingswoodLibraryMarcDump10-27-2008/Collingswood.out:%s:%s' % (
            offset, length)
        raise web.seeother('/show-records/' + loc)

    # offset and length are still strings here, as the join requires.
    loc = ':'.join(['marc', filename, offset, length])
    books = web.ctx.site.things({
        'type': '/type/edition',
        'source_records': loc,
    })

    offset = int(offset)
    length = int(length)

    r0, r1 = offset, offset + 100000
    url = 'http://www.archive.org/download/%s' % filename
    ureq = urllib2.Request(
        url,
        None,
        {'Range': 'bytes=%d-%d' % (r0, r1)},
    )
    try:
        result = urllib2.urlopen(ureq).read(100000)
    except urllib2.HTTPError as e:
        return "ERROR:" + str(e)

    # The response may be empty or not MARC at all; report that instead of
    # failing with an unhandled ValueError.
    try:
        len_in_rec = int(result[:5])
    except ValueError:
        return "ERROR reading MARC for " + filename

    if len_in_rec != length:
        raise web.seeother(
            '/show-records/%s:%d:%d' % (filename, offset, len_in_rec))

    from openlibrary.catalog.marc import html

    try:
        record = html.html_record(result[0:length])
    except ValueError:
        record = None

    template = app.render_template(
        "showmarc", record, filename, offset, length, books)
    template.v2 = True
    return template
def GET(self, filename, offset, length):
    """Fetch the MARC record at *offset* in *filename* and render it.

    Known bad or wrongly-cased file names are redirected to their canonical
    ``/show-records/`` URL first.  If the length stored in the first five
    bytes of the fetched record differs from *length*, the request is
    redirected to the URL carrying the stored length.

    :param filename: path of the MARC file below archive.org/download/.
    :param offset: byte offset of the record, as a string of digits.
    :param length: expected record length, as a string of digits.
    :return: the rendered "showmarc" template, or an ``ERROR...`` string when
        the download fails or the data does not start with a length.
    :raises web.seeother: for any of the redirects described above.
    """
    bad_meta = re_bad_meta_mrc.match(filename)
    if bad_meta:
        raise web.seeother('/show-records/ia:' + bad_meta.group(1))

    sanfranpl = re_lc_sanfranpl.match(filename)
    if sanfranpl:
        # archive.org is case-sensitive, so redirect to the mixed-case name.
        mixed_case = (
            f'SanFranPL{sanfranpl.group(1)}/SanFranPL{sanfranpl.group(2)}.out'
            f':{offset}:{length}'
        )
        raise web.seeother('/show-records/' + mixed_case)

    if filename == 'collingswoodlibrarymarcdump10-27-2008/collingswood.out':
        loc = f'CollingswoodLibraryMarcDump10-27-2008/Collingswood.out:{offset}:{length}'
        raise web.seeother('/show-records/' + loc)

    # offset and length are still strings here, as the join requires.
    loc = ':'.join(['marc', filename, offset, length])
    books = web.ctx.site.things({
        'type': '/type/edition',
        'source_records': loc,
    })

    offset = int(offset)
    length = int(length)

    r0, r1 = offset, offset + 100000
    url = 'https://archive.org/download/%s' % filename
    headers = {'Range': 'bytes=%d-%d' % (r0, r1)}

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        result = response.content[:100000]
    except requests.HTTPError as e:
        return "ERROR:" + str(e)

    # The response may be empty or not MARC at all; report that instead of
    # failing with an unhandled ValueError.
    try:
        len_in_rec = int(result[:5])
    except ValueError:
        return "ERROR reading MARC for " + filename

    if len_in_rec != length:
        raise web.seeother(
            '/show-records/%s:%d:%d' % (filename, offset, len_in_rec))

    from openlibrary.catalog.marc import html

    try:
        record = html.html_record(result[0:length])
    except ValueError:
        record = None

    return app.render_template(
        "showmarc", record, filename, offset, length, books)
def test_html_line_utf8():
    """Check ``html_record.html_line`` on a field holding UTF-8 encoded text.

    The record is built from a leader for which ``is_marc8`` must be False
    (presumably the ``a`` at offset 9 is what marks it as Unicode; confirm
    against ``html_record``).  The 245 field bytes are then expected to come
    back as the corresponding Unicode text with ``<b>$x</b>`` subfield markers.
    """
    samples = [
        ('245',
         (b'10\x1faDbu ma la \xca\xbejug pa\xca\xbei kar t\xcc\xa3i\xcc\x84k '
          b':\x1fbDwags-brgyud grub pa\xca\xbei s\xcc\x81in\xcc\x87 rta /\x1f'
          b'cKarma-pa Mi-bskyod-rdo-rje.\x1e'),
         (u'10 <b>$a</b>Dbu ma la \u02bejug pa\u02bei kar \u1e6d\u012bk :<b>'
          u'$b</b>Dwags-brgyud grub pa\u02bei \u015bi\u1e45 rta /<b>$c</b>Ka'
          u'rma-pa Mi-bskyod-rdo-rje.')),
    ]

    hr = html_record(b'00053Thisais the leader.Now we are beyond the leader.')
    # Identity check instead of ``== False``: the flag must be the boolean itself.
    assert hr.is_marc8 is False

    for tag, input_, output in samples:
        expect = '<large>%s</large> <code>%s</code>' % (tag, output)
        assert hr.html_line(tag, input_) == expect
def GET(self, ia):
    """Show the MARC record stored as ``<ia>_meta.mrc`` on archive.org.

    When the MARC file is missing (HTTP 404) the item's ``_meta.xml`` is
    requested instead and, if that succeeds, the visitor is redirected to
    the item's archive.org details page.

    :param ia: Internet Archive identifier taken from the URL.
    :return: the rendered "showia" template, or an ``ERROR...`` string when a
        download fails or the data cannot be read as a MARC record.
    :raises web.seeother: when the item has no MARC file.
    """
    marc_missing = False
    url = f'https://archive.org/download/{ia}/{ia}_meta.mrc'
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.content
    except requests.HTTPError as e:
        if e.response.status_code != 404:
            return "ERROR:" + str(e)
        marc_missing = True

    if marc_missing:
        # No MARC record: check that the item's metadata file is reachable,
        # then hand the visitor over to the details page.
        url = f'https://archive.org/download/{ia}/{ia}_meta.xml'
        try:
            requests.get(url).raise_for_status()
        except requests.HTTPError as e:
            return "ERROR:" + str(e)
        raise web.seeother('https://archive.org/details/' + ia)

    books = web.ctx.site.things({
        'type': '/type/edition',
        'source_records': 'ia:' + ia,
    }) or web.ctx.site.things({
        'type': '/type/edition',
        'ocaid': ia,
    })

    from openlibrary.catalog.marc import html

    read_error = "ERROR reading MARC for " + ia

    # The first five bytes of the record hold its total length.
    try:
        leader_len = int(data[:5])
    except ValueError:
        return read_error

    if len(data) != leader_len:
        # Length mismatch: retry after a UTF-8 -> raw_unicode_escape round trip.
        try:
            data = data.decode('utf-8').encode('raw_unicode_escape')
        except UnicodeError:
            return read_error
        # Explicit check rather than ``assert``, which is stripped under -O.
        # The leading ASCII digits are unchanged by the round trip, so
        # leader_len is still the length stated in the data.
        if len(data) != leader_len:
            return read_error

    try:
        record = html.html_record(data)
    except ValueError:
        record = None

    return app.render_template("showia", ia, record, books)
def test_html_line():
    """Check ``html_record.html_line`` against three sample fields.

    Every expected line wraps the tag in ``<large>`` and the rendered field
    body in ``<code>``; the 245 sample carries escaped multi-byte sequences
    that are expected to come back as the matching Unicode characters.
    """
    record = html_record("00053This is the leader.Now we are beyond the leader.")

    title_raw = (
        '10\x1faDbu ma la \xca\xbejug pa\xca\xbei kar t\xcc\xa3i\xcc\x84k '
        ':\x1fbDwags-brgyud grub pa\xca\xbei s\xcc\x81in\xcc\x87 rta /\x1f'
        'cKarma-pa Mi-bskyod-rdo-rje.\x1e'
    )
    title_rendered = (
        u'10 <b>$a</b>Dbu ma la \u02bejug pa\u02bei kar \u1e6d\u012bk :<b>'
        u'$b</b>Dwags-brgyud grub pa\u02bei \u015bi\u1e45 rta /<b>$c</b>Ka'
        u'rma-pa Mi-bskyod-rdo-rje.'
    )

    cases = (
        ('020', ' \x1fa0123456789\x1e', ' <b>$a</b>0123456789'),
        ('520', ' end of wrapped\x1e', ' end of wrapped'),
        ('245', title_raw, title_rendered),
    )

    for tag, raw_field, rendered in cases:
        wanted = '<large>' + tag + '</large> <code>' + rendered + '</code>'
        actual = record.html_line(tag, raw_field)
        assert actual == wanted
def GET(self, ia):
    """Show the MARC record stored as ``<ia>_meta.mrc`` on archive.org.

    When the MARC file is missing (HTTP 404) the item's ``_meta.xml`` is
    fetched instead and, if that succeeds, the visitor is redirected to the
    item's archive.org details page.

    :param ia: Internet Archive identifier taken from the URL.
    :return: the rendered "showia" template (with ``v2`` set to True), or an
        ``ERROR...`` string when a download fails or the data cannot be read
        as a MARC record.
    :raises web.seeother: when the item has no MARC file.
    """
    marc_missing = False
    url = 'http://www.archive.org/download/%s/%s_meta.mrc' % (ia, ia)
    try:
        data = urllib.request.urlopen(url).read()
    except urllib.error.HTTPError as e:
        if e.code != 404:
            return "ERROR:" + str(e)
        marc_missing = True

    if marc_missing:
        # No MARC record: check that the item's metadata file is reachable,
        # then hand the visitor over to the details page.
        url = 'http://www.archive.org/download/%s/%s_meta.xml' % (ia, ia)
        try:
            urllib.request.urlopen(url).read()
        except urllib.error.HTTPError as e:
            return "ERROR:" + str(e)
        raise web.seeother('http://www.archive.org/details/' + ia)

    books = web.ctx.site.things({
        'type': '/type/edition',
        'source_records': 'ia:' + ia,
    }) or web.ctx.site.things({
        'type': '/type/edition',
        'ocaid': ia,
    })

    from openlibrary.catalog.marc import html

    read_error = "ERROR reading MARC for " + ia

    # The first five bytes of the record hold its total length.
    try:
        leader_len = int(data[:5])
    except ValueError:
        return read_error

    if len(data) != leader_len:
        # Length mismatch: retry after a UTF-8 -> raw_unicode_escape round trip.
        try:
            data = data.decode('utf-8').encode('raw_unicode_escape')
        except UnicodeError:
            return read_error
        # Explicit check rather than ``assert``, which is stripped under -O.
        # The leading ASCII digits are unchanged by the round trip, so
        # leader_len is still the length stated in the data.
        if len(data) != leader_len:
            return read_error

    try:
        record = html.html_record(data)
    except ValueError:
        record = None

    template = app.render_template("showia", ia, record, books)
    template.v2 = True
    return template
def GET(self, ia):
    """Show the MARC record stored as ``<ia>_meta.mrc`` on archive.org.

    When the MARC file is missing (HTTP 404) the item's ``_meta.xml`` is
    fetched instead and, if that succeeds, the visitor is redirected to the
    item's archive.org details page.

    :param ia: Internet Archive identifier taken from the URL.
    :return: the rendered "showia" template (with ``v2`` set to True), or an
        ``ERROR...`` string when a download fails or the data cannot be read
        as a MARC record.
    :raises web.seeother: when the item has no MARC file.
    """
    marc_missing = False
    url = 'http://www.archive.org/download/%s/%s_meta.mrc' % (ia, ia)
    try:
        data = urllib2.urlopen(url).read()
    except urllib2.HTTPError as e:
        if e.code != 404:
            return "ERROR:" + str(e)
        marc_missing = True

    if marc_missing:
        # No MARC record: check that the item's metadata file is reachable,
        # then hand the visitor over to the details page.
        url = 'http://www.archive.org/download/%s/%s_meta.xml' % (ia, ia)
        try:
            urllib2.urlopen(url).read()
        except urllib2.HTTPError as e:
            return "ERROR:" + str(e)
        raise web.seeother('http://www.archive.org/details/' + ia)

    books = web.ctx.site.things({
        'type': '/type/edition',
        'source_records': 'ia:' + ia,
    }) or web.ctx.site.things({
        'type': '/type/edition',
        'ocaid': ia,
    })

    from openlibrary.catalog.marc import html

    read_error = "ERROR reading MARC for " + ia

    # The first five bytes of the record hold its total length.
    try:
        leader_len = int(data[:5])
    except ValueError:
        return read_error

    if len(data) != leader_len:
        # Length mismatch: retry after a UTF-8 -> raw_unicode_escape round trip.
        try:
            data = data.decode('utf-8').encode('raw_unicode_escape')
        except UnicodeError:
            return read_error
        # Explicit check rather than ``assert``, which is stripped under -O.
        # The leading ASCII digits are unchanged by the round trip, so
        # leader_len is still the length stated in the data.
        if len(data) != leader_len:
            return read_error

    try:
        record = html.html_record(data)
    except ValueError:
        record = None

    template = app.render_template("showia", ia, record, books)
    template.v2 = True
    return template
'ocaid': ia, }) from openlibrary.catalog.marc import html try: leader_len = int(data[:5]) except ValueError: return "ERROR reading MARC for " + ia if len(data) != leader_len: data = data.decode('utf-8').encode('raw_unicode_escape') assert len(data) == int(data[:5]) try: record = html.html_record(data) except ValueError: record = None return app.render_template("showia", ia, record, books) class show_amazon(app.view): path = "/show-records/amazon:(.*)" def GET(self, asin): return app.render_template("showamazon", asin) re_bad_meta_mrc = re.compile('^([^/]+)_meta\.mrc$') re_lc_sanfranpl = re.compile('^sanfranpl(\d+)/sanfranpl(\d+)\.out')
books = web.ctx.site.things({ 'type': '/type/edition', 'source_records': 'ia:' + ia, }) or web.ctx.site.things({ 'type': '/type/edition', 'ocaid': ia, }) from openlibrary.catalog.marc import html if len(data) != int(data[:5]): data = data.decode('utf-8').encode('raw_unicode_escape') assert len(data) == int(data[:5]) try: record = html.html_record(data) except ValueError: record = None return render.showia(ia, record, books) class show_amazon(delegate.page): path = "/show-records/amazon:(.*)" def GET(self, asin): return render.showamazon(asin) re_bad_meta_mrc = re.compile('^([^/]+)_meta\.mrc$') re_lc_sanfranpl = re.compile('^sanfranpl(\d+)/sanfranpl(\d+)\.out') class show_marc(delegate.page):