def find_title(item_id):
    """Look up the title of an archive.org item via its scandata XML.

    Returns the title string, or None when the item's host cannot be
    resolved, the scandata is missing or invalid, the host is
    unreachable, or the page-image zip is absent (403/404).
    """
    (ia_host, ia_path) = find_item(item_id)
    if not ia_host:
        return
    url = 'http://' + ia_host + ia_path + "/" + item_id + "_scandata.xml"
    scandata = None
    try:
        scandata = urlopen_keep_trying(url).read()
    except Exception:
        # Best-effort fetch: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception. Any
        # failure falls through to the zipview fallback below.
        pass
    if not scandata or '<book>' not in scandata[:100]:
        # Fallback: read scandata.xml out of scandata.zip via zipview.php.
        url = "http://" + ia_host + "/zipview.php?zip=" + ia_path + "/scandata.zip&file=scandata.xml"
        scandata = urlopen_keep_trying(url).read()
    if not scandata or '<book>' not in scandata:
        return
    # Google-scanned items store page images as TIFF, others as JPEG2000.
    zip_type = 'tif' if item_id.endswith('goog') else 'jp2'
    try:
        status = zip_test(ia_host, ia_path, item_id, zip_type)
    except socket.error:
        # Host unreachable; remember it so later lookups can skip it.
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        # Image zip missing or forbidden; treat the item as not found.
        return
    (cover, title) = parse_scandata_xml(scandata)
    return title
def load_xml(ia):
    """Fetch the MARC XML record for archive.org item *ia* and wrap it.

    Returns a MarcXml built from the document root; when the document
    is a MARC <collection>, its first child record is used instead.
    """
    url = '%s%s/%s_marc.xml' % (archive_url, ia, ia)
    response = urlopen_keep_trying(url)
    root = etree.parse(response).getroot()
    # Unwrap a <collection> wrapper down to the single record it holds.
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
def load_binary(ia):
    """Fetch the binary MARC (.mrc) record for archive.org item *ia*.

    A MARC record starts with a 5-digit length field; when the
    downloaded payload disagrees, reverse a mangled UTF-8 round trip
    and assert the length matches afterwards.
    """
    url = archive_url + ia + "/" + ia + "_meta.mrc"
    response = urlopen_keep_trying(url)
    data = response.read()
    # archive.org answers missing items with an HTML 404 page.
    assert "<title>Internet Archive: Page Not Found</title>" not in data[:200]
    if int(data[:5]) != len(data):
        # Undo a spurious UTF-8 decode that inflated the byte count.
        data = data.decode("utf-8").encode("raw_unicode_escape")
        assert int(data[:5]) == len(data)
    return MarcBinary(data)
def load_binary(ia):
    """Retrieve and validate the raw MARC record for item *ia*.

    The leading 5 bytes of a MARC record encode its total length; a
    mismatch is repaired by undoing an accidental UTF-8 decode, then
    re-checked before the record is wrapped.
    """
    url = '%s%s/%s_meta.mrc' % (archive_url, ia, ia)
    data = urlopen_keep_trying(url).read()
    # Guard against archive.org's HTML "Page Not Found" response.
    assert '<title>Internet Archive: Page Not Found</title>' not in data[:200]
    declared = int(data[:5])
    if declared != len(data):
        data = data.decode('utf-8').encode('raw_unicode_escape')
        assert int(data[:5]) == len(data)
    return MarcBinary(data)
def load_binary(ia, host, path):
    """Fetch the MARC record for *ia* directly from datanode *host*/*path*."""
    url = ''.join(['http://', host, path, '/', ia, '_meta.mrc'])
    print(url)
    response = urlopen_keep_trying(url)
    data = response.read()
    # A missing item comes back as an HTML error page, not MARC data.
    assert '<title>Internet Archive: Page Not Found</title>' not in data[:200]
    if len(data) != int(data[:5]):
        # Recover the original bytes after a mistaken UTF-8 decode.
        data = data.decode('utf-8').encode('raw_unicode_escape')
        assert len(data) == int(data[:5])
    return MarcBinary(data)
def load_xml(ia):
    """Fetch and parse the MARC XML record for archive.org item *ia*.

    Returns a MarcXml; a MARC <collection> wrapper is unwrapped to its
    first record element.

    NOTE(review): the original had unreachable statements after the
    return (`edition = read_edition(rec) ...`) referencing an undefined
    name `rec`; that dead code has been removed.
    """
    url = archive_url + ia + "/" + ia + "_marc.xml"
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == "{http://www.loc.gov/MARC21/slim}collection":
        root = root[0]
    return MarcXml(root)
def load_binary(ia):
    """Fetch the binary MARC record for item *ia*.

    Returns a MarcBinary, or None when the record's 5-digit length
    field still disagrees with the payload after repairing a bad UTF-8
    round trip (soft failure instead of an assertion).
    """
    url = '%s%s/%s_meta.mrc' % (archive_url, ia, ia)
    response = urlopen_keep_trying(url)
    data = response.content
    # Reject archive.org's HTML "Page Not Found" response early.
    assert '<title>Internet Archive: Page Not Found</title>' not in data[:200]
    if int(data[:5]) != len(data):
        data = data.decode('utf-8').encode('raw_unicode_escape')
        if int(data[:5]) != len(data):
            # Still inconsistent — give up on this record.
            return
    return MarcBinary(data)
def load_xml(ia, host, path):
    """Fetch MARC XML for *ia* from a specific datanode *host*/*path*.

    Returns a MarcXml built from the record root; a MARC <collection>
    wrapper is unwrapped to its first record element.

    NOTE(review): removed unreachable code after the return
    (`edition = read_edition(rec) ...`) that referenced an undefined
    name `rec`.
    """
    url = 'http://' + host + path + '/' + ia + '_marc.xml'
    print(url)
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
def load_xml(ia, host, path):
    """Fetch MARC XML for *ia* from datanode *host*/*path* and wrap it.

    NOTE(review): removed unreachable code after the return that
    referenced an undefined name `rec`; converted the Python-2-only
    `print url` statement to a single-argument print() call, which
    produces identical output on Python 2 and is valid on Python 3.
    """
    url = 'http://' + host + path + '/' + ia + '_marc.xml'
    print(url)
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    # Unwrap a MARC <collection> element to its first record.
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
def load(loc, ia):
    """Fetch the edition record at archive path *loc*, parse it, store it.

    Skips (returns None) when parsing hits a bad subtag or the parsed
    record has no title; otherwise tags the edition with its ocaid and
    writes it via write_edition.
    """
    # Single formatted string keeps the output identical on Python 2
    # and 3, unlike the original Python-2-only `print "load", loc, ia`
    # statement (a syntax error under Python 3).
    print("load %s %s" % (loc, ia))
    url = archive_url + loc
    f = urlopen_keep_trying(url)
    try:
        edition = parse_xml.parse(f)
    except parse_xml.BadSubtag:
        # Malformed record — skip it rather than abort the batch.
        return
    if 'title' not in edition:
        return
    edition['ocaid'] = ia
    write_edition(ia, edition)
def load(loc, ia):
    """Fetch, parse, and persist the edition record at archive path *loc*.

    Records that fail to parse (assertion failure, bad subtag, missing
    key) or that lack a title are skipped silently; successful records
    get their ocaid set and are written under the "ia:<id>" key.
    """
    print("load", loc, ia)
    url = archive_url + loc
    f = urlopen_keep_trying(url)
    try:
        edition = parse_xml.parse(f)
    except (AssertionError, parse_xml.BadSubtag, KeyError):
        # Any of these indicates a malformed record — skip it.
        return
    if 'title' not in edition:
        return
    edition['ocaid'] = ia
    write_edition("ia:" + ia, edition)