def get_marc_ia_data(ia, host=None, path=None):
    """Fetch the raw contents of an item's ``<ia>_meta.mrc`` file.

    :param ia: item identifier; surrounding whitespace is stripped.
    :param host: optional host name. Used only together with ``path``.
    :param path: optional path on ``host``. When either ``host`` or
        ``path`` is missing or empty, the public
        ``http://www.archive.org/download/`` URL is used instead.
    :return: whatever ``read()`` on the opened URL returns, or ``None`` when
        ``urlopen_keep_trying`` hands back a falsy value.
    """
    ia = ia.strip()  # identifiers may arrive padded, e.g. 'cyclopdiaofedu00kidd '
    ending = 'meta.mrc'
    if host and path:
        url = 'http://%s%s/%s_%s' % (host, path, ia, ending)
    else:
        url = 'http://www.archive.org/download/' + ia + '/' + ia + '_' + ending

    f = urlopen_keep_trying(url)
    if not f:
        return None
    try:
        return f.read()
    finally:
        # Release the connection even when read() raises.
        f.close()
def get_marc_ia(ia):
    """Download ``<ia>_meta.mrc`` from ``base`` and return the raw record.

    NOTE(review): a second definition of ``get_marc_ia`` appears later in this
    file and rebinds the name, so this copy is shadowed at import time.

    :param ia: item identifier; surrounding whitespace is stripped.
    :return: the record data as a byte string whose length equals the number
        declared in its first five characters.
    :raises AssertionError: if the length still does not match after the
        re-encoding attempt, or if the data contains the text
        'Internet Archive: Error'.
    :raises ValueError: if the first five characters are not an integer.
    """
    ia = ia.strip()  # identifiers may arrive padded, e.g. 'cyclopdiaofedu00kidd '
    url = base + ia + "/" + ia + "_meta.mrc"

    # NOTE(review): get_marc_ia_data treats a falsy result from
    # urlopen_keep_trying as "no data"; here a falsy result still surfaces as
    # an AttributeError, exactly as before.
    response = urlopen_keep_trying(url)
    try:
        data = response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()

    # The first five characters are parsed as the declared record length.
    length = int(data[0:5])
    if len(data) != length:
        # Presumably the record was served UTF-8 encoded; collapse it back to
        # one byte per character so it matches the declared length --
        # TODO confirm against real records.
        data = data.decode('utf-8').encode('raw_unicode_escape')
    assert len(data) == length
    assert 'Internet Archive: Error' not in data
    # Single-argument form: same output as the old print statement, and it
    # parses under both Python 2 and Python 3.
    print('leader: %s' % data[:24])
    # The former trailing "return fast_parse.read_edition(...)" could never
    # execute after this return and has been dropped.
    return data
def get_marc_ia(ia):
    """Download ``<ia>_meta.mrc`` from ``base`` and return the raw record.

    NOTE(review): this re-defines a function of the same name that appears
    earlier in the file; this later definition replaces the earlier binding.

    :param ia: item identifier; surrounding whitespace is stripped.
    :return: the record data as a byte string whose length equals the number
        declared in its first five characters.
    :raises AssertionError: if the length still does not match after the
        re-encoding attempt, or if the data contains the text
        'Internet Archive: Error'.
    :raises ValueError: if the first five characters are not an integer.
    """
    ia = ia.strip()  # identifiers may arrive padded, e.g. 'cyclopdiaofedu00kidd '
    url = base + ia + "/" + ia + "_meta.mrc"

    # A falsy result from urlopen_keep_trying still surfaces as an
    # AttributeError here, exactly as before.
    handle = urlopen_keep_trying(url)
    try:
        data = handle.read()
    finally:
        # Release the connection even when read() raises.
        handle.close()

    # The first five characters are parsed as the declared record length.
    declared_length = int(data[0:5])
    if len(data) != declared_length:
        # Presumably the record was served UTF-8 encoded; collapse it back to
        # one byte per character so it matches the declared length --
        # TODO confirm against real records.
        data = data.decode('utf-8').encode('raw_unicode_escape')
    assert len(data) == declared_length
    assert 'Internet Archive: Error' not in data
    # Single-argument form: same output as the old print statement, and it
    # parses under both Python 2 and Python 3.
    print('leader: %s' % data[:24])
    # The former trailing "return fast_parse.read_edition(...)" was
    # unreachable after this return and has been removed.
    return data
def get_ia(ia): ia = ia.strip() # 'cyclopdiaofedu00kidd ' # read MARC record of scanned book from archive.org # try the XML first because it has better character encoding # if there is a problem with the XML switch to the binary MARC xml_file = ia + "_marc.xml" loc = ia + "/" + xml_file try: print base + loc f = urlopen_keep_trying(base + loc) except urllib2.HTTPError, error: if error.code == 404: raise NoMARCXML else: print 'error:', error.code, error.msg raise