# Fragment (function header not visible): load the MARC record for the
# Internet Archive item `ia`, trying the XML form first.
#   1. Build "<ia>_marc.xml", print the URL and fetch base + loc with
#      urlopen_keep_trying. An HTTP 404 raises NoMARCXML; any other HTTPError
#      is printed and re-raised.
#   2. Return read_xml.read_edition(f) if parsing succeeds. read_xml.BadXML
#      and expat ExpatError are deliberately swallowed so that control falls
#      through to the binary MARC fallback.
#   3. Re-fetch the XML URL with urllib2.urlopen and inspect the first 200
#      bytes; a "Page Not Found" title raises NoMARCXML.
#   4. Fetch "<ia>_meta.mrc"; a urllib2.URLError on this fetch is ignored.
# NOTE(review): when step 4 raises URLError, `f` stays bound to the earlier
# XML response - confirm that the code following this fragment expects that.
# NOTE(review): the statements below are collapsed onto one physical line and
# the line starts with '#', so as written Python treats all of it as a
# comment; the original line breaks and indentation must be restored.
# if there is a problem with the XML switch to the binary MARC xml_file = ia + "_marc.xml" loc = ia + "/" + xml_file try: print base + loc f = urlopen_keep_trying(base + loc) except urllib2.HTTPError, error: if error.code == 404: raise NoMARCXML else: print 'error:', error.code, error.msg raise assert f if f: try: return read_xml.read_edition(f) except read_xml.BadXML: pass except xml.parsers.expat.ExpatError: #print 'IA:', `ia` #print 'XML parse error:', base + loc pass print base + loc if '<title>Internet Archive: Page Not Found</title>' in urllib2.urlopen(base + loc).read(200): raise NoMARCXML url = base + ia + "/" + ia + "_meta.mrc" print url try: f = urlopen_keep_trying(url) except urllib2.URLError: pass
# Fragment of a loop body (it uses `continue`; the loop header and the start
# of the first branch are not visible). For the current `item` / `ia`:
#   - Report 'already loaded', bump load_count and skip to the next item;
#     presumably this sits under a condition that begins before this fragment.
#   - Skip likewise when ol.query finds a /type/edition whose source_records
#     contains 'ia:' + ia.
#   - Sanity asserts: item must not match re_census, must not contain
#     'passportapplicat', and full_rec must not have exactly one key. On
#     failure the item is printed and the AssertionError is re-raised.
#   - Parse <item>_marc.xml from the local scans tree, sharded by the first
#     two characters of item, with read_xml.read_edition.
#   - Skip records without 'full_title'. Records whose physical_format starts
#     with '[graphic' or '[cartograph' are printed; whether they are then
#     skipped cannot be determined from the collapsed text.
#   - Build index fields (skip if empty) and the candidate edition_pool.
# The fragment ends on a dangling `if` whose body is outside the visible text.
# NOTE(review): open(filename) is never explicitly closed, and the local name
# `format` shadows the builtin.
# NOTE(review): the statements below are collapsed onto one physical line; the
# original line breaks and indentation must be restored before this can run.
print item, 'already loaded' load_count += 1 continue if ol.query({'type': '/type/edition', 'source_records': 'ia:' + ia}): print 'already loaded' load_count += 1 continue try: assert not re_census.match(item) assert 'passportapplicat' not in item assert len(full_rec.keys()) != 1 except AssertionError: print item raise filename = '/2/edward/20century/scans/' + item[:2] + '/' + item + '/' + item + '_marc.xml' rec = read_xml.read_edition(open(filename)) if 'full_title' not in rec: print "full_title missing", item continue if 'physical_format' in rec: format = rec['physical_format'].lower() if format.startswith('[graphic') or format.startswith('[cartograph'): print item, format index_fields = make_index_fields(rec) if not index_fields: print "no index_fields" continue #print index_fields edition_pool = pool.build(index_fields) if not edition_pool or not any(v for v in edition_pool.itervalues()):
# Fragment (function header not visible): load the MARC record for the
# Internet Archive item `ia`, trying the XML form first.
#   1. Build "<ia>_marc.xml", print the URL and fetch base + loc with
#      urlopen_keep_trying. An HTTP 404 raises NoMARCXML; any other HTTPError
#      is printed and re-raised.
#   2. Return read_xml.read_edition(f) if parsing succeeds. On
#      read_xml.BadXML or expat ExpatError a short diagnostic is printed
#      ("read_xml BADXML" / "read_xml ExpatError") and control falls through.
#   3. Re-fetch the XML URL with urllib2.urlopen and inspect the first 200
#      bytes; a "Page Not Found" title raises NoMARCXML.
#   4. Build and print the "<ia>_meta.mrc" URL. The fragment stops at a
#      dangling `try:`, so the binary MARC fetch itself is not visible here.
# NOTE(review): the statements below are collapsed onto one physical line and
# the line starts with '#', so as written Python treats all of it as a
# comment; the original line breaks and indentation must be restored.
# if there is a problem with the XML switch to the binary MARC xml_file = ia + "_marc.xml" loc = ia + "/" + xml_file try: print base + loc f = urlopen_keep_trying(base + loc) except urllib2.HTTPError, error: if error.code == 404: raise NoMARCXML else: print 'error:', error.code, error.msg raise assert f if f: try: return read_xml.read_edition(f) except read_xml.BadXML: print "read_xml BADXML" pass except xml.parsers.expat.ExpatError: #print 'IA:', `ia` #print 'XML parse error:', base + loc print "read_xml ExpatError" pass print base + loc if '<title>Internet Archive: Page Not Found</title>' in urllib2.urlopen( base + loc).read(200): raise NoMARCXML url = base + ia + "/" + ia + "_meta.mrc" print url try:
# Fragment (function header not visible): load the MARC record for the
# Internet Archive item `ia`, preferring a local copy of the XML.
#   1. If xml_path + "<ia>_marc.xml" exists on disk, open it; otherwise fetch
#      base + loc with urlopen_keep_trying. An HTTP 404 raises NoMARCXML; any
#      other HTTPError is printed and re-raised.
#   2. Return the tuple (loc, read_xml.read_edition(f)) if parsing succeeds.
#      read_xml.BadXML and expat ExpatError are deliberately swallowed so that
#      control falls through to the binary MARC fallback.
#   3. Fetch the XML URL with urllib2.urlopen and inspect the first 200 bytes;
#      a "Page Not Found" title raises NoMARCXML. This request is made even
#      when the local file was used in step 1.
#   4. Fetch "<ia>_meta.mrc"; a urllib2.URLError on this fetch is ignored.
# The fragment ends on a dangling `if not f:` whose body is not visible.
# NOTE(review): when step 4 raises URLError, `f` stays bound to the handle
# opened in step 1, so the trailing `if not f:` tests that earlier handle -
# confirm this is intended.
# NOTE(review): the statements below are collapsed onto one physical line; the
# original line breaks and indentation must be restored before this can run.
xml_file = ia + "_marc.xml" loc = ia + "/" + xml_file if os.path.exists(xml_path + xml_file): f = open(xml_path + xml_file) else: try: f = urlopen_keep_trying(base + loc) except urllib2.HTTPError, error: if error.code == 404: raise NoMARCXML else: print 'error:', error.code, error.msg raise if f: try: return loc, read_xml.read_edition(f) except read_xml.BadXML: pass except xml.parsers.expat.ExpatError: #print 'IA:', `ia` #print 'XML parse error:', base + loc pass if '<title>Internet Archive: Page Not Found</title>' in urllib2.urlopen(base + loc).read(200): raise NoMARCXML url = base + ia + "/" + ia + "_meta.mrc" print url try: f = urlopen_keep_trying(url) except urllib2.URLError: pass if not f:
# Fragment of a loop body (it uses `continue`; the loop header and the start
# of the first branch are not visible). For the current `item` / `ia`:
#   - Bump load_count and skip to the next item; presumably this is the tail
#     of a branch that begins before this fragment.
#   - Skip likewise (printing 'already loaded') when ol.query finds a
#     /type/edition whose source_records contains 'ia:' + ia.
#   - Sanity asserts: item must not match re_census, must not contain
#     'passportapplicat', and full_rec must not have exactly one key. On
#     failure the item is printed and the AssertionError is re-raised.
#   - Parse <item>_marc.xml from the local scans tree, sharded by the first
#     two characters of item, with read_xml.read_edition.
#   - Skip records without 'full_title'. Records whose physical_format starts
#     with '[graphic' or '[cartograph' are printed; whether they are then
#     skipped cannot be determined from the collapsed text.
#   - Build index fields (skip if empty) and the candidate edition_pool.
# The fragment ends on a dangling `if` whose body is outside the visible text.
# NOTE(review): open(filename) is never explicitly closed, and the local name
# `format` shadows the builtin.
# NOTE(review): the statements below are collapsed onto one physical line; the
# original line breaks and indentation must be restored before this can run.
load_count += 1 continue if ol.query({'type': '/type/edition', 'source_records': 'ia:' + ia}): print 'already loaded' load_count += 1 continue try: assert not re_census.match(item) assert 'passportapplicat' not in item assert len(full_rec.keys()) != 1 except AssertionError: print item raise filename = '/2/edward/20century/scans/' + item[: 2] + '/' + item + '/' + item + '_marc.xml' rec = read_xml.read_edition(open(filename)) if 'full_title' not in rec: print "full_title missing", item continue if 'physical_format' in rec: format = rec['physical_format'].lower() if format.startswith('[graphic') or format.startswith('[cartograph'): print item, format index_fields = make_index_fields(rec) if not index_fields: print "no index_fields" continue #print index_fields edition_pool = pool.build(index_fields) if not edition_pool or not any(v for v in edition_pool.itervalues()):