def test_incorrect_length_marcs(self, monkeypatch):
    """If a Binary MARC has a different length than stated in the MARC leader,
    it is probably due to bad character conversions."""
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    monkeypatch.setattr(
        ia, 'get_metadata',
        lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    # Each item below has known character-conversion damage that makes the
    # record body disagree with the length declared in its MARC leader.
    bad_marcs = [
        # Binary MARC reports len=734, but actually=742. Badly converted
        # unicode -- original unicode converted as if it were MARC8.
        '1733mmoiresdel00vill',
        # Same as zweibchersatir01horauoft: binary representation of
        # unicode interpreted as unicode codepoints.
        'dasrmischepriv00rein',
        # C3A2 in this file should be single byte MARC8 combining
        # acute 0xE2.
        'histoirereligieu05cr',
        # Original MARC8 0xE2 interpreted as u00E2 => \xC3\xA2,
        # leader still MARC8.
        'lesabndioeinas00sche',
        # Junk / unexpected character at end of publishers in field 260.
        'poganucpeoplethe00stowuoft',
        # Possible extra chars at end of field 505?
        'scrapbooksofmoun03tupp',
        # Leader is unicode; chars '\xc3\x83\xc2\xbc' in mrc should be
        # '\xc3\xbc' -- original '\xc3\xb3' became '\u00c3\u00b3'.
        'zweibchersatir01horauoft',
    ]
    for bad_marc in bad_marcs:
        # TODO: get_marc_record_from_ia() currently returns None for
        # these; MarcBinary should raise a BadMarc exception, or similar.
        assert get_ia.get_marc_record_from_ia(bad_marc) is None
def test_no_marc_xml(self):
    """When no XML MARC is listed in _filenames, the Binary MARC should be
    fetched."""
    self.m.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    self.m.setattr(ia, 'get_metadata',
                   lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    # Items whose metadata advertises only a *_meta.mrc (binary) file.
    bin_items = [
        '0descriptionofta1682unit',
        '13dipolarcycload00burk',
        'bijouorannualofl1828cole',
        'cu31924091184469',
        'diebrokeradical400poll',
        'engineercorpsofh00sher',
        'flatlandromanceo00abbouoft',
        'henrywardbeecher00robauoft',
        'lincolncentenary00horn',
        'livrodostermosh00bragoog',
        'mytwocountries1954asto',
        'onquietcomedyint00brid',
        'secretcodeofsucc00stjo',
        'thewilliamsrecord_vol29b',
        'warofrebellionco1473unit',
    ]
    for item in bin_items:
        result = get_ia.get_marc_record_from_ia(item)
        self.assertIsInstance(
            result, MarcBinary,
            "%s: expected instanceof MarcBinary, got %s" % (item, type(result)))
def test_no_marc_xml(self, monkeypatch):
    """When no XML MARC is listed in _filenames, the Binary MARC should be
    fetched."""
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    monkeypatch.setattr(
        ia, 'get_metadata',
        lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    # Items whose metadata advertises only a *_meta.mrc (binary) file.
    bin_items = [
        '0descriptionofta1682unit',
        '13dipolarcycload00burk',
        'bijouorannualofl1828cole',
        'cu31924091184469',
        'diebrokeradical400poll',
        'engineercorpsofh00sher',
        'flatlandromanceo00abbouoft',
        'henrywardbeecher00robauoft',
        'lincolncentenary00horn',
        'livrodostermosh00bragoog',
        'mytwocountries1954asto',
        'onquietcomedyint00brid',
        'secretcodeofsucc00stjo',
        'thewilliamsrecord_vol29b',
        'warofrebellionco1473unit',
    ]
    for item in bin_items:
        result = get_ia.get_marc_record_from_ia(item)
        assert isinstance(result, MarcBinary), (
            "%s: expected instanceof MarcBinary, got %s" % (item, type(result)))
def test_incorrect_length_marcs(self, bad_marc, monkeypatch):
    """If a Binary MARC has a different length than stated in the MARC leader,
    it is probably due to bad character conversions.

    :param str bad_marc: parametrized IA identifier of a known-bad record
    """
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    monkeypatch.setattr(ia, 'get_metadata',
                        lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    # FIX: dropped the unused `result =` binding -- the return value is
    # irrelevant here; the assertion is that BadLength is raised.
    with pytest.raises(BadLength):
        get_ia.get_marc_record_from_ia(bad_marc)
def test_get_marc_record_from_ia(self):
    """Tests the method returning MARC records from IA used by the
    import API. It should return an XML MARC if one exists."""
    self.m.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_xml)
    self.m.setattr(ia, 'get_metadata',
                   lambda itemid: {'_filenames': [itemid + '_marc.xml',
                                                  itemid + '_meta.mrc']})
    # Items whose metadata lists a *_marc.xml file; XML should win over
    # the binary *_meta.mrc.
    xml_items = [
        '1733mmoiresdel00vill',  # no <?xml
        '0descriptionofta1682unit',  # has <?xml
        'cu31924091184469',  # is <collection>
        #'1893manualofharm00jadauoft',  # 0 byte xml file
        '00schlgoog',
        '13dipolarcycload00burk',
        '39002054008678.yale.edu',
        'abhandlungender01ggoog',
        'bijouorannualofl1828cole',
        'dasrmischepriv00rein',
        'diebrokeradical400poll',
        'engineercorpsofh00sher',
        'flatlandromanceo00abbouoft',
        'lesabndioeinas00sche',
        'lincolncentenary00horn',
        'livrodostermosh00bragoog',
        'mytwocountries1954asto',
        'nybc200247',
        'onquietcomedyint00brid',
        'scrapbooksofmoun03tupp',
        'secretcodeofsucc00stjo',
        'soilsurveyrepor00statgoog',
        'warofrebellionco1473unit',
        'zweibchersatir01horauoft',
    ]
    for item in xml_items:
        result = get_ia.get_marc_record_from_ia(item)
        self.assertIsInstance(
            result, MarcXml,
            "%s: expected instanceof MarcXml, got %s" % (item, type(result)))
def test_get_marc_record_from_ia(self, item, monkeypatch):
    """Tests the method returning MARC records from IA used by the
    import API. It should return an XML MARC if one exists."""
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_xml)
    # Advertise both an XML and a binary MARC; XML should be preferred.
    monkeypatch.setattr(
        ia, 'get_metadata',
        lambda itemid: {'_filenames': [itemid + '_marc.xml',
                                       itemid + '_meta.mrc']})
    result = get_ia.get_marc_record_from_ia(item)
    assert isinstance(result, MarcXml), (
        "%s: expected instanceof MarcXml, got %s" % (item, type(result)))
def test_get_marc_record_from_ia(self, item, monkeypatch):
    """Tests the method returning MARC records from IA used by the
    import API. It should return an XML MARC if one exists."""
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_xml)
    # Advertise both an XML and a binary MARC; XML should be preferred.
    monkeypatch.setattr(
        ia, 'get_metadata',
        lambda itemid: {'_filenames': [itemid + '_marc.xml',
                                       itemid + '_meta.mrc']})
    result = get_ia.get_marc_record_from_ia(item)
    assert isinstance(result, MarcXml), \
        f"{item}: expected instanceof MarcXml, got {type(result)}"
def ia_import(cls, identifier, require_marc=True, force_import=False):
    """
    Performs logic to fetch archive.org item + metadata, produces a data
    dict, then loads into Open Library

    :param str identifier: archive.org ocaid
    :param bool require_marc: require archive.org item have MARC record?
    :param bool force_import: force import of this record
    :rtype: dict
    :returns: the data of the imported book or raises BookImportError
    """
    # Case 1 - Is this a valid Archive.org item?
    metadata = ia.get_metadata(identifier)
    if not metadata:
        raise BookImportError('invalid-ia-identifier',
                              '%s not found' % identifier)

    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get('mediatype') == 'texts' and metadata.get('openlibrary'):
        edition_data = cls.get_ia_record(metadata)
        edition_data['openlibrary'] = metadata['openlibrary']
        edition_data = cls.populate_edition_data(edition_data, identifier)
        return cls.load_book(edition_data)

    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata)
    if status != 'ok' and not force_import:
        raise BookImportError(status, 'Prohibited Item %s' % identifier)

    # Case 4 - Does this item have a marc record?
    marc_record = get_marc_record_from_ia(identifier)
    if require_marc and not marc_record:
        raise BookImportError('no-marc-record')

    if marc_record:
        # Non-book MARC records are rejected unless the caller forces it.
        if not force_import:
            raise_non_book_marc(marc_record)
        try:
            edition_data = read_edition(marc_record)
        except MarcException as e:
            logger.error('failed to read from MARC record %s: %s',
                         identifier, str(e))
            raise BookImportError('invalid-marc-record')
    else:
        # No MARC (and not required): fall back to the raw IA metadata.
        try:
            edition_data = cls.get_ia_record(metadata)
        except KeyError:
            raise BookImportError('invalid-ia-metadata')

    # Add IA specific fields: ocaid, source_records, and cover
    edition_data = cls.populate_edition_data(edition_data, identifier)
    return cls.load_book(edition_data)
def test_no_marc_xml(self, item, monkeypatch):
    """When no XML MARC is listed in _filenames, the Binary MARC should be
    fetched.

    :param str item: parametrized IA identifier with binary MARC only
    """
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    monkeypatch.setattr(ia, 'get_metadata',
                        lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    result = get_ia.get_marc_record_from_ia(item)
    assert isinstance(result, MarcBinary), \
        "%s: expected instanceof MarcBinary, got %s" % (item, type(result))
    # FIX: the original used Python-2-only `.next()` and `unicode.encode`,
    # which raise AttributeError/NameError on Python 3; use the next()
    # builtin and str.encode instead.
    field_245 = next(result.read_fields(['245']))
    title = next(field_245[1].get_all_subfields())[1].encode('utf8')
    print("%s:\n\tUNICODE: [%s]\n\tTITLE: %s"
          % (item, result.leader()[9], title))
def test_no_marc_xml(self, item, monkeypatch):
    """When no XML MARC is listed in _filenames, the Binary MARC should be
    fetched."""
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    monkeypatch.setattr(ia, 'get_metadata',
                        lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    result = get_ia.get_marc_record_from_ia(item)
    assert isinstance(result, MarcBinary), \
        f"{item}: expected instanceof MarcBinary, got {type(result)}"
    # Debug output: leader byte 9 (encoding flag) and the 245 title.
    tag_245 = next(result.read_fields(['245']))
    title = next(tag_245[1].get_all_subfields())[1].encode('utf8')
    print(f"{item}:\n\tUNICODE: [{result.leader()[9]}]\n\tTITLE: {title}")
def test_no_marc_xml(self, item, monkeypatch):
    """When no XML MARC is listed in _filenames, the Binary MARC should be
    fetched.

    :param str item: parametrized IA identifier with binary MARC only
    """
    monkeypatch.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    monkeypatch.setattr(
        ia, 'get_metadata',
        lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    result = get_ia.get_marc_record_from_ia(item)
    assert isinstance(result, MarcBinary), \
        "%s: expected instanceof MarcBinary, got %s" % (item, type(result))
    # FIX: the original used Python-2-only `.next()` and `unicode.encode`,
    # which raise AttributeError/NameError on Python 3; use the next()
    # builtin and str.encode instead.
    field_245 = next(result.read_fields(['245']))
    title = next(field_245[1].get_all_subfields())[1].encode('utf8')
    print("%s:\n\tUNICODE: [%s]\n\tTITLE: %s"
          % (item, result.leader()[9], title))
def test_incorrect_length_marcs(self):
    """If a Binary MARC has a different length than stated in the MARC leader,
    it is probably due to bad character conversions."""
    self.m.setattr(get_ia, 'urlopen_keep_trying', return_test_marc_bin)
    self.m.setattr(ia, 'get_metadata',
                   lambda itemid: {'_filenames': [itemid + "_meta.mrc"]})
    # Each item below has known character-conversion damage that makes the
    # record body disagree with the length declared in its MARC leader.
    bad_marcs = [
        # Binary MARC reports len=734, but actually=742. Badly converted
        # unicode -- original unicode converted as if it were MARC8.
        '1733mmoiresdel00vill',
        # Same as zweibchersatir01horauoft: binary representation of
        # unicode interpreted as unicode codepoints.
        'dasrmischepriv00rein',
        # C3A2 in this file should be single byte MARC8 combining
        # acute 0xE2.
        'histoirereligieu05cr',
        # Original MARC8 0xE2 interpreted as u00E2 => \xC3\xA2,
        # leader still MARC8.
        'lesabndioeinas00sche',
        # Junk / unexpected character at end of publishers in field 260.
        'poganucpeoplethe00stowuoft',
        # Possible extra chars at end of field 505?
        'scrapbooksofmoun03tupp',
        # Leader is unicode; chars '\xc3\x83\xc2\xbc' in mrc should be
        # '\xc3\xbc' -- original '\xc3\xb3' became '\u00c3\u00b3'.
        'zweibchersatir01horauoft',
    ]
    for bad_marc in bad_marcs:
        # TODO: get_marc_record_from_ia() currently returns None for
        # these; MarcBinary should raise a BadMarc exception, or similar.
        self.assertIsNone(get_ia.get_marc_record_from_ia(bad_marc))
def parse_data(data):
    """Determine the format of POSTed import data and build an edition.

    :param str data: raw POSTed data (XML, JSON, "ia:<id>", or binary MARC)
    :rtype: (dict|None, str|None)
    :returns: (edition dict, format) or (None, None) for unrecognized XML
    :raises DataError: for invalid/unsuitable "ia:" items
    """
    data = data.strip()
    MARC_NS = '{http://www.loc.gov/MARC21/slim}'
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag in (MARC_NS + 'record', MARC_NS + 'collection'):
            # FIX: the <collection> check was previously nested inside the
            # <record> branch and therefore unreachable, so MARC XML
            # wrapped in <collection> fell through to "unrecognized".
            # Accept the wrapper and unwrap its first <record>.
            if root.tag == MARC_NS + 'collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            # FIX: print statement -> print() call (Python 3 compatible).
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")
            try:
                rec = get_marc_record_from_ia(itemid)
                # skip serials
                if rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None
            # MARC binary: leader bytes 0-4 declare the record length.
            if len(data) != int(data[:5]):
                # NOTE(review): this returns a JSON string while every
                # other path returns a (dict, format) tuple -- looks like
                # a bug; behavior preserved, confirm callers before
                # changing.
                return json.dumps({'success':False, 'error':'Bad MARC length'})
            rec = MarcBinary(data)
        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def parse_data(data):
    """Determine the format of POSTed import data and build an edition.

    :param str data: raw POSTed data (XML, JSON, "ia:<id>", or binary MARC)
    :rtype: (dict|None, str|None)
    :returns: (edition dict, format) or (None, None) for unrecognized XML
    :raises DataError: for invalid/unsuitable "ia:" items
    """
    data = data.strip()
    MARC_NS = '{http://www.loc.gov/MARC21/slim}'
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag in (MARC_NS + 'record', MARC_NS + 'collection'):
            # FIX: the <collection> check was previously nested inside the
            # <record> branch and therefore unreachable, so MARC XML
            # wrapped in <collection> fell through to "unrecognized".
            # Accept the wrapper and unwrap its first <record>.
            if root.tag == MARC_NS + 'collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            # FIX: print statement -> print() call (Python 3 compatible).
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")
            try:
                rec = get_marc_record_from_ia(itemid)
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None
            # MARC binary: leader bytes 0-4 declare the record length.
            if len(data) != int(data[:5]):
                # NOTE(review): this returns a JSON string while every
                # other path returns a (dict, format) tuple -- looks like
                # a bug; behavior preserved, confirm callers before
                # changing.
                return json.dumps({
                    'success': False,
                    'error': 'Bad MARC length'
                })
            rec = MarcBinary(data)
        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def get_marc_record(self, identifier):
    """Fetch the MARC record for an IA identifier, or None on I/O failure."""
    try:
        record = get_marc_record_from_ia(identifier)
    except IOError:
        return None
    return record
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition
    record suitable for adding to OL.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc))
             or (None, None)
    """
    data = data.strip()
    MARC_NS = '{http://www.loc.gov/MARC21/slim}'
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag in (MARC_NS + 'record', MARC_NS + 'collection'):
            # FIX: the <collection> check was previously nested inside the
            # <record> branch and therefore unreachable, so MARC XML
            # wrapped in <collection> fell through to "unrecognized".
            # Accept the wrapper and unwrap its first <record>.
            if root.tag == MARC_NS + 'collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            # FIX: print statement -> print() call (Python 3 compatible).
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records, DEPRECATED: use import/ia endpoint
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            # see ia_importapi to address `imagecount` limitations
            status = ia.get_item_status(itemid, metadata)
            if status != 'ok':
                raise DataError(status)
            try:
                rec = get_marc_record_from_ia(itemid)
                # skip serials
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None
            # MARC binary: leader bytes 0-4 declare the record length.
            if len(data) != int(data[:5]):
                # NOTE(review): this returns a JSON string while every
                # other path returns a (dict, format) tuple -- looks like
                # a bug; behavior preserved, confirm callers before
                # changing.
                return json.dumps({'success':False, 'error':'Bad MARC length'})
            rec = MarcBinary(data)
        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def POST(self):
    """Import a single item (or a bulk-MARC slice) into Open Library.

    Returns a JSON response describing the import result or an error.
    """
    web.header('Content-Type', 'application/json')
    if not can_write():
        raise web.HTTPError('403 Forbidden')
    i = web.input()
    require_marc = not (i.get('require_marc') == 'false')
    bulk_marc = i.get('bulk_marc') == 'true'
    if 'identifier' not in i:
        return self.error('bad-input', 'identifier not provided')
    identifier = i.identifier

    # First check whether this is a non-book, bulk-marc item
    if bulk_marc:
        # Get binary MARC by identifier = ocaid/filename:offset:length
        # FIX: raw string for the regex so `\d` is not a (deprecated)
        # string escape.
        re_bulk_identifier = re.compile(r"([^/]*)/([^:]*):(\d*):(\d*)")
        # FIX: initialize next_data before the try block. Previously it
        # was only assigned mid-try, so a MarcException raised earlier
        # (e.g. from get_from_archive_bulk) caused a NameError in the
        # handler's `**next_data`.
        next_data = {}
        try:
            ocaid, filename, offset, length = re_bulk_identifier.match(
                identifier).groups()
            data, next_offset, next_length = get_from_archive_bulk(
                identifier)
            next_data = {
                'next_record_offset': next_offset,
                'next_record_length': next_length
            }
            rec = MarcBinary(data)
            edition = read_edition(rec)
        except MarcException as e:
            details = "%s: %s" % (identifier, str(e))
            logger.error("failed to read from bulk MARC record %s", details)
            return self.error('invalid-marc-record', details, **next_data)

        actual_length = int(rec.leader()[:MARC_LENGTH_POS])
        edition['source_records'] = 'marc:%s/%s:%s:%d' % (
            ocaid, filename, offset, actual_length)

        local_id = i.get('local_id')
        if local_id:
            local_id_type = web.ctx.site.get('/local_ids/' + local_id)
            prefix = local_id_type.urn_prefix
            id_field, id_subfield = local_id_type.id_location.split('$')

            def get_subfield(field, id_subfield):
                # A field may be a bare string (controlfield) or a
                # (tag, field) pair carrying subfields.
                if isinstance(field, str):
                    return field
                subfields = field[1].get_subfield_values(id_subfield)
                return subfields[0] if subfields else None

            _ids = [
                get_subfield(f, id_subfield)
                for f in rec.read_fields([id_field])
                if f and get_subfield(f, id_subfield)
            ]
            edition['local_id'] = [
                'urn:%s:%s' % (prefix, _id) for _id in _ids
            ]

        # Don't add the book if the MARC record is a non-book item
        self.reject_non_book_marc(rec, **next_data)
        result = add_book.load(edition)
        # Add next_data to the response as location of next record:
        result.update(next_data)
        return json.dumps(result)

    # Case 1 - Is this a valid Archive.org item?
    metadata = ia.get_metadata(identifier)
    if not metadata:
        return self.error('invalid-ia-identifier',
                          '%s not found' % identifier)

    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get('mediatype') == 'texts' and metadata.get('openlibrary'):
        edition_data = self.get_ia_record(metadata)
        edition_data['openlibrary'] = metadata['openlibrary']
        edition_data = self.populate_edition_data(edition_data, identifier)
        return self.load_book(edition_data)

    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata)
    if status != 'ok':
        return self.error(status, 'Prohibited Item %s' % identifier)

    # Case 4 - Does this item have a marc record?
    marc_record = get_marc_record_from_ia(identifier)
    if marc_record:
        self.reject_non_book_marc(marc_record)
        try:
            edition_data = read_edition(marc_record)
        except MarcException as e:
            logger.error('failed to read from MARC record %s: %s',
                         identifier, str(e))
            return self.error('invalid-marc-record')
    elif require_marc:
        return self.error('no-marc-record')
    else:
        try:
            edition_data = self.get_ia_record(metadata)
        except KeyError:
            return self.error("invalid-ia-metadata")

    # Add IA specific fields: ocaid, source_records, and cover
    edition_data = self.populate_edition_data(edition_data, identifier)
    return self.load_book(edition_data)