def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or (None, None)
    """
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag in ('{http://www.loc.gov/MARC21/slim}record',
                          '{http://www.loc.gov/MARC21/slim}collection'):
            # Fix: previously this branch matched only }record, so the
            # }collection unwrap below was unreachable and collection-wrapped
            # MARCXML fell through to "unrecognized XML format".
            # A MARCXML <collection> wraps <record> elements; use the first.
            if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            format = 'marcxml'
        else:
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        format = 'json'
    else:  # binary MARC
        # The MARC leader's first MARC_LENGTH_POS bytes declare the record
        # length; a mismatch means this is not a usable MARC record.
        if len(data) < MARC_LENGTH_POS or len(data) != int(data[:MARC_LENGTH_POS]):
            raise DataError('no-marc-record')
        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def test_from_marc_fields(self, mock_site, add_languages):
    """Verify the core MARC fields survive a load() round trip."""
    ia = 'isbn_9781419594069'
    marc = MarcBinary(open_test_data(ia + '_meta.mrc').read())
    edition_rec = read_edition(marc)
    edition_rec['source_records'] = ['ia:' + ia]
    reply = load(edition_rec)
    assert reply['success'] is True
    # Author comes from MARC field 100.
    assert reply['authors'][0]['name'] == 'Adam Weiner'
    edition = mock_site.get(reply['edition']['key'])
    # Publish place, publisher & publish date come from 260 $a, $b, $c.
    assert edition['publishers'][0] == 'Kaplan Publishing'
    assert edition['publish_date'] == '2007'
    assert edition['publish_places'][0] == 'New York'
    # Pagination comes from field 300.
    assert edition['number_of_pages'] == 264
    assert edition['pagination'] == 'viii, 264 p.'
    # Eight subjects, from repeated 650 fields.
    expected_subjects = [
        'Action and adventure films',
        'Cinematography',
        'Miscellanea',
        'Physics',
        'Physics in motion pictures',
        'Popular works',
        'Science fiction films',
        'Special effects',
    ]
    assert len(edition['subjects']) == 8
    assert sorted(edition['subjects']) == expected_subjects
    # Description comes from field 520, on both the edition and the work.
    desc = (
        'Explains the basic laws of physics, covering such topics '
        'as mechanics, forces, and energy, while deconstructing '
        'famous scenes and stunts from motion pictures, including '
        '"Apollo 13" and "Titanic," to determine if they are possible.')
    assert isinstance(edition['description'], Text)
    assert edition['description'] == desc
    work = mock_site.get(reply['work']['key'])
    assert isinstance(work['description'], Text)
    assert work['description'] == desc
def test_binary(self, i):
    """Parse a binary MARC test record and compare the resulting edition
    dict against the stored JSON expectations file of the same name."""
    expect_filename = "%s/bin_expect/%s" % (test_data, i)
    # Fix: use context managers so file handles are closed promptly
    # (the originals leaked the handles from open(...).read()).
    with open("%s/bin_input/%s" % (test_data, i)) as f:
        data = f.read()
    if len(data) != int(data[:5]):
        #TODO: Why are we fixing this in test expectations? Investigate.
        # affects histoirereligieu05cr_meta.mrc and zweibchersatir01horauoft_meta.mrc
        data = data.decode('utf-8').encode('raw_unicode_escape')
        assert len(data) == int(data[:5])
    rec = MarcBinary(data)
    edition_marc_bin = read_edition(rec)
    assert edition_marc_bin
    with open(expect_filename) as f:
        j = simplejson.load(f)
    assert j, "Unable to open test data: %s" % expect_filename
    assert sorted(edition_marc_bin.keys()) == sorted(j.keys())
    for k in edition_marc_bin.keys():
        if isinstance(j[k], list):
            # Compare list items pairwise first for clearer failure output.
            for item1, item2 in zip(edition_marc_bin[k], j[k]):
                assert item1 == item2
        assert edition_marc_bin[k] == j[k]
    assert edition_marc_bin == j
def get_work_subjects(w, do_get_mc=True):
    """
    Collect subjects for a work by reading the MARC source records of its
    editions.

    :param dict w: work dict with an 'editions' list
    :param bool do_get_mc: when True, editions without usable source_records
        fall back to a marc-cache lookup via get_mc()
    :return: combined subjects (see combine_subjects)
    """
    found = set()
    for e in w['editions']:
        sr = e.get('source_records', [])
        if sr:
            for i in sr:
                if i.endswith('initial import'):
                    bad_source_record(e, i)
                    continue
                if i.startswith(('ia:', 'marc:')):
                    found.add(i)
        else:
            mc = None
            if do_get_mc:
                # assumes e['key'] always matches re_edition_key — TODO confirm
                m = re_edition_key.match(e['key'])
                mc = get_mc('/b/' + m.group(1))
            if mc:
                if mc.endswith('initial import'):
                    bad_source_record(e, mc)
                    continue
                if not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                    found.add('marc:' + mc)
    subjects = []
    for sr in found:
        if sr.startswith('marc:ia:'):
            subjects.append(get_subjects_from_ia(sr[8:]))
        elif sr.startswith('marc:'):
            loc = sr[5:]
            data = get_from_archive(loc)
            rec = MarcBinary(data)
            try:
                subjects.append(read_subjects(rec))
            except Exception:
                # Fix: narrowed the bare except (which also swallowed
                # KeyboardInterrupt) and repaired the 2to3 artifacts that
                # printed tuples like ('bad MARC:', loc).
                print('bad MARC:', loc)
                print('data:', repr(data))
                raise
        else:
            assert sr.startswith('ia:')
            subjects.append(get_subjects_from_ia(sr[3:]))
    return combine_subjects(subjects)
def test_extra_author(mock_site):
    """Loading the same MARC record twice must not create a duplicate
    author: the record should match the pre-saved Bancroft author/work."""
    add_languages(mock_site)
    # Pre-existing author the MARC import should match.
    mock_site.save({
        "name": "Hubert Howe Bancroft",
        "death_date": "1918.",
        "alternate_names": ["HUBERT HOWE BANCROFT", "Hubert Howe Bandcroft"],
        "key": "/authors/OL563100A",
        "birth_date": "1832",
        "personal_name": "Hubert Howe Bancroft",
        "type": {"key": "/type/author"},
    })
    # Pre-existing work attributed to that author.
    mock_site.save({
        "title": "The works of Hubert Howe Bancroft",
        "covers": [6060295, 5551343],
        "first_sentence": {"type": "/type/text", "value": "When it first became known to Europe that a new continent had been discovered, the wise men, philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."},
        "subject_places": ["Alaska", "America", "Arizona", "British Columbia", "California", "Canadian Northwest", "Central America", "Colorado", "Idaho", "Mexico", "Montana", "Nevada", "New Mexico", "Northwest Coast of North America", "Northwest boundary of the United States", "Oregon", "Pacific States", "Texas", "United States", "Utah", "Washington (State)", "West (U.S.)", "Wyoming"],
        "excerpts": [{"excerpt": "When it first became known to Europe that a new continent had been discovered, the wise men, philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."}],
        "first_publish_date": "1882",
        "key": "/works/OL3421434W",
        "authors": [{"type": {"key": "/type/author_role"}, "author": {"key": "/authors/OL563100A"}}],
        "subject_times": ["1540-1810", "1810-1821", "1821-1861", "1821-1951", "1846-1850", "1850-1950", "1859-", "1859-1950", "1867-1910", "1867-1959", "1871-1903", "Civil War, 1861-1865", "Conquest, 1519-1540", "European intervention, 1861-1867", "Spanish colony, 1540-1810", "To 1519", "To 1821", "To 1846", "To 1859", "To 1867", "To 1871", "To 1889", "To 1912", "Wars of Independence, 1810-1821"],
        "type": {"key": "/type/work"},
        "subjects": ["Antiquities", "Archaeology", "Autobiography", "Bibliography", "California Civil War, 1861-1865", "Comparative Literature", "Comparative civilization", "Courts", "Description and travel", "Discovery and exploration", "Early accounts to 1600", "English essays", "Ethnology", "Foreign relations", "Gold discoveries", "Historians", "History", "Indians", "Indians of Central America", "Indians of Mexico", "Indians of North America", "Languages", "Law", "Mayas", "Mexican War, 1846-1848", "Nahuas", "Nahuatl language", "Oregon question", "Political aspects of Law", "Politics and government", "Religion and mythology", "Religions", "Social life and customs", "Spanish", "Vigilance committees", "Writing", "Zamorano 80", "Accessible book", "Protected DAISY"]
    })
    ia = 'workshuberthowe00racegoog'
    src = ia + '_meta.mrc'
    marc = MarcBinary(open_test_data(src).read())
    rec = read_edition(marc)
    rec['source_records'] = ['ia:' + ia]
    reply = load(rec)
    assert reply['success'] == True
    w = mock_site.get(reply['work']['key'])
    # Load the identical record a second time; the work must still resolve
    # (the test name says "extra author": the second load must not add one).
    reply = load(rec)
    assert reply['success'] == True
    w = mock_site.get(reply['work']['key'])
def parse_data(data):
    # Determine the format of POSTed data (RDF/OPDS/MARCXML, JSON, or
    # binary MARC), build an edition via the matching parser, and return
    # (edition dict, format name) — or (None, None) for unrecognized XML.
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        #print root.tag
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif '{http://www.loc.gov/MARC21/slim}record' == root.tag:
            # NOTE(review): this elif only matches }record, so the
            # }collection check below can never be true — collection-wrapped
            # MARCXML is not handled here; confirm whether that is intended.
            if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            print 'unrecognized XML format'
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        #Marc Binary
        # The first 5 leader bytes declare the MARC record's total length.
        # NOTE(review): returning a JSON string here diverges from the
        # (dict, format) tuple returned on every other path — callers must
        # special-case it; confirm this is intentional.
        if len(data) != int(data[:5]):
            return json.dumps({'success': False, 'error': 'Bad MARC length'})
        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.

    Prefers the item's MARCXML file; falls back to binary MARC when the
    XML is absent or malformed. Returns None when neither file yields a
    usable record.
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']
    item_base = base + "/" + identifier + "/"
    xml_name = identifier + "_marc.xml"
    bin_name = identifier + "_meta.mrc"
    # MARCXML, when present and starting with an XML declaration, wins.
    if xml_name in filenames:
        xml_data = urlopen_keep_trying(item_base + xml_name).read()
        if xml_data[:10].find('<?xml') != -1:
            return MarcXml(etree.fromstring(xml_data))
    # Otherwise use the binary record, if its declared length checks out.
    if bin_name in filenames:
        bin_data = urlopen_keep_trying(item_base + bin_name).read()
        if len(bin_data) == int(bin_data[:5]):
            return MarcBinary(bin_data)
def test_binary(self, i):
    """Parse a binary MARC test record and compare it against its JSON
    expectations file, generating a template file (and failing) when the
    expectations file is missing."""
    expect_filename = "%s/bin_expect/%s" % (test_data, i)
    # Fix: use context managers so file handles are closed promptly
    # (the originals leaked the handles from open(...)).
    with open("%s/bin_input/%s" % (test_data, i)) as f:
        data = f.read()
    if len(data) != int(data[:5]):
        #TODO: Why are we fixing this in test expectations? Investigate.
        # affects histoirereligieu05cr_meta.mrc and zweibchersatir01horauoft_meta.mrc
        data = data.decode('utf-8').encode('raw_unicode_escape')
        assert len(data) == int(data[:5])
    rec = MarcBinary(data)
    edition_marc_bin = read_edition(rec)
    assert edition_marc_bin
    if not os.path.exists(expect_filename):
        # Missing test expectations file. Create a template from the input,
        # but fail the current test.
        with open(expect_filename, 'w') as f:
            simplejson.dump(edition_marc_bin, f, indent=2)
        assert False, 'Expectations file %s not found: template generated in %s. Please review and commit this file.' % (expect_filename, '/bin_expect')
    with open(expect_filename) as f:
        j = simplejson.load(f)
    assert j, 'Unable to open test data: %s' % expect_filename
    assert sorted(edition_marc_bin.keys()) == sorted(j.keys()), 'Processed binary MARC fields do not match expectations in %s' % expect_filename
    for k in edition_marc_bin.keys():
        if isinstance(j[k], list):
            # Compare list items pairwise first for clearer failure output.
            for item1, item2 in zip(edition_marc_bin[k], j[k]):
                assert item1 == item2
        assert edition_marc_bin[k] == j[k], 'Processed binary MARC values do not match expectations in %s' % expect_filename
    assert edition_marc_bin == j
def test_binary(self, i):
    """Parse a binary MARC test record and compare it against its JSON
    expectations file, generating a template file (and failing) when the
    expectations file is missing."""
    expect_filename = '%s/bin_expect/%s' % (test_data, i)
    with open('%s/bin_input/%s' % (test_data, i), 'rb') as f:
        rec = MarcBinary(f.read())
    edition_marc_bin = read_edition(rec)
    assert edition_marc_bin
    if not os.path.exists(expect_filename):
        # Missing test expectations file. Create a template from the input,
        # but fail the current test.
        # Fix: close the template file via a context manager (was leaked).
        with open(expect_filename, 'w') as f:
            json.dump(edition_marc_bin, f, indent=2)
        assert False, 'Expectations file %s not found: template generated in %s. Please review and commit this file.' % (
            expect_filename, '/bin_expect')
    with open(expect_filename) as f:
        j = json.load(f)
    assert j, 'Unable to open test data: %s' % expect_filename
    assert sorted(edition_marc_bin) == sorted(j), (
        'Processed binary MARC fields do not match expectations in %s' % expect_filename)
    msg = ('Processed binary MARC values do not match expectations in %s'
           % expect_filename)
    for key, value in edition_marc_bin.items():
        # Fix: exclude str from the Iterable branch. Strings are Iterable,
        # and the length + per-item membership check below would accept
        # unequal anagram strings (e.g. 'ab' vs 'ba') as matching.
        if isinstance(value, Iterable) and not isinstance(value, str):
            # can not sort a list of dicts, so compare by length + membership
            assert len(value) == len(j[key]), msg
            assert all(item in value for item in j[key]), msg
        else:
            assert value == j[key], msg
def test_no_extra_author(mock_site, add_languages):
    """Importing a MARC record matching an existing edition/work must modify
    them in place without adding an extra author."""
    # Pre-existing author, work, and edition the import should match.
    author = {
        "name": "Paul Michael Boothe",
        "key": "/authors/OL1A",
        "type": {"key": "/type/author"},
    }
    mock_site.save(author)
    work = {
        "title": "A Separate Pension Plan for Alberta",
        "covers": [1644794],
        "key": "/works/OL1W",
        "authors": [{"type": "/type/author_role", "author": {"key": "/authors/OL1A"}}],
        "type": {"key": "/type/work"},
    }
    mock_site.save(work)
    edition = {
        "number_of_pages": 90,
        "subtitle": "Analysis and Discussion (Western Studies in Economic Policy, No. 5)",
        "weight": "6.2 ounces",
        "covers": [1644794],
        "latest_revision": 6,
        "title": "A Separate Pension Plan for Alberta",
        "languages": [{"key": "/languages/eng"}],
        "subjects": [
            "Economics", "Alberta", "Political Science / State & Local Government", "Government policy", "Old age pensions", "Pensions", "Social security"
        ],
        "type": {"key": "/type/edition"},
        "physical_dimensions": "9 x 6 x 0.2 inches",
        "publishers": ["The University of Alberta Press"],
        "physical_format": "Paperback",
        "key": "/books/OL1M",
        "authors": [{"key": "/authors/OL1A"}],
        "identifiers": {"goodreads": ["4340973"], "librarything": ["5580522"]},
        "isbn_13": ["9780888643513"],
        "isbn_10": ["0888643519"],
        "publish_date": "May 1, 2000",
        "works": [{"key": "/works/OL1W"}]
    }
    mock_site.save(edition)
    src = 'v39.i34.records.utf8--186503-1413'
    marc = MarcBinary(open_test_data(src).read())
    rec = read_edition(marc)
    rec['source_records'] = ['marc:' + src]
    reply = load(rec)
    assert reply['success'] is True
    # Both records should be updated, not re-created.
    assert reply['edition']['status'] == 'modified'
    assert reply['work']['status'] == 'modified'
    # No author entry in the reply means no author was created or touched.
    assert 'authors' not in reply
    assert reply['edition']['key'] == edition['key']
    assert reply['work']['key'] == work['key']
    e = mock_site.get(reply['edition']['key'])
    w = mock_site.get(reply['work']['key'])
    assert 'source_records' in e
    assert 'subjects' in w
    assert len(e['authors']) == 1
    assert len(w['authors']) == 1
def test_missing_source_records(mock_site, add_languages):
    """Importing a MARC record matching an edition that lacks
    source_records must add the source_records field to that edition."""
    mock_site.save({
        'key': '/authors/OL592898A',
        'name': 'Michael Robert Marrus',
        'personal_name': 'Michael Robert Marrus',
        'type': {'key': '/type/author'}
    })
    mock_site.save({
        'authors': [{'author': '/authors/OL592898A', 'type': {'key': '/type/author_role'}}],
        'key': '/works/OL16029710W',
        'subjects': [
            'Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946',
            'Protected DAISY',
            'Lending library'
        ],
        'title': 'The Nuremberg war crimes trial, 1945-46',
        'type': {'key': '/type/work'},
    })
    # Note: this edition deliberately has no 'source_records' field.
    mock_site.save({
        "number_of_pages": 276,
        "subtitle": "a documentary history",
        "series": ["The Bedford series in history and culture"],
        "covers": [6649715, 3865334, 173632],
        "lc_classifications": ["D804.G42 N87 1997"],
        "ocaid": "nurembergwarcrim00marr",
        "contributions": ["Marrus, Michael Robert."],
        "uri_descriptions": ["Book review (H-Net)"],
        "title": "The Nuremberg war crimes trial, 1945-46",
        "languages": [{"key": "/languages/eng"}],
        "subjects": ["Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946"],
        "publish_country": "mau",
        "by_statement": "[compiled by] Michael R. Marrus.",
        "type": {"key": "/type/edition"},
        "uris": ["http://www.h-net.org/review/hrev-a0a6c9-aa"],
        "publishers": ["Bedford Books"],
        "ia_box_id": ["IA127618"],
        "key": "/books/OL1023483M",
        "authors": [{"key": "/authors/OL592898A"}],
        "publish_places": ["Boston"],
        "pagination": "xi, 276 p. :",
        "lccn": ["96086777"],
        "notes": {
            "type": "/type/text",
            "value": "Includes bibliographical references (p. 262-268) and index."
        },
        "identifiers": {"goodreads": ["326638"], "librarything": ["1114474"]},
        "url": ["http://www.h-net.org/review/hrev-a0a6c9-aa"],
        "isbn_10": ["031216386X", "0312136919"],
        "publish_date": "1997",
        "works": [{"key": "/works/OL16029710W"}]
    })
    ia = 'nurembergwarcrim1997marr'
    src = ia + '_meta.mrc'
    marc = MarcBinary(open_test_data(src).read())
    rec = read_edition(marc)
    rec['source_records'] = ['ia:' + ia]
    reply = load(rec)
    assert reply['success'] is True
    # The matched edition must now carry the source_records field.
    e = mock_site.get(reply['edition']['key'])
    assert 'source_records' in e
def POST(self):
    """Import a book by Archive.org identifier (or by bulk-MARC locator
    ``ocaid/filename:offset:length``), returning a JSON status response."""
    web.header('Content-Type', 'application/json')
    if not can_write():
        raise web.HTTPError('403 Forbidden')
    i = web.input()
    require_marc = not (i.get('require_marc') == 'false')
    bulk_marc = i.get('bulk_marc') == 'true'
    if 'identifier' not in i:
        return self.error('bad-input', 'identifier not provided')
    identifier = i.identifier
    # First check whether this is a non-book, bulk-marc item
    if bulk_marc:
        # Get binary MARC by identifier = ocaid/filename:offset:length
        re_bulk_identifier = re.compile(r"([^/]*)/([^:]*):(\d*):(\d*)")
        # Fix: initialize next_data before the try block. The except clause
        # references it, and a MarcException raised by get_from_archive_bulk
        # (before next_data was assigned) caused a NameError instead of the
        # intended error response. Also made the regex a raw string.
        next_data = {}
        try:
            ocaid, filename, offset, length = re_bulk_identifier.match(identifier).groups()
            data, next_offset, next_length = get_from_archive_bulk(identifier)
            next_data = {'next_record_offset': next_offset, 'next_record_length': next_length}
            rec = MarcBinary(data)
            edition = read_edition(rec)
        except MarcException as e:
            details = "%s: %s" % (identifier, str(e))
            logger.error("failed to read from bulk MARC record %s", details)
            return self.error('invalid-marc-record', details, **next_data)
        actual_length = int(rec.leader()[:MARC_LENGTH_POS])
        edition['source_records'] = 'marc:%s/%s:%s:%d' % (ocaid, filename, offset, actual_length)
        #TODO: Look up URN prefixes to support more sources, extend openlibrary/catalog/marc/sources?
        if ocaid == 'OpenLibraries-Trent-MARCs':
            prefix = 'trent'
            edition['local_id'] = ['urn:%s:%s' % (prefix, _id) for _id in rec.get_fields('001')]
        result = add_book.load(edition)
        # Add next_data to the response as location of next record:
        result.update(next_data)
        return json.dumps(result)
    # Case 1 - Is this a valid Archive.org item?
    try:
        item_json = ia.get_item_json(identifier)
        item_server = item_json['server']
        item_path = item_json['dir']
    except KeyError:
        return self.error("invalid-ia-identifier", "%s not found" % identifier)
    metadata = ia.extract_item_metadata(item_json)
    if not metadata:
        return self.error("invalid-ia-identifier")
    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
        edition_data = self.get_ia_record(metadata)
        edition_data["openlibrary"] = metadata["openlibrary"]
        edition_data = self.populate_edition_data(edition_data, identifier)
        return self.load_book(edition_data)
    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata, item_server=item_server, item_path=item_path)
    if status != 'ok':
        return self.error(status, "Prohibited Item")
    # Case 4 - Does this item have a marc record?
    marc_record = self.get_marc_record(identifier)
    if marc_record:
        self.reject_non_book_marc(marc_record)
        try:
            edition_data = read_edition(marc_record)
        except MarcException as e:
            logger.error("failed to read from MARC record %s: %s", identifier, str(e))
            return self.error("invalid-marc-record")
    elif require_marc:
        return self.error("no-marc-record")
    else:
        try:
            edition_data = self.get_ia_record(metadata)
        except KeyError:
            return self.error("invalid-ia-metadata")
    # Add IA specific fields: ocaid, source_records, and cover
    edition_data = self.populate_edition_data(edition_data, identifier)
    return self.load_book(edition_data)
def POST(self):
    """Import a book by Archive.org identifier (or by bulk-MARC locator
    ``ocaid/filename:offset:length``), returning a JSON status response."""
    web.header('Content-Type', 'application/json')
    if not can_write():
        raise web.HTTPError('403 Forbidden')
    i = web.input()
    require_marc = not (i.get('require_marc') == 'false')
    force_import = i.get('force_import') == 'true'
    bulk_marc = i.get('bulk_marc') == 'true'
    if 'identifier' not in i:
        return self.error('bad-input', 'identifier not provided')
    identifier = i.identifier
    # First check whether this is a non-book, bulk-marc item
    if bulk_marc:
        # Get binary MARC by identifier = ocaid/filename:offset:length
        re_bulk_identifier = re.compile(r"([^/]*)/([^:]*):(\d*):(\d*)")
        # Fix: initialize next_data before the try block. The except clause
        # references it, and a MarcException raised by get_from_archive_bulk
        # (before next_data was assigned) caused a NameError instead of the
        # intended error response.
        next_data = {}
        try:
            ocaid, filename, offset, length = re_bulk_identifier.match(
                identifier).groups()
            data, next_offset, next_length = get_from_archive_bulk(
                identifier)
            next_data = {
                'next_record_offset': next_offset,
                'next_record_length': next_length,
            }
            rec = MarcBinary(data)
            edition = read_edition(rec)
        except MarcException as e:
            details = f"{identifier}: {str(e)}"
            logger.error("failed to read from bulk MARC record %s", details)
            return self.error('invalid-marc-record', details, **next_data)
        actual_length = int(rec.leader()[:MARC_LENGTH_POS])
        edition['source_records'] = 'marc:%s/%s:%s:%d' % (
            ocaid,
            filename,
            offset,
            actual_length,
        )
        local_id = i.get('local_id')
        if local_id:
            local_id_type = web.ctx.site.get('/local_ids/' + local_id)
            prefix = local_id_type.urn_prefix
            force_import = True
            # id_location is of the form 'FIELD$SUBFIELD', e.g. '001$a'.
            id_field, id_subfield = local_id_type.id_location.split('$')

            def get_subfield(field, id_subfield):
                # Control fields come back as plain strings; data fields as
                # (tag, field) pairs with subfield accessors.
                if isinstance(field, str):
                    return field
                subfields = field[1].get_subfield_values(id_subfield)
                return subfields[0] if subfields else None

            _ids = [
                get_subfield(f, id_subfield)
                for f in rec.read_fields([id_field])
                if f and get_subfield(f, id_subfield)
            ]
            edition['local_id'] = [f'urn:{prefix}:{_id}' for _id in _ids]
        # Don't add the book if the MARC record is a non-monograph item,
        # unless it is a scanning partner record and/or force_import is set.
        if not force_import:
            try:
                raise_non_book_marc(rec, **next_data)
            except BookImportError as e:
                return self.error(e.error_code, e.error, **e.kwargs)
        result = add_book.load(edition)
        # Add next_data to the response as location of next record:
        result.update(next_data)
        return json.dumps(result)
    try:
        return self.ia_import(identifier, require_marc=require_marc, force_import=force_import)
    except BookImportError as e:
        return self.error(e.error_code, e.error, **e.kwargs)
def test_raises_no_title(self):
    """A MARC record with no title field must raise NoTitle."""
    filename = "%s/bin_input/talis_no_title2.mrc" % test_data
    # Fix: open binary MARC in binary mode ('rb', as the other binary MARC
    # tests in this file do); text mode can mis-decode the record bytes.
    with open(filename, 'rb') as f:
        rec = MarcBinary(f.read())
    with pytest.raises(NoTitle):
        read_edition(rec)
def test_don_quixote(mock_site):
    """
    All of these items are by 'Miguel de Cervantes Saavedra',
    only one Author should be created. Some items have bad MARC length,
    others are missing binary MARC altogether and raise BadMARC exceptions.
    """
    # Skipped by default: fetches MARC records live from archive.org.
    pytest.skip("This test make live requests to archive.org")
    dq = [u'lifeexploitsofin01cerv', u'cu31924096224518',
        u'elingeniosedcrit04cerv', u'ingeniousgentlem01cervuoft',
        u'historyofingenio01cerv', u'lifeexploitsofin02cerviala',
        u'elingeniosohidal03cervuoft', u'nybc209000',
        u'elingeniosohidal11cerv', u'elingeniosohidal01cervuoft',
        u'elingeniosoh01cerv', u'donquixotedelama00cerviala',
        u'1896elingeniosohid02cerv', u'ingeniousgentlem04cervuoft',
        u'cu31924027656978', u'histoiredeladmir01cerv',
        u'donquijotedelama04cerv', u'cu31924027657075',
        u'donquixotedelama03cervuoft', u'aventurasdedonqu00cerv',
        u'p1elingeniosohid03cerv', u'geshikhefundonik01cervuoft',
        u'historyofvalorou02cerviala', u'ingeniousgentlem01cerv',
        u'donquixotedelama01cervuoft', u'ingeniousgentlem0195cerv',
        u'firstpartofdelig00cervuoft', u'p4elingeniosohid02cerv',
        u'donquijote00cervuoft', u'cu31924008863924',
        u'c2elingeniosohid02cerv', u'historyofvalorou03cerviala',
        u'historyofingenio01cerviala', u'historyadventure00cerv',
        u'elingeniosohidal00cerv', u'lifeexploitsofin01cervuoft',
        u'p2elingeniosohid05cerv', u'nybc203136',
        u'elingeniosohidal00cervuoft', u'donquixotedelama02cervuoft',
        u'lingnieuxcheva00cerv', u'ingeniousgentlem03cerv',
        u'vidayhechosdeli00siscgoog', u'lifeandexploits01jarvgoog',
        u'elingeniosohida00puiggoog', u'elingeniosohida00navagoog',
        u'donquichottedel02florgoog', u'historydonquixo00cogoog',
        u'vidayhechosdeli01siscgoog', u'elingeniosohida28saavgoog',
        u'historyvalorous00brangoog', u'elingeniosohida01goog',
        u'historyandadven00unkngoog', u'historyvalorous01goog',
        u'ingeniousgentle11saavgoog', u'elingeniosohida10saavgoog',
        u'adventuresdonqu00jarvgoog', u'historydonquixo04saavgoog',
        u'lingnieuxcheval00rouxgoog', u'elingeniosohida19saavgoog',
        u'historyingeniou00lalagoog', u'elingeniosohida00ormsgoog',
        u'historyandadven01smolgoog', u'elingeniosohida27saavgoog',
        u'elingeniosohida21saavgoog', u'historyingeniou00mottgoog',
        u'historyingeniou03unkngoog', u'lifeandexploits00jarvgoog',
        u'ingeniousgentle00conggoog', u'elingeniosohida00quixgoog',
        u'elingeniosohida01saavgoog', u'donquixotedelam02saavgoog',
        u'adventuresdonqu00gilbgoog', u'historyingeniou02saavgoog',
        u'donquixotedelam03saavgoog', u'elingeniosohida00ochogoog',
        u'historyingeniou08mottgoog', u'lifeandexploits01saavgoog',
        u'firstpartdeligh00shelgoog', u'elingeniosohida00castgoog',
        u'elingeniosohida01castgoog', u'adventofdonquixo00cerv',
        u'portablecervante00cerv', u'firstpartofdelig14cerv',
        u'donquixotemanofl00cerv', u'firstpartofdelig00cerv']
    bad_length = []
    bad_marc = []
    add_languages(mock_site)
    edition_status_counts = defaultdict(int)
    work_status_counts = defaultdict(int)
    author_status_counts = defaultdict(int)
    for ocaid in dq:
        marc_url = 'https://archive.org/download/%s/%s_meta.mrc' % (ocaid, ocaid)
        data = urlopen(marc_url).read()
        try:
            marc = MarcBinary(data)
        except BadLength:
            bad_length.append(ocaid)
            continue
        except BadMARC:
            bad_marc.append(ocaid)
            continue
        rec = read_edition(marc)
        rec['source_records'] = ['ia:' + ocaid]
        reply = load(rec)
        q = {
            'type': '/type/work',
            'authors.author': '/authors/OL1A',
        }
        work_keys = list(mock_site.things(q))
        author_keys = list(mock_site.things({'type': '/type/author'}))
        print("\nReply for %s: %s" % (ocaid, reply))
        print("Work keys: %s" % work_keys)
        # Only one author should ever exist, regardless of how many items load.
        assert author_keys == ['/authors/OL1A']
        assert reply['success'] is True
        # Increment status counters
        edition_status_counts[reply['edition']['status']] += 1
        work_status_counts[reply['work']['status']] += 1
        if (reply['work']['status'] != 'matched') and (reply['edition']['status'] != 'modified'):
            # No author key in response if work is 'matched'
            # No author key in response if edition is 'modified'
            author_status_counts[reply['authors'][0]['status']] += 1
    print("BAD MARC LENGTH items: %s" % bad_length)
    print("BAD MARC items: %s" % bad_marc)
    print("Edition status counts: %s" % edition_status_counts)
    print("Work status counts: %s" % work_status_counts)
    print("Author status counts: %s" % author_status_counts)
def POST(self):
    """Import a book by Archive.org identifier (or by bulk-MARC locator
    ``ocaid/filename:offset:length``), returning a JSON status response."""
    web.header('Content-Type', 'application/json')
    if not can_write():
        raise web.HTTPError('403 Forbidden')
    i = web.input()
    require_marc = not (i.get('require_marc') == 'false')
    bulk_marc = i.get('bulk_marc') == 'true'
    if 'identifier' not in i:
        return self.error('bad-input', 'identifier not provided')
    identifier = i.identifier
    # First check whether this is a non-book, bulk-marc item
    if bulk_marc:
        # Get binary MARC by identifier = ocaid/filename:offset:length
        re_bulk_identifier = re.compile(r"([^/]*)/([^:]*):(\d*):(\d*)")
        # Fix: initialize next_data before the try block. The except clause
        # references it, and a MarcException raised by get_from_archive_bulk
        # (before next_data was assigned) caused a NameError instead of the
        # intended error response. Also made the regex a raw string.
        next_data = {}
        try:
            ocaid, filename, offset, length = re_bulk_identifier.match(identifier).groups()
            data, next_offset, next_length = get_from_archive_bulk(identifier)
            next_data = {'next_record_offset': next_offset, 'next_record_length': next_length}
            rec = MarcBinary(data)
            edition = read_edition(rec)
        except MarcException as e:
            details = "%s: %s" % (identifier, str(e))
            logger.error("failed to read from bulk MARC record %s", details)
            return self.error('invalid-marc-record', details, **next_data)
        actual_length = int(rec.leader()[:5])
        edition['source_records'] = 'marc:%s/%s:%s:%d' % (ocaid, filename, offset, actual_length)
        #TODO: Look up URN prefixes to support more sources
        prefix = 'trent'
        edition['local_id'] = ['urn:%s:%s' % (prefix, _id) for _id in rec.get_fields('001')]
        result = add_book.load(edition)
        # Add next_data to the response as location of next record:
        result.update(next_data)
        return json.dumps(result)
    # Case 0 - Is the item already loaded
    key = self.find_edition(identifier)
    if key:
        return self.status_matched(key)
    # Case 1 - Is this a valid Archive.org item?
    try:
        item_json = ia.get_item_json(identifier)
        item_server = item_json['server']
        item_path = item_json['dir']
    except KeyError:
        return self.error("invalid-ia-identifier", "%s not found" % identifier)
    metadata = ia.extract_item_metadata(item_json)
    if not metadata:
        return self.error("invalid-ia-identifier")
    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
        d = {
            "title": metadata['title'],
            "openlibrary": "/books/" + metadata["openlibrary"]
        }
        d = self.populate_edition_data(d, identifier)
        return self.load_book(d)
    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata, item_server=item_server, item_path=item_path)
    if status != 'ok':
        return self.error(status, "Prohibited Item")
    # Gio - April 2016
    # items with metadata no_ol_import=true will be not imported
    if metadata.get("no_ol_import", '').lower() == 'true':
        return self.error("no-ol-import")
    # Case 4 - Does this item have a marc record?
    marc_record = self.get_marc_record(identifier)
    if marc_record:
        # Is the item a serial instead of a book?
        marc_leaders = marc_record.leader()
        if marc_leaders[7] == 's':
            return self.error("item-is-serial")
        # insider note: follows Archive.org's approach of
        # Item::isMARCXMLforMonograph() which excludes non-books
        if not (marc_leaders[7] == 'm' and marc_leaders[6] == 'a'):
            return self.error("item-not-book")
        try:
            edition_data = read_edition(marc_record)
        except MarcException as e:
            logger.error("failed to read from MARC record %s: %s", identifier, str(e))
            return self.error("invalid-marc-record")
    elif require_marc:
        return self.error("no-marc-record")
    else:
        try:
            edition_data = self.get_ia_record(metadata)
        except KeyError:
            return self.error("invalid-ia-metadata")
    # Add IA specific fields: ocaid, source_records, and cover
    edition_data = self.populate_edition_data(edition_data, identifier)
    return self.load_book(edition_data)
def test_don_quixote(mock_site):
    # Loads many archive.org Don Quixote editions (all by the same author);
    # only one Author record should ever be created, the first load
    # creating it and every later load matching/modifying it.
    # NOTE(review): Python 2 code (print statements, dict.iteritems) that
    # also makes live requests to archive.org.
    dq = [
        u'lifeexploitsofin01cerv', u'cu31924096224518',
        u'elingeniosedcrit04cerv', u'ingeniousgentlem01cervuoft',
        u'historyofingenio01cerv', u'lifeexploitsofin02cerviala',
        u'elingeniosohidal03cervuoft', u'nybc209000',
        u'elingeniosohidal11cerv', u'elingeniosohidal01cervuoft',
        u'elingeniosoh01cerv', u'donquixotedelama00cerviala',
        u'1896elingeniosohid02cerv', u'ingeniousgentlem04cervuoft',
        u'cu31924027656978', u'histoiredeladmir01cerv',
        u'donquijotedelama04cerv', u'cu31924027657075',
        u'donquixotedelama03cervuoft', u'aventurasdedonqu00cerv',
        u'p1elingeniosohid03cerv', u'geshikhefundonik01cervuoft',
        u'historyofvalorou02cerviala', u'ingeniousgentlem01cerv',
        u'donquixotedelama01cervuoft', u'ingeniousgentlem0195cerv',
        u'firstpartofdelig00cervuoft', u'p4elingeniosohid02cerv',
        u'donquijote00cervuoft', u'cu31924008863924',
        u'c2elingeniosohid02cerv', u'historyofvalorou03cerviala',
        u'historyofingenio01cerviala', u'historyadventure00cerv',
        u'elingeniosohidal00cerv', u'lifeexploitsofin01cervuoft',
        u'p2elingeniosohid05cerv', u'nybc203136',
        u'elingeniosohidal00cervuoft', u'donquixotedelama02cervuoft',
        u'lingnieuxcheva00cerv', u'ingeniousgentlem03cerv',
        u'vidayhechosdeli00siscgoog', u'lifeandexploits01jarvgoog',
        u'elingeniosohida00puiggoog', u'elingeniosohida00navagoog',
        u'donquichottedel02florgoog', u'historydonquixo00cogoog',
        u'vidayhechosdeli01siscgoog', u'elingeniosohida28saavgoog',
        u'historyvalorous00brangoog', u'elingeniosohida01goog',
        u'historyandadven00unkngoog', u'historyvalorous01goog',
        u'ingeniousgentle11saavgoog', u'elingeniosohida10saavgoog',
        u'adventuresdonqu00jarvgoog', u'historydonquixo04saavgoog',
        u'lingnieuxcheval00rouxgoog', u'elingeniosohida19saavgoog',
        u'historyingeniou00lalagoog', u'elingeniosohida00ormsgoog',
        u'historyandadven01smolgoog', u'elingeniosohida27saavgoog',
        u'elingeniosohida21saavgoog', u'historyingeniou00mottgoog',
        u'historyingeniou03unkngoog', u'lifeandexploits00jarvgoog',
        u'ingeniousgentle00conggoog', u'elingeniosohida00quixgoog',
        u'elingeniosohida01saavgoog', u'donquixotedelam02saavgoog',
        u'adventuresdonqu00gilbgoog', u'historyingeniou02saavgoog',
        u'donquixotedelam03saavgoog', u'elingeniosohida00ochogoog',
        u'historyingeniou08mottgoog', u'lifeandexploits01saavgoog',
        u'firstpartdeligh00shelgoog', u'elingeniosohida00castgoog',
        u'elingeniosohida01castgoog', u'adventofdonquixo00cerv',
        u'portablecervante00cerv', u'firstpartofdelig14cerv',
        u'donquixotemanofl00cerv', u'firstpartofdelig00cerv'
    ]
    add_languages(mock_site)
    edition_status_counts = defaultdict(int)
    work_status_counts = defaultdict(int)
    author_status_counts = defaultdict(int)
    for num, ia in enumerate(dq):
        marc_url = 'http://archive.org/download/%s/%s_meta.mrc' % (ia, ia)
        data = urlopen(marc_url).read()
        # Items missing their binary MARC file return an HTML 404 page.
        if '<title>Internet Archive: Page Not Found</title>' in data:
            continue
        marc = MarcBinary(data)
        rec = read_edition(marc)
        reply = load(rec)
        q = {
            'type': '/type/work',
            'authors.author': '/authors/OL1A',
        }
        work_keys = list(mock_site.things(q))
        assert work_keys
        pprint(reply)
        assert reply['success'] == True
        astatus = reply['authors'][0]['status']
        wstatus = reply['work']['status']
        estatus = reply['edition']['status']
        # The very first load creates the author; every later one matches it.
        if num == 0:
            assert astatus == 'created'
        else:
            assert astatus == 'modified'
        edition_status_counts[estatus] += 1
        work_status_counts[wstatus] += 1
        author_status_counts[astatus] += 1
    for k, v in edition_status_counts.iteritems():
        print 'edition %8s: %d' % (k, v)
    print
    for k, v in work_status_counts.iteritems():
        print 'work %8s: %d' % (k, v)
    print
    for k, v in author_status_counts.iteritems():
        print 'author %8s: %d' % (k, v)
def POST(self):
    """
    Import an Archive.org item (or one record of a bulk MARC file) into
    Open Library.

    Web input parameters:
        identifier: an ocaid, or for bulk MARC 'ocaid/filename:offset:length'
        require_marc: unless the string 'false', items without a MARC record fail
        bulk_marc: 'true' to treat identifier as a bulk MARC pointer

    :return: JSON string with the import result, or an error response.
    """
    web.header('Content-Type', 'application/json')
    if not can_write():
        raise web.HTTPError('403 Forbidden')

    i = web.input()

    require_marc = not (i.get('require_marc') == 'false')
    bulk_marc = i.get('bulk_marc') == 'true'

    if 'identifier' not in i:
        return self.error('bad-input', 'identifier not provided')
    identifier = i.identifier

    # First check whether this is a non-book, bulk-marc item
    if bulk_marc:
        # Get binary MARC by identifier = ocaid/filename:offset:length
        # (raw string so \d is a regex escape, not a string escape)
        re_bulk_identifier = re.compile(r"([^/]*)/([^:]*):(\d*):(\d*)")
        # Initialize before the try: the except handler below expands
        # next_data, which would otherwise be unbound if match() or
        # get_from_archive_bulk() fails before it is assigned.
        next_data = {}
        try:
            ocaid, filename, offset, length = re_bulk_identifier.match(identifier).groups()
            data, next_offset, next_length = get_from_archive_bulk(identifier)
            next_data = {'next_record_offset': next_offset, 'next_record_length': next_length}
            rec = MarcBinary(data)
            edition = read_edition(rec)
        except MarcException as e:
            details = "%s: %s" % (identifier, str(e))
            logger.error("failed to read from bulk MARC record %s", details)
            return self.error('invalid-marc-record', details, **next_data)

        # Use the length from the record leader, not the requested length,
        # so the source_records pointer matches the actual record.
        actual_length = int(rec.leader()[:MARC_LENGTH_POS])
        edition['source_records'] = 'marc:%s/%s:%s:%d' % (ocaid, filename, offset, actual_length)

        local_id = i.get('local_id')
        if local_id:
            local_id_type = web.ctx.site.get('/local_ids/' + local_id)
            prefix = local_id_type.urn_prefix
            edition['local_id'] = ['urn:%s:%s' % (prefix, _id) for _id in rec.get_fields('001')]

        result = add_book.load(edition)

        # Add next_data to the response as location of next record:
        result.update(next_data)
        return json.dumps(result)

    # Case 1 - Is this a valid Archive.org item?
    try:
        item_json = ia.get_item_json(identifier)
        item_server = item_json['server']
        item_path = item_json['dir']
    except KeyError:
        return self.error("invalid-ia-identifier", "%s not found" % identifier)
    metadata = ia.extract_item_metadata(item_json)
    if not metadata:
        return self.error("invalid-ia-identifier")

    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
        edition_data = self.get_ia_record(metadata)
        edition_data["openlibrary"] = metadata["openlibrary"]
        edition_data = self.populate_edition_data(edition_data, identifier)
        return self.load_book(edition_data)

    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata,
                                item_server=item_server, item_path=item_path)
    if status != 'ok':
        return self.error(status, "Prohibited Item")

    # Case 4 - Does this item have a marc record?
    marc_record = self.get_marc_record(identifier)
    if marc_record:
        self.reject_non_book_marc(marc_record)
        try:
            edition_data = read_edition(marc_record)
        except MarcException as e:
            logger.error("failed to read from MARC record %s: %s", identifier, str(e))
            return self.error("invalid-marc-record")
    elif require_marc:
        return self.error("no-marc-record")
    else:
        try:
            edition_data = self.get_ia_record(metadata)
        except KeyError:
            return self.error("invalid-ia-metadata")

    # Add IA specific fields: ocaid, source_records, and cover
    edition_data = self.populate_edition_data(edition_data, identifier)
    return self.load_book(edition_data)
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or (None, None)
    """
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif '{http://www.loc.gov/MARC21/slim}record' == root.tag:
            # A MARCXML file may wrap the <record> in a <collection>;
            # unwrap one level if so.
            if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            fmt = 'marcxml'
        else:
            # print() form works on both Python 2 and 3 (py2 print
            # statement is a SyntaxError on py3).
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        fmt = 'json'
    else:
        # Special case to load IA records, DEPRECATED: use import/ia endpoint
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            # see ia_importapi to address `imagecount` limitations
            status = ia.get_item_status(itemid, metadata)
            if status != 'ok':
                raise DataError(status)
            try:
                rec = get_marc_record_from_ia(itemid)
                # skip serials
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the record leader starts with its own length;
            # use the file-level MARC_LENGTH_POS constant instead of a
            # magic 5, consistent with the other parse_data/POST code.
            if len(data) != int(data[:MARC_LENGTH_POS]):
                return json.dumps({'success': False, 'error': 'Bad MARC length'})

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), fmt
rec = MarcBinary(data)
assert read_subjects(rec) == expected

# Collect per-record subject dicts from the XML samples, then the
# binary samples, and tally them all together.
subjects = []
for item, expect in xml_samples:
    filename = os.path.dirname(
        __file__) + '/test_data/xml_input/' + item + '_marc.xml'
    element = etree.parse(filename).getroot()
    # A MARCXML file may wrap its <record> in a <collection>; unwrap it.
    if element.tag != record_tag and element[0].tag == record_tag:
        element = element[0]
    rec = MarcXml(element)
    subjects.append(read_subjects(rec))

for item, expect in bin_samples:
    filename = os.path.dirname(__file__) + '/test_data/bin_input/' + item
    data = open(filename).read()
    # If the length in the leader disagrees with the data length, the
    # fixture presumably needs re-encoding so byte offsets line up
    # — TODO confirm against how these fixtures were saved.
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    rec = MarcBinary(data)
    subjects.append(read_subjects(rec))

# Merge into a nested tally: all_subjects[subject_type][name] -> count
all_subjects = defaultdict(lambda: defaultdict(int))
for a in subjects:
    for b, c in a.items():
        for d, e in c.items():
            all_subjects[b][d] += e

# print() with a single argument behaves identically on Python 2 and 3;
# the bare py2 print statement is a SyntaxError on py3.
print(four_types(dict((k, dict(v)) for k, v in all_subjects.items())))
def test_raises_see_also(self):
    """read_edition() must raise SeeAlsoAsTitle when the record's title is a 'see also' reference."""
    filename = "%s/bin_input/talis_see_also.mrc" % test_data
    # Open in binary mode: MARC is binary data and text mode would
    # decode/mangle it on Python 3 (matches test_subjects_bin).
    with open(filename, 'rb') as f:
        rec = MarcBinary(f.read())
    with pytest.raises(SeeAlsoAsTitle):
        read_edition(rec)
def parse_data(data):
    """
    Determine the format of POSTed import data and build an Edition record.

    :param str data: raw POST body (XML, JSON, binary MARC, or 'ia:<itemid>')
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or (None, None)
    """
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif '{http://www.loc.gov/MARC21/slim}record' == root.tag:
            # A MARCXML file may wrap the <record> in a <collection>;
            # unwrap one level if so.
            if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            fmt = 'marcxml'
        else:
            # print() form works on both Python 2 and 3 (py2 print
            # statement is a SyntaxError on py3).
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        fmt = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")
            try:
                rec = get_marc_record_from_ia(itemid)
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: leader begins with the record's own length.
            if len(data) != int(data[:5]):
                return json.dumps({
                    'success': False,
                    'error': 'Bad MARC length'
                })

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), fmt
def test_subjects_bin(self, item, expected):
    """Subjects read from a binary MARC fixture match the expected mapping."""
    marc_path = os.path.dirname(__file__) + '/test_data/bin_input/' + item
    with open(marc_path, mode='rb') as marc_file:
        raw = marc_file.read()
    record = MarcBinary(raw)
    assert read_subjects(record) == expected
def test_bad_binary_data(self):
    """MarcBinary must raise BadMARC for input that is not valid binary MARC."""
    # No need to bind the return value: the constructor itself raises,
    # so the previous `result = ...` local was never used.
    with pytest.raises(BadMARC):
        MarcBinary('nonMARCdata')