def get_marc_record_from_ia(identifier):
    """
    Fetch the MARC record attached to an Internet Archive item.

    The item's ``<identifier>_marc.xml`` file is tried first; if it is not
    listed for the item, or cannot be parsed, the binary
    ``<identifier>_meta.mrc`` file is used instead.

    08/2018: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    :param str identifier: ocaid
    :rtype: MarcXML | MarcBinary
    :return: The MARC record, or None when no usable file is listed.
    """
    available = ia.get_metadata(identifier)['_filenames']
    xml_name = identifier + '_marc.xml'
    bin_name = identifier + '_meta.mrc'
    download_base = '{}{}/'.format(IA_DOWNLOAD_URL, identifier)

    # Preferred source: the MARC XML file.
    if xml_name in available:
        xml_payload = urlopen_keep_trying(download_base + xml_name).read()
        try:
            return MarcXml(etree.fromstring(xml_payload))
        except Exception as e:
            # Best effort: report the problem and fall through to the
            # binary record below.
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # Fallback source: the binary MARC file.
    if bin_name in available:
        bin_payload = urlopen_keep_trying(download_base + bin_name).read()
        return MarcBinary(bin_payload)
    return None
def load_xml(ia):
    """Download ``<ia>_marc.xml`` for the item *ia* and wrap it in a MarcXml.

    When the document root is a MARC21 ``collection`` element, its first
    child is used in place of the root.
    """
    collection_tag = '{http://www.loc.gov/MARC21/slim}collection'
    marc_url = archive_url + ia + '/' + ia + '_marc.xml'
    document = etree.parse(urlopen_keep_trying(marc_url))
    element = document.getroot()
    return MarcXml(element[0] if element.tag == collection_tag else element)
def test_subjects_xml(self, item, expected):
    """Check that read_subjects() returns *expected* for one XML fixture.

    The fixture is ``test_data/xml_input/<item>_marc.xml`` next to this file.
    """
    here = os.path.dirname(__file__)
    fixture = here + '/test_data/xml_input/' + item + '_marc.xml'
    root = etree.parse(fixture).getroot()
    # The record may be wrapped in an outer element; unwrap it when the
    # first child is the actual record.
    if root.tag != record_tag:
        first_child = root[0]
        if first_child.tag == record_tag:
            root = first_child
    subjects = read_subjects(MarcXml(root))
    assert subjects == expected
def get_marc_record_from_ia(identifier):
    """Takes an IA identifier and returns a MARC record instance.

    The item's MARC XML file is tried first; the binary MARC file is the
    fallback. None is returned when neither yields a usable record.

    11/2017: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.
    """
    listed = ia.get_metadata(identifier)['_filenames']
    xml_name = identifier + "_marc.xml"
    bin_name = identifier + "_meta.mrc"
    item_url = base + "/" + identifier + "/"

    # Try marc.xml first
    if xml_name in listed:
        xml_payload = urlopen_keep_trying(item_url + xml_name).read()
        try:
            return MarcXml(etree.fromstring(xml_payload))
        except Exception as e:
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # If that fails, try marc.bin
    if bin_name not in listed:
        return None
    bin_payload = urlopen_keep_trying(item_url + bin_name).read()
    # This checks the reported data length against the actual data length.
    # BinaryMARCs with incorrectly converted unicode characters do not match.
    if len(bin_payload) == int(bin_payload[:5]):
        return MarcBinary(bin_payload)
    return None
def load_xml(ia, host, path):
    """Download ``<ia>_marc.xml`` from *host*/*path* and wrap it in a MarcXml.

    The URL is built as ``http://<host><path>/<ia>_marc.xml`` and printed
    before it is fetched. When the document root is a MARC21 ``collection``
    element, its first child is used in place of the root.

    :param str ia: item identifier, used to build the file name
    :param str host: host name the file is fetched from
    :param str path: path on *host* that contains the file
    :return: MarcXml wrapping the record element
    """
    url = 'http://' + host + path + '/' + ia + '_marc.xml'
    print(url)
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param bytes data: Raw data
    :rtype: (dict, str)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc))
    :raises DataError: 'unrecognized-XML-format' for XML with an unsupported
        root element, 'no-marc-record' when a binary MARC payload's declared
        length does not match its actual length, and
        'unrecognised-import-format' when no known format matches.
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if b'<?xml' in data[:10]:
        root = etree.fromstring(data)
        # A MARCXML document may wrap its record in a <collection>; unwrap it
        # here so the record branch below can match. (This check used to sit
        # inside the record branch, where it could never be true.)
        if (root.tag == marc_collection_tag and len(root)
                and root[0].tag == marc_record_tag):
            root = root[0]
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif marc_record_tag == root.tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            fmt = 'marcxml'
        else:
            raise DataError('unrecognized-XML-format')
    elif data.startswith(b'{') and data.endswith(b'}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        fmt = 'json'
    elif data[:MARC_LENGTH_POS].isdigit():
        # Marc Binary: the leading digits declare the record length, which
        # must equal the actual payload length.
        if len(data) < MARC_LENGTH_POS or len(data) != int(
                data[:MARC_LENGTH_POS]):
            raise DataError('no-marc-record')
        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        fmt = 'marc'
    else:
        raise DataError('unrecognised-import-format')
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), fmt
def test_xml(self, i):
    """Compare read_edition() output for one MARC XML fixture with its
    expected JSON.

    Input is ``<test_data>/xml_input/<i>_marc.xml``; the expectation is the
    JSON stored in ``<test_data>/xml_expect/<i>_marc.xml``.
    """
    expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
    path = "%s/xml_input/%s_marc.xml" % (test_data, i)
    # Context managers close the fixture files even when parsing fails.
    with open(path) as xml_file:
        element = etree.parse(xml_file).getroot()
    # Handle MARC XML collection elements in our test_data expectations:
    if element.tag == collection_tag and element[0].tag == record_tag:
        element = element[0]
    rec = MarcXml(element)
    edition_marc_xml = read_edition(rec)
    assert edition_marc_xml
    with open(expect_filename) as expect_file:
        j = simplejson.load(expect_file)
    assert j, "Unable to open test data: %s" % expect_filename
    assert sorted(edition_marc_xml) == sorted(j), (
        'Processed MARCXML fields do not match expectations in %s'
        % expect_filename)
    # Per-key comparison first, so a failure points at the offending field.
    for k in edition_marc_xml:
        assert edition_marc_xml[k] == j[k], (
            'Processed MARCXML values do not match expectations in %s'
            % expect_filename)
    assert edition_marc_xml == j
def test_xml(self, i):
    """Compare read_edition() output for one MARC XML fixture with its
    expected JSON.

    Input is ``<test_data>/xml_input/<i>_marc.xml``; the expectation is the
    JSON stored in ``<test_data>/xml_expect/<i>_marc.xml``.
    """
    expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
    path = "%s/xml_input/%s_marc.xml" % (test_data, i)
    # Context managers close the fixture files even when parsing fails.
    with open(path) as xml_file:
        element = etree.parse(xml_file).getroot()
    # Handle MARC XML collection elements in our test_data expectations:
    if element.tag == collection_tag and element[0].tag == record_tag:
        element = element[0]
    rec = MarcXml(element)
    edition_marc_xml = read_edition(rec)
    assert edition_marc_xml
    with open(expect_filename) as expect_file:
        j = simplejson.load(expect_file)
    assert j, 'Unable to open test data: %s' % expect_filename
    assert sorted(edition_marc_xml) == sorted(j), ('Processed MARCXML fields do '
                                                   'not match expectations in %s' % expect_filename)
    # Per-key comparison first, so a failure points at the offending field.
    for k in edition_marc_xml:
        assert edition_marc_xml[k] == j[k], ('Processed MARCXML values do not '
                                             'match expectations in %s' % expect_filename)
    assert edition_marc_xml == j
def parse_data(data):
    """Detect the format of POSTed *data* and build an edition dict from it.

    Recognised inputs are XML (RDF, Atom/OPDS entry, MARCXML), a JSON object,
    and binary MARC.

    :param str data: Raw data
    :return: ``(edition dict, format)`` where format is one of
        rdf|opds|marcxml|json|marc, or ``(None, None)`` for XML with an
        unsupported root element.
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        # A MARCXML document may wrap its record in a <collection>; unwrap it
        # here so the record branch below can match. (This check used to sit
        # inside the record branch, where it could never be true.)
        if (root.tag == marc_collection_tag and len(root)
                and root[0].tag == marc_record_tag):
            root = root[0]
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif marc_record_tag == root.tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Marc Binary: the first five characters declare the record length.
        if len(data) != int(data[:5]):
            # NOTE(review): this returns a JSON string rather than the
            # (edition, format) pair returned elsewhere -- confirm callers
            # handle it.
            return json.dumps({'success': False, 'error': 'Bad MARC length'})
        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def test_xml(self, i):
    """Compare read_edition() output for one MARC XML fixture with its
    expected JSON.

    Input is ``<test_data>/xml_input/<i>_marc.xml``; the expectation is the
    JSON stored in ``<test_data>/xml_expect/<i>_marc.xml``. List-like values
    are compared without regard to order; every other value must be equal.
    """
    expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
    path = "%s/xml_input/%s_marc.xml" % (test_data, i)
    # Context managers close the fixture files even when parsing fails.
    with open(path) as xml_file:
        element = etree.parse(xml_file).getroot()
    # Handle MARC XML collection elements in our test_data expectations:
    if element.tag == collection_tag and element[0].tag == record_tag:
        element = element[0]
    rec = MarcXml(element)
    edition_marc_xml = read_edition(rec)
    assert edition_marc_xml
    with open(expect_filename) as expect_file:
        j = json.load(expect_file)
    assert j, 'Unable to open test data: %s' % expect_filename
    assert sorted(edition_marc_xml) == sorted(j), (
        'Processed MARCXML fields do not match expectations in %s' % expect_filename)
    msg = ('Processed MARCXML values do not match expectations in %s' % expect_filename)
    for key, value in edition_marc_xml.items():
        # Only real collections get the order-insensitive comparison. Strings
        # and dicts are iterable too, but a membership test would compare
        # strings by character set and dicts by keys only, hiding mismatches.
        if isinstance(value, (list, tuple, set, frozenset)):
            # can not sort a list of dicts
            assert len(value) == len(j[key]), msg
            assert all(item in value for item in j[key]), msg
        else:
            assert value == j[key], msg
def get_marc_record_from_ia(identifier):
    """Takes an IA identifier and returns a MARC record instance.

    The item's MARC XML file is used when its content has ``<?xml`` within
    the first ten characters. Otherwise the binary MARC file is used,
    provided its declared length matches its actual length. None is
    returned when neither yields a record.
    """
    listed = ia.get_metadata(identifier)['_filenames']
    xml_name = identifier + "_marc.xml"
    bin_name = identifier + "_meta.mrc"
    item_url = base + "/" + identifier + "/"

    # Try marc.xml first
    if xml_name in listed:
        xml_payload = urlopen_keep_trying(item_url + xml_name).read()
        if '<?xml' in xml_payload[:10]:
            return MarcXml(etree.fromstring(xml_payload))

    # If that fails, try marc.bin
    if bin_name not in listed:
        return None
    bin_payload = urlopen_keep_trying(item_url + bin_name).read()
    # The first five characters hold the declared record length.
    if len(bin_payload) == int(bin_payload[:5]):
        return MarcBinary(bin_payload)
    return None
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or (None, None)
    :raises DataError: for ``ia:`` requests whose item has no metadata, has a
        status other than 'ok', is a serial, or has no MARC record.
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        # A MARCXML document may wrap its record in a <collection>; unwrap it
        # here so the record branch below can match. (This check used to sit
        # inside the record branch, where it could never be true.)
        if (root.tag == marc_collection_tag and len(root)
                and root[0].tag == marc_record_tag):
            root = root[0]
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif marc_record_tag == root.tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            format = 'marcxml'
        else:
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records, DEPRECATED: use import/ia endpoint
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            # see ia_importapi to address `imagecount` limitations
            status = ia.get_item_status(itemid, metadata)
            if status != 'ok':
                raise DataError(status)

            try:
                rec = get_marc_record_from_ia(itemid)

                # skip serials
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length.
            if len(data) != int(data[:5]):
                # NOTE(review): this returns a JSON string rather than the
                # (edition, format) pair returned elsewhere -- confirm
                # callers handle it.
                return json.dumps({'success':False, 'error':'Bad MARC length'})

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), format
def parse_data(data):
    """Detect the format of POSTed *data* and build an edition dict from it.

    Recognised inputs are XML (RDF, Atom/OPDS entry, MARCXML), a JSON object,
    an ``ia:<identifier>`` reference to an Internet Archive item, and binary
    MARC.

    :param str data: Raw data
    :return: ``(edition dict, format)`` where format is one of
        rdf|opds|marcxml|json|marc, or ``(None, None)`` for XML with an
        unsupported root element.
    :raises DataError: for ``ia:`` requests whose item has no metadata, is
        not a book, is a serial, or has no MARC record.
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        # A MARCXML document may wrap its record in a <collection>; unwrap it
        # here so the record branch below can match. (This check used to sit
        # inside the record branch, where it could never be true.)
        if (root.tag == marc_collection_tag and len(root)
                and root[0].tag == marc_record_tag):
            root = root[0]
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif marc_record_tag == root.tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            format = 'marcxml'
        else:
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")

            try:
                rec = get_marc_record_from_ia(itemid)

                # skip serials. The record may be missing; guard it so that
                # case reaches the "no-marc-record" error below instead of
                # failing on the attribute access.
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length.
            if len(data) != int(data[:5]):
                # NOTE(review): this returns a JSON string rather than the
                # (edition, format) pair returned elsewhere -- confirm
                # callers handle it.
                return json.dumps({'success':False, 'error':'Bad MARC length'})

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), format
def parse_data(data):
    """Detect the format of POSTed *data* and build an edition dict from it.

    Recognised inputs are XML (RDF, Atom/OPDS entry, MARCXML), a JSON object,
    an ``ia:<identifier>`` reference to an Internet Archive item, and binary
    MARC.

    :param str data: Raw data
    :return: ``(edition dict, format)`` where format is one of
        rdf|opds|marcxml|json|marc, or ``(None, None)`` for XML with an
        unsupported root element.
    :raises DataError: for ``ia:`` requests whose item has no metadata, is
        not a book, or has no MARC record.
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        # A MARCXML document may wrap its record in a <collection>; unwrap it
        # here so the record branch below can match. (This check used to sit
        # inside the record branch, where it could never be true.)
        if (root.tag == marc_collection_tag and len(root)
                and root[0].tag == marc_record_tag):
            root = root[0]
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif marc_record_tag == root.tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")

            try:
                rec = get_marc_record_from_ia(itemid)
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length.
            if len(data) != int(data[:5]):
                # NOTE(review): this returns a JSON string rather than the
                # (edition, format) pair returned elsewhere -- confirm
                # callers handle it.
                return json.dumps({
                    'success': False,
                    'error': 'Bad MARC length'
                })

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), format
data = open(filename).read() if len(data) != int(data[:5]): data = data.decode('utf-8').encode('raw_unicode_escape') rec = MarcBinary(data) assert read_subjects(rec) == expected subjects = [] for item, expect in xml_samples: filename = os.path.dirname( __file__) + '/test_data/xml_input/' + item + '_marc.xml' element = etree.parse(filename).getroot() if element.tag != record_tag and element[0].tag == record_tag: element = element[0] rec = MarcXml(element) subjects.append(read_subjects(rec)) for item, expect in bin_samples: filename = os.path.dirname(__file__) + '/test_data/bin_input/' + item data = open(filename).read() if len(data) != int(data[:5]): data = data.decode('utf-8').encode('raw_unicode_escape') rec = MarcBinary(data) subjects.append(read_subjects(rec)) all_subjects = defaultdict(lambda: defaultdict(int)) for a in subjects: for b, c in a.items(): for d, e in c.items():