def format_book_data(book):
    """Convert an edition object into a plain ``web.storage`` summary dict.

    Pulls key/url/title/ocaid plus eligibility and availability, resolves
    authors and cover from the edition's work when one exists, and adds a
    borrow or read URL for lendable IA items.

    :param book: an Open Library edition object (project type)
    :rtype: web.storage
    """
    d = web.storage()
    d.key = book.get('key')
    d.url = book.url()
    d.title = book.title or None
    d.ocaid = book.get("ocaid")
    d.eligibility = book.get("eligibility", {})
    d.availability = book.get('availability', {})

    def get_authors(doc):
        return [web.storage(key=a.key, name=a.name or None) for a in doc.get_authors()]

    work = book.works and book.works[0]
    d.authors = get_authors(work if work else book)
    d.work_key = work.key if work else book.key
    # Prefer the work's cover, falling back to the edition's.
    # (The original evaluated work.get_cover() twice; call it once.)
    cover = (work and work.get_cover()) or book.get_cover()
    if cover:
        d.cover_url = cover.url("M")
    elif d.ocaid:
        d.cover_url = 'https://archive.org/services/img/%s' % d.ocaid
    if d.ocaid:
        collections = ia.get_metadata(d.ocaid).get('collection', [])
        # Lendable collections get a borrow link; everything else a read link.
        if 'lendinglibrary' in collections or 'inlibrary' in collections:
            d.borrow_url = book.url("/borrow")
        else:
            d.read_url = book.url("/borrow")
    return d
def get_metadata(self, identifier):
    """Return IA metadata for *identifier*, preferring the local cache."""
    logger.info("find_metadata %s", identifier)
    try:
        return self.ia_cache[identifier]
    except KeyError:
        return ia.get_metadata(identifier)
def get_marc_record_from_ia(identifier):
    """
    Takes IA identifiers and returns MARC record instance.
    08/2018: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    :param str identifier: ocaid
    :rtype: MarcXml | MarcBinary
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']
    item_base = '{}{}/'.format(IA_DOWNLOAD_URL, identifier)
    xml_name = identifier + '_marc.xml'
    bin_name = identifier + '_meta.mrc'
    # Prefer the MARC XML file when the item has one.
    if xml_name in filenames:
        data = urlopen_keep_trying(item_base + xml_name).read()
        try:
            return MarcXml(etree.fromstring(data))
        except Exception as e:
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()
    # Fall back to binary MARC when XML is missing or unreadable.
    if bin_name in filenames:
        data = urlopen_keep_trying(item_base + bin_name).read()
        return MarcBinary(data)
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.

    11/2017: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    :param str identifier: ocaid
    :rtype: MarcXml | MarcBinary | None
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']
    marc_xml_filename = identifier + "_marc.xml"
    marc_bin_filename = identifier + "_meta.mrc"
    item_base = base + "/" + identifier + "/"
    # Try marc.xml first
    if marc_xml_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_xml_filename).read()
        try:
            root = etree.fromstring(data)
            return MarcXml(root)
        except Exception as e:
            # Fixed: was a Python 2 print statement; now the print() function,
            # consistent with the sibling implementation in this file.
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()
    # If that fails, try marc.bin
    if marc_bin_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_bin_filename).read()
        if len(data) == int(data[:5]):
            # This checks the reported data length against the actual data length.
            # BinaryMARCs with incorrectly converted unicode characters do not match.
            return MarcBinary(data)
def get_metadata(self, identifier):
    """Return metadata for *identifier*, using the in-memory IA cache first."""
    if identifier not in self.ia_cache:
        logger.info("IA metadata cache miss")
        return ia.get_metadata(identifier)
    logger.info("IA metadata cache hit")
    return self.ia_cache[identifier]
def get(self, sitename, data):
    """Middleware hook: resolve pseudo-keys of the form /books/ia:<itemid>.

    If the item is already imported as a real edition, redirect to it and
    drop the pending-import store entry. Otherwise synthesize an edition
    document from IA metadata. Non-ia keys fall through to the default
    ConnectionMiddleware behavior.
    """
    key = data.get('key')
    itemid = self._get_itemid(key)
    if itemid:
        edition_key = self._find_edition(sitename, itemid)
        if edition_key:
            # Delete the store entry, indicating that this is no more an
            # item to be imported.
            self._ensure_no_store_entry(sitename, itemid)
            return self._make_redirect(itemid, edition_key)
        else:
            metadata = ia.get_metadata(itemid)
            doc = ia.edition_from_item_metadata(itemid, metadata)
            if doc is None:
                # Delete store entry, if it exists.
                # When an item is darked on archive.org, it should be
                # automatically removed from OL. Removing entry from store
                # will trigger the solr-updater to delete it from solr as well.
                self._ensure_no_store_entry(sitename, itemid)
                raise client.ClientException(
                    "404 Not Found", "notfound",
                    simplejson.dumps({"key": "/books/ia:" + itemid, "error": "notfound"}))
            storedoc = self._ensure_store_entry(sitename, itemid)
            # Hack to add additional subjects to /books/ia: pages.
            # Adding subjects to store docs will add those subjects to the books.
            # These subjects are used when indexing the books in solr.
            if storedoc.get("subjects"):
                doc.setdefault("subjects", []).extend(storedoc['subjects'])
            return simplejson.dumps(doc)
    else:
        return ConnectionMiddleware.get(self, sitename, data)
def POST(self):
    """Import an archive.org item into Open Library via its MARC record.

    Works through numbered cases: already-loaded, invalid identifier,
    operator-supplied openlibrary key, prohibited status, missing MARC,
    serials, then finally loads the edition. Returns a JSON string.
    """
    web.header('Content-Type', 'application/json')
    if not can_write():
        return json.dumps({'success': False, 'error': 'Permission Denied'})
    i = web.input()
    if "identifier" not in i:
        # NOTE(review): the result of self.error() is not returned here,
        # unlike every other error path below — confirm whether self.error
        # raises, otherwise execution falls through with i.identifier unset.
        self.error("bad-input", "identifier not provided")
    identifier = i.identifier
    # Case 0 - Is the item already loaded
    key = self.find_edition(identifier)
    if key:
        return self.status_matched(key)
    # Case 1 - Is this a valid item?
    metadata = ia.get_metadata(identifier)
    if not metadata:
        return self.error("invalid-ia-identifier")
    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get("mediatype") == "texts" and metadata.get(
            "openlibrary"):
        d = {
            "title": metadata['title'],
            "openlibrary": "/books/" + metadata["openlibrary"]
        }
        d = self.populate_edition_data(d, identifier)
        return self.load_book(d)
    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata)
    if status != 'ok':
        return self.error(status, "Prohibited Item")
    # Gio - April 2016
    # items with metadata no_ol_import=true will not be imported
    if metadata.get("no_ol_import") == 'true' or metadata.get(
            "no_ol_import") == 'True':
        return self.error("no-ol-import")
    # Case 4 - Does this item have a marc record?
    marc_record = self.get_marc_record(identifier)
    if not marc_record:
        return self.error("no-marc-record")
    # Case 5 - Is the item a serial instead of a book?
    # (leader position 7 is the MARC bibliographic level; 's' = serial)
    if marc_record.leader()[7] == 's':
        return self.error("item-is-serial")
    edition_data = self.get_edition_data(identifier, marc_record)
    if not edition_data:
        return self.error("invalid-marc-record")
    return self.load_book(edition_data)
def get_ia_availability(itemid):
    """Classify an IA item's access level from its collection membership."""
    collections = ia.get_metadata(itemid).get('collection', [])
    is_lendable = any(c in collections for c in ('lendinglibrary', 'inlibrary'))
    if is_lendable:
        return 'borrow'
    if 'printdisabled' in collections:
        return 'restricted'
    return 'full'
def POST(self):
    """Import an archive.org item into Open Library via its MARC record.

    Walks numbered cases (already loaded, invalid id, operator-specified
    OL key, prohibited status, opt-out flag, missing MARC, serial) before
    loading the edition. Returns a JSON string.
    """
    web.header('Content-Type', 'application/json')
    if not can_write():
        return json.dumps({'success':False, 'error':'Permission Denied'})
    i = web.input()
    if "identifier" not in i:
        # NOTE(review): self.error()'s result is not returned here, unlike
        # the other error paths — confirm self.error raises, otherwise
        # i.identifier below would fail.
        self.error("bad-input", "identifier not provided")
    identifier = i.identifier
    # Case 0 - Is the item already loaded
    key = self.find_edition(identifier)
    if key:
        return self.status_matched(key)
    # Case 1 - Is this a valid item?
    metadata = ia.get_metadata(identifier)
    if not metadata:
        return self.error("invalid-ia-identifier")
    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
        d = {
            "title": metadata['title'],
            "openlibrary": "/books/" + metadata["openlibrary"]
        }
        d = self.populate_edition_data(d, identifier)
        return self.load_book(d)
    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata)
    if status != 'ok':
        return self.error(status, "Prohibited Item")
    # Gio - April 2016
    # items with metadata no_ol_import=true will not be imported
    if metadata.get("no_ol_import") == 'true' or metadata.get("no_ol_import") == 'True':
        return self.error("no-ol-import")
    # Case 4 - Does this item have a marc record?
    marc_record = self.get_marc_record(identifier)
    if not marc_record:
        return self.error("no-marc-record")
    # Case 5 - Is the item a serial instead of a book?
    # (leader position 7 is the MARC bibliographic level; 's' = serial)
    if marc_record.leader()[7] == 's':
        return self.error("item-is-serial")
    edition_data = self.get_edition_data(identifier, marc_record)
    if not edition_data:
        return self.error("invalid-marc-record")
    return self.load_book(edition_data)
def ia_import(cls, identifier, require_marc=True, force_import=False):
    """
    Performs logic to fetch archive.org item + metadata, produces a data
    dict, then loads into Open Library

    :param str identifier: archive.org ocaid
    :param bool require_marc: require archive.org item have MARC record?
    :param bool force_import: force import of this record
    :rtype: dict
    :returns: the data of the imported book or raises BookImportError
    """
    # Case 1 - Is this a valid Archive.org item?
    metadata = ia.get_metadata(identifier)
    if not metadata:
        raise BookImportError('invalid-ia-identifier', '%s not found' % identifier)
    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get('mediatype') == 'texts' and metadata.get(
            'openlibrary'):
        edition_data = cls.get_ia_record(metadata)
        edition_data['openlibrary'] = metadata['openlibrary']
        edition_data = cls.populate_edition_data(edition_data, identifier)
        return cls.load_book(edition_data)
    # Case 3 - Can the item be loaded into Open Library?
    # force_import bypasses a prohibited status.
    status = ia.get_item_status(identifier, metadata)
    if status != 'ok' and not force_import:
        raise BookImportError(status, 'Prohibited Item %s' % identifier)
    # Case 4 - Does this item have a marc record?
    marc_record = get_marc_record_from_ia(identifier)
    if require_marc and not marc_record:
        raise BookImportError('no-marc-record')
    if marc_record:
        # force_import also skips the non-book (e.g. serial) MARC check.
        if not force_import:
            raise_non_book_marc(marc_record)
        try:
            edition_data = read_edition(marc_record)
        except MarcException as e:
            logger.error('failed to read from MARC record %s: %s',
                         identifier, str(e))
            raise BookImportError('invalid-marc-record')
    else:
        # No MARC (and none required): build the edition from IA metadata.
        try:
            edition_data = cls.get_ia_record(metadata)
        except KeyError:
            raise BookImportError('invalid-ia-metadata')
    # Add IA specific fields: ocaid, source_records, and cover
    edition_data = cls.populate_edition_data(edition_data, identifier)
    return cls.load_book(edition_data)
def get_ia_meta_fields(self):
    """Return IA metadata for this edition's ocaid, memoized on the instance."""
    # Check for cached value.
    # $$$ we haven't assigned _ia_meta_fields the first time around but there's apparently
    # some magic that lets us check this way (and breaks using hasattr to check if defined)
    if self._ia_meta_fields:
        return self._ia_meta_fields
    if self.get('ocaid', None):
        meta = ia.get_metadata(self.ocaid)
    else:
        meta = {}
    # Guarantee these keys exist so callers can iterate them unconditionally.
    for key in ('external-identifier', 'collection'):
        meta.setdefault(key, [])
    self._ia_meta_fields = meta
    return self._ia_meta_fields
def test_get_metadata(monkeypatch, mock_memcache):
    """get_metadata() flattens the API response and adds derived fields."""
    api_response = {
        "metadata": {
            "title": "Foo",
            "identifier": "foo00bar",
            "collection": ["printdisabled", "inlibrary"],
        }
    }
    monkeypatch.setattr(ia, 'get_api_response', lambda *args: api_response)
    expected = {
        "title": "Foo",
        "identifier": "foo00bar",
        "collection": ["printdisabled", "inlibrary"],
        "access-restricted": False,
        "_filenames": [],
    }
    assert ia.get_metadata('foo00bar') == expected
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance."""
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']
    item_base = base + "/" + identifier + "/"
    xml_name = identifier + "_marc.xml"
    bin_name = identifier + "_meta.mrc"
    # Prefer the MARC XML file when present and well-formed.
    if xml_name in filenames:
        data = urlopen_keep_trying(item_base + xml_name).read()
        if data[:10].find('<?xml') != -1:
            return MarcXml(etree.fromstring(data))
    # Otherwise fall back to binary MARC, if its declared length checks out.
    if bin_name in filenames:
        data = urlopen_keep_trying(item_base + bin_name).read()
        if int(data[:5]) == len(data):
            return MarcBinary(data)
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or (None, None)
    """
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif '{http://www.loc.gov/MARC21/slim}record' == root.tag:
            # NOTE(review): this inner check can never be true when the outer
            # elif matched 'record'; presumably it was meant to unwrap a
            # 'collection' wrapper element — confirm before restructuring.
            if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            format = 'marcxml'
        else:
            # Fixed: was a Python 2 print statement.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records, DEPRECATED: use import/ia endpoint
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            # see ia_importapi to address `imagecount` limitations
            status = ia.get_item_status(itemid, metadata)
            if status != 'ok':
                raise DataError(status)
            try:
                rec = get_marc_record_from_ia(itemid)
                # skip serials (MARC leader position 7 == 's')
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None
            # Marc Binary
            if len(data) != int(data[:5]):
                # NOTE(review): this returns a bare JSON string rather than
                # the usual (dict, format) tuple — confirm callers handle it.
                return json.dumps({'success': False, 'error': 'Bad MARC length'})
            rec = MarcBinary(data)
        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def _get_ia_item(self, itemid):
    """Build a synthetic /type/edition dict for an IA item from its metadata.

    Raises client.ClientException (404) when the item is not valid.
    """
    timestamp = {"type": "/type/datetime", "value": "2010-01-01T00:00:00"}
    metadata = ia.get_metadata(itemid)
    if not self._is_valid_item(itemid, metadata):
        raise client.ClientException("404 Not Found", "notfound",
                                     simplejson.dumps({"key": "/books/ia:" + itemid}))
    d = {
        "key": "/books/ia:" + itemid,
        "type": {"key": "/type/edition"},
        "title": itemid,
        "ocaid": itemid,
        "revision": 1,
        "created": timestamp,
        "last_modified": timestamp
    }

    def add(key, key2=None):
        # Copy a scalar metadata field into d (joining lists of strings).
        key2 = key2 or key
        # Sometimes the empty values are represented as {} in metadata API. Avoid them.
        if key in metadata and metadata[key] != {}:
            value = metadata[key]
            if isinstance(value, list):
                value = [v for v in value if v != {}]
                if value:
                    # NOTE(review): basestring is Python 2 only — this code
                    # predates a py3 port.
                    if isinstance(value[0], basestring):
                        value = "\n\n".join(value)
                    else:
                        value = value[0]
                else:
                    # empty list. Ignore.
                    return
            d[key2] = value

    def add_list(key, key2):
        # Copy a metadata field into d, always as a list.
        key2 = key2 or key
        # Sometimes the empty values are represented as {} in metadata API. Avoid them.
        if key in metadata and metadata[key] != {}:
            value = metadata[key]
            if not isinstance(value, list):
                value = [value]
            d[key2] = value

    def add_isbns():
        # Split ISBNs into isbn_10/isbn_13 by normalized length.
        isbns = metadata.get('isbn')
        isbn_10 = []
        isbn_13 = []
        if isbns:
            for isbn in isbns:
                isbn = isbn.replace("-", "").strip()
                if len(isbn) == 13:
                    isbn_13.append(isbn)
                elif len(isbn) == 10:
                    isbn_10.append(isbn)
        if isbn_10:
            d["isbn_10"] = isbn_10
        if isbn_13:
            d["isbn_13"] = isbn_13

    def add_subjects():
        # Map lending collections to human-readable subjects.
        collections = metadata.get("collection", [])
        mapping = {
            "inlibrary": "In library",
            "lendinglibrary": "Lending library"
        }
        subjects = [subject for c, subject in mapping.items() if c in collections]
        if subjects:
            d['subjects'] = subjects

    add('title')
    add('description', 'description')
    add_list('publisher', 'publishers')
    add_list("creator", "author_names")
    add('date', 'publish_date')
    add_isbns()
    add_subjects()
    return d
def POST(self):
    """Import an item into Open Library.

    Handles two modes: bulk binary-MARC import (identifier =
    ocaid/filename:offset:length) and the normal archive.org item import
    via numbered cases. Returns a JSON string.
    """
    web.header('Content-Type', 'application/json')
    if not can_write():
        raise web.HTTPError('403 Forbidden')
    i = web.input()
    require_marc = not (i.get('require_marc') == 'false')
    bulk_marc = i.get('bulk_marc') == 'true'
    if 'identifier' not in i:
        return self.error('bad-input', 'identifier not provided')
    identifier = i.identifier
    # First check whether this is a non-book, bulk-marc item
    if bulk_marc:
        # Get binary MARC by identifier = ocaid/filename:offset:length
        re_bulk_identifier = re.compile("([^/]*)/([^:]*):(\d*):(\d*)")
        try:
            ocaid, filename, offset, length = re_bulk_identifier.match(
                identifier).groups()
            data, next_offset, next_length = get_from_archive_bulk(
                identifier)
            next_data = {
                'next_record_offset': next_offset,
                'next_record_length': next_length
            }
            rec = MarcBinary(data)
            edition = read_edition(rec)
        except MarcException as e:
            # NOTE(review): if the failure happens before next_data is
            # assigned (e.g. a bad identifier), **next_data below raises
            # NameError rather than returning the error — confirm.
            details = "%s: %s" % (identifier, str(e))
            logger.error("failed to read from bulk MARC record %s", details)
            return self.error('invalid-marc-record', details, **next_data)
        # Record the actual (not requested) record length in source_records.
        actual_length = int(rec.leader()[:MARC_LENGTH_POS])
        edition['source_records'] = 'marc:%s/%s:%s:%d' % (
            ocaid, filename, offset, actual_length)
        local_id = i.get('local_id')
        if local_id:
            local_id_type = web.ctx.site.get('/local_ids/' + local_id)
            prefix = local_id_type.urn_prefix
            id_field, id_subfield = local_id_type.id_location.split('$')

            def get_subfield(field, id_subfield):
                # Extract the configured subfield value from a MARC field.
                if isinstance(field, str):
                    return field
                subfields = field[1].get_subfield_values(id_subfield)
                return subfields[0] if subfields else None

            _ids = [
                get_subfield(f, id_subfield)
                for f in rec.read_fields([id_field])
                if f and get_subfield(f, id_subfield)
            ]
            edition['local_id'] = [
                'urn:%s:%s' % (prefix, _id) for _id in _ids
            ]
        # Don't add the book if the MARC record is a non-book item
        self.reject_non_book_marc(rec, **next_data)
        result = add_book.load(edition)
        # Add next_data to the response as location of next record:
        result.update(next_data)
        return json.dumps(result)
    # Case 1 - Is this a valid Archive.org item?
    metadata = ia.get_metadata(identifier)
    if not metadata:
        return self.error('invalid-ia-identifier', '%s not found' % identifier)
    # Case 2 - Does the item have an openlibrary field specified?
    # The scan operators search OL before loading the book and add the
    # OL key if a match is found. We can trust them and attach the item
    # to that edition.
    if metadata.get('mediatype') == 'texts' and metadata.get(
            'openlibrary'):
        edition_data = self.get_ia_record(metadata)
        edition_data['openlibrary'] = metadata['openlibrary']
        edition_data = self.populate_edition_data(edition_data, identifier)
        return self.load_book(edition_data)
    # Case 3 - Can the item be loaded into Open Library?
    status = ia.get_item_status(identifier, metadata)
    if status != 'ok':
        return self.error(status, 'Prohibited Item %s' % identifier)
    # Case 4 - Does this item have a marc record?
    marc_record = get_marc_record_from_ia(identifier)
    if marc_record:
        self.reject_non_book_marc(marc_record)
        try:
            edition_data = read_edition(marc_record)
        except MarcException as e:
            logger.error('failed to read from MARC record %s: %s',
                         identifier, str(e))
            return self.error('invalid-marc-record')
    elif require_marc:
        return self.error('no-marc-record')
    else:
        # No MARC and none required: build edition from IA metadata.
        try:
            edition_data = self.get_ia_record(metadata)
        except KeyError:
            return self.error("invalid-ia-metadata")
    # Add IA specific fields: ocaid, source_records, and cover
    edition_data = self.populate_edition_data(edition_data, identifier)
    return self.load_book(edition_data)
def parse_data(data):
    """Determine the format of POSTed *data* and build an Edition record.

    :param str data: raw POST body (XML, JSON, 'ia:<id>', or binary MARC)
    :return: (edition dict, format) where format is one of
             rdf|opds|marcxml|json|marc, or (None, None) for unknown XML.
    :raises DataError: for invalid/prohibited IA identifiers or missing MARC.
    """
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif '{http://www.loc.gov/MARC21/slim}record' == root.tag:
            # NOTE(review): this inner check is unreachable when the outer
            # elif matched 'record'; likely intended to unwrap a 'collection'
            # wrapper — confirm before restructuring.
            if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            format = 'marcxml'
        else:
            # Fixed: was a Python 2 print statement.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")
            try:
                rec = get_marc_record_from_ia(itemid)
                # skip serials (MARC leader position 7 == 's')
                if rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None
            # Marc Binary
            if len(data) != int(data[:5]):
                # NOTE(review): returns a bare JSON string rather than the
                # usual (dict, format) tuple — confirm callers handle it.
                return json.dumps({'success': False, 'error': 'Bad MARC length'})
            rec = MarcBinary(data)
        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def process(self, req):
    """Process a multi-request query string into per-key result records.

    *req* is '|'-separated requests, each a ';'-separated list of bib keys,
    optionally prefixed with 'id:<result_key>'. Resolves docs/details/data
    via dynlinks, maps works to IA item ids and IA ids back to editions,
    then builds one record per request with self.make_record().
    """
    requests = req.split('|')
    bib_keys = sum([r.split(';') for r in requests], [])
    # filter out 'id:foo' before passing to dynlinks
    bib_keys = [k for k in bib_keys if k[:3].lower() != 'id:']
    self.docs = dynlinks.query_docs(bib_keys)
    if not self.options.get('no_details'):
        self.detailss = dynlinks.process_result_for_details(self.docs)
    else:
        self.detailss = {}
    dp = dynlinks.DataProcessor()
    self.datas = dp.process(self.docs)
    self.works = dp.works
    # XXX control costs below with [:iaid_limit] - note that this may result
    # in no 'exact' item match, even if one exists
    # Note that it's available thru above works/docs
    iaid_limit = 500
    self.wkey_to_iaids = dict(
        (wkey, get_work_iaids(wkey)[:iaid_limit]) for wkey in self.works)
    iaids = sum(self.wkey_to_iaids.values(), [])
    self.iaid_to_meta = dict(
        (iaid, ia.get_metadata(iaid)) for iaid in iaids)

    def lookup_iaids(iaids):
        # Query editions by ocaid in batches of `step` to bound query size.
        step = 10
        if len(iaids) > step and not self.options.get('debug_things'):
            result = []
            while iaids:
                result += lookup_iaids(iaids[:step])
                iaids = iaids[step:]
            return result
        query = {
            'type': '/type/edition',
            'ocaid': iaids,
        }
        result = web.ctx.site.things(query)
        return result

    ekeys = lookup_iaids(iaids)
    # If returned order were reliable, I could skip the below.
    eds = dynlinks.ol_get_many_as_dict(ekeys)
    self.iaid_to_ed = dict((ed['ocaid'], ed) for ed in eds.values())
    # self.iaid_to_ekey = dict((iaid, ed['key'])
    #                          for iaid, ed in self.iaid_to_ed.items())

    # Work towards building a dict of iaid loanability,
    # def has_lending_collection(meta):
    #     collections = meta.get("collection", [])
    #     return 'lendinglibrary' in collections or 'inlibrary' in collections
    # in case site.store supports get_many (unclear)
    # maybe_loanable_iaids = [iaid for iaid in iaids
    #                         if has_lending_collection(self.iaid_to_meta.get(iaid, {}))]
    # loanable_ekeys = [self.iaid_to_ekey.get(iaid) for iaid in maybe_loanable_iaids]
    # loanstatus = web.ctx.site.store.get('ebooks' + ekey, {'borrowed': 'false'})

    result = {}
    for r in requests:
        bib_keys = r.split(';')
        if r.lower().startswith('id:'):
            # 'id:foo' names the result key and is not itself a bib key.
            result_key = bib_keys.pop(0)[3:]
        else:
            result_key = r
        sub_result = self.make_record(bib_keys)
        if sub_result:
            result[result_key] = sub_result
    if self.options.get('debug_items'):
        result['ekeys'] = ekeys
        result['eds'] = eds
        result['iaids'] = iaids
    return result
def test_get_metadata_empty(monkeypatch, mock_memcache):
    """An empty API response yields empty metadata."""
    monkeypatch.setattr(ia, 'get_api_response', lambda *_args: {})
    result = ia.get_metadata('foo02bar')
    assert result == {}
def get_metadata(self, identifier):
    """Log the lookup, then fetch IA metadata for *identifier*."""
    logger.info("find_metadata %s", identifier)
    metadata = ia.get_metadata(identifier)
    return metadata
def get_metadata(self, identifier: str):
    """Fetch archive.org metadata for *identifier* via the ia module."""
    metadata = ia.get_metadata(identifier)
    return metadata
def _get_ia_item(self, itemid):
    """Build a synthetic /type/edition dict for an IA item from its metadata.

    Returns None when the item is not a valid importable item.
    """
    timestamp = {"type": "/type/datetime", "value": "2010-01-01T00:00:00"}
    metadata = ia.get_metadata(itemid)
    if not self._is_valid_item(itemid, metadata):
        return None
    d = {
        "key": "/books/ia:" + itemid,
        "type": {
            "key": "/type/edition"
        },
        "title": itemid,
        "ocaid": itemid,
        "revision": 1,
        "created": timestamp,
        "last_modified": timestamp
    }

    def add(key, key2=None):
        # Copy a scalar metadata field into d (joining lists of strings).
        key2 = key2 or key
        # Sometimes the empty values are represented as {} in metadata API. Avoid them.
        if key in metadata and metadata[key] != {}:
            value = metadata[key]
            if isinstance(value, list):
                value = [v for v in value if v != {}]
                if value:
                    # NOTE(review): basestring is Python 2 only — this code
                    # predates a py3 port.
                    if isinstance(value[0], basestring):
                        value = "\n\n".join(value)
                    else:
                        value = value[0]
                else:
                    # empty list. Ignore.
                    return
            d[key2] = value

    def add_list(key, key2):
        # Copy a metadata field into d, always as a list.
        key2 = key2 or key
        # Sometimes the empty values are represented as {} in metadata API. Avoid them.
        if key in metadata and metadata[key] != {}:
            value = metadata[key]
            if not isinstance(value, list):
                value = [value]
            d[key2] = value

    def add_isbns():
        # Split ISBNs into isbn_10/isbn_13 by normalized length.
        isbns = metadata.get('isbn')
        isbn_10 = []
        isbn_13 = []
        if isbns:
            for isbn in isbns:
                isbn = isbn.replace("-", "").strip()
                if len(isbn) == 13:
                    isbn_13.append(isbn)
                elif len(isbn) == 10:
                    isbn_10.append(isbn)
        if isbn_10:
            d["isbn_10"] = isbn_10
        if isbn_13:
            d["isbn_13"] = isbn_13

    def add_subjects():
        # Map lending collections to human-readable subjects.
        collections = metadata.get("collection", [])
        mapping = {
            "inlibrary": "In library",
            "lendinglibrary": "Lending library"
        }
        subjects = [
            subject for c, subject in mapping.items() if c in collections
        ]
        if subjects:
            d['subjects'] = subjects

    add('title')
    add('description', 'description')
    add_list('publisher', 'publishers')
    add_list("creator", "author_names")
    add('date', 'publish_date')
    add_isbns()
    add_subjects()
    return d
def parse_data(data):
    """Determine the format of POSTed *data* and build an Edition record.

    :param str data: raw POST body (XML, JSON, 'ia:<id>', or binary MARC)
    :return: (edition dict, format) where format is one of
             rdf|opds|marcxml|json|marc, or (None, None) for unknown XML.
    :raises DataError: for invalid IA identifiers, non-book items, or
                       missing MARC records.
    """
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif '{http://www.loc.gov/MARC21/slim}record' == root.tag:
            # NOTE(review): this inner check is unreachable when the outer
            # elif matched 'record'; likely intended to unwrap a 'collection'
            # wrapper — confirm before restructuring.
            if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            # Fixed: was a Python 2 print statement.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]
            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")
            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")
            try:
                rec = get_marc_record_from_ia(itemid)
            except IOError:
                raise DataError("no-marc-record")
            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None
            # Marc Binary
            if len(data) != int(data[:5]):
                # NOTE(review): returns a bare JSON string rather than the
                # usual (dict, format) tuple — confirm callers handle it.
                return json.dumps({
                    'success': False,
                    'error': 'Bad MARC length'
                })
            rec = MarcBinary(data)
        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'
    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), format
def test_get_metadata_empty(monkeypatch, mock_memcache):
    """get_metadata passes through an empty dict from _get_metadata."""
    monkeypatch.setattr(ia, '_get_metadata', lambda _identifier: {})
    result = ia.get_metadata('foo02bar')
    assert result == {}