def lookup(self, identifier):
    """Requests NoveList metadata for a particular identifier

    :return: Metadata object or None
    """
    client_identifier = identifier.urn
    if identifier.type != Identifier.ISBN:
        return self.lookup_equivalent_isbns(identifier)

    params = dict(ClientIdentifier=client_identifier,
                  ISBN=identifier.identifier,
                  version=self.version, profile=self.profile,
                  password=self.password)
    url = self._build_query(params)

    self.log.debug("NoveList lookup: %s", url)
    representation, from_cache = Representation.cacheable_post(
        self._db, unicode(url), params,
        max_age=self.MAX_REPRESENTATION_AGE,
        response_reviewer=self.review_response)

    return self.lookup_info_to_metadata(representation)
def _check_for_gutenberg_first(self, url, headers, **kwargs):
    """Make a HEAD request for the given URL to make sure it doesn't
    redirect to gutenberg.org.
    """
    parsed = urlparse.urlparse(url)
    if parsed.netloc.endswith('unglue.it'):
        # It might be a redirect. Make a HEAD request to see where
        # it leads.
        head_response = requests.head(url, headers=headers)
        if head_response.status_code / 100 == 3:
            # Yes, it's a redirect.
            location = head_response.headers.get('location')
            if location:
                parsed = urlparse.urlparse(location)
                if parsed.netloc.endswith('gutenberg.org'):
                    # If we make this request we're going to be in
                    # for some trouble, and we won't even get
                    # anything useful. Act as though we got an
                    # unappetizing representation.
                    self.log.info("Not making request to gutenberg.org.")
                    return (
                        200,
                        {"content-type": "application/vnd.librarysimplified-clickthrough"},
                        "Gated behind Gutenberg click-through"
                    )
    return Representation.simple_http_get(url, headers, **kwargs)
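# A minimal, standalone sketch of the same redirect check, assuming only the
# `requests` library and the standard-library URL parser. The helper name,
# the `blocked` default, and the timeout are illustrative; they are not part
# of the class above.
import requests

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2


def redirects_to_blocked_host(url, blocked='gutenberg.org', timeout=10):
    """Return True if a HEAD request for `url` answers with a redirect
    whose Location header points at the blocked host."""
    response = requests.head(url, timeout=timeout, allow_redirects=False)
    if response.status_code // 100 != 3:
        # Not a redirect at all.
        return False
    location = response.headers.get('location', '')
    return urlparse(location).netloc.endswith(blocked)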
def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
    if not path.startswith(self.BASE_URL):
        if not path.startswith("/"):
            path = "/" + path
        url = self.BASE_URL + path
    else:
        url = path
    joiner = '?'
    if '?' in url:
        joiner = '&'
    url += joiner + "api-key=" + self.api_key
    representation, cached = Representation.get(
        self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
        pause_before=0.1)
    status = representation.status_code
    if status == 200:
        # Everything's fine.
        content = json.loads(representation.content)
        return content

    diagnostic = "Response from %s was: %r" % (
        url, representation.content
    )

    if status == 403:
        raise IntegrationException(
            "API authentication failed",
            "API key is most likely wrong. %s" % diagnostic
        )
    else:
        raise IntegrationException(
            "Unknown API error (status %s)" % status, diagnostic
        )
def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
    if not path.startswith(self.BASE_URL):
        if not path.startswith("/"):
            path = "/" + path
        url = self.BASE_URL + path
    else:
        url = path
    joiner = '?'
    if '?' in url:
        joiner = '&'
    url += joiner + "api-key=" + self.api_key
    representation, cached = Representation.get(self._db, url,
                                                do_get=self.do_get,
                                                max_age=max_age, debug=True,
                                                pause_before=0.1)
    status = representation.status_code
    if status == 200:
        # Everything's fine.
        content = json.loads(representation.content)
        return content

    diagnostic = "Response from %s was: %r" % (url, representation.content)

    if status == 403:
        raise IntegrationException(
            "API authentication failed",
            "API key is most likely wrong. %s" % diagnostic)
    else:
        raise IntegrationException(
            "Unknown API error (status %s)" % status, diagnostic)
def lookup(self, identifier):
    """Requests NoveList metadata for a particular identifier

    :return: Metadata object or None
    """
    client_identifier = identifier.urn
    if identifier.type != Identifier.ISBN:
        return self.lookup_equivalent_isbns(identifier)

    params = dict(
        ClientIdentifier=client_identifier, ISBN=identifier.identifier,
        version=self.version, profile=self.profile, password=self.password
    )
    scrubbed_url = unicode(self.scrubbed_url(params))

    representation = self.cached_representation(scrubbed_url)
    if not representation:
        self.log.info("No cached NoveList request available.")
        url = self.build_query_url(params)
        self.log.debug("NoveList lookup: %s", url)

        representation, from_cache = Representation.post(
            self._db, unicode(url), '',
            max_age=self.MAX_REPRESENTATION_AGE,
            response_reviewer=self.review_response
        )

        # Remove credential information from the Representation URL. This
        # avoids holding those details in an unexpected part of the database
        # and lets multiple libraries use the same cached representation.
        representation.url = scrubbed_url

    return self.lookup_info_to_metadata(representation)
def lookup(self, identifier):
    """Requests NoveList metadata for a particular identifier

    :return: Metadata object or None
    """
    client_identifier = identifier.urn
    if identifier.type != Identifier.ISBN:
        return self.lookup_equivalent_isbns(identifier)

    params = dict(ClientIdentifier=client_identifier,
                  ISBN=identifier.identifier, version=self.version,
                  profile=self.profile, password=self.password)
    scrubbed_url = unicode(self.scrubbed_url(params))

    representation = self.cached_representation(scrubbed_url)
    if not representation:
        self.log.info("No cached NoveList request available.")
        url = self.build_query_url(params)
        self.log.debug("NoveList lookup: %s", url)

        representation, from_cache = Representation.post(
            self._db, unicode(url), '',
            max_age=self.MAX_REPRESENTATION_AGE,
            response_reviewer=self.review_response)

        # Remove credential information from the Representation URL. This
        # avoids holding those details in an unexpected part of the database
        # and lets multiple libraries use the same cached representation.
        representation.url = scrubbed_url

    return self.lookup_info_to_metadata(representation)
def lookup_by_viaf(self, viaf, working_sort_name=None,
                   working_display_name=None, do_get=None):
    url = self.LOOKUP_URL % dict(viaf=viaf)
    r, cached = Representation.get(
        self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
    )
    xml = r.content
    return self.parser.parse(xml, working_sort_name, working_display_name)
def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                   known_titles=None):
    """Asks VIAF for a list of author clusters matching the passed-in
    author name, and selects the cluster we deem the best match for
    the author we mean.

    :param sort_name: Author name in Last, First format.
    :param display_name: Author name in First Last format.
    :param do_get: Ask Representation to use HTTP GET?
    :param known_titles: A list of titles we know this author wrote.
    :return: (selected_candidate, match_confidences, contributor_titles)
        for the selected ContributorData.
    """
    author_name = sort_name or display_name

    # From OCLC tech support:
    # VIAF's SRU endpoint can only return a maximum of 10 records
    # when the recordSchema is http://viaf.org/VIAFCluster.
    maximum_records = 10  # VIAF maximum that's not ignored
    page = 1
    contributor_candidates = []

    # Limit ourselves to reading the first 500 VIAF clusters, on the
    # assumption that search match quality is unlikely to be usable
    # after that.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)
        scope = 'local.personalNames'
        if is_corporate_name(author_name):
            scope = 'local.corporateNames'
        url = self.SEARCH_URL.format(
            scope=scope,
            author_name=author_name.encode("utf8"),
            maximum_records=maximum_records,
            start_record=start_record
        )
        representation, cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE
        )
        xml = representation.content

        candidates = self.parser.parse_multiple(
            xml, sort_name, display_name, page)
        if not any(candidates):
            # Delete the representation so it's not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id
            ).delete()
            # We ran out of clusters, so we can relax and move on to
            # ordering the returned results.
            break

        contributor_candidates.extend(candidates)
        page += 1

    best_match = self.select_best_match(
        candidates=contributor_candidates,
        working_sort_name=author_name,
        known_titles=known_titles
    )
    return best_match
def get_jsonld(self, url):
    representation, cached = Representation.get(self._db, url)
    try:
        data = jsonld.load_document(url)
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False

    if cached and not representation.content:
        representation, cached = Representation.get(
            self._db, url, max_age=0)

    if not representation.content:
        return None, False

    doc = {
        'contextUrl': None,
        'documentUrl': url,
        'document': representation.content.decode('utf8')
    }
    return doc, cached
def lookup_name_title(self, viaf, do_get=None):
    url = self.LOOKUP_URL % dict(viaf=viaf)
    r, cached = Representation.get(
        self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
    )
    xml = r.content
    cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
    titles = []
    for potential_title in self.parser.name_titles_for_cluster(cluster):
        titles.append(potential_title)
    return titles
def get_jsonld(self, url):
    representation, cached = Representation.get(self._db, url)
    try:
        data = jsonld.load_document(url)
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False

    if cached and not representation.content:
        representation, cached = Representation.get(self._db, url, max_age=0)

    if not representation.content:
        return None, False

    doc = {
        'contextUrl': None,
        'documentUrl': url,
        'document': representation.content.decode('utf8')
    }
    return doc, cached
def lookup_by_viaf(self, viaf, working_sort_name=None,
                   working_display_name=None, do_get=None):
    url = self.LOOKUP_URL % dict(viaf=viaf)
    r, cached = Representation.get(self._db, url, do_get=do_get,
                                   max_age=self.REPRESENTATION_MAX_AGE)
    xml = r.content
    return self.parser.parse(xml, working_sort_name, working_display_name)
def lookup_name_title(self, viaf, do_get=None):
    url = self.LOOKUP_URL % dict(viaf=viaf)
    r, cached = Representation.get(self._db, url, do_get=do_get,
                                   max_age=self.REPRESENTATION_MAX_AGE)
    xml = r.content
    cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
    titles = []
    for potential_title in self.parser.name_titles_for_cluster(cluster):
        titles.append(potential_title)
    return titles
def __init__(self, content_dict, **kwargs):
    super(AudiobookManifest, self).__init__(**kwargs)
    self.raw = content_dict

    # Metadata values that map directly onto the core spec.
    self.import_metadata('title')
    self.import_metadata('publisher')
    self.import_metadata('description')
    self.import_metadata('isbn', 'identifier')
    self.import_metadata('authors', 'author')
    self.import_metadata('narrators', 'narrator')
    self.import_metadata('minutes', 'duration', lambda x: x * 60)

    # Metadata values that have no equivalent in the core spec,
    # but are potentially useful.
    self.import_metadata('size', 'schema:contentSize')
    self.import_metadata('titleid', 'rbdigital:id', str)
    self.import_metadata('hasDrm', 'rbdigital:hasDrm')
    self.import_metadata('encryptionKey', 'rbdigital:encryptionKey')

    # Spine items.
    for file_data in self.raw.get('files', []):
        self.import_spine(file_data)

    # Links.
    download_url = self.raw.get('downloadUrl')
    if download_url:
        self.add_link(
            download_url, 'alternate',
            type=Representation.guess_media_type(download_url)
        )

    cover = self.best_cover(self.raw.get('images', []))
    if cover:
        self.add_link(
            cover, "cover", type=Representation.guess_media_type(cover)
        )
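# Purely illustrative: a hypothetical `content_dict` showing the fields the
# constructor above reads. The key names come from the code; every value is
# made up, and the call assumes AudiobookManifest is importable from the
# surrounding codebase.
example_content = {
    "title": "An Example Audiobook",
    "isbn": "9780000000000",
    "authors": "Jane Doe",
    "narrators": "John Roe",
    "minutes": 90,                      # imported as 'duration', in seconds
    "titleid": 12345,                   # imported as 'rbdigital:id'
    "downloadUrl": "https://example.com/audiobook.zip",
    "files": [
        {"id": 1, "display": "Part 1", "minutes": 45, "size": 1024,
         "filename": "part1.mp3",
         "downloadUrl": "https://example.com/part1.mp3"},
    ],
    "images": [],
}
manifest = AudiobookManifest(example_content)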
def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
    if not path.startswith(self.BASE_URL):
        if not path.startswith("/"):
            path = "/" + path
        url = self.BASE_URL + path
    else:
        url = path
    joiner = '?'
    if '?' in url:
        joiner = '&'
    url += joiner + "api-key=" + self.api_key
    representation, cached = Representation.get(
        self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
        pause_before=0.1)
    content = json.loads(representation.content)
    return content
def lookup(self, identifier, **kwargs):
    """Requests NoveList metadata for a particular identifier

    :param kwargs: Keyword arguments passed into Representation.post().

    :return: Metadata object or None
    """
    client_identifier = identifier.urn
    if identifier.type != Identifier.ISBN:
        return self.lookup_equivalent_isbns(identifier)

    params = dict(
        ClientIdentifier=client_identifier,
        ISBN=identifier.identifier,
        version=self.version,
        profile=self.profile,
        password=self.password,
    )
    scrubbed_url = str(self.scrubbed_url(params))

    url = self.build_query_url(params)
    self.log.debug("NoveList lookup: %s", url)

    # We want to make an HTTP request for `url` but cache the
    # result under `scrubbed_url`. Define a 'URL normalization'
    # function that always returns `scrubbed_url`.
    def normalized_url(original):
        return scrubbed_url

    representation, from_cache = Representation.post(
        _db=self._db,
        url=str(url),
        data="",
        max_age=self.MAX_REPRESENTATION_AGE,
        response_reviewer=self.review_response,
        url_normalizer=normalized_url,
        **kwargs
    )

    # Commit to the database immediately to reduce the chance
    # that some other incoming request will try to create a
    # duplicate Representation and crash.
    self._db.commit()

    return self.lookup_info_to_metadata(representation)
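# For reference, a minimal standard-library sketch of the URL-scrubbing idea
# used above: strip credential query parameters so the cleaned URL can serve
# as a cache key. The helper and its `sensitive` parameter are hypothetical;
# the real scrubbing lives in `scrubbed_url()` above.
try:
    from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse  # Python 3
except ImportError:
    from urlparse import urlparse, parse_qsl, urlunparse                 # Python 2
    from urllib import urlencode


def scrub_credentials(url, sensitive=('profile', 'password')):
    """Return `url` with sensitive query parameters removed."""
    parts = urlparse(url)
    query = [(k, v) for k, v in parse_qsl(parts.query) if k not in sensitive]
    return urlunparse(parts._replace(query=urlencode(query)))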
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier."""
    url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
    representation, cached = Representation.get(
        self._db, url, Representation.http_get_no_redirect)
    if not representation.location:
        raise IOError(
            "Expected %s to redirect, but couldn't find location." % url)
    location = representation.location
    match = self.URI_WITH_OCLC_NUMBER.match(location)
    if not match:
        raise IOError(
            "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s" % location)
    oclc_number = match.groups()[0]
    return Identifier.for_foreign_id(self._db, Identifier.OCLC_NUMBER,
                                     oclc_number)[0]
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier."""
    url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
    representation, cached = Representation.get(
        self._db, url, Representation.http_get_no_redirect)
    if not representation.location:
        raise IOError(
            "Expected %s to redirect, but couldn't find location." % url
        )
    location = representation.location
    match = self.URI_WITH_OCLC_NUMBER.match(location)
    if not match:
        raise IOError(
            "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s" % location)
    oclc_number = match.groups()[0]
    return Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
def mirror_hyperlink(self, hyperlink):
    resource = hyperlink.resource
    if not resource.representation:
        resource.representation, cached = Representation.get(
            self._db, resource.url, max_age=self.ONE_YEAR)
    representation = resource.representation
    if not representation.media_type or not representation.media_type.startswith('image/'):
        representation.fetch_exception = (
            'Representation is not an image as expected.')
        return representation
    extension = self.image_extensions_for_types.get(
        representation.media_type, '')
    filename = "cover" + extension
    representation.mirror_url = self.uploader.cover_image_url(
        hyperlink.data_source, hyperlink.identifier, filename)
    self._db.commit()
    return resource.representation
def mirror_hyperlink(self, hyperlink):
    resource = hyperlink.resource
    if not resource.representation:
        resource.representation, cached = Representation.get(
            self._db, resource.url, max_age=self.ONE_YEAR)
    representation = resource.representation
    if not representation.media_type or not representation.media_type.startswith(
            'image/'):
        representation.fetch_exception = (
            'Representation is not an image as expected.')
        return representation
    extension = self.image_extensions_for_types.get(
        representation.media_type, '')
    filename = "cover" + extension
    representation.mirror_url = self.uploader.cover_image_url(
        hyperlink.data_source, hyperlink.identifier, filename)
    self._db.commit()
    return resource.representation
def lookup_by_identifier(self, identifier, processed_uris=set()):
    """Turn an Identifier into a JSON-LD document."""
    if identifier.type == Identifier.OCLC_WORK:
        foreign_type = 'work'
        url = self.WORK_BASE_URL
    elif identifier.type == Identifier.OCLC_NUMBER:
        foreign_type = "oclc"
        url = self.BASE_URL

    url = url % dict(id=identifier.identifier, type=foreign_type)
    if url in processed_uris:
        self.log.debug("SKIPPING %s, already processed.", url)
        return None, True
    processed_uris.add(url)
    representation, cached = Representation.get(self._db, url)
    try:
        data = jsonld.load_document(url)
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False
def open(self):
    if len(sys.argv) > 1:
        return open(sys.argv[1])
    url = Configuration.integration_url(
        Configuration.STAFF_PICKS_INTEGRATION, True
    )
    if not (url.startswith('https://') or url.startswith('http://')):
        # The configured value isn't a full URL; treat it as a
        # parameter for the default URL template.
        url = self.DEFAULT_URL_TEMPLATE % url
    self.log.info("Retrieving %s", url)
    representation, cached = Representation.get(
        self._db, url, do_get=Representation.browser_http_get,
        accept="text/csv", max_age=timedelta(days=1))
    if representation.status_code != 200:
        raise ValueError("Unexpected status code %s" %
                         representation.status_code)
    if not representation.media_type.startswith("text/csv"):
        raise ValueError("Unexpected media type %s" %
                         representation.media_type)
    return StringIO(representation.content)
def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                   best_match=False):
    sort_name = sort_name or display_name

    # from OCLC tech support:
    # VIAF's SRU endpoint can only return a maximum number of 10 records
    # when the recordSchema is http://viaf.org/VIAFCluster
    maximum_records = 10  # viaf maximum that's not ignored
    page = 1
    contributor_candidates = []

    # limit ourselves to reading the first 500 viaf clusters, on the
    # assumption that search match quality is unlikely to be usable after that.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)
        scope = 'local.personalNames'
        if is_corporate_name(sort_name):
            scope = 'local.corporateNames'
        url = self.SEARCH_URL.format(
            scope=scope,
            sort_name=sort_name.encode("utf8"),
            maximum_records=maximum_records,
            start_record=start_record
        )
        representation, cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE
        )
        xml = representation.content

        candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
        if not any(candidates):
            # Delete the representation so it's not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id
            ).delete()
            # We ran out of clusters, so we can relax and move on to
            # ordering the returned results
            break

        contributor_candidates.extend(candidates)
        page += 1

    return self.select_best_match(contributor_candidates, sort_name)
def import_spine(self, file_data):
    """Import an RBdigital spine item as a Web Publication Manifest
    spine item.
    """
    href = file_data.get('downloadUrl')
    duration = file_data.get('minutes') * 60
    title = file_data.get('display')
    id = file_data.get('id')
    size = file_data.get('size')
    filename = file_data.get('filename')
    type = Representation.guess_media_type(filename)

    extra = {}
    for k, v, transform in (
            ('id', 'rbdigital:id', str),
            ('size', 'schema:contentSize', lambda x: x),
            ('minutes', 'duration', lambda x: x * 60),
    ):
        if k in file_data:
            extra[v] = transform(file_data[k])
    self.add_spine(href, type, title, **extra)
def test_mirror(self, name, uploader_class, bucket_type, bucket_name,
                open_access, settings=None):
    # Arrange
    book_title = "1234567890"
    book_content = "1234567890"
    identifier = Identifier(type=Identifier.ISBN, identifier=book_title)
    representation = Representation(
        content=book_content, media_type=Representation.EPUB_MEDIA_TYPE)
    buckets = {
        bucket_type: bucket_name,
    }

    if settings:
        settings.update(buckets)
    else:
        settings = buckets

    s3_uploader = self._create_s3_uploader(
        uploader_class=uploader_class, **settings)

    self.minio_s3_client.create_bucket(Bucket=bucket_name)

    # Act
    book_url = s3_uploader.book_url(identifier, open_access=open_access)
    s3_uploader.mirror_one(representation, book_url)

    # Assert
    response = self.minio_s3_client.list_objects(Bucket=bucket_name)
    assert "Contents" in response
    assert len(response["Contents"]) == 1

    [object] = response["Contents"]

    assert object["Key"] == "ISBN/{0}.epub".format(book_title)
def lookup(self, identifier):
    """Requests NoveList metadata for a particular identifier

    :return: Metadata object or None
    """
    client_identifier = identifier.urn
    if identifier.type != Identifier.ISBN:
        return self.lookup_equivalent_isbns(identifier)

    params = dict(
        ClientIdentifier=client_identifier,
        ISBN=identifier.identifier,
        version=self.version,
        profile=self.profile,
        password=self.password
    )
    url = self._build_query(params)

    self.log.debug("NoveList lookup: %s", url)
    representation, from_cache = Representation.cacheable_post(
        self._db, unicode(url), params,
        max_age=self.MAX_REPRESENTATION_AGE,
        response_reviewer=self.review_response
    )

    return self.lookup_info_to_metadata(representation)
def lookup_by(self, **kwargs):
    """Perform an OCLC Classify lookup."""
    query_string = self.query_string(**kwargs)
    url = self.BASE_URL + query_string
    representation, cached = Representation.get(self._db, url)
    return representation.content
def improve_description(self, id, metadata):
    """Improve the description associated with a book, if possible.

    This involves fetching an alternate OPDS entry that might contain
    more detailed descriptions than those available in the main feed.
    """
    alternate_links = []
    existing_descriptions = []
    everything_except_descriptions = []
    for x in metadata.links:
        if (x.rel == Hyperlink.ALTERNATE and x.href
                and x.media_type == OPDSFeed.ENTRY_TYPE):
            alternate_links.append(x)
        if x.rel == Hyperlink.DESCRIPTION:
            existing_descriptions.append((x.media_type, x.content))
        else:
            everything_except_descriptions.append(x)

    better_descriptions = []
    for alternate_link in alternate_links:
        # There should only be one alternate link, but we'll keep
        # processing them until we get a good description.

        # Fetch the alternate entry.
        representation, is_new = Representation.get(
            self._db, alternate_link.href, max_age=self.THIRTY_DAYS,
            do_get=self.http_get
        )

        if representation.status_code != 200:
            continue

        # Parse the alternate entry with feedparser and run it through
        # data_detail_for_feedparser_entry().
        parsed = feedparser.parse(representation.content)
        if len(parsed['entries']) != 1:
            # This is supposed to be a single entry, and it's not.
            continue
        [entry] = parsed['entries']

        data_source = self.data_source
        detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
            entry, data_source
        )
        if failure:
            # There was a problem parsing the entry.
            self.log.error(failure.exception)
            continue

        # TODO: Ideally we could verify that detail_id == id, but
        # right now they are always different -- one is an HTTPS
        # URI and one is an HTTP URI. So we omit this step and
        # assume the documents at both ends of the 'alternate'
        # link identify the same resource.

        # Find any descriptions present in the alternate view which
        # are not present in the original.
        new_descriptions = [
            x for x in new_detail['links']
            if x.rel == Hyperlink.DESCRIPTION
            and (x.media_type, x.content) not in existing_descriptions
        ]
        if new_descriptions:
            # Replace old descriptions with new descriptions.
            metadata.links = (
                everything_except_descriptions + new_descriptions
            )
            break

    return metadata
def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                   known_titles=None):
    """Asks VIAF for a list of author clusters matching the passed-in
    author name, and selects the cluster we deem the best match for
    the author we mean.

    :param sort_name: Author name in Last, First format.
    :param display_name: Author name in First Last format.
    :param do_get: Ask Representation to use HTTP GET?
    :param known_titles: A list of titles we know this author wrote.
    :return: (selected_candidate, match_confidences, contributor_titles)
        for the selected ContributorData.
    """
    author_name = sort_name or display_name

    # From OCLC tech support:
    # VIAF's SRU endpoint can only return a maximum of 10 records
    # when the recordSchema is http://viaf.org/VIAFCluster.
    maximum_records = 10  # VIAF maximum that's not ignored
    page = 1
    contributor_candidates = []

    # Limit ourselves to reading the first 500 VIAF clusters, on the
    # assumption that search match quality is unlikely to be usable
    # after that.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)
        scope = 'local.personalNames'
        if is_corporate_name(author_name):
            scope = 'local.corporateNames'
        url = self.SEARCH_URL.format(
            scope=scope, author_name=author_name.encode("utf8"),
            maximum_records=maximum_records, start_record=start_record)
        representation, cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE)
        xml = representation.content

        candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
        if not any(candidates):
            # Delete the representation so it's not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id).delete()
            # We ran out of clusters, so we can relax and move on to
            # ordering the returned results.
            break

        contributor_candidates.extend(candidates)
        page += 1

    best_match = self.select_best_match(candidates=contributor_candidates,
                                        working_sort_name=author_name,
                                        known_titles=known_titles)
    return best_match
def improve_description(self, id, metadata):
    """Improve the description associated with a book, if possible.

    This involves fetching an alternate OPDS entry that might contain
    more detailed descriptions than those available in the main feed.
    """
    alternate_links = []
    existing_descriptions = []
    everything_except_descriptions = []
    for x in metadata.links:
        if (x.rel == Hyperlink.ALTERNATE and x.href
                and x.media_type == OPDSFeed.ENTRY_TYPE):
            alternate_links.append(x)
        if x.rel == Hyperlink.DESCRIPTION:
            existing_descriptions.append((x.media_type, x.content))
        else:
            everything_except_descriptions.append(x)

    better_descriptions = []
    for alternate_link in alternate_links:
        # There should only be one alternate link, but we'll keep
        # processing them until we get a good description.

        # Fetch the alternate entry.
        representation, is_new = Representation.get(
            self._db, alternate_link.href, max_age=self.THIRTY_DAYS,
            do_get=self.http_get)

        if representation.status_code != 200:
            continue

        # Parse the alternate entry with feedparser and run it through
        # data_detail_for_feedparser_entry().
        parsed = feedparser.parse(representation.content)
        if len(parsed['entries']) != 1:
            # This is supposed to be a single entry, and it's not.
            continue
        [entry] = parsed['entries']

        data_source = self.data_source
        detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
            entry, data_source)
        if failure:
            # There was a problem parsing the entry.
            self.log.error(failure.exception)
            continue

        # TODO: Ideally we could verify that detail_id == id, but
        # right now they are always different -- one is an HTTPS
        # URI and one is an HTTP URI. So we omit this step and
        # assume the documents at both ends of the 'alternate'
        # link identify the same resource.

        # Find any descriptions present in the alternate view which
        # are not present in the original.
        new_descriptions = [
            x for x in new_detail['links']
            if x.rel == Hyperlink.DESCRIPTION and (
                x.media_type, x.content) not in existing_descriptions
        ]
        if new_descriptions:
            # Replace old descriptions with new descriptions.
            metadata.links = (everything_except_descriptions +
                              new_descriptions)
            break

    return metadata