def createSession(self):
    """Create a session configured with decrypted S3-style credentials.

    The access and secret values are read from the encrypted
    IA_ACCESS_KEY and IA_SECRET_KEY environment variables and handed to
    get_session under the 's3' configuration key.

    Returns:
        The object returned by get_session for this configuration
    """
    s3Credentials = {
        'access': decryptEnvVar('IA_ACCESS_KEY'),
        'secret': decryptEnvVar('IA_SECRET_KEY'),
    }
    return get_session(config={'s3': s3Credentials})
def test_env_decryptor_success(self, mock_boto):
    """decryptEnvVar returns 'testing' when decrypt yields UTF-8 bytes."""
    decryptResponse = {'Plaintext': 'testing'.encode('utf-8')}
    mock_boto.client().decrypt.return_value = decryptResponse

    self.assertEqual(decryptEnvVar('testing'), 'testing')
def test_env_decryptor_boto_error(self, mock_boto):
    """decryptEnvVar output when the mocked decrypt call raises.

    The expected value is the base64 encoding of the string 'testing'.
    """
    mock_boto.client().decrypt.side_effect = ClientError

    decrypted = decryptEnvVar('testing')

    encodedBytes = b64encode('testing'.encode('utf-8'))
    self.assertEqual(decrypted, encodedBytes.decode('utf-8'))
def test_env_decryptor_non_encoded(self, mock_boto):
    """decryptEnvVar returns 'testing' when Plaintext is already a str."""
    decryptResponse = {'Plaintext': 'testing'}
    mock_boto.client().decrypt.return_value = decryptResponse

    self.assertEqual(decryptEnvVar('testing'), 'testing')
class GBCoverFetcher(AbsCoverFetcher):
    """Google Books API cover fetcher"""
    GOOGLE_API_KEY = decryptEnvVar('GOOGLE_BOOKS_KEY')
    GOOGLE_BOOKS_SEARCH = 'https://www.googleapis.com/books/v1/volumes?q={}:{}&key={}'  # noqa: E501
    GOOGLE_BOOKS_VOLUME = 'https://www.googleapis.com/books/v1/volumes/{}?key={}'  # noqa: E501
    # Image sizes in order of preference; getImage returns the first present.
    IMAGE_SIZE_ORDER = ['small', 'thumbnail', 'smallThumbnail']

    def __init__(self):
        pass

    def getSource(self):
        """Returns name: googleBooks"""
        return 'googleBooks'

    def queryIdentifier(self, idType, identifier):
        """Makes an authenticated request against the Google Books API for
        a volume identified by the supplied identifier. A volume identifier
        is returned only when the search matches exactly one volume.

        Arguments:
            idType {string} -- Type of the identifier to be queried
            identifier {string} -- Value of the identifier to be queried

        Returns:
            string -- Google Books Volume Identifier, or None when the
            request does not return a 200 or does not match exactly one
            volume
        """
        searchResp = requests.get(
            self.GOOGLE_BOOKS_SEARCH.format(
                idType, identifier, self.GOOGLE_API_KEY
            )
        )
        if searchResp.status_code != 200:
            return None

        respBody = searchResp.json()
        # Both conditions are compared explicitly; .get() keeps a response
        # body that lacks either key from raising a KeyError.
        if (
            respBody.get('kind') == 'books#volumes'
            and respBody.get('totalItems') == 1
        ):
            return respBody['items'][0]['id']

        return None

    def createCoverURL(self, volumeID):
        """Retrieves the Google Books metadata object for the supplied
        volume and parses it for a cover URL, taking the first size found
        as ordered in the IMAGE_SIZE_ORDER class variable.

        Arguments:
            volumeID {string} -- Google Books Volume Identifier

        Returns:
            string -- Cover Image URI, or None if the request does not
            return a 200 or the metadata contains no image links
        """
        volumeResp = requests.get(
            self.GOOGLE_BOOKS_VOLUME.format(volumeID, self.GOOGLE_API_KEY)
        )
        if volumeResp.status_code != 200:
            return None

        volBody = volumeResp.json()
        try:
            imageLinks = volBody['volumeInfo']['imageLinks']
        except KeyError:
            # The volume has no image metadata, so there is no cover
            return None

        return self.getImage(imageLinks, 0)

    @classmethod
    def getImage(cls, imageLinks, pos):
        """Fetches the image link for the first size in IMAGE_SIZE_ORDER,
        starting at pos, that is present in imageLinks. Returns None if no
        usable sizes are found.

        Arguments:
            imageLinks {dict} -- Image links found in the metadata object,
            keyed by size name
            pos {integer} -- Position in IMAGE_SIZE_ORDER to start looking
            from

        Returns:
            string -- URI to cover image
        """
        while True:
            try:
                return imageLinks[cls.IMAGE_SIZE_ORDER[pos]]
            except KeyError:
                # This size is not available, try the next preferred size
                pos += 1
            except IndexError:
                # Ran past the end of IMAGE_SIZE_ORDER without a match
                return None

    def getMimeType(self):
        return 'image/jpeg'
class CCCoverFetcher(AbsCoverFetcher):
    """Fetcher for the ContentCafe cover API. This API requires that all
    requests be authenticated and the required credentials are stored as
    KMS encrypted variables. The API only accepts isbn values.
    """
    CONTENT_CAFE_USER = decryptEnvVar('CONTENT_CAFE_USER')
    CONTENT_CAFE_PSWD = decryptEnvVar('CONTENT_CAFE_PSWD')
    CONTENT_CAFE_URL = 'http://contentcafe2.btol.com/ContentCafe/Jacket.aspx?userID={}&password={}&type=L&Value={}'  # noqa: E501

    def __init__(self):
        """Constructor method, creates the stockImage bytes."""
        self.stockImage = CCCoverFetcher.loadStockImage()

    def getSource(self):
        """Returns name: contentCafe"""
        return 'contentCafe'

    @staticmethod
    def loadStockImage():
        """This sets the stock image. If a cover is not found by the API a
        generic blank cover is returned. To filter out these covers we must
        compare the bytes of each file. This generates the bytes for the
        comparison file.

        Returns:
            [bytes] -- Contents of the local stand-in image file
        """
        # The context manager closes the file handle once it has been read
        with open('./assets/stand-in-prefix.png', 'rb') as stockFile:
            return stockFile.read()

    def queryIdentifier(self, idType, identifier):
        """Queries the API for a cover URI

        Arguments:
            idType {string} -- Type of identifier, only accepts isbn
            identifier {string} -- Value of the identifier

        Returns:
            [string] -- URI for the cover from the ContentCafe API, or None
            if the identifier is not an isbn, the request does not return a
            200, or the returned image begins with the stock image bytes
        """
        if idType != 'isbn':
            return None

        coverURL = self.CONTENT_CAFE_URL.format(
            self.CONTENT_CAFE_USER,
            self.CONTENT_CAFE_PSWD,
            identifier
        )
        searchResp = requests.get(coverURL)
        if searchResp.status_code != 200:
            return None

        # A response that starts with the stock image bytes is the generic
        # blank cover, which is treated as "no cover found"
        if searchResp.content.startswith(self.stockImage):
            return None

        return coverURL

    def createCoverURL(self, volumeID):
        """ContentCafe implementation of the createCoverURL method. This
        method does nothing because the queryIdentifier returns a valid URL.

        Arguments:
            volumeID {string} -- ContentCafe URI

        Returns:
            [string] -- Unchanged ContentCafe URI
        """
        return volumeID

    def getMimeType(self):
        return 'image/jpeg'
class CoverParse:
    """Validates a cover record, fetches the image it points to and stores
    a resized copy through s3Client.
    """
    HATHI_CLIENT_KEY = decryptEnvVar('HATHI_CLIENT_KEY')
    HATHI_CLIENT_SECRET = decryptEnvVar('HATHI_CLIENT_SECRET')
    URL_ID_REGEX = r'\/([^\/]+\.[a-zA-Z]{3,4}$)'
    HATHI_URL_ID_REGEX = r'([a-z0-9]+\.[$0-9a-z]+)\/[0-9]{1,2}\?format=jpeg&v=2$'  # noqa: E501
    GOOGLE_URL_ID_REGEX = r'\/[^\/]+\?id=([0-9a-zA-Z]+)\S+imgtk=[a-zA-Z_\-0-9]+&source=gbs_api$'  # noqa: E501

    def __init__(self, record):
        """Reads the source, identifier and url values from the record.

        Arguments:
            record {dict} -- Cover record with 'source', 'identifier' and
            'url' keys

        Raises:
            InvalidParameter -- If the identifier or url is missing, or the
            url cannot be validated
        """
        # The logger and source must be set first because the sourceID and
        # remoteURL setters use them when reporting errors
        self.logger = LOGGER
        self.source = record.get('source', 'unk')
        self.sourceID = record.get('identifier', None)
        self.originalURL = record.get('url', None)
        self.remoteURL = record.get('url', None)
        self.s3CoverURL = None
        self.logger.debug('Source: {}|ID: {}|URL: {}'.format(
            self.source,
            self.sourceID,
            self.remoteURL
        ))

    @property
    def remoteURL(self):
        return self._remoteURL

    @remoteURL.setter
    def remoteURL(self, url):
        """Validates the URL, prefixing https:// when it does not start
        with 'http'.

        Raises:
            InvalidParameter -- If no URL is supplied or it lacks a scheme,
            host or path
        """
        if not url:
            self.logger.error(
                'URL not provided from {}({}) to cover ingester'.format(
                    self.sourceID,
                    self.source
                )
            )
            raise InvalidParameter('URL must be supplied to CoverParse()')

        if url[:4] != 'http':
            url = 'https://{}'.format(url)

        parsedURL = urlparse(url)
        if not parsedURL.scheme or not parsedURL.netloc or not parsedURL.path:
            self.logger.error('Invalid URL provided, unable to access cover')
            raise InvalidParameter('Unable to validate URL {}'.format(url))

        self._remoteURL = url

    @property
    def sourceID(self):
        return self._sourceID

    @sourceID.setter
    def sourceID(self, identifier):
        """Stores the identifier, which must be a non-empty value.

        Raises:
            InvalidParameter -- If no identifier is supplied
        """
        if not identifier:
            self.logger.error('Must supply unique identifier with remoteURL')
            raise InvalidParameter('Source identifier required. None provided')

        self._sourceID = identifier

    def storeCover(self):
        """Fetches the image at remoteURL and sets s3CoverURL to either the
        existing stored file or a newly stored, resized copy.

        Raises:
            URLFetchError -- If the request times out while reading or
            returns a status other than 200
            InvalidParameter -- If no storage key can be derived from the
            URL
        """
        authObj = None
        if 'hathitrust' in self.remoteURL:
            authObj = CoverParse.createAuth()

        # NOTE(review): only read timeouts are translated into a
        # URLFetchError here; other request failures propagate unchanged
        try:
            imgResp = requests.get(self.remoteURL, auth=authObj, timeout=5)
        except ReadTimeout:
            raise URLFetchError('URL request timed out', 504, self.remoteURL)

        if imgResp.status_code != 200:
            raise URLFetchError(
                'Unable to read image at url',
                imgResp.status_code,
                self.remoteURL
            )

        coverKey = self.createKey()
        mimeType = self.getMimeType(coverKey)

        s3 = s3Client(coverKey)
        existingFile = s3.checkForFile()
        if existingFile is not None:
            self.s3CoverURL = existingFile
            return

        # Nothing stored under this key yet, so resize and store the image
        resizer = CoverResizer(imgResp.content)
        resizer.getNewDimensions()
        resizer.resizeCover()
        standardCoverBytes = resizer.getCoverInBytes()
        self.s3CoverURL = s3.storeNewFile(standardCoverBytes, mimeType)

    def createKey(self):
        """Builds the storage key for the cover in the form
        {source}/{sourceID}_{urlID}.

        Returns:
            [string] -- Key for the cover file

        Raises:
            InvalidParameter -- If an identifier cannot be extracted from
            the URL with the pattern for its source
        """
        if 'hathitrust' in self.remoteURL:
            urlID = '{}.jpg'.format(
                self._extractURLID(self.HATHI_URL_ID_REGEX)
            )
        elif 'google' in self.remoteURL:
            urlID = '{}.jpg'.format(
                self._extractURLID(self.GOOGLE_URL_ID_REGEX)
            )
        elif 'contentcafe2' in self.remoteURL:
            urlID = '{}.jpg'.format(self.sourceID)
        elif 'archive.org' in self.remoteURL:
            urlID = '{}.jpg'.format(self.sourceID)
        else:
            urlID = self._extractURLID(self.URL_ID_REGEX)

        return '{}/{}_{}'.format(
            self.source.lower(),
            self.sourceID,
            urlID.lower()
        )

    def _extractURLID(self, pattern):
        """Returns the first group matched by pattern in remoteURL, raising
        InvalidParameter when the URL does not match the pattern.
        """
        urlMatch = re.search(pattern, self.remoteURL)
        if urlMatch is None:
            # re.search returns None on no match; fail with a clear error
            # rather than an AttributeError on None
            self.logger.error('Unable to extract identifier from cover URL')
            raise InvalidParameter(
                'Unable to create cover key from URL {}'.format(
                    self.remoteURL
                )
            )

        return urlMatch.group(1)

    def getMimeType(self, key):
        """Returns the first element of guess_type's result for the key."""
        return guess_type(key)[0]

    @classmethod
    def createAuth(cls):
        """Creates an OAuth1 block from the HathiTrust client key and
        secret, with the signature placed in the query string.
        """
        return OAuth1(
            cls.HATHI_CLIENT_KEY,
            client_secret=cls.HATHI_CLIENT_SECRET,
            signature_type='query'
        )
class HathiCover():
    """Manager class for finding a cover image for HathiTrust images. This is
    done by parsing a METS object obtained through the Hathi API, extracting
    the first 25 pages and scoring them based on relevancy as a cover. The URI
    to the most relevant page image is ultimately returned.
    """
    HATHI_BASE_API = os.environ.get('HATHI_BASE_API', None)
    HATHI_CLIENT_KEY = decryptEnvVar('HATHI_CLIENT_KEY')
    HATHI_CLIENT_SECRET = decryptEnvVar('HATHI_CLIENT_SECRET')

    def __init__(self, htid):
        self.htid = htid
        self.logger = logger

    def generateOAuth(self):
        """Helper method that generates an OAuth1 block that authenticates
        requests against the HathiTrust Data API. Due to the structure of the
        API this is formatted as part of the query string.

        Returns:
            [object] -- An OAuth1 authentication block
        """
        return OAuth1(
            self.HATHI_CLIENT_KEY,
            client_secret=self.HATHI_CLIENT_SECRET,
            signature_type='query'
        )

    def getResponse(self, queryURL):
        """Makes an authenticated GET request with a 3 second timeout.

        Arguments:
            queryURL {string} -- URL to request

        Returns:
            [object] -- Response returned by requests.get

        Raises:
            URLFetchError -- If the request times out while reading
        """
        queryAuth = self.generateOAuth()
        try:
            return requests.get(queryURL, auth=queryAuth, timeout=3)
        except ReadTimeout:
            # The URL is passed separately, the message has no placeholder
            raise URLFetchError('URL request timed out', 504, queryURL)

    def getPageFromMETS(self):
        """Query method for the best page URI from the record's METS file

        Returns:
            [uri] -- URI to the page to be used as a cover image, or None if
            the structure request timed out, did not return a 200, or
            contained no pages
        """
        self.logger.debug('Querying {} for cover image'.format(self.htid))

        structURL = '{}/structure/{}?format=json&v=2'.format(
            self.HATHI_BASE_API,
            self.htid
        )
        try:
            structResp = self.getResponse(structURL)
            if structResp.status_code == 200:
                return self.parseMETS(structResp.json())
        except URLFetchError:
            self.logger.warning('Request for structure file timed out')

        return None

    def parseMETS(self, metsJson):
        """Parser that handles the METS file, parsing the first 25 pages into
        HathiPage objects that contain a score and position. Once parsed it
        sets the "imagePage" as the page that contains the most plausibly
        relevant cover.

        Arguments:
            metsJson {object} -- METS object extracted from the JSON response

        Returns:
            [uri] -- URI to the page to be used as a cover image, or None if
            the METS object contains no pages
        """
        structMap = metsJson['METS:structMap']

        self.logger.info('Retrieved METS for {}'.format(self.htid))

        self.pages = [
            HathiPage(page)
            for page in structMap['METS:div']['METS:div'][:25]
        ]

        if not self.pages:
            # Without pages there is nothing to score, so report that no
            # cover was found instead of failing on pages[0]
            self.logger.warning('No pages found in METS for {}'.format(
                self.htid
            ))
            self.imagePage = None
            return None

        # Highest scoring page first
        self.pages.sort(key=lambda x: x.score, reverse=True)
        self.imagePage = self.pages[0]

        return self.getPageURL()

    def getPageURL(self):
        """Extracts a resolvable URI from the page selected as a cover image.
        This URI can be used to create a local copy of the cover.

        Returns:
            [uri] -- The created URI of the cover page
        """
        return '{}/volume/pageimage/{}/{}?format=jpeg&v=2'.format(
            self.HATHI_BASE_API,
            self.htid,
            self.imagePage.page
        )