Beispiel #1
0
 def createSession(self):
     """Return a session from get_session configured with decrypted keys.

     The IA_ACCESS_KEY and IA_SECRET_KEY environment variables are passed
     through decryptEnvVar and supplied as the 's3' access/secret pair of
     the session config.
     """
     s3Credentials = {
         'access': decryptEnvVar('IA_ACCESS_KEY'),
         'secret': decryptEnvVar('IA_SECRET_KEY'),
     }
     return get_session(config={'s3': s3Credentials})
Beispiel #2
0
 def test_env_decryptor_success(self, mock_boto):
     """Expect a bytes 'Plaintext' from the mocked decrypt call to come
     back from decryptEnvVar as the equivalent str.
     """
     decryptMock = mock_boto.client().decrypt
     decryptMock.return_value = {'Plaintext': 'testing'.encode('utf-8')}
     self.assertEqual(decryptEnvVar('testing'), 'testing')
Beispiel #3
0
 def test_env_decryptor_boto_error(self, mock_boto):
     """Expect the base64 encoding of 'testing' back from decryptEnvVar
     when the mocked decrypt call raises ClientError.
     """
     mock_boto.client().decrypt.side_effect = ClientError
     expected = b64encode('testing'.encode('utf-8')).decode('utf-8')
     result = decryptEnvVar('testing')
     self.assertEqual(result, expected)
Beispiel #4
0
 def test_env_decryptor_non_encoded(self, mock_boto):
     """Expect a str 'Plaintext' from the mocked decrypt call to come back
     from decryptEnvVar unchanged.
     """
     decryptResponse = {'Plaintext': 'testing'}
     mock_boto.client().decrypt.return_value = decryptResponse
     self.assertEqual(decryptEnvVar('testing'), 'testing')
class GBCoverFetcher(AbsCoverFetcher):
    """Google Books API cover fetcher.

    Looks up a volume through the volumes search endpoint and then reads a
    cover image link from that volume's metadata record.
    """
    # API key, resolved once when the class body is executed
    GOOGLE_API_KEY = decryptEnvVar('GOOGLE_BOOKS_KEY')
    GOOGLE_BOOKS_SEARCH = 'https://www.googleapis.com/books/v1/volumes?q={}:{}&key={}'  # noqa: E501
    GOOGLE_BOOKS_VOLUME = 'https://www.googleapis.com/books/v1/volumes/{}?key={}'  # noqa: E501
    # Keys looked up in a volume's imageLinks mapping, in order of preference
    IMAGE_SIZE_ORDER = ['small', 'thumbnail', 'smallThumbnail']

    def __init__(self):
        pass

    def getSource(self):
        """Returns name: googleBooks"""
        return 'googleBooks'

    def queryIdentifier(self, idType, identifier):
        """Makes a request (carrying the API key) against the Google Books
        API for a volume identified by the supplied identifier. A volume
        identifier is only returned when the search reports exactly one
        matching volume.

        Arguments:
            idType {string} -- Type of the identifier to be queried
            identifier {string} --  Value of the identifier to be queried

        Returns:
            string -- Google Books Volume Identifier, or None if the request
            did not return 200, the result was not a single-volume listing,
            or the listing carried no usable item
        """
        searchResp = requests.get(
            self.GOOGLE_BOOKS_SEARCH.format(idType, identifier,
                                            self.GOOGLE_API_KEY))
        if searchResp.status_code != 200:
            return None

        respBody = searchResp.json()
        # Both conditions are tested independently; .get() keeps a body that
        # lacks the expected keys from raising KeyError.
        if (respBody.get('kind') == 'books#volumes'
                and respBody.get('totalItems') == 1):
            items = respBody.get('items')
            if items:
                return items[0].get('id')

        return None

    def createCoverURL(self, volumeID):
        """Retrieves the Google Books metadata object for volumeID and parses
        it for a cover URL, taking the first size found in the order set by
        the IMAGE_SIZE_ORDER class variable.

        Arguments:
            volumeID {string} -- Google Books Volume Identifier

        Returns:
            string -- Cover Image URI, or None if the request did not return
            200 or the record has no usable image link
        """
        volumeResp = requests.get(
            self.GOOGLE_BOOKS_VOLUME.format(volumeID, self.GOOGLE_API_KEY))

        if volumeResp.status_code != 200:
            return None

        volBody = volumeResp.json()
        try:
            return self.getImage(volBody['volumeInfo']['imageLinks'], 0)
        except KeyError:
            # Record has no volumeInfo/imageLinks section: no cover available
            return None

    @classmethod
    def getImage(cls, imageLinks, pos):
        """Fetches the image link for the first size in IMAGE_SIZE_ORDER, at
        or after position pos, that is present in imageLinks.

        Arguments:
            imageLinks {dict} -- Image links from the metadata object, keyed
            by size name
            pos {integer} -- Non-negative position in IMAGE_SIZE_ORDER to
            start looking from

        Returns:
            string -- URI to cover image, or None if no usable size is found
        """
        for sizeName in cls.IMAGE_SIZE_ORDER[pos:]:
            try:
                return imageLinks[sizeName]
            except KeyError:
                continue

        return None

    def getMimeType(self):
        """Returns the mime type reported for covers: image/jpeg"""
        return 'image/jpeg'
Beispiel #6
0
class CCCoverFetcher(AbsCoverFetcher):
    """Fetcher for the ContentCafe cover API. This API requires that all
    requests be authenticated and the required credentials are stored as KMS
    encrypted variables. The API only accepts isbn values.
    """
    # Credentials are resolved once when the class body is executed
    CONTENT_CAFE_USER = decryptEnvVar('CONTENT_CAFE_USER')
    CONTENT_CAFE_PSWD = decryptEnvVar('CONTENT_CAFE_PSWD')
    CONTENT_CAFE_URL = 'http://contentcafe2.btol.com/ContentCafe/Jacket.aspx?userID={}&password={}&type=L&Value={}'  # noqa: E501

    def __init__(self):
        """Constructor method, creates the stockImage bytes."""
        self.stockImage = CCCoverFetcher.loadStockImage()

    def getSource(self):
        """Returns name: contentCafe"""
        return 'contentCafe'

    @staticmethod
    def loadStockImage():
        """This sets the stock image. If a cover is not found by the API a
        generic blank cover is returned. To filter out these covers we must
        compare the bytes of each file. This generates the bytes for the
        comparison file.

        Returns:
            [bytes] -- Contents of ./assets/stand-in-prefix.png
        """
        # Context manager guarantees the file handle is closed after reading
        with open('./assets/stand-in-prefix.png', 'rb') as stockFile:
            return stockFile.read()

    def queryIdentifier(self, idType, identifier):
        """Queries the API for a cover URI

        Arguments:
            idType {string} -- Type of identifier, only accepts isbn
            identifier {string} --  Value of the identifier

        Returns:
            [string] -- URI for the cover from the ContentCafe API, or None
            if idType is not isbn, the request did not return 200, or the
            returned image starts with the stock image bytes
        """
        if idType != 'isbn':
            return None

        coverURL = self.CONTENT_CAFE_URL.format(self.CONTENT_CAFE_USER,
                                                self.CONTENT_CAFE_PSWD,
                                                identifier)

        searchResp = requests.get(coverURL)

        if searchResp.status_code == 200:
            imageContent = searchResp.content
            # A response beginning with the stock bytes is the generic
            # stand-in cover rather than a real one
            if imageContent.startswith(self.stockImage):
                return None

            return coverURL

        return None

    def createCoverURL(self, volumeID):
        """ContentCafe implementation of the createCoverURL method. This
        method does nothing because the queryIdentifier returns a valid URL.

        Arguments:
            volumeID {string} -- ContentCafe URI

        Returns:
            [string] -- Unchanged ContentCafe URI
        """
        return volumeID

    def getMimeType(self):
        """Returns the mime type reported for covers: image/jpeg"""
        return 'image/jpeg'
Beispiel #7
0
class CoverParse:
    """Validates a cover record (source, identifier, url), fetches the image
    at its URL and stores a resized copy through s3Client under a key derived
    from the source, identifier and URL.
    """
    # Credentials are resolved once when the class body is executed
    HATHI_CLIENT_KEY = decryptEnvVar('HATHI_CLIENT_KEY')
    HATHI_CLIENT_SECRET = decryptEnvVar('HATHI_CLIENT_SECRET')
    # Patterns whose first capture group becomes part of the storage key
    URL_ID_REGEX = r'\/([^\/]+\.[a-zA-Z]{3,4}$)'
    HATHI_URL_ID_REGEX = r'([a-z0-9]+\.[$0-9a-z]+)\/[0-9]{1,2}\?format=jpeg&v=2$'  # noqa: E501
    GOOGLE_URL_ID_REGEX = r'\/[^\/]+\?id=([0-9a-zA-Z]+)\S+imgtk=[a-zA-Z_\-0-9]+&source=gbs_api$'  # noqa: E501

    def __init__(self, record):
        """Reads source, identifier and url from the record.

        Arguments:
            record {dict} -- Mapping read with .get() for the keys 'source',
            'identifier' and 'url'

        Raises:
            InvalidParameter -- If the identifier or url is missing, or the
            url fails validation
        """
        # logger must be set first: the property setters below log on error
        self.logger = LOGGER
        self.source = record.get('source', 'unk')
        self.sourceID = record.get('identifier', None)
        self.originalURL = record.get('url', None)
        self.remoteURL = record.get('url', None)
        self.s3CoverURL = None
        self.logger.debug('Source: {}|ID: {}|URL: {}'.format(
            self.source, self.sourceID, self.remoteURL))

    @property
    def remoteURL(self):
        return self._remoteURL

    @remoteURL.setter
    def remoteURL(self, url):
        """Validates and stores the URL, prefixing https:// when the value
        does not start with http.

        Raises:
            InvalidParameter -- If url is empty or lacks a scheme, host or
            path after parsing
        """
        if not url:
            self.logger.error(
                'URL not provided from {}({}) to cover ingester'.format(
                    self.sourceID, self.source))
            raise InvalidParameter('URL must be supplied to CoverParse()')
        if url[:4] != 'http':
            url = 'https://{}'.format(url)
        parsedURL = urlparse(url)
        if not parsedURL.scheme or not parsedURL.netloc or not parsedURL.path:
            self.logger.error('Invalid URL provided, unable to access cover')
            raise InvalidParameter('Unable to validate URL {}'.format(url))

        self._remoteURL = url

    @property
    def sourceID(self):
        return self._sourceID

    @sourceID.setter
    def sourceID(self, identifier):
        """Stores the identifier.

        Raises:
            InvalidParameter -- If identifier is empty
        """
        if not identifier:
            self.logger.error('Must supply unique identifier with remoteURL')
            raise InvalidParameter('Source identifier required. None provided')

        self._sourceID = identifier

    def storeCover(self):
        """Fetches the image at remoteURL and sets s3CoverURL. When
        s3Client.checkForFile() returns None the image is resized and stored
        as a new file; otherwise the value it returned is used as-is.

        Raises:
            URLFetchError -- If the request times out while reading (504) or
            responds with a status other than 200
            InvalidParameter -- If no storage key can be derived from the URL
        """
        authObj = None
        # Only hathitrust URLs are requested with OAuth credentials
        if 'hathitrust' in self.remoteURL:
            authObj = CoverParse.createAuth()
        try:
            imgResp = requests.get(self.remoteURL, auth=authObj, timeout=5)
        except ReadTimeout:
            raise URLFetchError('URL request timed out', 504, self.remoteURL)
        if imgResp.status_code != 200:
            raise URLFetchError('Unable to read image at url',
                                imgResp.status_code, self.remoteURL)

        coverKey = self.createKey()
        mimeType = self.getMimeType(coverKey)
        s3 = s3Client(coverKey)
        existingFile = s3.checkForFile()
        if existingFile is None:
            resizer = CoverResizer(imgResp.content)
            resizer.getNewDimensions()
            resizer.resizeCover()
            standardCoverBytes = resizer.getCoverInBytes()
            self.s3CoverURL = s3.storeNewFile(standardCoverBytes, mimeType)
        else:
            self.s3CoverURL = existingFile

    def createKey(self):
        """Builds the storage key '<source>/<sourceID>_<urlID>' with the
        source and urlID lowercased. The urlID is taken from the URL for
        hathitrust, google and generic URLs, and from sourceID for
        contentcafe2 and archive.org URLs.

        Returns:
            string -- Key for the cover file

        Raises:
            InvalidParameter -- If the URL does not match the pattern used
            for its source
        """
        if 'hathitrust' in self.remoteURL:
            urlID = '{}.jpg'.format(
                self._extractURLID(self.HATHI_URL_ID_REGEX))
        elif 'google' in self.remoteURL:
            urlID = '{}.jpg'.format(
                self._extractURLID(self.GOOGLE_URL_ID_REGEX))
        elif ('contentcafe2' in self.remoteURL
                or 'archive.org' in self.remoteURL):
            # These URLs carry no usable file name, so the identifier is used
            urlID = '{}.jpg'.format(self.sourceID)
        else:
            urlID = self._extractURLID(self.URL_ID_REGEX)
        return '{}/{}_{}'.format(self.source.lower(), self.sourceID,
                                 urlID.lower())

    def _extractURLID(self, pattern):
        """Returns the first capture group of pattern found in remoteURL.

        Raises:
            InvalidParameter -- If pattern does not match remoteURL
        """
        urlMatch = re.search(pattern, self.remoteURL)
        if urlMatch is None:
            # re.search returns None on no match; fail with a clear error
            # instead of an AttributeError on .group()
            self.logger.error('Unable to extract identifier from cover URL')
            raise InvalidParameter(
                'Unable to parse identifier from URL {}'.format(
                    self.remoteURL))
        return urlMatch.group(1)

    def getMimeType(self, key):
        """Returns the mime type guessed from the key's file extension, or
        None if it cannot be guessed.
        """
        return guess_type(key)[0]

    @classmethod
    def createAuth(cls):
        """Returns an OAuth1 object built from the HathiTrust client key and
        secret, using the 'query' signature type.
        """
        return OAuth1(cls.HATHI_CLIENT_KEY,
                      client_secret=cls.HATHI_CLIENT_SECRET,
                      signature_type='query')
Beispiel #8
0
class HathiCover():
    """Manager class for finding a cover image for HathiTrust images. This is
    done by parsing a METS object obtained through the Hathi API, extracting
    the first 25 pages and scoring them based on relevancy as a cover. The
    URI to the most relevant page image is ultimately returned.
    """
    # Base URL and credentials are resolved once when the class body executes
    HATHI_BASE_API = os.environ.get('HATHI_BASE_API', None)
    HATHI_CLIENT_KEY = decryptEnvVar('HATHI_CLIENT_KEY')
    HATHI_CLIENT_SECRET = decryptEnvVar('HATHI_CLIENT_SECRET')

    def __init__(self, htid):
        """Stores the HathiTrust identifier to find a cover for.

        Arguments:
            htid {string} -- HathiTrust identifier used in API URLs
        """
        self.htid = htid
        self.logger = logger

    def generateOAuth(self):
        """Helper method that generates an OAuth1 block that authenticates
        requests against the HathiTrust Data API. Due to the structure of the
        API this is formatted as part of the query string.

        Returns:
            [object] -- An OAuth1 authentication block
        """
        return OAuth1(self.HATHI_CLIENT_KEY,
                      client_secret=self.HATHI_CLIENT_SECRET,
                      signature_type='query')

    def getResponse(self, queryURL):
        """Issues an OAuth-signed GET request with a 3 second timeout.

        Arguments:
            queryURL {string} -- URL to request

        Returns:
            [object] -- The response returned by requests.get

        Raises:
            URLFetchError -- With status 504 if the request times out while
            reading
        """
        queryAuth = self.generateOAuth()
        try:
            return requests.get(queryURL, auth=queryAuth, timeout=3)
        except ReadTimeout:
            # The URL is passed as its own argument, not in the message
            raise URLFetchError('URL request timed out', 504, queryURL)

    def getPageFromMETS(self):
        """Query method for the best page URI from the record's METS file

        Returns:
            [uri] -- URI to the page to be used as a cover image, or None if
            the request timed out, did not return 200, or listed no pages
        """
        self.logger.debug('Querying {} for cover image'.format(self.htid))
        structURL = '{}/structure/{}?format=json&v=2'.format(
            self.HATHI_BASE_API, self.htid)
        try:
            structResp = self.getResponse(structURL)
            if structResp.status_code == 200:
                return self.parseMETS(structResp.json())
        except URLFetchError:
            self.logger.warning('Request for structure file timed out')

        return None

    def parseMETS(self, metsJson):
        """Parser that handles the METS file, parsing the first 25 pages into
        HathiPage objects that contain a score and position. Once parsed it
        sets the "imagePage" as the page that contains the most plausibly
        relevant cover.

        Arguments:
            metsJson {object} -- METS object extracted from the JSON response

        Returns:
            [uri] -- URI to the page to be used as a cover image, or None if
            the METS object lists no pages
        """
        structMap = metsJson['METS:structMap']
        self.logger.info('Retrieved METS for {}'.format(self.htid))
        self.pages = [
            HathiPage(page) for page in structMap['METS:div']['METS:div'][:25]
        ]

        if not self.pages:
            # Nothing to score; report "no cover" rather than raising
            # IndexError on the lookup below
            self.logger.warning(
                'No pages found in METS for {}'.format(self.htid))
            return None

        # Highest score first; the top entry becomes the cover candidate
        self.pages.sort(key=lambda x: x.score, reverse=True)
        self.imagePage = self.pages[0]
        return self.getPageURL()

    def getPageURL(self):
        """Extracts a resolvable URI from the page selected as a cover image.
        This URI can be used to create a local copy of the cover.

        Returns:
            [uri] -- The created URI of the cover page
        """
        return '{}/volume/pageimage/{}/{}?format=jpeg&v=2'.format(
            self.HATHI_BASE_API, self.htid, self.imagePage.page)