Beispiel #1
0
    def download_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts the thumbnail file in its place on the file system.

        The destination path is the document's thumbnail S3 key joined onto
        the directory configured at ``config['s3']['sync']['source']``.
        The download is attempted up to three times when the request times
        out; any other request or I/O error gives up immediately.

        Returns its path, or None if no thumbnail could be fetched (HTTP
        error, timeout on every try, I/O error, or a response that is not an
        image).

        Raises IndexerError for any other failure while preparing the
        destination or fetching the thumbnail.
        """

        # TODO: need better exception handling here
        thumbnail_s3_key = prl_solr_document.get_thumbnail_s3_key()
        try:
            filepath = os.path.join(
                os.path.abspath(
                    os.path.expanduser(self.config['s3']['sync']['source'])),
                thumbnail_s3_key)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)

            original_thumbnail_url = prl_solr_document.original_thumbnail_metadata(
            )['url']
            n_tries = 3
            for try_i in range(1, n_tries + 1):
                try:
                    # The context manager releases the streamed connection on
                    # every exit path (return, retry, or error).
                    with requests.get(original_thumbnail_url,
                                      timeout=30,
                                      stream=True) as response:
                        # Fail on 4xx or 5xx
                        response.raise_for_status()
                        # Make sure the Content-Type is what we expect. Some
                        # servers discriminate against robots. A missing
                        # header is treated the same as a non-image one
                        # instead of crashing the regex match on None.
                        content_type = response.headers.get(
                            'Content-Type') or ''
                        if not re.match(r'image/.+', content_type):
                            logging.debug('Robots cannot access %s',
                                          original_thumbnail_url)
                            return None
                        try:
                            with open(filepath, 'wb') as image_file:
                                for chunk in response.iter_content(
                                        chunk_size=1024):
                                    image_file.write(chunk)
                        except Exception:
                            # Do not leave a truncated image behind in the
                            # sync source directory; the handlers below
                            # decide whether to retry or give up.
                            try:
                                os.remove(filepath)
                            except OSError:
                                # Nothing was written, or it is already gone.
                                pass
                            raise
                        logging.debug(
                            '%s thumbnail put on local filesystem at %s',
                            thumbnail_s3_key, filepath)
                        return filepath
                except requests.Timeout as e:
                    if try_i < n_tries:
                        msg = 'Thumbnail download timed out, retrying...'
                        logging.info(msg)
                        # Continue loop
                    else:
                        # No more tries left, so fail
                        msg = 'Failed to download thumbnail after {} tries: {}'.format(
                            n_tries, str(e))
                        logging.debug(msg)
                        return None
                except (requests.RequestException, IOError) as e:
                    msg = 'Failed to download thumbnail: {}'.format(e)
                    logging.debug(msg)
                    return None
        except Exception as e:
            # Keep the original exception attached as the cause for debugging.
            raise IndexerError(
                'Failed to put thumbnail on local filesystem: {}'.format(
                    e)) from e
Beispiel #2
0
    def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                         filepath: str):
        """Puts the thumbnail on S3.

        Uploads the local file at ``filepath`` to the bucket configured at
        ``config['s3']['sync']['destination']['s3_uri']``, using the
        document's thumbnail S3 key as the object key and the content type
        recorded in the document's original thumbnail metadata.

        Raises IndexerError if botocore reports a BotoCoreError.
        """

        try:
            # Open the file in a context manager so the handle is closed
            # whether or not the upload succeeds.
            with open(filepath, 'rb') as thumbnail_file:
                self.s3.put_object(
                    Bucket=self.config['s3']['sync']['destination']['s3_uri'],
                    Key=prl_solr_document.get_thumbnail_s3_key(),
                    Body=thumbnail_file,
                    ContentType=prl_solr_document.original_thumbnail_metadata()
                    ['content-type'])
            logging.debug('%s thumbnail put on S3',
                          prl_solr_document.get_record_identifier())
        except BotoCoreError as e:
            # Format the exception itself: BotoCoreError passes its message
            # to Exception.__init__ and does not set a ``msg`` attribute.
            # NOTE(review): service-side S3 failures are raised as
            # botocore.exceptions.ClientError, which is not a BotoCoreError
            # subclass - confirm whether those should be wrapped here too.
            raise IndexerError(
                'Failed to put thumbnail on S3: {}'.format(e)) from e
Beispiel #3
0
    def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                         filepath: str):
        """Puts the thumbnail on S3.

        First asks the document to add its thumbnail URL, then uploads the
        local file at ``filepath`` to the bucket named by the
        ``AWS_S3_BUCKET_NAME`` environment variable, using the document's
        thumbnail S3 key as the object key and the content type recorded in
        the document's original thumbnail metadata.

        Raises IndexerError if botocore reports a BotoCoreError.
        """

        # Determine a URL for the thumbnail now that we've downloaded it and know the image format
        prl_solr_document.add_thumbnail_url()

        try:
            # Open the file in a context manager so the handle is closed
            # whether or not the upload succeeds.
            with open(filepath, 'rb') as thumbnail_file:
                self.s3.put_object(
                    Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                    Key=prl_solr_document.get_thumbnail_s3_key(),
                    Body=thumbnail_file,
                    ContentType=prl_solr_document.original_thumbnail_metadata()
                    ['content-type'])
            logging.debug('%s thumbnail put on S3',
                          prl_solr_document.get_record_identifier())
        except BotoCoreError as e:
            # Format the exception itself: BotoCoreError passes its message
            # to Exception.__init__ and does not set a ``msg`` attribute.
            # NOTE(review): service-side S3 failures are raised as
            # botocore.exceptions.ClientError, which is not a BotoCoreError
            # subclass - confirm whether those should be wrapped here too.
            raise IndexerError(
                'Failed to put thumbnail on S3: {}'.format(e)) from e
Beispiel #4
0
    def save_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Downloads the document's thumbnail locally, then uploads it to S3.

        The upload is skipped when the download yields no path.

        Returns True if a thumbnail was downloaded and uploaded, otherwise
        False.
        """

        local_path = self.download_thumbnail(prl_solr_document)

        # Nothing was fetched, so there is nothing to upload.
        if not local_path:
            return False

        self.upload_thumbnail(prl_solr_document, local_path)
        record_id = prl_solr_document.get_record_identifier()
        logging.debug('%s thumbnail saved', record_id)
        return True
Beispiel #5
0
    def get_solr_document(self, file_object: TextIOWrapper) -> PRLSolrDocument:
        """Creates the PRL Solr document for a record file.

        The record's identifier and its institution/collection keys and names
        come from ``get_key_record_metadata`` applied to the file's name. The
        S3 domain name is the placeholder 'example.com' on a dry run, and the
        value of the ``AWS_S3_BUCKET_DOMAIN_NAME`` environment variable
        otherwise.
        """
        record_metadata = self.get_key_record_metadata(file_object.name)
        (record_id, inst_key, inst_name, coll_key,
         coll_name) = record_metadata

        # Dry runs use a placeholder domain instead of reading the environment.
        domain = ('example.com' if self.args['dry_run'] else
                  os.environ.get('AWS_S3_BUCKET_DOMAIN_NAME'))

        document = PRLSolrDocument(file_object, record_id, inst_key,
                                   inst_name, coll_key, coll_name, domain)
        return document
Beispiel #6
0
    def get_solr_document(self, file_object: TextIOWrapper) -> PRLSolrDocument:
        """Creates the PRL Solr document for a record file.

        The record's identifier and its institution/collection keys and names
        come from ``get_key_record_metadata`` applied to the file's name. The
        S3 domain name is the placeholder 'example.com' on a dry run, and the
        configured sync destination's domain name otherwise. The Dublin Core
        Solr mapping and the external-link and thumbnail field patterns are
        passed through from the configuration.
        """
        record_metadata = self.get_key_record_metadata(file_object.name)
        (record_id, inst_key, inst_name, coll_key,
         coll_name) = record_metadata

        # Dry runs use a placeholder domain instead of the configured one.
        domain = ('example.com' if self.args['dry_run'] else
                  self.config['s3']['sync']['destination']['domain_name'])

        dublin_core = self.config['metadata']['dublin_core']
        solr_mapping = dublin_core['solr_mapping']
        link_patterns = dublin_core['external_link_field_patterns']
        thumbnail_patterns = dublin_core['thumbnail_field_patterns']

        return PRLSolrDocument(file_object, record_id, inst_key, inst_name,
                               coll_key, coll_name, solr_mapping,
                               link_patterns, thumbnail_patterns, domain)