def download_thumbnail(self, prl_solr_document: PRLSolrDocument):
    """Puts the thumbnail file in its place on the file system.

    Returns its path, or None if no thumbnail could be fetched."""
    # TODO: need better exception handling here
    thumbnail_s3_key = prl_solr_document.get_thumbnail_s3_key()
    try:
        filepath = os.path.join(
            os.path.abspath(
                os.path.expanduser(self.config['s3']['sync']['source'])),
            thumbnail_s3_key)
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        original_thumbnail_url = prl_solr_document.original_thumbnail_metadata()['url']

        n_tries = 3
        for try_i in range(1, n_tries + 1):
            try:
                response = requests.get(original_thumbnail_url,
                                        timeout=30,
                                        stream=True)
                # Fail on 4xx or 5xx
                response.raise_for_status()
                # Make sure the Content-Type is what we expect.
                # Some servers discriminate against robots.
                # Default to '' so a missing header reads as "not an image"
                # instead of raising a TypeError.
                if re.match('image/.+',
                            response.headers.get('Content-Type', '')):
                    with open(filepath, 'wb') as image_file:
                        for chunk in response.iter_content(chunk_size=1024):
                            image_file.write(chunk)
                    logging.debug(
                        '%s thumbnail put on local filesystem at %s',
                        thumbnail_s3_key, filepath)
                    return filepath
                else:
                    logging.debug('Robots cannot access %s',
                                  original_thumbnail_url)
                    return None
            except requests.Timeout as e:
                if try_i < n_tries:
                    logging.info('Thumbnail download timed out, retrying...')
                    # Continue loop
                else:
                    # No more tries left, so fail
                    logging.debug(
                        'Failed to download thumbnail after %s tries: %s',
                        n_tries, e)
                    return None
            except (requests.RequestException, IOError) as e:
                logging.debug('Failed to download thumbnail: %s', e)
                return None
    except Exception as e:
        raise IndexerError(
            'Failed to put thumbnail on local filesystem: {}'.format(e))
def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                     filepath: str):
    """Puts the thumbnail on S3."""
    try:
        # Use a context manager so the file handle is closed after the upload.
        with open(filepath, 'rb') as thumbnail_file:
            self.s3.put_object(
                Bucket=self.config['s3']['sync']['destination']['s3_uri'],
                Key=prl_solr_document.get_thumbnail_s3_key(),
                Body=thumbnail_file,
                ContentType=prl_solr_document.original_thumbnail_metadata()['content-type'])
        logging.debug('%s thumbnail put on S3',
                      prl_solr_document.get_record_identifier())
    except BotoCoreError as e:
        # BotoCoreError has no .msg attribute; format the exception itself.
        raise IndexerError('Failed to put thumbnail on S3: {}'.format(e))
def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                     filepath: str):
    """Puts the thumbnail on S3."""
    # Determine a URL for the thumbnail now that we've downloaded it and know
    # the image format.
    prl_solr_document.add_thumbnail_url()
    try:
        # Use a context manager so the file handle is closed after the upload.
        with open(filepath, 'rb') as thumbnail_file:
            self.s3.put_object(
                Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                Key=prl_solr_document.get_thumbnail_s3_key(),
                Body=thumbnail_file,
                ContentType=prl_solr_document.original_thumbnail_metadata()['content-type'])
        logging.debug('%s thumbnail put on S3',
                      prl_solr_document.get_record_identifier())
    except BotoCoreError as e:
        # BotoCoreError has no .msg attribute; format the exception itself.
        raise IndexerError('Failed to put thumbnail on S3: {}'.format(e))
def save_thumbnail(self, prl_solr_document: PRLSolrDocument):
    """Puts the thumbnail on the local filesystem and on S3.

    Returns True if a thumbnail was saved, False otherwise."""
    thumbnail_path = self.download_thumbnail(prl_solr_document)
    if thumbnail_path:
        self.upload_thumbnail(prl_solr_document, thumbnail_path)
        logging.debug('%s thumbnail saved',
                      prl_solr_document.get_record_identifier())
        return True
    else:
        return False
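# Hypothetical usage sketch (the `indexer` and `doc` names below are
# assumptions for illustration, not part of this module): save_thumbnail is
# the only entry point callers need; it hides the download-then-upload
# sequence and reports success as a Boolean.
#
#     doc = ...  # a PRLSolrDocument
#     has_thumbnail = indexer.save_thumbnail(doc)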
def get_solr_document(self, file_object: TextIOWrapper) -> PRLSolrDocument:
    """Builds a Solr document for PRL."""
    identifier, institution_key, institution_name, collection_key, collection_name = \
        self.get_key_record_metadata(file_object.name)

    if self.args['dry_run']:
        s3_domain_name = 'example.com'
    else:
        s3_domain_name = os.environ.get('AWS_S3_BUCKET_DOMAIN_NAME')

    return PRLSolrDocument(file_object, identifier, institution_key,
                           institution_name, collection_key, collection_name,
                           s3_domain_name)
def get_solr_document(self, file_object: TextIOWrapper) -> PRLSolrDocument:
    """Builds a Solr document for PRL."""
    identifier, institution_key, institution_name, collection_key, collection_name = \
        self.get_key_record_metadata(file_object.name)

    if self.args['dry_run']:
        s3_domain_name = 'example.com'
    else:
        s3_domain_name = self.config['s3']['sync']['destination']['domain_name']

    return PRLSolrDocument(
        file_object, identifier, institution_key, institution_name,
        collection_key, collection_name,
        self.config['metadata']['dublin_core']['solr_mapping'],
        self.config['metadata']['dublin_core']['external_link_field_patterns'],
        self.config['metadata']['dublin_core']['thumbnail_field_patterns'],
        s3_domain_name)
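# Hypothetical driver sketch, not part of the indexer: `index_record` and the
# `indexer`/`record_file` names are assumptions for illustration only. It
# shows the intended order of operations: build the PRLSolrDocument from the
# harvested record file, then attempt to save its thumbnail; sending the
# document to Solr (not shown) would follow.
def index_record(indexer, record_file: TextIOWrapper) -> PRLSolrDocument:
    prl_solr_document = indexer.get_solr_document(record_file)
    if not indexer.save_thumbnail(prl_solr_document):
        logging.debug('%s has no thumbnail',
                      prl_solr_document.get_record_identifier())
    return prl_solr_document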