def test_get_s3_key_names_from_bucket(self):
    "simple tests for coverage"
    bucket = FakeBucket()
    bucket.items += self.fake_s3_keys + self.fake_s3_prefixes
    # (file_extensions filter, expected number of key names) cases
    for extensions, expected in [
            (None, 3),
            (['.xml'], 1),
            (['.xml', '.pdf'], 2)]:
        if extensions is None:
            names = s3lib.get_s3_key_names_from_bucket(bucket)
        else:
            names = s3lib.get_s3_key_names_from_bucket(
                bucket, file_extensions=extensions)
        self.assertEqual(len(names), expected)
    # prefix key_type returns the folder-like entries only
    self.assertEqual(
        len(s3lib.get_s3_key_names_from_bucket(bucket, key_type='prefix')),
        1)
def rename_article_s3_objects(self, bucket_folder_name, version):
    """
    Main function to rename article objects on S3
    and apply the renamed file names to the article XML file
    """
    # Open the S3 connection and look up the expanded-articles bucket
    connection = S3Connection(self.settings.aws_access_key_id,
                              self.settings.aws_secret_access_key,
                              host=self.settings.s3_hostname)
    bucket = connection.lookup(self.expanded_bucket_name)
    # List the object key names under the article folder
    key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=bucket_folder_name + "/")
    # Map each old file name to its new name
    file_name_map = self.build_file_name_map(key_names, version)
    # Record the map for later reference
    if self.logger:
        self.logger.info('file_name_map: %s' % json.dumps(
            file_name_map, sort_keys=True, indent=4))
    # Apply the renames to the objects on S3
    self.rename_s3_objects(bucket, self.expanded_bucket_name,
                           bucket_folder_name, file_name_map)
    # Rewrite the article XML with the new file names and upload it
    xml_filename = self.find_xml_filename_in_map(file_name_map)
    self.download_file_from_bucket(bucket, bucket_folder_name, xml_filename)
    self.rewrite_xml_file(xml_filename, file_name_map)
    self.upload_file_to_bucket(bucket, bucket_folder_name, xml_filename)
def zip_revision_number(self, fid):
    """
    Look at previously supplied files and determine the
    next revision number.

    Returns None when no existing PMC zip is found for fid;
    1 when a zip exists but carries no revision number;
    otherwise the latest revision number plus 1.
    """
    revision = None
    bucket_name = self.publish_bucket
    prefix = self.published_zip_folder + '/'
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=prefix)
    s3_key_name = s3lib.latest_pmc_zip_revision(fid, s3_key_names)
    if s3_key_name:
        # Found an existing PMC zip file, look for a revision number
        # fix: the previous ur'' literal prefix is a syntax error in
        # Python 3; a plain raw string matches identically
        revision_match = re.match(r'.*r(.*)\.zip$', s3_key_name)
        if revision_match is None:
            # There is a zip but no revision number, use 1
            revision = 1
        else:
            # Use the latest revision plus 1
            revision = int(revision_match.group(1)) + 1
    return revision
def download_pmc_zip_from_s3(self, doi_id, workflow):
    """
    Simple download of PMC zip file from the live bucket.

    Returns True when a zip for doi_id was found and downloaded
    to the activity input directory, False otherwise.
    """
    bucket_name = self.pmc_zip_bucket
    prefix = self.pmc_zip_folder
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=prefix)
    s3_key_name = s3lib.latest_pmc_zip_revision(doi_id, s3_key_names)
    if s3_key_name:
        # Download
        s3_key = bucket.get_key(s3_key_name)
        filename = s3_key_name.split("/")[-1]
        filename_plus_path = (self.get_tmp_dir() + os.sep +
                              self.INPUT_DIR + os.sep + filename)
        # fix: use a context manager so the file handle is closed
        # even when the S3 download raises
        with open(filename_plus_path, "wb") as open_file:
            s3_key.get_contents_to_file(open_file)
        return True
    else:
        return False
def zip_revision_number(self, fid):
    """
    Look at previously supplied files and determine the
    next revision number.

    Returns None if no PMC zip exists for fid, 1 if one exists
    without a revision suffix, else latest revision + 1.
    """
    revision = None
    bucket_name = self.publish_bucket
    prefix = self.published_zip_folder + '/'
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=prefix)
    s3_key_name = s3lib.latest_pmc_zip_revision(fid, s3_key_names)
    if s3_key_name:
        # Found an existing PMC zip file, look for a revision number
        # fix: ur'' string prefix is invalid syntax in Python 3;
        # r'' behaves the same for this pattern
        revision_match = re.match(r'.*r(.*)\.zip$', s3_key_name)
        if revision_match is None:
            # There is a zip but no revision number, use 1
            revision = 1
        else:
            # Use the latest revision plus 1
            revision = int(revision_match.group(1)) + 1
    return revision
def get_outbox_s3_key_names(self, force=None):
    """
    Separately get a list of S3 key names from the outbox
    for reporting purposes, excluding the outbox folder itself
    """
    # Return cached values if available
    if self.outbox_s3_key_names and not force:
        return self.outbox_s3_key_names
    bucket_name = self.publish_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=self.outbox_folder)
    # Remove the outbox_folder from the list, if present
    # fix: catch only ValueError (what list.remove raises when the
    # value is absent) instead of a bare except that hides real errors
    try:
        s3_key_names.remove(self.outbox_folder)
    except ValueError:
        pass
    self.outbox_s3_key_names = s3_key_names
    return self.outbox_s3_key_names
def download_files_from_s3_outbox(self):
    """
    Connect to the S3 bucket, and from the outbox folder,
    download the .xml files to be bundled.

    NOTE(review): the original docstring also mentioned .pdf, but
    only ".xml" is in file_extensions, so only XML is downloaded.
    """
    file_extensions = [".xml"]
    bucket_name = self.publish_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=self.outbox_folder,
        file_extensions=file_extensions)
    for name in s3_key_names:
        # Download objects from S3 and save to disk
        s3_key = bucket.get_key(name)
        filename = name.split("/")[-1]
        # Save .xml files to the staging directory
        if re.search(".*\\.xml$", name):
            dirname = self.elife_poa_lib.settings.STAGING_TO_HW_DIR
            filename_plus_path = dirname + os.sep + filename
            # fix: context manager closes the handle even when the
            # S3 download raises
            with open(filename_plus_path, "wb") as open_file:
                s3_key.get_contents_to_file(open_file)
def get_outbox_s3_key_names(self, force=None):
    """
    Separately get a list of S3 key names from the outbox
    for reporting purposes, excluding the outbox folder itself
    """
    # Return cached values if available
    if self.outbox_s3_key_names and not force:
        return self.outbox_s3_key_names
    bucket_name = self.publish_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=self.outbox_folder)
    # Remove the outbox_folder from the list, if present
    # fix: narrow the bare except to ValueError, which is what
    # list.remove raises when the folder name is not in the list
    try:
        s3_key_names.remove(self.outbox_folder)
    except ValueError:
        pass
    self.outbox_s3_key_names = s3_key_names
    return self.outbox_s3_key_names
def download_files_from_s3_outbox(self):
    """
    Connect to the S3 bucket, and from the outbox folder,
    download the .xml files to be bundled.

    NOTE(review): only ".xml" is appended to file_extensions, so
    .pdf files (mentioned in the original docstring) are never
    downloaded here.
    """
    file_extensions = [".xml"]
    bucket_name = self.publish_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=self.outbox_folder,
        file_extensions=file_extensions)
    for name in s3_key_names:
        # Download objects from S3 and save to disk
        s3_key = bucket.get_key(name)
        filename = name.split("/")[-1]
        # Save .xml files to the staging directory
        if re.search(".*\\.xml$", name):
            dirname = self.elife_poa_lib.settings.STAGING_TO_HW_DIR
            filename_plus_path = dirname + os.sep + filename
            # fix: with-statement guarantees the file handle is
            # closed even if get_contents_to_file raises
            with open(filename_plus_path, "wb") as open_file:
                s3_key.get_contents_to_file(open_file)
def download_pmc_zip_from_s3(self, doi_id, workflow):
    """
    Simple download of PMC zip file from the live bucket.

    Returns True when a zip for doi_id exists and was saved to
    the activity input directory, False otherwise.
    """
    bucket_name = self.pmc_zip_bucket
    prefix = self.pmc_zip_folder
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=prefix)
    s3_key_name = s3lib.latest_pmc_zip_revision(doi_id, s3_key_names)
    if s3_key_name:
        # Download
        s3_key = bucket.get_key(s3_key_name)
        filename = s3_key_name.split("/")[-1]
        filename_plus_path = (self.get_tmp_dir() + os.sep +
                              self.INPUT_DIR + os.sep + filename)
        # fix: open/close via context manager so the handle is not
        # leaked if the download raises
        with open(filename_plus_path, "wb") as open_file:
            s3_key.get_contents_to_file(open_file)
        return True
    else:
        return False
def rename_article_s3_objects(self, bucket_folder_name, version):
    """
    Main function to rename article objects on S3
    and apply the renamed file names to the article XML file
    """
    # Connect to the expanded-articles bucket
    conn = S3Connection(self.settings.aws_access_key_id,
                        self.settings.aws_secret_access_key,
                        host=self.settings.s3_hostname)
    bucket = conn.lookup(self.expanded_bucket_name)
    # Collect the object key names under the article's folder
    article_prefix = bucket_folder_name + "/"
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=article_prefix)
    # Build the old-name -> new-name map
    file_name_map = self.build_file_name_map(s3_key_names, version)
    # Log the map so the rename can be audited later
    if self.logger:
        self.logger.info('file_name_map: %s' % json.dumps(
            file_name_map, sort_keys=True, indent=4))
    # Perform the S3-side renames
    self.rename_s3_objects(bucket, self.expanded_bucket_name,
                           bucket_folder_name, file_name_map)
    # Download, rewrite and re-upload the article XML
    xml_filename = self.find_xml_filename_in_map(file_name_map)
    self.download_file_from_bucket(
        bucket, bucket_folder_name, xml_filename)
    self.rewrite_xml_file(xml_filename, file_name_map)
    self.upload_file_to_bucket(
        bucket, bucket_folder_name, xml_filename)
def archive_zip_file_name(self, article, status='vor'):
    """
    Get the file name of the most recent archive zip
    from the archive bucket.

    Delegates the choice of "most recent" to
    latest_archive_zip_revision.
    """
    # fix: removed the dead local zip_file_name, which was assigned
    # None and never used
    bucket_name = self.archive_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    # All key names in the bucket; filtering by article and status
    # happens inside latest_archive_zip_revision
    s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket)
    return self.latest_archive_zip_revision(
        article.doi_id, s3_key_names, self.journal, status)
def get_s3_key_names_from_bucket(self, bucket_name, prefix, file_extensions):
    """
    Use live s3 bucket connection to get the s3 key names
    from the bucket. This is so functions that rely on the data
    can use test data when running automated tests
    """
    # fix: removed the dead "s3_key_names = None" initialization;
    # the variable is unconditionally reassigned below
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, key_type="key", prefix=prefix,
        file_extensions=file_extensions)
    return s3_key_names
def get_folder_names_from_bucket(self, bucket_name, prefix):
    """
    Use live s3 bucket connection to get the folder names
    from the bucket. This is so functions that rely on the data
    can use test data when running automated tests
    """
    # fix: removed the dead "folder_names = None" initialization;
    # the variable is unconditionally reassigned below
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    # Step one, get all the subfolder names
    folder_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, key_type="prefix", prefix=prefix)
    return folder_names
def get_s3_key_names_from_bucket(self, bucket_name, prefix, file_extensions):
    """
    Use live s3 bucket connection to get the s3 key names
    from the bucket. This is so functions that rely on the data
    can use test data when running automated tests
    """
    # fix: dropped the redundant None pre-assignment of
    # s3_key_names; it is always overwritten by the call below
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, key_type="key", prefix=prefix,
        file_extensions=file_extensions)
    return s3_key_names
def get_folder_names_from_bucket(self, bucket_name, prefix):
    """
    Use live s3 bucket connection to get the folder names
    from the bucket. This is so functions that rely on the data
    can use test data when running automated tests
    """
    # fix: dropped the redundant None pre-assignment of
    # folder_names; it is always overwritten by the call below
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    # Step one, get all the subfolder names
    folder_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, key_type="prefix", prefix=prefix)
    return folder_names
def download_files_from_s3_outbox(self):
    """
    Connect to the S3 bucket, and from the outbox folder,
    download the .xml to be processed.

    Returns the list of local file paths downloaded.
    """
    filenames = []
    file_extensions = [".xml"]
    bucket_name = self.publish_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=self.outbox_folder,
        file_extensions=file_extensions)
    for name in s3_key_names:
        # Download objects from S3 and save to disk
        s3_key = bucket.get_key(name)
        filename = name.split("/")[-1]
        # Download to the activity temp directory
        dirname = self.get_tmp_dir()
        filename_plus_path = dirname + os.sep + filename
        # fix: context manager closes the file even if the S3
        # download raises
        with open(filename_plus_path, "wb") as open_file:
            s3_key.get_contents_to_file(open_file)
        filenames.append(filename_plus_path)
    return filenames
def does_source_zip_exist_from_s3(self, doi_id):
    """
    Check whether a PMC source zip for doi_id exists in the bucket
    """
    bucket_name = self.pmc_zip_bucket
    prefix = self.pmc_zip_folder
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=prefix)
    # True when a latest zip revision key name is found
    return bool(s3lib.latest_pmc_zip_revision(doi_id, key_names))
def download_files_from_s3(self):
    """
    Connect to the S3 bucket, and from the outbox folder,
    download the .xml and .pdf files to be bundled.
    Also downloads .zip files and caches the outbox key names
    on self.outbox_s3_key_names.
    """
    file_extensions = [".xml", ".pdf", ".zip"]
    bucket_name = self.input_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=self.outbox_folder,
        file_extensions=file_extensions)
    self.outbox_s3_key_names = s3_key_names
    for name in s3_key_names:
        # Download objects from S3 and save to disk
        s3_key = bucket.get_key(name)
        filename = name.split("/")[-1]
        filename_plus_path = self.INPUT_DIR + os.sep + filename
        if self.logger:
            self.logger.info('PublishFinalPOA downloading: %s' %
                             filename_plus_path)
        # fix: use a context manager so the file handle is closed
        # even when get_contents_to_file raises
        with open(filename_plus_path, "wb") as open_file:
            s3_key.get_contents_to_file(open_file)
def does_source_zip_exist_from_s3(self, doi_id):
    """
    Return True when a PMC source zip for doi_id is present
    in the pmc zip bucket, False otherwise
    """
    bucket_name = self.pmc_zip_bucket
    prefix = self.pmc_zip_folder
    # Connect to S3 and bucket
    connection = S3Connection(self.settings.aws_access_key_id,
                              self.settings.aws_secret_access_key)
    bucket = connection.lookup(bucket_name)
    names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=prefix)
    # Truthy key name means the zip exists
    return bool(s3lib.latest_pmc_zip_revision(doi_id, names))
def next_revision_number(self, doi_id, status='poa'):
    """
    From the bucket, get a list of zip files
    and determine the next revision number to use.

    Returns 1 when no revisioned zip matches doi_id/status,
    otherwise the highest revision found plus 1.
    """
    # fix: removed the unused file_extensions local, which was
    # built but never passed to the bucket listing
    next_revision_number = 1
    bucket_name = self.publish_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket)
    # Loop-invariant prefix, e.g. "elife-00013-poa-r"
    name_prefix = ('elife-' + str(doi_id).zfill(5) + '-' +
                   str(status) + '-r')
    max_revision_number = 0
    for key_name in s3_key_names:
        if key_name.startswith(name_prefix):
            # Attempt to get a revision number from the matching files
            try:
                part = key_name.replace(name_prefix, '')
                revision = int(part.split('.')[0])
            except (IndexError, ValueError):
                revision = None
            if revision and revision > max_revision_number:
                max_revision_number = revision
    if max_revision_number > 0:
        next_revision_number = max_revision_number + 1
    return next_revision_number
def check_published_folder_exists(self):
    """
    Return True/False whether the published folder exists in the
    input bucket, or None when no folder name is configured
    """
    # Nothing to check without a published folder name
    if not self.published_folder_name:
        return None
    bucket_name = self.input_bucket
    # Connect to S3 and bucket
    connection = S3Connection(self.settings.aws_access_key_id,
                              self.settings.aws_secret_access_key)
    bucket = connection.lookup(bucket_name)
    # Strip the trailing slash from the folder name if present
    folder_prefix = self.published_folder_name.rstrip('/')
    key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, key_type='prefix', prefix=folder_prefix)
    # True when at least one matching prefix was listed
    return len(key_names) > 0
def download_files_from_s3(self):
    """
    Connect to the S3 bucket, and from the outbox folder,
    download the .xml and .pdf files to be bundled.
    Also downloads .zip files; the outbox key names are cached
    on self.outbox_s3_key_names.
    """
    file_extensions = [".xml", ".pdf", ".zip"]
    bucket_name = self.input_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, prefix=self.outbox_folder,
        file_extensions=file_extensions)
    self.outbox_s3_key_names = s3_key_names
    for name in s3_key_names:
        # Download objects from S3 and save to disk
        s3_key = bucket.get_key(name)
        filename = name.split("/")[-1]
        filename_plus_path = self.INPUT_DIR + os.sep + filename
        if self.logger:
            self.logger.info('PublishFinalPOA downloading: %s' %
                             filename_plus_path)
        # fix: with-statement guarantees the file handle closes
        # even if the download raises mid-transfer
        with open(filename_plus_path, "wb") as open_file:
            s3_key.get_contents_to_file(open_file)
def next_revision_number(self, doi_id, status='poa'):
    """
    From the bucket, get a list of zip files
    and determine the next revision number to use.

    Returns 1 when no matching revisioned zip exists, otherwise
    the maximum revision found plus 1.
    """
    # fix: dropped the unused file_extensions list (built with
    # ".zip" but never used in the bucket listing call)
    next_revision_number = 1
    bucket_name = self.publish_bucket
    # Connect to S3 and bucket
    s3_conn = S3Connection(self.settings.aws_access_key_id,
                           self.settings.aws_secret_access_key)
    bucket = s3_conn.lookup(bucket_name)
    s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket)
    # Hoisted out of the loop: the prefix depends only on
    # doi_id and status, e.g. "elife-00013-poa-r"
    name_prefix = ('elife-' + str(doi_id).zfill(5) + '-' +
                   str(status) + '-r')
    max_revision_number = 0
    for key_name in s3_key_names:
        if key_name.startswith(name_prefix):
            # Attempt to get a revision number from the matching files
            try:
                part = key_name.replace(name_prefix, '')
                revision = int(part.split('.')[0])
            except (IndexError, ValueError):
                revision = None
            if revision and revision > max_revision_number:
                max_revision_number = revision
    if max_revision_number > 0:
        next_revision_number = max_revision_number + 1
    return next_revision_number