def from_manifest(manifest, destination, **ingestion_args):
    '''From a dict that can be jsonified and uploaded to S3.

    For more info on manifests, see
    http://docs.aws.amazon.com/redshift/latest/dg/loading-data-files-using-manifest.html'''
    s3_path = _transient_s3_path(destination) + '.manifest'
    s3_manifest = S3File.from_json_serializable(manifest, s3_path)
    s3_to_redshift(s3_manifest, destination, with_manifest=True,
                   **ingestion_args)
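# A minimal usage sketch for from_manifest, not taken from this codebase:
# the bucket, keys, and `destination_table` names are illustrative
# assumptions. The dict follows the Redshift manifest format described at
# the URL in the docstring above: an "entries" list of objects with "url"
# and "mandatory" fields.
def example_manifest_ingestion(destination_table):
    manifest = {
        'entries': [
            {'url': 's3://example-bucket/part-0000.csv', 'mandatory': True},
            {'url': 's3://example-bucket/part-0001.csv', 'mandatory': False},
        ]
    }
    # Serializes the manifest to JSON at a transient S3 path, then ingests
    # the listed files into the destination with with_manifest=True.
    from_manifest(manifest, destination_table)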
def test_s3_file_upload_and_download_with_path_object(self):
    s3_file = S3File.from_local_file(local_path=self.S3_FILE_UPLOAD_PATH,
                                     s3_path=self.PATH_OBJECT)
    s3_file.download(destination_path=self.S3_FILE_DOWNLOAD_PATH)
    with open(self.S3_FILE_DOWNLOAD_PATH, 'r') as downloaded_file:
        self.assertEqual(downloaded_file.read(), self.S3_FILE_CONTENTS)
def test_s3_file_from_in_memory_data_upload_and_download(self):
    s3_file = S3File.from_in_memory_data(data=self.IN_MEMORY_DATA,
                                         s3_path=self.S3_PATH)
    s3_file.download(destination_path=self.S3_FILE_DOWNLOAD_PATH)
    data_uploaded_to_s3 = self._read_csv_as_list_of_tuples(
        self.S3_FILE_DOWNLOAD_PATH)
    self.assertEqual(data_uploaded_to_s3, self.IN_MEMORY_DATA)
def test_dictionary_becomes_json_file_in_s3(self):
    s3_file = S3File.from_json_serializable(data=self.DATA,
                                            s3_path=self.S3_PATH)
    temp_path = s3_file.download_to_temp()
    # Renamed from `file` to avoid shadowing the S3File variable (and the
    # builtin) with the open file handle.
    with open(temp_path) as downloaded_file:
        actual_data = json.load(downloaded_file)
    self.assertEqual(actual_data, self.DATA)
def test_upsert_audit(self):
    s3_file = S3File.from_local_file(local_path=self.LOCAL_FILE_PATH,
                                     s3_path=self.S3_PATH)
    s3_to_redshift(
        s3_file,
        RedshiftTable(self.DB_CONNECTION, self.TABLE,
                      self.UPSERT_UNIQUENESS_KEY))
    recorded_audit_data = self.DB_CONNECTION.fetch(
        self.AUDIT_TABLE_CONTENTS_QUERY)
    self.assertEqual(recorded_audit_data, self.EXPECTED_AUDIT_DATA)
def test_data_in_redshift(self):
    '''Because our destination database, `BasicRedshiftButActuallyPostgres`,
    is only pretending to be a Redshift database, the s3_to_redshift method
    should successfully move a local .csv file to it.'''
    s3_file = S3File.from_local_file(local_path=self.LOCAL_FILE_PATH,
                                     s3_path=self.S3_PATH)
    s3_to_redshift(
        s3_file,
        RedshiftTable(self.DB_CONNECTION, self.TABLE,
                      self.UPSERT_UNIQUENESS_KEY))
    current_data_in_table = self.DB_CONNECTION.fetch(
        self.DB_SELECT_ALL_QUERY)
    self.assertEqual(current_data_in_table, self.FILE_CONTENTS)
def test_vacuum_errors_are_swallowed(self, database_execute):
    database_execute.side_effect = [
        'pre_upsert_audit_table_insert_statement',
        'post_upsert_audit_table_update_statement',
        NotSupportedError(
            "VACUUM is running. HINT: re-execute after other vacuum finished"
        )
    ]
    s3_file = S3File.from_local_file(local_path=self.LOCAL_FILE_PATH,
                                     s3_path=self.S3_PATH)
    try:
        s3_to_redshift(
            s3_file,
            RedshiftTable(self.DB_CONNECTION, self.TABLE,
                          self.UPSERT_UNIQUENESS_KEY))
    except BaseException:
        self.fail('nothing should have errored here unexpectedly!')
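# A hedged sketch, not the project's actual implementation, of the swallow
# behavior the test above exercises: a NotSupportedError raised because a
# VACUUM is already running is ignored, while any other error propagates.
# The helper name and the `db_connection.execute` call are assumptions.
def _vacuum_ignoring_concurrent_runs(db_connection, table_name):
    try:
        db_connection.execute('VACUUM {}'.format(table_name))
    except NotSupportedError as error:
        # Swallow only the concurrent-vacuum case; re-raise anything else.
        if 'VACUUM is running' not in str(error):
            raise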
def test_s3_file_size(self):
    s3_file = S3File.from_local_file(local_path=self.S3_FILE_UPLOAD_PATH,
                                     s3_path=self.S3_PATH)
    expected_file_size = os.path.getsize(self.S3_FILE_UPLOAD_PATH)
    self.assertEqual(s3_file.file_size, expected_file_size)
def test_s3_file_from_in_memory_data_factory(self):
    s3_file = S3File.from_in_memory_data(data=self.IN_MEMORY_DATA,
                                         s3_path=self.S3_PATH)
    self.assertIsInstance(s3_file, S3File)
def test_s3_file_factory(self):
    s3_file = S3File.from_local_file(local_path=self.S3_FILE_UPLOAD_PATH,
                                     s3_path=self.S3_PATH)
    self.assertIsInstance(s3_file, S3File)
def test_init_with_path_object_sets_key(self):
    s3_file = S3File(s3_path=self.PATH_OBJECT)
    self.assertEqual(s3_file.key_name, self.KEY_NAME)
def test_s3_key_name(self):
    s3_file = S3File(self.S3_PATH)
    self.assertEqual(s3_file.key_name, self.S3_KEY_NAME)
def from_local_file(file_path, destination):
    '''Assumes a CSV'''
    s3_path = _transient_s3_path(destination) + '.csv'
    s3_file = S3File.from_local_file(file_path, s3_path)
    from_s3_file(s3_file, destination)
def from_s3_path(s3_path, destination):
    '''Assumes a CSV'''
    s3_file = S3File(s3_path)
    from_s3_file(s3_file, destination)
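# A minimal usage sketch for the two CSV entry points above, assuming
# `table` is a RedshiftTable as constructed in the tests; the local path
# and bucket name are illustrative.
def example_csv_ingestion(table):
    from_local_file('/tmp/users.csv', table)  # upload to S3, then ingest
    from_s3_path('s3://example-bucket/users.csv', table)  # ingest in place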
def test_file_size_of_non_existent_file_equals_0(self):
    self.assertEqual(S3File(self.S3_PATH).file_size, 0)
def test_init_with_path_object_sets_bucket(self):
    s3_file = S3File(s3_path=self.PATH_OBJECT)
    self.assertEqual(s3_file.bucket_name, self.S3_BUCKET_NAME)
def test_s3_bucket_name(self):
    s3_file = S3File(self.S3_PATH)
    self.assertEqual(s3_file.bucket_name, self.S3_BUCKET_NAME)
def __init__(self, file_path, destination, **kwargs):
    local_file_path = S3File(file_path).download_to_temp()
    super().__init__(local_file_path, destination, **kwargs)
    if self.with_manifest:
        raise ValueError(
            "Postgres cannot handle manifests like redshift. Sorry.")
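# A hedged usage sketch, assuming this __init__ belongs to a Postgres
# counterpart of the Redshift loader; the class name below is hypothetical,
# and `table` stands in for a destination like those in the tests.
#
#     loader = S3ToPostgres('s3://example-bucket/data.csv', table,
#                           with_manifest=True)  # raises ValueError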