def test_save_and_load_to_s3(self):
    # Mocked AWS credentials for moto.
    os.environ["AWS_ACCESS_KEY_ID"] = "fake_access_key"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "fake_secret_key"
    os.environ["AWS_SECURITY_TOKEN"] = "fake_security_token"
    os.environ["AWS_SESSION_TOKEN"] = "fake_session_token"

    s3 = boto3.client("s3", region_name="us-east-1")
    mock_bucket = "moto-mock-s3-bucket"
    # We need to create the bucket since this is all in moto's "virtual" AWS account.
    s3.create_bucket(Bucket=mock_bucket)
    dataset_path = f"s3://{mock_bucket}/datasets/dict"

    fs = S3FileSystem(key="fake_access_key", secret="fake_secret")

    dsets = self._create_dummy_dataset_dict()
    dsets.save_to_disk(dataset_path, fs)
    del dsets

    dsets = load_from_disk(dataset_path, fs)
    self.assertListEqual(sorted(dsets), ["test", "train"])
    self.assertEqual(len(dsets["train"]), 30)
    self.assertListEqual(dsets["train"].column_names, ["filename"])
    self.assertEqual(len(dsets["test"]), 30)
    self.assertListEqual(dsets["test"].column_names, ["filename"])
    del dsets
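
# The `_create_dummy_dataset_dict` helper called above is defined elsewhere on the test
# class. The sketch below is an assumption, not the library's actual fixture: it only
# mirrors what the assertions require (two splits of 30 rows, a single "filename" column).
from datasets import Dataset, DatasetDict


def _create_dummy_dataset_dict(self) -> DatasetDict:
    # Hypothetical fixture body, written as it would appear as a method on the
    # TestCase above (`self` is unused here).
    def dummy_split(prefix):
        return Dataset.from_dict({"filename": [f"{prefix}_file_{i}" for i in range(30)]})

    return DatasetDict({"train": dummy_split("train"), "test": dummy_split("test")})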
def test_is_remote_filesystem():
    fs = S3FileSystem(key="fake_access_key", secret="fake_secret")
    is_remote = is_remote_filesystem(fs)
    assert is_remote is True

    fs = fsspec.filesystem("file")
    is_remote = is_remote_filesystem(fs)
    assert is_remote is False
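
# For context, a hedged sketch of how such a check might be used outside the test:
# branching on `is_remote_filesystem` to decide whether a dataset directory can be used
# in place or must first be copied locally. The `ensure_local_copy` helper and the
# `datasets.filesystems` import path are assumptions for illustration, not library API.
import os
import tempfile

import fsspec
from datasets.filesystems import is_remote_filesystem  # assumed import path


def ensure_local_copy(fs: fsspec.AbstractFileSystem, path: str) -> str:
    """Illustrative helper: return a local directory for `path`, downloading it
    first when `fs` is a remote filesystem."""
    if not is_remote_filesystem(fs):
        # Local filesystem: the path can be used directly.
        return path
    # Remote filesystem: copy the directory into a temporary local folder.
    local_dir = tempfile.mkdtemp()
    fs.get(path, local_dir, recursive=True)
    return local_dir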
def test_distilbert_base(docker_image, processor, instance_type, sagemaker_local_session, py_version):
    from datasets import load_dataset
    from transformers import AutoTokenizer

    # tokenizer used in preprocessing
    tokenizer_name = 'distilbert-base-uncased'
    # dataset used
    dataset_name = 'imdb'
    # s3 key prefix for the data
    s3_prefix = 'samples/datasets/imdb'

    # load dataset
    dataset = load_dataset(dataset_name)

    # download tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # tokenizer helper function
    def tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True)

    # load dataset
    train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
    # shrink the test dataset to 100 examples to keep the test fast
    test_dataset = test_dataset.shuffle().select(range(100))

    # tokenize dataset
    train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
    test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

    # set format for pytorch
    train_dataset.rename_column_("label", "labels")
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.rename_column_("label", "labels")
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

    # hyperparameters, which are passed into the training job
    hyperparameters = {
        'max_steps': 5,
        'train_batch_size': 4,
        'model_name': 'distilbert-base-uncased',
    }

    s3 = S3FileSystem()

    # save train_dataset to s3
    training_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/train'
    train_dataset.save_to_disk(training_input_path, fs=s3)

    # save test_dataset to s3
    test_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/test'
    test_dataset.save_to_disk(test_input_path, fs=s3)

    estimator = HuggingFace(
        entry_point=distrilbert_script,
        instance_type='local_gpu',
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        instance_count=1,
        role=ROLE,
        py_version=py_version,
        hyperparameters=hyperparameters,
    )

    estimator.fit({
        'train': training_input_path,
        'test': test_input_path,
    })
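
# The training entry point referenced by `distrilbert_script` is not shown here. The
# sketch below is an assumption of what such a script typically does with the standard
# SageMaker channel environment variables (SM_CHANNEL_TRAIN, SM_CHANNEL_TEST, SM_MODEL_DIR)
# and the hyperparameters passed above; argument names and the training loop are illustrative.
import argparse
import os

from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Hyperparameters passed to the HuggingFace estimator arrive as CLI arguments.
    parser.add_argument("--max_steps", type=int, default=5)
    parser.add_argument("--train_batch_size", type=int, default=4)
    parser.add_argument("--model_name", type=str, default="distilbert-base-uncased")
    args, _ = parser.parse_known_args()

    # SageMaker downloads the "train" and "test" channels to these directories.
    train_dataset = load_from_disk(os.environ["SM_CHANNEL_TRAIN"])
    test_dataset = load_from_disk(os.environ["SM_CHANNEL_TEST"])

    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
    training_args = TrainingArguments(
        output_dir=model_dir,
        max_steps=args.max_steps,
        per_device_train_batch_size=args.train_batch_size,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    trainer.train()
    trainer.save_model(model_dir)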