import io
import tarfile
import tempfile
import warnings
from pathlib import Path

import pandas as pd

# Import locations assumed for the helpers and classes under test
from gobbli.experiment.base import blob_to_dir, dir_to_blob
from gobbli.experiment.classification import ClassificationExperimentResults


def test_classification_results_checkpoint(tmpdir):
    # Verify checkpoints can be extracted correctly regardless of format
    tempdir_path = Path(tmpdir)
    checkpoint_path = tempdir_path / "test_checkpoint"
    checkpoint_path.mkdir(parents=True)
    checkpoint_file = checkpoint_path / "checkpoint.txt"
    checkpoint_contents = "test"
    checkpoint_file.write_text(checkpoint_contents)
    checkpoint_bytes = dir_to_blob(checkpoint_path)

    common_args = {
        "training_results": [],
        "labels": [],
        "X": [],
        "y_true": [],
        "y_pred_proba": pd.DataFrame(),
    }
    bytes_results = ClassificationExperimentResults(
        **common_args,
        best_model_checkpoint=checkpoint_bytes,
        best_model_checkpoint_name=checkpoint_file.name,
    )
    path_results = ClassificationExperimentResults(
        **common_args,
        best_model_checkpoint=checkpoint_path,
        best_model_checkpoint_name=checkpoint_file.name,
    )

    # Bytes checkpoint, no base_path (results object creates tempdir)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "No base_path provided; checkpoint extracting to temporary directory.",
        )
        bytes_checkpoint = bytes_results.get_checkpoint()
        assert bytes_checkpoint.read_text() == checkpoint_contents

    # Bytes checkpoint, base path
    with tempfile.TemporaryDirectory() as test_dir:
        test_dir_path = Path(test_dir) / "test"
        bytes_checkpoint = bytes_results.get_checkpoint(base_path=test_dir_path)
        assert bytes_checkpoint.parent == test_dir_path
        assert bytes_checkpoint.read_text() == checkpoint_contents

    # Path checkpoint, no base path
    path_checkpoint = path_results.get_checkpoint()
    assert path_checkpoint == checkpoint_path / checkpoint_file.name
    assert path_checkpoint.read_text() == checkpoint_contents

    # Path checkpoint, base path
    with tempfile.TemporaryDirectory() as test_dir:
        test_dir_path = Path(test_dir) / "test"
        path_checkpoint = path_results.get_checkpoint(base_path=test_dir_path)
        assert path_checkpoint.parent == test_dir_path
        assert path_checkpoint.read_text() == checkpoint_contents
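# For reference, a minimal sketch of the get_checkpoint() dispatch the test
# above exercises (hypothetical -- not the gobbli implementation): a bytes
# checkpoint is extracted via blob_to_dir into base_path (or a temp dir, with
# a warning), while a path checkpoint is returned in place or copied under
# base_path.
def _get_checkpoint_sketch(checkpoint, checkpoint_name, base_path=None):
    import shutil

    if checkpoint is None:
        return None
    if isinstance(checkpoint, bytes):
        if base_path is None:
            warnings.warn(
                "No base_path provided; checkpoint extracting to temporary directory."
            )
            base_path = Path(tempfile.mkdtemp())
        blob_to_dir(checkpoint, base_path)
        return base_path / checkpoint_name
    if base_path is not None:
        # copytree requires that base_path not already exist
        shutil.copytree(checkpoint, base_path)
        return base_path / checkpoint_name
    return checkpoint / checkpoint_name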
def test_blob_to_dir(tmpdir):
    test_dir = Path(tmpdir) / "test"
    test_dir.mkdir()
    test_file_name = "test.txt"
    test_file = test_dir / test_file_name
    file_contents = "test"
    test_file.write_text(file_contents)

    blob = dir_to_blob(test_dir)
    extract_path = test_dir / "test2"
    blob_to_dir(blob, extract_path)

    extracted_file = extract_path / test_file_name
    assert extracted_file.exists()
    assert extracted_file.read_text() == file_contents
def test_dir_to_blob(tmpdir):
    test_dir = Path(tmpdir) / "test"
    test_dir.mkdir()
    test_file_name = "test.txt"
    test_file = test_dir / test_file_name
    file_contents = "test"
    test_file.write_text(file_contents)

    blob = dir_to_blob(test_dir)
    fileobj = io.BytesIO(blob)
    fileobj.seek(0)

    extract_path = test_dir / "test2"
    with tarfile.open(fileobj=fileobj, mode="r:gz") as archive:
        archive.extractall(extract_path)

    extracted_file = extract_path / test_file_name
    assert extracted_file.exists()
    assert extracted_file.read_text() == file_contents
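# The helpers exercised by the two tests above aren't shown in this section;
# here is a plausible sketch of each, assuming the gzipped-tar format that
# test_dir_to_blob confirms by reading the blob back with mode="r:gz"
# (sketches only, not the gobbli source):
def dir_to_blob_sketch(dir_path: Path) -> bytes:
    fileobj = io.BytesIO()
    with tarfile.open(fileobj=fileobj, mode="w:gz") as archive:
        # Store members relative to the directory so extraction recreates
        # its contents rather than the directory itself
        archive.add(str(dir_path), arcname=".")
    return fileobj.getvalue()


def blob_to_dir_sketch(blob: bytes, extract_path: Path) -> None:
    extract_path.mkdir(parents=True, exist_ok=True)
    with tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz") as archive:
        archive.extractall(str(extract_path))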
import logging
from pathlib import Path
from typing import Any, Dict, Optional, Union

import ray

import gobbli.io

# init_worker_env, init_gpu_config, get_worker_ip, is_ray_local_mode,
# dir_to_blob, and RemoteTrainResult are assumed to be defined elsewhere
# in this package.


def train(
    X_train: Any,
    y_train: Any,
    X_valid: Any,
    y_valid: Any,
    train_batch_size: int,
    valid_batch_size: int,
    num_train_epochs: int,
    model_cls: Any,
    model_params: Dict[str, Any],
    master_ip: str,
    gobbli_dir: Optional[Path] = None,
    log_level: Union[int, str] = logging.WARNING,
    distributed: bool = False,
) -> RemoteTrainResult:
    logger = init_worker_env(gobbli_dir=gobbli_dir, log_level=log_level)
    use_gpu, nvidia_visible_devices = init_gpu_config()

    worker_ip = get_worker_ip()
    if not distributed and worker_ip != master_ip:
        raise RuntimeError(
            "Experiments must be started with distributed = True to run "
            "tasks on remote workers."
        )

    clf = model_cls(
        **model_params,
        use_gpu=use_gpu,
        nvidia_visible_devices=nvidia_visible_devices,
        logger=logger,
    )

    clf.build()

    train_input = gobbli.io.TrainInput(
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        train_batch_size=train_batch_size,
        valid_batch_size=valid_batch_size,
        num_train_epochs=num_train_epochs,
    )
    train_output = clf.train(train_input)

    checkpoint = train_output.checkpoint
    checkpoint_name = getattr(checkpoint, "name", None)
    if distributed:
        # Copy weights into the object store, since we don't share a filesystem
        # with the master node
        checkpoint = dir_to_blob(checkpoint.parent) if checkpoint is not None else None
        if not is_ray_local_mode():
            checkpoint = ray.put(checkpoint)

    return RemoteTrainResult(
        metadata=train_output.metadata(),
        labels=train_output.labels,
        checkpoint_name=checkpoint_name,
        checkpoint_id=checkpoint,
        model_params=model_params,
        ip_address=worker_ip,
    )
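# Hypothetical driver-side usage (model_cls and the toy data below are
# placeholders, not part of this module): wrap train() as a Ray task so it
# runs on a worker and returns a RemoteTrainResult.
if __name__ == "__main__":
    ray.init()

    remote_train = ray.remote(train)
    result_ref = remote_train.remote(
        X_train=["good movie", "bad movie"],
        y_train=["pos", "neg"],
        X_valid=["fine movie"],
        y_valid=["pos"],
        train_batch_size=32,
        valid_batch_size=32,
        num_train_epochs=1,
        model_cls=model_cls,  # placeholder: any gobbli classifier class
        model_params={},
        master_ip=get_worker_ip(),
        distributed=False,
    )
    train_result = ray.get(result_ref)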