Ejemplo n.º 1
0
def test_classification_results_checkpoint(tmpdir):
    """
    Verify checkpoints can be extracted correctly regardless of format.

    Two results objects are built from the same on-disk checkpoint directory:
    one holding the checkpoint as a blob produced by ``dir_to_blob`` and one
    holding it as a path.  ``get_checkpoint`` is then exercised for both, with
    and without an explicit ``base_path``.
    """
    tempdir_path = Path(tmpdir)
    checkpoint_path = tempdir_path / "test_checkpoint"
    checkpoint_path.mkdir(parents=True)
    checkpoint_file = checkpoint_path / "checkpoint.txt"

    checkpoint_contents = "test"
    checkpoint_file.write_text(checkpoint_contents)
    checkpoint_bytes = dir_to_blob(checkpoint_path)

    # Arguments irrelevant to checkpoint handling; shared by both results.
    common_args = {
        "training_results": [],
        "labels": [],
        "X": [],
        "y_true": [],
        "y_pred_proba": pd.DataFrame(),
    }
    bytes_results = ClassificationExperimentResults(
        **common_args,
        best_model_checkpoint=checkpoint_bytes,
        best_model_checkpoint_name=checkpoint_file.name,
    )

    path_results = ClassificationExperimentResults(
        **common_args,
        best_model_checkpoint=checkpoint_path,
        best_model_checkpoint_name=checkpoint_file.name,
    )

    # Bytes checkpoint, no base_path (results object creates tempdir)
    with warnings.catch_warnings():
        # Silence the expected warning about extracting to a temp directory.
        warnings.filterwarnings(
            "ignore",
            "No base_path provided; checkpoint extracting to temporary directory.",
        )
        bytes_checkpoint = bytes_results.get_checkpoint()
    assert bytes_checkpoint.read_text() == checkpoint_contents

    # Bytes checkpoint, base path
    with tempfile.TemporaryDirectory() as test_dir:
        test_dir_path = Path(test_dir) / "test"
        bytes_checkpoint = bytes_results.get_checkpoint(
            base_path=test_dir_path)
        assert bytes_checkpoint.parent == test_dir_path
        assert bytes_checkpoint.read_text() == checkpoint_contents

    # Path checkpoint, no base path
    path_checkpoint = path_results.get_checkpoint()
    # Join the directory with the checkpoint *name*: checkpoint_file is already
    # an absolute path, so joining it directly would silently discard
    # checkpoint_path and obscure what is being compared.
    assert path_checkpoint == checkpoint_path / checkpoint_file.name
    assert path_checkpoint.read_text() == checkpoint_contents

    # Path checkpoint, base path
    with tempfile.TemporaryDirectory() as test_dir:
        test_dir_path = Path(test_dir) / "test"
        path_checkpoint = path_results.get_checkpoint(base_path=test_dir_path)
        assert path_checkpoint.parent == test_dir_path
        assert path_checkpoint.read_text() == checkpoint_contents
Ejemplo n.º 2
0
def test_blob_to_dir(tmpdir):
    """Round-trip a directory through ``dir_to_blob`` and ``blob_to_dir``."""
    source_dir = Path(tmpdir) / "test"
    source_dir.mkdir()
    name, payload = "test.txt", "test"
    (source_dir / name).write_text(payload)

    # Archive the directory, then unpack the blob into a nested "test2" dir.
    archived = dir_to_blob(source_dir)
    target_dir = source_dir / "test2"
    blob_to_dir(archived, target_dir)

    # The single file must come back with its original contents.
    restored = target_dir / name
    assert restored.exists()
    assert restored.read_text() == payload
Ejemplo n.º 3
0
def test_dir_to_blob(tmpdir):
    """Check that the output of ``dir_to_blob`` unpacks as a gzipped tar."""
    source_dir = Path(tmpdir) / "test"
    source_dir.mkdir()
    name, payload = "test.txt", "test"
    (source_dir / name).write_text(payload)

    archived = dir_to_blob(source_dir)
    target_dir = source_dir / "test2"
    # A freshly created BytesIO is already positioned at the start of the data,
    # so it can be handed straight to tarfile.
    with tarfile.open(fileobj=io.BytesIO(archived), mode="r:gz") as tar:
        tar.extractall(target_dir)

    # The single file must come back with its original contents.
    restored = target_dir / name
    assert restored.exists()
    assert restored.read_text() == payload
Ejemplo n.º 4
0
        def train(
            X_train: Any,
            y_train: Any,
            X_valid: Any,
            y_valid: Any,
            train_batch_size: int,
            valid_batch_size: int,
            num_train_epochs: int,
            model_cls: Any,
            model_params: Dict[str, Any],
            master_ip: str,
            gobbli_dir: Optional[Path] = None,
            log_level: Union[int, str] = logging.WARNING,
            distributed: bool = False,
        ) -> RemoteTrainResult:
            """
            Build and train one model on the current worker and package the result.

            Args:
              X_train, y_train, X_valid, y_valid: Data passed through unchanged
                to ``gobbli.io.TrainInput``.
              train_batch_size, valid_batch_size, num_train_epochs: Training
                settings passed through unchanged to ``gobbli.io.TrainInput``.
              model_cls: Callable used to construct the model.
              model_params: Keyword arguments for ``model_cls``; also echoed back
                in the result.
              master_ip: IP address the worker's own IP is compared against.
              gobbli_dir: Forwarded to ``init_worker_env``.
              log_level: Forwarded to ``init_worker_env``.
              distributed: Whether running on a worker other than the master
                is permitted.  When true, the checkpoint is shipped as a blob
                instead of being returned as-is.

            Raises:
              RuntimeError: If the worker's IP differs from ``master_ip`` while
                ``distributed`` is false.

            Returns:
              A ``RemoteTrainResult`` describing the training run.  Its
              ``checkpoint_id`` is the checkpoint itself in ray local mode and
              the value returned by ``ray.put`` otherwise.
            """
            logger = init_worker_env(gobbli_dir=gobbli_dir, log_level=log_level)
            use_gpu, nvidia_visible_devices = init_gpu_config()

            # Refuse to run away from the master unless explicitly allowed.
            worker_ip = get_worker_ip()
            on_other_node = worker_ip != master_ip
            if on_other_node and not distributed:
                raise RuntimeError(
                    "Experiments must be started with distributed = True to run "
                    "tasks on remote workers.")

            model = model_cls(
                **model_params,
                use_gpu=use_gpu,
                nvidia_visible_devices=nvidia_visible_devices,
                logger=logger,
            )
            model.build()

            fit_result = model.train(
                gobbli.io.TrainInput(
                    X_train=X_train,
                    y_train=y_train,
                    X_valid=X_valid,
                    y_valid=y_valid,
                    train_batch_size=train_batch_size,
                    valid_batch_size=valid_batch_size,
                    num_train_epochs=num_train_epochs,
                ))

            # Capture the name before the checkpoint is possibly replaced by a
            # blob or an object-store reference below.
            checkpoint = fit_result.checkpoint
            checkpoint_name = getattr(checkpoint, "name", None)

            if distributed and checkpoint is not None:
                # Copy weights into the object store, since we don't share a
                # filesystem with the master node
                checkpoint = dir_to_blob(checkpoint.parent)

            if not is_ray_local_mode():
                checkpoint = ray.put(checkpoint)

            return RemoteTrainResult(
                metadata=fit_result.metadata(),
                labels=fit_result.labels,
                checkpoint_name=checkpoint_name,
                checkpoint_id=checkpoint,
                model_params=model_params,
                ip_address=worker_ip,
            )