Example #1
def get_dataset():
    # Train dataset should be sharded.
    train_dataset = train.get_dataset_shard("train")
    assert train_dataset.count() == num_train_data / scale_config["num_workers"]
    # All other datasets should not be sharded.
    val_dataset = train.get_dataset_shard("val")
    assert val_dataset.count() == num_val_data
Example #2
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = train.get_dataset_shard("train")
    validation_dataset_pipeline_shard = train.get_dataset_shard("validation")

    device = torch.device(
        f"cuda:{train.local_rank()}" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(device)

    model = nn.Linear(1, hidden_size)
    model = model.to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[train.local_rank()] if torch.cuda.is_available() else None)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    train_dataset_iterator = train_dataset_pipeline_shard.iter_datasets()
    validation_dataset_iterator = \
        validation_dataset_pipeline_shard.iter_datasets()

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size)

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                device)
        train.report(**result)
        results.append(result)

    return results
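Because the loop above calls iter_datasets() on each shard, it expects the named datasets to arrive as DatasetPipeline objects. For context, here is a minimal driver-side sketch, assuming the pre-2.0 ray.train.Trainer API (backend/num_workers arguments and a dataset= keyword on run()); the data, config values, and worker count are illustrative placeholders.

import ray
from ray.train import Trainer

# Illustrative data with the "x"/"y" columns consumed by train_func above.
train_ds = ray.data.from_items([{"x": float(i), "y": 2.0 * i} for i in range(1000)])
val_ds = ray.data.from_items([{"x": float(i), "y": 2.0 * i} for i in range(200)])

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"batch_size": 32, "epochs": 3, "lr": 1e-2},
    # repeat() turns each Dataset into a DatasetPipeline, so iter_datasets()
    # inside train_func yields one Dataset per epoch.
    dataset={"train": train_ds.repeat(), "validation": val_ds.repeat()},
)
trainer.shutdown()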
Example #3
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        rt.get_dataset_shard("train"),
        features,
        training_set_metadata,
    )

    try:
        val_shard = rt.get_dataset_shard("val")
    except KeyError:
        val_shard = None

    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    try:
        test_shard = rt.get_dataset_shard("test")
    except KeyError:
        test_shard = None

    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    trainer = RemoteTrainer(model=model, **executable_kwargs)
    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

    # TODO(shreya): Figure out GPU memory leak
    # TODO(shreya): Check if placing model off GPU explicitly makes a difference
    # Clear CUDA memory, place model on CPU, return model to user
    # torch.cuda.empty_cache()
    # model.cpu()

    return results, trainer.validation_field, trainer.validation_metric
Example #4
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = train.get_dataset_shard("train")
    validation_dataset_pipeline_shard = train.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs()
    validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs()

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                device)
        train.report(**result)
        results.append(result)

    return results
Example #5
def train_loop_per_worker():
    data_shard = train.get_dataset_shard("train")
    if expect_ds:
        assert isinstance(data_shard, Dataset), data_shard
    else:
        assert isinstance(data_shard, DatasetPipeline), data_shard
    for k, v in expect_sizes.items():
        shard = train.get_dataset_shard(k)
        if v == -1:
            assert shard is None, shard
        else:
            if isinstance(shard, DatasetPipeline):
                assert next(shard.iter_epochs()).count() == v, shard
            else:
                assert shard.count() == v, shard
Example #6
def train_loop_per_worker(config):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["num_epochs"]
    num_features = config["num_features"]

    # Get the Ray Dataset shard for this data parallel worker,
    # and convert it to a Tensorflow Dataset.
    train_data = train.get_dataset_shard("train")

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = create_keras_model(num_features)
        multi_worker_model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=lr),
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[
                tf.keras.metrics.BinaryCrossentropy(
                    name="loss",
                )
            ],
        )

    results = []
    for _ in range(epochs):
        tf_dataset = to_tf_dataset(dataset=train_data, batch_size=batch_size)
        history = multi_worker_model.fit(
            tf_dataset,
            callbacks=[KerasCallback()],
            verbose=0,
        )
        results.append(history.history)
    return results
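A loop like the one above is launched from the driver by a data-parallel trainer that splits the "train" dataset across workers. Below is a minimal driver-side sketch, assuming the Ray AIR-era TensorflowTrainer API; create_keras_model, to_tf_dataset, and KerasCallback come from the example itself, while the dataset contents and config values are placeholders.

import ray
from ray.air.config import ScalingConfig
from ray.train.tensorflow import TensorflowTrainer

# Toy binary-classification data; real columns would match create_keras_model.
dataset = ray.data.from_items(
    [{"feature": float(i % 10), "label": float(i % 2)} for i in range(1000)]
)

trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={
        "batch_size": 128,
        "lr": 1e-3,
        "num_epochs": 3,
        "num_features": 1,
    },
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": dataset},
)
result = trainer.fit()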
Example #7
def eval_fn(
    predictor_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        eval_shard = RayDatasetShard(
            rt.get_dataset_shard("eval"),
            features,
            training_set_metadata,
        )

        model = ray.get(model_ref)
        device = get_torch_device()
        model = model.to(device)

        predictor = RemotePredictor(model=model,
                                    horovod=hvd,
                                    report_tqdm_to_ray=True,
                                    **predictor_kwargs)
        return predictor.batch_evaluation(eval_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
Example #8
def train_func(config):
    batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_and_compile_model(config)

    dataset_pipeline = train.get_dataset_shard()
    dataset_iterator = dataset_pipeline.iter_epochs()

    results = []
    for _ in range(epochs):
        dataset = next(dataset_iterator)
        tf_dataset = prepare_dataset_shard(
            dataset.to_tf(
                label_column="y",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                    tf.TensorSpec(shape=(None), dtype=tf.float32),
                ),
                batch_size=batch_size,
            ))
        history = multi_worker_model.fit(tf_dataset,
                                         callbacks=[TrainReportCallback()])
        results.append(history.history)
    return results
Example #9
def train_func(config: dict):
    batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_model()
        multi_worker_model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=config.get("lr", 1e-3)),
            loss=tf.keras.losses.mean_squared_error,
            metrics=[tf.keras.metrics.mean_squared_error],
        )

    dataset = train.get_dataset_shard("train")

    results = []
    for _ in range(epochs):
        tf_dataset = prepare_dataset_shard(
            dataset.to_tf(
                label_column="y",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                    tf.TensorSpec(shape=(None), dtype=tf.float32),
                ),
                batch_size=batch_size,
            )
        )
        history = multi_worker_model.fit(
            tf_dataset, callbacks=[TrainCheckpointReportCallback()]
        )
        results.append(history.history)
    return results
Example #10
def train_loop_per_worker():
    data_shard = train.get_dataset_shard("train")
    assert isinstance(data_shard, DatasetPipeline), data_shard
    results = []
    for epoch in data_shard.iter_epochs(2):
        results.append(epoch.take())
    check_results_fn(data_shard, results)
Example #11
def get_dataset():
    data_train_all_epochs = []
    data_val_all_epochs = []
    for _ in range(2):
        data_this_epoch_train = []
        train_dataset = train.get_dataset_shard("train")
        for batch in train_dataset.iter_batches():
            data_this_epoch_train.extend(batch)
        data_train_all_epochs.append(data_this_epoch_train)

        data_this_epoch_val = []
        val_dataset = train.get_dataset_shard("val")
        for batch in val_dataset.iter_batches():
            data_this_epoch_val.extend(batch)
        data_val_all_epochs.append(data_this_epoch_val)

    return data_train_all_epochs, data_val_all_epochs
Example #12
def get_dataset():
    data_all_epochs = []
    for _ in range(2):
        data_this_epoch = []
        dataset = train.get_dataset_shard()
        for batch in dataset.iter_batches():
            data_this_epoch.extend(batch)
        data_all_epochs.append(data_this_epoch)
    return data_all_epochs
Example #13
def train_loop_per_worker():
    data_shard: Dataset = train.get_dataset_shard("train")

    # Iterate over 10 epochs of data.
    for epoch in range(10):
        for batch in data_shard.iter_batches(
            batch_size=10_000,
            local_shuffle_buffer_size=100_000,
        ):
            print("Do some training on batch", batch)
Example #14
def train_loop_per_worker():
    data_shard: Dataset = train.get_dataset_shard("train")

    # Iterate over 10 epochs of data.
    for epoch in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())
Example #15
def get_dataset():
    pipeline_iterator = train.get_dataset_shard().iter_datasets()
    data_all_epochs = []
    for _ in range(num_epochs):
        dataset_this_epoch = next(pipeline_iterator)
        data_this_epoch = []
        for batch in dataset_this_epoch.iter_batches():
            data_this_epoch.extend(batch)
        data_all_epochs.append(data_this_epoch)
    return data_all_epochs
Example #16
def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = train.get_dataset_shard("train")

    # Use iter_epochs(10) to iterate over 10 epochs of data.
    for epoch in data_shard.iter_epochs(10):
        for batch in epoch.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())
Example #17
def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard: Dataset = train.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())
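Examples #16 and #17 differ only in how the dataset is ingested. On Ray versions from roughly the 2.0 era, that choice was made on the driver through the trainer's dataset_config; the sketch below assumes the DatasetConfig.use_stream_api flag referenced in the comment of Example #16 (later Ray releases replaced this option, so treat it as version-specific).

import ray
from ray.air.config import DatasetConfig, ScalingConfig
from ray.train.torch import TorchTrainer

ds = ray.data.range(1000)

# Bulk ingest (default): each worker's shard is a Dataset, as in Example #17.
bulk_trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": ds},
)

# Streaming ingest: the shard becomes a DatasetPipeline, as in Example #16.
stream_trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": ds},
    dataset_config={"train": DatasetConfig(use_stream_api=True)},
)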
Example #18
def train_loop_per_worker():
    import pandas as pd

    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []

    print("Starting train loop on worker", rank)
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
            prefetch_blocks=prefetch_blocks, batch_size=batch_size
        ):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            if isinstance(batch, pd.DataFrame):
                num_bytes += int(
                    batch.memory_usage(index=True, deep=True).sum()
                )
            elif isinstance(batch, np.ndarray):
                num_bytes += batch.nbytes
            else:
                # NOTE: This isn't recursive and will just return the size of
                # the object pointers if the batch is a list of non-primitive types.
                num_bytes += sys.getsizeof(batch)
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()
    delta = time.perf_counter() - start

    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print(
        "Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s"
    )

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
Example #19
def get_dataset():
    pipeline_iterator = train.get_dataset_shard().iter_datasets()
    data_all_epochs = []
    for _ in range(2):
        dataset_this_epoch = next(pipeline_iterator)
        data_this_epoch = []
        for batch in dataset_this_epoch.iter_batches():
            data_this_epoch.extend(batch)

        if len(data_all_epochs) > 0:
            # Make sure data is shuffled per epoch.
            assert data_this_epoch != data_all_epochs[-1]

        data_all_epochs.append(data_this_epoch)
    return data_all_epochs
Example #20
def train_loop_per_worker():
    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []

    print("Starting train loop on worker", rank)
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
                prefetch_blocks=prefetch_blocks,
                batch_size=batch_size):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            num_bytes += int(
                batch.memory_usage(index=True, deep=True).sum())
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()
    delta = time.perf_counter() - start

    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print("Mean throughput",
          round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s")

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(
            data_shard.stats()))
Example #21
def train_func(config: dict):
    per_worker_batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)

    dataset_shard = train.get_dataset_shard("train")

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_autoencoder_model()
        learning_rate = config.get("lr", 0.001)
        multi_worker_model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            metrics=[
                "binary_crossentropy",
            ],
        )

    results = []
    for epoch in range(epochs):
        tf_dataset = prepare_dataset_shard(
            dataset_shard.to_tf(
                feature_columns=["image"],
                label_column="label",
                output_signature=(
                    tf.TensorSpec(shape=(None, 784), dtype=tf.float32),
                    tf.TensorSpec(shape=(None, 784), dtype=tf.float32),
                ),
                batch_size=per_worker_batch_size,
            ))
        history = multi_worker_model.fit(
            tf_dataset, callbacks=[TrainCheckpointReportCallback()])
        results.append(history.history)
    return results
Example #22
def train_loop_per_worker(config):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["num_epochs"]
    num_features = config["num_features"]

    # Get the Ray Dataset shard for this data parallel worker,
    # and iterate over it as batches of PyTorch tensors.
    train_data = train.get_dataset_shard("train")

    def to_tensor_iterator(dataset, batch_size):
        data_iterator = dataset.iter_batches(batch_format="numpy",
                                             batch_size=batch_size)
        for d in data_iterator:
            # "concat_out" is the output column of the Concatenator.
            yield (
                torch.Tensor(d["concat_out"]).float(),
                torch.Tensor(d["target"]).float(),
            )

    # Create model.
    model = create_model(num_features)
    model = train.torch.prepare_model(model)

    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for cur_epoch in range(epochs):
        for inputs, labels in to_tensor_iterator(train_data, batch_size):
            optimizer.zero_grad()
            predictions = model(inputs)
            train_loss = loss_fn(predictions, labels.unsqueeze(1))
            train_loss.backward()
            optimizer.step()
        loss = train_loss.item()
        session.report({"loss": loss},
                       checkpoint=TorchCheckpoint.from_model(model))
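The "concat_out" column read above is produced by a Concatenator preprocessor applied on the driver. The following driver-side sketch assumes the Ray AIR-era TorchTrainer and Concatenator APIs (the preprocessor argument and the output_column_name parameter); the column names, data, and config values are illustrative.

import ray
from ray.air.config import ScalingConfig
from ray.data.preprocessors import Concatenator
from ray.train.torch import TorchTrainer

# Toy tabular data: two numeric features and a binary "target" label.
ds = ray.data.from_items(
    [{"f1": float(i), "f2": float(i % 3), "target": float(i % 2)} for i in range(1000)]
)

# Pack the feature columns into the single "concat_out" column that
# to_tensor_iterator() above reads.
preprocessor = Concatenator(output_column_name="concat_out", exclude=["target"])

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"batch_size": 64, "lr": 1e-2, "num_epochs": 3, "num_features": 2},
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": ds},
    preprocessor=preprocessor,
)
result = trainer.fit()
print(result.metrics)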
Example #23
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        train_shard = RayDatasetShard(
            rt.get_dataset_shard("train"),
            features,
            training_set_metadata,
        )

        try:
            val_shard = rt.get_dataset_shard("val")
        except KeyError:
            val_shard = None

        if val_shard is not None:
            val_shard = RayDatasetShard(
                val_shard,
                features,
                training_set_metadata,
            )

        try:
            test_shard = rt.get_dataset_shard("test")
        except KeyError:
            test_shard = None

        if test_shard is not None:
            test_shard = RayDatasetShard(
                test_shard,
                features,
                training_set_metadata,
            )

        model = ray.get(model_ref)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

        if results is not None:
            # only return the model state dict back to the head node.
            trained_model, *args = results
            results = (trained_model.cpu().state_dict(), *args)

        torch.cuda.empty_cache()

        train_results = results, trainer.validation_field, trainer.validation_metric

    finally:
        hvd.shutdown()
    return train_results
Example #24
def train_func(config):
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    device = torch.device(f"cuda:{train.local_rank()}"
                          if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    train_dataset_pipeline = train.get_dataset_shard("train_dataset")
    train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
    test_dataset = train.get_dataset_shard("test_dataset")
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    net = train.torch.prepare_model(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer,
            num_features)
        train_acc = train_num_correct / train_num_total
        print(f"epoch [{epoch + 1}]: training accuracy: "
              f"{train_num_correct} / {train_num_total} = {train_acc:.4f}")

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(f"epoch [{epoch + 1}]: testing accuracy: "
              f"{test_num_correct} / {test_num_total} = {test_acc:.4f}")

        # Record and log stats.
        train.report(
            train_acc=train_acc,
            train_loss=train_running_loss,
            test_acc=test_acc,
            test_loss=test_running_loss,
        )

        # Checkpoint model.
        module = net.module if isinstance(net,
                                          DistributedDataParallel) else net
        train.save_checkpoint(model_state_dict=module.state_dict())

    if train.world_rank() == 0:
        return module.cpu()
Example #25
def train_func(config):
    is_distributed = config.get("is_distributed", False)
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    if is_distributed:
        device = torch.device(f"cuda:{train.local_rank()}" if use_gpu
                              and torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(
            "cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    if is_distributed:
        train_dataset_pipeline = train.get_dataset_shard("train_dataset")
        train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
        test_dataset = train.get_dataset_shard("test_dataset")
    else:
        train_dataset_epoch_iterator = config["train_dataset"].iter_epochs()
        test_dataset = config["test_dataset"]
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    # Setup Tensorboard and MLflow.
    if is_distributed:
        # Setup is done through Callback.
        pass
    else:
        writer = SummaryWriter()
        mlflow.start_run()
        mlflow_config = config.copy()
        mlflow_config.pop("test_dataset")
        mlflow_config.pop("train_dataset")
        mlflow.log_params(mlflow_config)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    if is_distributed:
        net = DistributedDataParallel(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer)
        train_acc = train_num_correct / train_num_total
        print(
            f"epoch [{epoch + 1}]: training accuracy: {train_num_correct} / {train_num_total} = {train_acc:.4f}"
        )

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(
            f"epoch [{epoch + 1}]: testing accuracy: {test_num_correct} / {test_num_total} = {test_acc:.4f}"
        )

        # Record and log stats.
        if is_distributed:
            train.report(train_acc=train_acc,
                         train_loss=train_running_loss,
                         test_acc=test_acc,
                         test_loss=test_running_loss)
        else:
            writer.add_scalar("Accuracy/train", train_acc, epoch)
            writer.add_scalar("Loss/train", train_running_loss, epoch)
            writer.add_scalar("Accuracy/test", test_acc, epoch)
            writer.add_scalar("Loss/test", test_running_loss, epoch)
            writer.flush()

            mlflow.log_metrics({
                "train_acc": train_acc,
                "train_loss": train_running_loss,
                "test_acc": test_acc,
                "test_loss": test_running_loss
            })

        # Checkpoint model.
        if is_distributed:
            import copy
            model_copy = copy.deepcopy(net.module)
            train.save_checkpoint(
                model_state_dict=model_copy.cpu().state_dict())
        else:
            torch.save(net.state_dict(), f"models/model-epoch-{epoch}.torch")

    # Shutdown Tensorboard and MLflow.
    if is_distributed:
        pass
    else:
        writer.close()
        # mlflow.end_run()

    if is_distributed:
        if train.world_rank() == 0:
            return net.module.cpu()
        else:
            return None
    else:
        return net
Example #26
def train_func():
    return train.get_dataset_shard()
Example #27
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to setup DDP
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local enviroment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added.
    # Aside from duplicating the functionality of our callbacks,
    # the Wandb callback causes training to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name)
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )
    trainer.train(resume_from_checkpoint=checkpoint_path)
    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
Example #28
def train_loop_per_worker():
    data_shard = train.get_dataset_shard("train")
    assert isinstance(data_shard, Dataset), data_shard
    results = data_shard.take()
    check_results_fn(data_shard, results)