Example #1
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    # Run `train_func` on `num_workers` torch workers, optionally on GPUs.
    trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    # Everything reported via train.report() is logged to JSON and TensorBoard.
    results = trainer.run(
        train_func, config, callbacks=[JsonLoggerCallback(), TBXLoggerCallback()]
    )
    trainer.shutdown()

    print(results)
    return results
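Every example below assumes a `train_func` defined elsewhere. A minimal sketch of one that fits this legacy Ray Train (1.x) API is given here; the linear-regression details are illustrative assumptions, not part of the original examples:

import torch
from ray import train


def train_func(config):
    # Tiny linear-regression loop; any code that calls train.report() works here.
    model = torch.nn.Linear(1, config["hidden_size"])
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    loss_fn = torch.nn.MSELoss()
    X = torch.randn(config["batch_size"], 1)
    y = 3 * X + 1
    for epoch in range(config["epochs"]):
        loss = loss_fn(model(X), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Each report() becomes one result row for the logger callbacks.
        train.report(epoch=epoch, loss=loss.item())
    return model.state_dict()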
Example #2
def train_linear(num_workers=2):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
    )
    trainer.shutdown()

    print(results)
    return results
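`TorchConfig(backend="gloo")` pins the torch process group to Gloo, the CPU-oriented collective backend; for multi-GPU training NCCL is the usual choice. A one-line variant (not part of the original example):

trainer = Trainer(TorchConfig(backend="nccl"), num_workers=num_workers, use_gpu=True)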
Example #3
def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
    )
    trainer.shutdown()
    print(results)
    return results
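`get_datasets()` is not shown. With this API, the `dataset` argument can be a Ray Dataset or a dict of named Ray Datasets that get sharded across the workers; a minimal placeholder, where the key name and toy data are assumptions:

import ray.data


def get_datasets():
    # Tiny in-memory dataset; real code would read Parquet/CSV/etc. instead.
    return {"train": ray.data.from_items([{"x": i, "y": 2 * i} for i in range(32)])}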
Example #4
def test_TBX(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    temp_dir = tmp_path
    num_workers = 4

    def train_func():
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
        return 1

    callback = TBXLoggerCallback(temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    _validate_tbx_result(temp_dir)
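The `_validate_tbx_result` helper is not shown; a minimal stand-in might simply assert that the callback wrote TensorBoard event files (the real test's exact checks are unknown):

import os


def _validate_tbx_result(logdir):
    # TBXLoggerCallback writes standard tfevents files under its logdir.
    event_files = []
    for _root, _dirs, files in os.walk(str(logdir)):
        event_files += [f for f in files if "tfevents" in f]
    assert event_files, f"no TensorBoard event files found under {logdir}"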
Example #5
from ray import train
from ray.train import Trainer
from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback


def train_func():
    for i in range(3):
        train.report(epoch=i)


trainer = Trainer(backend="torch", num_workers=2)
trainer.start()

# Run the training function, logging all intermediate results
# to MLflow and TensorBoard.
result = trainer.run(
    train_func,
    callbacks=[
        MLflowLoggerCallback(experiment_name="train_experiment"),
        TBXLoggerCallback(),
    ],
)

# Print the latest run directory and keep note of it.
# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001
print("Run directory:", trainer.latest_run_dir)

trainer.shutdown()
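To browse the logged results afterwards, point TensorBoard at the printed run directory (`tensorboard --logdir <run_dir>`) and open the MLflow UI (`mlflow ui`, which reads the local `./mlruns` store MLflow defaults to when no tracking URI is configured).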
Example #6
        "num_hidden": NUM_HIDDEN,
        "num_layers": NUM_LAYERS,
        "dropout_every": DROPOUT_EVERY,
        "dropout_prob": DROPOUT_PROB,
        "num_features": num_features,
    }

    # Create two callbacks: one for TensorBoard logging and one for MLflow
    # logging. Pass these into the Trainer, and all results reported by
    # ``train.report()`` will be logged to both places.
    # TODO: TBXLoggerCallback should create nonexistent logdir
    #       and should also create 1 directory per file.
    tbx_runs_dir = os.path.join(dir_path, "runs")
    os.makedirs(tbx_runs_dir, exist_ok=True)
    callbacks = [
        TBXLoggerCallback(logdir=tbx_runs_dir),
        MLflowLoggerCallback(experiment_name="cuj-big-data-training",
                             save_artifact=True),
    ]

    # Remove CPU resource so Datasets can be scheduled.
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = Trainer(
        backend="torch",
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
    )
    trainer.start()
    results = trainer.run(train_func=train_func,
                          config=config,
                          callbacks=callbacks,
                          dataset=datasets)
    trainer.shutdown()
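Setting `resources_per_worker={"CPU": 0, "GPU": 1}` reserves a GPU but no logical CPUs for each training worker, which is what lets the Ray Datasets preprocessing tasks be scheduled on the cluster's CPUs.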
Example #7
        "num_hidden": NUM_HIDDEN,
        "num_layers": NUM_LAYERS,
        "dropout_every": DROPOUT_EVERY,
        "dropout_prob": DROPOUT_PROB,
        "num_features": num_features,
    }

    # Create two callbacks: one for TensorBoard logging and one for MLflow
    # logging. Pass these into the Trainer, and all results reported by
    # ``train.report()`` will be logged to both places.
    # TODO: TBXLoggerCallback should create nonexistent logdir
    #       and should also create 1 directory per file.
    tbx_logdir = "./runs"
    os.makedirs(tbx_logdir, exist_ok=True)
    callbacks = [
        TBXLoggerCallback(logdir=tbx_logdir),
        MLflowLoggerCallback(experiment_name="cuj-big-data-training",
                             save_artifact=True),
    ]

    # Remove CPU resource so Datasets can be scheduled.
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = Trainer(
        backend="torch",
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
    )
    trainer.start()
    results = trainer.run(train_func=train_func,
                          config=config,
                          callbacks=callbacks,
                          dataset=datasets)
    trainer.shutdown()
Example #8
        "batch_size": BATCH_SIZE,
        "num_hidden": NUM_HIDDEN,
        "num_layers": NUM_LAYERS,
        "dropout_every": DROPOUT_EVERY,
        "dropout_prob": DROPOUT_PROB,
        "num_features": num_features
    }

    # Create two callbacks: one for TensorBoard logging and one for MLflow
    # logging. Pass these into the Trainer, and all results reported by
    # ``train.report()`` will be logged to both places.
    # TODO: TBXLoggerCallback should create nonexistent logdir
    #       and should also create 1 directory per file.
    tbx_logdir = "./runs"
    os.makedirs(tbx_logdir, exist_ok=True)
    callbacks = [TBXLoggerCallback(logdir=tbx_logdir), MLflowCallback(config)]

    # Remove CPU resource so Datasets can be scheduled.
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = Trainer(backend="torch",
                      num_workers=num_workers,
                      use_gpu=use_gpu,
                      resources_per_worker=resources_per_worker)
    trainer.start()
    results = trainer.run(train_func=train_func,
                          config=config,
                          callbacks=callbacks,
                          dataset=datasets)
    model = results[0]
    trainer.shutdown()
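With this legacy Trainer API, `trainer.run()` returns a list with one entry per worker (whatever that worker's `train_func` returned), so `results[0]` picks up the model returned by the rank-0 worker.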
Example #9
        "use_gpu": use_gpu,
        "num_epochs": NUM_EPOCHS,
        "batch_size": BATCH_SIZE,
        "num_hidden": NUM_HIDDEN,
        "num_layers": NUM_LAYERS,
        "dropout_every": DROPOUT_EVERY,
        "dropout_prob": DROPOUT_PROB,
        "num_features": num_features
    }

    # Create two callbacks: one for TensorBoard logging and one for MLflow
    # logging. Pass these into the Trainer, and all results reported by
    # ``train.report()`` will be logged to both places.
    # TODO: TBXLoggerCallback should create nonexistent logdir
    #       and should also create 1 directory per file.
    callbacks = [TBXLoggerCallback(logdir="/tmp"), MLflowCallback(config)]

    # Remove CPU resource so Datasets can be scheduled.
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = Trainer(backend="torch",
                      num_workers=num_workers,
                      use_gpu=use_gpu,
                      resources_per_worker=resources_per_worker)
    trainer.start()
    results = trainer.run(train_func=train_func,
                          config=config,
                          callbacks=callbacks,
                          dataset=datasets)
    model = results[0]
    trainer.shutdown()
Example #10
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--address",
                        required=False,
                        type=str,
                        help="the address to use for Ray")
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=2,
        help="Sets number of workers for training.",
    )
    parser.add_argument("--use-gpu",
                        action="store_true",
                        default=False,
                        help="Enables GPU training")

    args = parser.parse_args()

    ray.init(address=args.address)

    callbacks = [TorchTensorboardProfilerCallback(), TBXLoggerCallback()]
    trainer = Trainer(backend="torch",
                      num_workers=args.num_workers,
                      use_gpu=args.use_gpu)
    trainer.start()
    trainer.run(train_func, callbacks=callbacks)
    trainer.shutdown()
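Run as a script, this connects to an existing Ray cluster when `--address` is given and starts a local one otherwise; `TorchTensorboardProfilerCallback` is a profiler callback presumably imported or defined earlier in the same file. A typical invocation (the filename is hypothetical):

python torch_profiler_example.py --num-workers 4 --use-gpu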