Example #1
0
def main():
    """Build a synthetic MLDataset, train a TF model on 2 replicas,
    then print a single prediction from the trained model.
    """
    total = 32 * 100 * 2
    step = 1 / total
    # Evenly spaced points in [0, 1); each item becomes a [feature, label] pair.
    points = [idx * step for idx in range(total)]
    it = parallel_it.from_items(points, 2, False).for_each(lambda v: [v, v])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)

    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    # 10 epochs of distributed training.
    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    print("f(0.5)=", float(model.predict([0.5])))
Example #2
0
def train_example(num_replicas=1, batch_size=128, use_gpu=False):
    """Run a two-epoch training sanity check.

    Validates before and after training and reports whether the
    validation loss and MSE went down (success) or up (failure).
    """
    trainer = TFTrainer(
        model_creator=simple_model,
        data_creator=simple_dataset,
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        verbose=True,
        config=create_config(batch_size),
    )

    # Baseline metrics before any training.
    before = trainer.validate()
    print(before)

    # Train for 2 epochs.
    for _ in range(2):
        trainer.train()

    # Metrics after training (should improve over the baseline).
    after = trainer.validate()
    print(after)

    # Negative deltas mean the model improved.
    dloss = after["validation_loss"] - before["validation_loss"]
    dmse = (after["validation_mean_squared_error"] -
            before["validation_mean_squared_error"])
    print(f"dLoss: {dloss}, dMSE: {dmse}")

    if dloss > 0 or dmse > 0:
        print("training sanity check failed. loss increased!")
    else:
        print("success!")
Example #3
0
def main(smoke_test,
         num_replicas,
         use_gpu=False,
         augment_data=False,
         batch_size=32):
    """Train distributed via TFTrainer, then retrain the retrieved model
    locally for one epoch and evaluate it, timing both phases.

    Args:
        smoke_test: If True, run only 10 steps and 1 epoch for a quick check.
        num_replicas: Number of training workers for the TFTrainer.
        use_gpu: Whether workers should use GPUs.
        augment_data: Use the augmenting data creator instead of the plain one.
        batch_size: Per-step batch size.
    """
    data_size = 60000
    test_size = 10000

    num_train_steps = 10 if smoke_test else data_size // batch_size
    num_eval_steps = 10 if smoke_test else test_size // batch_size

    trainer = TFTrainer(
        model_creator=create_model,
        data_creator=(data_augmentation_creator
                      if augment_data else data_creator),
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        verbose=True,
        config={
            "batch_size": batch_size,
            "fit_config": {
                "steps_per_epoch": num_train_steps,
            },
            "evaluate_config": {
                "steps": num_eval_steps,
            },
        },
    )

    training_start = time.time()
    num_epochs = 1 if smoke_test else 3
    for i in range(num_epochs):
        # Trains one epoch; merge in validation metrics for the report.
        train_stats1 = trainer.train()
        train_stats1.update(trainer.validate())
        print(f"iter {i}:", train_stats1)

    # Fix: divide by the number of epochs actually run (was a hard-coded 3,
    # which misreported seconds/epoch in smoke-test mode where num_epochs=1).
    dt = (time.time() - training_start) / num_epochs
    print(f"Training on workers takes: {dt:.3f} seconds/epoch")

    # Pull the trained model out before tearing down the workers.
    model = trainer.get_model()
    trainer.shutdown()
    dataset, test_dataset = data_augmentation_creator(
        dict(batch_size=batch_size))

    training_start = time.time()
    model.fit(dataset, steps_per_epoch=num_train_steps, epochs=1)
    dt = time.time() - training_start
    # Fix: this phase runs in the driver process, not on workers — the
    # original message was a copy-paste of the distributed-phase print.
    print(f"Training locally takes: {dt:.3f} seconds/epoch")

    scores = model.evaluate(test_dataset, steps=num_eval_steps)
    print("Test loss:", scores[0])
    print("Test accuracy:", scores[1])
Example #4
0
    step_size_train = train_utils.get_step_size(train_total_items,
                                                args.batch_size)
    step_size_val = train_utils.get_step_size(val_total_items, args.batch_size)

    num_train_steps = 10 if args.smoke_test else step_size_train
    num_eval_steps = 10 if args.smoke_test else step_size_val

    trainer = TFTrainer(model_creator=model_creator,
                        data_creator=dataset_creator,
                        num_replicas=args.num_replicas,
                        use_gpu=args.use_gpu,
                        verbose=True,
                        config={
                            "batch_size": args.batch_size,
                            "fit_config": {
                                "steps_per_epoch": num_train_steps,
                            },
                            "evaluate_config": {
                                "steps": num_eval_steps,
                            },
                            "opt": args,
                            "hyper_params": hyper_params,
                            "ssd_model_path": ssd_model_path
                        })

    checkpoint_callback = ModelCheckpoint(ssd_model_path,
                                          monitor="val_loss",
                                          save_best_only=True,
                                          save_weights_only=True)
    tensorboard_callback = TensorBoard(log_dir=ssd_log_path)
    learning_rate_callback = LearningRateScheduler(train_utils.scheduler,
Example #5
0
    ray.init(address=args.address)
    data_size = 60000
    test_size = 10000
    batch_size = args.batch_size

    num_train_steps = 10 if args.smoke_test else data_size // batch_size
    num_eval_steps = 10 if args.smoke_test else test_size // batch_size

    trainer = TFTrainer(
        model_creator=create_model,
        data_creator=(data_augmentation_creator
                      if args.augment_data else data_creator),
        num_replicas=args.num_replicas,
        use_gpu=args.use_gpu,
        verbose=True,
        config={
            "batch_size": batch_size,
            "fit_config": {
                "steps_per_epoch": num_train_steps,
            },
            "evaluate_config": {
                "steps": num_eval_steps,
            }
        })

    training_start = time.time()
    for i in range(3):
        # Trains num epochs
        train_stats1 = trainer.train()
        train_stats1.update(trainer.validate())
        print("iter {}:".format(i), train_stats1)