def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    # Build a parallel iterator with 2 shards, no repetition, where each
    # record becomes a [feature, label] pair.
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # This creates an MLDataset whose columns are RangeIndex(range(2)).
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)

    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    print("f(0.5)=", float(model.predict([0.5])))
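# `model_creator` and `make_data_creator` are defined elsewhere in this
# example. A minimal sketch of `make_data_creator`, assuming TFTrainer
# calls `data_creator(config)` once per worker and that the TFMLDataset
# returned by `ds.to_tf(...)` exposes `get_shard(shard_index)`; the
# shard-selection logic here is illustrative, not the exact original.
import json
import os


def make_data_creator(tf_ds):
    def data_creator(config):
        # Each worker picks its shard index from the TF_CONFIG environment
        # set up by the distributed backend, then batches and repeats it.
        tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
        shard_index = tf_config.get("task", {}).get("index", 0)
        train_ds = (tf_ds.get_shard(shard_index)
                    .batch(config["batch_size"]).repeat())
        return train_ds, None  # no separate validation dataset

    return data_creator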
def train_example(num_replicas=1, batch_size=128, use_gpu=False):
    trainer = TFTrainer(
        model_creator=simple_model,
        data_creator=simple_dataset,
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        verbose=True,
        config=create_config(batch_size),
    )

    # model baseline performance
    start_stats = trainer.validate()
    print(start_stats)

    # train for 2 epochs
    trainer.train()
    trainer.train()

    # model performance after training (should improve)
    end_stats = trainer.validate()
    print(end_stats)

    # sanity check that training worked
    dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
    dmse = (end_stats["validation_mean_squared_error"] -
            start_stats["validation_mean_squared_error"])
    print(f"dLoss: {dloss}, dMSE: {dmse}")

    if dloss > 0 or dmse > 0:
        print("training sanity check failed. loss increased!")
    else:
        print("success!")
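# `simple_model`, `simple_dataset`, and `create_config` live alongside
# this function in the example. A hedged sketch with compatible shapes
# (a one-variable linear regression); the originals may differ in detail.
import numpy as np
import tensorflow as tf


def simple_model(config):
    model = tf.keras.models.Sequential(
        [tf.keras.layers.Dense(1, input_shape=(1,))])
    model.compile(
        optimizer="sgd",
        loss="mean_squared_error",
        metrics=["mean_squared_error"])
    return model


def simple_dataset(config):
    batch_size = config["batch_size"]
    x = np.random.rand(1024).astype(np.float32)
    y = 2 * x + 1  # the linear target the model should recover
    train = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)
    test = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)
    return train, test


def create_config(batch_size):
    return {
        "batch_size": batch_size,
        "fit_config": {"steps_per_epoch": 1024 // batch_size},
        "evaluate_config": {"steps": 1024 // batch_size},
    }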
def main(smoke_test,
         num_replicas,
         use_gpu=False,
         augment_data=False,
         batch_size=32):
    data_size = 60000
    test_size = 10000
    num_train_steps = 10 if smoke_test else data_size // batch_size
    num_eval_steps = 10 if smoke_test else test_size // batch_size

    trainer = TFTrainer(
        model_creator=create_model,
        data_creator=(data_augmentation_creator
                      if augment_data else data_creator),
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        verbose=True,
        config={
            "batch_size": batch_size,
            "fit_config": {
                "steps_per_epoch": num_train_steps,
            },
            "evaluate_config": {
                "steps": num_eval_steps,
            },
        },
    )

    training_start = time.time()
    num_epochs = 1 if smoke_test else 3
    for i in range(num_epochs):
        train_stats = trainer.train()
        train_stats.update(trainer.validate())
        print(f"iter {i}:", train_stats)

    dt = (time.time() - training_start) / num_epochs
    print(f"Training on workers takes: {dt:.3f} seconds/epoch")

    model = trainer.get_model()
    trainer.shutdown()

    # Retrain locally on the same pipeline as a point of comparison.
    dataset, test_dataset = data_augmentation_creator(
        dict(batch_size=batch_size))
    training_start = time.time()
    model.fit(dataset, steps_per_epoch=num_train_steps, epochs=1)
    dt = time.time() - training_start
    print(f"Training locally takes: {dt:.3f} seconds/epoch")

    scores = model.evaluate(test_dataset, steps=num_eval_steps)
    print("Test loss:", scores[0])
    print("Test accuracy:", scores[1])
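# `data_creator` and `data_augmentation_creator` are defined with this
# example. A hedged sketch of the augmented variant, assuming the stock
# Keras MNIST dataset; the augmentation itself is illustrative, not the
# original's.
import tensorflow as tf


def data_augmentation_creator(config):
    batch_size = config["batch_size"]
    (x_train, y_train), (x_test, y_test) = (
        tf.keras.datasets.mnist.load_data())
    x_train = x_train[..., None].astype("float32") / 255.0
    x_test = x_test[..., None].astype("float32") / 255.0

    def augment(image, label):
        # Small random brightness jitter so each epoch sees slightly
        # different images.
        return tf.image.random_brightness(image, max_delta=0.1), label

    train = (tf.data.Dataset.from_tensor_slices((x_train, y_train))
             .shuffle(len(x_train)).map(augment).batch(batch_size)
             .repeat())
    test = tf.data.Dataset.from_tensor_slices(
        (x_test, y_test)).batch(batch_size)
    return train, test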
step_size_train = train_utils.get_step_size(train_total_items,
                                            args.batch_size)
step_size_val = train_utils.get_step_size(val_total_items, args.batch_size)
num_train_steps = 10 if args.smoke_test else step_size_train
num_eval_steps = 10 if args.smoke_test else step_size_val

trainer = TFTrainer(
    model_creator=model_creator,
    data_creator=dataset_creator,
    num_replicas=args.num_replicas,
    use_gpu=args.use_gpu,
    verbose=True,
    config={
        "batch_size": args.batch_size,
        "fit_config": {
            "steps_per_epoch": num_train_steps,
        },
        "evaluate_config": {
            "steps": num_eval_steps,
        },
        "opt": args,
        "hyper_params": hyper_params,
        "ssd_model_path": ssd_model_path,
    })

# Keras callbacks: checkpoint the best weights, log to TensorBoard, and
# apply the learning-rate schedule from train_utils.
checkpoint_callback = ModelCheckpoint(
    ssd_model_path,
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True)
tensorboard_callback = TensorBoard(log_dir=ssd_log_path)
learning_rate_callback = LearningRateScheduler(train_utils.scheduler,
                                               verbose=0)
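# The original snippet is cut off before these callbacks are attached.
# One plausible wiring, assuming TFTrainer forwards "fit_config" entries
# verbatim as keyword arguments to `model.fit` on each worker; this
# "callbacks" key is an assumption, not confirmed by the snippet:
#
#     config["fit_config"]["callbacks"] = [
#         checkpoint_callback,
#         tensorboard_callback,
#         learning_rate_callback,
#     ]
#
# (set before constructing the trainer, so every worker checkpoints,
# logs to TensorBoard, and follows the learning-rate schedule).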
ray.init(address=args.address)

data_size = 60000
test_size = 10000
batch_size = args.batch_size
num_train_steps = 10 if args.smoke_test else data_size // batch_size
num_eval_steps = 10 if args.smoke_test else test_size // batch_size

trainer = TFTrainer(
    model_creator=create_model,
    data_creator=(data_augmentation_creator
                  if args.augment_data else data_creator),
    num_replicas=args.num_replicas,
    use_gpu=args.use_gpu,
    verbose=True,
    config={
        "batch_size": batch_size,
        "fit_config": {
            "steps_per_epoch": num_train_steps,
        },
        "evaluate_config": {
            "steps": num_eval_steps,
        },
    })

training_start = time.time()
for i in range(3):  # train for three epochs
    train_stats = trainer.train()
    train_stats.update(trainer.validate())
    print("iter {}:".format(i), train_stats)
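# The `args` namespace above comes from argparse. A minimal sketch of a
# parser providing every field this snippet reads; the defaults and help
# strings are illustrative, not the original's.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--address", type=str, default=None,
                    help="Address of the Ray cluster to connect to.")
parser.add_argument("--batch-size", type=int, default=128)
parser.add_argument("--num-replicas", type=int, default=1)
parser.add_argument("--use-gpu", action="store_true")
parser.add_argument("--augment-data", action="store_true")
parser.add_argument("--smoke-test", action="store_true",
                    help="Run a short sanity-check configuration.")
args = parser.parse_args()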