def main(args):
    """Train, evaluate, and save the model under SageMaker parameter-server
    distribution.

    The number of hosts in the cluster is used as the world size when scaling
    steps per epoch; checkpoints, history, and the final model are written
    only on the first host.

    Args:
        args: Parsed command-line namespace (SageMaker training arguments).
    """
    # SageMaker stages the source bundle under .../source/sourcedir.tar.gz;
    # redirect TensorBoard logs to the sibling 'model' directory instead.
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    logging.info("getting data")
    train_dataset = process_input(args.epochs, args.batch_size, args.train,
                                  'train', args.data_config)
    eval_dataset = process_input(args.epochs, args.batch_size, args.eval,
                                 'eval', args.data_config)
    validation_dataset = process_input(args.epochs, args.batch_size,
                                       args.validation, 'validation',
                                       args.data_config)

    logging.info("configuring model")
    # os.environ.get returns None when SM_HOSTS is unset; default to '' so
    # the string concatenation cannot raise TypeError.
    logging.info("Hosts: " + os.environ.get('SM_HOSTS', ''))

    # World size = number of hosts participating in training.
    size = len(args.hosts)
    model = get_model(args.learning_rate, args.weight_decay, args.optimizer,
                      args.momentum, size)

    callbacks = []
    # Checkpoint/TensorBoard only on the first (master) host to avoid
    # concurrent writes from multiple workers.
    if args.current_host == args.hosts[0]:
        callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(CustomTensorBoardCallback(log_dir=tensorboard_dir))

    logging.info("Starting training")
    history = model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size) // size,
        epochs=args.epochs,
        validation_data=validation_dataset,
        validation_steps=(num_examples_per_epoch('validation') // args.batch_size) // size,
        callbacks=callbacks)

    score = model.evaluate(
        eval_dataset[0],
        eval_dataset[1],
        steps=num_examples_per_epoch('eval') // args.batch_size,
        verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # PS: Save model and history only on worker 0
    if args.current_host == args.hosts[0]:
        save_history(args.model_dir + "/ps_history.p", history)
        save_model(model, args.model_dir)
def model_fn(model_dir):
    """Load the trained model for inference.

    Args:
        model_dir: Directory containing the serialized ``model.pth``.

    Returns:
        The model in eval mode, moved to GPU when available.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_model()
    # map_location lets a checkpoint that was saved on GPU load correctly
    # on a CPU-only inference host (otherwise torch.load raises).
    model.load_state_dict(
        torch.load(os.path.join(model_dir, 'model.pth'), map_location=device))
    model.eval()
    return model.to(device)
def train():
    """Train the PyTorch regression model and stage artifacts for deployment.

    Reads hyper-parameters and data locations from the module-level ``args``,
    trains with SGD on MSE loss, reports test MSE, then saves the weights
    plus the inference code that SageMaker expects inside the model bundle.
    """
    # Load the train/test splits from the channel directories.
    x_train, y_train = get_train_data(args.train)
    x_test, y_test = get_test_data(args.test)

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    logger.info('batch_size = {}, epochs = {}, learning rate = {}'.format(
        batch_size, epochs, learning_rate))

    loader = DataLoader(TensorDataset(x_train, y_train), batch_size,
                        shuffle=True)

    model = get_model().to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        for batch_x, batch_y in loader:
            predictions = model(batch_x.float())
            loss = criterion(predictions.flatten(), batch_y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Log the last batch's loss; epochs are reported 1-based.
        logger.info(f'epoch: {epoch + 1} -> loss: {loss}')

    # Evaluate on the held-out test set without tracking gradients.
    with torch.no_grad():
        predictions = model(x_test.float()).flatten()
        mse = ((predictions - y_test) ** 2).sum() / y_test.shape[0]
        print("\nTest MSE:", mse.numpy())

    torch.save(model.state_dict(), args.model_dir + '/model.pth')

    # PyTorch requires that the inference script must
    # be in the .tar.gz model file and Step Functions SDK doesn't do this.
    inference_code_path = args.model_dir + '/code/'

    if not os.path.exists(inference_code_path):
        os.mkdir(inference_code_path)
        logger.info('Created a folder at {}!'.format(inference_code_path))

    shutil.copy('train_deploy.py', inference_code_path)
    shutil.copy('model_def.py', inference_code_path)
    logger.info('Saving models files to {}'.format(inference_code_path))
def main():
    """Scaffold a new numbered experiment directory and save the initial model.

    Creates ``experiments/<n+1>/`` (where ``n`` is the highest existing
    experiment number) with subdirectories for TensorBoard logs, checkpoints,
    and generated output, then writes the notes, model architecture, config,
    and initial weights into it.
    """
    args = parse_args()

    # Find the highest existing experiment number. Ignore entries that are
    # not plain integers so stray files (e.g. .DS_Store) don't crash int().
    most_recent = 0
    for d in os.listdir('experiments'):
        if d.isdigit():
            most_recent = max(most_recent, int(d))

    exp_dir = os.path.join('experiments', str(most_recent + 1))
    os.mkdir(exp_dir)
    os.mkdir(os.path.join(exp_dir, 'tb_logs'))
    os.mkdir(os.path.join(exp_dir, 'checkpoints'))
    os.mkdir(os.path.join(exp_dir, 'generated'))

    # Create notes file with notes from cmd line
    with io.open(os.path.join(exp_dir, 'README.txt'), 'w') as f:
        f.write(args.notes)
    print('Scaffolded new experiment directory')

    # Create model
    model = model_def.get_model(args.max_words, args.sequence_length,
                                args.rnn_size, args.rnn_layers)
    m_json = model.to_json()

    # Save model architecture, run config, and initial weights.
    with io.open(os.path.join(exp_dir, 'model.json'), 'w') as f:
        f.write(m_json)
    with io.open(os.path.join(exp_dir, 'config.json'), 'w') as f:
        json.dump(vars(args), f)
    model.save_weights(os.path.join(exp_dir, 'checkpoints', '_initial.h5'))

    # Note the space before 'created!' — without it the message reads
    # "Experiment 5created!".
    print('Experiment ' + str(most_recent + 1) +
          ' created! Train with "train.py --exp ' + str(most_recent + 1) + '"')
def main(args):
    """Run Horovod-distributed training, evaluate, and save history.

    Each process is pinned to one GPU; checkpoints, TensorBoard logs, and the
    pickled history are produced only by the first worker (rank 0).
    """
    # Hyper-parameters from the CLI.
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer

    # SageMaker channel/output locations.
    gpu_count = args.gpu_count
    model_dir = args.model_dir
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval

    # Initialize Horovod; `size` is the number of workers in the cluster.
    hvd.init()
    size = hvd.size()

    # One GPU per process: pin this worker to its local rank's device.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    train_dataset = make_batch(training_dir + '/train.tfrecords', batch_size)
    val_dataset = make_batch(validation_dir + '/validation.tfrecords',
                             batch_size)
    eval_dataset = make_batch(eval_dir + '/eval.tfrecords', batch_size)

    input_shape = (HEIGHT, WIDTH, DEPTH)

    # Sync initial variable state across workers, average metrics, warm up
    # the learning rate, and reduce it on plateau.
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
    ]
    # Checkpoints and TensorBoard/S3 sync come only from the first worker.
    if hvd.rank() == 0:
        callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        logdir = args.output_data_dir + '/' + datetime.now().strftime(
            "%Y%m%d-%H%M%S")
        callbacks.append(TensorBoard(log_dir=logdir, profile_batch=0))
        callbacks.append(Sync2S3(logdir=logdir, s3logdir=model_dir))

    model = get_model(lr, weight_decay, optimizer, momentum, hvd)

    # Each worker runs 1/size of the steps per epoch.
    history = model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(NUM_TRAIN_IMAGES // batch_size) // size,
        validation_data=val_dataset,
        validation_steps=(NUM_VALID_IMAGES // batch_size) // size,
        epochs=epochs,
        callbacks=callbacks)

    # Evaluate model performance on the eval split.
    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=NUM_TEST_IMAGES // args.batch_size,
                           verbose=0)
    print('Test loss :', score[0])
    print('Test accuracy:', score[1])

    # Save the training history only on the first worker.
    if hvd.rank() == 0:
        save_history(args.output_data_dir + "/hvd_history.p", history)
def main():
    """Train a CNN on Tiny ImageNet, evaluate on the val split, and save a
    checkpoint.

    Paths are hard-coded to the author's machine; the checkpoint filename
    embeds the achieved accuracy and a UTC timestamp.
    """
    device = torch.device("cuda:0" if cuda.is_available() else "cpu")
    num_epochs = 5
    batch_size = 100

    print("CSE666: Biometrics\nAssignment 1\n\nStarting training\n")
    start = time.time()

    # Light augmentation for training only; the test set gets plain ToTensor.
    data_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(0.2),
        transforms.RandomVerticalFlip(0.2),
        transforms.ToTensor()
    ])

    data_dir = '/home/ved/PycharmProjects/CSE666_ass_1/tiny-imagenet-200/'
    op_dir = '/home/ved/PycharmProjects/CSE666_ass_1/op/'

    train_dataset = datasets.ImageFolder(data_dir + 'train',
                                         transform=data_transform)
    test_dataset = datasets.ImageFolder(data_dir + 'val',
                                        transform=transforms.ToTensor())
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size)
    test_loader = data.DataLoader(test_dataset, batch_size=batch_size)

    model, criterion, optimizer = mod.get_model(device)

    total_len = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            # Move the batch to the training device.
            inp = images.to(device)
            lab = labels.to(device)

            # Standard step: reset grads, forward, loss, backward, update.
            optimizer.zero_grad()
            out = model(inp)
            loss = criterion(out, lab)
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print("Epoch " + str(epoch) + " Step " + str(i + 1) + "/" +
                      str(total_len), end="\t")
                print("Running Loss data: ", loss.data)

    # Switch to eval mode (disables dropout/batch-norm updates) for testing.
    model.eval()
    lap = time.time()
    print("Training completed in {0}secs/{1}hrs".format(
        lap - start, (lap - start) / 3600))

    # torch.no_grad() used to reduce gradient calculation which is not
    # needed for testing.
    with torch.no_grad():
        correct = 0
        total = 0
        test_len = len(test_loader)
        print("\n\nRunning test\n")
        for i, (images, labels) in enumerate(test_loader):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            print(predicted, labels)
            print("----------------")
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            if i % 20 == 0:
                print("Step " + str(i + 1) + "/" + str(test_len))

        accu = (100 * correct) / total
        print('Accuracy of the model on the test images: {:.2f} %'.format(accu))
        print('Correct: {0}\nTotal: {1}'.format(correct, total))

        # NOTE(review): ':' in the filename is invalid on Windows filesystems
        # — fine for this Linux path, but worth confirming if ever ported.
        filename = op_dir + 'A1_A:{:.2f}_'.format(accu) + str(
            time.strftime("%Y%m%d_%H%M%S", time.gmtime())) + '.ckpt'
        torch.save(model.state_dict(), filename)

    end = time.time()
    print("Model saved. Program completed in {0}secs/{1}hrs".format(
        end - start, (end - start) / 3600))
def main(args):
    """Train with optional Horovod (MPI) distribution, evaluate, and save.

    When SageMaker enables MPI, Horovod is initialized and per-worker
    callbacks/step counts are used; otherwise training runs single-process.

    Args:
        args: Parsed command-line namespace (SageMaker training arguments).
    """
    mpi = False
    # Bind hvd unconditionally so the later get_model(..., mpi, hvd) call
    # cannot hit a NameError when MPI is configured but disabled.
    hvd = None

    # SageMaker stages the source bundle under .../source/sourcedir.tar.gz;
    # redirect TensorBoard logs to the sibling 'model' directory instead.
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    if args.fw_params.get('sagemaker_mpi_enabled'):
        import horovod.tensorflow.keras as hvd
        mpi = True
        # Horovod: initialize, then pin this process to its own GPU
        # (one GPU per process).
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
    logging.info("Running with MPI={}".format(mpi))

    logging.info("getting data")
    train_dataset = process_input(args.epochs, args.batch_size, args.train,
                                  'train', args.data_config)
    eval_dataset = process_input(args.epochs, args.batch_size, args.eval,
                                 'eval', args.data_config)
    validation_dataset = process_input(args.epochs, args.batch_size,
                                       args.validation, 'validation',
                                       args.data_config)

    logging.info("configuring model")
    model = get_model(args.learning_rate, args.weight_decay, args.optimizer,
                      args.momentum, 1, mpi, hvd)

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))
        callbacks.append(
            tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
    # Checkpoint and TensorBoard logging only on the master process
    # (rank 0 under MPI, or the single process otherwise).
    if not mpi or hvd.rank() == 0:
        callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(CustomTensorBoardCallback(log_dir=tensorboard_dir))

    logging.info("Starting training")
    size = hvd.size() if mpi else 1
    history = model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size) // size,
        epochs=args.epochs,
        validation_data=validation_dataset,
        validation_steps=(num_examples_per_epoch('validation') // args.batch_size) // size,
        callbacks=callbacks)

    score = model.evaluate(
        eval_dataset[0],
        eval_dataset[1],
        steps=num_examples_per_epoch('eval') // args.batch_size,
        verbose=0)
    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: Save model and history only on worker 0 (i.e. master);
    # other MPI workers return None.
    if not mpi or hvd.rank() == 0:
        save_history(args.model_dir + "/hvd_history.p", history)
        return save_model(model, args.model_output_dir)
def main(args):
    """Horovod-distributed training entry point: train, evaluate, save history.

    One process per GPU; only rank 0 writes checkpoints, TensorBoard logs
    (synced to S3), and the pickled training history.
    """
    # Hyper-parameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer

    # SageMaker options
    gpu_count = args.gpu_count
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval
    tensorboard_logs = args.tensorboard_logs

    # Initialize Horovod and read the cluster size.
    hvd.init()
    size = hvd.size()

    # Pin this process to its local rank's GPU (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    train_dataset = get_dataset(training_dir + '/train.tfrecords', batch_size)
    val_dataset = get_dataset(validation_dir + '/validation.tfrecords',
                              batch_size)
    eval_dataset = get_dataset(eval_dir + '/eval.tfrecords', batch_size)

    input_shape = (HEIGHT, WIDTH, DEPTH)

    # Broadcast initial state, average metrics across workers, warm up the
    # learning rate, and reduce it on plateau.
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
    ]
    # Checkpoints, TensorBoard logging, and S3 sync only on the 1st worker.
    if hvd.rank() == 0:
        callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        logdir = args.output_data_dir + '/' + datetime.now().strftime(
            "%Y%m%d-%H%M%S")
        callbacks.append(TensorBoard(log_dir=logdir))
        callbacks.append(Sync2S3(logdir=logdir, s3logdir=tensorboard_logs))

    model = get_model(input_shape, lr, weight_decay, optimizer, momentum, hvd)
    # To use ResNet model instead of custom model comment the above line and
    # uncomment the following:
    #model = get_resnet_model(input_shape, lr, weight_decay, optimizer, momentum, hvd)

    # Each worker runs 1/size of the steps per epoch; only rank 0 is verbose.
    history = model.fit(
        train_dataset,
        steps_per_epoch=(NUM_TRAIN_IMAGES // batch_size) // size,
        validation_data=val_dataset,
        validation_steps=(NUM_VALID_IMAGES // batch_size) // size,
        verbose=1 if hvd.rank() == 0 else 0,
        epochs=epochs,
        callbacks=callbacks)

    # Evaluate model performance
    score = model.evaluate(eval_dataset,
                           steps=NUM_TEST_IMAGES // args.batch_size,
                           verbose=0)
    print('Test loss :', score[0])
    print('Test accuracy:', score[1])

    if hvd.rank() == 0:
        save_history(args.output_data_dir + "/hvd_history.p", history)
# Train a Keras regression model on CPU, report test MSE, and export the
# weights plus a TensorFlow-Serving SavedModel for SageMaker deployment.
args, _ = parse_args()

x_train, y_train = get_train_data(args.train)
x_test, y_test = get_test_data(args.test)

device = '/cpu:0'
print(device)

batch_size = args.batch_size
epochs = args.epochs
learning_rate = args.learning_rate
print('batch_size = {}, epochs = {}, learning rate = {}'.format(
    batch_size, epochs, learning_rate))

with tf.device(device):
    # Compile and fit a simple MSE regressor with plain gradient descent.
    model = get_model()
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    model.compile(optimizer=optimizer, loss='mse')
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))

    # evaluate on test set
    scores = model.evaluate(x_test, y_test, batch_size, verbose=2)
    print("Test MSE :", scores)

    # save checkpoint for locally loading in notebook
    saver = tfe.Saver(model.variables)
    saver.save(args.model_dir + '/weights.ckpt')

    # create a separate SavedModel for deployment to a SageMaker endpoint
    # with TensorFlow Serving
    tf.contrib.saved_model.save_keras_model(model, args.model_dir)
x_val = np.load(os.path.join(val_dir, 'x_val.npy')) y_val = np.load(os.path.join(val_dir, 'y_val.npy')) print('x val', x_val.shape, 'y val', y_val.shape) return x_val, y_val if __name__ == "__main__": args, _ = parse_args() x_train, y_train = get_train_data(args.train) x_val, y_val = get_val_data(args.val) model = get_model(args.embedding, args.num_words, args.word_index_len, args.labels_index_len, args.embedding_dim, args.max_sequence_len) model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc']) model.fit(x_train, y_train, batch_size=args.batch_size, epochs=args.epochs, validation_data=(x_val, y_val)) # create a TensorFlow SavedModel for deployment to a SageMaker endpoint with TensorFlow Serving tf.contrib.saved_model.save_keras_model(model, args.model_dir)
def main(args):
    """Train, evaluate, and save the model (SageMaker parameter-server style).

    A fixed, pre-tuned Adam learning rate is used for the optimizer passed to
    ``get_model`` (``args.learning_rate`` is still forwarded separately).
    Checkpoints, history, and the final model are written only on the first
    host.

    Args:
        args: Parsed command-line namespace (SageMaker training arguments).
    """
    # SageMaker stages the source bundle under .../source/sourcedir.tar.gz;
    # redirect TensorBoard logs to the sibling 'model' directory instead.
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    logging.info("getting data")
    train_dataset = process_input(args.epochs, args.batch_size, args.train,
                                  'train', args.data_config)
    eval_dataset = process_input(args.epochs, args.batch_size, args.eval,
                                 'eval', args.data_config)
    validation_dataset = process_input(args.epochs, args.batch_size,
                                       args.validation, 'validation',
                                       args.data_config)

    # Learning rate found by prior tuning for this task; kept hard-coded
    # deliberately rather than read from args.
    crime_lr = 1.1999607522739098e-06
    optimizer = tf.keras.optimizers.Adam(learning_rate=crime_lr)

    logging.info("configuring model")
    # os.environ.get returns None when SM_HOSTS is unset; default to '' so
    # the string concatenation cannot raise TypeError.
    logging.info("Hosts: " + os.environ.get('SM_HOSTS', ''))
    size = len(args.hosts)
    model = get_model(args.learning_rate, args.weight_decay, optimizer,
                      args.momentum, size)

    callbacks = []
    # Checkpoint only on the first (master) host to avoid concurrent writes.
    if args.current_host == args.hosts[0]:
        callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))

    logging.info("Starting training")
    # Datasets carry their own cardinality, so fit/evaluate run without
    # explicit step counts here.
    history = model.fit(x=train_dataset,
                        epochs=args.epochs,
                        validation_data=validation_dataset,
                        callbacks=callbacks)

    score = model.evaluate(eval_dataset)
    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # PS: Save model and history only on worker 0
    if args.current_host == args.hosts[0]:
        save_history(args.model_dir + "/ps_history.p", history)
        save_model(model, args.model_dir)
import torchvision.datasets as datasets import torchvision.transforms as transforms import model_def as mod data_dir = '/home/ved/PycharmProjects/CSE666_ass_1/tiny-imagenet-200/' op_dir = '/home/ved/PycharmProjects/CSE666_ass_1/op/' device = torch.device("cuda:0" if cuda.is_available() else "cpu") batch_size = 100 test_dataset = datasets.ImageFolder(data_dir + 'val', transform=transforms.ToTensor()) test_loader = data.DataLoader(test_dataset, batch_size=batch_size) model, _, _ = mod.get_model(device) m = torch.load("/home/ved/PycharmProjects/CSE666_ass_1/op/A1_A:0.00_20190223_231257.ckpt") model.load_state_dict(m) model.eval() print("M = ", m) with torch.no_grad(): correct = 0 total = 0 test_len = len(test_loader) print("\n\nRunning test\n")