def do_partition(partition, device_id):
    # You have to set the right variables as global for visibility
    global net, optimizer, clr_scheduler, loss_function, fprint

    # The predecessor is the epoch whose end state we load; all else is re-executed
    predecessor_epoch = partition[0] - 1
    if not flor.is_initialized():
        # Ray creates a new instance of the library per worker, so we have to re-init
        flor.initialize(**user_settings, predecessor_id=predecessor_epoch)
    # This line is so parallel workers don't collide
    fprint = flor.utils.fprint(['data', 'rogarcia', 'flor_output'], device_id)

    # Do the general initialization
    # The code below is copy/pasted from __main__
    # Each worker needs to initialize its own neural net so it's on the right GPU
    # Anything that goes on the GPU or reads from the GPU has to be initialized in each worker
    net = get_network(args, use_gpu=True)
    flor.namespace_stack.test_force(net, 'net')
    loss_function = nn.CrossEntropyLoss()
    flor.namespace_stack.test_force(loss_function, 'loss_function')
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.0, weight_decay=0.0)
    flor.namespace_stack.test_force(optimizer, 'optimizer')
    clr_scheduler = CLR_Scheduler(optimizer, net_steps=(iter_per_epoch * settings.EPOCH),
                                  min_lr=args.lr, max_lr=3.0, tail_frac=0.0)
    flor.namespace_stack.test_force(clr_scheduler, 'clr_scheduler')

    # Load the end state of the predecessor so we can re-execute in the middle
    if predecessor_epoch >= 0:
        # Initialize the previous epoch
        train(predecessor_epoch)
        eval_training(predecessor_epoch)

    # Re-execute in the middle
    flor.SKIP = False  # THIS IS IMPORTANT, otherwise flor will SKIP
    for epoch in partition:
        # This is just good old-fashioned re-execution
        train(epoch)
        (loss, acc) = eval_training(epoch)
        fprint('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(loss, acc))

    # Clear the memory for cleanliness; this step might be optional
    torch.cuda.empty_cache()
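`do_partition` is meant to be invoked once per GPU worker, each worker replaying a contiguous slice of epochs. A minimal dispatch sketch is shown below; it assumes Ray is available and that epochs are split into contiguous chunks. `NUM_GPUS` and the chunking logic are illustrative, not taken from the original script.

```python
# Hypothetical dispatch sketch: split the epochs into contiguous chunks and
# replay each chunk on its own GPU worker via Ray.
import ray

ray.init()
NUM_GPUS = 4  # assumption: one worker per available GPU

epochs = list(range(settings.EPOCH))
chunk = (len(epochs) + NUM_GPUS - 1) // NUM_GPUS
partitions = [epochs[i:i + chunk] for i in range(0, len(epochs), chunk)]

# Wrap the plain function as a Ray task that reserves one GPU per call
remote_partition = ray.remote(num_gpus=1)(do_partition)
futures = [remote_partition.remote(p, device_id)
           for device_id, p in enumerate(partitions)]
ray.get(futures)
```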
    batch_size=args.b,
    shuffle=args.s)
flor.namespace_stack.test_force(cifar100_test_loader, 'cifar100_test_loader')
iter_per_epoch = len(cifar100_training_loader)
flor.namespace_stack.test_force(iter_per_epoch, 'iter_per_epoch')
loss_function = nn.CrossEntropyLoss()
flor.namespace_stack.test_force(loss_function, 'loss_function')
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.0, weight_decay=0.0)
flor.namespace_stack.test_force(optimizer, 'optimizer')
clr_scheduler = CLR_Scheduler(optimizer, net_steps=(iter_per_epoch * settings.EPOCH),
                              min_lr=args.lr, max_lr=3.0, tail_frac=0.0)
flor.namespace_stack.test_force(clr_scheduler, 'clr_scheduler')
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)
flor.namespace_stack.test_force(checkpoint_path, 'checkpoint_path')
best_acc = 0.0
flor.namespace_stack.test_force(best_acc, 'best_acc')
epoch = 1
flor.namespace_stack.test_force(epoch, 'epoch')
for _ in range(settings.EPOCH):
    train(epoch)
    (loss, acc) = eval_training(epoch)
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(loss, acc))
cifar100_test_loader = get_test_dataloader(
    settings.CIFAR100_TRAIN_MEAN,
    settings.CIFAR100_TRAIN_STD,
    num_workers=args.w,
    batch_size=args.b,
    shuffle=args.s)
iter_per_epoch = len(cifar100_training_loader)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.0, weight_decay=0.0)
clr_scheduler = CLR_Scheduler(optimizer, net_steps=(iter_per_epoch * settings.EPOCH),
                              min_lr=args.lr, max_lr=3.0, tail_frac=0.0)  # memoize?
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)
best_acc = 0.0

for epoch in range(settings.EPOCH):
    train(epoch)  # changes: net, optimizer, clr_scheduler; unchanged: train, epoch
    torch.save(net.state_dict(), f'net_{epoch}.pt')
    torch.save(optimizer.state_dict(), f'opt_{epoch}.pt')
    torch.save(clr_scheduler.state_dict(), f'clrsched_{epoch}.pt')
    loss, acc = eval_training(epoch)  # changes: loss, acc, net
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(loss, acc))
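Because the loop above checkpoints `net`, `optimizer`, and `clr_scheduler` at every epoch, a run can be resumed from the middle instead of from scratch. Below is a minimal resume sketch; it assumes `CLR_Scheduler` also exposes `load_state_dict` (only `state_dict` appears above, so that is an assumption), and the epoch index `k` is chosen purely for illustration.

```python
# Hypothetical resume sketch: reload the per-epoch state dicts saved above
# and continue training from epoch k + 1.
k = 10  # illustrative: last completed epoch
net.load_state_dict(torch.load(f'net_{k}.pt'))
optimizer.load_state_dict(torch.load(f'opt_{k}.pt'))
clr_scheduler.load_state_dict(torch.load(f'clrsched_{k}.pt'))  # assumes this method exists

for epoch in range(k + 1, settings.EPOCH):
    train(epoch)
    loss, acc = eval_training(epoch)
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(loss, acc))
```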
    for ((words, words_len)), _ in test_loader:
        words = words.to(device)
        words_len = words_len.detach().cpu()
        output = model(words, words_len)
        output = (output > 0.5).int()
        y_pred.extend(output.tolist())
    print("Finished Training!")
    return y_pred


EPOCHS = 80
MIN_LR = 1e-4

model = LSTM(8).to(device)
optimizer = optim.SGD(model.parameters(), lr=MIN_LR)
flor.log("optimizer", str(type(optimizer)))
clr_scheduler = CLR_Scheduler(
    optimizer,
    net_steps=(len(train_iter) * EPOCHS),
    min_lr=MIN_LR,
    max_lr=4.0,
    tail_frac=0.0,
)
pred = train(model=model, optimizer=optimizer, num_epochs=EPOCHS)

# save result as .csv file
# test_data = pd.read_csv("data/test.csv")
# preds_df = pd.DataFrame({"id": test_data["id"], "target": pred})
# preds_df.to_csv(f"data/output_lstm_3.csv", index=False)