Code Example #1
def do_partition(partition, device_id):
    # These must be declared global so the module-level train() and eval_training() see them
    global net, optimizer, clr_scheduler, loss_function, fprint

    # Only the predecessor epoch's end state is loaded; everything else is re-executed
    predecessor_epoch = partition[0] - 1
    if not flor.is_initialized():
        # Ray creates a new instance of the library per worker, so we have to re-init
        flor.initialize(**user_settings, predecessor_id=predecessor_epoch)

    # Route this worker's output to its own destination so parallel workers don't collide
    fprint = flor.utils.fprint(['data', 'rogarcia', 'flor_output'], device_id)

    # Do the general initialization.
    # The code below is copy/pasted from __main__.
    # Each worker needs to initialize its own neural net so it ends up on the right GPU.
    # Anything that goes on the GPU or reads from the GPU has to be initialized in each worker.
    net = get_network(args, use_gpu=True)
    flor.namespace_stack.test_force(net, 'net')
    loss_function = nn.CrossEntropyLoss()
    flor.namespace_stack.test_force(loss_function, 'loss_function')
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.0,
                          weight_decay=0.0)
    flor.namespace_stack.test_force(optimizer, 'optimizer')
    clr_scheduler = CLR_Scheduler(optimizer,
                                  net_steps=(iter_per_epoch * settings.EPOCH),
                                  min_lr=args.lr,
                                  max_lr=3.0,
                                  tail_frac=0.0)
    flor.namespace_stack.test_force(clr_scheduler, 'clr_scheduler')

    # Load the end state of the predecessor so we can re-execute in the middle
    if predecessor_epoch >= 0:
        # Initialize the Previous Epoch
        train(predecessor_epoch)
        eval_training(predecessor_epoch)

    # Re-execute in the middle
    flor.SKIP = False  # THIS IS IMPORTANT: otherwise flor will SKIP re-execution
    for epoch in partition:
        # This is just good old-fashioned re-execution
        train(epoch)
        (loss, acc) = eval_training(epoch)
        fprint('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
            loss, acc))

    # Clear GPU memory for cleanliness; this step might be optional
    torch.cuda.empty_cache()
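
A hedged sketch (not part of the original source) of how do_partition might be fanned out across Ray workers, since the comments above reference per-worker re-initialization. The run_partition wrapper, the epoch partitions, and the GPU count are illustrative assumptions.

# Hedged sketch: parallel dispatch of do_partition via Ray.
# run_partition, the partitions, and num_gpus are assumptions, not source code.
import ray

ray.init()

@ray.remote(num_gpus=1)
def run_partition(partition, device_id):
    return do_partition(partition, device_id)

# e.g. epochs 0-9 on device 0 and epochs 10-19 on device 1
futures = [run_partition.remote(list(range(0, 10)), 0),
           run_partition.remote(list(range(10, 20)), 1)]
ray.get(futures)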
Code Example #2
                                           batch_size=args.b,
                                           shuffle=args.s)
flor.namespace_stack.test_force(cifar100_test_loader,
                                'cifar100_test_loader')
iter_per_epoch = len(cifar100_training_loader)
flor.namespace_stack.test_force(iter_per_epoch, 'iter_per_epoch')
loss_function = nn.CrossEntropyLoss()
flor.namespace_stack.test_force(loss_function, 'loss_function')
optimizer = optim.SGD(net.parameters(),
                      lr=args.lr,
                      momentum=0.0,
                      weight_decay=0.0)
flor.namespace_stack.test_force(optimizer, 'optimizer')
clr_scheduler = CLR_Scheduler(optimizer,
                              net_steps=(iter_per_epoch * settings.EPOCH),
                              min_lr=args.lr,
                              max_lr=3.0,
                              tail_frac=0.0)
flor.namespace_stack.test_force(clr_scheduler, 'clr_scheduler')
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                               settings.TIME_NOW)
flor.namespace_stack.test_force(checkpoint_path, 'checkpoint_path')
best_acc = 0.0
flor.namespace_stack.test_force(best_acc, 'best_acc')
epoch = 1
flor.namespace_stack.test_force(epoch, 'epoch')
for _ in range(settings.EPOCH):
    train(epoch)
    (loss, acc) = eval_training(epoch)
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
        loss, acc))
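
CLR_Scheduler here is project-specific, but its arguments (min_lr, max_lr, a total step count) describe a cyclical learning-rate schedule. Below is a rough, self-contained stand-in using PyTorch's built-in CyclicLR, under the assumption that the two behave similarly; the tiny model, lr bounds, and step sizes are illustrative only.

# Hedged stand-in for CLR_Scheduler using torch.optim.lr_scheduler.CyclicLR.
# The model, lr bounds, and step_size_up are illustrative assumptions.
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CyclicLR

net = nn.Linear(10, 2)                    # stand-in model
optimizer = optim.SGD(net.parameters(), lr=0.001)
scheduler = CyclicLR(optimizer, base_lr=0.001, max_lr=3.0,
                     step_size_up=1000)   # roughly net_steps / 2 above

for _ in range(10):                       # one scheduler step per batch
    optimizer.step()
    scheduler.step()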
Code Example #3
    cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN,
                                               settings.CIFAR100_TRAIN_STD,
                                               num_workers=args.w,
                                               batch_size=args.b,
                                               shuffle=args.s)

    iter_per_epoch = len(cifar100_training_loader)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.0,
                          weight_decay=0.0)
    clr_scheduler = CLR_Scheduler(optimizer,
                                  net_steps=(iter_per_epoch * settings.EPOCH),
                                  min_lr=args.lr,
                                  max_lr=3.0,
                                  tail_frac=0.0)  # memoize?
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                   settings.TIME_NOW)

    best_acc = 0.0
    for epoch in range(settings.EPOCH):
        train(epoch)  # changes net, optimizer, clr_scheduler; does not change train, epoch
        torch.save(net.state_dict(), f'net_{epoch}.pt')
        torch.save(optimizer.state_dict(), f'opt_{epoch}.pt')
        torch.save(clr_scheduler.state_dict(), f'clrsched_{epoch}.pt')
        loss, acc = eval_training(epoch)  # changes loss, acc, net

        print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
            loss, acc))
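
Example #3 saves net, optimizer, and scheduler state every epoch. Below is a hedged companion sketch (not in the original) of resuming mid-run from those files; it reuses the names from the example above, and resume_epoch is an illustrative choice.

# Hedged sketch: resume from the per-epoch checkpoints written above.
# Assumes net, optimizer, clr_scheduler, train, and settings as in Example #3.
resume_epoch = 3                          # illustrative
net.load_state_dict(torch.load(f'net_{resume_epoch}.pt'))
optimizer.load_state_dict(torch.load(f'opt_{resume_epoch}.pt'))
clr_scheduler.load_state_dict(torch.load(f'clrsched_{resume_epoch}.pt'))
for epoch in range(resume_epoch + 1, settings.EPOCH):
    train(epoch)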
Code Example #4
        for (words, words_len), _ in test_loader:
            words = words.to(device)
            words_len = words_len.detach().cpu()
            output = model(words, words_len)

            output = (output > 0.5).int()
            y_pred.extend(output.tolist())
    print("Finished Training!")
    return y_pred


EPOCHS = 80
MIN_LR = 1e-4

model = LSTM(8).to(device)
optimizer = optim.SGD(model.parameters(), lr=MIN_LR)
flor.log("optimizer", str(type(optimizer)))
clr_scheduler = CLR_Scheduler(
    optimizer,
    net_steps=(len(train_iter) * EPOCHS),
    min_lr=MIN_LR,
    max_lr=4.0,
    tail_frac=0.0,
)
pred = train(model=model, optimizer=optimizer, num_epochs=EPOCHS)

# save result as .csv file
# test_data = pd.read_csv("data/test.csv")
# preds_df = pd.DataFrame({"id": test_data["id"], "target": pred})
# preds_df.to_csv(f"data/output_lstm_3.csv", index=False)
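
The loop at the top of Example #4 is truncated mid-function. Below is a self-contained, hedged reconstruction of the inference pattern it appears to follow; model.eval(), torch.no_grad(), and the predict name are assumptions, since the opening lines are not shown.

# Hedged reconstruction of the truncated inference loop above.
import torch

def predict(model, test_loader, device):
    model.eval()                          # assumption: eval mode for inference
    y_pred = []
    with torch.no_grad():                 # assumption: no gradients at test time
        for (words, words_len), _ in test_loader:
            words = words.to(device)
            words_len = words_len.detach().cpu()
            output = model(words, words_len)
            y_pred.extend((output > 0.5).int().tolist())
    return y_pred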