Example #1
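Across these examples, `smd` is the conventional alias for SageMaker Debugger's PyTorch module. A plausible common preamble is sketched below; helpers such as `Net`, `train`, `dataset`, `get_dataloaders`, and `args` come from the surrounding test suites and scripts and are not part of the library:

import os
import shutil
import logging
from tempfile import TemporaryDirectory

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributed as dist
from torchvision import models

import smdebug.pytorch as smd
from smdebug.pytorch import modes          # the snippets use both `modes` and `smd.modes`
from smdebug.trials import create_trial

import boto3                               # only the inference examples need these
import sagemaker

logger = logging.getLogger(__name__)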
def test_data_parallel():
    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Net().to(device)
    if device == "cuda":
        model = DataParallel(model)

    hook.register_module(model)

    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    if device == "cpu":
        assert len(trial.tensor_names()) == 38
    else:
        assert len(trial.tensor_names()) > 37

    shutil.rmtree(out_dir, ignore_errors=True)
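`Net` and `train` are not shown in the listing; a minimal sketch consistent with how Examples #1 and #7 call them (the architecture and data shapes here are assumptions, and Example #6 passes no hook, so its `train` differs slightly) might be:

class Net(nn.Module):
    # Hypothetical stand-in: any small module works for these tests.
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 6, 5)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(6 * 28 * 28, 10)

    def forward(self, x):
        x = self.relu(self.conv(x))
        return self.fc(x.flatten(start_dim=1))

def train(model, hook, device, optimizer, num_steps=10):
    # Hypothetical loop matching the call sites above: random data,
    # one optimizer step per training step.
    model.train()
    for _ in range(num_steps):
        optimizer.zero_grad()
        data = torch.rand(4, 3, 32, 32, device=device)
        target = torch.randint(0, 10, (4,), device=device)
        loss = F.cross_entropy(model(data), target)
        loss.backward()
        optimizer.step()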
Example #2
def model_fn(model_dir):
    global hook

    # create the model
    model = models.resnet18()

    # the traffic-sign dataset has 43 classes
    nfeatures = model.fc.in_features
    model.fc = nn.Linear(nfeatures, 43)

    # load the trained weights
    weights = torch.load(model_dir + '/model/model.pt', map_location=lambda storage, loc: storage)
    model.load_state_dict(weights)

    model.eval()
    model.cpu()

    # hook configuration: save every step in PREDICT mode
    save_config = smd.SaveConfig(mode_save_configs={
        smd.modes.PREDICT: smd.SaveConfigMode(save_interval=1)
    })

    boto_session = boto3.Session()
    sagemaker_session = sagemaker.Session(boto_session=boto_session)

    hook = CustomHook("s3://" + sagemaker_session.default_bucket() + "/endpoint/tensors",
                      save_config=save_config,
                      include_regex='.*bn|.*bias|.*downsample|.*ResNet_input|.*image|.*fc_output')

    # register the hook
    hook.register_module(model)

    # record tensors in PREDICT (inference) mode
    hook.set_mode(modes.PREDICT)

    return model
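`CustomHook` is also external to the listing. Since Example #12 calls `hook.image_gradients(inputs)` on it, one plausible sketch is an `smd.Hook` subclass that records the gradient flowing back into the input images; the method body below is an assumption, built on the same `record_tensor_value` API that Example #3 uses:

class CustomHook(smd.Hook):
    # Assumed sketch: once backward() runs, the registered tensor hook
    # fires and records the gradient of the input images.
    def image_gradients(self, image):
        image.register_hook(
            lambda grad: self.record_tensor_value("image_gradients", tensor_value=grad)
        )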
Example #3
def create_net_and_train(out_dir, n_steps, use_loss_module=False, use_loss_functional=False):
    assert (
        use_loss_module != use_loss_functional
    ), "Exactly one of `use_loss_module` and `use_loss_functional` must be true."

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    hook = smd.Hook(out_dir=out_dir, save_config=smd.SaveConfig(save_interval=1))
    hook.register_module(net)
    if use_loss_module:
        hook.register_loss(criterion)

    batch_size = 1
    # Use the same data at each step to test loss decreasing
    inputs, labels = torch.rand(batch_size, 3, 32, 32), torch.zeros(batch_size).long()
    for _ in range(n_steps):
        optimizer.zero_grad()
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        if use_loss_functional:
            loss = F.cross_entropy(outputs, labels)
            hook.record_tensor_value("nll_loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

    # Users can call this method to immediately use the Trials API.
    hook.close()
    smd.del_hook()
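After `hook.close()`, the run can be inspected right away with the Trials API. A short sketch: the tensor name below matches the one passed to `record_tensor_value` on the functional path; `register_loss` records the loss under a name derived from the criterion instead.

trial = create_trial(out_dir)
print(trial.tensor_names())
loss_per_step = [trial.tensor("nll_loss").value(s) for s in trial.steps()]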
Example #4
def setDebuggerSaveConfig():
    # build and return a SaveConfig that records every step in all modes
    return smd.SaveConfig(
        mode_save_configs={
            smd.modes.TRAIN: smd.SaveConfigMode(save_interval=1),
            smd.modes.EVAL: smd.SaveConfigMode(save_interval=1),
            smd.modes.PREDICT: smd.SaveConfigMode(save_interval=1),
            smd.modes.GLOBAL: smd.SaveConfigMode(save_interval=1)
        }
    )

def run(rank,
        size,
        include_workers="one",
        num_epochs=10,
        batch_size=128,
        num_batches=10):
    """Distributed function to be implemented later."""
    torch.manual_seed(1234)
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=1)

    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers=include_workers,
    )

    hook.register_module(model)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for _ in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset(batch_size)
            output = model(data)
            loss = F.mse_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
        # print(f"Rank {dist.get_rank()}, epoch {epoch}: {epoch_loss / num_batches}")

    assert hook._get_worker_name() == f"worker_{dist.get_rank()}"
    # Race condition here where both workers attempt to move
    # /tmp/{out_dir}/END_OF_JOB.ts to {out_dir}/END_OF_JOB.ts
    try:
        hook._cleanup()
    except FileNotFoundError:
        pass
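`dataset` and `average_gradients` are helpers from the distributed test. The gradient-averaging step follows the standard PyTorch distributed recipe: all-reduce each gradient, then divide by the world size so every rank applies the same update. A sketch:

def average_gradients(model):
    # Sum each parameter's gradient across ranks, then average.
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size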
Example #6
def test_run_net_single_process(out_dir):
    """Runs a single linear layer."""
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    shutil.rmtree(out_dir, ignore_errors=True)
    hook = smd.Hook(out_dir=out_dir,
                    save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
                    save_all=True)
    hook.register_module(model)
    train(model=model, device=device, optimizer=optimizer)
    hook._cleanup()

    assert hook._get_worker_name() == "worker_0"

    trial = create_trial(path=out_dir)
    assert len(trial.workers()) == 1, f"trial.workers() = {trial.workers()}"
    assert len(trial.steps()) == 3, f"trial.steps() = {trial.steps()}"
    shutil.rmtree(out_dir, ignore_errors=True)
Example #7
def test_no_name_clash():
    out_dir = TemporaryDirectory().name

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )
    model = Net()
    hook.register_module(model)
    device = "cpu"
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]

    assert len(trial.tensor_names(regex="relu.*")) == 6
    shutil.rmtree(out_dir, ignore_errors=True)
Example #8
def model_fn(model_dir: str) -> ModelWithDebugHook:
    # create the model
    model = models.resnet18()

    # the traffic-sign dataset has 43 classes
    nfeatures = model.fc.in_features
    model.fc = nn.Linear(nfeatures, 43)

    # load the trained weights
    weights = torch.load(f'{model_dir}/model/model.pt', map_location=lambda storage, loc: storage)
    model.load_state_dict(weights)

    model.eval()
    model.cpu()

    # hook configuration
    tensors_output_s3uri = os.environ.get('tensors_output')
    if tensors_output_s3uri is None:
        logger.warning(
            'Skipping hook configuration: no tensors_output env var provided. '
            'Tensors will not be exported.'
        )
        hook = None
    else:
        save_config = smd.SaveConfig(mode_save_configs={
            smd.modes.PREDICT: smd.SaveConfigMode(save_interval=1),
        })

        hook = CustomHook(
            tensors_output_s3uri,
            save_config=save_config,
            include_regex='.*bn|.*bias|.*downsample|.*ResNet_input|.*image|.*fc_output',
        )

        # register the hook
        hook.register_module(model)

        # record tensors in PREDICT (inference) mode
        hook.set_mode(modes.PREDICT)

    return ModelWithDebugHook(model, hook)
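`ModelWithDebugHook` is not shown either; given how `model_fn` constructs and returns it, a minimal container suffices (a sketch, with name and fields assumed):

class ModelWithDebugHook:
    # Pairs the model with its hook (None when tensor export is disabled),
    # so downstream inference code can reach both.
    def __init__(self, model, hook):
        self.model = model
        self.hook = hook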
Example #9
def start_training(model, trainloader, testloader, model_ext):

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=0.05,
                          momentum=0,
                          weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=200)

    # register the smdebug hook for this training job
    job_name = model_ext
    hook = smd.Hook(out_dir=f'./smdebug/{job_name}',
                    save_config=smd.SaveConfig(save_interval=100),
                    include_collections=['weights', 'gradients', 'biases'])

    hook.register_module(model)
    hook.register_loss(criterion)

    for epoch in range(0, 5):
        train(model, trainloader, epoch, model_ext, criterion, optimizer, hook)
        test(model, testloader, epoch, criterion, model_ext)
        scheduler.step()
def train_model(out_dir="/tmp/smdebug", training_steps=5):
    rnn = RNN(50, 20, 10)
    save_config = smd.SaveConfig(save_interval=500)
    hook = smd.Hook(out_dir=out_dir, save_all=True, save_config=save_config)

    loss_fn = nn.MSELoss()

    hook.register_module(rnn)
    hook.register_module(loss_fn)

    batch_size = 10
    TIMESTEPS = training_steps

    # Create some fake data
    batch = torch.randn(batch_size, 50)
    hidden = torch.zeros(batch_size, 20)
    target = torch.zeros(batch_size, 10)

    loss = 0
    for t in range(TIMESTEPS):
        hidden, output = rnn(batch, hidden)
        loss += loss_fn(output, target)
    # a single backward pass propagates through all timesteps
    loss.backward()
    hook.close()
Example #11
class RNN(nn.Module):
    def __init__(self, data_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        input_size = data_size + hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, data, last_hidden):
        input = torch.cat((data, last_hidden), 1)
        hidden = self.i2h(input)
        output = self.h2o(hidden)
        return hidden, output


rnn = RNN(50, 20, 10)
save_config = smd.SaveConfig(save_interval=500)
hook = smd.Hook(out_dir="/tmp/smdebug", save_all=True, save_config=save_config)

loss_fn = nn.MSELoss()

hook.register_module(rnn)
# hook.register_module(loss_fn)

batch_size = 10
TIMESTEPS = 5

# Create some fake data
batch = torch.randn(batch_size, 50)
hidden = torch.zeros(batch_size, 20)
target = torch.zeros(batch_size, 10)
Example #12
def train_model(epochs, batch_size_train, batch_size_val):

    # check if a GPU is available and set the device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # get a pretrained ResNet model
    model = models.resnet18(pretrained=True)

    # replace in-place operators
    relu_inplace(model)

    nfeatures = model.fc.in_features

    # the traffic-sign dataset has 43 classes
    model.fc = nn.Linear(nfeatures, 43)

    # move the model to the GPU or CPU
    model = model.to(device)

    # loss for multi-class classification
    loss_function = nn.CrossEntropyLoss()

    # optimizer
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum)

    # configure the smdebug hook:
    # save every iteration from the validation phase,
    # but only the first iteration from the training phase
    save_config = smd.SaveConfig(mode_save_configs={
        smd.modes.TRAIN: smd.SaveConfigMode(save_steps=[0]),
        smd.modes.EVAL: smd.SaveConfigMode(save_interval=1)
    })

    # create a custom hook with a customized forward function, so that we can get gradients of the outputs
    hook = CustomHook(args.smdebug_dir,
                      save_config=save_config,
                      include_regex='.*bn|.*bias|.*downsample|.*ResNet_input|.*image|.*fc_output')

    # register the hook
    hook.register_module(model)

    # get the dataloaders for the train and validation data
    train_loader, val_loader = get_dataloaders(batch_size_train, batch_size_val)
    
    # training loop
    for epoch in range(epochs):

        epoch_loss = 0
        epoch_acc = 0

        # put the hook into training mode
        hook.set_mode(modes.TRAIN)
        model.train()

        for inputs, labels in train_loader:
            inputs = inputs.to(device).requires_grad_()
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(inputs)
            
            # get predictions
            _, preds = torch.max(outputs, 1)

            # compute loss
            loss = loss_function(outputs, labels)

            # backward pass
            loss.backward()

            # update parameters
            optimizer.step()

            # statistics
            epoch_loss += loss.item()
            epoch_acc += torch.sum(preds == labels.data)

        # put the hook into validation mode
        hook.set_mode(modes.EVAL)
        model.eval()

        for inputs, labels in val_loader:

            inputs = inputs.to(device).requires_grad_()
            hook.image_gradients(inputs)

            # forward pass
            outputs = model(inputs)

            # get the predicted class
            predicted_class = outputs.data.max(1, keepdim=True)[1]
            agg = 0
            for i in range(outputs.shape[0]):
                agg += outputs[i,predicted_class[i]]
            model.zero_grad()
            
            # compute gradients of the aggregated class scores with respect to the inputs
            agg.backward()

        print('Epoch {}/{} Loss: {:.4f} Acc: {:.4f}'.format(
            epoch, epochs - 1,
            epoch_loss / len(train_loader),
            epoch_acc.double() / len(train_loader.dataset)))

    return model
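`relu_inplace`, `get_dataloaders`, and `args` come from the surrounding script. The in-place replacement matters because in-place ReLUs overwrite the activations the hook would otherwise record; an assumed sketch of the helper:

def relu_inplace(model):
    # Recursively swap every ReLU(inplace=True) for an out-of-place one.
    for name, child in model.named_children():
        if isinstance(child, nn.ReLU):
            setattr(model, name, nn.ReLU(inplace=False))
        else:
            relu_inplace(child)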