def _initialize_tracker(self):
    writer = CSVandPlottingWriter(self._save_path.replace('.csv', ''),
                                  primary_metric='test_accuracy')

    self._tracker = CheckLayerSat(
        self._save_path.replace('.csv', ''), [writer],
        self.model,
        ignore_layer_names='convolution',
        stats=['lsat', 'idim'],
        sat_threshold=self.delta,
        verbose=False,
        conv_method=self.conv_method,
        log_interval=1,
        device=self.device_sat,
        reset_covariance=True,
        max_samples=None,
        initial_epoch=self._initial_epoch,
        interpolation_strategy='nearest' if self.downsampling is not None else None,
        interpolation_downsampling=self.downsampling)
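# A minimal sketch (not part of the original code) of how the tracker created by
# _initialize_tracker might be driven from a training loop. The methods
# _train_epoch and _test_epoch are hypothetical placeholders; the tracker calls
# (add_scalar, add_saturations, close) mirror the delve API used elsewhere in
# this code base.
def _fit_sketch(self, n_epochs):
    self._initialize_tracker()
    for epoch in range(self._initial_epoch, n_epochs):
        self._train_epoch(epoch)                 # hypothetical training step
        test_accuracy = self._test_epoch(epoch)  # hypothetical evaluation step
        self._tracker.add_scalar('test_accuracy', test_accuracy)
        self._tracker.add_saturations()          # log saturation once per epoch
    self._tracker.close()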
for h in [3, 32, 128]:
    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, h, 10

    # Create random Tensors to hold inputs and outputs
    x = torch.randn(N, D_in)
    y = torch.randn(N, D_out)

    model = TwoLayerNet(D_in, H, D_out)
    x, y, model = x.to(device), y.to(device), model.to(device)

    layers = [model.linear1, model.linear2]
    stats = CheckLayerSat('regression/h{}'.format(h), layers)

    loss_fn = torch.nn.MSELoss(reduction='sum')  # size_average=False is deprecated
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
    steps_iter = trange(2000, desc='steps', leave=True, position=0)
    steps_iter.write("{:^80}".format(
        "Regression - TwoLayerNet - Hidden layer size {}".format(h)))
    for _ in steps_iter:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        steps_iter.set_description('loss=%g' % loss.data)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        stats.saturation()
def fit(
    self,
    train_dataloader,
    val_dataloader,
    train_ds,
    val_ds,
    loss_fn,
    optimizer,
    n_epochs,
    val_interval,
    patience_early_stopping,
    device,
    metrics: Union[list, dict] = [],
    val_metric: Union[int, str] = "loss",
    val_metric_mode: str = "min",
    start_epoch=0,
):
    """Train and validate the network.

    :param int n_epochs: maximum number of training epochs (default=500)
    :param int val_interval: run validation every val_interval epochs
    :param int patience_early_stopping: terminate training after
        (patience_early_stopping / val_interval) validation runs without improvement
    """
    self.logger.info("Init model on device '{}'".format(device))
    self.model = self.model.to(device)

    # initialize delve
    self.tracker = CheckLayerSat(self.summary_dir,
                                 save_to="plotcsv",
                                 modules=self.model,
                                 device=device)

    best_model = copy.deepcopy(self.model.state_dict())
    best_metric = 0.0 if val_metric_mode == "max" else float("inf")

    # Validation runs every val_interval epochs rather than every epoch, so the
    # early-stopping patience is expressed in number of validation runs.
    patience_stopping = math.ceil(patience_early_stopping / val_interval)
    patience_stopping = int(max(1, patience_stopping))
    early_stopping = EarlyStoppingCriterion(mode=val_metric_mode,
                                            patience=patience_stopping)

    if not self.start_scratch and self.cp is not None:
        checkpoint = self.cp.read_latest()
        if checkpoint is not None:
            try:
                try:
                    self.model.load_state_dict(checkpoint["modelState"])
                except RuntimeError as e:
                    self.logger.error(
                        "Failed to restore checkpoint: "
                        "Checkpoint has different parameters")
                    self.logger.error(e)
                    raise SystemExit

                optimizer.load_state_dict(
                    checkpoint["trainState"]["optState"])
                start_epoch = checkpoint["trainState"]["epoch"] + 1
                best_metric = checkpoint["trainState"]["best_metric"]
                best_model = checkpoint["trainState"]["best_model"]
                early_stopping.load_state_dict(
                    checkpoint["trainState"]["earlyStopping"])
                # scheduler.load_state_dict(checkpoint["trainState"]["scheduler"])
                self.logger.info(
                    "Resuming with epoch {}".format(start_epoch))
            except KeyError:
                self.logger.error("Failed to restore checkpoint")
                raise

    since = time.time()

    self.logger.info("Start training model " + self.prefix)

    try:
        if val_metric_mode == "min":
            val_comp = operator.lt  # use the standard operator as a function
        else:
            val_comp = operator.gt
        for epoch in range(start_epoch, n_epochs):
            self.train(epoch, train_dataloader, train_ds, loss_fn, optimizer,
                       device)
            if epoch % val_interval == 0 or epoch == n_epochs - 1:
                # first, get val_loss for further comparison
                val_loss = self.validate(epoch,
                                         val_dataloader,
                                         val_ds,
                                         loss_fn,
                                         device,
                                         phase="val")
                if val_metric == "loss":
                    val_result = val_loss
                    # add metrics for delve to keep track of
                    self.tracker.add_scalar("loss", val_loss)
                    # add saturation to the mix
                    self.tracker.add_saturations()
                else:
                    val_result = metrics[val_metric].get()

                # compare to see whether an improvement occurred
                if val_comp(val_result, best_metric):
                    best_metric = val_result  # update best_metric with the new best value
                    best_model = copy.deepcopy(self.model.state_dict())

                # Previously a deadlock occurred that seemed to be related to cp;
                # comment out self.cp.write() to check whether the freeze goes away.
                # write checkpoint
                self.cp.write({
                    "modelState": self.model.state_dict(),
                    "trainState": {
                        "epoch": epoch,
                        "best_metric": best_metric,
                        "best_model": best_model,
                        "optState": optimizer.state_dict(),
                        "earlyStopping": early_stopping.state_dict(),
                    },
                })

                # stop if the number of validations without improvement exceeds the patience
                if early_stopping.step(val_result):
                    self.logger.info(
                        "No improvement over the last {} epochs. Training is stopped."
                        .format(patience_early_stopping))
                    break
    except Exception:
        import traceback
        self.logger.warning(traceback.format_exc())
        self.logger.warning("Aborting...")
        self.logger.close()
        raise SystemExit

    # Option here: load the best model, run it on a test dataset, and log the
    # final metric alongside the best validation metric. For the autoencoder
    # there are only train and validation splits, without a test dataset.

    time_elapsed = time.time() - since
    self.logger.info("Training complete in {:.0f}m {:.0f}s".format(
        time_elapsed // 60, time_elapsed % 60))

    self.logger.info("Best val metric: {:4f}".format(best_metric))

    # close delve tracker
    self.tracker.close()

    return self.model
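# A hedged usage sketch (not part of the original code) showing how fit() might
# be invoked. `trainer` stands for an instance of the class defining fit();
# train_loader, val_loader, train_ds and val_ds are assumed to be prepared
# elsewhere in the project, and the hyperparameter values are placeholders.
model = trainer.fit(
    train_loader, val_loader, train_ds, val_ds,
    loss_fn=torch.nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam(trainer.model.parameters(), lr=1e-3),
    n_epochs=500,
    val_interval=5,
    patience_early_stopping=50,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)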
def train(network, dataset, test_set, logging_dir, batch_size):
    network.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(network.parameters())

    # stats = CheckLayerSat(logging_dir, network, log_interval=len(dataset)//batch_size)
    stats = CheckLayerSat(logging_dir,
                          network,
                          log_interval=60,
                          sat_method='cumvar99',
                          conv_method='mean')

    epoch_acc = 0
    thresh = 0.95
    epoch = 0
    total = 0
    correct = 0
    value_dict = None
    while epoch <= 20:
        print('Start Training Epoch', epoch, '\n')
        start = t.time()
        epoch_acc = 0
        train_loss = 0
        total = 0
        correct = 0
        network.train()
        for i, data in enumerate(dataset):
            step = epoch * len(dataset) + i
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = network(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            # if i % 2000 == 1999:    # print every 2000 mini-batches
            print(i, 'of', len(dataset), 'acc:', correct / total)

        # display layer saturation levels
        end = t.time()
        stats.saturation()
        test_loss, test_acc = test(network, test_set, criterion, stats, epoch)
        epoch_acc = correct / total
        print('Epoch', epoch, 'finished', 'Acc:', epoch_acc, 'Loss:',
              train_loss / total, '\n')
        stats.add_scalar('train_loss', train_loss / total, epoch)  # optional
        stats.add_scalar('train_acc', epoch_acc, epoch)  # optional
        value_dict = record_metrics(value_dict, stats.logs, epoch_acc,
                                    train_loss / total, test_acc, test_loss,
                                    epoch, (end - start) / total)
        log_to_csv(value_dict, logging_dir)
        epoch += 1
    stats.close()
    # test_stats.close()
    return criterion
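# A hedged usage sketch (not from the original file): how the train() function
# above might be called. `network`, `train_loader` and `test_loader` are assumed
# to be defined by the surrounding script, and the logging directory is a
# placeholder.
criterion = train(network, train_loader, test_loader,
                  logging_dir='convNet/cumvar99_run',
                  batch_size=64)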
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

torch.manual_seed(1)

epochs = 5

for h2 in [8, 32, 128]:  # compare various hidden layer sizes
    net = Net(h2=h2)  # instantiate network with hidden layer size `h2`
    net.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    logging_dir = 'convNet/simpson_h2-{}'.format(h2)

    stats = CheckLayerSat(logging_dir, net, include_conv=True, sat_method='all')

    stats.write(
        "CIFAR10 ConvNet - Changing fc2 - size {}".format(h2))  # optional

    for epoch in range(epochs):
        running_loss = 0.0
        step = 0
        loader = tqdm(train_loader, leave=True,
                      position=0)  # track step progress and loss - optional
        for i, data in enumerate(loader):
            step = epoch * len(loader) + i
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = net(inputs)
epochs = 10

net = NET()
if torch.cuda.is_available():
    net.cuda()
net.to(device)

logging_dir = 'net/simpson_h2-{}'.format(2)

stats = CheckLayerSat(savefile=logging_dir,
                      save_to='plot',
                      modules=net,
                      include_conv=False,
                      stats=['lsat'],
                      max_samples=1024,
                      verbose=True,
                      writer_args={
                          'figsize': [30, 30],
                          'fontsize': 32
                      },
                      conv_method='mean',
                      device='cpu')

# net = nn.DataParallel(net, device_ids=['cuda:0', 'cuda:1'])

eps = torch.Tensor([1e-10]).cuda()


def loss_fn(recon_x, x, mu, logvar, eps):
    # reconstruction term; reduction='sum' replaces the deprecated size_average=False
    BCE = F.binary_cross_entropy(recon_x + eps, x, reduction='sum')
    # KL divergence between the approximate posterior and a standard normal prior
    KLD = -0.5 * torch.sum(1 + logvar - mu**2 - logvar.exp())
    return (BCE + KLD) / x.size(0)
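# A minimal training-loop sketch (not part of the original code). It assumes the
# usual VAE convention that `net(batch)` returns (recon_x, mu, logvar) and that
# `train_loader` and `device` are defined by the surrounding script; the tracker
# calls mirror the delve API used elsewhere in these examples.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

for epoch in range(epochs):
    for batch, _ in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        recon_x, mu, logvar = net(batch)  # assumed VAE forward signature
        loss = loss_fn(recon_x, batch, mu, logvar, eps)
        loss.backward()
        optimizer.step()
    stats.add_scalar('train_loss', loss.item())  # log the last batch loss
    stats.add_saturations()                      # log layer saturation per epoch
stats.close()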
torch.manual_seed(1)

cuda = torch.cuda.is_available()

epochs = 5

for h2 in [8, 32, 128]:  # compare various hidden layer sizes
    net = Net(h2=h2)  # instantiate network with hidden layer size `h2`
    if cuda:
        net.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    logging_dir = 'convNet/h2-{}'.format(h2)

    stats = CheckLayerSat(logging_dir, net)

    stats.write(
        "CIFAR10 ConvNet - Changing fc2 - size {}".format(h2))  # optional

    for epoch in range(epochs):
        running_loss = 0.0
        step = 0
        loader = tqdm(train_loader, leave=True,
                      position=0)  # track step progress and loss - optional
        for i, data in enumerate(loader):
            step = epoch * len(loader) + i
            inputs, labels = data
            inputs = Variable(inputs)
            labels = Variable(labels)
            if cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
epochs = 5

for h2 in [8, 32, 128]:  # compare various hidden layer sizes
    net = resnet18(
        pretrained=False, num_classes=10
    )  # Net(h2=h2)  # instantiate network with hidden layer size `h2`
    net.to(device)
    print(net)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    logging_dir = 'convNet/simpson_h2-{}'.format(h2)

    stats = CheckLayerSat(logging_dir,
                          'csv',
                          net,
                          include_conv=True,
                          stats=['lsat'])

    stats.write(
        "CIFAR10 ConvNet - Changing fc2 - size {}".format(h2))  # optional

    for epoch in range(epochs):
        running_loss = 0.0
        step = 0
        loader = tqdm(
            train_loader, leave=True,
            position=0)  # track step progress and loss - optional
        for i, data in enumerate(loader):
            step = epoch * len(loader) + i
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)