コード例 #1
0
    def _initialize_tracker(self):
        """Build the delve saturation tracker and store it in ``self._tracker``.

        The output base name is ``self._save_path`` with every ``'.csv'``
        occurrence removed. It is handed both to the CSV/plotting writer and
        to ``CheckLayerSat``. The ``'nearest'`` interpolation strategy is only
        requested when ``self.downsampling`` is set.
        """
        output_base = self._save_path.replace('.csv', '')
        csv_plot_writer = CSVandPlottingWriter(output_base,
                                               primary_metric='test_accuracy')

        # Interpolation is pointless without a downsampling target.
        if self.downsampling is not None:
            strategy = 'nearest'
        else:
            strategy = None

        tracker_options = {
            'ignore_layer_names': 'convolution',
            'stats': ['lsat', 'idim'],
            'sat_threshold': self.delta,
            'verbose': False,
            'conv_method': self.conv_method,
            'log_interval': 1,
            'device': self.device_sat,
            'reset_covariance': True,
            'max_samples': None,
            'initial_epoch': self._initial_epoch,
            'interpolation_strategy': strategy,
            'interpolation_downsampling': self.downsampling,
        }
        self._tracker = CheckLayerSat(output_base, [csv_plot_writer],
                                      self.model, **tracker_options)
コード例 #2
0
ファイル: example.py プロジェクト: rivol/delve
# Train the same two-layer regression net once per hidden-layer width and
# record layer saturation after every optimisation step.
for h in [3, 32, 128]:
    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, h, 10

    # Create random Tensors to hold inputs and outputs
    x = torch.randn(N, D_in)
    y = torch.randn(N, D_out)

    model = TwoLayerNet(D_in, H, D_out)

    x, y, model = x.to(device), y.to(device), model.to(device)

    # Track both linear layers; the hidden size is encoded in the save path.
    layers = [model.linear1, model.linear2]
    stats = CheckLayerSat('regression/h{}'.format(h), layers)

    # `size_average=False` is deprecated; `reduction='sum'` is its equivalent.
    loss_fn = torch.nn.MSELoss(reduction='sum')
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
    steps_iter = trange(2000, desc='steps', leave=True, position=0)
    steps_iter.write("{:^80}".format(
        "Regression - TwoLayerNet - Hidden layer size {}".format(h)))
    for _ in steps_iter:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        # `.item()` yields a plain Python float for formatting.
        steps_iter.set_description('loss=%g' % loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        stats.saturation()
コード例 #3
0
    def fit(
        self,
        train_dataloader,
        val_dataloader,
        train_ds,
        val_ds,
        loss_fn,
        optimizer,
        n_epochs,
        val_interval,
        patience_early_stopping,
        device,
        metrics: Union[list, dict] = [],
        val_metric: Union[int, str] = "loss",
        val_metric_mode: str = "min",
        start_epoch=0,
    ):
        """
        Train and validate the network with checkpointing and early stopping.

        :param train_dataloader: forwarded to ``self.train`` every epoch
        :param val_dataloader: forwarded to ``self.validate`` on validation epochs
        :param train_ds: forwarded to ``self.train``
        :param val_ds: forwarded to ``self.validate``
        :param loss_fn: forwarded to ``self.train`` and ``self.validate``
        :param optimizer: optimizer whose state is restored from / written to checkpoints
        :param int n_epochs: exclusive upper bound of the epoch range
        :param int val_interval: validation runs when ``epoch % val_interval == 0``
            and on the last epoch
        :param int patience_early_stopping: patience in epochs; converted to
            ``ceil(patience_early_stopping / val_interval)`` validation rounds (at least 1)
        :param device: device the model is moved to; also passed to the delve tracker
        :param metrics: container indexed with ``val_metric`` when that is not
            ``"loss"``; the selected entry must provide ``get()``. It is only read
            here, never mutated.
        :param val_metric: ``"loss"`` to monitor the validation loss, otherwise an
            index/key into ``metrics``
        :param str val_metric_mode: ``"min"`` compares with less-than; any other
            value compares with greater-than
        :param int start_epoch: first epoch; overridden when a checkpoint is restored
        :return: ``self.model`` with its current weights (the best weights are
            tracked in a local copy and in the checkpoint, but not loaded back)
        :raises SystemExit: if the checkpoint parameters do not fit the model, or
            if any exception is raised inside the training loop
        :raises KeyError: if the checkpoint lacks an expected entry
        """

        self.logger.info("Init model on device '{}'".format(device))
        self.model = self.model.to(device)

        # initialize delve
        self.tracker = CheckLayerSat(self.summary_dir,
                                     save_to="plotcsv",
                                     modules=self.model,
                                     device=device)

        best_model = copy.deepcopy(self.model.state_dict())
        best_metric = 0.0 if val_metric_mode == "max" else float("inf")

        # as we don't validate after each epoch but at val_interval,
        # we update the patience_stopping accordingly to how many times of validation
        patience_stopping = math.ceil(patience_early_stopping / val_interval)
        patience_stopping = int(max(1, patience_stopping))
        early_stopping = EarlyStoppingCriterion(mode=val_metric_mode,
                                                patience=patience_stopping)

        if not self.start_scratch and self.cp is not None:
            checkpoint = self.cp.read_latest()
            if checkpoint is not None:
                try:
                    try:
                        self.model.load_state_dict(checkpoint["modelState"])
                    except RuntimeError as e:
                        self.logger.error(
                            "Failed to restore checkpoint: "
                            "Checkpoint has different parameters")
                        self.logger.error(e)
                        raise SystemExit

                    optimizer.load_state_dict(
                        checkpoint["trainState"]["optState"])
                    start_epoch = checkpoint["trainState"]["epoch"] + 1
                    best_metric = checkpoint["trainState"]["best_metric"]
                    best_model = checkpoint["trainState"]["best_model"]
                    early_stopping.load_state_dict(
                        checkpoint["trainState"]["earlyStopping"])
                    self.logger.info(
                        "Resuming with epoch {}".format(start_epoch))
                except KeyError:
                    self.logger.error("Failed to restore checkpoint")
                    raise

        since = time.time()

        self.logger.info("Start training model " + self.prefix)

        try:
            if val_metric_mode == "min":
                val_comp = operator.lt  # to run standard operator as function
            else:
                val_comp = operator.gt
            for epoch in range(start_epoch, n_epochs):
                self.train(epoch, train_dataloader, train_ds, loss_fn,
                           optimizer, device)

                if epoch % val_interval == 0 or epoch == n_epochs - 1:
                    # first, get val_loss for further comparison
                    val_loss = self.validate(epoch,
                                             val_dataloader,
                                             val_ds,
                                             loss_fn,
                                             device,
                                             phase="val")
                    if val_metric == "loss":
                        val_result = val_loss
                        # add metrics for delve to keep track of
                        self.tracker.add_scalar("loss", val_loss)
                        # add saturation to the mix
                        self.tracker.add_saturations()
                    else:
                        val_result = metrics[val_metric].get()

                    # compare to see if improvement occurs
                    if val_comp(val_result, best_metric):
                        best_metric = val_result  # new best value of the monitored metric
                        best_model = copy.deepcopy(self.model.state_dict())
                        # NOTE: a deadlock was previously observed that seemed
                        # related to checkpoint writing; disable the write
                        # below to check whether the freeze goes away.
                        # Write a checkpoint only when a checkpoint handler is
                        # configured, mirroring the restore guard above;
                        # otherwise the first improvement would abort training.
                        if self.cp is not None:
                            self.cp.write({
                                "modelState": self.model.state_dict(),
                                "trainState": {
                                    "epoch": epoch,
                                    "best_metric": best_metric,
                                    "best_model": best_model,
                                    "optState": optimizer.state_dict(),
                                    "earlyStopping": early_stopping.state_dict(),
                                },
                            })

                    # test if the number of accumulated no-improvement epochs is bigger than patience
                    if early_stopping.step(val_result):
                        self.logger.info(
                            "No improvement over the last {} epochs. Training is stopped."
                            .format(patience_early_stopping))
                        break
        except Exception:
            # Top-level boundary: log the traceback, then terminate the run.
            import traceback
            self.logger.warning(traceback.format_exc())
            self.logger.warning("Aborting...")
            self.logger.close()
            raise SystemExit

        # option here: load the best model to run test on test_dataset and log the final metric (along side best metric)
        # for ae, only split: train and validate dataset, without test_dataset

        time_elapsed = time.time() - since
        self.logger.info("Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60))

        self.logger.info("Best val metric: {:4f}".format(best_metric))

        # close delve tracker
        self.tracker.close()

        return self.model
コード例 #4
0
def train(network, dataset, test_set, logging_dir, batch_size):
    """Train ``network`` for 21 epochs (0..20) while tracking layer saturation.

    Each epoch runs one pass over ``dataset`` with Adam and cross-entropy,
    records saturation, evaluates on ``test_set`` through ``test``, and
    appends the epoch's metrics to the CSV log via ``log_to_csv``.

    :param network: model to train; moved to the module-level ``device``
    :param dataset: iterable of ``(inputs, labels)`` batches that supports ``len()``
    :param test_set: forwarded to ``test`` after every epoch
    :param logging_dir: output location for the delve tracker and the CSV log
    :param batch_size: unused here; kept for interface compatibility
    :return: the ``nn.CrossEntropyLoss`` criterion used for training
    """
    network.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(network.parameters())
    stats = CheckLayerSat(logging_dir,
                          network,
                          log_interval=60,
                          sat_method='cumvar99',
                          conv_method='mean')

    value_dict = None
    num_batches = len(dataset)
    try:
        for epoch in range(21):
            print('Start Training Epoch', epoch, '\n')
            start = t.time()
            train_loss = 0
            total = 0
            correct = 0
            network.train()
            for i, (inputs, labels) in enumerate(dataset):
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()

                outputs = network(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
                print(i, 'of', num_batches, 'acc:', correct / total)
            # Stop the clock before saturation/evaluation so only the
            # training pass is timed.
            end = t.time()
            stats.saturation()
            test_loss, test_acc = test(network, test_set, criterion, stats,
                                       epoch)
            epoch_acc = correct / total
            mean_loss = train_loss / total
            print('Epoch', epoch, 'finished', 'Acc:', epoch_acc, 'Loss:',
                  mean_loss, '\n')
            stats.add_scalar('train_loss', mean_loss, epoch)  # optional
            stats.add_scalar('train_acc', epoch_acc, epoch)  # optional
            value_dict = record_metrics(value_dict, stats.logs, epoch_acc,
                                        mean_loss, test_acc, test_loss,
                                        epoch, (end - start) / total)
            log_to_csv(value_dict, logging_dir)
    finally:
        # Always release the tracker, even if training fails part-way.
        stats.close()

    return criterion
コード例 #5
0
ファイル: example_deep.py プロジェクト: rivol/delve
# Use the first CUDA device when available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Fixed seed for the torch random number generator.
torch.manual_seed(1)

epochs = 5

for h2 in [8, 32, 128]:  # compare various hidden layer sizes
    net = Net(h2=h2)  # instantiate network with hidden layer size `h2`

    net.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # One tracker (and save path) per hidden size so runs stay separate.
    logging_dir = 'convNet/simpson_h2-{}'.format(h2)
    stats = CheckLayerSat(logging_dir,
                          net,
                          include_conv=True,
                          sat_method='all')
    stats.write(
        "CIFAR10 ConvNet - Changing fc2 - size {}".format(h2))  # optional

    for epoch in range(epochs):
        running_loss = 0.0
        step = 0
        loader = tqdm(train_loader, leave=True,
                      position=0)  # track step progress and loss - optional
        for i, data in enumerate(loader):
            # Global step index counted across epochs.
            step = epoch * len(loader) + i
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
コード例 #6
0
    epochs = 10

    # `net.to(device)` alone decides the final placement, so no separate
    # `net.cuda()` call is needed beforehand.
    net = NET()
    net.to(device)
    logging_dir = 'net/simpson_h2-{}'.format(2)

    # Saturation tracker: statistics are computed on the CPU ('lsat' only)
    # and written as plots.
    stats = CheckLayerSat(savefile=logging_dir,
                          save_to='plot',
                          modules=net,
                          include_conv=False,
                          stats=['lsat'],
                          max_samples=1024,
                          verbose=True,
                          writer_args={
                              'figsize': [30, 30],
                              'fontsize': 32
                          },
                          conv_method='mean',
                          device='cpu')

    #net = nn.DataParallel(net, device_ids=['cuda:0', 'cuda:1'])
    # Keep eps on the same device as the network; an unconditional `.cuda()`
    # raises on machines without CUDA.
    eps = torch.Tensor([1e-10]).to(device)

    def loss_fn(recon_x, x, mu, logvar, eps):
        """Return summed binary cross-entropy plus the KL term, per sample.

        :param recon_x: reconstruction; ``eps`` is added to it before the
            cross-entropy is evaluated
        :param x: target tensor; ``x.size(0)`` is used as the divisor
        :param mu: mean tensor of the KL term
        :param logvar: log-variance tensor of the KL term
        :param eps: offset added to ``recon_x``
        :return: ``(BCE + KLD) / x.size(0)`` where
            ``KLD = -0.5 * sum(1 + logvar - mu**2 - exp(logvar))``
        """
        # `size_average=False` is deprecated; `reduction='sum'` is its equivalent.
        bce = F.binary_cross_entropy(recon_x + eps, x, reduction='sum')
        kld = -0.5 * torch.sum(1 + logvar - mu**2 - logvar.exp())
        return (bce + kld) / x.size(0)
コード例 #7
0

# Fixed seed for the torch random number generator.
torch.manual_seed(1)
# Decide once whether the model and batches are moved to the GPU.
cuda = torch.cuda.is_available()
epochs = 5

for h2 in [8, 32, 128]:  # compare various hidden layer sizes
    net = Net(h2=h2)  # instantiate network with hidden layer size `h2`

    if cuda:
        net.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # One tracker (and save path) per hidden size so runs stay separate.
    logging_dir = 'convNet/h2-{}'.format(h2)
    stats = CheckLayerSat(logging_dir, net)
    stats.write(
        "CIFAR10 ConvNet - Changing fc2 - size {}".format(h2))  # optional

    for epoch in range(epochs):
        running_loss = 0.0
        step = 0
        loader = tqdm(train_loader, leave=True,
                      position=0)  # track step progress and loss - optional
        for i, data in enumerate(loader):
            # Global step index counted across epochs.
            step = epoch * len(loader) + i
            inputs, labels = data
            # NOTE(review): the Variable wrappers are presumably only needed
            # on PyTorch < 0.4 - confirm the minimum supported version.
            inputs = Variable(inputs)
            labels = Variable(labels)
            if cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
コード例 #8
0
    epochs = 5

    for h2 in [8, 32, 128]:  # one run per value; see note on `h2` below
        net = resnet18(
            pretrained=False, num_classes=10
        )  # `h2` is not passed to resnet18; it only varies the log path and title below

        net.to(device)
        print(net)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        # One tracker (and save path) per `h2` value; 'csv' is passed as the
        # second positional argument of CheckLayerSat.
        logging_dir = 'convNet/simpson_h2-{}'.format(h2)
        stats = CheckLayerSat(logging_dir,
                              'csv',
                              net,
                              include_conv=True,
                              stats=['lsat'])
        stats.write(
            "CIFAR10 ConvNet - Changing fc2 - size {}".format(h2))  # optional

        for epoch in range(epochs):
            running_loss = 0.0
            step = 0
            loader = tqdm(
                train_loader, leave=True,
                position=0)  # track step progress and loss - optional
            for i, data in enumerate(loader):
                # Global step index counted across epochs.
                step = epoch * len(loader) + i
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)