def fine_tune(self):
        best_acc = 0.0
        self.lr = 0.01
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr, momentum=0.9, weight_decay=5e-4)
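        # the learning rate is dropped by 10x at epochs 10 and 20 inside the loop below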
        print("Beginning Training for", self.epochs, " Epochs")
        for epoch in range(1, self.epochs + 1):
            if epoch == 10:
                self.lr = self.lr * 0.1
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr
            elif epoch == 20:
                self.lr = self.lr * 0.1
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr

            train_utils.train(self, epoch)
            acc, loss = train_utils.evaluate(self)
            acc = round(acc.item(), 4)

            # Save best performance model
            if best_acc < acc:
                best_model_wts = copy.deepcopy(self.model.state_dict())
                best_epoch = epoch
                best_acc = acc
                best_loss = loss
        # Save Best model
        # torch.save(best_model_wts, self.checkpoint_path.format(epoch=best_epoch, acc=round(best_acc * 100, 2)))

        # Record Metrics
        self.overall_log.append(
            {"Experiment": self.exp_name, "Epoch": best_epoch, "Test_Acc": best_acc,
             "Test_Loss": best_loss})
        train_utils.record_overall_metrics(self, ['Experiment', 'Epoch', "Test_Acc", "Test_Loss"])
Example #2
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    assert os.path.exists(args.weights), f"weights {args.weights} not found."

    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> val.txt
    val_dataset = VOCSegmentation(args.data_path,
                                  year="2012",
                                  transforms=SegmentationPresetEval(520),
                                  txt_name="val.txt")

    num_workers = 8
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = fcn_resnet50(aux=args.aux, num_classes=num_classes)
    model.load_state_dict(
        torch.load(args.weights, map_location=device)['model'])
    model.to(device)

    confmat = evaluate(model,
                       val_loader,
                       device=device,
                       num_classes=num_classes)
    print(confmat)
Example #3
    def train_model(self):
        best_acc = 0.0
        print("Beginning Training for", self.epochs, " Epochs")

        for epoch in range(1, self.epochs + 1):
            if epoch == 80:
                self.lr = 0.01
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr
            elif epoch == 140:
                self.lr = 0.001
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr

            train_utils.train(self, epoch)
            acc, loss = train_utils.evaluate(self)
            acc = round(acc.item(), 4)
            loss = round(loss, 4)

            # Save best performance model
            if best_acc < acc:
                best_model_wts = copy.deepcopy(self.model.state_dict())
                best_epoch = epoch
                best_acc = acc
                best_loss = loss

        # Save Best model
        # torch.save(best_model_wts, self.model_path.format(task=self.task, epoch=best_epoch, acc=round(best_acc * 100, 2)))

        # Record Metrics
        train_utils.record_metrics(self)

        self.overall_log.append(
            {"Task": self.task, "Epoch": best_epoch, "Test_Acc": round(best_acc * 100, 2), "Test_Loss": best_loss})
        train_utils.record_overall_metrics(self)
    def train_model(self):
        best_acc = 0.0

        print("Beginning Training for", self.epochs, " Epochs")
        for epoch in range(1, self.epochs + 1):
            if epoch == 80:
                self.lr = 0.01
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr
            elif epoch == 140:
                self.lr = 0.001
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr

            train_utils.train(self, epoch)
            acc, loss = train_utils.evaluate(self)
            # acc = round(acc.item(), 4)

            # Save best performance model
            if best_acc < acc:
                best_model_wts = copy.deepcopy(self.model.state_dict())
                best_epoch = epoch
                best_acc = acc
                best_loss = loss
        print(f"Saving best model: Loss={best_loss}, Acc={best_acc}, Ep={best_epoch}")
        # Save Best model
        torch.save(best_model_wts, self.checkpoint_path.format(epoch=best_epoch, acc=best_acc))

        # Record Metrics
        self.overall_log.append(
            {"Experiment": self.exp_name, "Epoch": best_epoch, "Test_Acc": round(best_acc * 100, 2),
             "Test_Loss": best_loss})
        train_utils.record_overall_metrics(self, ['Experiment', 'Epoch', "Test_Acc", "Test_Loss"])
Example #5
def fit():
    (
        train_img,
        test_img,
        train_labels,
        test_labels,
        train_orig_labels,
        test_orig_targets,
    ) = model_selection.train_test_split(IMAGES,
                                         LABELS_ENCODED,
                                         LABELS_NAMES,
                                         test_size=0.1,
                                         random_state=2020)

    train_dataset = OcrDataset(image_path=train_img,
                               labels=train_labels,
                               resize=(IMAGE_HEIGHT, IMAGE_WIDTH))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=BATCH_SIZE,
                                               num_workers=NUM_WORKERS,
                                               shuffle=True)

    test_dataset = OcrDataset(image_path=test_img,
                              labels=test_labels,
                              resize=(IMAGE_HEIGHT, IMAGE_WIDTH))

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=BATCH_SIZE,
                                              num_workers=NUM_WORKERS,
                                              shuffle=False)

    model = OcrModel_v0(num_characters=len(labels_encoded.classes_))
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.8,
                                                           patience=2,
                                                           verbose=True)

    for epoch in range(EPOCHS):
        train_loss = train(model, train_loader, optimizer)
        valid_preds, valid_loss = evaluate(model, test_loader)
        valid_final_preds = []

        for pred in valid_preds:
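            # decode this batch of raw model outputs into character strings using labels_encoded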
            # print(pred)
            cur_preds = decode_preds(pred, labels_encoded)
            valid_final_preds.extend(cur_preds)
        show_preds_list = list(zip(test_orig_targets, valid_final_preds))[1:3]
        pprint(show_preds_list)
        pprint("-" * 90)
        pprint(
            f"Epoch: {epoch} | Train loss = {train_loss} | Valid loss = {valid_loss} |"
        )
        pprint("-" * 90)
    def finetune_classifier(self, task, ittr="0"):
        print('-' * 50)
        print("Training task:\t", task)
        self.data_loaders = train_utils.CIFAR_dl_task(self, task,
                                                      self.per_task_norm)
        best_acc = 0.0

        # Setup Model
        model = self.backbone_model
        for param in model.parameters():
            param.requires_grad = False
        model.fc = nn.Linear(512, 5)
        self.model = model.to(self.device)
        self.lr = 0.01
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.lr,
                                         momentum=0.9,
                                         weight_decay=5e-4)

        print("Finetuning for", self.epochs, " Epochs")
        for epoch in range(1, self.epochs + 1):
            if epoch == 10:
                self.lr = self.lr * 0.1
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr
            elif epoch == 20:
                self.lr = self.lr * 0.1
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr

            train_utils.train(self, epoch)
            acc, loss = train_utils.evaluate(self)
            acc = round(acc.item(), 4)
            loss = round(loss, 4)

            # Save best performance model
            if best_acc < acc:
                best_model_wts = copy.deepcopy(self.model.state_dict())
                best_acc = acc
                best_loss = loss
                best_epoch = epoch

        # Save Best model
        torch.save(
            best_model_wts,
            self.classifier_path.format(exp=ittr,
                                        task=task,
                                        epoch=best_epoch,
                                        acc=round(best_acc * 100, 2)))
        # Record Metrics
        self.classifier_results.append({
            "Task": task,
            "Acc": round(best_acc * 100, 2),
            "Loss": best_loss
        })
Example #7
def generalization_test():
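    # measure copy-task generalization by evaluating at sequence lengths 10, 20, ..., 100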
    lengths = np.arange(10, 101, 10)
    costs = {'lstm': [], 'ntm_lstm': [], 'ntm_mlp': []}

    # Load trained models
    ntm_lstm = load_model('checkpoints/ntm/copy-batch-1125.0--LSTM.model', 'NTM')
    ntm_mlp = load_model('checkpoints/ntm/copy-batch-7500.0--MLP.model', 'NTM')
    lstm, _ = load_model_v2('checkpoints/lstm/copy-batch-1000000.0.model', model_type='LSTM')

    # Average over 20 runs
    for T in tqdm_notebook(lengths):
        dataloader = random_binary(max_seq_length=T, num_sequences=None, batch_Size=1, min_seq_length=T - 1)
        cost, _, _ = evaluate(ntm_lstm, dataloader, 1, 'LSTM', False, how_many=20)
        costs['ntm_lstm'].append(cost)

        cost, _, _ = evaluate(ntm_mlp, dataloader, 1, 'MLP', False, how_many=20)
        costs['ntm_mlp'].append(cost)

        dataloader = sequence_loader(100, batch_size=1, min_length=T - 1, max_length=T)
        cost, _, _ = evaluate_lstm_baseline_v2(lstm, dataloader, 1, False)
        costs['lstm'].append(cost)
    return costs, lengths
Example #8
def test(args):
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
        print('Done')
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)
        print('Done')

    ntokens = len(corpus.dictionary)
    batch_size = args.batchSize
    val_data = batchify(corpus.valid, batch_size, args)
    test_data = batchify(corpus.test, batch_size, args)

    if not os.path.isfile(args.weightFile):
        print('Pre-trained weight file does not exist. Please check the location: {}'.format(args.weightFile))
        exit()
    model, criterion, _, _ = model_load(args.weightFile)
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    # Run on validation data.
    val_loss = evaluate(args, model, criterion, val_data, ntokens, batch_size)
    print('=' * 89)
    print('| End of Validation | val loss {:5.2f} | val ppl {:8.2f}'.format(
        val_loss, math.exp(val_loss)))
    print('=' * 89)

    # Run on test data.
    test_loss = evaluate(args, model, criterion, test_data, ntokens, batch_size)
    print('=' * 89)
    print('| End of Testing | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
def train(model, trainloader, testloader, criterion, optimizer):

    best_accuracy = 0.0

    for epoch in range(20):  # loop over the dataset multiple times

        running_loss = 0.0
        index = 0
        for i, data in enumerate(trainloader, 0):

            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs.double())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            index += 1
            print(f"Epoch {epoch} (Index {index}/{len(trainloader)} Loss: {loss.item()})")

        test_accuracy = evaluate(model.double(), testloader, "Test Accuracy")

        if test_accuracy >= best_accuracy:
            best_accuracy = test_accuracy
            save_model(model, optimizer, name="models/char_model.pth")

        print("loss: ", running_loss / len(trainloader))

    print("Finished Training")
Example #10
def adam_evaluate(neurons, lr, lr_decay, epochs, batch_size):
    # The Gaussian Process' space is continuous, so we need to round some values
    neurons, epochs, batch_size = map(lambda x: int(round(x)),
                                      (neurons, epochs, batch_size))

    # K-fold stratified cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True)

    scores = []
    for train_index, test_index in skf.split(features, labels):
        x_train = [features[i] for i in train_index]
        x_test = [features[i] for i in test_index]
        y_train = to_categorical([labels[i] for i in train_index])
        y_test = to_categorical([labels[i] for i in test_index])

        # Create and fit the LSTM network
        model = get_spatial_model(layers=[neurons],
                                  lr=lr,
                                  lr_decay=lr_decay,
                                  input_shape=(len(x_train[0]), ))
        for _ in range(epochs):
            for X, Y in zip(x_train, y_train):
                model.train_on_batch(np.array([X]), np.array([Y]))

        # Final evaluation of the model
        evals = train_utils.evaluate(model, x_test, y_test)
        losses = [x[0] for x in evals]
        accuracies = [x[1] for x in evals]
        scores.append([np.mean(losses), np.mean(accuracies)])

    losses = [x[0] for x in scores]
    accuracies = [x[1] for x in scores]

    print("Test loss and Standard dev: %.2f (+/- %.2f)" %
          (np.mean(losses), np.std(losses)))
    print("Test accuracy and Standard dev: %.2f%% (+/- %.2f%%)" %
          (np.mean(accuracies) * 100, np.std(accuracies) * 100))

    return np.mean(accuracies)
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    train_dataset = VOCSegmentation(args.data_path,
                                    transforms=get_transform(train=True),
                                    txt_name="train.txt")

    val_dataset = VOCSegmentation(args.data_path,
                                  transforms=get_transform(train=False),
                                  txt_name="val.txt")

    num_workers = 8
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               shuffle=True,
                                               pin_memory=True,
                                               collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(aux=args.aux, num_classes=num_classes)
    model.to(device)

    params_to_optimize = [
        {"params": [p for p in model.backbone.parameters() if p.requires_grad]},
        {"params": [p for p in model.classifier.parameters() if p.requires_grad]}
    ]

    if args.aux:
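        # the auxiliary classifier head gets 10x the base learning rate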
        params = [p for p in model.aux_classifier.parameters() if p.requires_grad]
        params_to_optimize.append({"params": params, "lr": args.lr * 10})

    optimizer = torch.optim.SGD(
        params_to_optimize,
        lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay
    )

    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda x: (1 - x / args.epochs) ** 0.9)
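    # the lambda above implements a polynomial ("poly") decay: the base lr is
    # scaled by (1 - epoch / epochs) ** 0.9; lr_scheduler.step() is called once per epoch below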

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        warmup=True, print_freq=args.print_freq)

        lr_scheduler.step()

        confmat = evaluate(model, val_loader, device=device, num_classes=num_classes)
        print(confmat)

        save_file = {"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch,
                     "args": args}
        torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
X_train, Y_train, X_test, Y_test = read_cifar_10(image_width=INPUT_WIDTH, image_height=INPUT_HEIGHT)

X = tf.placeholder(tf.float32, [None, 32, 32, 3]) 
Y = tf.placeholder(tf.float32, [None, 10])
dropout_rate = tf.placeholder("float")

fix_model = AlexNet_cifar100(X, qnum=2, dropout_keep_prob=dropout_rate)
param_list = fix_model.parameter_list

tr_model = Train_Alexnet(X, param_list, dropout_keep_prob=dropout_rate)
var_all = tf.trainable_variables(scope=None)

hypothesis = tr_model.hypothesis

correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

with tf.device('/GPU:0'): 
    with tf.Session() as sess:
        with tf.device('/cpu:0'):
            sess.run(tf.global_variables_initializer())
                        
            loader = tf.train.Saver(var_all)
            loader.restore(sess, tf.train.latest_checkpoint(CHECKPOINT))

        final_train_accuracy = tu.evaluate(sess, accuracy, X, Y, dropout_rate, X_train, Y_train, BATCH_SIZE)
        final_test_accuracy = tu.evaluate(sess, accuracy, X, Y, dropout_rate, X_test, Y_test, BATCH_SIZE)

        print('Train Accuracy = {:.3f}'.format(final_train_accuracy))
        print('Test Accuracy = {:.3f}'.format(final_test_accuracy))
        print("")
Example #13
def main():

    parser = BasicConfig()

    model_type = vars(parser.parse_known_args()[0])["model_type"].lower()
    model_class, configs = MODEL_CLASSES[model_type]
    args = configs(parser)
    args = checkoutput_and_setcuda(args)
    logger = init_logger(args)
    logger.info('Dataset collected from {}'.format(args.data_dir))
    # Set seed
    set_seed(args)
    processor = UbuntuCorpus(args)

    logger.info(args)

    model = model_class(args=args)

    # model.to(args.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Training
    if args.do_train:
        args.train_batch_size = args.per_gpu_train_batch_size * max(
            1, args.n_gpu)
        train_dataloader = processor.create_batch(data_type="train")

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        eval_dataloader = processor.create_batch(data_type="eval")

        args.logging_steps = len(
            train_dataloader) // args.gradient_accumulation_steps // 5
        args.valid_steps = len(
            train_dataloader) // args.gradient_accumulation_steps

        trainer_op = trainer(args=args,
                             model=model,
                             optimizer=optimizer,
                             train_iter=train_dataloader,
                             eval_iter=eval_dataloader,
                             logger=logger,
                             num_epochs=args.num_train_epochs,
                             save_dir=args.output_dir,
                             log_steps=args.logging_steps,
                             valid_steps=args.valid_steps,
                             valid_metric_name="+R10@1")
        trainer_op.train()
    print('training complete!')
    # Test
    if args.do_test:
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        test_dataloader = processor.create_batch(data_type="eval")

        trainer_op = trainer(args=args,
                             model=model,
                             optimizer=optimizer,
                             train_iter=None,
                             eval_iter=None,
                             logger=logger,
                             num_epochs=args.num_train_epochs,
                             save_dir=args.output_dir,
                             log_steps=None,
                             valid_steps=None,
                             valid_metric_name="+R10@1")

        best_model_file = os.path.join(args.output_dir,
                                       args.fusion_type + "_best.model")
        best_train_file = os.path.join(args.output_dir,
                                       args.fusion_type + "_best.train")

        trainer_op.load(best_model_file, best_train_file)

        evaluate(args, trainer_op.model, test_dataloader, logger)
    print('test complete')
    # TODO: Infer case study
    if args.do_infer:
        # Don't know what to write here; too lazy to think of something.
        pass
Example #14
                                      decay_thresh=0.99)
        elif args.model == "ridge_regression":
            monitor = ProgressMonitor(init_lr=args.learning_rate,
                                      lr_decay_fac=2.0,
                                      min_lr=0.00001,
                                      min_metric_better=True,
                                      decay_thresh=0.99)
        else:
            raise Exception("model not supported!")
        for epoch in range(args.epoch):
            # train for one epoch
            loss_per_step = train(args, model, epoch, train_loader, optimizer,
                                  quantizer, kernel_approx)
            train_loss += loss_per_step
            # evaluate and save evaluate metric
            metric, monitor_signal = evaluate(args, model, epoch, val_loader,
                                              quantizer, kernel_approx)
            eval_metric.append(metric)
            monitor_signal_history.append(monitor_signal)

            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            np.savetxt(args.save_path + "/train_loss.txt", train_loss)
            np.savetxt(args.save_path + "/eval_metric.txt", eval_metric)
            np.savetxt(args.save_path + "/monitor_signal.txt",
                       monitor_signal_history)
            if not args.fixed_epoch_number:
                print("using early stopping on lr")
                early_stop = monitor.end_of_epoch(monitor_signal, model,
                                                  optimizer, epoch)
                if early_stop:
                    break
Example #15
def train(model,
          criterion,
          optimizer,
          optimizer_fp,
          train_iterator,
          n_epochs,
          n_batches,
          val_iterator,
          validation_step,
          n_validation_batches,
          saving_step=None,
          lr_scheduler=None):

    all_losses = []
    all_models = []

    is_reduce_on_plateau = isinstance(lr_scheduler, ReduceLROnPlateau)
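    # ReduceLROnPlateau is stepped on validation accuracy; any other scheduler here is
    # a callable applied to the optimizer at every training step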

    running_loss = 0.0
    running_accuracy = 0.0
    running_top5_accuracy = 0.0
    start = time.time()
    model.train()

    for epoch in range(0, n_epochs):
        for step, (x_batch, y_batch) in enumerate(train_iterator,
                                                  1 + epoch * n_batches):

            if lr_scheduler is not None and not is_reduce_on_plateau:
                optimizer = lr_scheduler(optimizer, step)

            batch_loss, batch_accuracy, batch_top5_accuracy = optimization_step(
                model, criterion, optimizer, optimizer_fp, x_batch, y_batch)
            running_loss += batch_loss
            running_accuracy += batch_accuracy
            running_top5_accuracy += batch_top5_accuracy

            if step % validation_step == 0:
                model.eval()
                test_loss, test_accuracy, test_top5_accuracy = evaluate(
                    model, criterion, val_iterator, n_validation_batches)
                end = time.time()

                print(
                    '{0:.2f}  {1:.3f} {2:.3f}  {3:.3f} {4:.3f}  {5:.3f} {6:.3f}  {7:.3f}'
                    .format(step / n_batches, running_loss / validation_step,
                            test_loss, running_accuracy / validation_step,
                            test_accuracy,
                            running_top5_accuracy / validation_step,
                            test_top5_accuracy, end - start))
                all_losses += [
                    (step / n_batches, running_loss / validation_step,
                     test_loss, running_accuracy / validation_step,
                     test_accuracy, running_top5_accuracy / validation_step,
                     test_top5_accuracy)
                ]

                if is_reduce_on_plateau:
                    lr_scheduler.step(test_accuracy)

                running_loss = 0.0
                running_accuracy = 0.0
                running_top5_accuracy = 0.0
                start = time.time()
                model.train()

            if saving_step is not None and step % saving_step == 0:

                print('saving')
                model.cpu()
                clone = copy.deepcopy(model)
                all_models += [clone.state_dict()]
                model.cuda()

    return all_losses, all_models
Example #16
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    # file used to record training and validation info
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> train.txt
    train_dataset = VOCSegmentation(args.data_path,
                                    year="2012",
                                    transforms=get_transform(train=True),
                                    txt_name="train.txt")

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> val.txt
    val_dataset = VOCSegmentation(args.data_path,
                                  year="2012",
                                  transforms=get_transform(train=False),
                                  txt_name="val.txt")

    num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(aux=args.aux, num_classes=num_classes)
    model.to(device)

    params_to_optimize = [{
        "params": [p for p in model.backbone.parameters() if p.requires_grad]
    }, {
        "params":
        [p for p in model.classifier.parameters() if p.requires_grad]
    }]

    if args.aux:
        params = [
            p for p in model.aux_classifier.parameters() if p.requires_grad
        ]
        params_to_optimize.append({"params": params, "lr": args.lr * 10})

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # create the learning-rate schedule; it is updated once per step (not per epoch)
    lr_scheduler = create_lr_scheduler(optimizer,
                                       len(train_loader),
                                       args.epochs,
                                       warmup=True)

    # import matplotlib.pyplot as plt
    # lr_list = []
    # for _ in range(args.epochs):
    #     for _ in range(len(train_loader)):
    #         lr_scheduler.step()
    #         lr = optimizer.param_groups[0]["lr"]
    #         lr_list.append(lr)
    # plt.plot(range(len(lr_list)), lr_list)
    # plt.show()

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        mean_loss, lr = train_one_epoch(model,
                                        optimizer,
                                        train_loader,
                                        device,
                                        epoch,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)

        confmat = evaluate(model,
                           val_loader,
                           device=device,
                           num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        # write into txt
        with open(results_file, "a") as f:
            # record each epoch's train_loss, lr, and validation metrics
            train_info = f"[epoch: {epoch}]\n" \
                         f"train_loss: {mean_loss:.4f}\n" \
                         f"lr: {lr:.6f}\n"
            f.write(train_info + val_info + "\n\n")

        save_file = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
            "epoch": epoch,
            "args": args
        }
        if args.amp:
            save_file["scaler"] = scaler.state_dict()
        torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
Example #17
def run(learning_rate, batch_size, cuda, memory_feature_size, num_inputs,
        num_outputs, controller_size, controller_type, controller_layers,
        memory_size, integer_shift, checkpoint_interval, total_batches,
        model_file):

    # model_file = "checkpoints/ntm/copy-batch-5120.0--LSTM.model"

    # Seeding
    SEED = 1000
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Model Loading
    if model_file == 'None':
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda)
        # Constants for keeping track
        total_examples = 0
        losses = []
        costs = []
        seq_lens = []
    else:
        from_before = torch.load(model_file)
        state_dict = from_before['state_dict']
        controller_type = from_before['controller_type']
        num_inputs = from_before['num_inputs']
        num_outputs = from_before['num_outputs']
        controller_size = from_before['controller_size']
        controller_layers = from_before['controller_layers']
        memory_size = from_before['memory_size']
        memory_feature_size = from_before['memory_feature_size']
        integer_shift = from_before['integer_shift']
        batch_size = from_before['batch_size']
        cuda = from_before['cuda']
        saved_biases = True
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda,
                  saved_biases=saved_biases)
        ntm.load_state_dict(state_dict)
        losses = from_before['loss']
        costs = from_before['cost']
        seq_lens = from_before['seq_lengths']
        total_examples = from_before['total_examples']

    # Dataset creation
    training_dataset = random_binary(max_seq_length=20,
                                     num_sequences=500,
                                     vector_dim=8,
                                     batch_Size=batch_size)
    testing_dataset = random_binary(max_seq_length=10,
                                    num_sequences=50,
                                    vector_dim=8,
                                    batch_Size=batch_size)

    # Optimizer type and loss function
    # optimizer = torch.optim.Adam(ntm.parameters(), lr=learning_rate)
    optimizer = torch.optim.RMSprop(ntm.parameters(),
                                    lr=learning_rate,
                                    momentum=0.9,
                                    alpha=0.95)
    criterion = torch.nn.BCELoss()

    np.random.seed(
        SEED
    )  # reset training seed to ensure that batches remain the same between runs!
    for batch in training_dataset:

        optimizer.zero_grad()
        # Initialize head weights and memory to zero
        ntm.init_headweights()
        ntm.init_memory()

        batch = Variable(batch)
        if cuda:
            batch = batch.cuda()
        next_r = ntm.read_head.create_state(batch_size)
        if controller_type == 'LSTM':
            lstm_h, lstm_c = ntm.controller.create_state(batch_size)

        #  Read batch in
        for i in range(batch.size()[2]):
            x = batch[:, :, i]
            if controller_type == 'LSTM':
                _, next_r, lstm_h, lstm_c = ntm.forward(x=x,
                                                        r=next_r,
                                                        lstm_h=lstm_h,
                                                        lstm_c=lstm_c)
            elif controller_type == 'MLP':
                _, next_r = ntm.forward(x=x, r=next_r)

        # Output response
        x = Variable(torch.zeros(batch.size()[0:2]))
        output = Variable(torch.zeros(batch[:, :, :-1].size()))
        if cuda:
            x = x.cuda()
            output = output.cuda()

        for i in range(output.size()[2]):
            if controller_type == 'LSTM':
                output[:, :,
                       i], next_r, lstm_h, lstm_c = ntm.forward(x=x,
                                                                r=next_r,
                                                                lstm_h=lstm_h,
                                                                lstm_c=lstm_c)
            elif controller_type == 'MLP':
                output[:, :, i], next_r = ntm.forward(x=x, r=next_r)

        loss = criterion(output, batch[:, :, :-1])
        loss.backward(retain_graph=True)
        optimizer.step()

        print("Current Batch Loss:", round(loss.data[0], 3))
        total_examples += batch_size

        # The cost is the number of error bits per sequence
        binary_output = output.clone().data
        binary_output = binary_output > 0.5
        cost = torch.sum(
            torch.abs(binary_output.float() - batch.data[:, :, :-1]))

        losses += [loss.data[0]]
        costs += [cost / batch_size]
        seq_lens += [batch.size(2)]

        # Checkpoint model
        if (checkpoint_interval != 0) and (total_examples % checkpoint_interval
                                           == 0):
            print("Saving Checkpoint!")
            save_checkpoint(ntm, total_examples / batch_size, losses, costs,
                            seq_lens, total_examples, controller_type,
                            num_inputs, num_outputs, controller_size,
                            controller_layers, memory_size,
                            memory_feature_size, integer_shift, batch_size,
                            cuda)

            # Evaluate model on this saved checkpoint
            test_cost, prediction, input = evaluate(
                model=ntm,
                testset=testing_dataset,
                batch_size=batch_size,
                memory_feature_size=memory_feature_size,
                controller_type=controller_type,
                cuda=cuda)
            print("Total Test Cost (in bits per sequence):", test_cost)
            print("Example of Input/Output")
            print("prediction:", prediction[0])
            print("Input:", input[0])

        if total_examples / checkpoint_interval >= total_batches:
            break
Example #18
def trainEvalLM(args):
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    if torch.cuda.is_available():
        args.cuda = True

    ntokens = len(corpus.dictionary)
    eval_batch_size = 10
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)

    # Build the model and loss function
    model = lmModel.RNNModel(args.model,
                             ntokens,
                             args.emsize,
                             args.nhid,
                             args.nlayers,
                             args.dropout,
                             args.tied,
                             g=args.g,
                             k=args.k)
    criterion = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    #compute network parameters
    params = list(model.parameters())
    total_params = np.sum([np.prod(p.size()) for p in params])
    print(
        '\033[1;32;40mTotal parameters (in million):\033[0m\033[1;31;40m {:0.2f} \033[0m\n'
        .format(total_params / 1e6, 2))

    optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    start_epoch = 1
    if args.resume:
        print('Resuming model ...')
        model, criterion, optimizer, start_epoch = model_load(args.resume)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropout = args.dropout

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        #Create folder for saving model and log files
        args.saveDir += '_' + args.model
        # =====================
        if not os.path.isdir(args.saveDir):
            os.mkdir(args.saveDir)

        save_str = 'nl_' + str(args.nlayers) + '_nh_' + str(
            args.nhid) + '_g_' + str(args.g) + '_k_' + str(args.k)
        args.save = args.saveDir + '/model_' + save_str + '.pt'

        logFileLoc = args.saveDir + '/logs_' + save_str + '.txt'
        logger = open(logFileLoc, 'w')
        logger.write(str(args))
        logger.write('\n Total parameters (in million): {:0.2f}'.format(
            total_params / 1e6, 2))
        logger.write('\n\n')
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'ppl (tr)', 'ppl (val)'))
        logger.flush()

        best_val_loss = []
        stored_loss = 100000000
        # Loop over epochs.
        for epoch in range(start_epoch, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train(args, model, criterion, optimizer, epoch,
                               train_data, ntokens)

            ### TRAIN WITH ASGD
            if 't0' in optimizer.param_groups[0]:
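                # 't0' only appears once the optimizer has been switched to ASGD;
                # evaluate with the averaged weights ('ax') and restore the raw weights afterwards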
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss = evaluate(args, model, criterion, val_data, ntokens,
                                    eval_batch_size)

                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss,
                        math.exp(val_loss)))
                print('-' * 89)

                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss,
                              math.exp(train_loss), math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving Averaged (new best validation)')
                    stored_loss = val_loss

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()

            else:
                val_loss = evaluate(args, model, criterion, val_data, ntokens,
                                    eval_batch_size)

                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss,
                        math.exp(val_loss)))
                print('-' * 89)

                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss,
                              math.exp(train_loss), math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if 't0' not in optimizer.param_groups[0] and (
                        len(best_val_loss) > args.nonmono
                        and val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(),
                                                 lr=args.lr,
                                                 t0=0,
                                                 lambd=0.,
                                                 weight_decay=args.wdecay)
                best_val_loss.append(val_loss)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
Example #19
    steps = tqdm(range(1, args.total_steps + 1))
    for step in steps:
        steps.set_description(
            f'Best validation accuracy: {best_validation_accuracy:.3f}')
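        # cycle both dataloaders indefinitely: restart an iterator once it is exhausted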
        try:
            supervised_batch = next(supervised_train_dataiter)
        except StopIteration:
            supervised_train_dataiter = iter(supervised_train_dataloader)
            supervised_batch = next(supervised_train_dataiter)

        try:
            unsupervised_batch = next(unsupervised_dataiter)
        except StopIteration:
            unsupervised_dataiter = iter(unsupervised_dataloader)
            unsupervised_batch = next(unsupervised_dataiter)

        optimizer.zero_grad()
        total_loss, supervised_loss, unsupervised_loss = train_utils.compute_loss(
            device, model, supervised_batch, unsupervised_batch,
            supervised_criterion, unsupervised_criterion, step, args)
        total_loss.backward()
        optimizer.step()

        if not step % args.evaluate_every:
            accuracy = train_utils.evaluate(device, model,
                                            supervised_validation_dataloader)
            if accuracy > best_validation_accuracy:
                best_validation_accuracy = accuracy
                torch.save(model.state_dict(),
                           os.path.join(save_path, 'model.pt'))
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    # using compute_mean_std.py
    mean = (0.709, 0.381, 0.224)
    std = (0.127, 0.079, 0.043)

    # file used to record training and validation info
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    train_dataset = DriveDataset(args.data_path,
                                 train=True,
                                 transforms=get_transform(train=True, mean=mean, std=std))

    val_dataset = DriveDataset(args.data_path,
                               train=False,
                               transforms=get_transform(train=False, mean=mean, std=std))

    num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               shuffle=True,
                                               pin_memory=True,
                                               collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=num_classes)
    model.to(device)

    params_to_optimize = [p for p in model.parameters() if p.requires_grad]

    optimizer = torch.optim.SGD(
        params_to_optimize,
        lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay
    )

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # create the learning-rate schedule; it is updated once per step (not per epoch)
    lr_scheduler = create_lr_scheduler(optimizer, len(train_loader), args.epochs, warmup=True)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    best_dice = 0.
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        mean_loss, lr = train_one_epoch(model, optimizer, train_loader, device, epoch, num_classes,
                                        lr_scheduler=lr_scheduler, print_freq=args.print_freq, scaler=scaler)

        confmat, dice = evaluate(model, val_loader, device=device, num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        print(f"dice coefficient: {dice:.3f}")
        # write into txt
        with open(results_file, "a") as f:
            # record each epoch's train_loss, lr, and validation metrics
            train_info = f"[epoch: {epoch}]\n" \
                         f"train_loss: {mean_loss:.4f}\n" \
                         f"lr: {lr:.6f}\n" \
                         f"dice coefficient: {dice:.3f}\n"
            f.write(train_info + val_info + "\n\n")

        if args.save_best is True:
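            # only keep checkpoints that improve the best dice score so far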
            if best_dice < dice:
                best_dice = dice
            else:
                continue

        save_file = {"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch,
                     "args": args}
        if args.amp:
            save_file["scaler"] = scaler.state_dict()

        if args.save_best is True:
            torch.save(save_file, "save_weights/best_model.pth")
        else:
            torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
Example #21
                                                   y_source_domain)
        loss_target_domain = domain_loss_criterion(target_domain_pred,
                                                   y_target_domain)
        domain_loss__ = loss_source_domain + loss_target_domain

        source_domain_loss.append(loss_source_domain)
        target_domain_loss.append(loss_target_domain)
        batch.append(batch__)

        loss = loss_source_label + domain_loss__
        loss.backward()
        optimizer.step()

        print(f'[{batch__ + 1}/{max_batches}] '
              f'class_loss: {loss_source_label.item():.4f} '
              f'source_domain_loss: {loss_source_domain.item():.4f} '
              f't_domain_loss: {loss_target_domain.item():.4f} '
              f'lambda: {completion_lambda:.3f} ')

        writer.add_scalar('Class Loss', loss_source_label, batch__)
        writer.add_scalars(
            f'Domain_Loss', {
                'Source Loss': source_domain_loss[batch__],
                'Target Loss': target_domain_loss[batch__]
            }, batch__)

    acc_source = evaluate(source_eval_dataloader)
    writer.add_scalars(f'Source Domain Accuracy', {'Source': acc_source},
                       epoch)
    i += 1
writer.flush()
def run_training(model, cfg, test_features, test_labels, train_data,
                 train_labels, val_data, val_labels):
    model_run_path = MODEL_PATH + "/" + strftime("%Y-%m-%d_%H:%M:%S", gmtime())
    model_weights_path = "{}/{}".format(model_run_path, cfg.model_weights_name)
    model_config_path = "{}/{}".format(model_run_path, cfg.model_config_name)
    result_path = "{}/result.txt".format(model_run_path)
    os.makedirs(model_run_path, exist_ok=True)
    """Choosing hardware"""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == "cuda":
        print(
            "Using GPU. Setting default tensor type to torch.cuda.FloatTensor")
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    else:
        print("Using CPU. Setting default tensor type to torch.FloatTensor")
        torch.set_default_tensor_type("torch.FloatTensor")

    json.dump(cfg.to_json(), open(model_config_path, "w"))
    """Converting model to specified hardware and format"""
    model.float()
    model = model.to(device)
    """Defining loss and optimizer"""
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    criterion = torch.nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    """Creating data generators"""
    test_iterator = BatchIterator(test_features, test_labels, 100)
    train_iterator = BatchIterator(train_data, train_labels, cfg.batch_size)
    validation_iterator = BatchIterator(val_data, val_labels, 100)

    train_loss = 999
    best_val_loss = 999
    train_acc = 0
    epochs_without_improvement = 0
    """Running training"""
    for epoch in range(cfg.n_epochs):
        train_iterator.shuffle()
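        # simple early stopping: quit once cfg.patience consecutive epochs pass without val-loss improvement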
        if epochs_without_improvement == cfg.patience:
            break

        val_loss, val_acc, val_weighted_acc, conf_mat = evaluate(
            model, validation_iterator, criterion)

        if val_loss < best_val_loss:
            torch.save(model.state_dict(), model_weights_path)
            best_val_loss = val_loss
            best_val_acc = val_acc
            best_val_weighted_acc = val_weighted_acc
            best_conf_mat = conf_mat
            epochs_without_improvement = 0
            log_success(
                " Epoch: {} | Val loss improved to {:.4f} | val acc: {:.3f} | weighted val acc: {:.3f} | train loss: {:.4f} | train acc: {:.3f} | saved model to {}."
                .format(epoch, best_val_loss, best_val_acc,
                        best_val_weighted_acc, train_loss, train_acc,
                        model_weights_path))

        train_loss, train_acc, train_weighted_acc, _ = train(
            model, train_iterator, optimizer, criterion, cfg.reg_ratio)

        epochs_without_improvement += 1

        if not epoch % 1:
            log(
                f'| Epoch: {epoch+1} | Val Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}% '
                f'| Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.3f}%',
                cfg.verbose)

    model.load_state_dict(torch.load(model_weights_path))
    test_loss, test_acc, test_weighted_acc, conf_mat = evaluate(
        model, test_iterator, criterion)

    result = f'| Epoch: {epoch+1} | Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Weighted Test Acc: {test_weighted_acc*100:.2f}%\n Confusion matrix:\n {conf_mat}'
    log_major("Train acc: {}".format(train_acc))
    log_major(result)
    log_major("Hyperparameters:{}".format(cfg.to_json()))
    with open(result_path, "w") as file:
        file.write(result)
                         shuffle=True,
                         collate_fn=pad_collate)
model = BilstmAspectAttPool(Configs1())
initialize_weights(model)
print(model)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

best_valid_loss = float('inf')
for epoch in range(EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_loader, optimizer, criterion, CLIP, device)
    valid_loss = evaluate(model, test_loader, criterion, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_name)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(
        f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}'
    )
    print(
        f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}'
    )
            ".pth", '_{}.pth'.format(args.load_epoch))
        model.load_state_dict(torch.load(model_dir))
        model.eval()
        results = pd.read_csv(pretrained_model_dir.replace(".pth", ".csv"))
        results = {
            col_name: list(results[col_name].values)
            for col_name in results.columns
        }
        stp = 1 + len(results['epochs'])
        if gan_cfg.evaluate:
            images_dir = os.path.join(saving_dir, 'images')
            if not os.path.exists(images_dir):
                os.makedirs(images_dir)
            pcc, ssim, mse, is_mean = evaluate(model,
                                               dataloader_valid,
                                               norm=True,
                                               mean=gan_cfg.mean,
                                               std=gan_cfg.std,
                                               path=images_dir)
            print("Mean PCC:", pcc)
            print("Mean SSIM:", ssim)
            print("Mean MSE:", mse)
            print("IS mean", is_mean)
            exit(0)
    else:
        logging.info('Initialize')
        stp = 1

    results = dict(epochs=[], loss_encoder=[], loss_decoder=[])

    # An optimizer for each of the sub-networks, so we can selectively backpropagate
    optimizer_encoder = torch.optim.RMSprop(params=model.encoder.parameters(),
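# --- Illustrative sketch (not part of the original example, which is truncated above) ---
# One optimizer per sub-network lets each loss update only the parameters it concerns.
# A minimal, hypothetical example (the model, losses and learning rate are assumptions):
import torch
import torch.nn as nn


class TinyAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 4)
        self.decoder = nn.Linear(4, 8)

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z), z


model_sketch = TinyAutoencoder()
enc_optimizer = torch.optim.RMSprop(model_sketch.encoder.parameters(), lr=3e-4)
dec_optimizer = torch.optim.RMSprop(model_sketch.decoder.parameters(), lr=3e-4)

x = torch.randn(16, 8)
recon, z = model_sketch(x)
loss_decoder = nn.functional.mse_loss(recon, x)  # reconstruction loss
loss_encoder = z.pow(2).mean()                   # stand-in regulariser on the latent code

enc_optimizer.zero_grad()
dec_optimizer.zero_grad()
(loss_encoder + loss_decoder).backward()         # one backward pass fills both sets of grads
enc_optimizer.step()                             # each optimizer updates only its own sub-network
dec_optimizer.step()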
Beispiel #25
    args = parser.parse_args()
    cifar_dir = args.cifar_root
    fig_path = args.fig_path
    validation_split = args.val_split
    batch_size = args.batch_size
    epochs = args.epochs
    weight_path = args.weight_path
    weight_decay = args.weight_decay
    lr = args.lr

    SEED = args.seed # set random seed (default as 1234)

    # split train, val, test from `get_data` function
    train_loader, val_loader, test_loader = get_data(cifar_dir=cifar_dir, batch_size=batch_size, augment=True, validation_split=validation_split)

    # load model
    model = VGG_lite()
    # define loss
    loss = nn.CrossEntropyLoss()
    # train the model
    model, history = train(model, train_loader, val_loader, epochs, loss, batch_size, optimizer='adam', weight_decay=weight_decay, lr=lr)

    # save the model according to `weight_path` from the parser (default './weights/final.pth')
    torch.save(model.state_dict(), weight_path)

    plot_history(history, fig_path) # save figures

    acc, cm, cm_norm = evaluate(model, test_loader) # evaluate trained model
    plot_cm(cm, cm_norm, fig_path) # save confusion matrix figures
    print('Test Accuracy: {}%'.format(round(acc*100, 4))) # print the model test accuracy
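# --- Illustrative sketch (not part of the original example) ---
# `evaluate` above is assumed to return the test accuracy, a confusion matrix and its
# row-normalized version. One way to compute those quantities (hypothetical, using
# scikit-learn rather than the original helper) is:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

y_true = np.array([0, 1, 2, 2, 1, 0])
y_pred = np.array([0, 2, 2, 2, 1, 0])

acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)  # normalize per true class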
def main(args):
    init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    mean = (0.709, 0.381, 0.224)
    std = (0.127, 0.079, 0.043)

    # file used to save the training/validation results of each run
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_root = args.data_path
    # check data root
    if not os.path.exists(os.path.join(data_root, "DRIVE")):
        raise FileNotFoundError(
            "DRIVE does not exist in path: '{}'.".format(data_root))

    train_dataset = DriveDataset(args.data_path,
                                 train=True,
                                 transforms=get_transform(train=True,
                                                          mean=mean,
                                                          std=std))

    val_dataset = DriveDataset(args.data_path,
                               train=False,
                               transforms=get_transform(train=False,
                                                        mean=mean,
                                                        std=std))

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        test_sampler = torch.utils.data.SequentialSampler(val_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=True)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=train_dataset.collate_fn)

    print("Creating model")
    # create model; num_classes equals background + foreground classes
    model = create_model(num_classes=num_classes)
    model.to(device)

    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params_to_optimize = [
        p for p in model_without_ddp.parameters() if p.requires_grad
    ]

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # create the learning-rate update strategy; here it is updated every step (not every epoch)
    lr_scheduler = create_lr_scheduler(optimizer,
                                       len(train_data_loader),
                                       args.epochs,
                                       warmup=True)
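    # --- Illustrative sketch (not part of the original) ---
    # create_lr_scheduler is stepped once per batch. As an assumed example of such a
    # per-step schedule (linear warmup followed by polynomial decay), one could build
    # it on LambdaLR; this is a sketch, not the repo's actual implementation:
    def warmup_poly_lr(optim_, steps_per_epoch, epochs, warmup_epochs=1, power=0.9):
        warmup_steps = warmup_epochs * steps_per_epoch
        total_steps = epochs * steps_per_epoch

        def f(step):
            if step < warmup_steps:
                return (step + 1) / warmup_steps                                # linear warmup
            progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
            return (1 - progress) ** power                                      # polynomial decay

        return torch.optim.lr_scheduler.LambdaLR(optim_, lr_lambda=f)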

    # if --resume is passed (the path to weights from a previous run), continue training from that checkpoint
    if args.resume:
        # If map_location is missing, torch.load will first load the module to CPU
        # and then copy each parameter to where it was saved,
        # which would result in all processes on the same machine using the same set of devices.
        checkpoint = torch.load(
            args.resume, map_location='cpu')  # load the previously saved checkpoint (including optimizer and lr-scheduler state)
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    if args.test_only:
        confmat = evaluate(model,
                           val_data_loader,
                           device=device,
                           num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        return
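    # --- Illustrative sketch (not part of the original) ---
    # The printed confusion matrix can be reduced to the usual segmentation metrics.
    # Assuming `mat` is a (num_classes x num_classes) tensor of pixel counts, global
    # accuracy and per-class IoU could be computed like this (a sketch, not the repo's
    # actual ConfusionMatrix helper):
    def confmat_metrics(mat):
        mat = mat.float()
        acc_global = torch.diag(mat).sum() / mat.sum()
        iou = torch.diag(mat) / (mat.sum(1) + mat.sum(0) - torch.diag(mat))
        return acc_global, iou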

    best_dice = 0.
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        mean_loss, lr = train_one_epoch(model,
                                        optimizer,
                                        train_data_loader,
                                        device,
                                        epoch,
                                        num_classes,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)

        confmat, dice = evaluate(model,
                                 val_data_loader,
                                 device=device,
                                 num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        print(f"dice coefficient: {dice:.3f}")

        # perform write operations only on the main process
        if args.rank in [-1, 0]:
            # write into txt
            with open(results_file, "a") as f:
                # record the train_loss, lr and validation metrics for each epoch
                train_info = f"[epoch: {epoch}]\n" \
                             f"train_loss: {mean_loss:.4f}\n" \
                             f"lr: {lr:.6f}\n" \
                             f"dice coefficient: {dice:.3f}\n"
                f.write(train_info + val_info + "\n\n")

        if args.save_best is True:
            # only keep checkpoints that improve the validation dice score
            if best_dice < dice:
                best_dice = dice
            else:
                continue

        if args.output_dir:
            # save weights only on the master node
            save_file = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch
            }
            if args.amp:
                save_file["scaler"] = scaler.state_dict()

            if args.save_best is True:
                save_on_master(save_file,
                               os.path.join(args.output_dir, 'best_model.pth'))
            else:
                save_on_master(
                    save_file,
                    os.path.join(args.output_dir,
                                 'model_{}.pth'.format(epoch)))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
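# --- Illustrative sketch (not part of the original example) ---
# The training loop above tracks a Dice coefficient on the validation set. For a
# binary segmentation mask, Dice is 2*|A∩B| / (|A| + |B|); a minimal version (an
# assumption about what `evaluate` measures, not its actual code) could be:
import torch


def dice_coefficient(pred, target, eps=1e-6):
    """pred, target: binary tensors of the same shape (0/1 foreground masks)."""
    pred = pred.float().flatten()
    target = target.float().flatten()
    intersection = (pred * target).sum()
    return (2 * intersection + eps) / (pred.sum() + target.sum() + eps)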
    linguistic_model = AttentionModel(linguistic_cfg)
    linguistic_model.float().to(device)

    try:
        linguistic_model.load_state_dict(torch.load(args.linguistic_model))
    except Exception:
        print(
            "Failed to load model from {} without device mapping. Trying to load with mapping to {}"
            .format(args.linguistic_model, device))
        linguistic_model.load_state_dict(
            torch.load(args.linguistic_model, map_location=device))
    """Defining loss and optimizer"""
    criterion = torch.nn.CrossEntropyLoss().to(device)

    test_loss, test_acc, test_weighted_acc, conf_mat = evaluate(
        acoustic_model, test_iterator_acoustic, criterion)
    print("Acoustic: loss: {}, acc: {}. unweighted acc: {}, conf_mat: \n{}".
          format(test_loss, test_acc, test_weighted_acc, conf_mat))

    test_loss, test_acc, test_weighted_acc, conf_mat = evaluate(
        linguistic_model, test_iterator_linguistic, criterion)
    print(
        "Linguistic(asr=False): loss: {}, acc: {}. unweighted acc: {}, conf_mat: \n{}"
        .format(test_loss, test_acc, test_weighted_acc, conf_mat))

    test_loss, test_acc, test_weighted_acc, conf_mat = evaluate_ensemble(
        acoustic_model, linguistic_model, test_iterator_acoustic,
        test_iterator_linguistic,
        torch.nn.NLLLoss().to(device), "average")
    print(
        "Ensemble average: loss: {}, acc: {}. unweighted acc: {}, conf_mat: \n{}"
        .format(test_loss, test_acc, test_weighted_acc, conf_mat))
def main():
    parser = argparse.ArgumentParser(description='gpat train ')
    parser.add_argument("out")
    parser.add_argument('--resume', default=None)
    parser.add_argument('--log_dir', default='runs_16')
    parser.add_argument('--gpus',
                        '-g',
                        type=int,
                        nargs="*",
                        default=[0, 1, 2, 3])
    parser.add_argument('--iterations',
                        default=10**5,
                        type=int,
                        help='number of iterations to learn')
    parser.add_argument('--interval',
                        default=1000,
                        type=int,
                        help='interval (in iterations) between evaluations')
    parser.add_argument('--batch_size',
                        '-b',
                        type=int,
                        default=128,
                        help='learning minibatch size')
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--loaderjob', type=int, default=8)
    # parser.add_argument('--size', '-s', default=96, type=int, choices=[48, 64, 80, 96, 112, 128],
    #                     help='image size')
    parser.add_argument('--hed',
                        dest='hed',
                        action='store_true',
                        default=False)
    parser.add_argument('--from_tiff',
                        dest='from_tiff',
                        action='store_true',
                        default=False)
    parser.add_argument('--no-texture',
                        dest='texture',
                        action='store_false',
                        default=True)
    parser.add_argument('--cbp',
                        dest='cbp',
                        action='store_true',
                        default=False)
    parser.add_argument('--no-normalize',
                        dest='normalize',
                        action='store_false',
                        default=True)
    parser.add_argument('--no-color_aug',
                        dest='color_aug',
                        action='store_false',
                        default=True)
    parser.add_argument('--model_test', default='', type=str)
    parser.add_argument('--no-finetune',
                        dest='finetune',
                        action='store_false',
                        default=True)
    parser.add_argument('--arch',
                        default='googlenet',
                        choices=[
                            'texturecnn', 'resnet50', 'googlenet', 'vgg',
                            'alex', 'trained', 'resume'
                        ])
    parser.add_argument('--opt', default='adam', choices=['adam', 'momentum'])
    parser.add_argument('--train_path',
                        default='train_0330_additional_new.npy')
    parser.add_argument('--data_size', type=float, default=1)
    parser.add_argument('--test_path', default='diag_256_0406.pkl')
    parser.add_argument('--new', action='store_true', default=False)
    args = parser.parse_args()

    devices = tuple(args.gpus)
    # os.environ['PATH'] += ':/usr/local/cuda/bin'

    # log directory
    logger.init(args)

    # load data
    train_data = np.load(os.path.join(dataset_path, args.train_path))
    test_data = np.load(os.path.join(dataset_path, args.test_path))

    num_class = 3 if 'three_class' in args.train_path else 2

    if '512' in args.train_path:
        image_size = 512
        crop_size = 384
    else:
        image_size = 256
        crop_size = 224 if not args.arch == 'alex' else 227

    perm = np.random.permutation(len(train_data))
    train_data = train_data[perm[:int(len(train_data) * args.data_size)]]
    preprocess_type = args.arch if not args.hed else 'hed'
    train = CamelyonDataset(train_data,
                            original_size=image_size,
                            crop_size=crop_size,
                            aug=True,
                            color_aug=args.color_aug,
                            num_class=num_class,
                            from_tif=False,
                            preprocess_type=preprocess_type)
    if len(devices) > 1:
        train_iter = [
            chainer.iterators.MultiprocessIterator(i,
                                                   args.batch_size,
                                                   n_processes=args.loaderjob)
            for i in chainer.datasets.split_dataset_n_random(
                train, len(devices))
        ]
    else:
        train_iter = iterators.MultiprocessIterator(train,
                                                    args.batch_size,
                                                    n_processes=args.loaderjob)

    diag_iter = {}
    for diag in test_data:
        image_size = int(diag.split('_')[-1])
        test = CamelyonDataset(test_data[diag],
                               original_size=image_size,
                               crop_size=crop_size,
                               aug=False,
                               color_aug=False,
                               num_class=num_class,
                               from_tif=False,
                               preprocess_type=preprocess_type,
                               texture=args.texture)
        diag_iter[diag] = iterators.MultiprocessIterator(test,
                                                         args.batch_size,
                                                         repeat=False,
                                                         shuffle=False)

    # model construct
    if args.new:
        if args.texture:
            model = BilinearCNN(base_cnn=args.arch,
                                pretrained_model='auto',
                                num_class=num_class,
                                texture_layer=None,
                                cbp=args.cbp,
                                cbp_size=4096)
        else:
            model = TrainableCNN(base_cnn=args.arch,
                                 pretrained_model='auto',
                                 num_class=num_class)
    else:
        model = archs[args.arch](texture=args.texture,
                                 cbp=args.cbp,
                                 normalize=args.normalize)
        if args.finetune:
            model.load_pretrained(
                os.path.join(MODEL_PATH, init_path[args.arch]), num_class)
        else:
            model.convert_to_finetune_model(num_class)

    if args.resume is not None:
        model_path = os.path.join(
            'runs_16', args.resume, 'models',
            sorted(os.listdir(os.path.join('runs_16', args.resume,
                                           'models')))[-1])
        print(model_path)
        chainer.serializers.load_npz(model_path, model)

    # set optimizer
    optimizer = make_optimizer(model, args.opt, args.lr)

    if args.model_test:
        # test
        model_path = os.path.join(
            'runs_16', args.model_test, 'models',
            sorted(
                os.listdir(os.path.join('runs_16', args.model_test,
                                        'models')))[-1])
        print(model_path)
        chainer.serializers.load_npz(model_path, model)
        cuda.get_device_from_id(devices[0]).use()
        model.to_gpu()
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            evaluate(model, diag_iter, devices[0])
        logger.flush()
        exit()

    if len(devices) > 1:
        updater = updaters.MultiprocessParallelUpdater(train_iter,
                                                       optimizer,
                                                       devices=devices)
    else:
        cuda.get_device_from_id(devices[0]).use()
        model.to_gpu()
        # updater
        updater = chainer.training.StandardUpdater(train_iter,
                                                   optimizer,
                                                   device=devices[0])

    # start training
    start = time.time()
    train_loss = 0
    train_accuracy = 0
    while updater.iteration < args.iterations:

        # train
        updater.update()
        progress_report(updater.iteration, start,
                        len(devices) * args.batch_size, len(train))
        train_loss += model.loss.data
        train_accuracy += model.accuracy.data

        if updater.iteration % args.interval == 0:
            logger.plot('train_loss', cuda.to_cpu(train_loss) / args.interval)
            logger.plot('train_accuracy',
                        cuda.to_cpu(train_accuracy) / args.interval)
            train_loss = 0
            train_accuracy = 0

            # test
            with chainer.using_config('train',
                                      False), chainer.no_backprop_mode():
                evaluate(model, diag_iter, devices[0])

            # logger
            logger.flush()

            # save
            serializers.save_npz(os.path.join(logger.out_dir, 'resume'),
                                 updater)

            if updater.iteration % 20000 == 0:
                if args.opt == 'adam':
                    optimizer.alpha *= 0.1
                else:
                    optimizer.lr *= 0.1
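# --- Illustrative sketch (not part of the original example) ---
# `make_optimizer` above selects between Adam and momentum SGD based on --opt (note
# the decay loop touches `alpha` for Adam and `lr` for MomentumSGD). A minimal
# version of such a helper (an assumption, not the original) could be:
import chainer


def make_optimizer_sketch(model, opt_name, lr):
    if opt_name == 'adam':
        optimizer = chainer.optimizers.Adam(alpha=lr)
    else:
        optimizer = chainer.optimizers.MomentumSGD(lr=lr, momentum=0.9)
    optimizer.setup(model)  # bind the optimizer to the model's parameters
    return optimizer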
        # load saved model
        print('\nLoading model from [%s]...' % args.snapshot)
        try:
            model = torch.load(args.snapshot)
        except Exception as e:
            print(e)
            exit(1)

    print("Load complete")

    # Train the model on train_data, use dev_data for early stopping
    model, dev_res = train_utils.train(train_data, dev_data, model, args)

    # Evaluate the trained model
    print("Evaluate on train set")
    train_res = train_utils.evaluate(train_data, model, args)

    print("Evaluate on test set")
    test_res = train_utils.evaluate(test_data, model, args, roc=True)

    if args.result_path:
        directory = args.result_path[:args.result_path.rfind('/')]
        if not os.path.exists(directory):
            os.makedirs(directory)

        result = {
            'train_loss': train_res[0],
            'train_acc': train_res[1],
            'train_recall': train_res[2],
            'train_precision': train_res[3],
            'train_f1': train_res[4],
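# --- Illustrative sketch (not part of the original example, which is truncated here) ---
# The result dict above stores loss, accuracy, recall, precision and F1. Those metrics
# could be computed from predictions like this (hypothetical, using scikit-learn; the
# original train_utils.evaluate is not shown):
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]

metrics = {
    'acc': accuracy_score(y_true, y_pred),
    'recall': recall_score(y_true, y_pred),
    'precision': precision_score(y_true, y_pred),
    'f1': f1_score(y_true, y_pred),
}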
Beispiel #30
        # train the network and early stop by dev loss
        dev_res = train_utils.train(train_data, dev_data, model, args)

        if args.save:
            with open(dev_res[-1] + '.vocab', 'wb') as f:
                pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

        prefix = 'results_rat/' + args.dataset + '/'
        if not os.path.exists(prefix):
            os.makedirs(prefix)

        print("Evaluate on train set")
        writer = data_utils.generate_writer(
                prefix + 'sel_'+str(args.l_selection) + '_target_' + str(args.l_selection_target) +\
                '_var_' + str(args.l_variation) + '.train')
        train_res = train_utils.evaluate(train_data, model, args, writer)
        data_utils.close_writer(writer)

        print("Evaluate on dev set")
        writer = data_utils.generate_writer(
                prefix + 'sel_'+str(args.l_selection) + '_target_' + str(args.l_selection_target) +\
                '_var_' + str(args.l_variation) + '.dev')
        dev_res = train_utils.evaluate(dev_data, model, args, writer)
        data_utils.close_writer(writer)

        if args.result_path:
            result = {
                'train_loss': train_res[0],
                'train_acc': train_res[1],
                'train_recall': train_res[2],
                'train_precision': train_res[3],