Ejemplo n.º 1
0
def main():
    """Train an image classifier from command-line arguments and save a
    checkpoint.

    Reads CLI options, builds a classifier on top of a pretrained network,
    trains and validates it, then writes a checkpoint to ``save_dir``.
    """
    in_arg = get_input_args()
    data_dir = in_arg.data_dir
    save_dir = in_arg.save_dir
    arch = in_arg.arch
    learning_rate = in_arg.learning_rate
    hidden_units = in_arg.hidden_units
    epochs = in_arg.epochs

    # Use the GPU only when it is both available and requested; otherwise fall
    # back to the CPU.  The original elif left `processing_unit` unbound (a
    # NameError at train time) when CUDA was available but --gpu was not set.
    if torch.cuda.is_available() and in_arg.gpu == 'gpu':
        print('GPU will be used')
        processing_unit = 'gpu'
    else:
        print('CPU will be used')
        processing_unit = 'cpu'

    print(in_arg)

    training_dataloaders, validation_dataloaders, testing_dataloaders, class_to_idx = load_datas(
        data_dir)
    pre_model = pretrained_model(arch)
    model = classifier(pre_model, hidden_units)
    after_train_model = train_model(model, training_dataloaders,
                                    validation_dataloaders, learning_rate,
                                    epochs, processing_unit)
    valid_model(after_train_model, testing_dataloaders, processing_unit)

    save_checkpoint(model, save_dir, class_to_idx)
Ejemplo n.º 2
0
def main():
    """Parse CLI arguments, build and train a classifier, then checkpoint it."""
    args = get_input_args()  # command-line arguments namespace

    print('\nData Directory:\n', args.data_directory, '\n')

    print('Optional Command Line Arguments:\n',
          'Save Checkpoint [--save_dir]: ', args.save_dir, '\n',
          'Pretrained Network [--arch]: ', args.arch, '\n',
          'Learning Rate [--learning_rate]: ', args.learning_rate, '\n',
          'Hidden Units [--hidden_units]: ', args.hidden_units, '\n',
          'Epochs [--epochs]: ', args.epochs, '\n', 'GPU [--gpu]: ',
          args.gpu, '\n')

    # Create the checkpoints folder on first run.
    if 'checkpoints' not in listdir():
        mkdir('checkpoints')

    # Resolve the train / validation / test directories from the data root.
    train_dir, valid_dir, test_dir = util.get_data(args.data_directory)

    # Apply the per-split transforms, then wrap the datasets in data loaders.
    train_data, valid_data, test_data = mod.transform_data(
        train_dir, valid_dir, test_dir)
    train_loader, valid_loader, test_loader = mod.load_data(
        train_data, valid_data, test_data)

    # Build the classifier on top of the chosen pretrained architecture.
    model = mod.build_model(util.label_count(train_dir), args.hidden_units,
                            args.arch, train_data.class_to_idx)

    # Epochs initially set by the --epochs argument; m.load_checkpoint() can
    # change it later.
    epochs = args.epochs
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(),
                           lr=args.learning_rate)

    # True or False depending on whether the GPU should be used.
    use_gpu = mod.use_gpu(model, args.gpu)

    # Train the model; prints training loss, validation loss and accuracy.
    mod.train(model, criterion, optimizer, train_loader, valid_loader, use_gpu,
              args.epochs)

    # Persist the classifier and the metadata needed to rebuild the model.
    mod.save_checkpoint(args.arch, model.classifier.state_dict(),
                        train_data.class_to_idx, util.label_count(train_dir),
                        args.hidden_units, args.epochs, args.save_dir)
Ejemplo n.º 3
0
def save_ds_checkpoint(iteration, model, args):
    """Write a DeepSpeed checkpoint, optionally embedding all RNG states."""
    client_state = {'iteration': iteration}

    # Capture every RNG stream so a resumed run reproduces the same sequence,
    # unless the user opted out via --no-save-rng.
    if not args.no_save_rng:
        client_state['random_rng_state'] = random.getstate()
        client_state['np_rng_state'] = np.random.get_state()
        client_state['torch_rng_state'] = torch.get_rng_state()
        client_state['cuda_rng_state'] = torch.cuda.get_rng_state()
        client_state['rng_tracker_states'] = (
            mpu.get_cuda_rng_tracker().get_states())

    # DeepSpeed tags checkpoints with a string; pass the iteration as one.
    model.save_checkpoint(args.save, str(iteration),
                          client_state=client_state)
Ejemplo n.º 4
0
def save_ds_checkpoint(iteration, model, lr_scheduler, args):
    """Save a DeepSpeed checkpoint with optional LR-scheduler and RNG state."""
    state = {'iteration': iteration}

    if lr_scheduler is not None:
        # The scheduler is not managed by DeepSpeed, so carry its state in the
        # client-state dict.
        state['client_lr_scheduler'] = lr_scheduler.state_dict()

    if not args.no_save_rng:
        # Record every RNG stream so resumption is bit-for-bit reproducible.
        state.update(
            random_rng_state=random.getstate(),
            np_rng_state=np.random.get_state(),
            torch_rng_state=torch.get_rng_state(),
            cuda_rng_state=torch.cuda.get_rng_state(),
            rng_tracker_states=mpu.get_cuda_rng_tracker().get_states(),
        )

    model.save_checkpoint(args.save, iteration, client_state=state)
Ejemplo n.º 5
0
def train_teacher_model(model,
                        labeled_dataset,
                        optimizer,
                        scheduler=None,
                        train_ratio=0.7,
                        batch_size=4,
                        device='cpu',
                        max_epochs=100,
                        print_freq=10,
                        save_path=None,
                        checkpoint=None):
    """Train a teacher model on a labeled dataset, checkpointing on improvement.

    Args:
        model: Model to train; moved to ``device`` before training.
        labeled_dataset: Dataset split into train/validation by ``train_ratio``.
        optimizer: Optimizer updating the model's parameters.
        scheduler: Optional LR scheduler, stepped once per epoch.
        train_ratio: Fraction of the dataset used for training.
        batch_size: Mini-batch size for both loaders.
        device: Device string, e.g. ``'cpu'`` or ``'cuda'``.
        max_epochs: Total number of epochs to run.
        print_freq: Logging frequency forwarded to train/eval helpers.
        save_path: Checkpoint destination; ``None`` disables saving.
        checkpoint: Optional checkpoint path to resume from.
    """
    model.to(device)
    metric_logger = utils.MetricLogger(delimiter=" ")
    # Best validation loss seen so far.  float("inf") guarantees the first
    # epoch always checkpoints (the old 1e9 sentinel could be exceeded by a
    # pathological initial loss, silently disabling saves).
    best_loss = float("inf")

    cur_epoch = 0
    if checkpoint is not None:
        print("loading checkpoint:" + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)

    train_dataset, vld_dataset = split_dataset(labeled_dataset, train_ratio)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    vld_loader = DataLoader(vld_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)
    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq)
        loss = evaluate(model, vld_loader, device, epoch, print_freq)

        # Checkpoint only when the validation loss improves.
        # (`is not None` replaces the non-idiomatic `!= None`.)
        if save_path is not None and loss < best_loss:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
            best_loss = loss
        if scheduler is not None:
            scheduler.step()
Ejemplo n.º 6
0
# Select the first CUDA device when available, otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("===> Building model")
model = RRDBNet()  #.to(device, dtype=torch.float)
# Replicate the model across three GPUs; DataParallel splits each batch over
# device_ids and gathers outputs on the first device.
model = nn.DataParallel(model, device_ids=[0, 1, 2])
model.to(device)
# Pixel-wise mean-squared-error reconstruction loss.
criterion = nn.MSELoss()

for epoch in range(start_epoch, nEpochs + 1):
    # NOTE(review): a fresh Adam optimizer is constructed every epoch, which
    # resets its running moment estimates — confirm this is intentional.
    optimizer = optim.Adam(model.parameters(),
                           lr=initial_lr,
                           weight_decay=1e-5)
    # Compute this epoch's learning rate, then write it into every parameter
    # group (the optimizer was just created, so its lr must be overwritten).
    lr = adjust_learning_rate(initial_lr, optimizer, epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    model.train()

    # 1-based iteration counter so "iteration % 100" logs every 100th batch.
    for iteration, batch in enumerate(training_data_loader, 1):
        # batch[0] is the target (x) and batch[1] the model input (z): the
        # loss below compares model(z_data) against x_data.
        x_data, z_data = Variable(batch[0].float()).cuda(), Variable(
            batch[1].float()).cuda()
        output = model(z_data)
        loss = criterion(output, x_data)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log and checkpoint every 100 iterations.
        if iteration % 100 == 0:
            print("===> Epoch[{}]({}/{}): Loss: {:.10f}".format(
                epoch, iteration, len(training_data_loader), loss.item()))
            save_checkpoint(model, epoch, 'simple')
    # Also checkpoint at the end of every epoch.
    save_checkpoint(model, epoch, 'simple')
Ejemplo n.º 7
0
def Main():
    """Parse command-line arguments, train the flower classifier and save a
    checkpoint of the trained model."""
    parser = argparse.ArgumentParser()
    # nargs="?" lets argparse actually apply the default: without it the
    # default on a positional argument is ignored and the argument is
    # mandatory, making default='flowers' dead code.
    parser.add_argument(
        "data_directory",
        nargs="?",
        help=
        "The parent directory to containing subfolders of train and test data.",
        type=str,
        default='flowers')
    parser.add_argument("--save_dir",
                        help="The directory of checkpoint file to be saved.",
                        type=str,
                        default=None)
    parser.add_argument("--gpu",
                        help="Use GPU instead of CPU.",
                        action="store_true")
    parser.add_argument("--learning_rate",
                        help="Set learning rate for training.",
                        type=float,
                        default=0.001)
    parser.add_argument(
        "--hidden_units",
        nargs="+",
        type=int,
        default=[1000],
        help="Set a list of hidden units: e.g. say two hidden layers of 500 and 200 units, the input format: 500 [space] 200")
    parser.add_argument("-e",
                        "--epochs",
                        help="Set the number of training iterations.",
                        type=int,
                        default=10)
    parser.add_argument(
        "--skip_accuracy",
        help=
        "Skip the validation on training and testing set in each iteration, and reduce the training time.",
        action="store_true")
    parser.add_argument(
        "--arch",
        help="Pre-trained Model Options: 0: Densenet121, 1: VGG16, 2: AlexNet ",
        type=int,
        default=1)

    args = parser.parse_args()

    # Choose the compute device from the --gpu flag.
    if args.gpu:
        device = 'cuda'
        print('Compute using GPU')
    else:
        device = 'cpu'
        print('Compute using CPU')

    data_dir = args.data_directory
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'  # expected layout; not used below
    test_dir = data_dir + '/test'

    file_path = args.save_dir
    print('Training data directory: ', train_dir)
    print('Testing data directory: ', test_dir)
    print('Output checkpoint file directory: ', file_path)

    # Load training data (with augmentation) and test data.
    train_image_dataset, trainloader = loadData(train_dir,
                                                train=True,
                                                batch_size=128,
                                                shuffle=True)
    test_image_dataset, testloader = loadData(test_dir,
                                              train=False,
                                              batch_size=128,
                                              shuffle=True)
    train_size = len(trainloader.dataset.imgs)
    print('Total number of samples in the train set: ', train_size)

    hidden_layer = args.hidden_units
    model_name = args.arch
    print(
        'Building the model with hidden layer: {}, using pre-trained model: {}'
        .format(hidden_layer, model_options[model_name]))
    model = build_model(model_name=model_name, hidden_layer=hidden_layer)

    epochs = args.epochs
    print('Number of iteration: ', epochs)
    learn_rate = args.learning_rate
    print('Using the learning rate: ', learn_rate)
    if args.skip_accuracy:
        print_accuracy = False
        print('Skip calculating the accuracy during the training.')
    else:
        print_accuracy = True
        print(
            'Will calculate the accuracy during the training. Expected longer training time.'
        )
    # Negative log-likelihood over the classifier's log-softmax output; only
    # the classifier head is optimized (the pretrained features stay frozen).
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=learn_rate)

    print('Start training...')
    train(model,
          trainloader,
          testloader,
          criterion,
          optimizer,
          epochs=epochs,
          print_every=10,
          print_accuracy=print_accuracy,
          device=device)
    accuracy_score(model, trainloader, device=device, print_score=True)

    save_checkpoint(model,
                    optimizer,
                    train_image_dataset,
                    epochs,
                    file_path=file_path,
                    file_name='checkpoint.pth',
                    print_model=False)
Ejemplo n.º 8
0
                                  step + (epoch - 1) * iters_per_epoch)
                add_summary_value(summary_w, 'lr', lr,
                                  step + (epoch - 1) * iters_per_epoch)

                loss_temp = 0
                start = time.time()

        if args.mGPUs:
            save_name = os.path.join(
                './data/results', args.train_id, args.root_model,
                'faster_rcnn_{}_{}.pth'.format(epoch, step))
            save_checkpoint(
                {
                    'train_id': args.train_id,
                    'epoch': epoch + 1,
                    'model': fasterRCNN.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'pooling_mode': args.POOLING_MODE,
                    'class_agnostic': args.class_agnostic,
                }, save_name)
        else:
            save_name = os.path.join(
                './data/results', args.train_id, args.root_model,
                'faster_rcnn_{}_{}.pth'.format(epoch, step))
            save_checkpoint(
                {
                    'train_id': args.train_id,
                    'epoch': epoch + 1,
                    'model': fasterRCNN.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'pooling_mode': args.POOLING_MODE,
Ejemplo n.º 9
0
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):

    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - scheduler: LR scheduler (currently not stepped inside the loop)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - accumulation_steps: Divisor applied to each batch loss (see NOTE below)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square)
       from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training
     (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - eval_every / save_every: evaluate / checkpoint every N epochs
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """
    writer: SummaryWriter = SummaryWriter()

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    model.zero_grad()

    train_loader = DataLoader(dataset=PickleDataset(train_dir),
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=8)

    printTrace("Training...")
    iteration_no: int = 0
    for epoch in range(num_epoch):
        print('EpochNum: ' + str(epoch))
        model.train()
        running_loss: float = 0.0
        acc_dev: float = 0.0
        num_batches: int = 0

        # start=1 makes num_batches the true batch count after the loop; the
        # old 0-based index under-counted by one and raised ZeroDivisionError
        # when an epoch had exactly one batch.
        for num_batches, inputs in enumerate(train_loader, start=1):
            # Each example carries 5 stacked frames: reshape the batch to
            # (batch*5, 3, H, W) before the forward pass.
            x_batch = torch.reshape(
                inputs[0],
                (inputs[0].shape[0] * 5, 3, inputs[0].shape[2],
                 inputs[0].shape[3])).to(device)
            y_batch = torch.reshape(inputs[1],
                                    (inputs[0].shape[0], )).long().to(device)

            outputs = model.forward(x_batch)
            # NOTE(review): the loss is scaled by accumulation_steps but the
            # optimizer still steps every batch, so true gradient accumulation
            # is not implemented here — confirm intended behaviour.
            loss = criterion(outputs, y_batch) / accumulation_steps
            running_loss += loss.item()

            # Backward pass (with apex loss scaling under fp16) followed by
            # gradient clipping on the matching parameter view.
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               1.0)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            model.zero_grad()

        # Mean per-batch loss; max() guards against an empty loader.
        printTrace(
            f"Loss: {running_loss / max(num_batches, 1)}. "
            f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
        )

        writer.add_scalar("Loss/train", running_loss, iteration_no)

        if (iteration_no + 1) % eval_every == 0:
            start_time_eval: float = time.time()

            acc_dev = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )

            acc_test = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"Acc dev set: {round(acc_dev,2)}. "
                f"Acc test set: {round(acc_test,2)}.  "
                f"Eval time: {round(time.time() - start_time_eval,2)} secs."
            )

            # Save whenever dev accuracy is positive and beats the best so
            # far (clearer spelling of the original `0.0 < acc_dev > max_acc`).
            if save_best and acc_dev > 0.0 and acc_dev > max_acc:
                max_acc = acc_dev
                printTrace(
                    f"New max acc in dev set {round(max_acc,2)}. Saving model..."
                )
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )
            writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
            writer.add_scalar("Accuracy/test", acc_test, iteration_no)

        if save_checkpoints and (iteration_no + 1) % save_every == 0:
            printTrace("Saving checkpoint...")
            save_checkpoint(
                path=os.path.join(output_dir, "checkpoint.pt"),
                model=model,
                optimizer_name=optimizer_name,
                optimizer=optimizer,
                scheduler=scheduler,
                acc_dev=acc_dev,
                epoch=initial_epoch + epoch,
                fp16=fp16,
                opt_level=amp_opt_level,
            )

        iteration_no += 1

    return max_acc
        add_summary_value(summary_w, 'eval_loss', loss_tt / len(pd_val.roidb),
                          total_iters)
        add_summary_value(summary_w, 'mcls_sc', mcls_sc, total_iters)
        add_summary_value(summary_w, 'mcls_ac', mcls_ac, total_iters)
        add_summary_value(summary_w, 'mcls_ap', mcls_ap, total_iters)
        add_summary_value(summary_w, 'mins_sc', mins_sc, total_iters)
        add_summary_value(summary_w, 'mins_ac', mins_ac, total_iters)
        add_summary_value(summary_w, 'mins_ap', mins_ap, total_iters)

        save_name = os.path.join(
            './data/results', args.train_id, args.root_model,
            'checkpoint{}_{}.pth'.format(epoch, total_iters))
        save_checkpoint(
            {
                'train_id': args.train_id,
                'epoch': epoch + 1,
                'model': basenet.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, save_name)
        print('save model: {}'.format(save_name))

        end = time.time()
        print(end - start)

        if total_iters > args.max_iters:
            break

    if args.resume:
        total_iters -= (args.start_epoch - 1) * iters_per_epoch
    print('total train time: %.2f s, %.2f h' %
          (total_time, total_time / 3600.))
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    scaler: GradScaler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    running_loss: float,
    total_batches: int,
    total_training_examples: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    fp16: bool = True,
    save_checkpoints: bool = True,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - scheduler: LR scheduler; stepped with the mean running loss after every
      optimizer step (presumably ReduceLROnPlateau — confirm with the caller)
    - scaler: torch.cuda.amp GradScaler used when fp16 is enabled
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - accumulation_steps: Number of batches accumulated before each optimizer step
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epochs: Number of epochs to do
    - running_loss / total_batches / total_training_examples: running counters,
      non-zero when resuming from a checkpoint
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square)
       from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training
     (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_every: checkpoint every N effective (post-accumulation) batches
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """

    if not os.path.exists(output_dir):
        print(f"{output_dir} does not exits. We will create it.")
        os.makedirs(output_dir)

    writer: SummaryWriter = SummaryWriter()

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss().to(device)
    model.zero_grad()
    print_message("Training...")
    for epoch in range(num_epoch):
        acc_dev: float = 0.0
        num_batches: int = 0
        step_no: int = 0

        # A fresh loader each epoch so the map-hiding / image-dropout
        # augmentations are re-sampled.
        data_loader_train = DataLoader(
            Tedd1104Dataset(
                dataset_dir=train_dir,
                hide_map_prob=hide_map_prob,
                dropout_images_prob=dropout_images_prob,
            ),
            batch_size=batch_size,
            shuffle=True,
            num_workers=os.cpu_count(),
            pin_memory=True,
        )
        start_time: float = time.time()
        step_start_time: float = time.time()
        dataloader_delay: float = 0
        model.train()
        for batch in data_loader_train:

            # Interleave the 5 frames of every example: (B, 5, 3, H, W)
            # stacked then flattened to (B*5, 3, H, W) for the CNN.
            x = torch.flatten(
                torch.stack(
                    (
                        batch["image1"],
                        batch["image2"],
                        batch["image3"],
                        batch["image4"],
                        batch["image5"],
                    ),
                    dim=1,
                ),
                start_dim=0,
                end_dim=1,
            ).to(device)

            y = batch["y"].to(device)
            # Time spent waiting on the DataLoader since the last step.
            dataloader_delay += time.time() - step_start_time

            total_training_examples += len(y)

            # Forward/backward; the loss is pre-divided by accumulation_steps
            # so accumulated gradients average rather than sum.
            if fp16:
                with autocast():
                    outputs = model.forward(x)
                    loss = criterion(outputs, y)
                    loss = loss / accumulation_steps

                running_loss += loss.item()
                scaler.scale(loss).backward()

            else:
                outputs = model.forward(x)
                loss = criterion(outputs, y) / accumulation_steps
                running_loss += loss.item()
                loss.backward()

            if ((step_no + 1) % accumulation_steps == 0) or (
                    step_no + 1 >= len(data_loader_train)
            ):  # If we are in the last batch of the epoch we also want to perform gradient descent
                if fp16:
                    # Gradient clipping: unscale first so the clip threshold
                    # applies to the true gradient magnitudes.
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                else:
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    optimizer.step()
                    optimizer.zero_grad()

                total_batches += 1
                num_batches += 1
                scheduler.step(running_loss / total_batches)

                batch_time = round(time.time() - start_time, 2)
                # Rough ETA for the rest of the epoch from the last step time.
                est: float = batch_time * (math.ceil(
                    len(data_loader_train) / accumulation_steps) - num_batches)
                print_message(
                    f"EPOCH: {initial_epoch + epoch}. "
                    f"{num_batches} of {math.ceil(len(data_loader_train)/accumulation_steps)} batches. "
                    f"Total examples used for training {total_training_examples}. "
                    f"Iteration time: {batch_time} secs. "
                    f"Data Loading bottleneck: {round(dataloader_delay, 2)} secs. "
                    f"Epoch estimated time: "
                    f"{str(datetime.timedelta(seconds=est)).split('.')[0]}")

                print_message(
                    f"Loss: {running_loss / total_batches}. "
                    f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
                )

                writer.add_scalar("Loss/train", running_loss / total_batches,
                                  total_batches)

                if save_checkpoints and (total_batches + 1) % save_every == 0:
                    print_message("Saving checkpoint...")
                    save_checkpoint(
                        path=os.path.join(output_dir, "checkpoint.pt"),
                        model=model,
                        optimizer_name=optimizer_name,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        running_loss=running_loss,
                        total_batches=total_batches,
                        total_training_examples=total_training_examples,
                        acc_dev=max_acc,
                        epoch=initial_epoch + epoch,
                        fp16=fp16,
                        scaler=None if not fp16 else scaler,
                    )

                dataloader_delay: float = 0
                start_time: float = time.time()

            step_no += 1
            step_start_time = time.time()

        # Free the loader (and its workers) before building the eval loaders.
        del data_loader_train

        print_message("Dev set evaluation...")

        start_time_eval: float = time.time()

        # Augmentations disabled for evaluation.
        data_loader_dev = DataLoader(
            Tedd1104Dataset(
                dataset_dir=dev_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size //
            2,  # Use smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use less cores to save RAM
            pin_memory=True,
        )

        acc_dev: float = evaluate(
            model=model,
            data_loader=data_loader_dev,
            device=device,
            fp16=fp16,
        )

        del data_loader_dev

        print_message("Test set evaluation...")
        data_loader_test = DataLoader(
            Tedd1104Dataset(
                dataset_dir=test_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size //
            2,  # Use smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use less cores to save RAM
            pin_memory=True,
        )

        acc_test: float = evaluate(
            model=model,
            data_loader=data_loader_test,
            device=device,
            fp16=fp16,
        )

        del data_loader_test

        print_message(
            f"Acc dev set: {round(acc_dev*100,2)}. "
            f"Acc test set: {round(acc_test*100,2)}.  "
            f"Eval time: {round(time.time() - start_time_eval,2)} secs.")

        # Chained comparison: acc_dev > 0.0 AND acc_dev > max_acc.
        if 0.0 < acc_dev > max_acc and save_best:
            max_acc = acc_dev
            print_message(
                f"New max acc in dev set {round(max_acc, 2)}. Saving model...")
            save_model(
                model=model,
                save_dir=output_dir,
                fp16=fp16,
            )

        writer.add_scalar("Accuracy/dev", acc_dev, epoch)
        writer.add_scalar("Accuracy/test", acc_test, epoch)

    return max_acc
Ejemplo n.º 12
0
        pred_choice = outputs.data.max(1)[1]
        correct += pred_choice.eq(labels.data).cpu().sum()
        sum += len(labels)
        print('batch_index: [%d/%d]' % (batch_index, len(evalloader)),
              'Eval epoch: [%d]' % (epoch),
              'correct/sum:%d/%d, %.4f' % (correct, sum, correct / sum))

if __name__ == '__main__':
    # Whether to load previously saved model parameters (resume training).
    load = False

    if load:
        # Resume: restore the network weights and continue from the epoch
        # after the one stored in the checkpoint.
        checkpoint = model.load_checkpoint()
        net.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = 0

    # Set up the optimizer.
    optimizer = optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0)
    # optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=1e-1, weight_decay=1e-4)

    for epoch in range(start_epoch, n_epoch):
        train(epoch)

        # Save the parameters (checkpoint) after every epoch.
        checkpoint = {'epoch': epoch, 'state_dict': net.state_dict(), 'optimizer': optimizer.state_dict()}
        model.save_checkpoint(checkpoint)

        eval(epoch)
Ejemplo n.º 13
0
def train(
    model: DRIVEMODEL,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: DRIVEMODEL model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - scheduler: Learning rate scheduler, stepped with the average train loss
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - accumulation_steps: Number of batches whose gradients are accumulated
      before each optimizer step (simulates a larger effective batch size)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epoch: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square)
       from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob: List of 5 floats or None, probability for removing each input image during training
      (black image) from a training example (0<=dropout_images_prob<=1)
    - num_load_files_training: Number of training files loaded per iteration
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - eval_every: Evaluate on the dev/test sets every `eval_every` iterations
    - save_every: Save a checkpoint every `save_every` iterations
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """
    writer: SummaryWriter = SummaryWriter()

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    total_training_examples: int = 0
    model.zero_grad()

    printTrace("Training...")
    for epoch in range(num_epoch):
        step_no: int = 0
        iteration_no: int = 0
        num_used_files: int = 0
        # A fresh loader per epoch: files are re-shuffled each time.
        data_loader = DataLoader_AutoDrive(
            dataset_dir=train_dir,
            nfiles2load=num_load_files_training,
            hide_map_prob=hide_map_prob,
            dropout_images_prob=dropout_images_prob,
            fp=16 if fp16 else 32,
        )

        data = data_loader.get_next()
        # Get files in batches, all files will be loaded and data will be shuffled
        while data:
            X, y = data
            model.train()
            start_time: float = time.time()
            total_training_examples += len(y)
            running_loss: float = 0.0
            num_batchs: int = 0
            acc_dev: float = 0.0

            for X_batch, y_batch in nn_batchs(X, y, batch_size):
                X_batch, y_batch = (
                    torch.from_numpy(X_batch).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )

                outputs = model(X_batch)
                # Divide by accumulation_steps so the accumulated gradient
                # averages over the accumulation window instead of summing.
                loss = criterion(outputs, y_batch) / accumulation_steps
                running_loss += loss.item()

                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                # BUG FIX: the original condition was
                # `(step_no + 1) % accumulation_steps`, which is truthy for
                # every step that is NOT a multiple of accumulation_steps,
                # i.e. the optimizer stepped on the wrong iterations (and,
                # when accumulation_steps == 1, never stepped except on the
                # last batch). Compare the remainder against 0 instead.
                if (step_no + 1) % accumulation_steps == 0 or (
                        num_used_files + 1 >
                        len(data_loader) - num_load_files_training
                        and num_batchs == math.ceil(len(y) / batch_size) - 1
                ):  # If we are in the last batch of the epoch we also want to perform gradient descent
                    optimizer.step()
                    model.zero_grad()

                num_batchs += 1
                step_no += 1

            num_used_files += num_load_files_training

            # Average loss for this iteration; -1 flags "no batches seen".
            avg_loss = -1 if num_batchs == 0 else running_loss / num_batchs

            # Print Statistics
            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(data_loader)} files. "
                f"Total examples used for training {total_training_examples}. "
                f"Iteration time: {round(time.time() - start_time,2)} secs.")
            printTrace(
                f"Loss: {avg_loss}. "
                f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
            )
            if num_batchs > 0:
                # Guard against ZeroDivisionError when the loader returned
                # an empty chunk (the original divided unconditionally).
                writer.add_scalar("Loss/train", avg_loss, iteration_no)
                scheduler.step(avg_loss)

            if (iteration_no + 1) % eval_every == 0:
                start_time_eval: float = time.time()
                # Training accuracy is measured on the chunk currently in
                # memory; -1 means it could not be computed.
                if len(X) > 0 and len(y) > 0:
                    acc_train: float = evaluate(
                        model=model,
                        X=torch.from_numpy(X),
                        golds=y,
                        device=device,
                        batch_size=batch_size,
                    )
                else:
                    acc_train = -1.0

                acc_dev = evaluate(
                    model=model,
                    X=X_dev,
                    golds=y_dev,
                    device=device,
                    batch_size=batch_size,
                )

                acc_test: float = evaluate(
                    model=model,
                    X=X_test,
                    golds=y_test,
                    device=device,
                    batch_size=batch_size,
                )

                printTrace(
                    f"Acc training set: {round(acc_train,2)}. "
                    f"Acc dev set: {round(acc_dev,2)}. "
                    f"Acc test set: {round(acc_test,2)}.  "
                    f"Eval time: {round(time.time() - start_time_eval,2)} secs."
                )

                # Chained comparison: acc_dev > 0.0 AND acc_dev > max_acc.
                if 0.0 < acc_dev > max_acc and save_best:
                    max_acc = acc_dev
                    printTrace(
                        f"New max acc in dev set {round(max_acc,2)}. Saving model..."
                    )
                    save_model(
                        model=model,
                        save_dir=output_dir,
                        fp16=fp16,
                        amp_opt_level=amp_opt_level,
                    )
                if acc_train > -1:
                    writer.add_scalar("Accuracy/train", acc_train,
                                      iteration_no)
                writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
                writer.add_scalar("Accuracy/test", acc_test, iteration_no)

            if save_checkpoints and (iteration_no + 1) % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )

            iteration_no += 1
            data = data_loader.get_next()

        data_loader.close()

    return max_acc
Ejemplo n.º 14
0
    # Optionally resume from a previously saved checkpoint.
    if ckpt is not None:
        print('recovering from checkpoints...')
        model.load_state_dict(ckpt['model'])
        begin_epoch = ckpt['epoch'] + 1
        print('resuming training')

    begin = time()
    with open(os.path.join('../logs', 'down_sample.log'), 'w') as f:
        for epoch in range(begin_epoch, 1000):
            epoch_loss = []
            for bid, batch in enumerate(loader):
                # NOTE: `lr` here is the LOW-RESOLUTION target batch (key
                # 'lr' of the batch dict), not a learning rate.
                hr, lr = batch['hr'].to(DEVICE), batch['lr'].to(DEVICE)
                optimizer.zero_grad()
                ds = model(hr)  # downsampled prediction from the high-res input
                batch_loss = loss(ds, lr)
                batch_loss.backward()
                optimizer.step()
                epoch_loss.append(batch_loss.cpu().detach().numpy())
                # BMSE = batch loss, EMSE = running epoch mean, RT = runtime.
                print(
                    'Epoch {} | Batch {} | BMSE {:6f} | EMSE {:.6f} | RT {:6f}'
                    .format(epoch, bid, batch_loss, np.mean(epoch_loss),
                            since(begin)))
                f.write('{},{},{},{},{}\n'.format(epoch, bid, batch_loss,
                                                  np.mean(epoch_loss),
                                                  since(begin)))
                f.flush()  # make the log readable while training is running
            # Checkpoint once per epoch (overwrites by model_name).
            state_dict = {'model': model.state_dict(), 'epoch': epoch}
            save_checkpoint(state_dict,
                            '../checkpoints/',
                            model_name='down_sample')
Ejemplo n.º 15
0
def main():
    """Entry point: parse CLI options, train an image classifier on the
    given dataset, and optionally persist a checkpoint.

    Behaves exactly like the original implementation: same CLI surface,
    printed messages, and exit codes.
    """
    parser = argparse.ArgumentParser(
        description=
        'Train a new network on a dataset and save the model as a checkpoint')
    parser.add_argument('data_dir', metavar='path/to/dataset', type=str,
                        nargs=1, help='path to a data directory')
    parser.add_argument('--save_dir', metavar='path/to/save_dir', type=str,
                        nargs='?',
                        help='path to a directory in which to save a checkpoint')
    parser.add_argument('--learning_rate', metavar='learning rate',
                        type=float, nargs='?', default=0.002,
                        help='learning rate value for model training')
    parser.add_argument('--hidden_units', metavar='hidden units', type=int,
                        nargs='?', default=512,
                        help='number of hidden units for model classifier')
    parser.add_argument('--epochs', metavar='epochs', type=int, nargs='?',
                        default=5, help='number of epochs for model training')
    parser.add_argument('--arch', metavar='model name', type=str, nargs='?',
                        default='densenet161',
                        help='name of transfer model (e.g., resnet18 or densenet161)')
    parser.add_argument('--gpu', action='store_true',
                        help='use GPU for model training (recommended)')
    args = parser.parse_args()

    # Unpack parsed options into readable locals.
    dataset_path = args.data_dir[0]  # positional arg uses nargs=1 -> list
    checkpoint_dir = args.save_dir
    lr = args.learning_rate
    hidden = args.hidden_units
    n_epochs = args.epochs
    arch_name = args.arch
    gpu_requested = args.gpu

    print('Using the following hyperparameters for training')
    print(f'  Learning rate: {lr}')
    print(f'  Hidden units: {hidden}')
    print(f'  Epochs: {n_epochs}')
    print(f'Transfer model name: {arch_name}')

    # Fail fast if the user asked for a GPU that is not present.
    if gpu_requested and not cuda.is_available():
        print('Error: GPU not available. Try again without the --gpu flag')
        exit(1)
    if gpu_requested:
        print('Training on GPU...')
    else:
        print('Training on CPU...')
        print(
            'Warning: training on CPU could take a LONG time. Consider using --gpu flag.'
        )
    print('')

    # Validate the requested architecture against the supported set.
    if arch_name not in ALLOWED_ARCHS:
        print(
            f'Error: Model architecture {arch_name} is not currently supported.'
        )
        print('Please try one of the following:')
        for supported in ALLOWED_ARCHS:
            print(f'  {supported}')
        exit(1)

    dataloaders, image_datasets = prep_data(dataset_path)

    # Build the network on top of the chosen pretrained backbone.
    model = build_model_from_pretrained(arch_name, hidden,
                                        image_datasets['test'].class_to_idx)

    with active_session():
        trained, optimizer = train(model, dataloaders, lr, n_epochs,
                                   gpu_requested)

    if checkpoint_dir:
        save_checkpoint(trained, n_epochs, optimizer, arch_name, lr,
                        checkpoint_dir)
Ejemplo n.º 16
0
from utils import load_data

# Setup argparse arguments
parser = argparse.ArgumentParser()

parser.add_argument('data_dir', type=str)
parser.add_argument('--save_dir', type=str, default='./')
parser.add_argument('--arch', type=str, default='vgg16')
# NOTE(review): default given as the string '0.01'; argparse coerces it
# through type=float, so the effective default is 0.01.
parser.add_argument('--lr', type=float, default='0.01')
parser.add_argument('--hidden_units', type=int, default=512)
parser.add_argument('--epochs', type=int, default=20)
parser.add_argument('--gpu', action='store_true')

# Parsed at import time: importing this module consumes sys.argv.
arg = parser.parse_args()

if __name__ == '__main__':
    print('Loading data')
    trainloader, validloader, testloader, class_to_idx = load_data(
        arg.data_dir)

    print('Creating model')
    model = create_model(arg.arch, arg.hidden_units)

    print('Training model')
    train_model(model, arg.epochs, arg.lr, arg.gpu, trainloader, validloader)

    print('Saving model')
    model.class_to_idx = class_to_idx
    # NOTE(review): plain string concatenation — save_dir must end with a
    # path separator (the default './' does).
    save_checkpoint(model, arg.save_dir + arg.arch + '.pth', arg.arch,
                    arg.hidden_units, class_to_idx)
Ejemplo n.º 17
0
def main():
    """Train an image classifier from CLI arguments and save a checkpoint.

    Mirrors the original behavior exactly: same argument handling, console
    output, optional test pass, and checkpoint contents.
    """
    # Gather user-supplied settings.
    args = get_train_arguments()
    data_dir = args.data_directory
    checkpoint_save_dir = args.save_dir
    arch = args.arch
    hidden_units = args.hidden_units
    dropout = 0.2
    n_epochs = args.epochs
    lr = args.learning_rate
    print_every = args.print_every
    device = torch.device(m.determine_device(args.gpu))

    # Category mapping; its length fixes the classifier's output size.
    with open('cat_to_name.json', 'r') as f:
        cat_to_name = json.load(f)
    n_outputs = len(cat_to_name)

    print("Configuring training session with the following parameters:\n",
          f"- Architecture: {arch}\n",
          f"- Hidden layer inputs: {hidden_units}\n",
          f"- Classifier Outputs: {n_outputs}\n",
          f"- Dropout: {dropout}\n", f"- Epochs: {n_epochs}\n",
          f"- Learn Rate: {lr}\n", f"- Device: {device}\n")

    # Data loaders for the training and validation splits.
    training_dataloader, training_dataset = load.load_data(
        data_dir, icc.DIRECTORIES[icc.TRAIN_DIR])
    validation_dataloader, validation_dataset = load.load_data(
        data_dir, icc.DIRECTORIES[icc.VALID_DIR])

    # Pretrained backbone plus a fresh feed-forward classifier head.
    model = m.build_model(arch, hidden_units, n_outputs, dropout)
    model.to(device)

    # Only the classifier parameters are optimized.
    classifier_params = m.fetch_feedforward_classifier_parameters(model, arch)
    optimizer = optim.Adam(classifier_params, lr=lr)
    criterion = nn.NLLLoss()

    print(f"\nStarting training: {datetime.datetime.now()}")
    start = time.time()

    train_losses, validation_losses = train(model, device, n_epochs,
                                            optimizer, criterion,
                                            training_dataloader,
                                            validation_dataloader, print_every)

    print(
        f"Finished training: {datetime.datetime.now()}. Total run time: {time.time() - start}"
    )

    # Optional evaluation on the held-out test split.
    if args.test:
        testing_dataloader, testing_dataset = load.load_data(
            data_dir, icc.DIRECTORIES[icc.TEST_DIR])

        test_start = time.time()
        test(model, criterion, testing_dataloader, device)

        print(f"Testing time: {time.time() - test_start}")

    # Persist everything needed to rebuild the classifier later.
    m.save_checkpoint(model, training_dataset.class_to_idx, optimizer,
                      n_epochs, n_outputs, hidden_units, dropout, arch,
                      checkpoint_save_dir)
Ejemplo n.º 18
0
def train(args):
    """Full pneumonia-detection pipeline: split the data, train a U-Net,
    tune a detection threshold on the validation set, and save the result.

    `args` carries CLI options: labels, data, seed, rescale_factor,
    batch_size, debug, epochs, learning rate / optimizer settings, save,
    checkpoint, train_threshold, box_threshold, save_images.
    """
    original_image_shape = 1024  # source X-ray side length in pixels
    validation_frac = 0.10  # fraction of patients held out for validation

    # Shuffle patients deterministically, then split by patient ID so the
    # same patient never appears in both train and validation.
    df_train = pd.read_csv(args.labels)
    df_train = df_train.sample(
        frac=1, random_state=args.seed)  # .sample(frac=1) does the shuffling
    pIds = [pId for pId in df_train["patientId"].unique()]

    pIds_valid = pIds[:int(round(validation_frac * len(pIds)))]
    pIds_train = pIds[int(round(validation_frac * len(pIds))):]
    print("{} patient IDs shuffled and {}% of them used in validation set.".
          format(len(pIds), validation_frac * 100))
    print(
        "{} images went into train set and {} images went into validation set."
        .format(len(pIds_train), len(pIds_valid)))

    # Map each positive patient (Target == 1) to their bounding boxes.
    pId_boxes_dict = {}
    for pId in (df_train.loc[(
            df_train["Target"] == 1)]["patientId"].unique().tolist()):
        pId_boxes_dict[pId] = get_boxes_per_patient(df_train, pId)
    print("{} ({:.1f}%) images have target boxes.".format(
        len(pId_boxes_dict), 100 * (len(pId_boxes_dict) / len(pIds))))

    transform = tv.transforms.Compose([tv.transforms.ToTensor()])

    # create datasets — augmentation (rotation, warping) only for training
    dataset_train = PneumoniaDataset(
        root=args.data,
        pIds=pIds_train,
        predict=False,
        boxes=pId_boxes_dict,
        rescale_factor=args.rescale_factor,
        transform=transform,
        rotation_angle=3,
        warping=True,
        seed=args.seed,
    )

    dataset_valid = PneumoniaDataset(
        root=args.data,
        pIds=pIds_valid,
        predict=False,
        boxes=pId_boxes_dict,
        rescale_factor=args.rescale_factor,
        transform=transform,
        rotation_angle=0,
        warping=False,
        seed=args.seed,
    )

    # define the dataloaders with the previous dataset
    loader_train = DataLoader(
        dataset=dataset_train,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
    )

    loader_valid = DataLoader(
        dataset=dataset_valid,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
    )

    # Check if train images have been properly loaded
    print("{} images in train set and {} images in validation set.".format(
        len(dataset_train), len(dataset_valid)))

    # Sanity check: pull one batch and a few random samples for inspection.
    img_batch, target_batch, pId_batch = next(iter(loader_train))
    print("Tensor batch size:", img_batch.size())

    for i in np.random.choice(len(dataset_train), size=5, replace=False):
        img, target, pId = dataset_train[i]  # picking an image with pneumonia
        print("\nImage and mask shapes:", img.shape, target.shape)
        print("Patient ID:", pId)
        print("Image scale: {} - {}".format(img[0].min(), img[0].max()))
        print("Target mask scale: {} - {}".format(target[0].min(),
                                                  target[0].max()))

    # define an instance of the model
    model = PneumoniaUNET(
        bn_momentum=args.bn_momentum,
        eps=args.bn_eps,
        alpha_leaky=args.alpha_leaky,
    ).cuda()

    print(model)
    # define the loss function
    loss_fn = BCEWithLogitsLoss2d().cuda()

    # Debug mode shrinks the run for a quick smoke test.
    num_epochs = 2 if args.debug else args.epochs
    num_steps_train = 50 if args.debug else len(loader_train)
    num_steps_eval = 10 if args.debug else len(loader_valid)

    # Working resolution after rescaling the 1024px originals.
    shape = int(round(original_image_shape / args.rescale_factor))

    histories, best_models = train_and_evaluate(
        model,
        loader_train,
        loader_valid,
        args.learning_rate,
        args.optimizer,
        args.learning_rate_decay,
        args.momentum,
        args.eps,
        args.weight_decay,
        loss_fn,
        num_epochs,
        num_steps_train,
        num_steps_eval,
        pId_boxes_dict,
        args.rescale_factor,
        shape,
        save_path=args.save,
        restore_file=args.checkpoint,
    )
    best_model = best_models["best precision model"]

    # Rebuild the validation dataset in prediction mode (no boxes/labels).
    dataset_valid = PneumoniaDataset(
        root=args.data,
        pIds=pIds_valid[:100] if args.debug else pIds_valid,
        predict=True,
        boxes=None,
        rescale_factor=args.rescale_factor,
        transform=transform,
        seed=args.seed,
    )
    loader_valid = DataLoader(dataset=dataset_valid,
                              batch_size=args.batch_size,
                              shuffle=False)
    predictions_valid = predict(best_model, loader_valid)

    # Either sweep thresholds on the validation set or use the fixed one.
    if args.train_threshold and not args.debug:
        (
            best_threshold,
            best_avg_precision_valid,
            thresholds,
            avg_precision_valids,
        ) = train_threshold(
            dataset_valid,
            predictions_valid,
            pId_boxes_dict,
            args.rescale_factor,
        )
        print(best_threshold)
        print(best_avg_precision_valid)
        print(thresholds)
        print(avg_precision_valids)
    else:
        best_threshold = args.box_threshold

    if args.save:
        os.makedirs(f"{args.save}/images", exist_ok=True)

    img_precisions = evaluate_threshold(
        dataset_valid,
        predictions_valid,
        best_threshold,
        pId_boxes_dict,
        args.rescale_factor,
        image_save_path=f"{args.save}/images" if args.save_images else None,
    )

    # nansum skips images whose precision is undefined (NaN).
    print(
        f"Total Average Precision: {np.nansum(img_precisions) / len(img_precisions)}"
    )

    save_checkpoint(
        {
            "best_threshold": best_threshold,
            "state_dict": best_model.state_dict(),
        },
        args.save,
        is_final=True,
    )
Ejemplo n.º 19
0
def train_and_evaluate(
    model,
    train_dataloader,
    val_dataloader,
    lr_init,
    optimizer_type,
    lr_decay,
    momentum,
    eps,
    wd,
    loss_fn,
    num_epochs,
    num_steps_train,
    num_steps_eval,
    pId_boxes_dict,
    rescale_factor,
    shape,
    save_path=None,
    restore_file=None,
):
    """Train `model` for `num_epochs` epochs, evaluating on the validation
    loader after each one.

    The optimizer is (re)built every epoch; for non-adagrad optimizers the
    learning rate is halved each epoch (lr_init * 0.5**epoch).

    Returns:
        (histories, best_models) where `histories` maps metric names to
        per-step lists and `best_models` holds snapshots of the models with
        the best validation loss and best validation precision.
    """
    import copy  # local import: only needed to snapshot best models

    # reload weights from restore_file if specified
    if restore_file is not None:
        checkpoint = torch.load(restore_file)
        model.load_state_dict(checkpoint["state_dict"])

    writer = SummaryWriter(f"{os.environ.get('TRAINML_OUTPUT_PATH')}/logs")

    best_val_loss = 1e15
    best_val_prec = 0.0
    best_loss_model = None
    best_prec_model = None

    loss_t_history = []
    loss_v_history = []
    loss_avg_t_history = []
    prec_t_history = []
    prec_v_history = []

    for epoch in range(num_epochs):
        start = time.time()

        # define the optimizer (rebuilt each epoch so the decayed lr applies)
        if optimizer_type == "adagrad":
            lr = lr_init
            optimizer = torch.optim.Adagrad(
                model.parameters(),
                lr=lr,
                lr_decay=lr_decay,
                weight_decay=wd,
            )
        else:
            lr = lr_init * 0.5**float(
                epoch)  # reduce the learning rate at each epoch
            if optimizer_type == "adam":
                optimizer = torch.optim.Adam(model.parameters(),
                                             lr=lr,
                                             eps=eps,
                                             weight_decay=wd)
            elif optimizer_type == "adamw":
                optimizer = torch.optim.AdamW(model.parameters(),
                                              lr=lr,
                                              eps=eps,
                                              weight_decay=wd)
            elif optimizer_type == "adamax":
                optimizer = torch.optim.Adamax(model.parameters(),
                                               lr=lr,
                                               eps=eps,
                                               weight_decay=wd)
            elif optimizer_type == "sgd":
                optimizer = torch.optim.SGD(
                    model.parameters(),
                    lr=lr,
                    momentum=momentum,
                    weight_decay=wd,
                )
            else:
                raise ValueError(
                    'Invalid optimizer_type, allowed values are "adam", "adamw", "adamax", "sgd", "adagrad"'
                )

        # Run one epoch
        print("Epoch {}/{}. Learning rate = {:05.3f}.".format(
            epoch + 1, num_epochs, lr))

        # train model for a whole epoch (one full pass over the training set)
        loss_avg_t_hist_ep, loss_t_hist_ep, prec_t_hist_ep = train_model(
            model,
            train_dataloader,
            optimizer,
            loss_fn,
            num_steps_train,
            pId_boxes_dict,
            rescale_factor,
            shape,
            writer,
            epoch=epoch,
        )
        loss_avg_t_history += loss_avg_t_hist_ep
        loss_t_history += loss_t_hist_ep
        prec_t_history += prec_t_hist_ep

        # Evaluate for one epoch on validation set
        val_metrics = evaluate_model(
            model,
            val_dataloader,
            loss_fn,
            num_steps_eval,
            pId_boxes_dict,
            rescale_factor,
            shape,
            writer,
            epoch=epoch,
        )

        val_loss = val_metrics["loss"]
        val_prec = val_metrics["precision"]

        # Repeat the epoch-level validation value so the validation curves
        # line up with the per-step training curves.
        loss_v_history += len(loss_t_hist_ep) * [val_loss]
        prec_v_history += len(prec_t_hist_ep) * [val_prec]

        is_best_loss = val_loss <= best_val_loss
        is_best_prec = val_prec >= best_val_prec

        # BUG FIX: the original stored a reference to the live model
        # (`best_loss_model = model`), so after training BOTH "best"
        # entries pointed at the FINAL epoch's weights. Snapshot a deep
        # copy at the moment the best metric is observed instead.
        if is_best_loss:
            print("- Found new best loss: {:.4f}".format(val_loss))
            best_val_loss = val_loss
            best_loss_model = copy.deepcopy(model)
        if is_best_prec:
            print("- Found new best precision: {:.4f}".format(val_prec))
            best_val_prec = val_prec
            best_prec_model = copy.deepcopy(model)

        # Save best weights based on best_val_loss and best_val_prec
        if save_path:
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "optim_dict": optimizer.state_dict(),
                },
                save_path,
                is_best=is_best_loss,
                metric="loss",
            )
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "optim_dict": optimizer.state_dict(),
                },
                save_path,
                is_best=is_best_prec,
                metric="prec",
            )

        delta_time = time.time() - start
        print("Epoch run in {:.2f} minutes".format(delta_time / 60.0))

    histories = {
        "loss avg train": loss_avg_t_history,
        "loss train": loss_t_history,
        "precision train": prec_t_history,
        "loss validation": loss_v_history,
        "precision validation": prec_v_history,
    }
    best_models = {
        "best loss model": best_loss_model,
        "best precision model": best_prec_model,
    }

    return histories, best_models
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    save_every: int = 100,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epoch: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (black square) from the image (0<=hide_map_prob<=1)
    - num_load_files_training: Number of training files loaded (and shuffled
      together) per iteration
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_every: Save a checkpoint every `save_every` iterations
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    # Dev and test sets are loaded once and kept in memory for the whole run.
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)

    acc_dev: float = 0.0
    total_training_exampels: int = 0
    printTrace("Training...")
    for epoch in range(num_epoch):
        iteration_no = 0
        num_used_files: int = 0
        # Re-shuffle the training files at the start of every epoch.
        files: List[str] = glob.glob(os.path.join(train_dir, "*.npz"))
        random.shuffle(files)
        # Get files in batches, all files will be loaded and data will be shuffled
        for paths in batch(files, num_load_files_training):
            iteration_no += 1
            num_used_files += num_load_files_training
            model.train()
            start_time: float = time.time()

            X, y = load_and_shuffle_datasets(paths=paths,
                                             fp=16 if fp16 else 32,
                                             hide_map_prob=hide_map_prob)
            total_training_exampels += len(y)
            running_loss = 0.0
            num_batchs = 0

            for X_bacth, y_batch in nn_batchs(X, y, batch_size):
                X_bacth, y_batch = (
                    torch.from_numpy(X_bacth).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )
                optimizer.zero_grad()
                outputs = model.forward(X_bacth)
                loss = criterion(outputs, y_batch)
                if fp16:
                    # apex scales the loss to avoid fp16 gradient underflow.
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                # Clip gradients to norm 1.0 (master params under apex).
                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                running_loss += loss.item()
                num_batchs += 1
            start_time_eval: float = time.time()
            # Print Statistics
            # NOTE(review): unlike the scheduler-based variant of this
            # function, evaluation runs on EVERY iteration here, which can
            # dominate runtime for large dev/test sets.
            if len(X) > 0 and len(y) > 0:
                acc_train = evaluate(
                    model=model,
                    X=torch.from_numpy(X),
                    golds=y,
                    device=device,
                    batch_size=batch_size,
                )
            else:
                acc_train = -1.0

            acc_dev = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )

            acc_test = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(files)} files. "
                f"Total examples used for training {total_training_exampels}. "
                f"Iteration time: {time.time() - start_time} secs. Eval time: {time.time() - start_time_eval} secs."
            )

            printTrace(
                f"Loss: {-1 if num_batchs == 0 else running_loss / num_batchs}. Acc training set: {acc_train}. "
                f"Acc dev set: {acc_dev}. Acc test set: {acc_test}")

            # Keep the model with the best dev-set accuracy seen so far.
            if acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(
                    f"New max acc in dev set {max_acc}. Saving model...")
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )

            # Periodic checkpoint (overwrites the previous one).
            if save_checkpoints and iteration_no % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )

    return max_acc
Ejemplo n.º 21
0
        device = "cuda:0"
    else : # So if cuda is not available don't do some unexpected things, just raise an error.
        raise ValueError("We wanted to execute this training on GPU, but cuda is not available!!\nPlease remove the -g option or make sure cuda is available.")
else :
    device = 'cpu'

print("The training is done on {}".format(device))

# Build the checkpoint file path, optionally rooted at --checkpoint_dir.
if args.checkpoint_dir :
    ckp_filepath = args.checkpoint_dir + ckp_fileprefix + args.architecture + ".pth"
else :
    ckp_filepath = ckp_fileprefix + args.architecture + ".pth"

# Resume from an existing checkpoint, or initialize a fresh model.
if os.path.isfile(ckp_filepath) :
    print("Checkpoint {} recognized, continue training this model!".format(ckp_filepath))
    model = mo.load_checkpoint(ckp_filepath, device)
else :
    print("Checkpoint {} not recognized, starting from scratch!".format(ckp_filepath))
    model = mo.init_model(args.directory, args.architecture, args.learning_rate, args.hidden_units)

# Create an object where we can iterate over the data
dataloaders, img_datasets, _ = ut.get_data_loader(args.directory)

# Is useful in the do_training function
dataset_sizes = {x: len(img_datasets[x])
                 for x in ['train', 'valid', 'test']}
# Now we are ready to do some training
model = mo.do_training(model, dataloaders, dataset_sizes, device, epochs = args.epochs)
# Training finished; persist the network again.
mo.save_checkpoint(model, args.architecture, img_datasets['train'], ckp_filepath)
Ejemplo n.º 22
0
def main():
    """Train an image classifier end to end.

    Parses command-line arguments, loads the train/valid/test datasets,
    builds and trains the model, evaluates it on the test set, saves a
    checkpoint to ``in_arg.save_path``, and prints the total runtime.
    """
    # Measure total program runtime by collecting start time
    start_time = time()

    # Create & retrieve Command Line Arguments
    in_arg = get_input_args()

    # Use CUDA only when the flag is set AND a GPU is actually available.
    # (The original compared `in_arg.gpu == True` and never verified
    # availability, which crashes on CPU-only machines.)
    device = 'cuda' if in_arg.gpu and torch.cuda.is_available() else 'cpu'

    # Per-split directories under the data root
    train_dir = in_arg.data_dir + '/train'
    valid_dir = in_arg.data_dir + '/valid'
    test_dir = in_arg.data_dir + '/test'

    # Load the datasets with ImageFolder (transforms are module-level)
    train_datasets = datasets.ImageFolder(train_dir,
                                          transform=train_transforms)
    valid_datasets = datasets.ImageFolder(valid_dir, transform=test_transforms)
    test_datasets = datasets.ImageFolder(test_dir, transform=test_transforms)

    # Using the image datasets and the transforms, define the dataloaders;
    # only the training data is shuffled.
    trainloader = torch.utils.data.DataLoader(train_datasets,
                                              batch_size=32,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_datasets, batch_size=32)
    testloader = torch.utils.data.DataLoader(test_datasets, batch_size=32)

    # Get the number of classes as the size of the output layer
    output_size = len(train_datasets.classes)

    # Build the model
    model = build_model(in_arg.arch, output_size, in_arg.hidden_units)

    # Insert mapping from class to index and index to class so predictions
    # can be decoded back to class labels at inference time.
    model.class_to_idx = train_datasets.class_to_idx
    model.idx_to_class = {i: c for c, i in model.class_to_idx.items()}

    # Move model to the device before constructing the optimizer
    model = model.to(device)

    # Define criterion
    criterion = nn.NLLLoss()

    # Only train the classifier parameters, feature parameters are frozen
    # (p.requires_grad == False)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=in_arg.learning_rate)

    # Train the model
    train_model(model, trainloader, validloader, in_arg.epochs, criterion,
                optimizer, device)

    # Test the model
    print('Start testing')
    _, test_accuracy = validate_model(model, testloader, criterion, device)
    print('Finished testing')
    print(
        "Accuracy of the model on the test set: {:.3f}".format(test_accuracy))

    # Save checkpoint
    save_checkpoint(in_arg.save_path, model, in_arg.arch, output_size,
                    in_arg.epochs, optimizer)

    # Computes overall runtime and prints it as h:m:s (unpadded integers,
    # matching the original output format).
    tot_time = time() - start_time
    hours, rem = divmod(int(tot_time), 3600)
    minutes, seconds = divmod(rem, 60)
    print(
        "\n** Total Elapsed Runtime:",
        str(hours) + ":" + str(minutes) + ":" + str(seconds))
Ejemplo n.º 23
0
from image import load_images
from workspace_utils import active_session


# Command-line interface for training the image classifier.
parser = argparse.ArgumentParser(description='Training image classifier')
parser.add_argument('data_directory', help='image data directory path with train/valid/test subfolders')
parser.add_argument('--save_dir', help='Model checkpoint saving', default='.')
parser.add_argument('--arch', help='Pretrained model from torchvision', default='vgg16')
parser.add_argument('--learning_rate', type=float, help='Optimizer learning rate', default=0.003)
parser.add_argument('--hidden_units', type=int, help='Number of hidden units in customized classifier', default=256)
parser.add_argument('--epochs', type=int, help='Number of training epochs', default=30)
parser.add_argument('--gpu', action='store_true', default=False, help='Flag to set using GPU')

args = parser.parse_args()
print('Hyperparameters:', args)

dataloaders, class_to_idx = load_images(args.data_directory)

# Use CUDA only when requested AND actually available.
device = torch.device("cuda" if args.gpu and torch.cuda.is_available() else "cpu")

# active_session keeps the remote workspace alive during long training runs.
with active_session():
    print("Start loading model...")
    model = get_torchvision_model(args.arch, args.hidden_units)
    print(model.classifier)
    print("Start training model...")
    train_model(args.learning_rate, model, args.epochs, device, dataloaders)

save_path = save_checkpoint(model, args.save_dir, args.hidden_units, args.arch, class_to_idx)
# Fixed user-facing message typo: "is save at" -> "is saved at".
print('Training complete. Model checkpoint is saved at: %s' % save_path)
Ejemplo n.º 24
0
def main_worker(gpu, ngpus_per_node, args):
    """Per-process training entry point.

    Builds the model (optionally wrapped in DistributedDataParallel),
    restores weights from ``--initial-checkpoint`` / ``--resume``,
    constructs the train/validation datasets and loaders, then runs the
    epoch loop, saving a checkpoint each epoch and tracking the best
    validation loss.

    Args:
        gpu: GPU index for this process, or None for CPU / all-GPU modes.
        ngpus_per_node: number of GPUs on this node; used to compute the
            global rank and to shard batch size / workers in distributed mode.
        args: parsed command-line namespace with all training options.
    """
    args.gpu = gpu

    setup_default_logging()
    _logger = logging.getLogger('train')

    if args.gpu is not None:
        _logger.info("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        # Global rank = node rank * gpus-per-node + local gpu index.
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank,
        )

    # Only local rank 0 (or the single non-distributed process) logs/saves.
    args.verbose = not args.distributed or (args.distributed and
                                            args.rank % ngpus_per_node == 0)

    if args.verbose:
        _logger.info("create model {}".format(args.arch))
    model = create_model(args.arch, args.num_classes, args.pretrained)

    if args.distributed:
        # One process per GPU: shard batch size and workers across processes.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        # Single process driving multiple GPUs.
        else:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        if args.verbose:
            _logger.warning("no gpu for training, using cpu")

    optimizer = optim.Adam(model.parameters(), args.lr)

    start_epoch = args.start_epoch

    # Optional warm start: restore weights only (no optimizer/epoch state).
    if args.initial_checkpoint is not None:
        if os.path.isfile(args.initial_checkpoint):
            if args.verbose:
                _logger.info("initializing model from '{}'".format(
                    args.initial_checkpoint))
            if args.gpu is None:
                checkpoint = torch.load(args.initial_checkpoint)
            else:
                checkpoint = torch.load(args.initial_checkpoint,
                                        map_location='cuda:{}'.format(
                                            args.gpu))
            state_dict = checkpoint['state_dict']
            model.load_state_dict(state_dict)
            if args.verbose:
                _logger.info("initialized model from '{}'".format(
                    args.initial_checkpoint))

    # Full resume: restore weights, optimizer state and epoch counter.
    if args.resume is not None:
        if os.path.isfile(args.resume):
            if args.verbose:
                _logger.info("loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                checkpoint = torch.load(args.resume,
                                        map_location='cuda:{}'.format(
                                            args.gpu))
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.verbose:
                _logger.info("loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='min',
                                               patience=args.patience_epochs,
                                               verbose=args.verbose)

    # Dataset construction: explicit validation set if given, otherwise
    # split the training data.
    if args.vdata is not None and args.val_csv is not None:
        train_df = pd.read_csv(args.csv)
        train_set = create_dataset(args.data,
                                   train_df,
                                   args.mean,
                                   args.std,
                                   args.multi,
                                   evaluate=False)
        val_df = pd.read_csv(args.val_csv)
        # NOTE(review): validation set is built with evaluate=False —
        # confirm evaluation-time transforms are intended here.
        val_set = create_dataset(args.vdata,
                                 val_df,
                                 args.mean,
                                 args.std,
                                 args.multi,
                                 evaluate=False)
    else:
        if args.multi:
            assert args.csv is not None, "Please specify annotation file"
            df = pd.read_csv(args.csv)
            # Random row-level split for the multi-label case.
            val_df = df.sample(frac=args.test_split, random_state=args.seed)
            train_df = df.drop(val_df.index)
            train_set = create_dataset(args.data,
                                       train_df,
                                       args.mean,
                                       args.std,
                                       multi=True,
                                       train=True,
                                       evaluate=False)
            val_set = create_dataset(args.data,
                                     val_df,
                                     args.mean,
                                     args.std,
                                     multi=True,
                                     train=True,
                                     evaluate=False)
        else:
            df = pd.read_csv(args.csv) if args.csv is not None else None
            dataset = create_dataset(args.data,
                                     df,
                                     args.mean,
                                     args.std,
                                     multi=False,
                                     train=True,
                                     evaluate=False)
            # Stratified split: duplicate the dataset and overwrite the
            # path/target lists on each copy.
            train_set = copy.deepcopy(dataset)
            val_set = copy.deepcopy(dataset)
            kf = StratifiedShuffleSplit(n_splits=1,
                                        test_size=args.test_split,
                                        random_state=args.seed)
            train_idx, val_idx = next(kf.split(dataset.paths, dataset.targets))
            train_set.paths = [dataset.paths[i] for i in train_idx]
            train_set.targets = [dataset.targets[i] for i in train_idx]
            val_set.paths = [dataset.paths[i] for i in val_idx]
            val_set.targets = [dataset.targets[i] for i in val_idx]
            # val_set.transforms = transforms.Compose([transforms.ToTensor()])

    if args.verbose:
        _logger.info("Training set:\n{}".format(train_set))
        _logger.info("Validation set:\n{}".format(val_set))

    if args.distributed:
        train_sampler = DistributedSampler(
            dataset=train_set,
            shuffle=True,
        )
        val_sampler = DistributedSampler(
            dataset=val_set,
            shuffle=False,
        )
    else:
        train_sampler = None
        val_sampler = None

    # shuffle must be False when a sampler is supplied (distributed mode).
    train_loader = DataLoader(
        dataset=train_set,
        batch_size=args.batch_size,
        shuffle=not args.distributed,
        sampler=train_sampler,
        num_workers=args.workers,
        pin_memory=True,
        drop_last=True,
    )
    val_loader = DataLoader(
        dataset=val_set,
        batch_size=args.batch_size,
        shuffle=False,
        sampler=val_sampler,
        num_workers=args.workers,
        pin_memory=True,
    )

    # BCE for multi-label targets, cross-entropy for single-label.
    if args.multi:
        train_criterion = nn.BCELoss().cuda(args.gpu)
        val_criterion = nn.BCELoss().cuda(args.gpu)
    else:
        train_criterion = nn.CrossEntropyLoss().cuda(args.gpu)
        val_criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    best_metric = None
    for epoch in range(start_epoch, args.epochs):
        train(args,
              epoch,
              model,
              train_loader,
              optimizer,
              train_criterion,
              logger=_logger)
        val_loss, val_acc, val_recal = validate(args,
                                                epoch,
                                                model,
                                                val_loader,
                                                val_criterion,
                                                logger=_logger)
        scheduler.step(val_loss)
        # First epoch, or strictly better validation loss, is the new best
        # (collapses the original three-way if/elif/else).
        is_best = best_metric is None or val_loss < best_metric
        if is_best:
            best_metric = val_loss
        if args.verbose:
            checkpoint = {
                'epoch':
                epoch,
                'state_dict':
                model.state_dict()
                if not args.distributed else model.module.state_dict(),
                'optimizer':
                optimizer.state_dict()
            }
            save_checkpoint(checkpoint, args.output, epoch, val_loss, val_acc,
                            is_best)
        # Synchronize ranks only in distributed mode: calling dist.barrier()
        # without an initialized process group raises a RuntimeError, which
        # the original unconditional call triggered in single-process runs.
        if args.distributed:
            dist.barrier()
Ejemplo n.º 25
0
        running_loss = 0.
        running_acc = 0.

        for inputs, labels in tqdm(train_dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                predicts = model(inputs)
                loss_value = loss(predicts, labels)
                predicts_class = predicts.argmax(dim=1)
                loss_value.backward()
                optimizer.step()

            running_loss += loss_value.item()
            running_acc += (predicts_class == labels.data).float().mean()

        epoch_loss = running_loss / len(train_dataloader)
        epoch_acc = running_acc / len(train_dataloader)

        train_history_loss.append(epoch_loss)
        train_history_acc.append(epoch_acc)
        print('Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc),
              flush=True)
        if epoch % 10 == 0:
            model.save_checkpoint(epoch, model, optimizer, epoch_loss,
                                  path_model_save)

    print('end train')
Ejemplo n.º 26
0
def self_training(model,
                  labeled_dataset,
                  unlabeled_dataset,
                  optimizer,
                  scheduler=None,
                  batch_size=4,
                  train_ratio=0.7,
                  score_threshold=0.7,
                  unlabeled_loss_weight=0.1,
                  relabel_step=None,
                  device='cpu',
                  max_epochs=100,
                  print_freq=10,
                  save_path=None,
                  checkpoint=None):
    """Run a self-training (pseudo-labeling) loop for a detection model.

    Each epoch reloads the pickled train/val splits from
    ``<save_path>/dataset``, pseudo-labels the unlabeled data with the
    current model (keeping detections above ``score_threshold``), trains
    one epoch on the labeled set (weight 1) and one on the pseudo-labeled
    set (weight ``unlabeled_loss_weight``), evaluates both with COCO
    metrics, and checkpoints the model.

    Args:
        model: detection model to train (moved to ``device``).
        labeled_dataset / unlabeled_dataset: currently unused here — the
            splits are read from the pickles under ``save_path`` instead.
        optimizer: optimizer for the model parameters.
        scheduler: optional LR scheduler, stepped once per epoch.
        batch_size: batch size for all four dataloaders.
        train_ratio / relabel_step: currently unused (see the commented-out
            split_dataset calls).
        score_threshold: confidence cutoff for pseudo-labels.
        unlabeled_loss_weight: weight of the pseudo-labeled loss term.
        device: torch device string.
        max_epochs: train up to this epoch index (exclusive).
        print_freq: logging frequency passed to the epoch helpers.
        save_path: directory holding the ``dataset`` pickles and receiving
            checkpoints.
        checkpoint: optional checkpoint path to resume from.
    """
    model.to(device)

    cur_epoch = 0
    # train_labeled_dataset, val_labeled_dataset = split_dataset(labeled_dataset, train_ratio)
    # train_unlabeled_dataset, val_unlabeled_dataset = split_dataset(unlabeled_dataset, train_ratio)
    # NOTE(review): save_path=None crashes here (os.path.join(None, ...)),
    # so the parameter is effectively required — confirm with callers.
    dataset_path = os.path.join(save_path, 'dataset')

    if checkpoint is not None:
        print("loading checkpoint:" + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)

    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        # Reload the four pre-pickled splits every epoch. NOTE: pickle is
        # only safe on trusted, locally produced files.
        with open(os.path.join(dataset_path, 'train_labeled_dataset.pickle'),
                  'rb') as handle:
            train_labeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'val_labeled_dataset.pickle'),
                  'rb') as handle:
            val_labeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'train_unlabeled_dataset.pickle'),
                  'rb') as handle:
            train_unlabeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'val_unlabeled_dataset.pickle'),
                  'rb') as handle:
            val_unlabeled_dataset = pickle.load(handle)

        train_unlabeled_dataset = convert_subset(train_unlabeled_dataset)
        val_unlabeled_dataset = convert_subset(val_unlabeled_dataset)

        labeled_train_loader = DataLoader(train_labeled_dataset,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=True)
        labeled_vld_loader = DataLoader(val_labeled_dataset,
                                        collate_fn=collate_fn,
                                        batch_size=batch_size,
                                        shuffle=False)
        # Pseudo-label the unlabeled splits with the current model.
        pseudo_train = FLIRPseudoDataset(model,
                                         train_unlabeled_dataset,
                                         batch_size=batch_size,
                                         device=device,
                                         score_threshold=score_threshold)
        pseudo_val = FLIRPseudoDataset(model,
                                       val_unlabeled_dataset,
                                       batch_size=batch_size,
                                       device=device,
                                       score_threshold=score_threshold)
        unlabeled_train_loader = DataLoader(pseudo_train,
                                            collate_fn=collate_fn,
                                            batch_size=batch_size,
                                            shuffle=True)
        unlabeled_vld_loader = DataLoader(pseudo_val,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=False)

        # One epoch on labeled data (weight 1), one on pseudo-labeled data.
        train_label_loss = train_one_epoch_self_training(
            model, optimizer, labeled_train_loader, 1, device, epoch,
            print_freq)
        train_loss = train_one_epoch_self_training(model, optimizer,
                                                   unlabeled_train_loader,
                                                   unlabeled_loss_weight,
                                                   device, epoch, print_freq)
        train_loss = train_label_loss + unlabeled_loss_weight * train_loss
        # NOTE(review): all_training_loss / all_evaluation_loss appear to be
        # module-level lists — confirm they are defined at import time.
        all_training_loss.append(train_loss)

        coco_evaluate(model, labeled_vld_loader, device)
        # labeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)
        coco_evaluate(model, unlabeled_vld_loader, device)
        # unlabeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)

        # loss = labeled_loss + unlabeled_loss_weight * unlabeled_loss
        loss = 0
        all_evaluation_loss.append(loss)

        if save_path is not None:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
        print("epoch {}, train loss {}, validation loss {}".format(
            epoch + 1, train_loss, loss))

        if scheduler is not None:
            scheduler.step()
Ejemplo n.º 27
0
            print('-' * 89)

            generate_output(args, epoch, model, gen_dataset, startPoint=1500)

            if epoch % args.save_interval == 0:
                # Save the model if the validation loss is the best we've seen so far.
                is_best = val_loss > best_val_loss
                best_val_loss = max(val_loss, best_val_loss)
                model_dictionary = {
                    'epoch': epoch,
                    'best_loss': best_val_loss,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'args': args
                }
                model.save_checkpoint(model_dictionary, is_best)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

# Calculate mean and covariance for each channel's prediction errors; they are
# saved with the trained model (used downstream for anomaly scoring).
print('=> calculating mean and covariance')
means, covs = list(), list()
# Batch size 1 so the error statistics are fit on the raw training sequence.
train_dataset = TimeseriesData.batchify(args, TimeseriesData.trainData, bsz=1)
for channel_idx in range(model.enc_input_size):
    mean, cov = fit_norm_distribution_param(
        args, model, train_dataset[:TimeseriesData.length], channel_idx)
    # Two plain statements instead of the original `a.append(x), b.append(y)`
    # tuple-expression trick, which obscured the side effects.
    means.append(mean)
    covs.append(cov)
model_dictionary = {
    'epoch': max(epoch, start_epoch),