Example #1
    def __init__(self):
        # load config file
        with open("model/config.json") as config_file:
            config = json.load(config_file)
        # get the image processor
        self._imageProcessor = ImageProcessor(config)
        # load the DL model (change this if you are not using ONNX)
        target_label_names = [
            'necrosis', 'contrast_enhancing', 'core', 'tumor', 'brain'
        ]
        net1 = UNET_3D_to_2D(0,
                             channels_in=4,
                             channels=128,
                             growth_rate=12,
                             dilated_layers=[6, 6, 6, 6],
                             output_channels=len(target_label_names))
        net2 = UNET_3D_to_2D(1,
                             channels_in=4,
                             channels=128,
                             growth_rate=12,
                             dilated_layers=[6, 6, 6],
                             output_channels=len(target_label_names))
        net1 = net1.cuda()
        net2 = net2.cuda()
        load_checkpoint(net1, 'model/checkpoint.pth.tar')
        load_checkpoint(net2, 'model/checkpoint_2.pth.tar')
        self._model1 = net1
        self._model2 = net2
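The comment above says to change the model-loading step if ONNX is not being used; conversely, if the two UNET checkpoints were exported to ONNX, the PyTorch loading could be swapped for ONNX Runtime sessions. A minimal sketch, assuming onnxruntime is installed and a hypothetical model/net1.onnx export exists (the input shape is only illustrative):

import numpy as np
import onnxruntime as ort

# Hypothetical ONNX export; the example above ships .pth.tar checkpoints instead.
session = ort.InferenceSession("model/net1.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
dummy_volume = np.zeros((1, 4, 64, 64, 64), dtype=np.float32)  # channels_in=4, spatial size illustrative
outputs = session.run(None, {input_name: dummy_volume})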
def main():
    model = YOLOv3(num_classes=config.NUM_CLASSES).to(config.DEVICE)
    optimizer = optim.Adam(model.parameters(),
                           lr=config.LEARNING_RATE,
                           weight_decay=config.WEIGHT_DECAY)
    loss_fn = YoloLoss()
    scaler = torch.cuda.amp.GradScaler()

    train_loader, test_loader, train_eval_loader = get_loaders(
        train_csv_path=config.DATASET + "/train.csv",
        test_csv_path=config.DATASET + "/test.csv")

    if config.LOAD_MODEL:
        load_checkpoint(config.CHECKPOINT_FILE, model, optimizer,
                        config.LEARNING_RATE)

    scaled_anchors = (torch.tensor(config.ANCHORS) * torch.tensor(
        config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)).to(config.DEVICE)

    for epoch in range(config.NUM_EPOCHS):
        #plot_couple_examples(model, test_loader, 0.6, 0.5, scaled_anchors)
        train_fn(train_loader, model, optimizer, loss_fn, scaler,
                 scaled_anchors)

        #if config.SAVE_MODEL:
        #    save_checkpoint(model, optimizer, filename=f"checkpoint.pth.tar")

        #print(f"Currently epoch {epoch}")
        #print("On Train Eval loader:")
        #print("On Train loader:")
        #check_class_accuracy(model, train_loader, threshold=config.CONF_THRESHOLD)

        if epoch > 0 and epoch % 3 == 0:
            check_class_accuracy(model,
                                 test_loader,
                                 threshold=config.CONF_THRESHOLD)
            pred_boxes, true_boxes = get_evaluation_bboxes(
                test_loader,
                model,
                iou_threshold=config.NMS_IOU_THRESH,
                anchors=config.ANCHORS,
                threshold=config.CONF_THRESHOLD,
            )
            mapval = mean_average_precision(
                pred_boxes,
                true_boxes,
                iou_threshold=config.MAP_IOU_THRESH,
                box_format="midpoint",
                num_classes=config.NUM_CLASSES,
            )
            print(f"MAP: {mapval.item()}")
            model.train()
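The scaled_anchors expression above rescales the relative anchor boxes to grid-cell units for each of the three prediction scales. A hedged, self-contained illustration of the broadcast; the ANCHORS values and grid sizes S are typical YOLOv3 settings assumed here, not taken from config.py:

import torch

ANCHORS = [[(0.28, 0.22), (0.38, 0.48), (0.90, 0.78)],
           [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
           [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)]]  # 3 scales x 3 anchors x (w, h) as image fractions
S = [13, 26, 52]  # grid sizes for a 416x416 input

# (3,) -> (3, 1, 1) -> repeated to (3, 3, 2), then multiplied element-wise
scaled_anchors = torch.tensor(ANCHORS) * torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
print(scaled_anchors.shape)  # torch.Size([3, 3, 2]); anchors now measured in grid cells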
Example #3
def Main():
	parser = argparse.ArgumentParser()
	parser.add_argument("image", help = "The input image to be predicted.", type = str)
	parser.add_argument("checkpoint", help = "Use a mapping of categories to real names", type = str)
	parser.add_argument("--gpu", help = "Use GPU instead of CPU.", action = "store_true")
	parser.add_argument("--topk", help = "Return top K most likely classes. ", type = int, default = 1)
	parser.add_argument("--category_names", help = "Use a mapping of categories to real names", type = str, default = None)    
	args = parser.parse_args()

	if args.gpu:
		device = 'cuda'
		print('Compute using GPU')
	else:
		device = 'cpu'
		print('Compute using CPU')

	checkpoint_path, checkpoint_name = None, args.checkpoint #'checkpoint.pth'
	image_path = args.image #'flowers/test/1/image_06743.jpg'
	topk = args.topk
	category_names = args.category_names #'cat_to_name.json'

	model = load_checkpoint(file_path = checkpoint_path, file_name = checkpoint_name)
	cat_to_name = None
	if category_names is not None:
		cat_to_name = load_cat_json(file_name = category_names)
	
	probs, classes, class_name = predict(image_path, model, topk = topk, cat_to_name = cat_to_name, device = device, probs_show = True)
Example #4
def main():
    in_arg = get_input_args()  # Creates and returns command line arguments

    print('\nPath To Image:\n', in_arg.path_to_image, '\n', '\nCheckpoint:\n',
          in_arg.checkpoint, '\n')

    print('Optional Command Line Arguments:\n', 'Top K [--top_k]: ',
          in_arg.top_k, '\n', 'Category Names [--category_names]: ',
          in_arg.category_names, '\n', 'GPU [--gpu]: ', in_arg.gpu, '\n')

    label_count, hidden_units, arch, class_to_idx, classifier_state_dict, epochs = mod.load_checkpoint(
        in_arg.checkpoint, in_arg.gpu)  # Load checkpoint

    model = mod.build_model(label_count, hidden_units, arch,
                            class_to_idx)  # Build model

    model.classifier.load_state_dict(classifier_state_dict)
    criterion = nn.NLLLoss()

    image = util.process_image(in_arg.path_to_image)  # Pre-process image

    labels = util.get_labels(
        in_arg.category_names)  # Get dict of categories mapped to real names

    mod.predict(image, model, labels, in_arg.top_k,
                in_arg.gpu)  # Prints Top K Labels and Probabilities
def continue_training(
    checkpoint_path: str,
    train_dir: str = "Data\\GTAV-AI\\data-v2\\train\\",
    dev_dir: str = "Data\\GTAV-AI\\data-v2\\dev\\",
    test_dir: str = "Data\\GTAV-AI\\data-v2\\test\\",
    output_dir: str = "Data\\models\\",
    batch_size: int = 10,
    num_epoch: int = 20,
    hide_map_prob: float = 0.0,
    save_checkpoints=True,
    save_best=True,
):
    """
    Load a checkpoint and continue training, we will restore the model, the optimizer and the nvidia apex data if
    the model was trained using fp16. Note: If the model was trained using fp16 it cannot be restored as an fp32
    model and vice versa. The floating point precision used for training the model will be restored automatically
    from the checkpoint.

    Input:
    - checkpoint_path: Path of the checkpoint to restore
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - num_epochs: Number of epochs to do
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - hide_map_prob: Probability for removing the minimap (black square) from the image (0<=hide_map_prob<=1)
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:

    """

    model, optimizer_name, optimizer, acc_dev, epoch, fp16, opt_level = load_checkpoint(
        checkpoint_path, device)
    model = model.to(device)

    max_acc = train(
        model=model,
        optimizer_name=optimizer_name,
        optimizer=optimizer,
        train_dir=train_dir,
        dev_dir=dev_dir,
        test_dir=test_dir,
        output_dir=output_dir,
        batch_size=batch_size,
        initial_epoch=epoch,
        num_epoch=num_epoch,
        max_acc=acc_dev,
        hide_map_prob=hide_map_prob,
        fp16=fp16,
        amp_opt_level=opt_level if fp16 else None,
        save_checkpoints=save_checkpoints,
        save_best=save_best,
    )

    print(f"Training finished, max accuracy in the development set {max_acc}")
Example #6
def main():
    # Fetch user input
    user_input = get_predict_arguments()
    image_path = user_input.image_path
    checkpoint_path = user_input.checkpoint_path
    top_k = user_input.top_k
    category_names_file_path = user_input.category_names
    gpu = user_input.gpu
    device = m.determine_device(gpu)

    # Load model and checkpoint
    model, checkpoint = m.load_checkpoint(checkpoint_path, device)

    # Predict
    top_probabilities, top_classes = predict(image_path, model, device, top_k)

    top_classes.squeeze_()
    top_probabilities.squeeze_()

    # Load categories-to-names
    with open(category_names_file_path, 'r') as f:
        cat_to_name = json.load(f)

    classes_to_indexes = checkpoint['class_to_index']
    indexes_to_classes = {v: k for k, v in classes_to_indexes.items()}

    named_classes = [
        cat_to_name[indexes_to_classes[top_class]]
        for top_class in top_classes.cpu().numpy()
    ]

    print("Prediction Results")
    print(f"- Model Used: {checkpoint['architecture']}")
    print("  - Details:")
    print(
        f"    - Inputs: {checkpoint['inputs']}\n",
        f"    - Outputs: {checkpoint['outputs']}\n",
        f"    - Hidden Layers: {checkpoint['hidden_layers']}\n",
        f"    - Dropout: {checkpoint['dropout']}\n"
        f"    - Epochs: {checkpoint['epochs']}\n")

    print(f"Top {top_k} Probabilities:")
    for named_class, probability in zip(named_classes, top_probabilities):
        print("- Predicted: {} --> {:.3f}".format(named_class.capitalize(),
                                                  probability))
def train_teacher_model(model,
                        labeled_dataset,
                        optimizer,
                        scheduler=None,
                        train_ratio=0.7,
                        batch_size=4,
                        device='cpu',
                        max_epochs=100,
                        print_freq=10,
                        save_path=None,
                        checkpoint=None):
    model.to(device)
    metric_logger = utils.MetricLogger(delimiter=" ")
    last_loss = 1e9

    cur_epoch = 0
    if checkpoint is not None:
        print("loading checkpoint:" + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)

    train_dataset, vld_dataset = split_dataset(labeled_dataset, train_ratio)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    vld_loader = DataLoader(vld_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)
    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq)
        loss = evaluate(model, vld_loader, device, epoch, print_freq)

        if loss < last_loss and save_path is not None:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
            last_loss = loss
        if scheduler is not None:
            scheduler.step()
Example #8
def checkpoint2model(checkpoint_path: str, model_dir: str):
    """
    Given a checkpoint file, generates a model file that can be loaded by run_TEDD1104.py script.
    Input:
     - checkpoint_path path of checkpoint file (checkpoint.pt)
     - model_path directory where the model is going to be saved (model.bin and model_hyperparameters.json)
    Output:
     """

    if not os.path.exists(model_dir):
        print(f"{model_dir} does not exits. We will create it.")
        os.makedirs(model_dir)

    print_message(f"Loading checkpoint: {checkpoint_path}")

    (
        tedd1104_model,
        _,
        _,
        _,
        running_loss,
        total_batches,
        total_training_examples,
        acc_dev,
        epoch,
        fp16,
        _,
    ) = model.load_checkpoint(path=checkpoint_path,
                              device=model.torch.device("cpu"))

    print(f">>>>>> Checkpoint info <<<<<<\n"
          f"Running loss: {running_loss/total_batches}\n"
          f"Num epochs: {epoch+1}\n"
          f"Total training examples: {total_training_examples}\n"
          f"Acc dev set: {round(acc_dev*100,2)}\n"
          f"FP16: {fp16}\n")

    print_message(f"Saving model in {model_dir}")

    model.save_model(model=tedd1104_model, save_dir=model_dir, fp16=fp16)

    print_message(f"Done!")
Example #9
def main():
    # Measures total program runtime by collecting start time
    start_time = time()

    # Creates & retrieves Command Line Arugments
    in_arg = get_input_args()

    # Set device to cuda if gpu flag is set
    device = 'cuda' if in_arg.gpu else 'cpu'

    # If given, read the mapping of categories to class names
    cat_to_name = {}
    if in_arg.category_names:
        with open(in_arg.category_names, 'r') as f:
            cat_to_name = json.load(f)

    # Load checkpoint
    model, _, _ = load_checkpoint(in_arg.checkpoint)

    # Predict classes
    probs, classes = predict(in_arg.img_path, model, device, in_arg.top_k)

    # Convert categories to real names if a mapping was given
    if cat_to_name:
        classes = [cat_to_name[str(cat)] for cat in classes]

    # Print results
    print('\nThe top {} most likely classes are:'.format(in_arg.top_k))
    max_name_len = len(max(classes, key=len))
    row_format = "{:<" + str(max_name_len + 2) + "}{:<.4f}"
    for prob, name in zip(probs, classes):
        print(row_format.format(name, prob))

    # Measure total program runtime by collecting end time
    end_time = time()

    # Computes overall runtime in seconds & prints it in hh:mm:ss format
    tot_time = end_time - start_time
    print(
        "\n** Total Elapsed Runtime:",
        str(int((tot_time / 3600))) + ":" + str(int(
            (tot_time % 3600) / 60)) + ":" + str(int((tot_time % 3600) % 60)))
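The hand-rolled hh:mm:ss arithmetic above can be written more compactly with datetime.timedelta, which formats whole seconds as H:MM:SS. An equivalent sketch, reusing the start_time/end_time variables from the example:

from datetime import timedelta

tot_time = end_time - start_time
print("\n** Total Elapsed Runtime:", str(timedelta(seconds=int(tot_time))))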
Example #10
def main():
    print('Predict')
    in_arg = get_input_args()
    print("Command Line Arguments:\n input =", in_arg.input, "\n checkpoint =",
          in_arg.checkpoint, "\n top_k =", in_arg.top_k, "\n category_names =",
          in_arg.category_names, "\n gpu =", in_arg.gpu)

    # Load checkpoint
    model, checkpoint = load_checkpoint(in_arg.checkpoint)

    # Load category mapping dictionary
    cat_to_name = category_mapping(in_arg.category_names)

    # Process the image to return a transposed_image
    transposed_image = process_image(in_arg.input)

    # Get the prediction for an image file.
    top_classes = predict(transposed_image, model, in_arg.top_k, cat_to_name,
                          in_arg.gpu)
    # Print the chart with the top classes and probabilities.
    print(top_classes)
                    help='Checkpoint of the model')
parser.add_argument('--top_k', action='store',
                    dest='top_k',
                    type=int,
                    help='Top number of most likely classes')
parser.add_argument('--category_names', action='store',
                    dest='category_names',
                    default=None,
                    help='Mapping of categories')
parser.add_argument('--gpu', action='store_true',
                    default=False,
                    dest='gpu',
                    help='Set training to gpu')

results = parser.parse_args()

model = model.load_checkpoint("model_checkpoint.pth")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model.to(device)

image = utils.process_image(results.image_path).to(device)
np_image = image.unsqueeze_(0)

model.eval()
with torch.no_grad():
    log_ps = model.forward(np_image)
    
ps = torch.exp(log_ps)
    
top_probs, top_idx_probs = ps.topk(results.top_k, dim=1)
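The indices returned by topk refer to positions in the model's output layer; as in the other examples in this collection, they can be mapped back to dataset classes by inverting the model's class_to_idx dictionary and then to readable names via the category JSON. A minimal sketch of that final step, assuming the restored model exposes class_to_idx and that results.category_names points at a cat_to_name-style JSON file:

import json

idx_to_class = {v: k for k, v in model.class_to_idx.items()}
top_classes = [idx_to_class[i] for i in top_idx_probs.squeeze(0).tolist()]
if results.category_names:
    with open(results.category_names, 'r') as f:
        cat_to_name = json.load(f)
    top_classes = [cat_to_name[c] for c in top_classes]
for name, prob in zip(top_classes, top_probs.squeeze(0).tolist()):
    print(f'Class: {name}, Probability: {prob:.4f}')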
Example #12
    print(DEVICE, torch.cuda.is_available())
    model = model.to(DEVICE)
    # model = nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    loss = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, weight_decay=.96)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000)
    dataset = SRTrainDataset(
        hr_dir='/usr/project/xtmp/superresoluter/dataset/DIV2K/DIV2K_train_HR/',
        lr_dir=
        '/usr/project/xtmp/superresoluter/dataset/DIV2K/DIV2K_train_LR_bicubic/X4',
        lr_parse=lambda x: x.replace('x4', ''))
    loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=8)
    begin_epoch = 0

    ckpt = load_checkpoint(load_dir='./checkpoints/',
                           map_location=None,
                           model_name='down_sample')
    if ckpt is not None:
        print('recovering from checkpoints...')
        model.load_state_dict(ckpt['model'])
        begin_epoch = ckpt['epoch'] + 1
        print('resuming training')

    begin = time()
    with open(os.path.join('../logs', 'down_sample.log'), 'w') as f:
        for epoch in range(begin_epoch, 1000):
            epoch_loss = []
            for bid, batch in enumerate(loader):
                hr, lr = batch['hr'].to(DEVICE), batch['lr'].to(DEVICE)
                optimizer.zero_grad()
                ds = model(hr)
Example #13
# Get an object to reference all our input arguments
args = parser.parse_args()

# First determine if this piece of code has to be executed on GPU or not.
if args.GPU:
    # We want to execute this on cuda, but maybe cuda is not available.
    if torch.cuda.is_available():
        device = "cuda:0"
    else:  # So if cuda is not available don't do some unexpected things, just raise an error.
        raise ValueError(
            "We wanted to execute this training on GPU, but cuda is not available!!\nPlease remove the -g option or make sure cuda is available."
        )
else:
    device = 'cpu'

cat_to_name = None
# Only if the default or supplied category_names parameter points to a file will we try to map category ids to flower names.
# An invalid path to the json file is simply ignored.
if os.path.isfile(args.category_names):
    with open(args.category_names, 'r') as f:
        cat_to_name = json.load(f)

# Now we are ready to load the model
model = mo.load_checkpoint(args.checkpoint, device)

# Get a reference to the plot we want to make and save it
fig = ut.show_prediction(args.image_filepath, model, device, args.top_k,
                         cat_to_name)

fig.savefig("pred_" + os.path.basename(args.image_filepath))
Example #14
def load_checkpoint(model,
                    optimizer,
                    lr_scheduler,
                    args,
                    load_optimizer_states=True):
    """Load a model checkpoint."""

    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if args.deepspeed:

        checkpoint_name, sd = model.load_checkpoint(
            args.load, iteration, load_optimizer_states=not args.no_load_optim)
        if "client_lr_scheduler" in sd:
            lr_scheduler.load_state_dict(sd["client_lr_scheduler"])
            print_rank_0("Load lr scheduler state")
        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration

    else:

        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['module'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None and load_optimizer_states:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but unable to load iteration '
                    'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load the rng state from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the random '
                'state.'.format(checkpoint_name))
            exit()

    if mpu.get_data_parallel_rank() == 0:
        print('  successfully loaded {}'.format(checkpoint_name))

    return iteration
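In the non-DeepSpeed branch above, the checkpoint is simply a torch.save dict keyed by 'module', 'optimizer', 'lr_scheduler', 'iteration' plus the rng-state entries. A hedged sketch of a save routine that would produce a file this loader accepts, reusing the helpers from the example (get_checkpoint_name, mpu, torchDDP, print_rank_0) and assuming an args.save destination directory that mirrors args.load:

def save_checkpoint_sketch(iteration, model, optimizer, lr_scheduler, args):
    """Write a checkpoint whose keys mirror exactly what load_checkpoint above reads."""
    checkpoint_name = get_checkpoint_name(args.save, iteration, False)
    module = model.module if isinstance(model, torchDDP) else model
    sd = {
        'iteration': iteration,
        'module': module.state_dict(),
        'optimizer': optimizer.state_dict() if optimizer is not None else None,
        'lr_scheduler': lr_scheduler.state_dict() if lr_scheduler is not None else None,
        'random_rng_state': random.getstate(),
        'np_rng_state': np.random.get_state(),
        'torch_rng_state': torch.get_rng_state(),
        'cuda_rng_state': torch.cuda.get_rng_state(),
        'rng_tracker_states': mpu.get_cuda_rng_tracker().get_states(),
    }
    torch.save(sd, checkpoint_name)
    print_rank_0('  saved {}'.format(checkpoint_name))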
Example #15
        device = "cuda:0"
    else : # So if cuda is not available don't do some unexpected things, just raise an error.
        raise ValueError("We wanted to execute this training on GPU, but cuda is not available!!\nPlease remove the -g option or make sure cuda is available.")
else :
    device = 'cpu'

print("The training is done on {}".format(device))

if args.checkpoint_dir :
    ckp_filepath = args.checkpoint_dir + ckp_fileprefix + args.architecture + ".pth"
else :
    ckp_filepath = ckp_fileprefix + args.architecture + ".pth"

if os.path.isfile(ckp_filepath) :
    print("Checkpoint {} recognized, continue training this model!".format(ckp_filepath))
    model = mo.load_checkpoint(ckp_filepath, device)
else :
    print("Checkpoint {} not recognized, starting from scratch!".format(ckp_filepath))
    model = mo.init_model(args.directory, args.architecture, args.learning_rate, args.hidden_units)

# Create an object where we can iterate over the data
dataloaders, img_datasets, _ = ut.get_data_loader(args.directory)

# Useful in the do_training function
dataset_sizes = {x: len(img_datasets[x])
                 for x in ['train', 'valid', 'test']}
# Now we are ready to do some training
model = mo.do_training(model, dataloaders, dataset_sizes, device, epochs = args.epochs)
# Training is done, now save the network again
mo.save_checkpoint(model, args.architecture, img_datasets['train'], ckp_filepath)
Example #16
        # loss.backward()
        # optimizer.step()

        pred_choice = outputs.data.max(1)[1]
        correct += pred_choice.eq(labels.data).cpu().sum()
        sum += len(labels)
        print('batch_index: [%d/%d]' % (batch_index, len(evalloader)),
              'Eval epoch: [%d]' % (epoch),
              'correct/sum:%d/%d, %.4f' % (correct, sum, correct / sum))

if __name__ == '__main__':
    # Whether to load model parameters from a checkpoint
    load = False

    if load:
        checkpoint = model.load_checkpoint()
        net.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = 0

    # Set up the optimizer
    optimizer = optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0)
    # optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=1e-1, weight_decay=1e-4)

    for epoch in range(start_epoch, n_epoch):
        train(epoch)

        # Save the parameters (checkpoint)
        checkpoint = {'epoch': epoch, 'state_dict': net.state_dict(), 'optimizer': optimizer.state_dict()}
        model.save_checkpoint(checkpoint)
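The checkpoint dict written above stores the optimizer state as well, but the load branch at the top of this __main__ block only restores the network weights and the epoch counter. A small sketch of a fully symmetric restore, reusing the net/optim/model names from the snippet and constructing the optimizer before its state is loaded:

    if load:
        checkpoint = model.load_checkpoint()
        net.load_state_dict(checkpoint['state_dict'])
        optimizer = optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0)
        optimizer.load_state_dict(checkpoint['optimizer'])  # restore optimizer state too
        start_epoch = checkpoint['epoch'] + 1
    else:
        optimizer = optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0)
        start_epoch = 0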
Example #17
def train(opts,
          model,
          train_data,
          val_data,
          num_epochs,
          resume_from_epoch=None):
    train_loader = DataLoader(train_data,
                              batch_size=opts.batch_size,
                              shuffle=True,
                              num_workers=opts.dataloader_workers,
                              pin_memory=True)
    val_loader = DataLoader(val_data,
                            batch_size=opts.batch_size,
                            shuffle=False,
                            num_workers=opts.dataloader_workers,
                            pin_memory=True)

    if opts.hyperparameter == False:
        if os.path.exists(
                os.path.join(opts.results_dir, opts.experiment_name,
                             'training')):
            previous_runs = os.listdir(
                os.path.join(opts.results_dir, opts.experiment_name,
                             'training'))
            if len(previous_runs) == 0:
                run_number = 1
            else:
                run_number = max(
                    [int(s.split('run_')[1]) for s in previous_runs]) + 1
        else:
            run_number = 1
    elif opts.hyperparameter == True:
        if os.path.exists(
                os.path.join(opts.results_dir, 'hyperparameter_tuning',
                             'training')):
            previous_runs = os.listdir(
                os.path.join(opts.results_dir, 'hyperparameter_tuning',
                             'training'))
            if len(previous_runs) == 0:
                run_number = 1
            else:
                run_number = max(
                    [int(s.split('run_')[1]) for s in previous_runs]) + 1
        else:
            run_number = 1

    log_dir_num = 'run_%02d' % run_number
    print("Currently on run #: ", run_number)
    log_learning_rate = 'lr_{}'.format(opts.lr)
    log_batch_size = 'batch_{}'.format(opts.batch_size)
    log_loss_type = '{}'.format(opts.loss_type)
    log_loss_formulation = '{}'.format(opts.loss_formulation)
    log_method = '{}'.format(opts.method)
    log_normalization = 'norm_{}'.format(opts.image_normalization)
    log_compensating = 'comp_{}'.format(opts.compensated_target)

    if opts.hyperparameter == False:
        train_log_dir = os.path.join(opts.results_dir, opts.experiment_name,
                                     'training', log_dir_num,
                                     log_learning_rate, log_batch_size,
                                     log_loss_type, log_loss_formulation,
                                     log_method, log_normalization,
                                     log_compensating)
        val_log_dir = os.path.join(opts.results_dir, opts.experiment_name,
                                   'validation', log_dir_num,
                                   log_learning_rate, log_batch_size,
                                   log_loss_type, log_loss_formulation,
                                   log_method, log_normalization,
                                   log_compensating)
        train_writer = SummaryWriter(train_log_dir)
        val_writer = SummaryWriter(val_log_dir)

    elif opts.hyperparameter == True:
        train_log_dir = os.path.join(opts.results_dir, 'hyperparameter_tuning',
                                     'training', log_dir_num,
                                     log_learning_rate, log_batch_size,
                                     log_loss_type, log_loss_formulation,
                                     log_method, log_normalization,
                                     log_compensating)
        val_log_dir = os.path.join(opts.results_dir, 'hyperparameter_tuning',
                                   'validation', log_dir_num,
                                   log_learning_rate, log_batch_size,
                                   log_loss_type, log_loss_formulation,
                                   log_method, log_normalization,
                                   log_compensating)
        train_writer = SummaryWriter(train_log_dir)
        val_writer = SummaryWriter(val_log_dir)

    opts.save_txt('config.txt', log_dir_num)

    ### Load from Checkpoint
    if resume_from_epoch is not None:
        try:
            initial_epoch = model.load_checkpoint(resume_from_epoch) + 1
            iterations = (initial_epoch - 1) * opts.batch_size
        except FileNotFoundError:
            print('No model available for epoch {}, starting fresh'.format(
                resume_from_epoch))
            initial_epoch = 1
            iterations = 0

    else:
        initial_epoch = 1
        iterations = 0

    ### TRAIN AND VALIDATE ###
    if opts.jobs == 1:
        opts.best_model = 1e12

    if opts.hyperparameter == True:
        best_loss_this_run = None

    # MODEL PARAMETERS
    opt = torch.optim.Adam(model.parameters(), lr=opts.lr)

    if opts.loss_type == 'l1':
        loss_function = nn.L1Loss()
    elif opts.loss_type == 'mse':
        loss_function = nn.MSELoss()

    for epoch in range(initial_epoch, num_epochs + 1):
        epoch_start = time.perf_counter()

        # TRAIN
        epoch_train_loss = None
        set_mode('train', model)

        bar = progress.bar.Bar('Epoch {} train'.format(epoch),
                               max=len(train_loader))

        for data in train_loader:
            image, image_data = set_data(data, opts)
            loss_p = optimize(model, opt, loss_function, opts, image,
                              image_data)
            if opts.loss_type == 'mse':
                loss_p = torch.sqrt(loss_p)
            if epoch_train_loss is None:
                epoch_train_loss = get_errors(loss_p)
            else:
                epoch_train_loss = utils.concatenate_dicts(
                    epoch_train_loss, get_errors(loss_p))

            gc.collect()
            iterations += 1
            bar.next()
        bar.finish()

        train_end = time.perf_counter()

        # VALIDATE
        epoch_val_loss = None
        set_mode('eval', model)

        bar = progress.bar.Bar('Epoch {} val'.format(epoch),
                               max=len(val_loader))

        for data in val_loader:
            image, image_data = set_data(data, opts)
            loss_p, _ = model_test(opts,
                                   model,
                                   loss_function,
                                   image,
                                   image_data,
                                   compute_loss=True)
            if opts.loss_type == 'mse':
                loss_p = torch.sqrt(loss_p)
            if epoch_val_loss is None:
                epoch_val_loss = get_errors(loss_p)
            else:
                epoch_val_loss = utils.concatenate_dicts(
                    epoch_val_loss, get_errors(loss_p))
            bar.next()
        bar.finish()

        epoch_end = time.perf_counter()

        epoch_avg_val_loss = utils.compute_dict_avg(epoch_val_loss)
        epoch_avg_train_loss = utils.compute_dict_avg(epoch_train_loss)
        train_fps = len(train_data) / (train_end - epoch_start)
        val_fps = len(val_data) / (epoch_end - train_end)

        print(
            'End of epoch {}/{} | iter: {} | time: {:.3f} s | train: {:.3f} fps | val: {:.3f} fps'
            .format(epoch, num_epochs, iterations, epoch_end - epoch_start,
                    train_fps, val_fps))

        # LOG ERRORS
        train_errors = utils.tag_dict_keys(epoch_avg_train_loss, 'train')
        val_errors = utils.tag_dict_keys(epoch_avg_val_loss, 'val')
        print('Train errors: ', train_errors)
        print('Val errors: ', val_errors)
        for key, value in sorted(train_errors.items()):
            # print('Key: ', key, 'Value: ', value)
            train_writer.add_scalar(key, value, epoch)
            print('{:20}: {:.3e}'.format(key, value))

        for key, value in sorted(val_errors.items()):
            # print('Key: ', key, 'Value: ', value)
            val_writer.add_scalar(key, value, epoch)
            print('{:20}: {:.3e}'.format(key, value))

        # SAVE CHECKPOINT
        save_checkpoint(epoch, 'latest', opts, model)

        if epoch % opts.checkpoint_interval == 0:
            save_checkpoint(epoch, epoch, opts, model)

        curr_total_val_loss = 0
        for key, val in epoch_avg_val_loss.items():
            try:
                curr_total_val_loss += val[-1]
            except IndexError:
                curr_total_val_loss += val

        if curr_total_val_loss < opts.best_model:
            save_checkpoint(epoch, 'best', opts, model)
            opts.best_model = curr_total_val_loss

            # save the config of the best performing model
            opts.save_txt('best_model_config.txt')
            print('\nThe current best model hyperparameters are: \n')
            print(opts)
parser.add_argument(
    '--top_k',
    type=int,
    default=3,
    help='How many of the top classes to return')
parser.add_argument('--category_names',
                    default='cat_to_name.json',
                    help='The mapping file of categories to real names')
parser.add_argument('--gpu',
                    action='store_true',
                    default=False,
                    help='Switch gpu mode on')

args = parser.parse_args()

if __name__ == '__main__':
    device = 'cuda' if args.gpu else 'cpu'
    model = load_checkpoint(args.checkpoint)

    with open(args.category_names, 'r') as f:
        cat_to_name = json.load(f)

    probs, classes = predict(image_path=args.input,
                             model=model,
                             topk=args.top_k,
                             device=device)
    class_names = [cat_to_name[str(c)] for c in classes]

    for p, c in zip(probs, class_names):
        print(f'Class: {c}, Probability: {p}')
Example #19
# convert argument parser input to a variable used in a function
device_selection = args.device_selection
data_dir = args.flowers_data_directory
pretrained_model_selection = args.pretrained_model_selection
checkpoint_filename = args.checkpoint_filename
learning_rate = args.learning_rate
cat_to_name_filename = args.cat_to_name_filename
image_filepath = args.image_filepath
topk = args.topk

#select device
device = set_device(device_selection)

# load model
loaded_model, criterion, optimizer, checkpoint = load_checkpoint(
    checkpoint_filename, pretrained_model_selection, learning_rate, device)

# Extract and Transform data
train_data, valid_data, test_data, trainloader, testloader, validloader = load_and_transform_data(
    data_dir)

# check device
print("Is our device GPU?")
print(device == torch.device("cuda"))

# Test the model, but only on GPU (on CPU it would take far too long). Purpose: verify the model still works after saving a checkpoint and loading it
if device == torch.device("cuda"):
    test_model(testloader, device, loaded_model)
else:
    pass
Example #20
import argparse
from model import load_checkpoint, predict

# Setup argparse arguments
parser = argparse.ArgumentParser()

parser.add_argument('img_path', type=str)
parser.add_argument('check_point', type=str)
parser.add_argument('--topk', type=int, default=3)
parser.add_argument('--category_name', type=str, default='./cat_to_name.json')
parser.add_argument('--gpu', action='store_true')

arg = parser.parse_args()

print('Predicting...')
model = load_checkpoint(arg.check_point)
probs, classes = predict(arg.img_path, model, arg.topk, arg.gpu, arg.category_name)
output = dict(zip(classes, probs))
print(output)
def self_training(model,
                  labeled_dataset,
                  unlabeled_dataset,
                  optimizer,
                  scheduler=None,
                  batch_size=4,
                  train_ratio=0.7,
                  score_threshold=0.7,
                  unlabeled_loss_weight=0.1,
                  relabel_step=None,
                  device='cpu',
                  max_epochs=100,
                  print_freq=10,
                  save_path=None,
                  checkpoint=None):
    model.to(device)
    metric_logger = utils.MetricLogger(delimiter=" ")
    last_loss = 1e9
    all_training_loss = []  # accumulates the per-epoch training losses appended below
    all_evaluation_loss = []  # accumulates the per-epoch evaluation losses appended below

    cur_epoch = 0
    # train_labeled_dataset, val_labeled_dataset = split_dataset(labeled_dataset, train_ratio)
    # train_unlabeled_dataset, val_unlabeled_dataset = split_dataset(unlabeled_dataset, train_ratio)
    dataset_path = os.path.join(save_path, 'dataset')

    if checkpoint is not None:
        print("loading checkpoint:" + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)

    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        with open(os.path.join(dataset_path, 'train_labeled_dataset.pickle'),
                  'rb') as handle:
            train_labeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'val_labeled_dataset.pickle'),
                  'rb') as handle:
            val_labeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'train_unlabeled_dataset.pickle'),
                  'rb') as handle:
            train_unlabeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'val_unlabeled_dataset.pickle'),
                  'rb') as handle:
            val_unlabeled_dataset = pickle.load(handle)

        train_unlabeled_dataset = convert_subset(train_unlabeled_dataset)
        val_unlabeled_dataset = convert_subset(val_unlabeled_dataset)

        labeled_train_loader = DataLoader(train_labeled_dataset,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=True)
        labeled_vld_loader = DataLoader(val_labeled_dataset,
                                        collate_fn=collate_fn,
                                        batch_size=batch_size,
                                        shuffle=False)
        pseudo_train = FLIRPseudoDataset(model,
                                         train_unlabeled_dataset,
                                         batch_size=batch_size,
                                         device=device,
                                         score_threshold=score_threshold)
        pseudo_val = FLIRPseudoDataset(model,
                                       val_unlabeled_dataset,
                                       batch_size=batch_size,
                                       device=device,
                                       score_threshold=score_threshold)
        unlabeled_train_loader = DataLoader(pseudo_train,
                                            collate_fn=collate_fn,
                                            batch_size=batch_size,
                                            shuffle=True)
        unlabeled_vld_loader = DataLoader(pseudo_val,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=False)

        train_label_loss = train_one_epoch_self_training(
            model, optimizer, labeled_train_loader, 1, device, epoch,
            print_freq)
        train_loss = train_one_epoch_self_training(model, optimizer,
                                                   unlabeled_train_loader,
                                                   unlabeled_loss_weight,
                                                   device, epoch, print_freq)
        train_loss = train_label_loss + unlabeled_loss_weight * train_loss
        all_training_loss.append(train_loss)

        coco_evaluate(model, labeled_vld_loader, device)
        # labeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)
        coco_evaluate(model, unlabeled_vld_loader, device)
        # unlabeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)

        # loss = labeled_loss + unlabeled_loss_weight * unlabeled_loss
        loss = 0
        all_evaluation_loss.append(loss)

        if save_path is not None:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
            last_loss = loss
        print("epoch {}, train loss {}, validation loss {}".format(
            epoch + 1, train_loss, loss))

        if scheduler is not None:
            scheduler.step()
Example #22
                              hids2.view(args.batch_size, -1).detach())
            '''Total loss = Loss1+Loss2+Loss3'''
            loss = loss1 + loss2 + loss3

            total_loss += loss.item()

    return total_loss / (nbatch + 1)


# Loop over epochs.
if args.resume or args.pretrained:
    print("=> loading checkpoint ")
    checkpoint = torch.load(
        Path('save', args.data, 'checkpoint',
             args.filename).with_suffix('.pth'))
    args, start_epoch, best_val_loss = model.load_checkpoint(
        args, checkpoint, feature_dim)
    optimizer.load_state_dict(checkpoint['optimizer'])
    del checkpoint
    epoch = start_epoch
    print("=> loaded checkpoint")
else:
    epoch = 1
    start_epoch = 1
    best_val_loss = 0
    print("=> Start training from scratch")
print('-' * 89)
print(args)
print('-' * 89)

if not args.pretrained:
    # At any point you can hit Ctrl + C to break out of training early.