def __init__(self):
    # Load the config file
    config = json.load(open("model/config.json"))

    # Get the image processor
    self._imageProcessor = ImageProcessor(config)

    # Load the DL model (change this if you are not using ONNX)
    target_label_names = [
        'necrosis', 'contrast_enhancing', 'core', 'tumor', 'brain'
    ]
    net1 = UNET_3D_to_2D(0,
                         channels_in=4,
                         channels=128,
                         growth_rate=12,
                         dilated_layers=[6, 6, 6, 6],
                         output_channels=len(target_label_names))
    net2 = UNET_3D_to_2D(1,
                         channels_in=4,
                         channels=128,
                         growth_rate=12,
                         dilated_layers=[6, 6, 6],
                         output_channels=len(target_label_names))
    net1 = net1.cuda()
    net2 = net2.cuda()
    load_checkpoint(net1, 'model/checkpoint.pth.tar')
    load_checkpoint(net2, 'model/checkpoint_2.pth.tar')
    self._model1 = net1
    self._model2 = net2
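# The two load_checkpoint(...) calls above assume a helper that restores
# weights in place. A minimal sketch of such a helper, assuming the .pth.tar
# file holds either a raw state dict or one wrapped under a 'state_dict' key
# (both the key name and the wrapping are assumptions, not the original code):
import torch

def load_checkpoint(net, checkpoint_path):
    """Restore model weights from a checkpoint file (sketch)."""
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Unwrap {'state_dict': ...} if the training script saved extra metadata.
    state_dict = checkpoint.get("state_dict", checkpoint)
    net.load_state_dict(state_dict)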
def main():
    model = YOLOv3(num_classes=config.NUM_CLASSES).to(config.DEVICE)
    optimizer = optim.Adam(model.parameters(),
                           lr=config.LEARNING_RATE,
                           weight_decay=config.WEIGHT_DECAY)
    loss_fn = YoloLoss()
    scaler = torch.cuda.amp.GradScaler()
    train_loader, test_loader, train_eval_loader = get_loaders(
        train_csv_path=config.DATASET + "/train.csv",
        test_csv_path=config.DATASET + "/test.csv")

    if config.LOAD_MODEL:
        load_checkpoint(config.CHECKPOINT_FILE, model, optimizer,
                        config.LEARNING_RATE)

    scaled_anchors = (torch.tensor(config.ANCHORS) * torch.tensor(
        config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)).to(config.DEVICE)

    for epoch in range(config.NUM_EPOCHS):
        # plot_couple_examples(model, test_loader, 0.6, 0.5, scaled_anchors)
        train_fn(train_loader, model, optimizer, loss_fn, scaler,
                 scaled_anchors)

        # if config.SAVE_MODEL:
        #     save_checkpoint(model, optimizer, filename=f"checkpoint.pth.tar")

        # print(f"Currently epoch {epoch}")
        # print("On Train Eval loader:")
        # print("On Train loader:")
        # check_class_accuracy(model, train_loader, threshold=config.CONF_THRESHOLD)

        if epoch > 0 and epoch % 3 == 0:
            check_class_accuracy(model, test_loader,
                                 threshold=config.CONF_THRESHOLD)
            pred_boxes, true_boxes = get_evaluation_bboxes(
                test_loader,
                model,
                iou_threshold=config.NMS_IOU_THRESH,
                anchors=config.ANCHORS,
                threshold=config.CONF_THRESHOLD,
            )
            mapval = mean_average_precision(
                pred_boxes,
                true_boxes,
                iou_threshold=config.MAP_IOU_THRESH,
                box_format="midpoint",
                num_classes=config.NUM_CLASSES,
            )
            print(f"MAP: {mapval.item()}")
            model.train()
def Main():
    parser = argparse.ArgumentParser()
    parser.add_argument("image",
                        help="The input image to be predicted.",
                        type=str)
    parser.add_argument("checkpoint",
                        help="Path of the checkpoint to load the model from.",
                        type=str)
    parser.add_argument("--gpu",
                        help="Use GPU instead of CPU.",
                        action="store_true")
    parser.add_argument("--topk",
                        help="Return top K most likely classes.",
                        type=int,
                        default=1)
    parser.add_argument("--category_names",
                        help="Use a mapping of categories to real names.",
                        type=str,
                        default=None)
    args = parser.parse_args()

    if args.gpu:
        device = 'cuda'
        print('Compute using GPU')
    else:
        device = 'cpu'
        print('Compute using CPU')

    checkpoint_path, checkpoint_name = None, args.checkpoint  # 'checkpoint.pth'
    image_path = args.image  # 'flowers/test/1/image_06743.jpg'
    topk = args.topk
    category_names = args.category_names  # 'cat_to_name.json'

    model = load_checkpoint(file_path=checkpoint_path,
                            file_name=checkpoint_name)
    cat_to_name = None
    if category_names is not None:
        cat_to_name = load_cat_json(file_name=category_names)
    probs, classes, class_name = predict(image_path,
                                         model,
                                         topk=topk,
                                         cat_to_name=cat_to_name,
                                         device=device,
                                         probs_show=True)
def main():
    in_arg = get_input_args()  # Creates and returns command line arguments

    print('\nPath To Image:\n', in_arg.path_to_image, '\n',
          '\nCheckpoint:\n', in_arg.checkpoint, '\n')
    print('Optional Command Line Arguments:\n',
          'Top K [--top_k]: ', in_arg.top_k, '\n',
          'Category Names [--category_names]: ', in_arg.category_names, '\n',
          'GPU [--gpu]: ', in_arg.gpu, '\n')

    label_count, hidden_units, arch, class_to_idx, classifier_state_dict, epochs = mod.load_checkpoint(
        in_arg.checkpoint, in_arg.gpu)  # Load checkpoint

    model = mod.build_model(label_count, hidden_units, arch,
                            class_to_idx)  # Build model
    model.classifier.load_state_dict(classifier_state_dict)
    criterion = nn.NLLLoss()

    image = util.process_image(in_arg.path_to_image)  # Pre-process image
    labels = util.get_labels(
        in_arg.category_names)  # Get dict of categories mapped to real names

    mod.predict(image, model, labels, in_arg.top_k,
                in_arg.gpu)  # Prints Top K labels and probabilities
def continue_training(
    checkpoint_path: str,
    train_dir: str = "Data\\GTAV-AI\\data-v2\\train\\",
    dev_dir: str = "Data\\GTAV-AI\\data-v2\\dev\\",
    test_dir: str = "Data\\GTAV-AI\\data-v2\\test\\",
    output_dir: str = "Data\\models\\",
    batch_size: int = 10,
    num_epoch: int = 20,
    hide_map_prob: float = 0.0,
    save_checkpoints=True,
    save_best=True,
):
    """
    Load a checkpoint and continue training. We will restore the model, the
    optimizer, and the nvidia apex data if the model was trained using fp16.
    Note: a model trained using fp16 cannot be restored as an fp32 model and
    vice versa. The floating point precision used for training the model is
    restored automatically from the checkpoint.
    Input:
    - checkpoint_path: Path of the checkpoint to restore
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - num_epoch: Number of epochs to do
    - hide_map_prob: Probability of removing the minimap (black square) from
      the image (0 <= hide_map_prob <= 1)
    - save_checkpoints: Save a checkpoint each epoch (each checkpoint
      overwrites the previous one)
    - save_best: Save the model that achieves the highest accuracy on the
      development set
    Output:
    """
    model, optimizer_name, optimizer, acc_dev, epoch, fp16, opt_level = load_checkpoint(
        checkpoint_path, device)
    model = model.to(device)
    max_acc = train(
        model=model,
        optimizer_name=optimizer_name,
        optimizer=optimizer,
        train_dir=train_dir,
        dev_dir=dev_dir,
        test_dir=test_dir,
        output_dir=output_dir,
        batch_size=batch_size,
        initial_epoch=epoch,
        num_epoch=num_epoch,
        max_acc=acc_dev,
        hide_map_prob=hide_map_prob,
        fp16=fp16,
        amp_opt_level=opt_level if fp16 else None,
        save_checkpoints=save_checkpoints,
        save_best=save_best,
    )

    print(f"Training finished, max accuracy on the development set: {max_acc}")
def main():
    # Fetch user input
    user_input = get_predict_arguments()
    image_path = user_input.image_path
    checkpoint_path = user_input.checkpoint_path
    top_k = user_input.top_k
    category_names_file_path = user_input.category_names
    gpu = user_input.gpu
    device = m.determine_device(gpu)

    # Load model and checkpoint
    model, checkpoint = m.load_checkpoint(checkpoint_path, device)

    # Predict
    top_probabilities, top_classes = predict(image_path, model, device, top_k)
    top_classes.squeeze_()
    top_probabilities.squeeze_()

    # Load categories-to-names mapping
    with open(category_names_file_path, 'r') as f:
        cat_to_name = json.load(f)
    classes_to_indexes = checkpoint['class_to_index']
    indexes_to_classes = {v: k for k, v in classes_to_indexes.items()}
    named_classes = [
        cat_to_name[indexes_to_classes[top_class]]
        for top_class in top_classes.cpu().numpy()
    ]

    print("Prediction Results")
    print(f"- Model Used: {checkpoint['architecture']}")
    print("  - Details:")
    print(f"    - Inputs: {checkpoint['inputs']}\n",
          f"   - Outputs: {checkpoint['outputs']}\n",
          f"   - Hidden Layers: {checkpoint['hidden_layers']}\n",
          f"   - Dropout: {checkpoint['dropout']}\n",
          f"   - Epochs: {checkpoint['epochs']}\n")
    print(f"Top {top_k} Probabilities:")
    for named_class, probability in zip(named_classes, top_probabilities):
        print("- Predicted: {} --> {:.3f}".format(named_class.capitalize(),
                                                  probability))
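# `predict` above is assumed to preprocess the image, run the model, and
# return (top_probabilities, top_classes) tensors. A minimal sketch under
# those assumptions; `process_image` is a hypothetical preprocessing helper
# that returns a normalized image tensor:
import torch

def predict(image_path, model, device, top_k):
    """Return the top-k probabilities and class indices for one image (sketch)."""
    image = process_image(image_path)      # hypothetical helper, not in the source
    image = image.unsqueeze(0).to(device)  # add a batch dimension
    model.eval()
    with torch.no_grad():
        log_ps = model(image)              # assumes the model outputs log-probabilities
        ps = torch.exp(log_ps)
    return ps.topk(top_k, dim=1)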
def train_teacher_model(model,
                        labeled_dataset,
                        optimizer,
                        scheduler=None,
                        train_ratio=0.7,
                        batch_size=4,
                        device='cpu',
                        max_epochs=100,
                        print_freq=10,
                        save_path=None,
                        checkpoint=None):
    model.to(device)
    metric_logger = utils.MetricLogger(delimiter="  ")
    last_loss = 1e9
    cur_epoch = 0

    if checkpoint is not None:
        print("loading checkpoint: " + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)

    train_dataset, vld_dataset = split_dataset(labeled_dataset, train_ratio)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    vld_loader = DataLoader(vld_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)

    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq)
        loss = evaluate(model, vld_loader, device, epoch, print_freq)
        # Save a checkpoint whenever the validation loss improves
        if loss < last_loss and save_path is not None:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
            last_loss = loss
        if scheduler is not None:
            scheduler.step()
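# The save_checkpoint/load_checkpoint pair used above is assumed to bundle the
# model, optimizer, scheduler, and epoch into a single file. A sketch that
# matches the call signatures in this snippet (the dict keys are assumptions):
import torch

def save_checkpoint(model, optimizer, scheduler, epoch, device, save_path):
    """Write the full training state to save_path (sketch)."""
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict() if scheduler is not None else None,
            "epoch": epoch,
        }, save_path)

def load_checkpoint(model, optimizer, scheduler, device, checkpoint_path):
    """Restore the training state saved by save_checkpoint (sketch)."""
    ckpt = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    if scheduler is not None and ckpt["scheduler"] is not None:
        scheduler.load_state_dict(ckpt["scheduler"])
    return model, optimizer, scheduler, ckpt["epoch"]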
def checkpoint2model(checkpoint_path: str, model_dir: str):
    """
    Given a checkpoint file, generate a model file that can be loaded by the
    run_TEDD1104.py script.
    Input:
    - checkpoint_path: Path of the checkpoint file (checkpoint.pt)
    - model_dir: Directory where the model is going to be saved
      (model.bin and model_hyperparameters.json)
    Output:
    """
    if not os.path.exists(model_dir):
        print(f"{model_dir} does not exist. We will create it.")
        os.makedirs(model_dir)

    print_message(f"Loading checkpoint: {checkpoint_path}")
    (
        tedd1104_model,
        _,
        _,
        _,
        running_loss,
        total_batches,
        total_training_examples,
        acc_dev,
        epoch,
        fp16,
        _,
    ) = model.load_checkpoint(path=checkpoint_path,
                              device=model.torch.device("cpu"))

    print(f">>>>>> Checkpoint info <<<<<<\n"
          f"Running loss: {running_loss / total_batches}\n"
          f"Num epochs: {epoch + 1}\n"
          f"Total training examples: {total_training_examples}\n"
          f"Acc dev set: {round(acc_dev * 100, 2)}\n"
          f"FP16: {fp16}\n")

    print_message(f"Saving model in {model_dir}")
    model.save_model(model=tedd1104_model, save_dir=model_dir, fp16=fp16)
    print_message("Done!")
def main():
    # Measures total program runtime by collecting start time
    start_time = time()

    # Creates & retrieves command line arguments
    in_arg = get_input_args()

    # Set device to cuda if the gpu flag is set
    device = 'cuda' if in_arg.gpu else 'cpu'

    # If given, read the mapping of categories to class names
    cat_to_name = {}
    if in_arg.category_names:
        with open(in_arg.category_names, 'r') as f:
            cat_to_name = json.load(f)

    # Load checkpoint
    model, _, _ = load_checkpoint(in_arg.checkpoint)

    # Predict classes
    probs, classes = predict(in_arg.img_path, model, device, in_arg.top_k)

    # Convert categories to real names if a mapping was given
    if cat_to_name:
        classes = [cat_to_name[str(cat)] for cat in classes]

    # Print results
    print('\nThe top {} most likely classes are:'.format(in_arg.top_k))
    max_name_len = len(max(classes, key=len))
    row_format = "{:<" + str(max_name_len + 2) + "}{:<.4f}"
    for prob, name in zip(probs, classes):
        print(row_format.format(name, prob))

    # Measure total program runtime by collecting end time
    end_time = time()

    # Compute overall runtime in seconds & print it in hh:mm:ss format
    tot_time = end_time - start_time
    print("\n** Total Elapsed Runtime:",
          str(int(tot_time / 3600)) + ":" + str(int(
              (tot_time % 3600) / 60)) + ":" + str(int((tot_time % 3600) % 60)))
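# The hh:mm:ss arithmetic above can be expressed more compactly with divmod;
# a drop-in alternative (this version also zero-pads minutes and seconds,
# which the original string concatenation does not):
def format_runtime(seconds):
    """Format a duration in seconds as h:mm:ss."""
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}"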
def main():
    print('Predict')
    in_arg = get_input_args()
    print("Command Line Arguments:\n  input =", in_arg.input,
          "\n  checkpoint =", in_arg.checkpoint,
          "\n  top_k =", in_arg.top_k,
          "\n  category_names =", in_arg.category_names,
          "\n  gpu =", in_arg.gpu)

    # Load checkpoint
    model, checkpoint = load_checkpoint(in_arg.checkpoint)

    # Load category mapping dictionary
    cat_to_name = category_mapping(in_arg.category_names)

    # Process the image to return a transposed image
    transposed_image = process_image(in_arg.input)

    # Get the prediction for an image file
    top_classes = predict(transposed_image, model, in_arg.top_k, cat_to_name,
                          in_arg.gpu)

    # Print the chart with the top classes and probabilities
    print(top_classes)
                    help='Checkpoint of the model')
parser.add_argument('--top_k',
                    action='store',
                    dest='top_k',
                    type=int,
                    help='Top number of most likely classes')
parser.add_argument('--category_names',
                    action='store',
                    dest='category_names',
                    default=None,
                    help='Mapping of categories to real names')
parser.add_argument('--gpu',
                    action='store_true',
                    default=False,
                    dest='gpu',
                    help='Run inference on the GPU')
results = parser.parse_args()

model = model.load_checkpoint("model_checkpoint.pth")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

image = utils.process_image(results.image_path).to(device)
np_image = image.unsqueeze_(0)

model.eval()
with torch.no_grad():
    log_ps = model.forward(np_image)
    ps = torch.exp(log_ps)
    top_probs, top_idx_probs = ps.topk(results.top_k, dim=1)
print(DEVICE, torch.cuda.is_available())
model = model.to(DEVICE)
# model = nn.DataParallel(model, device_ids=[0, 1, 2, 3])
loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, weight_decay=.96)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000)
dataset = SRTrainDataset(
    hr_dir='/usr/project/xtmp/superresoluter/dataset/DIV2K/DIV2K_train_HR/',
    lr_dir='/usr/project/xtmp/superresoluter/dataset/DIV2K/DIV2K_train_LR_bicubic/X4',
    lr_parse=lambda x: x.replace('x4', ''))
loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=8)

begin_epoch = 0
ckpt = load_checkpoint(load_dir='./checkpoints/',
                       map_location=None,
                       model_name='down_sample')
if ckpt is not None:
    print('recovering from checkpoints...')
    model.load_state_dict(ckpt['model'])
    begin_epoch = ckpt['epoch'] + 1
    print('resuming training')

begin = time()
with open(os.path.join('../logs', 'down_sample.log'), 'w') as f:
    for epoch in range(begin_epoch, 1000):
        epoch_loss = []
        for bid, batch in enumerate(loader):
            hr, lr = batch['hr'].to(DEVICE), batch['lr'].to(DEVICE)
            optimizer.zero_grad()
            ds = model(hr)
# Get an object referencing all our input arguments
args = parser.parse_args()

# First determine whether this code has to be executed on the GPU or not.
if args.GPU:
    # We want to execute this on cuda, but maybe cuda is not available.
    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        # If cuda is not available, don't do anything unexpected; just raise an error.
        raise ValueError(
            "We wanted to execute this training on GPU, but cuda is not available!\nPlease remove the -g option or make sure cuda is available."
        )
else:
    device = 'cpu'

cat_to_name = None
# Only if the default or supplied category_names parameter is a file do we try
# to build a mapping from id to flower name; an invalid path to the JSON file
# is silently ignored.
if os.path.isfile(args.category_names):
    with open(args.category_names, 'r') as f:
        cat_to_name = json.load(f)

# Now we are ready to load the model
model = mo.load_checkpoint(args.checkpoint, device)

# Get a reference to the plot we want to make, and save it
fig = ut.show_prediction(args.image_filepath, model, device, args.top_k,
                         cat_to_name)
fig.savefig("pred_" + os.path.basename(args.image_filepath))
def load_checkpoint(model, optimizer, lr_scheduler, args,
                    load_optimizer_states=True):
    """Load a model checkpoint."""
    iteration, release, success = get_checkpoint_iteration(args)
    if not success:
        return 0

    if args.deepspeed:
        checkpoint_name, sd = model.load_checkpoint(
            args.load,
            iteration,
            load_optimizer_states=not args.no_load_optim)
        if "client_lr_scheduler" in sd:
            lr_scheduler.load_state_dict(sd["client_lr_scheduler"])
            print_rank_0("Load lr scheduler state")
        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration
    else:
        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)
        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['module'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None and load_optimizer_states:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0('A metadata file exists but unable to load '
                             'iteration from checkpoint {}, exiting'.format(
                                 checkpoint_name))
                exit()

    # RNG states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load random state from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the random '
                'state.'.format(checkpoint_name))
            exit()

    if mpu.get_data_parallel_rank() == 0:
        print('  successfully loaded {}'.format(checkpoint_name))

    return iteration
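# The RNG-restore branch above implies the save side captured every generator
# state under the keys read here. A sketch of the matching save-side dict
# (mirroring those key names; 'rng_tracker_states' is omitted because
# mpu.get_cuda_rng_tracker() is project-specific):
import random
import numpy as np
import torch

def collect_rng_states():
    """Gather the RNG states that load_checkpoint above expects to find."""
    return {
        'random_rng_state': random.getstate(),
        'np_rng_state': np.random.get_state(),
        'torch_rng_state': torch.get_rng_state(),
        # Requires a CUDA-enabled build and an initialized CUDA context.
        'cuda_rng_state': torch.cuda.get_rng_state(),
    }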
        device = "cuda:0"
    else:
        # If cuda is not available, don't do anything unexpected; just raise an error.
        raise ValueError(
            "We wanted to execute this training on GPU, but cuda is not available!\nPlease remove the -g option or make sure cuda is available."
        )
else:
    device = 'cpu'

print("The training is done on {}".format(device))

if args.checkpoint_dir:
    ckp_filepath = args.checkpoint_dir + ckp_fileprefix + args.architecture + ".pth"
else:
    ckp_filepath = ckp_fileprefix + args.architecture + ".pth"

if os.path.isfile(ckp_filepath):
    print("Checkpoint {} recognized, continue training this model!".format(
        ckp_filepath))
    model = mo.load_checkpoint(ckp_filepath, device)
else:
    print("Checkpoint {} not recognized, starting from scratch!".format(
        ckp_filepath))
    model = mo.init_model(args.directory, args.architecture,
                          args.learning_rate, args.hidden_units)

# Create an object we can use to iterate over the data
dataloaders, img_datasets, _ = ut.get_data_loader(args.directory)

# Dataset sizes are useful in the do_training function
dataset_sizes = {x: len(img_datasets[x]) for x in ['train', 'valid', 'test']}

# Now we are ready to do some training
model = mo.do_training(model, dataloaders, dataset_sizes, device,
                       epochs=args.epochs)

# Training is done, now save the network again
mo.save_checkpoint(model, args.architecture, img_datasets['train'],
                   ckp_filepath)
        # loss.backward()
        # optimizer.step()
        pred_choice = outputs.data.max(1)[1]
        correct += pred_choice.eq(labels.data).cpu().sum()
        sum += len(labels)
        print('batch_index: [%d/%d]' % (batch_index, len(evalloader)),
              'Eval epoch: [%d]' % (epoch),
              'correct/sum: %d/%d, %.4f' % (correct, sum, correct / sum))


if __name__ == '__main__':
    # Whether to load model parameters from a checkpoint
    load = False
    if load:
        checkpoint = model.load_checkpoint()
        net.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = 0

    # Set up the optimizer
    optimizer = optim.Adam(net.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.999),
                           weight_decay=0)
    # optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=1e-1, weight_decay=1e-4)

    for epoch in range(start_epoch, n_epoch):
        train(epoch)
        # Save the parameters
        checkpoint = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        model.save_checkpoint(checkpoint)
def train(opts, model, train_data, val_data, num_epochs,
          resume_from_epoch=None):
    train_loader = DataLoader(train_data,
                              batch_size=opts.batch_size,
                              shuffle=True,
                              num_workers=opts.dataloader_workers,
                              pin_memory=True)
    val_loader = DataLoader(val_data,
                            batch_size=opts.batch_size,
                            shuffle=False,
                            num_workers=opts.dataloader_workers,
                            pin_memory=True)

    # Hyperparameter-tuning runs are logged under their own directory;
    # regular runs are logged under the experiment name.
    log_base = 'hyperparameter_tuning' if opts.hyperparameter else opts.experiment_name
    training_dir = os.path.join(opts.results_dir, log_base, 'training')
    if os.path.exists(training_dir):
        previous_runs = os.listdir(training_dir)
        if len(previous_runs) == 0:
            run_number = 1
        else:
            run_number = max(
                [int(s.split('run_')[1]) for s in previous_runs]) + 1
    else:
        run_number = 1

    log_dir_num = 'run_%02d' % run_number
    print("Currently on run #: ", run_number)

    log_learning_rate = 'lr_{}'.format(opts.lr)
    log_batch_size = 'batch_{}'.format(opts.batch_size)
    log_loss_type = '{}'.format(opts.loss_type)
    log_loss_formulation = '{}'.format(opts.loss_formulation)
    log_method = '{}'.format(opts.method)
    log_normalization = 'norm_{}'.format(opts.image_normalization)
    log_compensating = 'comp_{}'.format(opts.compensated_target)

    train_log_dir = os.path.join(opts.results_dir, log_base, 'training',
                                 log_dir_num, log_learning_rate,
                                 log_batch_size, log_loss_type,
                                 log_loss_formulation, log_method,
                                 log_normalization, log_compensating)
    val_log_dir = os.path.join(opts.results_dir, log_base, 'validation',
                               log_dir_num, log_learning_rate, log_batch_size,
                               log_loss_type, log_loss_formulation, log_method,
                               log_normalization, log_compensating)
    train_writer = SummaryWriter(train_log_dir)
    val_writer = SummaryWriter(val_log_dir)

    opts.save_txt('config.txt', log_dir_num)

    ### Load from checkpoint
    if resume_from_epoch is not None:
        try:
            initial_epoch = model.load_checkpoint(resume_from_epoch) + 1
            iterations = (initial_epoch - 1) * opts.batch_size
        except FileNotFoundError:
            print('No model available for epoch {}, starting fresh'.format(
                resume_from_epoch))
            initial_epoch = 1
            iterations = 0
    else:
        initial_epoch = 1
        iterations = 0

    ### TRAIN AND VALIDATE ###
    if opts.jobs == 1:
        opts.best_model = 1e12
    if opts.hyperparameter:
        best_loss_this_run = None

    # MODEL PARAMETERS
    opt = torch.optim.Adam(model.parameters(), lr=opts.lr)
    if opts.loss_type == 'l1':
        loss_function = nn.L1Loss()
    elif opts.loss_type == 'mse':
        loss_function = nn.MSELoss()

    for epoch in range(initial_epoch, num_epochs + 1):
        epoch_start = time.perf_counter()

        # TRAIN
        epoch_train_loss = None
        set_mode('train', model)
        bar = progress.bar.Bar('Epoch {} train'.format(epoch),
                               max=len(train_loader))
        for data in train_loader:
            image, image_data = set_data(data, opts)
            loss_p = optimize(model, opt, loss_function, opts, image,
                              image_data)
            if opts.loss_type == 'mse':
                loss_p = torch.sqrt(loss_p)
            if epoch_train_loss is None:
                epoch_train_loss = get_errors(loss_p)
            else:
                epoch_train_loss = utils.concatenate_dicts(
                    epoch_train_loss, get_errors(loss_p))
            gc.collect()
            iterations += 1
            bar.next()
        bar.finish()
        train_end = time.perf_counter()

        # VALIDATE
        epoch_val_loss = None
        set_mode('eval', model)
        bar = progress.bar.Bar('Epoch {} val'.format(epoch),
                               max=len(val_loader))
        for data in val_loader:
            image, image_data = set_data(data, opts)
            loss_p, _ = model_test(opts, model, loss_function, image,
                                   image_data, compute_loss=True)
            if opts.loss_type == 'mse':
                loss_p = torch.sqrt(loss_p)
            if epoch_val_loss is None:
                epoch_val_loss = get_errors(loss_p)
            else:
                epoch_val_loss = utils.concatenate_dicts(
                    epoch_val_loss, get_errors(loss_p))
            bar.next()
        bar.finish()
        epoch_end = time.perf_counter()

        epoch_avg_val_loss = utils.compute_dict_avg(epoch_val_loss)
        epoch_avg_train_loss = utils.compute_dict_avg(epoch_train_loss)
        train_fps = len(train_data) / (train_end - epoch_start)
        val_fps = len(val_data) / (epoch_end - train_end)
        print('End of epoch {}/{} | iter: {} | time: {:.3f} s | '
              'train: {:.3f} fps | val: {:.3f} fps'.format(
                  epoch, num_epochs, iterations, epoch_end - epoch_start,
                  train_fps, val_fps))

        # LOG ERRORS
        train_errors = utils.tag_dict_keys(epoch_avg_train_loss, 'train')
        val_errors = utils.tag_dict_keys(epoch_avg_val_loss, 'val')
        print('Train errors: ', train_errors)
        print('Val errors: ', val_errors)
        for key, value in sorted(train_errors.items()):
            # print('Key: ', key, 'Value: ', value)
            train_writer.add_scalar(key, value, epoch)
            print('{:20}: {:.3e}'.format(key, value))
        for key, value in sorted(val_errors.items()):
            # print('Key: ', key, 'Value: ', value)
            val_writer.add_scalar(key, value, epoch)
            print('{:20}: {:.3e}'.format(key, value))

        # SAVE CHECKPOINT
        save_checkpoint(epoch, 'latest', opts, model)
        if epoch % opts.checkpoint_interval == 0:
            save_checkpoint(epoch, epoch, opts, model)

        curr_total_val_loss = 0
        for key, val in epoch_avg_val_loss.items():
            try:
                curr_total_val_loss += val[-1]
            except IndexError:
                curr_total_val_loss += val

        if curr_total_val_loss < opts.best_model:
            save_checkpoint(epoch, 'best', opts, model)
            opts.best_model = curr_total_val_loss
            # Save the config of the best performing model
            opts.save_txt('best_model_config.txt')
            print('\nThe current best model hyperparameters are: \n')
            print(opts)
parser.add_argument('--top_k',
                    type=int,
                    default=3,
                    help='How many of the top classes to return')
parser.add_argument('--category_names',
                    default='cat_to_name.json',
                    help='The mapping file of categories to real names')
parser.add_argument('--gpu',
                    action='store_true',
                    default=False,
                    help='Switch GPU mode on')
args = parser.parse_args()

if __name__ == '__main__':
    device = 'cuda' if args.gpu else 'cpu'
    model = load_checkpoint(args.checkpoint)
    with open(args.category_names, 'r') as f:
        cat_to_name = json.load(f)
    probs, classes = predict(image_path=args.input,
                             model=model,
                             topk=args.top_k,
                             device=device)
    class_names = [cat_to_name[str(c)] for c in classes]
    for p, c in zip(probs, class_names):
        print(f'Class: {c}, Probability: {p}')
# Convert argument parser input to variables used in the functions below
device_selection = args.device_selection
data_dir = args.flowers_data_directory
pretrained_model_selection = args.pretrained_model_selection
checkpoint_filename = args.checkpoint_filename
learning_rate = args.learning_rate
cat_to_name_filename = args.cat_to_name_filename
image_filepath = args.image_filepath
topk = args.topk

# Select device
device = set_device(device_selection)

# Load model
loaded_model, criterion, optimizer, checkpoint = load_checkpoint(
    checkpoint_filename, pretrained_model_selection, learning_rate, device)

# Extract and transform data
train_data, valid_data, test_data, trainloader, testloader, validloader = load_and_transform_data(
    data_dir)

# Check device
print("Is our device GPU?")
print(device == torch.device("cuda"))

# Test the model, but only on GPU; on CPU it would take far too long.
# Purpose: to verify the model is intact after saving and loading a checkpoint.
if device == torch.device("cuda"):
    test_model(testloader, device, loaded_model)
import argparse

from model import load_checkpoint, predict

# Set up argparse arguments
parser = argparse.ArgumentParser()
parser.add_argument('img_path', type=str)
parser.add_argument('check_point', type=str)
parser.add_argument('--topk', type=int, default=3)
parser.add_argument('--category_name', type=str, default='./cat_to_name.json')
parser.add_argument('--gpu', action='store_true')
arg = parser.parse_args()

print('Predicting...')
model = load_checkpoint(arg.check_point)
probs, classes = predict(arg.img_path, model, arg.topk, arg.gpu,
                         arg.category_name)
output = dict(zip(classes, probs))
print(output)
def self_training(model,
                  labeled_dataset,
                  unlabeled_dataset,
                  optimizer,
                  scheduler=None,
                  batch_size=4,
                  train_ratio=0.7,
                  score_threshold=0.7,
                  unlabeled_loss_weight=0.1,
                  relabel_step=None,
                  device='cpu',
                  max_epochs=100,
                  print_freq=10,
                  save_path=None,
                  checkpoint=None):
    model.to(device)
    metric_logger = utils.MetricLogger(delimiter="  ")
    last_loss = 1e9
    cur_epoch = 0
    all_training_loss = []    # referenced below but never initialized in the original
    all_evaluation_loss = []  # referenced below but never initialized in the original

    # train_labeled_dataset, val_labeled_dataset = split_dataset(labeled_dataset, train_ratio)
    # train_unlabeled_dataset, val_unlabeled_dataset = split_dataset(unlabeled_dataset, train_ratio)
    dataset_path = os.path.join(save_path, 'dataset')

    if checkpoint is not None:
        print("loading checkpoint: " + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)

    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        with open(os.path.join(dataset_path, 'train_labeled_dataset.pickle'),
                  'rb') as handle:
            train_labeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'val_labeled_dataset.pickle'),
                  'rb') as handle:
            val_labeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'train_unlabeled_dataset.pickle'),
                  'rb') as handle:
            train_unlabeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'val_unlabeled_dataset.pickle'),
                  'rb') as handle:
            val_unlabeled_dataset = pickle.load(handle)

        train_unlabeled_dataset = convert_subset(train_unlabeled_dataset)
        val_unlabeled_dataset = convert_subset(val_unlabeled_dataset)

        labeled_train_loader = DataLoader(train_labeled_dataset,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=True)
        labeled_vld_loader = DataLoader(val_labeled_dataset,
                                        collate_fn=collate_fn,
                                        batch_size=batch_size,
                                        shuffle=False)
        pseudo_train = FLIRPseudoDataset(model,
                                         train_unlabeled_dataset,
                                         batch_size=batch_size,
                                         device=device,
                                         score_threshold=score_threshold)
        pseudo_val = FLIRPseudoDataset(model,
                                       val_unlabeled_dataset,
                                       batch_size=batch_size,
                                       device=device,
                                       score_threshold=score_threshold)
        unlabeled_train_loader = DataLoader(pseudo_train,
                                            collate_fn=collate_fn,
                                            batch_size=batch_size,
                                            shuffle=True)
        unlabeled_vld_loader = DataLoader(pseudo_val,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=False)

        train_label_loss = train_one_epoch_self_training(
            model, optimizer, labeled_train_loader, 1, device, epoch,
            print_freq)
        train_loss = train_one_epoch_self_training(model, optimizer,
                                                   unlabeled_train_loader,
                                                   unlabeled_loss_weight,
                                                   device, epoch, print_freq)
        train_loss = train_label_loss + unlabeled_loss_weight * train_loss
        all_training_loss.append(train_loss)

        coco_evaluate(model, labeled_vld_loader, device)
        # labeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)
        coco_evaluate(model, unlabeled_vld_loader, device)
        # unlabeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)
        # loss = labeled_loss + unlabeled_loss_weight * unlabeled_loss
        loss = 0
        all_evaluation_loss.append(loss)

        if save_path is not None:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
            last_loss = loss
        print("epoch {}, train loss {}, validation loss {}".format(
            epoch + 1, train_loss, loss))
        if scheduler is not None:
            scheduler.step()
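# FLIRPseudoDataset above is assumed to run the current model over unlabeled
# images and keep only detections whose confidence clears score_threshold. A
# minimal sketch of that pseudo-labeling step for a torchvision-style detector
# (the dataset class itself is project-specific; this only illustrates the
# thresholding):
import torch

@torch.no_grad()
def pseudo_label(model, images, score_threshold, device):
    """Return per-image {'boxes', 'labels'} pseudo-targets above the threshold (sketch)."""
    model.eval()
    outputs = model([img.to(device) for img in images])
    targets = []
    for out in outputs:
        keep = out['scores'] >= score_threshold
        targets.append({'boxes': out['boxes'][keep],
                        'labels': out['labels'][keep]})
    return targets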
                      hids2.view(args.batch_size, -1).detach())

        # Total loss = loss1 + loss2 + loss3
        loss = loss1 + loss2 + loss3
        total_loss += loss.item()
    return total_loss / (nbatch + 1)


# Loop over epochs.
if args.resume or args.pretrained:
    print("=> loading checkpoint ")
    checkpoint = torch.load(
        Path('save', args.data, 'checkpoint',
             args.filename).with_suffix('.pth'))
    args, start_epoch, best_val_loss = model.load_checkpoint(
        args, checkpoint, feature_dim)
    optimizer.load_state_dict(checkpoint['optimizer'])
    del checkpoint
    epoch = start_epoch
    print("=> loaded checkpoint")
else:
    epoch = 1
    start_epoch = 1
    best_val_loss = 0
    print("=> Start training from scratch")

print('-' * 89)
print(args)
print('-' * 89)

if not args.pretrained:
    # At any point you can hit Ctrl + C to break out of training early.