def train_loop(hp, logger, writer):
    # make dataloader
    logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train)
    logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f)

    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        logger.info("Starting new training run.")

    try:
        for model.epoch in itertools.count(model.epoch + 1):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        logger.info("End of Train")
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
def _do_work(filenames):
    # This is the only unique thing to the handler. You have to
    # implement the method that operates on a file.
    new_filenames = train_model(filenames)
    if len(new_filenames) > 0:
        multi_file_manifest = {}
        context, socket = zmq_connect(port=5557, pattern="REQ")
        # merge the per-file manifests into one manifest and send it once
        for f in new_filenames:
            single_file_manifest = generateFileManifest(f, purpose="train_model")
            for k in single_file_manifest:
                multi_file_manifest[k] = single_file_manifest[k]
        socket.send_string(json.dumps(multi_file_manifest))
        repl = socket.recv()
        print(f"\nGot {repl}")
    else:
        n = inspect.stack()[0][3]
        print("\nnew_filenames is empty")
        print(f"{n} failed on {filenames}")
    return new_filenames
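# A hedged sketch of the receiving side that the REQ socket in _do_work above would
# talk to. The port, the "ack" reply payload, and the handle_manifest callback are
# assumptions for illustration, not taken from the original code.
import json
import zmq


def manifest_listener(handle_manifest, port=5557):
    context = zmq.Context()
    socket = context.socket(zmq.REP)  # REP pairs with the REQ socket used in _do_work
    socket.bind(f"tcp://*:{port}")
    while True:
        manifest = json.loads(socket.recv_string())  # one JSON manifest per request
        handle_manifest(manifest)
        socket.send_string("ack")  # REQ/REP requires a reply for every request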
def train_loop(rank, hp, world_size=1):
    # reload hp
    hp = DotDict(hp)
    if hp.model.device.lower() == "cuda" and world_size != 0:
        setup(hp, rank, world_size)

    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, hp.log.log_dir)
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")

    if hp.model.device.lower() == "cuda" and world_size != 0:
        hp.model.device = rank
        torch.cuda.set_device(rank)
    else:
        hp.model.device = hp.model.device.lower()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        epoch_step = 1 if hp.data.divide_dataset_per_gpu else world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        cleanup()
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
        cleanup()
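# A hedged launcher sketch (not part of the snippet above): a train_loop with the
# (rank, hp, world_size) signature is typically started with torch.multiprocessing.spawn,
# one process per GPU. The `launch` helper and its dispatch logic are assumptions; `hp`
# is assumed to be the same config object used by train_loop.
import torch
import torch.multiprocessing as mp


def launch(hp):
    world_size = torch.cuda.device_count()
    if hp.model.device.lower() == "cuda" and world_size > 1:
        # spawn() passes each process's rank as the first positional argument
        mp.spawn(train_loop, args=(hp, world_size), nprocs=world_size, join=True)
    else:
        # single-process run; world_size=0 skips the distributed setup branches
        train_loop(0, hp, world_size=0)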
def main_func(params):
    assert params.data_mean != '', "-data_mean is required"
    assert params.data_sd != '', "-data_sd is required"
    params.data_mean = [float(m) for m in params.data_mean.split(',')]
    params.data_sd = [float(s) for s in params.data_sd.split(',')]

    if params.seed > -1:
        set_seed(params.seed)
    rnd_generator = torch.Generator(device='cpu') if params.seed > -1 else None

    # Setup image training data
    training_data, num_classes, class_weights = load_dataset(data_path=params.data_path, val_percent=params.val_percent, batch_size=params.batch_size,
                                                             input_mean=params.data_mean, input_sd=params.data_sd, use_caffe=not params.not_caffe,
                                                             train_workers=params.train_workers, val_workers=params.val_workers, balance_weights=params.balance_classes,
                                                             rnd_generator=rnd_generator)

    # Setup model definition
    cnn, is_start_model, base_model = setup_model(params.model_file, num_classes=num_classes, base_model=params.base_model, pretrained=not params.reset_weights)

    if params.optimizer == 'sgd':
        optimizer = optim.SGD(cnn.parameters(), lr=params.lr, momentum=0.9)
    elif params.optimizer == 'adam':
        optimizer = optim.Adam(cnn.parameters(), lr=params.lr)
    lrscheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.96)

    if params.balance_classes:
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(params.use_device))
    else:
        criterion = torch.nn.CrossEntropyLoss()

    # Maybe delete branches
    if params.delete_branches and not is_start_model:
        try:
            cnn.remove_branches()
            has_branches = False
        except:
            has_branches = True
    else:
        has_branches = True

    # Load pretrained model weights
    start_epoch = 1
    if not params.reset_weights:
        cnn, optimizer, lrscheduler, start_epoch = load_checkpoint(cnn, params.model_file, optimizer, lrscheduler, num_classes, is_start_model=is_start_model)
        if params.delete_branches and is_start_model:
            try:
                cnn.remove_branches()
                has_branches = False
            except:
                has_branches = True
        else:
            has_branches = True

    # Maybe freeze some model layers
    main_layer_list = ['conv1', 'conv2', 'conv3', 'mixed3a', 'mixed3b', 'mixed4a', 'mixed4b', 'mixed4c', 'mixed4d', 'mixed4e', 'mixed5a', 'mixed5b']
    if params.freeze_to != 'none':
        for layer in main_layer_list:
            if params.freeze_to == layer:
                break
            for param in getattr(cnn, layer).parameters():
                param.requires_grad = False
    branch_layer_list = ['loss_conv', 'loss_fc', 'loss_classifier']
    if params.freeze_aux1_to != 'none' and has_branches:
        for layer in branch_layer_list:
            if params.freeze_aux1_to == layer:
                break
            for param in getattr(getattr(cnn, 'aux1'), layer).parameters():
                param.requires_grad = False
    if params.freeze_aux2_to != 'none' and has_branches:
        for layer in branch_layer_list:
            if params.freeze_aux2_to == layer:
                break
            for param in getattr(getattr(cnn, 'aux2'), layer).parameters():
                param.requires_grad = False

    # Optionally freeze/unfreeze specific layers and sub layers
    if params.toggle_layers != 'none':
        toggle_layers = [l.replace('\\', '/').replace('.', '/').split('/') for l in params.toggle_layers.split(',')]
        for layer in toggle_layers:
            if len(layer) == 2:
                for param in getattr(getattr(cnn, layer[0]), layer[1]).parameters():
                    param.requires_grad = not param.requires_grad  # toggle
            else:
                for param in getattr(cnn, layer[0]).parameters():
                    param.requires_grad = not param.requires_grad  # toggle

    n_learnable_params = sum(param.numel() for param in cnn.parameters() if param.requires_grad)
    print('Model has ' + "{:,}".format(n_learnable_params) + ' learnable parameters\n')

    cnn = cnn.to(params.use_device)
    if 'cuda' in params.use_device:
        if params.seed > -1:
            torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.enabled = True

    save_info = [[params.data_mean, params.data_sd, 'BGR'], num_classes, has_branches, base_model]

    # Train model
    train_model(model=cnn, dataloaders=training_data, criterion=criterion, optimizer=optimizer, lrscheduler=lrscheduler,
                num_epochs=params.num_epochs, start_epoch=start_epoch, save_epoch=params.save_epoch, output_name=params.output_name,
                device=params.use_device, has_branches=has_branches, fc_only=False, num_classes=num_classes, individual_acc=params.individual_acc,
                should_save_csv=params.save_csv, csv_path=params.csv_dir, save_info=save_info)
else:
    raise ValueError("Invalid Model Type")

print(model)
model = model.to(device)
model = trans_to_cuda(model)
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)

model, log = train_model(model, criterion, optimizer, exp_lr_scheduler, dataloaders, dataset_sizes, device, num_epochs=args.epochs)

# build the training log as a DataFrame (columns taken directly from `log`)
df = pd.DataFrame({
    'epoch': log['epoch'],
    'training_loss': log['training_loss'],
    'training_acc': log['training_acc'],
    'val_loss': log['val_loss'],
    'val_acc': log['val_acc'],
})
epochs = 100
batch_size = 32
iterations = 10  # number of times to repeat the training
iter_start = 0
model_type = 'OctFiResNet'  # name of the proposed model
folder_name = '{}_models_{}'.format(model_type, train_id)
save_dir = os.path.join(models_root, folder_name)

make_summary = False
if make_summary:
    # model.summary() and plot_model()
    summarize_model(save_dir, model_type)

# train against the other datasets
for val_id in val_datasets:
    if train_id == val_id:
        dt_name = 'itself'
        val_id = None
    else:
        dt_name = datasets[val_id]['name']
    print('Training', datasets[train_id]['name'], 'dataset against', dt_name)
    train_model(iterations=iterations, train_dataset=train_id, iter_start=iter_start,
                test_dataset=val_id, save_dir=save_dir, batch_size=batch_size,
                epochs=epochs, input_shape=input_shape, debug=DEBUG)
def train_loop(rank, cfg):
    logger = get_logger(cfg, os.path.basename(__file__))
    if cfg.device == "cuda" and cfg.dist.gpus != 0:
        cfg.device = rank
        # turn off background generator when distributed run is on
        cfg.data.use_background_generator = False
        setup(cfg, rank)
        torch.cuda.set_device(cfg.device)

    # setup writer
    if is_logging_process():
        # set log/checkpoint dir
        os.makedirs(cfg.log.chkpt_dir, exist_ok=True)
        # set writer (tensorboard / wandb)
        writer = Writer(cfg, "tensorboard")
        cfg_str = OmegaConf.to_yaml(cfg)
        logger.info("Config:\n" + cfg_str)
        if cfg.data.train_dir == "" or cfg.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info("BackgroundGenerator is turned off when Distributed running is on")
    else:
        # non-logging processes do not write logs / metrics
        writer = None

    # download MNIST dataset before making dataloader
    # TODO: This is example code. You should change this part as you need
    _ = torchvision.datasets.MNIST(
        root=hydra.utils.to_absolute_path("dataset/meta"),
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    _ = torchvision.datasets.MNIST(
        root=hydra.utils.to_absolute_path("dataset/meta"),
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Sync dist processes (because of download MNIST Dataset)
    if cfg.dist.gpus != 0:
        dist.barrier()

    # make dataloader
    if is_logging_process():
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(cfg, DataloaderMode.train, rank)
    if is_logging_process():
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(cfg, DataloaderMode.test, rank)

    # init Model
    net_arch = Net_arch(cfg)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(cfg, net_arch, loss_f, rank)

    # load training state / network checkpoint
    if cfg.load.resume_state_path is not None:
        model.load_training_state()
    elif cfg.load.network_chkpt_path is not None:
        model.load_network()
    else:
        if is_logging_process():
            logger.info("Starting new training run.")

    try:
        if cfg.dist.gpus == 0 or cfg.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = cfg.dist.gpus
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > cfg.num_epoch:
                break
            train_model(cfg, model, train_loader, writer)
            if model.epoch % cfg.log.chkpt_interval == 0:
                model.save_network()
                model.save_training_state()
            test_model(cfg, model, test_loader, writer)
        if is_logging_process():
            logger.info("End of Train")
    except Exception as e:
        if is_logging_process():
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if cfg.dist.gpus != 0:
            cleanup()
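# A hedged entry-point sketch for the Hydra-configured train_loop above. The
# config_path/config_name values and the spawn-based dispatch are assumptions for
# illustration, not taken from the original code.
import hydra
import torch.multiprocessing as mp
from omegaconf import DictConfig


@hydra.main(config_path="config", config_name="default")
def main(cfg: DictConfig):
    if cfg.device == "cuda" and cfg.dist.gpus != 0:
        # one spawned process per GPU; the rank is passed as the first argument
        mp.spawn(train_loop, args=(cfg,), nprocs=cfg.dist.gpus, join=True)
    else:
        train_loop(0, cfg)


if __name__ == "__main__":
    main()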
def train_loop(rank, hp, world_size=0):
    if hp.model.device == "cuda" and world_size != 0:
        hp.model.device = rank
        # turn off background generator when distributed run is on
        hp.data.use_background_generator = False
        setup(hp, rank, world_size)
        torch.cuda.set_device(hp.model.device)

    # setup logger / writer
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, os.path.join(hp.log.log_dir, "tensorboard"))
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info("BackgroundGenerator is turned off when Distributed running is on")

    # download MNIST dataset before making dataloader
    # TODO: This is example code. You should change this part as you need
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Sync dist processes (because of download MNIST Dataset)
    if world_size != 0:
        dist.barrier()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state / network checkpoint
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    elif hp.load.network_chkpt_path is not None:
        model.load_network(logger=logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        if world_size == 0 or hp.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_epoch:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer, logger)
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if world_size != 0:
            cleanup()
def main(argv=sys.argv):
    """The main script"""
    args = parse_args(argv)
    action = args.app_action
    train_folder_path = args.trp
    test_folder_path = args.tep
    folder_or_image = "" if args.path is None else args.path
    # Any arg supplied to this will be seen as True, no arg means False
    generate_model_name = args.gen_name

    # If the action is train, the model is the name of the new model
    # that is going to be trained; if it's predict, the model is the
    # name of the model to use for prediction
    model = args.model

    if action == 'train':
        new_model = model
        if not new_model:
            if generate_model_name in truth_values:
                # The user wants us to generate a model name for them.
                # trp and tep args are implicitly required for users from the app
                if train_folder_path and test_folder_path:
                    # The user fulfilled the requirement, so we can proceed:
                    # generate the name and train
                    new_model = generate_name(train_folder_path)
                    train_model(new_model, train_folder_path, test_folder_path)
                    return
                # Here, the user supplied only one folder argument or none at all
                print("\n Both training folder and test folder arguments are required")
                return
            # The user did not supply a model name and did not ask us to generate one,
            # so this is definitely being run from the console app.
            # We don't want to retrain our default model, so check whether the default
            # model has been trained before. If it already exists, return.
            if default_model in all_models():
                print(
                    "Retraining the default model is forbidden. Supply model name or Delete the default model manually and proceed"
                )
                return
            # Training our default model now
            new_model = default_model
            print("Training the default model now...")
            # We use the train function directly here for obvious reasons
            return train(new_model)

        # Model name supplied
        new_model = model + model_extension
        if new_model in all_models():
            print(
                "There's already a model with that name. Please choose another name"
                " or find a model with name {}. Delete it and try again".format(new_model)
            )
            return
        # From here on, we expect the user to supply a training dataset and a test dataset.
        # trp and tep args are implicitly required for users from the app
        if train_folder_path and test_folder_path:
            # The user fulfilled the requirement, so we can proceed
            return train_model(new_model, train_folder_path, test_folder_path)
        # Here, the user supplied only one folder argument or none at all
        print("\n Both training folder and test folder arguments are required")
        return

    elif action == 'predict':
        # If no model was given, use the default one
        if not model:
            model = default_model
        else:
            model = model + model_extension
            # If one was supplied, check that it actually exists
            if model not in all_models():
                print("No such model has been trained")
                return

        # if it's not a folder that was supplied, check if it's a file
        if not os.path.isdir(folder_or_image):
            if os.path.isfile(folder_or_image):
                if not folder_or_image.endswith(image_extensions):
                    print("\nError: An image file is required. Try again\n")
                    return
                input_type = 'file'
                # add logic before here to pass in the model we want to use in the predictor
                predictor(input_type, folder_or_image, model)
                return
            print('\nError: Invalid path. Kindly supply a valid folder or image path\n')
            return

        input_type = 'folder'
        # add logic before here to pass in the model we want to use in the predictor
        predictor(input_type, folder_or_image, model)
        if input_type == 'folder':
            print(f"\nDone! The results are in {folder_or_image}")

    elif action == 'delete':
        # Check that a model name is provided.
        if not model:
            print("\n You must supply a model to delete")
            return
        model = model + model_extension
        if model not in all_models():
            print("That model does not exist")
            return
        model_delete(model)
        return

    elif action == 'retrieve_models':
        # List all models
        print(all_models())
        return

    else:
        print('\nAction command is not supported\n for help: run python3 app.py -h')
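# A minimal script guard for the CLI above; this entry point is an assumption, as the
# original snippet does not show how main() is invoked.
if __name__ == "__main__":
    main()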