def main():
    """Run one full train-then-evaluate cycle.

    Loads the run options, logger and model, seeds the RNGs, optionally
    moves the model to the GPU, builds the train and test dataloaders, and
    then calls ``train.train`` followed by ``eval.eval``.
    """
    args = config.arguments()
    logger = log_utils.LogManager(args)
    net = model_utils.load(args.model_name)

    on_gpu = args.cuda
    if on_gpu:
        # NOTE(review): 'cuda_devide' looks like a typo of 'cuda_device';
        # confirm against the argument definitions before renaming.
        torch.cuda.set_device(args.cuda_devide)
    torch.manual_seed(args.random_seed)
    if on_gpu:
        torch.cuda.manual_seed(args.random_seed)
        net = net.cuda()

    # The dataloader factory is handed the same args object each time, with
    # args.state switched beforehand (presumably this selects the split).
    # args.state is left as 'test' afterwards.
    loaders = {}
    for split in ('train', 'test'):
        args.state = split
        loaders[split] = data_utils.get_dataloader(args)

    print('begin training')
    train.train(net, loaders['train'], logger, args)
    print('end training\n\n')

    print('begin eval')
    eval.eval(net, loaders['test'], logger, args)
    print('end eval')
def main():
    """Main process (distributed training with optional tensor decomposition).

    Steps, in order: parse and check arguments, seed the RNGs, initialise the
    process group, prepare the MNIST loaders (local_rank 0 downloads first),
    build the model and optionally load pretrained weights, optionally
    decompose the model, wrap it in DDP, train, and finally test and save on
    rank 0 only.
    """
    # Settings
    args = parse_args()
    check_args(args)
    use_gpu = args.use_gpu and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    local_rank = args.local_rank  # Local identifier of the current node
    if use_gpu:
        torch.cuda.manual_seed(args.seed)
        # Make the results exactly the same under the same seed
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False
        device = local_rank
        torch.cuda.set_device(device)
    else:
        device = torch.device('cpu')
    dist.init_process_group(backend=args.backend,
                            init_method=args.init_method)
    rank = dist.get_rank()  # Unique identifier among all processes
    is_global_leader = rank == 0
    is_local_leader = local_rank == 0

    # Prepare data
    socket.setdefaulttimeout(60)  # Connection time limit for data downloading
    transform = transforms.Compose([transforms.ToTensor(), ])
    if is_local_leader:
        train_dataset = datasets.MNIST(
            args.data_path, train=True, download=True, transform=transform)
    # Wait for local_rank 0 to finish downloading the dataset
    dist.barrier()
    if not is_local_leader:
        train_dataset = datasets.MNIST(
            args.data_path, train=True, transform=transform)
    test_dataset = datasets.MNIST(
        args.data_path, train=False, transform=transform)

    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset, shuffle=True)
    # Options shared by both loaders; shuffling of the training data is
    # delegated to the distributed sampler.
    loader_options = {
        'shuffle': False,
        'num_workers': 1,
        'pin_memory': use_gpu,
    }
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size,
        sampler=train_sampler, **loader_options)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, **loader_options)

    # Build the model
    model = Net()

    # Pretrained weights should be loaded before using auto_decomposition
    if args.pretrained_path is not None:
        pretrained_path = os.path.realpath(args.pretrained_path)
        model.load_state_dict(
            torch.load(pretrained_path, map_location='cpu'))
        if is_global_leader:
            print('Loaded pretrained weights file: {}'.format(pretrained_path))

    # Use tensor decomposition here, after building the model, before DDP and
    # before passing model parameters to the optimizer
    if args.tensor_decompose:
        dec_info_path = args.decompose_info_path
        dec_weights_path = os.path.realpath(args.decomposed_weights_path)
        if args.run_mode == 'online':
            if is_local_leader:
                # Decompose the model on local_rank 0, and save the
                # decomposition information file. It will take some time.
                # Pretrained weights should have been loaded to the model.
                model, _ = auto_decomposition(model, dec_info_path)
                # Save the decomposed weights
                os.makedirs(os.path.dirname(dec_weights_path), exist_ok=True)
                torch.save(model.state_dict(), dec_weights_path)
                print('Decomposed weights file is saved to: {}'.format(
                    dec_weights_path))
            # Wait until all local_ranks get here. This makes the other
            # local_ranks wait for local_rank 0 to complete, in order to use
            # the decomposition information and weights saved by local_rank 0.
            dist.barrier()
            if not is_local_leader:
                # Decompose the model with the saved decomposition information
                model, _ = decompose_network(model, dec_info_path)
                # Load the saved weights
                model.load_state_dict(
                    torch.load(dec_weights_path, map_location='cpu'))
        else:  # offline
            # Use existing decomposition information and decomposed weights
            # for all local_ranks
            model, _ = decompose_network(model, dec_info_path)
            model.load_state_dict(
                torch.load(dec_weights_path, map_location='cpu'))
            if is_global_leader:
                print('Loaded decomposed weights file: {}'.format(
                    dec_weights_path))

    # Put the model on target device
    model = model.to(device)

    # DDP
    ddp_device_ids = [local_rank] if use_gpu else []
    model = DDP(model, device_ids=ddp_device_ids)

    # Build optimizer
    # Note: please do not load optimizer parameters of the original model after
    # tensor decomposition, since the model parameters have been changed.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Train on every process; test and save on rank 0 only
    train(model, device, train_loader, optimizer, args.steps, args.log_steps,
          is_global_leader)
    if is_global_leader:
        test(model, device, test_loader)

        # Save model weights (unwrapped from DDP via model.module)
        if args.save_path is not None:
            save_path = os.path.realpath(args.save_path)
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            torch.save(model.module.state_dict(), save_path)
            print('Trained weights file is saved to: {}'.format(save_path))
def train_and_evaluate(model, optimizer, train_loader, val_loader, loss_fn,
                       metrics, params, run_dir, device, scheduler=None,
                       restore_file=None, writer=None):
    """Train the model and evaluate it on the validation set every epoch.

    After each epoch a checkpoint and the batch summaries are written to
    `run_dir`; the checkpoint is flagged as best when the validation
    'accuracy' is at least as high as the best value seen so far in this
    call.

    Args:
        model: (inherits torch.nn.Module) the custom neural network model
        optimizer: (inherits torch.optim) optimizer to update the model
            parameters
        train_loader: (DataLoader) a torch.utils.data.DataLoader object that
            fetches the training set
        val_loader: (DataLoader) a torch.utils.data.DataLoader object that
            fetches the validation set
        loss_fn: (function) a function that takes batch_output (tensor) and
            batch_labels (np.ndarray) and return the loss (tensor) over the
            batch
        metrics: (dict) a dictionary of functions that compute a metric using
            the batch_output and batch_labels
        params: (Params) hyperparameters; `params.num_epochs` sets the number
            of epochs
        run_dir: (string) directory containing params.json, learned weights,
            and logs
        device: (str) device type; usually 'cuda:0' or 'cpu'
        scheduler: optional learning-rate scheduler; its `step()` is called
            once per epoch, after evaluation
        restore_file: (string) optional name of the checkpoint to restore
            from, without extension; '.pth.zip' is appended and the file is
            looked up inside `run_dir`
        writer: (tensorboard) optional tensorboard summary writer

    Returns:
        None.
    """
    # reload the weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(run_dir, restore_file + '.pth.zip')
        if os.path.exists(restore_path):
            logging.info("Restoring weights from {}".format(restore_path))
            load_checkpoint(restore_path, model, optimizer)
        else:
            # A restore was requested but cannot be honoured: say so instead
            # of silently continuing with the current weights.
            logging.warning(
                "Restore file {} not found; continuing without "
                "restoring".format(restore_path))

    # NOTE: the epoch counter and the best accuracy always start from scratch
    # here, even when a checkpoint was restored above.
    best_val_accu = 0.0

    for epoch in range(params.num_epochs):
        # running one epoch
        logging.info("Epoch {} / {}".format(epoch + 1, params.num_epochs))

        # logging current learning rate
        for i, param_group in enumerate(optimizer.param_groups):
            logging.info("learning rate = {} for parameter group {}".format(
                param_group['lr'], i))

        # train for one full pass over the training set
        train_metrics, batch_summ = train(model, optimizer, loss_fn,
                                          train_loader, metrics, params,
                                          epoch, device, writer)

        # evaluate for one epoch on the validation set
        val_metrics = evaluate(model, loss_fn, val_loader, metrics, params,
                               device)

        # schedule learning rate
        if scheduler is not None:
            scheduler.step()

        # check if current epoch has best accuracy (ties count as best)
        val_accu = val_metrics['accuracy']
        is_best = val_accu >= best_val_accu

        # save weights
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=run_dir)

        # save batch summaries
        save_batch_summary(run_dir, batch_summ)

        # if best accuracy
        if is_best:
            logging.info(
                "- Found new best accuray model at epoch {}".format(epoch + 1))
            best_val_accu = val_accu

        # add training log to tensorboard
        if writer is not None:
            # train and validation per-epoch mean metrics; only metrics
            # present in both dictionaries are plotted
            for metric, value in train_metrics.items():
                if metric in val_metrics:
                    writer.add_scalars(metric, {
                        'train': value,
                        'val': val_metrics[metric]
                    }, epoch)

            # layer weights / gradients distributions
            for idx, m in enumerate(model.modules()):
                if not isinstance(m, (nn.Conv2d, nn.Linear)):
                    continue
                weight = m.weight
                if weight is None:
                    # nothing to plot, and no .grad to look at either
                    continue
                writer.add_histogram('layer{}.weight'.format(idx), weight,
                                     epoch)
                if weight.grad is not None:
                    writer.add_histogram('layer{}.weight.grad'.format(idx),
                                         weight.grad, epoch)
def main():
    """Main process (single-process training with optional decomposition).

    Steps, in order: parse and check arguments, seed the RNGs, prepare the
    MNIST loaders, build the model and optionally load pretrained weights,
    optionally decompose the model, train, test, and optionally save the
    trained weights.
    """
    # Settings
    args = parse_args()
    check_args(args)
    use_gpu = args.use_gpu and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    if use_gpu:
        torch.cuda.manual_seed(args.seed)
        # Make the results exactly the same under the same seed
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # Prepare data
    socket.setdefaulttimeout(60)  # Connection time limit for data downloading
    transform = transforms.Compose([transforms.ToTensor(), ])
    train_dataset = datasets.MNIST(
        args.data_path, train=True, download=True, transform=transform)
    test_dataset = datasets.MNIST(
        args.data_path, train=False, transform=transform)

    # Options shared by both loaders
    loader_options = {'num_workers': 1, 'pin_memory': use_gpu}
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, shuffle=True,
        **loader_options)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, shuffle=False,
        **loader_options)

    # Build the model
    model = Net()

    # Pretrained weights should be loaded before using auto_decomposition
    if args.pretrained_path is not None:
        pretrained_path = os.path.realpath(args.pretrained_path)
        model.load_state_dict(
            torch.load(pretrained_path, map_location='cpu'))
        print('Loaded pretrained weights: {}'.format(pretrained_path))

    # Use tensor decomposition here, after building the model and before
    # passing model parameters to the optimizer
    if args.tensor_decompose:
        dec_info_path = args.decompose_info_path
        dec_weights_path = args.decomposed_weights_path
        run_online = args.run_mode == 'online'
        if run_online:
            # Decompose the model, and save the decomposition information file
            # (if needed). It will take some time.
            # Pretrained weights should have been loaded to the model.
            model, _ = auto_decomposition(model, dec_info_path)
            # Save the decomposed weights (if needed)
            if dec_weights_path is not None:
                dec_weights_path = os.path.realpath(dec_weights_path)
                os.makedirs(os.path.dirname(dec_weights_path), exist_ok=True)
                torch.save(model.state_dict(), dec_weights_path)
                print('Decomposed weights file is saved to: {}'.format(
                    dec_weights_path))
        else:  # offline
            # Use existing decomposition information and decomposed weights
            model, _ = decompose_network(model, dec_info_path)
            dec_weights_path = os.path.realpath(dec_weights_path)
            model.load_state_dict(
                torch.load(dec_weights_path, map_location='cpu'))
            print('Loaded decomposed weights file: {}'.format(
                dec_weights_path))

    # Put the model on target device
    model = model.to(device)

    # Build optimizer
    # Note: please do not load optimizer parameters of the original model after
    # tensor decomposition, since the model parameters have been changed.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Train and test
    train(model, device, train_loader, optimizer, args.steps, args.log_steps,
          True)
    test(model, device, test_loader)

    # Save model weights
    if args.save_path is None:
        return
    save_path = os.path.realpath(args.save_path)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model.state_dict(), save_path)
    print('Trained weights file is saved to: {}'.format(save_path))
from common.train import train
from Reader import Reader
import tensorflow as tf

if __name__ == '__main__':
    # The reader's batch size and the leading dimension of both placeholders
    # share this one value.
    batch_size = 100
    category_num = 100

    reader = Reader(
        data_dir='/home/give/Documents/dataset/cifar/cifar-100-python',
        batch_size=batch_size,
        reshape_flag=True)

    # Fixed-shape graph inputs: 32x32x3 images and one row of category_num
    # label values per example.
    image_tensor = tf.placeholder(
        dtype=tf.float32, shape=[batch_size, 32, 32, 3], name='input_x')
    label_tensor = tf.placeholder(
        dtype=tf.float32, shape=[batch_size, category_num], name='input_y')

    # NOTE(review): restore_obj is built but never handed to train(), which
    # is called with restore=None below; confirm whether that is intended.
    restore_obj = {
        'path':
        '/home/give/PycharmProjects/Reproduce/CenterLoss/MNIST/parameters',
    }

    train(image_tensor, label_tensor, int(1e6), reader,
          restore=None, output_num=category_num)