def main(config, max_samples):
    get_env_configs(config)
    ray.init()
    parameter_server = ParameterServer.remote(config)
    replay_buffer = ReplayBuffer.remote(config)
    learner = Learner.remote(config, replay_buffer, parameter_server)
    train_actor_ids = []
    eval_actor_ids = []

    learner.start_learning.remote()

    # start train actors
    for i in range(config["num_workers"]):
        epsilon = config["max_eps"] * i / config["num_workers"]
        training_actor = Actor.remote("train-" + str(i),
                                      replay_buffer,
                                      parameter_server,
                                      config,
                                      epsilon)
        training_actor.sample.remote()
        train_actor_ids.append(training_actor)

    # start eval actors
    for i in range(config["eval_num_workers"]):
        epsilon = 0
        eval_actor = Actor.remote("eval-" + str(i),
                                  replay_buffer,
                                  parameter_server,
                                  config,
                                  epsilon,
                                  eval=True)
        eval_actor_ids.append(eval_actor)

    # fetch samples in a loop and sync actor weights
    total_samples = 0
    best_eval_mean_reward = np.NINF
    eval_mean_rewards = []
    while total_samples < max_samples:
        total_env_samples_id = replay_buffer.get_total_env_samples.remote()
        new_total_samples = ray.get(total_env_samples_id)
        num_new_samples = new_total_samples - total_samples
        if num_new_samples >= config["timesteps_per_iteration"]:
            total_samples = new_total_samples
            print("Total samples:", total_samples)
            parameter_server.set_eval_weights.remote()
            eval_sampling_ids = [eval_actor.sample.remote()
                                 for eval_actor in eval_actor_ids]
            eval_rewards = ray.get(eval_sampling_ids)
            print("Evaluation rewards: {}".format(eval_rewards))
            eval_mean_reward = np.mean(eval_rewards)
            eval_mean_rewards.append(eval_mean_reward)
            print("Mean evaluation reward: {}".format(eval_mean_reward))
            if eval_mean_reward > best_eval_mean_reward:
                print("Model has improved! Saving the model!")
                best_eval_mean_reward = eval_mean_reward
                parameter_server.save_eval_weights.remote()

    print("Finishing the training.")
    for actor in train_actor_ids:
        actor.stop.remote()
    learner.stop.remote()
class EvaluationProcess(mp.Process):
    def __init__(self):
        super(EvaluationProcess, self).__init__()
        self.server = ParameterServer(2)
        self.args = ArgsProvider(
            call_from=self,
            define_params=[
                ("eval_freq", 10),
                ("eval_gpu", 1),
            ])
        self.count = 0

    def set_model(self, mi):
        self.server.server_send_model(mi)

    def update_model(self, key, mi, immediate=False):
        if (self.count % self.args.eval_freq == 0) or immediate:
            self.server.server_update_model(key, mi, noblock=True)
        self.count += 1

    def set(self, evaluator, args):
        self.evaluator = evaluator
        self.args = args

    def run(self):
        '''Run the model.'''
        self.server.client_receive_model()
        self.evaluator.setup(self.args)
        k = 0
        while True:
            mi = self.server.client_refresh_model(gpu=self.evaluator.gpu)
            print("Eval: Get refreshed model")
            # Do your evaluation.
            self.evaluator.step(k, mi)
            k += 1

    def run_same_process(self, mi):
        self.evaluator.setup(self.args)
        # Do your evaluation.
        self.evaluator.step(0, mi)
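# A minimal usage sketch (an assumption, not taken from the original source):
# the training side constructs the process, attaches an evaluator and its args,
# pushes the initial model, and then periodically calls update_model() from the
# training loop. The names `my_evaluator`, `eval_args`, and `model_interface`
# are hypothetical placeholders.
#
#   eval_process = EvaluationProcess()
#   eval_process.set(my_evaluator, eval_args)
#   eval_process.set_model(model_interface)              # send the initial weights
#   eval_process.start()                                 # mp.Process.start() invokes run()
#   ...
#   eval_process.update_model("actor", model_interface)  # throttled by eval_freq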
def run(args):
    ray.init(
        address='auto',
        ignore_reinit_error=True,
        webui_host='0.0.0.0',
        redis_password='******'
    )
    try:
        ps = ParameterServer.remote(args)
        # https://docs.ray.io/en/releases-0.8.6/auto_examples/plot_parameter_server.html
        if args.sync_param_server:
            # synchronous parameter server
            val = ps.run.remote()
        else:
            # asynchronous parameter server
            val = ps.run_async.remote()
        print(ray.get(val))
    except Exception as e:
        raise e
    finally:
        print('waiting 10s to allow logs to flush')
        time.sleep(10)
        ray.shutdown()
def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    if args.batch_size > 256:  # and args.dataset == 'imagenet' and args.model == 'resnet'
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1

    # create model
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)

    # for training on multiple GPUs
    model = torch.nn.DataParallel(model)
    model = model.cuda()

    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params).cuda()
    # criterion = nn.CrossEntropyLoss().cuda()

    if args.bar is True:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None

    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)

    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server, epoch,
                                        args.workers_num, args.grad_clip, batch_accumulate_num,
                                        train_bar, train_statistics, args.client)
        train_time = time.time() - train_time
        if args.bar is True:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server,
                                           val_statistics, val_bar)

        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())

        val_time = time.time() - val_time
        if args.bar is True:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(
                      epoch + 1, train_time, train_loss, train_error,
                      val_time, val_loss, val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)

        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'val_stats': val_statistics,
                             'train_stats': train_statistics,
                             'server': server},
                            sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
def initialization(params):
    print('-----------------------')
    print('Initializing...', end='')

    batch_size = params.batch_size
    learning_rate = params.learning_rate
    momentum = params.momentum
    rho = params.rho
    tau = params.tau
    workers_number = params.workers_number
    optimizer = params.optimizer
    permute = params.permute
    gpu_num = params.gpu_number
    gradient_clipping = params.gradient_clipping
    lr_batch_adjustment = params.lr_batch_adjustment

    if torch.cuda.is_available():
        print('Utilizing GPU')
        torch.cuda.set_device(gpu_num)
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    if params.data_set == 'cifar10':
        dataset = data_set.DataSetCifar10(batch_size, permute)
        model = resnet(num_classes=10, depth=56, wide_factor=1)
    if params.data_set == 'image_net':
        dataset = data_set.DataSetImageNet(batch_size, permute)
        model = alexnet()
    if params.data_set == 'cifar100':
        dataset = data_set.DataSetCifar100(batch_size, permute)
        model = resnet(num_classes=100, depth=56, wide_factor=1)

    train_set = dataset.get_train()
    test_set = dataset.get_test()

    if torch.cuda.is_available():
        model.cuda()
        # model = torch.nn.DataParallel(model)  # run on multiple GPUs

    parameters = net_model.get_model_parameters(model, dtype)
    gradients = net_model.get_model_parameters(model, dtype)
    loss_fn = torch.nn.CrossEntropyLoss()

    if optimizer == 'synchronous':
        effective_batch_size = batch_size * workers_number
    else:
        effective_batch_size = batch_size

    server = ParameterServer.get_server(optimizer,
                                        learning_rate=learning_rate,
                                        momentum=momentum,
                                        parameters=parameters,
                                        gradients=gradients,
                                        workers_number=workers_number,
                                        rho=rho,
                                        tau=tau,
                                        effective_batch_size=effective_batch_size,
                                        gradient_clipping=gradient_clipping,
                                        lr_batch_adjustment=lr_batch_adjustment)

    stats_train = Statistics.get_statistics('image_classification', params)
    stats_test = Statistics.get_statistics('image_classification', params)

    print('Done')
    return server, loss_fn, stats_train, stats_test, train_set, test_set, model, dtype
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=5, metavar='N',
                        help='number of epochs to train (default: 5)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='for saving the current model')
    # additional arguments for the parameter-server setup (added by yuanfang)
    parser.add_argument('--quantize-nbits', default=0, type=int,
                        help='number of bits for gradient quantization (0 disables it)')
    parser.add_argument('--tau', default=32, type=int,
                        help='hyperparameter used in AEASGD')
    parser.add_argument('--rho', default=0.01, type=float,
                        help='hyperparameter used in AEASGD')
    parser.add_argument('--world-size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--dist-url', default='does not work', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='gloo', type=str,
                        help='distributed backend')
    ps_flag_parser = parser.add_mutually_exclusive_group(required=False)
    ps_flag_parser.add_argument('--flag', dest='ps_flag', action='store_true',
                                help='run this process as the parameter server')
    ps_flag_parser.add_argument('--no-flag', dest='ps_flag', action='store_false',
                                help='run this process as a worker')
    parser.set_defaults(ps_flag=False)
    args = parser.parse_args()

    use_cuda = False  # CPU is forced here; the --no-cuda flag is not consulted
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = Net().to(device)

    if args.ps_flag:
        # start a parameter server
        print("before init process group")
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
        print("after init process group")
        ps = ParameterServer(model, args.world_size,
                             quantize_num_bits=args.quantize_nbits)
        print("starting parameter server....")
        ps.start()
    else:
        # start a worker
        print("before init process group")
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
        print("after init process group")

        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data%d' % args.rank,
                           train=True,
                           download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data%d' % args.rank,
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.test_batch_size, shuffle=True, **kwargs)

        optimizer = AEASGD(model.parameters(), lr=args.lr, tau=args.tau,
                           rho=args.rho, model=model,
                           quantize_num_bits=args.quantize_nbits)

        for epoch in range(1, args.epochs + 1):
            train(args, model, device, train_loader, optimizer, epoch)
            test(args, model, device, test_loader)

        # tell the parameter server that this worker is done
        optimizer.send_message(
            MessageCode.WorkerTerminate,
            torch.randn(optimizer.squash_model(optimizer.model).numel()))

        if args.save_model:
            torch.save(model.state_dict(), "mnist_cnn.pt")
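# A possible launch sequence (illustrative only; the script name `mnist_aeasgd.py`
# and the rendezvous address are hypothetical). One process is started with --flag
# to act as the parameter server (rank 0), and the remaining ranks run as workers:
#
#   python mnist_aeasgd.py --flag    --world-size 3 --rank 0 --dist-url tcp://127.0.0.1:23456
#   python mnist_aeasgd.py --no-flag --world-size 3 --rank 1 --dist-url tcp://127.0.0.1:23456
#   python mnist_aeasgd.py --no-flag --world-size 3 --rank 2 --dist-url tcp://127.0.0.1:23456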
def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    if args.batch_size > 256 and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1

    # create model
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)

    # for training on multiple GPUs
    model = torch.nn.DataParallel(model)
    model = model.cuda()

    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # # Synchronous to Asynchronous Adjustments
    # print('Resetting Parameter Server to Asynchronous Mode')
    # logging.info('Resetting Parameter Server to Asynchronous Mode', extra=args.client)
    # server._shards_weights = list()
    # weights = server._get_model_weights()
    # for i in range(0, args.workers_num):
    #     server._shards_weights.append(deepcopy(weights))
    # server._workers_num = args.workers_num
    # # learning rate initialization
    # batch_baseline = args.baseline
    # server._lr = args.lr * np.sqrt((args.workers_num * args.batch_size) // batch_baseline) / (args.workers_num)
    # server._fast_im = args.fast_im
    # server._lr_warm_up = args.lr_warm_up
    # server._current_lr = args.lr
    # server._m_off = args.m_off
    # server._current_momentum = args.momentum
    # server._iterations_per_epoch = args.iterations_per_epoch
    # server._momentum = args.momentum
    # server._client = args.client
    # if args.fast_im is True:
    #     end_lr = args.lr * ((args.workers_num * args.batch_size) // batch_baseline) / np.sqrt(args.workers_num)
    #     start_lr = args.lr / (args.workers_num)
    #     server._lr = end_lr
    #     server._start_lr = start_lr
    #     server._lr_increment_const = (end_lr - start_lr) / (args.iterations_per_epoch * 5)
    #     log_str = 'Fast ImageNet Mode - Warm Up [{:.5f}]->[{:.5f}] In 5 Epochs'.format(start_lr, end_lr)
    #     logging.info(log_str, extra=args.client)
    #     print(log_str)
    # else:
    #     server._start_lr = 0
    #     server._lr_increment_const = 0
    # for param_group in server._optimizer.param_groups:
    #     param_group['lr'] = start_lr
    #     param_group['momentum'] = server._momentum
    # # Synchronous to Asynchronous Adjustments - End

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.bar is True:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None

    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)

    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server, epoch,
                                        args.workers_num, args.grad_clip, batch_accumulate_num,
                                        train_bar, train_statistics, args.client)
        train_time = time.time() - train_time
        if args.bar is True:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server,
                                           val_statistics, val_bar)

        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())

        val_time = time.time() - val_time
        if args.bar is True:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(
                      epoch + 1, train_time, train_loss, train_error,
                      val_time, val_loss, val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)

        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'val_stats': val_statistics,
                             'train_stats': train_statistics,
                             'server': server},
                            sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
def main(args):
    if torch.cuda.is_available():
        print('Utilizing GPU')
        # torch.cuda.set_device(args.gpu_num)

    train_loader, val_loader = load_data(args)

    # create model
    if args.dataset == 'image_net':
        model = alexnet()
        top_k = (1, 5)
        val_len = len(val_loader.dataset.imgs)
    else:
        model = WideResNet(args.layers,
                           10 if args.dataset == 'cifar10' else 100,
                           args.widen_factor,
                           dropRate=args.droprate)
        top_k = (1,)
        val_len = len(val_loader.dataset.test_labels)

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # for training on multiple GPUs
    model = torch.nn.DataParallel(model).cuda()

    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # ghost batch normalization (128 as baseline)
    repeat = args.batch_size // 128 if args.gbn == 1 else 1

    total_iterations = args.iterations_per_epoch + val_len // args.batch_size
    if args.bar is True:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=total_iterations, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None

    print('{}: Training neural network for {} epochs with {} workers'.format(
        args.sim_num, args.epochs, args.workers_num))

    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, server, epoch, args.workers_num,
              args.grad_clip, repeat, train_bar)
        train_time = time.time() - train_time
        if args.bar is True:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation and training sets
        val_time = time.time()
        val_loss, val_error = validate(val_loader, model, criterion, server,
                                       val_statistics, top_k, val_bar)
        train_loss, train_error = validate(train_loader, model, criterion, server,
                                           train_statistics, top_k, val_bar, save_norm=True)
        val_time = time.time() - val_time
        if args.bar is True:
            val_bar.finish()
            val_bar.index = 0

        print('Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] |'
              ' Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(
                  epoch, train_time, train_loss, train_error, val_time, val_loss, val_error))
        train_time = time.time()

    return train_statistics, val_statistics
def main():
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        print('***************************************\n'
              'Warning: PATH exists - override warning\n'
              '***************************************')

    args.distributed = args.local_rank >= 0 or args.world_size > 1

    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    if args.deterministic:
        logging.info('Deterministic Run Set')
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path, title='Training Results - %s' % args.save)

    if args.distributed:
        args.device_ids = [args.local_rank]
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    set_global_seeds(args.seed)
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}
    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))
    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    shards = None
    x = None
    checkpoint = None
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        x = dict()
        for name, val in checkpoint['server_state_dict'].items():
            x[name[7:]] = val  # strip the 'module.' prefix added by DataParallel
        model.load_state_dict(x)
        shards = checkpoint['server_weight_shards']
        logging.info("loaded checkpoint '%s' (epoch %s)", args.evaluate, checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            # model_dict = {'.'.join(k.split('.')[1:]): v for k, v in checkpoint['server_state_dict'].items()}
            # model.load_state_dict(model_dict)
            model.load_state_dict(checkpoint['server_state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)", checkpoint_file, checkpoint['epoch'])
            shards = checkpoint['server_weight_shards']
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    cpu_store = True if args.dataset == 'imagenet' and args.workers_num > 32 else False
    args.server = args.server if args.delay > 0 else 'ssgd'
    server = ParameterServer.get_server(args.server,
                                        args.delay,
                                        model=model,
                                        shards=shards,
                                        optimizer_regime=optim_regime,
                                        device_ids=args.device_ids,
                                        device=args.device,
                                        dtype=dtype,
                                        distributed=args.distributed,
                                        local_rank=args.local_rank,
                                        grad_clip=args.grad_clip,
                                        workers_num=args.workers_num,
                                        cpu_store=cpu_store)
    del shards, x, checkpoint
    torch.cuda.empty_cache()

    trainer = Trainer(model,
                      server,
                      criterion,
                      device_ids=args.device_ids,
                      device=args.device,
                      dtype=dtype,
                      distributed=args.distributed,
                      local_rank=args.local_rank,
                      workers_number=args.workers_num,
                      grad_clip=args.grad_clip,
                      print_freq=args.print_freq,
                      schedule=args.schedule)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={
                              'datasets_path': args.datasets_dir,
                              'name': args.dataset,
                              'split': 'val',
                              'augment': False,
                              'input_size': args.input_size,
                              'batch_size': args.eval_batch_size,
                              'shuffle': False,
                              'num_workers': args.workers,
                              'pin_memory': True,
                              'drop_last': True
                          })

    # Training Data loading code
    train_data = DataRegime(getattr(model, 'data_regime', None),
                            defaults={
                                'datasets_path': args.datasets_dir,
                                'name': args.dataset,
                                'split': 'train',
                                'augment': args.augment,
                                'input_size': args.input_size,
                                'batch_size': args.batch_size,
                                'shuffle': True,
                                'num_workers': args.workers,
                                'pin_memory': True,
                                'drop_last': True,
                                'distributed': args.distributed,
                                'duplicates': args.duplicates,
                                'cutout': {'holes': 1, 'length': 16} if args.cutout else None
                            })

    if args.evaluate:
        trainer.forward_pass(train_data.get_loader(), duplicates=args.duplicates)
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    logging.info('optimization regime: %s', optim_regime)
    trainer.training_steps = args.start_epoch * len(train_data)
    args.iterations_steps = trainer.training_steps
    with open(os.path.join(save_path, 'args.txt'), 'w') as file:
        file.write(dict_to_table(vars(args)))
    tb.init(path=save_path, title='Training Results', params=args, res_iterations=args.resolution)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(), duplicates=args.duplicates)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        if (epoch + 1) % args.save_freq == 0:
            tb.tboard.set_resume_step(epoch)
            save_checkpoint({
                'epoch': epoch + 1,
                'model': args.model,
                'server_state_dict': server._model.state_dict(),
                'server_weight_shards': server._shards_weights,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, path=save_path)

        errors = {
            'error1_train': 100 - train_results['prec1'],
            'error5_train': 100 - train_results['prec5'],
            'error1_val': 100 - val_results['prec1'],
            'error5_val': 100 - val_results['prec5'],
            'epochs': epoch
        }

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Error@1 {errors[error1_train]:.3f} \t'
                     'Training Error@5 {errors[error5_train]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Error@1 {errors[error1_val]:.3f} \t'
                     'Validation Error@5 {errors[error5_val]:.3f} \t\n'.format(
                         epoch + 1, train=train_results, val=val_results, errors=errors))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        tb.tboard.log_results(epoch, **values)
        tb.tboard.log_model(server, epoch)
        if args.delay > 0:
            tb.tboard.log_delay(trainer.delay_hist, epoch)

    tb.tboard.close()
    return errors, args
def main(config, max_samples):
    get_env_parameters(config)
    log_dir = "logs/scalars/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
    file_writer.set_as_default()
    config['log_dir'] = log_dir

    ray.init()
    parameter_server = ParameterServer.remote(config)
    replay_buffer = ReplayBuffer.remote(config)
    learner = Learner.remote(config, replay_buffer, parameter_server)
    training_actor_ids = []
    eval_actor_ids = []

    learner.start_learning.remote()

    # Create training actors
    for i in range(config["num_workers"]):
        eps = config["max_eps"] * i / config["num_workers"]
        actor = Actor.remote("train-" + str(i),
                             replay_buffer,
                             parameter_server,
                             config,
                             eps)
        actor.sample.remote()
        training_actor_ids.append(actor)

    # Create eval actors
    for i in range(config["eval_num_workers"]):
        eps = 0
        actor = Actor.remote("eval-" + str(i),
                             replay_buffer,
                             parameter_server,
                             config,
                             eps,
                             True)
        eval_actor_ids.append(actor)

    total_samples = 0
    best_eval_mean_reward = np.NINF
    eval_mean_rewards = []
    while total_samples < max_samples:
        tsid = replay_buffer.get_total_env_samples.remote()
        new_total_samples = ray.get(tsid)
        if (new_total_samples - total_samples
                >= config["timesteps_per_iteration"]):
            total_samples = new_total_samples
            print("Total samples:", total_samples)
            parameter_server.set_eval_weights.remote()
            eval_sampling_ids = []
            for eval_actor in eval_actor_ids:
                sid = eval_actor.sample.remote()
                eval_sampling_ids.append(sid)
            eval_rewards = ray.get(eval_sampling_ids)
            print("Evaluation rewards: {}".format(eval_rewards))
            eval_mean_reward = np.mean(eval_rewards)
            eval_mean_rewards.append(eval_mean_reward)
            print("Mean evaluation reward: {}".format(eval_mean_reward))
            tf.summary.scalar('Mean evaluation reward',
                              data=eval_mean_reward,
                              step=total_samples)
            if eval_mean_reward > best_eval_mean_reward:
                print("Model has improved! Saving the model!")
                best_eval_mean_reward = eval_mean_reward
                parameter_server.save_eval_weights.remote()

    print("Finishing the training.")
    for actor in training_actor_ids:
        actor.stop.remote()
    learner.stop.remote()
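# The two Ray driver loops above only assume that the ParameterServer actor
# exposes set_eval_weights() and save_eval_weights(), and that the learner pushes
# weights into it. The class below is a minimal illustrative sketch of such an
# actor, not the project's actual implementation: the method names
# update_weights(), get_weights(), get_eval_weights() and the checkpoint
# filename are assumptions made for this example.
import pickle

import ray


@ray.remote
class SimpleParameterServer:
    """Holds the latest learner weights plus a frozen copy for evaluation."""

    def __init__(self, config):
        self.weights = None       # most recent weights pushed by the learner
        self.eval_weights = None  # snapshot served to evaluation actors

    def update_weights(self, new_weights):
        # Called by the learner after each optimization step.
        self.weights = new_weights
        return True

    def get_weights(self):
        # Training actors pull from here to stay in sync with the learner.
        return self.weights

    def set_eval_weights(self):
        # Freeze the current weights so all eval actors score the same policy.
        self.eval_weights = self.weights
        return True

    def get_eval_weights(self):
        return self.eval_weights

    def save_eval_weights(self, filename="best_eval_weights.pkl"):
        # Persist the frozen snapshot; pickle is used here purely for illustration.
        with open(filename, "wb") as f:
            pickle.dump(self.eval_weights, f)
        print("Saved evaluation weights to {}".format(filename))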