# Save intermediate checkpoints for both networks
if not os.path.isdir('%s/checkpoint' % save_root):
    os.makedirs('%s/checkpoint' % save_root)
torch.save(source_net.state_dict(), '%s/checkpoint/%s-temp.pth' % (save_root, source_dataset_name))
torch.save(target_net.state_dict(), '%s/checkpoint/%s-temp.pth' % (save_root, target_dataset_name))

# Adaptive schedule: decay both optimizers 10x after three epochs without a
# new best training loss; stop once the learning rate reaches 1e-6
if loss_t < small_train_loss:
    small_train_loss = loss_t
    descend_count = 0
else:
    descend_count += 1

print('\nTraining loss: %.3f, descend count: %d' % (loss_t, descend_count))

if descend_count >= 3:
    descend_count = 0
    optimizer_t.param_groups[0]['lr'] *= 0.1
    optimizer_s.param_groups[0]['lr'] *= 0.1
    print('Learning rate: %e' % optimizer_t.param_groups[0]['lr'])
    if optimizer_t.param_groups[0]['lr'] <= 1e-6:
        stop_flag = True
        break

print('Best test acc: %.3f' % best_test_acc)
best_acc_list.append(best_test_acc)
print(best_acc_list)

source_recorder.close()
target_recorder.close()
alpha_change_point_file.close()
if args.lr_adjust == 'adaptive':
    # Adaptive schedule: decay 10x after three epochs without improvement;
    # stop once the learning rate drops below args.lr * 1e-3
    if train_loss < min_train_loss:
        min_train_loss = train_loss
        ascent_count = 0
    else:
        ascent_count += 1

    print('Current loss: %.3f [%.3f], ascent count: %d'
          % (train_loss, min_train_loss, ascent_count))

    if ascent_count >= 3:
        optimizer.param_groups[0]['lr'] *= 0.1
        ascent_count = 0
        if optimizer.param_groups[0]['lr'] < (args.lr * 1e-3):
            break
elif (epoch + 1) % args.lr_adjust == 0:
    # Fixed schedule: decay 10x every args.lr_adjust epochs
    optimizer.param_groups[0]['lr'] *= 0.1
    if optimizer.param_groups[0]['lr'] < (args.lr * 1e-3):
        break

print('Learning rate decreased to %e' % optimizer.param_groups[0]['lr'])

recorder.close()
for collection in [weight_quantization_error_recorder_collection,
                   input_quantization_error_recorder_collection,
                   weight_bit_allocation_collection,
                   input_bit_allocation_collection]:
    for sub_recorder in collection.values():
        sub_recorder.close()
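# The two fragments above repeat the same loss-plateau decay pattern inline.
# Below is a minimal, self-contained sketch of that pattern as a reusable
# helper; the class and parameter names (AdaptiveLRDecay, patience, floor)
# are illustrative assumptions, not part of the original code.
class AdaptiveLRDecay:
    def __init__(self, optimizer, factor=0.1, patience=3, floor=1e-6):
        self.optimizer = optimizer
        self.factor = factor        # multiplicative decay (10x here)
        self.patience = patience    # epochs without improvement before decay
        self.floor = floor          # lr below this signals stopping
        self.best_loss = float('inf')
        self.ascent_count = 0

    def step(self, train_loss):
        """Call once per epoch; returns True when training should stop."""
        if train_loss < self.best_loss:
            self.best_loss = train_loss
            self.ascent_count = 0
        else:
            self.ascent_count += 1
        if self.ascent_count >= self.patience:
            self.ascent_count = 0
            for group in self.optimizer.param_groups:
                group['lr'] *= self.factor
        return self.optimizer.param_groups[0]['lr'] < self.floor

# Hypothetical usage inside an epoch loop:
#     scheduler = AdaptiveLRDecay(optimizer)
#     if scheduler.step(train_loss):
#         break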
class Task:
    def __init__(self, task_name, task_type='prune', optimizer_type='adam',
                 save_root=None, SummaryPath=None, use_cuda=True, **kwargs):
        self.task_name = task_name
        self.task_type = task_type  # 'prune' or 'soft-quantize'
        self.model_name, self.dataset_name = task_name.split('-')
        self.ratio = 'sample' if self.dataset_name in ['CIFARS'] else -1

        #######
        # Net #
        #######
        if task_type == 'prune':
            if self.model_name == 'ResNet20':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    self.net = resnet20_cifar()
                elif self.dataset_name == 'STL10':
                    self.net = resnet20_stl()
                else:
                    raise NotImplementedError
            elif self.model_name == 'ResNet32':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    self.net = resnet32_cifar()
                elif self.dataset_name == 'STL10':
                    self.net = resnet32_stl()
                else:
                    raise NotImplementedError
            elif self.model_name == 'ResNet56':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    self.net = resnet56_cifar()
                elif self.dataset_name == 'CIFAR100':
                    self.net = resnet56_cifar(num_classes=100)
                elif self.dataset_name == 'STL10':
                    self.net = resnet56_stl()
                else:
                    raise NotImplementedError
            elif self.model_name == 'ResNet18':
                if self.dataset_name == 'ImageNet':
                    self.net = resnet18()
                else:
                    raise NotImplementedError
            elif self.model_name == 'vgg11':
                self.net = vgg11() if self.dataset_name == 'CIFAR10' else vgg11_stl10()
            else:
                print(self.model_name, self.dataset_name)
                raise NotImplementedError
        elif task_type == 'soft-quantize':
            if self.model_name == 'ResNet20':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    self.net = soft_quantized_resnet20_cifar()
                elif self.dataset_name in ['STL10']:
                    self.net = soft_quantized_resnet20_stl()
                else:
                    raise NotImplementedError
        else:
            # The original `raise ('...')` raises a TypeError (a string is not
            # an exception); raise a proper exception instead.
            raise NotImplementedError('Task type not defined.')

        self.meta_opt_flag = True  # True for enabling meta learning

        ##############
        # Meta Prune #
        ##############
        self.mask_dict = dict()
        self.meta_grad_dict = dict()
        self.meta_hidden_state_dict = dict()

        ######################
        # Meta Soft Quantize #
        ######################
        self.quantized = 0  # Quantization type
        self.alpha_dict = dict()
        self.alpha_hidden_dict = dict()
        self.sq_rate = 0
        self.s_rate = 0
        self.q_rate = 0

        ##########
        # Record #
        ##########
        self.dataset_type = 'large' if self.dataset_name in ['ImageNet'] else 'small'
        self.SummaryPath = SummaryPath
        self.save_root = save_root
        self.recorder = Recorder(self.SummaryPath, self.dataset_name, self.task_name)

        ####################
        # Load Pre-trained #
        ####################
        self.pretrain_path = '%s/%s-pretrain.pth' % (self.save_root, self.task_name)
        self.net.load_state_dict(torch.load(self.pretrain_path))
        print('Load pre-trained model from %s' % self.pretrain_path)

        if use_cuda:
            self.net.cuda()

        # Optimizer for this task (SGD requires an explicit lr)
        if optimizer_type in ['Adam', 'adam']:
            self.optimizer = Adam(self.net.parameters(), lr=1e-3)
        else:
            self.optimizer = SGD(self.net.parameters(), lr=1e-3)

        if self.dataset_name == 'ImageNet':
            try:
                self.train_loader = get_lmdb_imagenet('train', 128)
                self.test_loader = get_lmdb_imagenet('test', 100)
            except Exception:
                self.train_loader = get_dataloader(self.dataset_name, 'train', 128)
                self.test_loader = get_dataloader(self.dataset_name, 'test', 100)
        else:
            self.train_loader = get_dataloader(self.dataset_name, 'train', 128, ratio=self.ratio)
            self.test_loader = get_dataloader(self.dataset_name, 'test', 128)
        self.iter_train_loader = yielder(self.train_loader)
        # (Legacy per-task bookkeeping, previously kept here as commented-out
        # code -- iteration counters, AverageMeters for loss/top-1/top-5,
        # best-accuracy tracking, ascend-count lr control, and per-metric
        # record files opened under SummaryPath -- is superseded by the
        # Recorder instance created above.)

    def train(self):
        self.net.train()

    def eval(self):
        self.net.eval()

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

    def update_record_performance(self, loss, acc, batch_size=0, lr=1e-3, end=None, is_train=True):
        # (Legacy manual recording -- writing loss/acc/lr text files and
        # tracking the best test accuracy -- is delegated to Recorder.)
        self.recorder.update(loss=loss, acc=acc, batch_size=batch_size,
                             cur_lr=lr, end=end, is_train=is_train)
    def reset_performance(self):
        # (Legacy meter resets are handled inside Recorder.)
        self.recorder.reset_performance()

    def save(self, save_root):
        torch.save(self.net.state_dict(), '%s/%s-net.pth' % (save_root, self.task_name))

    def get_best_test_acc(self):
        # Top-1 accuracy for small datasets; (top-1, top-5) for ImageNet.
        return self.recorder.get_best_test_acc()

    def flush(self, file_list=None):
        for file in file_list or []:
            file.flush()

    def close(self):
        self.recorder.close()

    def adjust_lr(self, adjust_type):
        # (The legacy adaptive schedule -- decay the lr 10x after three
        # non-improving epochs and stop below 1e-6 -- now lives in Recorder.)
        self.recorder.adjust_lr(self.optimizer)
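# `yielder` is referenced in __init__ above but not defined in this snippet.
# A minimal sketch of the usual pattern -- an endless iterator over a
# DataLoader so meta-steps can draw batches without epoch boundaries -- is
# given below; this is an assumption about its behavior, not the original code.
def yielder(loader):
    while True:
        for batch in loader:
            yield batch

# Hypothetical usage of Task (the task name and paths are illustrative):
#     task = Task('ResNet20-CIFAR10', task_type='prune',
#                 save_root='./save', SummaryPath='./summary')
#     task.train()
#     inputs, targets = next(task.iter_train_loader)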
def main():
    if sys.platform.startswith('win'):
        # Register _win_handler in the Windows console's handler list
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if os.path.exists(os.path.join(config_file.config['config_file'], 'config.yaml')):
        config = sth.load_config(config_file.config['config_file'])
    else:
        config = config_file.config
        print('load config from default config.')

    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
                + record_config['remark'] \
                + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')

    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        # os.path.join avoids the unescaped backslash in the original '\log.txt'
        file_name=os.path.join(log_dir, 'log.txt'),
        file_level=logging.WARNING,
        file_format='%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')

    if train_config['train']:
        sth.save_config(config_dir, config)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=train_config['train'],
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]

    # Let TensorFlow grow GPU memory on demand instead of reserving it all
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5

    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(train_config['algorithm'].name))
            if train_config['algorithm'] == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_SEP initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_COM initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('SAC initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                logger.info('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                logger.info('DDPG initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('TD3 initialize success.')

            recorder = Recorder(log_dir, excel_dir, record_config, logger,
                                max_to_keep=5, pad_step_number=True, graph=g)
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)

            try:
                if train_config['train']:
                    if train_config['use_replay_buffer']:
                        train_OffPolicy(sess=sess, env=env, brain_name=brain_name,
                                        begin_episode=episode, model=model,
                                        recorder=recorder, cp_file=cp_file,
                                        hyper_config=hyper_config,
                                        train_config=train_config)
                    else:
                        train_OnPolicy(sess=sess, env=env, brain_name=brain_name,
                                       begin_episode=episode, model=model,
                                       recorder=recorder, cp_file=cp_file,
                                       hyper_config=hyper_config,
                                       train_config=train_config)
                    tf.train.write_graph(g, cp_dir, 'raw_graph_def.pb', as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                logger.error(e)
            finally:
                env.close()
                recorder.close()
                sys.exit()
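# `_win_handler` is registered in main() but not defined in this snippet. A
# minimal sketch of a common pattern for win32api console handlers (the
# handler receives a control-event id and returns nonzero when it handled the
# event); the event id and cleanup behavior here are assumptions:
import _thread

def _win_handler(event, hook_sigint=_thread.interrupt_main):
    if event == 0:      # CTRL_C_EVENT
        # Raise KeyboardInterrupt in the main thread so the try/finally in
        # main() can close the Unity environment and the recorder.
        hook_sigint()
        return 1        # event handled
    return 0            # defer to the default handler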
        losses.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # ------
        # Record
        # ------
        if recoder is not None:
            recoder.update(losses.item(), batch_size=args.batch_size,
                           cur_lr=optimizer.param_groups[0]['lr'])
            recoder.print_training_result(batch_idx, len(train_loader))
        else:
            train_loss += losses.item()
            progress_bar(batch_idx, len(train_loader),
                         "Loss: %.3f" % (train_loss / (batch_idx + 1)))

    # -----
    # Test
    # -----
    eval_loss = evaluate(model, test_loader, criterion)
    if recoder is not None:
        recoder.update(eval_loss, is_train=False)
    print('[%2d] Test loss: %.3f' % (epoch_idx, eval_loss))

if recoder is not None:
    recoder.close()
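# `evaluate` is called above but not defined in this snippet. A minimal sketch
# consistent with its call signature, (model, loader, criterion) -> mean loss;
# device placement is omitted and the averaging scheme is an assumption:
import torch

def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():  # no gradients needed during evaluation
        for inputs, targets in test_loader:
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item()
            n_batches += 1
    return total_loss / max(n_batches, 1)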