def blogJobDay(self):
    logger.info('blogJob-startTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    messages = Engine().getYesterdayUrls()
    MailUtil().senHtml(messages)
    logger.info('blogJob-endTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
def ding_test(cfg: BaseConfigByEpoch, net=None, val_dataloader=None, show_variables=False, convbuilder=None,
              init_hdf5=None, extra_msg=None, weights_dict=None):
    with Engine(local_rank=0, for_val_only=True) as engine:
        engine.setup_log(name='test', log_dir='./', file_name=DETAIL_LOG_FILE)

        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)

        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder).cuda()
        else:
            model = net.cuda()

        if val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                      global_batch_size=cfg.global_batch_size, distributed=False)
        num_examples = num_val_examples(cfg.dataset_name)
        assert num_examples % cfg.global_batch_size == 0
        val_iters = num_val_examples(cfg.dataset_name) // cfg.global_batch_size
        print('batchsize={}, {} iters'.format(cfg.global_batch_size, val_iters))

        criterion = get_criterion(cfg).cuda()

        engine.register_state(scheduler=None, model=model, optimizer=None)

        if show_variables:
            engine.show_variables()

        assert not engine.distributed

        if weights_dict is not None:
            engine.load_from_weights_dict(weights_dict)
        else:
            if cfg.init_weights:
                engine.load_checkpoint(cfg.init_weights)
            if init_hdf5:
                engine.load_hdf5(init_hdf5)

        # engine.save_by_order('smi2_by_order.hdf5')
        # engine.load_by_order('smi2_by_order.hdf5')
        # engine.save_hdf5('model_files/stami2_lrs4Z.hdf5')

        model.eval()
        eval_dict, total_net_time = run_eval(val_data, val_iters, model, criterion, 'TEST',
                                             dataset_name=cfg.dataset_name)
        val_top1_value = eval_dict['top1'].item()
        val_top5_value = eval_dict['top5'].item()
        val_loss_value = eval_dict['loss'].item()
        msg = '{},{},{},top1={:.5f},top5={:.5f},loss={:.7f},total_net_time={}'.format(
            cfg.network_type, init_hdf5 or cfg.init_weights, cfg.dataset_subset,
            val_top1_value, val_top5_value, val_loss_value, total_net_time)
        if extra_msg is not None:
            msg += ', ' + extra_msg
        log_important(msg, OVERALL_LOG_FILE)
        return eval_dict
def ding_test(cfg: BaseConfigByEpoch, net=None, val_dataloader=None, show_variables=False, convbuilder=None,
              init_hdf5=None):
    with Engine() as engine:
        engine.setup_log(name='test', log_dir='./', file_name=DETAIL_LOG_FILE)

        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        model = net(cfg, convbuilder).cuda()

        if val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                            batch_size=cfg.global_batch_size)
        val_iters = 50000 // cfg.global_batch_size if cfg.dataset_name == 'imagenet' else 10000 // cfg.global_batch_size
        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'.format(
            cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated()))

        criterion = get_criterion(cfg).cuda()

        engine.register_state(scheduler=None, model=model, optimizer=None, cfg=cfg)

        if show_variables:
            engine.show_variables()

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[engine.world_rank],
                                                              broadcast_buffers=False)
            # model = DistributedDataParallel(model, delay_allreduce=True)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights, just_weights=True)
        if init_hdf5:
            engine.load_hdf5(init_hdf5)

        model.eval()
        eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion, 'TEST',
                                dataset_name=cfg.dataset_name)
        val_top1_value = eval_dict['top1'].item()
        val_top5_value = eval_dict['top5'].item()
        val_loss_value = eval_dict['loss'].item()
        msg = '{},{},{},top1={:.5f},top5={:.5f},loss={:.7f}'.format(
            cfg.network_type, init_hdf5 or cfg.init_weights, cfg.dataset_subset,
            val_top1_value, val_top5_value, val_loss_value)
        log_important(msg, OVERALL_LOG_FILE)
def blogJob(self):
    logger.info('blogJob-startTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    engine = Engine()
    urls = engine.get_url()
    logger.info('urls : ' + str(len(urls.keys())))
    deduplicationUrls = engine.deduplication()
    # log the deduplicated count (the original logged the raw URL count twice)
    logger.info('deduplicationUrls : ' + str(len(deduplicationUrls)))
    if len(deduplicationUrls) > 0:
        engine.download_file()
        # engine.senMail()
    logger.info('blogJob-endTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
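# The two jobs above are meant to run periodically. A minimal scheduling sketch is shown below,
# assuming the third-party `schedule` package and a hypothetical BlogTask class that exposes
# blogJob / blogJobDay; neither assumption comes from the original code.
import time
import schedule  # pip install schedule

task = BlogTask()  # hypothetical wrapper exposing blogJob() and blogJobDay()

schedule.every().hour.do(task.blogJob)                # crawl and download new posts every hour
schedule.every().day.at("08:00").do(task.blogJobDay)  # mail yesterday's URLs once per day

while True:
    schedule.run_pending()
    time.sleep(30)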
def csgd_train_and_prune(cfg: BaseConfigByEpoch, target_deps, centri_strength, pacesetter_dict, succeeding_strategy, pruned_weights, extra_cfg, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, beginning_msg=None, init_weights=None, no_l2_keywords=None, use_nesterov=False, tensorflow_style_init=False, iter=None): ensure_dir(cfg.output_dir) ensure_dir(cfg.tb_dir) clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy') print("cluster save path:{}".format(clusters_save_path)) config = extra_cfg with Engine() as engine: is_main_process = (engine.world_rank == 0) #TODO correct? logger = engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt') saveName = "%s-%s.yaml" % (config['note'], config['dataset']) modelName = config['modelName'] os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu_available'] device_ids = range(config['gpu_num']) trainSet = GoProDataset(sharp_root=config['train_sharp'], blur_root=config['train_blur'], resize_size=config['resize_size'], patch_size=config['crop_size'], phase='train') testSet = GoProDataset(sharp_root=config['test_sharp'], blur_root=config['test_blur'], resize_size=config['resize_size'], patch_size=config['crop_size'], phase='test') train_loader = DataLoader(trainSet, batch_size=config['batchsize'], shuffle=True, num_workers=4, drop_last=True, pin_memory=True) test_loader = DataLoader(testSet, batch_size=1, shuffle=False, num_workers=1, drop_last=False, pin_memory=True) print('NOTE: Data prepared') print( 'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}' .format(config['batchsize'], torch.cuda.device_count(), torch.cuda.memory_allocated())) model = net optimizer = get_optimizer(cfg, model, use_nesterov=use_nesterov) scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=config['step'], gamma=0.5) # learning rates criterion = get_criterion(cfg).cuda() engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer, cfg=cfg) model = torch.nn.DataParallel(model.cuda(), device_ids=device_ids) # load weight of last prune iteration or the not pruned model if init_weights: engine.load_pth(init_weights) # for unet the last outconv will not be pruned kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list( remove='out') # cluster filters if os.path.exists(clusters_save_path): layer_idx_to_clusters = np.load(clusters_save_path, allow_pickle=True).item() print("cluster exist, load from {}".format(clusters_save_path)) else: layer_idx_to_clusters = get_layer_idx_to_clusters( kernel_namedvalue_list=kernel_namedvalue_list, target_deps=target_deps, pacesetter_dict=pacesetter_dict) if pacesetter_dict is not None: for follower_idx, pacesetter_idx in pacesetter_dict.items(): if pacesetter_idx in layer_idx_to_clusters: layer_idx_to_clusters[ follower_idx] = layer_idx_to_clusters[ pacesetter_idx] # print(layer_idx_to_clusters) np.save(clusters_save_path, layer_idx_to_clusters) csgd_save_file = os.path.join(cfg.output_dir, 'finish.pth') # if this prune iter has a trained model, then load it if os.path.exists(csgd_save_file): engine.load_pth(csgd_save_file) else: param_name_to_merge_matrix = generate_merge_matrix_for_kernel( deps=cfg.deps, layer_idx_to_clusters=layer_idx_to_clusters, kernel_namedvalue_list=kernel_namedvalue_list) param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs( deps=cfg.deps, layer_idx_to_clusters=layer_idx_to_clusters, kernel_namedvalue_list=kernel_namedvalue_list, weight_decay=cfg.weight_decay, centri_strength=centri_strength) # 
if pacesetter_dict is not None: # for follower_idx, pacesetter_idx in pacesetter_dict.items(): # follower_kernel_name = kernel_namedvalue_list[follower_idx].name # pacesetter_kernel_name = kernel_namedvalue_list[follower_idx].name # if pacesetter_kernel_name in param_name_to_merge_matrix: # param_name_to_merge_matrix[follower_kernel_name] = param_name_to_merge_matrix[ # pacesetter_kernel_name] # param_name_to_decay_matrix[follower_kernel_name] = param_name_to_decay_matrix[ # pacesetter_kernel_name] # add 2 para of bn and conv.bias to mat dicts to enable the c-sgd update rule add_vecs_to_mat_dicts(param_name_to_merge_matrix) if show_variables: engine.show_variables() if beginning_msg: engine.log(beginning_msg) logger.info("\n\nStart training with pytorch version {}".format( torch.__version__)) iteration = engine.state.iteration startEpoch = config['start_epoch'] max_epochs = config['max_epochs'] engine.save_pth(os.path.join(cfg.output_dir, 'init.pth')) viz = Visdom(env=saveName) bestPSNR = config['bestPSNR'] itr = '' if iter is None else str(iter) for epoch in range(startEpoch, max_epochs): # eval if epoch % config['save_epoch'] == 0: with torch.no_grad(): model.eval() avg_PSNR = 0 idx = 0 for test_data in test_loader: idx += 1 test_data['L'] = test_data['L'].cuda() sharp = model(test_data['L']) sharp = sharp.detach().float().cpu() sharp = util.tensor2uint(sharp) test_data['H'] = util.tensor2uint(test_data['H']) current_psnr = util.calculate_psnr(sharp, test_data['H'], border=0) avg_PSNR += current_psnr if idx % 100 == 0: print("epoch {}: tested {}".format(epoch, idx)) avg_PSNR = avg_PSNR / idx print("total PSNR : {:<4.2f}".format(avg_PSNR)) viz.line(X=[epoch], Y=[avg_PSNR], win='testPSNR-' + itr, opts=dict(title='psnr', legend=['valid_psnr']), update='append') if avg_PSNR > bestPSNR: bestPSNR = avg_PSNR save_path = os.path.join(cfg.output_dir, 'finish.pth') engine.save_pth(save_path) # train avg_loss = 0.0 idx = 0 model.train() for i, train_data in enumerate(train_loader): idx += 1 train_data['L'] = train_data['L'].cuda() train_data['H'] = train_data['H'].cuda() optimizer.zero_grad() loss = train_one_step(model, train_data['L'], train_data['H'], criterion,\ optimizer,param_name_to_merge_matrix,\ param_name_to_decay_matrix) avg_loss += loss.item() if idx % 100 == 0: print("epoch {}: trained {}".format(epoch, idx)) scheduler.step() avg_loss = avg_loss / idx print("epoch {}: total loss : {:<4.2f}, lr : {}".format( epoch, avg_loss, scheduler.get_lr()[0])) viz.line(X=[epoch], Y=[avg_loss], win='trainMSELoss-' + itr, opts=dict(title='mse', legend=['train_mse']), update='append') # engine.save_pth(os.path.join(cfg.output_dir, 'finish.pth')) csgd_prune_and_save(engine=engine, layer_idx_to_clusters=layer_idx_to_clusters, save_file=pruned_weights, succeeding_strategy=succeeding_strategy, new_deps=target_deps)
def train_main(local_rank, cfg: BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, convbuilder=None, init_hdf5=None, no_l2_keywords='depth', gradient_mask=None, use_nesterov=False, tensorflow_style_init=False, load_weights_keyword=None, keyword_to_lr_mult=None, auto_continue=False, lasso_keyword_to_strength=None, save_hdf5_epochs=10000): if no_l2_keywords is None: no_l2_keywords = [] if type(no_l2_keywords) is not list: no_l2_keywords = [no_l2_keywords] ensure_dir(cfg.output_dir) ensure_dir(cfg.tb_dir) with Engine(local_rank=local_rank) as engine: engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt') # ----------------------------- build model ------------------------------ if convbuilder is None: convbuilder = ConvBuilder(base_config=cfg) if net is None: net_fn = get_model_fn(cfg.dataset_name, cfg.network_type) model = net_fn(cfg, convbuilder) else: model = net model = model.cuda() # ----------------------------- model done ------------------------------ # ---------------------------- prepare data ------------------------- if train_dataloader is None: train_data = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size, distributed=engine.distributed) if cfg.val_epoch_period > 0 and val_dataloader is None: val_data = create_dataset(cfg.dataset_name, 'val', global_batch_size=100, distributed=False) engine.echo('NOTE: Data prepared') engine.echo( 'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}' .format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated())) # ----------------------------- data done -------------------------------- # ------------------------ parepare optimizer, scheduler, criterion ------- optimizer = get_optimizer(engine, cfg, model, no_l2_keywords=no_l2_keywords, use_nesterov=use_nesterov, keyword_to_lr_mult=keyword_to_lr_mult) scheduler = get_lr_scheduler(cfg, optimizer) criterion = get_criterion(cfg).cuda() # --------------------------------- done ------------------------------- engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer) if engine.distributed: torch.cuda.set_device(local_rank) engine.echo('Distributed training, device {}'.format(local_rank)) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], broadcast_buffers=False, ) else: assert torch.cuda.device_count() == 1 engine.echo('Single GPU training') if tensorflow_style_init: init_as_tensorflow(model) if cfg.init_weights: engine.load_checkpoint(cfg.init_weights) if init_hdf5: engine.load_hdf5(init_hdf5, load_weights_keyword=load_weights_keyword) if auto_continue: assert cfg.init_weights is None engine.load_checkpoint(get_last_checkpoint(cfg.output_dir)) if show_variables: engine.show_variables() # ------------ do training ---------------------------- # engine.log("\n\nStart training with pytorch version {}".format( torch.__version__)) iteration = engine.state.iteration iters_per_epoch = num_iters_per_epoch(cfg) max_iters = iters_per_epoch * cfg.max_epochs tb_writer = SummaryWriter(cfg.tb_dir) tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss'] model.train() done_epochs = iteration // iters_per_epoch last_epoch_done_iters = iteration % iters_per_epoch if done_epochs == 0 and last_epoch_done_iters == 0: engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5')) recorded_train_time = 0 recorded_train_examples = 0 collected_train_loss_sum = 0 collected_train_loss_count = 0 if gradient_mask is not None: gradient_mask_tensor = {} for 
name, value in gradient_mask.items(): gradient_mask_tensor[name] = torch.Tensor(value).cuda() else: gradient_mask_tensor = None for epoch in range(done_epochs, cfg.max_epochs): if engine.distributed and hasattr(train_data, 'train_sampler'): train_data.train_sampler.set_epoch(epoch) if epoch == done_epochs: pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters)) else: pbar = tqdm(range(iters_per_epoch)) if epoch == 0 and local_rank == 0: val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags, engine=engine, model=model, val_data=val_data, criterion=criterion, descrip_str='Init', dataset_name=cfg.dataset_name, test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer) top1 = AvgMeter() top5 = AvgMeter() losses = AvgMeter() discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs) pbar.set_description('Train' + discrip_str) for _ in pbar: start_time = time.time() data, label = load_cuda_data(train_data, dataset_name=cfg.dataset_name) # load_cuda_data(train_dataloader, cfg.dataset_name) data_time = time.time() - start_time if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0) train_net_time_start = time.time() acc, acc5, loss = train_one_step( model, data, label, optimizer, criterion, if_accum_grad, gradient_mask_tensor=gradient_mask_tensor, lasso_keyword_to_strength=lasso_keyword_to_strength) train_net_time_end = time.time() if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters: recorded_train_examples += cfg.global_batch_size recorded_train_time += train_net_time_end - train_net_time_start scheduler.step() for module in model.modules(): if hasattr(module, 'set_cur_iter'): module.set_cur_iter(iteration) if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0: for tag, value in zip( tb_tags, [acc.item(), acc5.item(), loss.item()]): tb_writer.add_scalars(tag, {'Train': value}, iteration) top1.update(acc.item()) top5.update(acc5.item()) losses.update(loss.item()) if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS: collected_train_loss_sum += loss.item() collected_train_loss_count += 1 pbar_dic = OrderedDict() pbar_dic['data-time'] = '{:.2f}'.format(data_time) pbar_dic['cur_iter'] = iteration pbar_dic['lr'] = scheduler.get_lr()[0] pbar_dic['top1'] = '{:.5f}'.format(top1.mean) pbar_dic['top5'] = '{:.5f}'.format(top5.mean) pbar_dic['loss'] = '{:.5f}'.format(losses.mean) pbar.set_postfix(pbar_dic) iteration += 1 if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0: engine.update_iteration(iteration) if (not engine.distributed) or (engine.distributed and engine.world_rank == 0): engine.save_and_link_checkpoint(cfg.output_dir) if iteration >= max_iters: break # do something after an epoch? 
engine.update_iteration(iteration) engine.save_latest_ckpt(cfg.output_dir) if (epoch + 1) % save_hdf5_epochs == 0: engine.save_hdf5( os.path.join(cfg.output_dir, 'epoch-{}.hdf5'.format(epoch))) if local_rank == 0 and \ cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0): val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags, engine=engine, model=model, val_data=val_data, criterion=criterion, descrip_str=discrip_str, dataset_name=cfg.dataset_name, test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer) if iteration >= max_iters: break # do something after the training if recorded_train_time > 0: exp_per_sec = recorded_train_examples / recorded_train_time else: exp_per_sec = 0 engine.log( 'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}' .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size, recorded_train_examples, recorded_train_time, exp_per_sec)) if cfg.save_weights: engine.save_checkpoint(cfg.save_weights) print('NOTE: training finished, saved to {}'.format( cfg.save_weights)) engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5')) if collected_train_loss_count > 0: engine.log( 'TRAIN LOSS collected over last {} epochs: {:.6f}'.format( COLLECT_TRAIN_LOSS_EPOCHS, collected_train_loss_sum / collected_train_loss_count))
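# train_main takes local_rank as its first argument, which matches the convention of
# torch.multiprocessing.spawn. A minimal launch sketch follows; how cfg is built and whether
# Engine(local_rank=...) performs the process-group setup are assumptions about this codebase.
import torch
import torch.multiprocessing as mp

def launch_train_main(cfg, num_gpus=None):
    """Spawn one train_main worker per GPU; mp.spawn passes the process index as local_rank."""
    if num_gpus is None:
        num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        mp.spawn(train_main, args=(cfg,), nprocs=num_gpus)
    else:
        train_main(local_rank=0, cfg=cfg)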
class LearningPouring: def __init__(self): self.head_Joint = ["head_p", "head_y"] self.arm_Joint = [ "l_arm_el_y", "l_arm_sh_p1", "l_arm_sh_p2", "l_arm_sh_r", "l_arm_wr_p", "l_arm_wr_r", "l_arm_wr_y", "r_arm_el_y", "r_arm_sh_p1", "r_arm_sh_p2", "r_arm_sh_r", "r_arm_wr_p", "r_arm_wr_r", "r_arm_wr_y" ] self.leg_Joint = [ "l_leg_an_p", "l_leg_an_r", "l_leg_hip_p", "l_leg_hip_r", "l_leg_hip_y", "l_leg_kn_p", "r_arm_el_y", "r_leg_an_p", "r_leg_an_r", "r_leg_hip_p", "r_leg_hip_r", "r_leg_hip_y", "r_leg_kn_p", "torso_y" ] self.grip_Joint = ["l_arm_grip", "r_arm_grip"] # Initialize publisher objects self.initPublishers() # Initialize Subscriber objects self.initSubscribe() ### init engine name2num = { 'gripON_L': "gripON_L", 'gripOFF_L': "gripOFF_L", 'gripON_R': "gripON_R", 'gripOFF_R': "gripOFF_R", 'fLL': 34, 'bLL': 35, 'fRR': 36, 'bRR': 37, 'fML': 38, 'bML': 39, 'fMR': 40, 'bMR': 41, 'init': 44, 'DMPPourRtoL': 28, 'removeBall': 29 } self.engine = Engine(name2num) self.dmp_y0 = np.array([-1.52017496, 0.04908739, 1.41433029]) self.dmp_goal = np.array([-1.50848603, 0.0591503, 1.44347592]) load_file_name = "w_0_1_right_3_100_1000.0_0.01_2" #load_file_name = raw_input('file name: ') load_file_name_list = load_file_name.split('_') ### learning ep self.ep = int(load_file_name_list[1]) ### pouring number of ball to the other tube self.numofball = int(load_file_name_list[2]) ### which arm do the pouring motion self.pour_arm = load_file_name_list[3] n_dmps = int(load_file_name_list[4]) n_bfs = int(load_file_name_list[5]) decay = float(load_file_name_list[6]) dt = float(load_file_name_list[7]) self.total_ball = float(load_file_name_list[8]) ### initial DMP self.rl = RLDMPs(n_dmps=n_dmps, n_bfs=n_bfs, decay=decay, y0=self.dmp_y0, goal=self.dmp_goal, ay=np.ones(n_dmps) * 10.0, dt=dt) self.rl.load_weight(load_file_name) print(self.rl.predict().y) print("load npy file weight success:") print("ep: " + str(self.ep)) print("pouring " + str(self.numofball) + " ball to other tube. 
Total: " + str(self.total_ball)) print("using " + self.pour_arm + " pouring the ball") self.costT_list = [] def initPublishers(self): self.pub_joint_ctrl_module = rospy.Publisher( '/robotis/set_joint_ctrl_modules', JointCtrlModule, queue_size=10) self.pub_action = rospy.Publisher('/robotis/action/page_num', Int32, queue_size=10) self.pub_IK = rospy.Publisher( '/robotis/manipulation/kinematics_pose_msg', KinematicsPose, queue_size=1) self.pub_joint_value = rospy.Publisher('/robotis/set_joint_states', JointState, queue_size=10) self.fk_pub = rospy.Publisher('/thormang3/fk_set_joint_states', JointState, queue_size=10) # Wait a bit for the publishers to initialize sleep(1) def initSubscribe(self): #rospy.Subscriber('/robotis/present_joint_states', JointState, self.callback) pass #def callback(self,msg): # self.joint_pose = dict(zip(msg.name, msg.position)) # calulate Ik from robotis IK engine def cal_IK(self, name, x, y, z, qx, qy, qz, qw): self.set_manipulation_module() sleep(0.5) pose_msg = KinematicsPose() pose_msg.name = name pose_msg.pose.position.x = x pose_msg.pose.position.y = y pose_msg.pose.position.z = z pose_msg.pose.orientation.x = qx pose_msg.pose.orientation.y = qy pose_msg.pose.orientation.z = qz pose_msg.pose.orientation.w = qw self.pub_IK.publish(pose_msg) def get_tube_position(self, arm_type, j): rospy.wait_for_service("/thormang3_eureka/cal_fk") # Create service object fk_srv = rospy.ServiceProxy("/thormang3_eureka/cal_fk", CalFK) try: # Call service and get response fk_resp = fk_srv(arm_type, j[0], j[1], j[2], j[3], j[4], j[5], j[6]) except rospy.ServiceException as exc: print("Failed to call service: " + str(exc)) oR, oP, oY = euler_from_quaternion( [fk_resp.ox, fk_resp.oy, fk_resp.oz, fk_resp.ow]) Rx = np.array([[1, 0, 0, 0], [0, np.cos(-oR), np.sin(-oR), 0], [0, np.sin(-oR), np.cos(-oR), 0], [0, 0, 0, 1]]) Ry = np.array([[np.cos(oP), 0, np.sin(oP), 0], [0, 1, 0, 0], [-np.sin(oP), 0, np.cos(oP), 0], [0, 0, 0, 1]]) Rz = np.array([[np.cos(oY), -np.sin(oY), 0, 0], [np.sin(oY), np.cos(oY), 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) bottom = np.array([fk_resp.px, fk_resp.py, fk_resp.pz]) up = np.array([fk_resp.px, fk_resp.py, fk_resp.pz]) turb_lenght = 0.07 pos = np.array([0, 0, turb_lenght, 1]) # rotate j4 pos = Rx.dot(pos) # rotate j5 pos = Ry.dot(pos) # rotate j6 pos = Rz.dot(pos) up[0] += pos[0] up[1] += pos[1] up[2] += pos[2] return bottom, up def detect_collision(self, pose, name): dic_pos = dict(zip(name, pose)) right_arm_name = [ "r_arm_sh_p1", "r_arm_sh_r", "r_arm_sh_p2", "r_arm_el_y", "r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p" ] right_arm_pose = [ bin2float(3124), bin2float(1614), bin2float(1029), bin2float(1525), bin2float(1063), bin2float(2084), bin2float(2987) ] r_pose = dict(zip(right_arm_name, right_arm_pose)) j = [] for n in right_arm_name: if n in name: j.append(dic_pos[n]) else: j.append(r_pose[n]) #j = [self.joint_pose["r_arm_sh_p1"],self.joint_pose["r_arm_sh_r"],self.joint_pose["r_arm_sh_p2"],self.joint_pose["r_arm_el_y"],self.joint_pose["r_arm_wr_r"],self.joint_pose["r_arm_wr_y"],self.joint_pose["r_arm_wr_p"]] # Create service object # Rb is the point bottom of the right tube, Rt is the point top of the right tube Rb, Rt = self.get_tube_position("right_arm", j) left_arm_pose = [ bin2float(1650), bin2float(3129), bin2float(2560), bin2float(2343), bin2float(3543), bin2float(2137), bin2float(1167) ] # Lb is the point bottom of the left tube, Lt is the point top of the left tube Lb, Lt = self.get_tube_position("left_arm", left_arm_pose) pA, pB, dis = 
closestDistanceBetweenLines(Rb, Rt, Lb, Lt, clampAll=True) return dis def execute_path(self, joint_name, traj, delay_time=0.0025): assert len(joint_name) == len(traj[0]) joint = JointState() joint.name = joint_name joint.velocity = [0.0 for _ in range(len(joint_name))] joint.effort = [0.0 for _ in range(len(joint_name))] self.set_none_module() start = time.time() for i in range(len(traj)): ts = time.time() joint.position = traj[i] self.pub_joint_value.publish(joint) te = time.time() while (te - ts) < delay_time: te = time.time() end = time.time() print("execute_path time is : ", end - start) def cal_cost(self, bnum, traj): cost = np.zeros((self.rl.timesteps)) + 1e-8 costT = (self.numofball - bnum)**2 return cost, costT def gripperOpen(self, name): self.set_none_module() sleep(0.5) # open gripper joint_msg = JointState() joint_msg.name = [name] joint_msg.position = [0.0] self.pub_joint_value.publish(joint_msg) self.set_manipulation_module() def gripperClose(self, name): self.set_none_module() # close gripper joint_msg = JointState() joint_msg.name = [name] joint_msg.position = [1.0] self.pub_joint_value.publish(joint_msg) self.set_manipulation_module() def set_action_modules(self): # Set arm to manipulation module j = JointCtrlModule() j.joint_name = self.arm_Joint + self.leg_Joint + self.head_Joint j.module_name = ["action_module" for _ in range(len(j.joint_name))] self.pub_joint_ctrl_module.publish(j) # Wait a bit for the publishers to set_joint_ctrl_modules sleep(0.2) def set_manipulation_module(self): # Set arm to manipulation module j = JointCtrlModule() j.joint_name = self.arm_Joint j.module_name = [ "manipulation_module" for _ in range(len(self.arm_Joint)) ] self.pub_joint_ctrl_module.publish(j) # Wait a bit for the publishers to set_joint_ctrl_modules sleep(0.2) # Set gripper to gripper module j = JointCtrlModule() j.joint_name = self.grip_Joint j.module_name = ["gripper_module" for _ in range(len(self.grip_Joint))] self.pub_joint_ctrl_module.publish(j) # Wait a bit for the publishers to set_joint_ctrl_modules sleep(0.2) def set_none_module(self): # Set arm to none module j = JointCtrlModule() j.joint_name = self.arm_Joint + self.leg_Joint j.module_name = ["none" for _ in range(len(j.joint_name))] self.pub_joint_ctrl_module.publish(j) # Wait a bit for the publishers to set_joint_ctrl_modules sleep(0.2) # Set gripper to gripper module j = JointCtrlModule() j.joint_name = self.grip_Joint j.module_name = ["none" for _ in range(len(self.grip_Joint))] self.pub_joint_ctrl_module.publish(j) # Wait a bit for the publishers to set_joint_ctrl_modules sleep(0.2) def run(self): rospy.init_node('Learning_pouring', anonymous=True) rate = rospy.Rate(10) # 10hz while not rospy.is_shutdown(): page = raw_input('Command: ') if page == "pickRR": self.set_action_modules() plan = ["init"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() self.cal_IK(name="right_arm", x=0.460, y=-0.180, z=1.100, qx=0, qy=0, qz=0, qw=1) sleep(3) self.cal_IK(name="right_arm", x=0.460, y=-0.180, z=0.89, qx=0, qy=0, qz=0, qw=1) sleep(3) self.gripperOpen("r_arm_grip") raw_input("wait the turb") self.gripperClose("r_arm_grip") self.cal_IK(name="right_arm", x=0.460, y=-0.180, z=1.100, qx=0, qy=0, qz=0, qw=1) sleep(3) self.set_action_modules() plan = ["init"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() if page == "pickLL": self.set_action_modules() plan = ["init"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() self.cal_IK(name="left_arm", x=0.460, 
y=0.180, z=1.100, qx=0, qy=0, qz=0, qw=1) sleep(3) self.cal_IK(name="left_arm", x=0.460, y=0.180, z=0.89, qx=0, qy=0, qz=0, qw=1) sleep(3) self.gripperOpen("l_arm_grip") raw_input("wait the turb") self.gripperClose("l_arm_grip") self.cal_IK(name="left_arm", x=0.460, y=0.180, z=1.100, qx=0, qy=0, qz=0, qw=1) sleep(3) self.set_action_modules() plan = ["init"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() if page == "cost": print(np.load(os.path.dirname(__file__) + "/costT.npy")) if page == "save": save_name = 'w_' save_name = save_name + str(self.ep) + '_' save_name = save_name + str(self.numofball) + '_' save_name = save_name + self.pour_arm + '_' save_name = save_name + str(self.rl.n_dmps) + '_' save_name = save_name + str(self.rl.n_bfs) + '_' save_name = save_name + str(self.rl.decay) + '_' save_name = save_name + str(self.rl.dt) + '_' save_name = save_name + str(self.total_ball) self.rl.save_weight(save_name) np.save( os.path.dirname(__file__) + "/costT.npy", np.array(self.costT_list)) print("save npy file weight success") ### learning pouring if page == "l": ### prepare plan = ["DMPPourRtoL"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() print("ep: " + str(self.ep)) track = self.rl.rollout() cost = np.zeros( (self.rl.n_stochastic, self.rl.timesteps)) + 1e-8 costT = np.zeros((self.rl.n_stochastic)) + 1e-8 for i in range(self.rl.n_stochastic): raw_input("wait_ball: ") ### detect collision print("random try: " + str(i)) #min_dis = 10 #for j in range(len(track.y[i])): # dis = self.detect_collision(track.y[i][j],["r_arm_wr_y"]) # if min_dis > dis: # min_dis = dis min_dis = 1 if min_dis > 0.001: ### excute self.execute_path( ["r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p"], track.y[i]) ### calulate cost bnum = float(raw_input("ball number: ")) cost[i], costT[i] = self.cal_cost(bnum, track.y[i]) if bnum != 0: plan = ["removeBall", "DMPPourRtoL"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() sleep(3) else: print("error: min_dis is ", min_dis) raw_input() costT[i] = -1 print("total cost:", np.sum(cost) + np.sum(costT)) self.rl.updatePI(cost, costT) self.ep += 1 self.costT_list.append(costT) if page == "p": track = self.rl.predict() ### prepare plan = ["DMPPourRtoL"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() sleep(2) ### excute self.execute_path(["r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p"], track.y[0]) plan = ["DMPPourRtoL"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() sleep(2) plan = ["removeBall", "DMPPourRtoL"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() sleep(3) if page == "t": path = [] wr_r_p0 = bin2float(1057) wr_y_p0 = bin2float(2080) wr_p_p0 = bin2float(2970) wr_r_p1 = bin2float(948) wr_y_p1 = bin2float(1918) wr_p_p1 = bin2float(2940) wr_r_p2 = bin2float(1067) wr_y_p2 = bin2float(2090) wr_p_p2 = bin2float(2990) genPath(path, wr_r_p0, wr_r_p1, wr_y_p0, wr_y_p1, wr_p_p0, wr_p_p1, 30) genPath(path, wr_r_p1, wr_r_p1, wr_y_p1, wr_y_p1, wr_p_p1, wr_p_p1, 20) genPath(path, wr_r_p1, wr_r_p2, wr_y_p1, wr_y_p2, wr_p_p1, wr_p_p2, 50) path = np.array(path) self.execute_path(["r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p"], path) plan = ["removeBall", "DMPPourRtoL"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() sleep(3) if page == "init": plan = ["init"] self.engine.setPlan(plan) while self.engine.isRunning: self.engine.run() if page == "c": self.cal_turb_collision(0) if page == "q": break
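# The 't' branch above builds a wrist trajectory with genPath(path, r0, r1, y0, y1, p0, p1, steps),
# whose body is not shown here. A simple linear-interpolation helper consistent with that call
# signature might look like the sketch below; it is an assumption, not the original implementation.
import numpy as np

def genPath(path, r0, r1, y0, y1, p0, p1, steps):
    """Append `steps` linearly interpolated [wr_r, wr_y, wr_p] waypoints to `path` (sketch)."""
    for t in np.linspace(0.0, 1.0, steps):
        path.append([r0 + (r1 - r0) * t,
                     y0 + (y1 - y0) * t,
                     p0 + (p1 - p0) * t])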
def csgd_train_main(local_rank, cfg: BaseConfigByEpoch, target_deps, succeeding_strategy, pacesetter_dict, centri_strength, pruned_weights, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, convbuilder=None, init_hdf5=None, no_l2_keywords='depth', use_nesterov=False, load_weights_keyword=None, keyword_to_lr_mult=None, auto_continue=False, save_hdf5_epochs=10000): ensure_dir(cfg.output_dir) ensure_dir(cfg.tb_dir) clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy') with Engine(local_rank=local_rank) as engine: engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt') # ----------------------------- build model ------------------------------ if convbuilder is None: convbuilder = ConvBuilder(base_config=cfg) if net is None: net_fn = get_model_fn(cfg.dataset_name, cfg.network_type) model = net_fn(cfg, convbuilder) else: model = net model = model.cuda() # ----------------------------- model done ------------------------------ # ---------------------------- prepare data ------------------------- if train_dataloader is None: train_data = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size, distributed=engine.distributed) if cfg.val_epoch_period > 0 and val_dataloader is None: val_data = create_dataset(cfg.dataset_name, 'val', global_batch_size=100, distributed=False) engine.echo('NOTE: Data prepared') engine.echo( 'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}' .format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated())) # ----------------------------- data done -------------------------------- # ------------------------ parepare optimizer, scheduler, criterion ------- if no_l2_keywords is None: no_l2_keywords = [] if type(no_l2_keywords) is not list: no_l2_keywords = [no_l2_keywords] # For a target parameter, cancel its weight decay in optimizer, because the weight decay will be later encoded in the decay mat conv_idx = 0 for k, v in model.named_parameters(): if v.dim() != 4: continue print('prune {} from {} to {}'.format(conv_idx, target_deps[conv_idx], cfg.deps[conv_idx])) if target_deps[conv_idx] < cfg.deps[conv_idx]: no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'conv')) no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'bn')) conv_idx += 1 print('no l2: ', no_l2_keywords) optimizer = get_optimizer(engine, cfg, model, no_l2_keywords=no_l2_keywords, use_nesterov=use_nesterov, keyword_to_lr_mult=keyword_to_lr_mult) scheduler = get_lr_scheduler(cfg, optimizer) criterion = get_criterion(cfg).cuda() # --------------------------------- done ------------------------------- engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer) if engine.distributed: torch.cuda.set_device(local_rank) engine.echo('Distributed training, device {}'.format(local_rank)) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], broadcast_buffers=False, ) else: assert torch.cuda.device_count() == 1 engine.echo('Single GPU training') if cfg.init_weights: engine.load_checkpoint(cfg.init_weights) if init_hdf5: engine.load_hdf5(init_hdf5, load_weights_keyword=load_weights_keyword) if auto_continue: assert cfg.init_weights is None engine.load_checkpoint(get_last_checkpoint(cfg.output_dir)) if show_variables: engine.show_variables() # ===================================== prepare the clusters and matrices for C-SGD ========== kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list( ) if os.path.exists(clusters_save_path): 
layer_idx_to_clusters = np.load(clusters_save_path, allow_pickle=True).item() else: if local_rank == 0: layer_idx_to_clusters = get_layer_idx_to_clusters( kernel_namedvalue_list=kernel_namedvalue_list, target_deps=target_deps, pacesetter_dict=pacesetter_dict) if pacesetter_dict is not None: for follower_idx, pacesetter_idx in pacesetter_dict.items( ): if pacesetter_idx in layer_idx_to_clusters: layer_idx_to_clusters[ follower_idx] = layer_idx_to_clusters[ pacesetter_idx] np.save(clusters_save_path, layer_idx_to_clusters) else: while not os.path.exists(clusters_save_path): time.sleep(10) print('sleep, waiting for process 0 to calculate clusters') layer_idx_to_clusters = np.load(clusters_save_path, allow_pickle=True).item() param_name_to_merge_matrix = generate_merge_matrix_for_kernel( deps=cfg.deps, layer_idx_to_clusters=layer_idx_to_clusters, kernel_namedvalue_list=kernel_namedvalue_list) add_vecs_to_merge_mat_dicts(param_name_to_merge_matrix) param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs( deps=cfg.deps, layer_idx_to_clusters=layer_idx_to_clusters, kernel_namedvalue_list=kernel_namedvalue_list, weight_decay=cfg.weight_decay, weight_decay_bias=cfg.weight_decay_bias, centri_strength=centri_strength) print(param_name_to_decay_matrix.keys()) print(param_name_to_merge_matrix.keys()) conv_idx = 0 param_to_clusters = {} for k, v in model.named_parameters(): if v.dim() != 4: continue if conv_idx in layer_idx_to_clusters: for clsts in layer_idx_to_clusters[conv_idx]: if len(clsts) > 1: param_to_clusters[v] = layer_idx_to_clusters[conv_idx] break conv_idx += 1 # ============================================================================================ # ------------ do training ---------------------------- # engine.log("\n\nStart training with pytorch version {}".format( torch.__version__)) iteration = engine.state.iteration iters_per_epoch = num_iters_per_epoch(cfg) max_iters = iters_per_epoch * cfg.max_epochs tb_writer = SummaryWriter(cfg.tb_dir) tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss'] model.train() done_epochs = iteration // iters_per_epoch last_epoch_done_iters = iteration % iters_per_epoch if done_epochs == 0 and last_epoch_done_iters == 0: engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5')) recorded_train_time = 0 recorded_train_examples = 0 collected_train_loss_sum = 0 collected_train_loss_count = 0 for epoch in range(done_epochs, cfg.max_epochs): if engine.distributed and hasattr(train_data, 'train_sampler'): train_data.train_sampler.set_epoch(epoch) if epoch == done_epochs: pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters)) else: pbar = tqdm(range(iters_per_epoch)) if epoch == 0 and local_rank == 0: val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags, engine=engine, model=model, val_data=val_data, criterion=criterion, descrip_str='Init', dataset_name=cfg.dataset_name, test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer) top1 = AvgMeter() top5 = AvgMeter() losses = AvgMeter() discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs) pbar.set_description('Train' + discrip_str) for _ in pbar: start_time = time.time() data, label = load_cuda_data(train_data, dataset_name=cfg.dataset_name) # load_cuda_data(train_dataloader, cfg.dataset_name) data_time = time.time() - start_time train_net_time_start = time.time() acc, acc5, loss = train_one_step( model, data, label, optimizer, criterion, param_name_to_merge_matrix=param_name_to_merge_matrix, param_name_to_decay_matrix=param_name_to_decay_matrix) train_net_time_end = time.time() if 
iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters: recorded_train_examples += cfg.global_batch_size recorded_train_time += train_net_time_end - train_net_time_start scheduler.step() for module in model.modules(): if hasattr(module, 'set_cur_iter'): module.set_cur_iter(iteration) if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0: for tag, value in zip( tb_tags, [acc.item(), acc5.item(), loss.item()]): tb_writer.add_scalars(tag, {'Train': value}, iteration) deviation_sum = 0 for param, clusters in param_to_clusters.items(): pvalue = param.detach().cpu().numpy() for cl in clusters: if len(cl) == 1: continue selected = pvalue[cl, :, :, :] mean_kernel = np.mean(selected, axis=0, keepdims=True) diff = selected - mean_kernel deviation_sum += np.sum(diff**2) tb_writer.add_scalars('deviation_sum', {'Train': deviation_sum}, iteration) top1.update(acc.item()) top5.update(acc5.item()) losses.update(loss.item()) if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS: collected_train_loss_sum += loss.item() collected_train_loss_count += 1 pbar_dic = OrderedDict() pbar_dic['data-time'] = '{:.2f}'.format(data_time) pbar_dic['cur_iter'] = iteration pbar_dic['lr'] = scheduler.get_lr()[0] pbar_dic['top1'] = '{:.5f}'.format(top1.mean) pbar_dic['top5'] = '{:.5f}'.format(top5.mean) pbar_dic['loss'] = '{:.5f}'.format(losses.mean) pbar.set_postfix(pbar_dic) iteration += 1 if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0: engine.update_iteration(iteration) if (not engine.distributed) or (engine.distributed and engine.world_rank == 0): engine.save_and_link_checkpoint(cfg.output_dir) if iteration >= max_iters: break # do something after an epoch? engine.update_iteration(iteration) engine.save_latest_ckpt(cfg.output_dir) if (epoch + 1) % save_hdf5_epochs == 0: engine.save_hdf5( os.path.join(cfg.output_dir, 'epoch-{}.hdf5'.format(epoch))) if local_rank == 0 and \ cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0): val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags, engine=engine, model=model, val_data=val_data, criterion=criterion, descrip_str=discrip_str, dataset_name=cfg.dataset_name, test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer) if iteration >= max_iters: break # do something after the training if recorded_train_time > 0: exp_per_sec = recorded_train_examples / recorded_train_time else: exp_per_sec = 0 engine.log( 'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}' .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size, recorded_train_examples, recorded_train_time, exp_per_sec)) if cfg.save_weights: engine.save_checkpoint(cfg.save_weights) print('NOTE: training finished, saved to {}'.format( cfg.save_weights)) engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5')) if collected_train_loss_count > 0: engine.log( 'TRAIN LOSS collected over last {} epochs: {:.6f}'.format( COLLECT_TRAIN_LOSS_EPOCHS, collected_train_loss_sum / collected_train_loss_count)) if local_rank == 0: csgd_prune_and_save(engine=engine, layer_idx_to_clusters=layer_idx_to_clusters, save_file=pruned_weights, succeeding_strategy=succeeding_strategy, new_deps=target_deps)
def main():
    """Create the ConResNet model and then start the training."""
    parser = get_arguments()
    print(parser)
    # os.environ["CUDA_VISIBLE_DEVICES"] = '0'

    with Engine(custom_parser=parser) as engine:
        args = parser.parse_args()
        if args.num_gpus > 1:
            torch.cuda.set_device(args.local_rank)

        writer = SummaryWriter(args.snapshot_dir)

        d, h, w = map(int, args.input_size.split(','))
        input_size = (d, h, w)

        cudnn.benchmark = True
        seed = args.random_seed
        if engine.distributed:
            seed = args.local_rank
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)

        model = ConResNet(input_size, num_classes=args.num_classes, weight_std=True)
        model.train()

        device = torch.device('cuda:{}'.format(args.local_rank))
        model.to(device)

        optimizer = optim.Adam(
            [{'params': filter(lambda p: p.requires_grad, model.parameters()), 'lr': args.learning_rate}],
            lr=args.learning_rate, weight_decay=args.weight_decay)

        if args.num_gpus > 1:
            model = engine.data_parallel(model)

        # load checkpoint...
        if args.reload_from_checkpoint:
            print('loading from checkpoint: {}'.format(args.reload_path))
            if os.path.exists(args.reload_path):
                model.load_state_dict(torch.load(args.reload_path, map_location=torch.device('cpu')))
            else:
                print('File not exists in the reload path: {}'.format(args.reload_path))

        loss_D = loss.DiceLoss4BraTS().to(device)
        loss_BCE = loss.BCELoss4BraTS().to(device)
        loss_B = loss.BCELossBoud().to(device)

        if not os.path.exists(args.snapshot_dir):
            os.makedirs(args.snapshot_dir)

        trainloader, train_sampler = engine.get_train_loader(
            BraTSDataSet(args.data_dir, args.train_list, max_iters=args.num_steps * args.batch_size,
                         crop_size=input_size, scale=args.random_scale, mirror=args.random_mirror))
        valloader, val_sampler = engine.get_test_loader(BraTSValDataSet(args.data_dir, args.val_list))

        for i_iter, batch in enumerate(trainloader):
            i_iter += args.start_iters
            images, images_res, labels, labels_res = batch
            images = images.cuda()
            images_res = images_res.cuda()
            labels = labels.cuda()
            labels_res = labels_res.cuda()

            optimizer.zero_grad()
            lr = adjust_learning_rate(optimizer, i_iter, args.learning_rate, args.num_steps, args.power)

            preds = model([images, images_res])
            preds_seg = preds[0]
            preds_res = preds[1]
            preds_resx2 = preds[2]
            preds_resx4 = preds[3]

            term_seg_Dice = loss_D.forward(preds_seg, labels)
            term_seg_BCE = loss_BCE.forward(preds_seg, labels)
            term_res_BCE = loss_B.forward(preds_res, labels_res)
            term_resx2_BCE = loss_B.forward(preds_resx2, labels_res)
            term_resx4_BCE = loss_B.forward(preds_resx4, labels_res)

            term_all = term_seg_Dice + term_seg_BCE + term_res_BCE + 0.5 * (term_resx2_BCE + term_resx4_BCE)
            term_all.backward()
            optimizer.step()

            if i_iter % 100 == 0 and (args.local_rank == 0):
                writer.add_scalar('learning_rate', lr, i_iter)
                writer.add_scalar('loss', term_all.cpu().data.numpy(), i_iter)
                print('iter = {} of {} completed, lr = {:.4}, seg_loss = {:.4}, res_loss = {:.4}'.format(
                    i_iter, args.num_steps, lr,
                    (term_seg_Dice + term_seg_BCE).cpu().data.numpy(),
                    (term_res_BCE + term_resx2_BCE + term_resx4_BCE).cpu().data.numpy()))

            if i_iter >= args.num_steps - 1 and (args.local_rank == 0):
                print('save last model ...')
                torch.save(model.state_dict(),
                           osp.join(args.snapshot_dir, 'ConResNet_' + str(args.num_steps) + '.pth'))
                break

            if i_iter % args.val_pred_every == 0 and i_iter != 0 and (args.local_rank == 0):
                print('save model ...')
                torch.save(model.state_dict(),
                           osp.join(args.snapshot_dir, 'ConResNet_' + str(i_iter) + '.pth'))

            # val
            if i_iter % args.val_pred_every == 0:
                print('validate ...')
                val_ET, val_WT, val_TC = validate(input_size, model, valloader, args.num_classes)
                if (args.local_rank == 0):
                    writer.add_scalar('Val_ET_Dice', val_ET, i_iter)
                    writer.add_scalar('Val_WT_Dice', val_WT, i_iter)
                    writer.add_scalar('Val_TC_Dice', val_TC, i_iter)
                    print('Validate iter = {}, ET = {:.2}, WT = {:.2}, TC = {:.2}'.format(
                        i_iter, val_ET, val_WT, val_TC))

    end = timeit.default_timer()
    print(end - start, 'seconds')
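# The training loop above calls adjust_learning_rate(optimizer, i_iter, base_lr, num_steps, power)
# but its body is not shown. A common polynomial-decay implementation consistent with that call
# signature is sketched below; the project's actual helper may differ.
def adjust_learning_rate(optimizer, i_iter, base_lr, num_steps, power):
    """Polynomial LR decay: lr = base_lr * (1 - i_iter / num_steps) ** power (sketch)."""
    lr = base_lr * ((1.0 - float(i_iter) / num_steps) ** power)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr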
from utils.engine import Engine
from utils.board import Board  # assumed location of Board; adjust to the actual module

if __name__ == '__main__':
    size = input("Enter the board size (at most 26 and even): ")
    while not size.isdigit() \
            or int(size) < 4 or int(size) > 26 \
            or int(size) % 2 == 1:
        size = input("Err, enter the board size (at most 26 and even): ")
    width = height = int(size)

    players = input("Play against an AI? [Y/n] ")
    while players.lower() not in ['yes', 'y', 'oui', 'o', 'no', 'n', 'non', '']:
        players = input("Err, play against an AI? [Y/n] ")
    # compare in lowercase so answers like "No" are handled the same way as "no"
    players = 2 if players.lower() in ['no', 'n', 'non'] else 1

    board = Board(width, height)
    board.make_board()

    engine = Engine(board, players)
    engine.start()
    try:
        while engine.is_playing:
            engine.get_action()
    except (KeyboardInterrupt, EOFError):
        engine.stop()
def ding_train(cfg: BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None,
               show_variables=False, convbuilder=None):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(cfg) as engine:

        is_main_process = (engine.world_rank == 0)  # TODO correct?

        logger = engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # -- typical model components model, opt, scheduler, dataloder --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)
        if convbuilder is None:
            convbuilder = ConvBuilder()
        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, 'val', batch_size=100)  # TODO 100?

        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'.format(
            cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated()))

        # device = torch.device(cfg.device)
        # model.to(device)
        # model.cuda()

        optimizer = get_optimizer(cfg, model)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[engine.world_rank],
                                                              broadcast_buffers=False)
            # model = DistributedDataParallel(model, delay_allreduce=True)

        if engine.continue_state_object:
            engine.restore_checkpoint()
        else:
            if cfg.init_weights:
                engine.load_checkpoint(cfg.init_weights, is_restore=False)

        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        logger.info("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch

        for epoch in range(done_epochs, cfg.max_epochs):

            pbar = tqdm(range(iters_per_epoch))
            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                model.eval()
                val_iters = 500 if cfg.dataset_name == 'imagenet' else 100  # use batch_size=100 for val on ImagenNet and CIFAR
                eval_dict = run_eval(val_dataloader, val_iters, model, criterion, discrip_str,
                                     dataset_name=cfg.dataset_name)
                val_top1_value = eval_dict['top1'].item()
                val_top5_value = eval_dict['top5'].item()
                val_loss_value = eval_dict['loss'].item()
                for tag, value in zip(tb_tags, [val_top1_value, val_top5_value, val_loss_value]):
                    tb_writer.add_scalars(tag, {'Val': value}, iteration)
                engine.log('validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'.format(
                    epoch, val_top1_value, val_top5_value, val_loss_value))
                model.train()

            for _ in pbar:

                scheduler.step()

                start_time = time.time()
                data, label = load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion, if_accum_grad)

                if iteration % cfg.tb_iter_period == 0 and is_main_process:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and is_main_process):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                iteration += 1
                if iteration >= max_iters:
                    break

            # do something after an epoch?
            if iteration >= max_iters:
                break
        # do something after the training
        engine.save_checkpoint(cfg.save_weights)
        print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
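# The training loops in this section rely on an AvgMeter whose definition is not included.
# From its usage (update(value) and .mean) it is a running-average tracker; a minimal
# compatible sketch follows, as an assumption rather than the project's actual class.
class AvgMeter(object):
    """Running average of scalar values, matching the update()/.mean usage above (sketch)."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n

    @property
    def mean(self):
        return self.sum / self.count if self.count > 0 else 0.0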
from utils.file_manager import FileManager
from utils.engine import Engine

B, L, D, libraries, books_scores, picked_books = FileManager.read_file('a_example.txt')

engine = Engine(libraries=libraries, D=D, books_scores=books_scores, picked_books=picked_books)
output: list = engine.start()

FileManager.write_file('a.txt', output)
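# The script above handles a single input file. To run the same read/solve/write pattern over
# several inputs, it can be looped as below; the extra file names are illustrative assumptions.
input_files = ['a_example.txt', 'b_input.txt', 'c_input.txt']  # hypothetical additional inputs

for in_file in input_files:
    B, L, D, libraries, books_scores, picked_books = FileManager.read_file(in_file)
    engine = Engine(libraries=libraries, D=D, books_scores=books_scores, picked_books=picked_books)
    FileManager.write_file(in_file.split('_')[0] + '.txt', engine.start())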
def aofp_train_main(local_rank, target_layers, succ_strategy, warmup_iterations, aofp_batches_per_half, flops_func, cfg: BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, convbuilder=None, init_hdf5=None, no_l2_keywords='depth', gradient_mask=None, use_nesterov=False, tensorflow_style_init=False, keyword_to_lr_mult=None, auto_continue=False, lasso_keyword_to_strength=None, save_hdf5_epochs=10000, remain_flops_ratio=0): if no_l2_keywords is None: no_l2_keywords = [] if type(no_l2_keywords) is not list: no_l2_keywords = [no_l2_keywords] ensure_dir(cfg.output_dir) ensure_dir(cfg.tb_dir) with Engine(local_rank=local_rank) as engine: engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt') # ----------------------------- build model ------------------------------ if convbuilder is None: convbuilder = ConvBuilder(base_config=cfg) if net is None: net_fn = get_model_fn(cfg.dataset_name, cfg.network_type) model = net_fn(cfg, convbuilder) else: model = net model = model.cuda() # ----------------------------- model done ------------------------------ # ---------------------------- prepare data ------------------------- if train_dataloader is None: train_data = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size, distributed=engine.distributed) if cfg.val_epoch_period > 0 and val_dataloader is None: val_data = create_dataset(cfg.dataset_name, 'val', global_batch_size=100, distributed=False) engine.echo('NOTE: Data prepared') engine.echo( 'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}' .format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated())) # ----------------------------- data done -------------------------------- # ------------------------ parepare optimizer, scheduler, criterion ------- optimizer = get_optimizer(engine, cfg, model, no_l2_keywords=no_l2_keywords, use_nesterov=use_nesterov, keyword_to_lr_mult=keyword_to_lr_mult) scheduler = get_lr_scheduler(cfg, optimizer) criterion = get_criterion(cfg).cuda() # --------------------------------- done ------------------------------- engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer) if engine.distributed: torch.cuda.set_device(local_rank) engine.echo('Distributed training, device {}'.format(local_rank)) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], broadcast_buffers=False, ) else: assert torch.cuda.device_count() == 1 engine.echo('Single GPU training') if tensorflow_style_init: init_as_tensorflow(model) if cfg.init_weights: engine.load_checkpoint(cfg.init_weights) if init_hdf5: engine.load_part('base_path.', init_hdf5) if auto_continue: assert cfg.init_weights is None engine.load_checkpoint(get_last_checkpoint(cfg.output_dir)) if show_variables: engine.show_variables() # ------------ do training ---------------------------- # engine.log("\n\nStart training with pytorch version {}".format( torch.__version__)) iteration = engine.state.iteration iters_per_epoch = num_iters_per_epoch(cfg) max_iters = iters_per_epoch * cfg.max_epochs tb_writer = SummaryWriter(cfg.tb_dir) tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss'] model.train() done_epochs = iteration // iters_per_epoch last_epoch_done_iters = iteration % iters_per_epoch if done_epochs == 0 and last_epoch_done_iters == 0: engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5')) recorded_train_time = 0 recorded_train_examples = 0 collected_train_loss_sum = 0 collected_train_loss_count = 0 if 
gradient_mask is not None: gradient_mask_tensor = {} for name, value in gradient_mask.items(): gradient_mask_tensor[name] = torch.Tensor(value).cuda() else: gradient_mask_tensor = None ######################### aofp _init_interval = aofp_batches_per_half // len(target_layers) layer_to_start_iter = { i: (_init_interval * i + warmup_iterations) for i in target_layers } print( 'the initial layer_to_start_iter = {}'.format(layer_to_start_iter)) # 0. get all the AOFPLayers layer_idx_to_module = {} for submodule in model.modules(): if hasattr(submodule, 'score_mask') or hasattr( submodule, 't_value'): layer_idx_to_module[submodule.conv_idx] = submodule print(layer_idx_to_module) ###################################### for epoch in range(done_epochs, cfg.max_epochs): if engine.distributed and hasattr(train_data, 'train_sampler'): train_data.train_sampler.set_epoch(epoch) if epoch == done_epochs: pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters)) else: pbar = tqdm(range(iters_per_epoch)) if epoch == 0 and local_rank == 0: val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags, engine=engine, model=model, val_data=val_data, criterion=criterion, descrip_str='Init', dataset_name=cfg.dataset_name, test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer) top1 = AvgMeter() top5 = AvgMeter() losses = AvgMeter() discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs) pbar.set_description('Train' + discrip_str) for _ in pbar: start_time = time.time() data, label = load_cuda_data(train_data, dataset_name=cfg.dataset_name) # load_cuda_data(train_dataloader, cfg.dataset_name) data_time = time.time() - start_time if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0) train_net_time_start = time.time() ############ aofp # 1. see if it is time to start on every layer # 2. forward and accumulate # 3. 
if a half on some layer is finished, do something # ---- fetch its accumulated t vectors, analyze the first 'granu' elements # ---- if good enough, set the base mask, reset the search space # ---- elif granu == 1, do nothing # ---- else, granu /= 2, reset the search space for layer_idx, start_iter in layer_to_start_iter.items(): if start_iter == iteration: layer_idx_to_module[layer_idx].start_aofp(iteration) acc, acc5, loss = train_one_step( model, data, label, optimizer, criterion, if_accum_grad, gradient_mask_tensor=gradient_mask_tensor, lasso_keyword_to_strength=lasso_keyword_to_strength) for layer_idx, aofp_layer in layer_idx_to_module.items(): # accumulate if layer_idx not in succ_strategy: continue follow_layer_idx = succ_strategy[layer_idx] if follow_layer_idx not in layer_idx_to_module: continue t_value = layer_idx_to_module[follow_layer_idx].t_value aofp_layer.accumulate_t_value(t_value) if aofp_layer.finished_a_half(iteration): aofp_layer.halve_or_stop(iteration) ################################### train_net_time_end = time.time() if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters: recorded_train_examples += cfg.global_batch_size recorded_train_time += train_net_time_end - train_net_time_start scheduler.step() for module in model.modules(): if hasattr(module, 'set_cur_iter'): module.set_cur_iter(iteration) if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0: for tag, value in zip( tb_tags, [acc.item(), acc5.item(), loss.item()]): tb_writer.add_scalars(tag, {'Train': value}, iteration) top1.update(acc.item()) top5.update(acc5.item()) losses.update(loss.item()) if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS: collected_train_loss_sum += loss.item() collected_train_loss_count += 1 pbar_dic = OrderedDict() pbar_dic['data-time'] = '{:.2f}'.format(data_time) pbar_dic['cur_iter'] = iteration pbar_dic['lr'] = scheduler.get_lr()[0] pbar_dic['top1'] = '{:.5f}'.format(top1.mean) pbar_dic['top5'] = '{:.5f}'.format(top5.mean) pbar_dic['loss'] = '{:.5f}'.format(losses.mean) pbar.set_postfix(pbar_dic) iteration += 1 if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0: engine.update_iteration(iteration) if (not engine.distributed) or (engine.distributed and engine.world_rank == 0): engine.save_and_link_checkpoint(cfg.output_dir) if iteration >= max_iters: break # do something after an epoch? 
engine.update_iteration(iteration) engine.save_latest_ckpt(cfg.output_dir) if (epoch + 1) % save_hdf5_epochs == 0: engine.save_hdf5( os.path.join(cfg.output_dir, 'epoch-{}.hdf5'.format(epoch))) if local_rank == 0 and \ cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0): val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags, engine=engine, model=model, val_data=val_data, criterion=criterion, descrip_str=discrip_str, dataset_name=cfg.dataset_name, test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer) cur_deps = np.array(cfg.deps) for submodule in model.modules(): if hasattr(submodule, 'base_mask'): cur_deps[submodule.conv_idx] = np.sum( submodule.base_mask.cpu().numpy() == 1) origin_flops = flops_func(cfg.deps) cur_flops = flops_func(cur_deps) remain_ratio = cur_flops / origin_flops if local_rank == 0: print('##########################') print('origin deps ', cfg.deps) print('cur deps ', cur_deps) print('remain flops ratio = ', remain_ratio, 'the target is ', remain_flops_ratio) print('##########################') if remain_ratio < remain_flops_ratio: break if iteration >= max_iters: break # do something after the training if recorded_train_time > 0: exp_per_sec = recorded_train_examples / recorded_train_time else: exp_per_sec = 0 engine.log( 'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}' .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size, recorded_train_examples, recorded_train_time, exp_per_sec)) if cfg.save_weights: engine.save_checkpoint(cfg.save_weights) print('NOTE: training finished, saved to {}'.format( cfg.save_weights)) engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5')) if collected_train_loss_count > 0: engine.log( 'TRAIN LOSS collected over last {} epochs: {:.6f}'.format( COLLECT_TRAIN_LOSS_EPOCHS, collected_train_loss_sum / collected_train_loss_count)) final_deps = aofp_prune(model, origin_deps=cfg.deps, succ_strategy=succ_strategy, save_path=os.path.join(cfg.output_dir, 'finish_pruned.hdf5')) origin_flops = flops_func(cfg.deps) cur_flops = flops_func(final_deps) engine.log( '##################################################################' ) engine.log(cfg.network_type) engine.log('origin width: {} , flops {} '.format( cfg.deps, origin_flops)) engine.log('final width: {}, flops {} '.format(final_deps, cur_flops)) engine.log('flops reduction: {}'.format(1 - cur_flops / origin_flops)) return final_deps
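# A minimal, self-contained sketch of the per-layer AOFP bookkeeping that the loop
# above drives through start_aofp / accumulate_t_value / finished_a_half /
# halve_or_stop. This is NOT the repository's AOFPLayer: the method names mirror the
# calls above, but the storage layout, the choice of the `granu` candidate filters
# and the "good enough" threshold are assumptions for illustration only.
import numpy as np

class AOFPMaskSketch:
    def __init__(self, num_filters, half_interval=200, threshold=1e-3):
        self.base_mask = np.ones(num_filters)   # 1 = filter kept, 0 = pruned
        self.granu = max(num_filters // 2, 1)   # current search granularity
        self.half_interval = half_interval
        self.threshold = threshold
        self.start_iter = None
        self.accumulated_t = []

    def start_aofp(self, iteration):
        self.start_iter = iteration

    def accumulate_t_value(self, t_value):
        if self.start_iter is not None:
            self.accumulated_t.append(np.asarray(t_value, dtype=float))

    def finished_a_half(self, iteration):
        return (self.start_iter is not None
                and iteration > self.start_iter
                and (iteration - self.start_iter) % self.half_interval == 0
                and len(self.accumulated_t) > 0)

    def halve_or_stop(self, iteration):
        mean_t = np.mean(self.accumulated_t, axis=0)
        # candidate filters: the `granu` entries with the smallest accumulated t
        candidate = np.argsort(mean_t)[:self.granu]
        if mean_t[candidate].mean() < self.threshold:
            self.base_mask[candidate] = 0       # good enough: fix the base mask
        elif self.granu > 1:
            self.granu //= 2                    # otherwise narrow the search
        self.accumulated_t = []                 # reset the search space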
def __init__(self):
    self.head_Joint = ["head_p", "head_y"]
    self.arm_Joint = [
        "l_arm_el_y", "l_arm_sh_p1", "l_arm_sh_p2", "l_arm_sh_r",
        "l_arm_wr_p", "l_arm_wr_r", "l_arm_wr_y", "r_arm_el_y",
        "r_arm_sh_p1", "r_arm_sh_p2", "r_arm_sh_r", "r_arm_wr_p",
        "r_arm_wr_r", "r_arm_wr_y"
    ]
    self.leg_Joint = [
        "l_leg_an_p", "l_leg_an_r", "l_leg_hip_p", "l_leg_hip_r",
        "l_leg_hip_y", "l_leg_kn_p", "r_arm_el_y", "r_leg_an_p",
        "r_leg_an_r", "r_leg_hip_p", "r_leg_hip_r", "r_leg_hip_y",
        "r_leg_kn_p", "torso_y"
    ]
    self.grip_Joint = ["l_arm_grip", "r_arm_grip"]

    # Initialize publisher objects
    self.initPublishers()
    # Initialize subscriber objects
    self.initSubscribe()

    ### initialize the engine
    name2num = {
        'gripON_L': "gripON_L", 'gripOFF_L': "gripOFF_L",
        'gripON_R': "gripON_R", 'gripOFF_R': "gripOFF_R",
        'fLL': 34, 'bLL': 35, 'fRR': 36, 'bRR': 37,
        'fML': 38, 'bML': 39, 'fMR': 40, 'bMR': 41,
        'init': 44, 'DMPPourRtoL': 28, 'removeBall': 29
    }
    self.engine = Engine(name2num)

    self.dmp_y0 = np.array([-1.52017496, 0.04908739, 1.41433029])
    self.dmp_goal = np.array([-1.50848603, 0.0591503, 1.44347592])

    load_file_name = "w_0_1_right_3_100_1000.0_0.01_2"
    #load_file_name = raw_input('file name: ')
    load_file_name_list = load_file_name.split('_')
    ### learning episode
    self.ep = int(load_file_name_list[1])
    ### number of balls to pour into the other tube
    self.numofball = int(load_file_name_list[2])
    ### which arm performs the pouring motion
    self.pour_arm = load_file_name_list[3]
    n_dmps = int(load_file_name_list[4])
    n_bfs = int(load_file_name_list[5])
    decay = float(load_file_name_list[6])
    dt = float(load_file_name_list[7])
    self.total_ball = float(load_file_name_list[8])

    ### initialize the DMP
    self.rl = RLDMPs(n_dmps=n_dmps,
                     n_bfs=n_bfs,
                     decay=decay,
                     y0=self.dmp_y0,
                     goal=self.dmp_goal,
                     ay=np.ones(n_dmps) * 10.0,
                     dt=dt)
    self.rl.load_weight(load_file_name)
    print(self.rl.predict().y)
    print("loaded weights from npy file:")
    print("ep: " + str(self.ep))
    print("pouring " + str(self.numofball) + " ball(s) to the other tube. Total: " + str(self.total_ball))
    print("using the " + self.pour_arm + " arm to pour the balls")

    self.costT_list = []
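# The hard-coded DMP weight file name above encodes its hyperparameters as
# underscore-separated fields; __init__ recovers them by position. A small
# stand-alone illustration of that convention (field meanings taken from the
# assignments above):
name = "w_0_1_right_3_100_1000.0_0.01_2"
fields = name.split('_')
ep, numofball, pour_arm = int(fields[1]), int(fields[2]), fields[3]
n_dmps, n_bfs = int(fields[4]), int(fields[5])
decay, dt, total_ball = float(fields[6]), float(fields[7]), float(fields[8])
print(ep, numofball, pour_arm, n_dmps, n_bfs, decay, dt, total_ball)
# -> 0 1 right 3 100 1000.0 0.01 2.0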
def csgd_prune_and_save(engine:Engine, layer_idx_to_clusters, save_file, succeeding_strategy, new_deps): result = OrderedDict() succeeding_map = parse_succeeding_strategy(succeeding_strategy=succeeding_strategy, layer_idx_to_clusters=layer_idx_to_clusters) kernel_namedvalues = engine.get_all_kernel_namedvalue_as_list() for layer_idx, namedvalue in enumerate(kernel_namedvalues): if layer_idx not in layer_idx_to_clusters: continue k_name = namedvalue.name k_value = namedvalue.value if k_name in result: # If this kernel has been pruned because it is subsequent to another layer k_value = result[k_name] clusters = layer_idx_to_clusters[layer_idx] # Prune the kernel idx_to_delete = [] for clst in clusters: idx_to_delete += clst[1:] kernel_value_pruned = delete_or_keep(k_value, idx_to_delete, axis=0) print('cur kernel name: {}, from {} to {}'.format(k_name, k_value.shape, kernel_value_pruned.shape)) result[k_name] = kernel_value_pruned assert new_deps[layer_idx] == kernel_value_pruned.shape[0] # Prune the related vector params def handle_vecs(key_name): vec_name = k_name.replace('conv.weight', key_name) # Assume the names of conv kernel and bn params follow such a pattern. vec_value = engine.get_param_value_by_name(vec_name) if vec_value is not None: vec_value_pruned = delete_or_keep(vec_value, idx_to_delete) result[vec_name] = vec_value_pruned handle_vecs('conv.bias') handle_vecs('bn.weight') handle_vecs('bn.bias') handle_vecs('bn.running_mean') handle_vecs('bn.running_var') # Handle the succeeding kernels if layer_idx not in succeeding_map: continue follows = succeeding_map[layer_idx] print('{} follows {}'.format(follows, layer_idx)) if type(follows) is not list: follows = [follows] for follow_idx in follows: follow_kernel_value = kernel_namedvalues[follow_idx].value follow_kernel_name = kernel_namedvalues[follow_idx].name if follow_kernel_name in result: follow_kernel_value = result[follow_kernel_name] print('following kernel name: ', follow_kernel_name, 'origin shape: ', follow_kernel_value.shape) if follow_kernel_value.ndim == 2: # The following is a FC layer fc_idx_to_delete = [] num_filters = k_value.shape[0] fc_neurons_per_conv_kernel = follow_kernel_value.shape[1] // num_filters print('{} filters, {} neurons per kernel'.format(num_filters, fc_neurons_per_conv_kernel)) for clst in clusters: if len(clst) == 1: continue for i in clst[1:]: fc_idx_to_delete.append(np.arange(i * fc_neurons_per_conv_kernel, (i + 1) * fc_neurons_per_conv_kernel)) to_concat = [] for i in clst: corresponding_neurons_idx = np.arange(i * fc_neurons_per_conv_kernel, (i + 1) * fc_neurons_per_conv_kernel) to_concat.append(np.expand_dims(follow_kernel_value[:, corresponding_neurons_idx], axis=0)) summed = np.sum(np.concatenate(to_concat, axis=0), axis=0) reserved_idx = np.arange(clst[0] * fc_neurons_per_conv_kernel, (clst[0] + 1) * fc_neurons_per_conv_kernel) follow_kernel_value[:, reserved_idx] = summed if len(fc_idx_to_delete) > 0: follow_kernel_value = delete_or_keep(follow_kernel_value, np.concatenate(fc_idx_to_delete, axis=0), axis=1) result[follow_kernel_name] = follow_kernel_value print('shape of pruned following kernel: ', follow_kernel_value.shape) elif follow_kernel_value.ndim == 4: # The following is a conv layer for clst in clusters: selected_k_follow = follow_kernel_value[:, clst, :, :] summed_k_follow = np.sum(selected_k_follow, axis=1) follow_kernel_value[:, clst[0], :, :] = summed_k_follow follow_kernel_value = delete_or_keep(follow_kernel_value, idx_to_delete, axis=1) result[follow_kernel_name] = 
follow_kernel_value print('shape of pruned following kernel: ', follow_kernel_value.shape) else: raise ValueError('wrong ndim of kernel') key_variables = engine.state_values() for name, value in key_variables.items(): if name not in result: result[name] = value result['deps'] = new_deps print('save {} values to {} after pruning'.format(len(result), save_file)) save_hdf5(result, save_file)
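# csgd_prune_and_save above leans on a delete_or_keep helper to drop pruned filter
# indices along a given axis. The real helper lives elsewhere in the repository;
# the following is only a minimal sketch of the behaviour the calls above rely on,
# wrapping numpy.delete and tolerating an empty index list:
import numpy as np

def delete_or_keep_sketch(array, idx_to_delete, axis=0):
    idx_to_delete = np.asarray(idx_to_delete, dtype=np.int64)
    if idx_to_delete.size == 0:
        return array                    # nothing to prune
    return np.delete(array, idx_to_delete, axis=axis)

# e.g. dropping output filters 1 and 3 of a (4, 16, 3, 3) conv kernel:
kernel = np.random.randn(4, 16, 3, 3)
pruned = delete_or_keep_sketch(kernel, [1, 3], axis=0)
assert pruned.shape == (2, 16, 3, 3)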
def main(cfg, cuda_avail=torch.cuda.is_available()): ### flush cfg to output log file: tqdm.write(str(cfg), file=cfg['logfile']) tqdm.write('-' * 80) ### define function that returns a data loader: def get_iterator(mode='train'): # choose between train/valid data based on `mode`: if mode == 'train': datasets = cfg['train_data_paths'] pin_memory_flag = (cuda_avail and cfg['cuda']) num_workers_setting = 4 if mode == 'valid': datasets = cfg['valid_data_paths'] pin_memory_flag = False num_workers_setting = 1 # form a (possibly concatenated) dataset: ds = SeqTensorDataset(torch.load(datasets[0][0]), torch.load(datasets[0][1]), torch.load(datasets[0][2]), torch.load(datasets[0][3])) for dataset in datasets[1:]: ds += SeqTensorDataset(torch.load(dataset[0]), torch.load(dataset[1]), torch.load(dataset[2]), torch.load(dataset[3])) # return a loader that iterates over the dataset of choice; pagelock the memory location if GPU detected: return DataLoader(ds, batch_size=cfg['batch_size'], shuffle=True, num_workers=num_workers_setting, collate_fn=sequence_collate_fn, pin_memory=pin_memory_flag) ### build RawCTCNet model: in_dim = 1 layers = [(256, 256, d, 3) for d in [1, 2, 4, 8, 16, 32, 64]] * cfg['num_stacks'] num_labels = 5 out_dim = 512 network = RawCTCNet(in_dim, num_labels, layers, out_dim, input_kw=1, input_dil=1, positions=True, softmax=False, causal=False, batch_norm=True) print("Constructed network.") if (cuda_avail and cfg['cuda']): print("CUDA detected; placed network on GPU.") network.cuda() if cfg['model'] is not None: print("Loading model file...") try: network.load_state_dict(torch.load(cfg['model'])) except: print( "ERR: could not restore model. Check model datatype/dimensions." ) ### build CTCLoss and model evaluation function: ctc_loss_fn = CTCLoss() print("Constructed CTC loss function.") maybe_gpu = lambda tsr, has_cuda: tsr if not has_cuda else tsr.cuda() #--- this function performs the gradient descent in synchronous batched mode: def batch_model_loss(sample): # unpack inputs and wrap as `torch.autograd.Variable`s: signals_, signal_lengths_, sequences_, sequence_lengths_ = sample signals = Variable( maybe_gpu(signals_.permute(0, 2, 1), (cuda_avail and cfg['cuda']))) # BxTxD => BxDxT signal_lengths = Variable(signal_lengths_) sequences = Variable(concat_labels(sequences_, sequence_lengths_)) sequence_lengths = Variable(sequence_lengths_) # compute predicted labels: transcriptions = network(signals).permute(2, 0, 1) # Permute: BxDxT => TxBxD # compute CTC loss and return: loss = ctc_loss_fn(transcriptions, sequences.int(), signal_lengths.int(), sequence_lengths.int()) loss.backward() return loss, transcriptions #--- for evaluation-mode, batch-parallel: def batch_model_eval(sample): # unpack inputs and wrap as `torch.autograd.Variable`s: signals_, signal_lengths_, sequences_, sequence_lengths_ = sample signals = Variable(maybe_gpu(signals_.permute(0, 2, 1), (cuda_avail and cfg['cuda'])), volatile=True) # BxTxD => BxDxT signal_lengths = Variable(signal_lengths_, volatile=True) sequences = Variable(concat_labels(sequences_, sequence_lengths_), volatile=True) sequence_lengths = Variable(sequence_lengths_, volatile=True) # compute predicted labels: transcriptions = network(signals).permute(2, 0, 1) # Permute: BxDxT => TxBxD # compute CTC loss and return: loss = ctc_loss_fn(transcriptions, sequences.int(), signal_lengths.int(), sequence_lengths.int()) return loss, transcriptions #--- asynchronous gradient accumulation mode # compute target seqs/losses sequentially over each example, 
average gradients def async_model_loss(sample): # unpack inputs, optionally place on CUDA: signals_, signal_lengths_, sequences_, sequence_lengths_ = sample signals = maybe_gpu(signals_.permute(0, 2, 1), (cuda_avail and cfg['cuda'])) # BxTxD => BxDxT # sequential compute over the batch: total_loss = 0.0 transcriptions_list = [] bsz = signals.size(0) for k in range(bsz): # fetch k-th input from batched sample and wrap as Variable: sig_k_scalar = signal_lengths_[k] seq_k_scalar = sequence_lengths_[k] sig_k_length = Variable(torch.IntTensor([sig_k_scalar])) seq_k_length = Variable(torch.IntTensor([seq_k_scalar])) signal_k = Variable(signals[k, :, :sig_k_scalar].unsqueeze(0)) sequence_k = Variable(sequences_[k, :seq_k_scalar].unsqueeze(0)) # compute transcription output: trans_k = network(signal_k).permute(2, 0, 1) # Permute: 1xDxT => Tx1xD # compute normalized CTC loss and accumulate gradient: loss = ctc_loss_fn(trans_k, sequence_k.int(), sig_k_length.int(), seq_k_length.int()) loss.backward() total_loss += loss transcriptions_list.append(trans_k) # combine transcriptions back into a batch and return: max_length = max([t.size(0) for t in transcriptions_list]) transcriptions = Variable(torch.zeros(max_length, bsz, num_labels)) for j, tr in enumerate(transcriptions_list): transcriptions[0:tr.size(0), j, :] = tr[:, 0, :] return total_loss, transcriptions #--- asynchronous gradient accumulation mode # compute target seqs/losses sequentially over each example, average gradients def async_model_eval(sample): # unpack inputs, optionally place on CUDA: signals_, signal_lengths_, sequences_, sequence_lengths_ = sample signals = maybe_gpu(signals_.permute(0, 2, 1), (cuda_avail and cfg['cuda'])) # BxTxD => BxDxT # sequential compute over the batch: total_loss = 0.0 transcriptions_list = [] bsz = signals.size(0) for k in range(bsz): # fetch k-th input from batched sample and wrap as Variable: sig_k_scalar = signal_lengths_[k] seq_k_scalar = sequence_lengths_[k] sig_k_length = Variable(torch.IntTensor([sig_k_scalar]), volatile=True) seq_k_length = Variable(torch.IntTensor([seq_k_scalar]), volatile=True) signal_k = Variable(signals[k, :, :sig_k_scalar].unsqueeze(0), volatile=True) sequence_k = Variable(sequences_[k, :seq_k_scalar].unsqueeze(0), volatile=True) # compute transcription output: trans_k = network(signal_k).permute(2, 0, 1) # Permute: 1xDxT => Tx1xD # compute normalized CTC loss and accumulate gradient: loss = ctc_loss_fn(trans_k, sequence_k.int(), sig_k_length.int(), seq_k_length.int()) total_loss += loss transcriptions_list.append(trans_k) # combine transcriptions back into a batch and return: max_length = max([t.size(0) for t in transcriptions_list]) transcriptions = Variable(torch.zeros(max_length, bsz, num_labels), volatile=True) for j, tr in enumerate(transcriptions_list): transcriptions[0:tr.size(0), j, :] = tr[:, 0, :] return total_loss, transcriptions #--- choose appropriate model loss/eval functions depending on command line argument: model_loss = async_model_loss if cfg['async'] else batch_model_loss model_eval = async_model_eval if cfg['async'] else batch_model_eval ### build optimizer and LR scheduler: if (cfg['optim'] == 'adamax'): opt = optim.Adamax(network.parameters(), lr=cfg['lr']) elif (cfg['optim'] == 'adam'): opt = optim.Adam(network.parameters(), lr=cfg['lr']) else: raise Exception("Optimizer not recognized!") sched = ReduceLROnPlateau(opt, mode='min', patience=5) print("Constructed {} optimizer.".format(cfg['optim'])) ### build beam search decoder: beam_labels = [' ', 'A', 
'G', 'C', 'T'] beam_blank_id = 0 beam_decoder = CTCBeamDecoder(beam_labels, beam_width=100, blank_id=beam_blank_id, num_processes=4) print("Constructed CTC beam search decoder.") ### build engine, meters, and hooks: engine = Engine() loss_meter = tnt.meter.MovingAverageValueMeter(windowsize=5) print("Constructed engine. Running training loop...") #-- hook: reset all meters def reset_all_meters(): loss_meter.reset() #-- hook: don't do anything for now when obtaining a data sample def on_sample(state): pass #-- hook: don't do anything on gradient update for now def on_update(state): pass #-- hook: update loggers at each forward pass def on_forward(state): loss_meter.add(state['loss'].data[0]) if (state['t'] % cfg['print_every'] == 0): tqdm.write("Step: {0} | Loss: {1}".format(state['t'], state['loss'].data[0]), file=cfg['logfile']) #-- hook: reset all meters at the start of the epoch def on_start_epoch(state): reset_all_meters() network.train() # set to training mode for batch norm state['iterator'] = tqdm(state['iterator']) #-- hook: perform validation and beam-search-decoding at end of each epoch: def on_end_epoch(state): network.eval() # set to validation mode for batch-norm # K steps of validation; average the loss: val_losses = [] base_seqs = [] val_data_iterator = get_iterator('valid') for k, val_sample in enumerate(val_data_iterator): if k > cfg['num_valid_steps']: break val_loss, transcriptions = model_eval(val_sample) val_losses.append(val_loss.data[0]) sequences = val_sample[2] # mask out the padding & permute (TxBxD => BxTxD): scores = mask_padding(transcriptions.permute(1, 0, 2), val_sample[1], fill_logit_idx=0) logits = F.softmax(scores, dim=2) base_seqs.append((sequences, logits)) avg_val_loss = np.mean(val_losses) # log to both logfile and stdout: tqdm.write("EPOCH {0} | Avg. Val Loss: {1}".format( state['epoch'], avg_val_loss), file=cfg['logfile']) print("EPOCH {0} | Avg. Val Loss: {1}".format(state['epoch'], avg_val_loss)) # send average val. loss to learning rate scheduler: sched.step(avg_val_loss) # beam search decoding: # (wrapped in try-excepts to prevent a thrown error from aborting training) _nt_dict_ = {0: ' ', 1: 'A', 2: 'G', 3: 'C', 4: 'T'} def convert_to_string(toks, voc, num): try: nt = ''.join([voc[t] for t in toks[0:num]]) except: nt = '' return nt for true_seqs, logits in base_seqs: try: true_nts = labels2strings(true_seqs, lookup=_nt_dict_) amax_nts = labels2strings(argmax_decode(logits), lookup=_nt_dict_) beam_result, beam_scores, beam_times, beam_lengths = beam_decoder.decode( logits.data) pred_nts = [ convert_to_string(beam_result[k][0], _nt_dict_, beam_lengths[k][0]) for k in range(len(beam_result)) ] for i in range(min(len(true_nts), len(pred_nts))): tqdm.write("True Seq: {0}".format(true_nts[i]), file=cfg['logfile']) tqdm.write("Beam Seq: {0}".format(pred_nts[i]), file=cfg['logfile']) tqdm.write("Amax Seq: {0}".format(amax_nts[i]), file=cfg['logfile']) tqdm.write( ("- " * 10 + "Local Beam Alignment" + " -" * 10), file=cfg['logfile']) tqdm.write(ssw(true_nts[i], pred_nts[i]), file=cfg['logfile']) tqdm.write("= " * 40, file=cfg['logfile']) except: tqdm.write("(WARN: Could not parse batch; skipping...)", file=cfg['logfile']) continue # save model: try: mdl_dtype = "cuda" if (cuda_avail and cfg['cuda']) else "cpu" mdl_path = os.path.join( cfg['save_dir'], "ctc_encoder.{0}.{1}.pth".format(state['epoch'], mdl_dtype)) torch.save(network.state_dict(), mdl_path) tqdm.write("Saved model.", file=cfg['logfile']) except: print("Unable to serialize model; Moving on. 
Traceback:") traceback.print_exc() tqdm.write("Unable to serialize models. Moving on...", file=cfg['logfile']) # reset all meters for next epoch: reset_all_meters() ### engine setup & training: engine.hooks['on_sample'] = on_sample engine.hooks['on_forward'] = on_forward engine.hooks['on_start_epoch'] = on_start_epoch engine.hooks['on_end_epoch'] = on_end_epoch engine.train(model_loss, get_iterator('train'), maxepoch=cfg['max_epochs'], optimizer=opt)
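# The training driver above follows the torchnet-style hook Engine: train()
# repeatedly calls a closure that returns (loss, output), backpropagates, and fires
# the named hooks around every step. A minimal toy sketch of that contract
# (assuming the torchnet package; the closure, data and parameter here are made up
# and unrelated to the CTC model above):
import torch
import torchnet as tnt
from torchnet.engine import Engine

w = torch.zeros(1, requires_grad=True)          # single trainable parameter

def toy_closure(sample):
    x, y = sample
    pred = (w * x).sum()
    loss = (pred - y.sum()) ** 2
    return loss, pred                           # Engine expects (loss, output)

engine = Engine()
meter = tnt.meter.AverageValueMeter()
engine.hooks['on_forward'] = lambda state: meter.add(state['loss'].item())
engine.hooks['on_end_epoch'] = lambda state: print('epoch', state['epoch'],
                                                   'mean loss', meter.value()[0])

data = [(torch.randn(4), torch.randn(4)) for _ in range(8)]
engine.train(toy_closure, data, maxepoch=1,
             optimizer=torch.optim.SGD([w], lr=0.01))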
def csgd_train_and_prune(cfg: BaseConfigByEpoch, target_deps, centri_strength, pacesetter_dict, succeeding_strategy, pruned_weights, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, convbuilder=None, beginning_msg=None, init_hdf5=None, no_l2_keywords=None, use_nesterov=False, tensorflow_style_init=False): ensure_dir(cfg.output_dir) ensure_dir(cfg.tb_dir) clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy') with Engine() as engine: is_main_process = (engine.world_rank == 0) #TODO correct? logger = engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt') # -- typical model components model, opt, scheduler, dataloder --# if net is None: net = get_model_fn(cfg.dataset_name, cfg.network_type) if convbuilder is None: convbuilder = ConvBuilder(base_config=cfg) model = net(cfg, convbuilder).cuda() if train_dataloader is None: train_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size) if cfg.val_epoch_period > 0 and val_dataloader is None: val_dataloader = create_dataset(cfg.dataset_name, 'val', batch_size=100) #TODO 100? print('NOTE: Data prepared') print( 'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}' .format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated())) optimizer = get_optimizer(cfg, model, use_nesterov=use_nesterov) scheduler = get_lr_scheduler(cfg, optimizer) criterion = get_criterion(cfg).cuda() # model, optimizer = amp.initialize(model, optimizer, opt_level="O0") engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer, cfg=cfg) if engine.distributed: print('Distributed training, engine.world_rank={}'.format( engine.world_rank)) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[engine.world_rank], broadcast_buffers=False, ) # model = DistributedDataParallel(model, delay_allreduce=True) elif torch.cuda.device_count() > 1: print('Single machine multiple GPU training') model = torch.nn.parallel.DataParallel(model) if tensorflow_style_init: for k, v in model.named_parameters(): if v.dim() in [2, 4]: torch.nn.init.xavier_uniform_(v) print('init {} as xavier_uniform'.format(k)) if 'bias' in k and 'bn' not in k.lower(): torch.nn.init.zeros_(v) print('init {} as zero'.format(k)) if cfg.init_weights: engine.load_checkpoint(cfg.init_weights) if init_hdf5: engine.load_hdf5(init_hdf5) kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list( ) if os.path.exists(clusters_save_path): layer_idx_to_clusters = np.load(clusters_save_path).item() else: layer_idx_to_clusters = get_layer_idx_to_clusters( kernel_namedvalue_list=kernel_namedvalue_list, target_deps=target_deps, pacesetter_dict=pacesetter_dict) if pacesetter_dict is not None: for follower_idx, pacesetter_idx in pacesetter_dict.items(): if pacesetter_idx in layer_idx_to_clusters: layer_idx_to_clusters[ follower_idx] = layer_idx_to_clusters[ pacesetter_idx] np.save(clusters_save_path, layer_idx_to_clusters) csgd_save_file = os.path.join(cfg.output_dir, 'finish.hdf5') if os.path.exists(csgd_save_file): engine.load_hdf5(csgd_save_file) else: param_name_to_merge_matrix = generate_merge_matrix_for_kernel( deps=cfg.deps, layer_idx_to_clusters=layer_idx_to_clusters, kernel_namedvalue_list=kernel_namedvalue_list) param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs( deps=cfg.deps, layer_idx_to_clusters=layer_idx_to_clusters, kernel_namedvalue_list=kernel_namedvalue_list, weight_decay=cfg.weight_decay, 
centri_strength=centri_strength) # if pacesetter_dict is not None: # for follower_idx, pacesetter_idx in pacesetter_dict.items(): # follower_kernel_name = kernel_namedvalue_list[follower_idx].name # pacesetter_kernel_name = kernel_namedvalue_list[follower_idx].name # if pacesetter_kernel_name in param_name_to_merge_matrix: # param_name_to_merge_matrix[follower_kernel_name] = param_name_to_merge_matrix[ # pacesetter_kernel_name] # param_name_to_decay_matrix[follower_kernel_name] = param_name_to_decay_matrix[ # pacesetter_kernel_name] add_vecs_to_mat_dicts(param_name_to_merge_matrix) if show_variables: engine.show_variables() if beginning_msg: engine.log(beginning_msg) logger.info("\n\nStart training with pytorch version {}".format( torch.__version__)) iteration = engine.state.iteration # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name) iters_per_epoch = num_iters_per_epoch(cfg) max_iters = iters_per_epoch * cfg.max_epochs tb_writer = SummaryWriter(cfg.tb_dir) tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss'] model.train() done_epochs = iteration // iters_per_epoch engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5')) recorded_train_time = 0 recorded_train_examples = 0 for epoch in range(done_epochs, cfg.max_epochs): pbar = tqdm(range(iters_per_epoch)) top1 = AvgMeter() top5 = AvgMeter() losses = AvgMeter() discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs) pbar.set_description('Train' + discrip_str) if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0: model.eval() val_iters = 500 if cfg.dataset_name == 'imagenet' else 100 # use batch_size=100 for val on ImagenNet and CIFAR eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion, discrip_str, dataset_name=cfg.dataset_name) val_top1_value = eval_dict['top1'].item() val_top5_value = eval_dict['top5'].item() val_loss_value = eval_dict['loss'].item() for tag, value in zip( tb_tags, [val_top1_value, val_top5_value, val_loss_value]): tb_writer.add_scalars(tag, {'Val': value}, iteration) engine.log( 'validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}' .format(epoch, val_top1_value, val_top5_value, val_loss_value)) model.train() for _ in pbar: start_time = time.time() data, label = load_cuda_data(train_dataloader, cfg.dataset_name) data_time = time.time() - start_time train_net_time_start = time.time() acc, acc5, loss = train_one_step( model, data, label, optimizer, criterion, param_name_to_merge_matrix=param_name_to_merge_matrix, param_name_to_decay_matrix=param_name_to_decay_matrix) train_net_time_end = time.time() if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters: recorded_train_examples += cfg.global_batch_size recorded_train_time += train_net_time_end - train_net_time_start scheduler.step() if iteration % cfg.tb_iter_period == 0 and is_main_process: for tag, value in zip( tb_tags, [acc.item(), acc5.item(), loss.item()]): tb_writer.add_scalars(tag, {'Train': value}, iteration) top1.update(acc.item()) top5.update(acc5.item()) losses.update(loss.item()) pbar_dic = OrderedDict() pbar_dic['data-time'] = '{:.2f}'.format(data_time) pbar_dic['cur_iter'] = iteration pbar_dic['lr'] = scheduler.get_lr()[0] pbar_dic['top1'] = '{:.5f}'.format(top1.mean) pbar_dic['top5'] = '{:.5f}'.format(top5.mean) pbar_dic['loss'] = '{:.5f}'.format(losses.mean) pbar.set_postfix(pbar_dic) if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0: engine.update_iteration(iteration) if (not engine.distributed) or (engine.distributed and is_main_process): 
engine.save_and_link_checkpoint(cfg.output_dir) iteration += 1 if iteration >= max_iters: break # do something after an epoch? if iteration >= max_iters: break # do something after the training if recorded_train_time > 0: exp_per_sec = recorded_train_examples / recorded_train_time else: exp_per_sec = 0 engine.log( 'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}' .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size, recorded_train_examples, recorded_train_time, exp_per_sec)) if cfg.save_weights: engine.save_checkpoint(cfg.save_weights) print('NOTE: training finished, saved to {}'.format( cfg.save_weights)) engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5')) csgd_prune_and_save(engine=engine, layer_idx_to_clusters=layer_idx_to_clusters, save_file=pruned_weights, succeeding_strategy=succeeding_strategy, new_deps=target_deps)
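# Conceptual sketch of the C-SGD merge matrix produced by
# generate_merge_matrix_for_kernel above (this is NOT the repository's
# implementation, only the idea): filters in the same cluster get their gradients
# averaged, so they converge to identical values and all but one per cluster can be
# pruned afterwards by csgd_prune_and_save. Entry (i, j) is 1/|cluster| when filters
# i and j share a cluster and 0 otherwise; left-multiplying the flattened per-filter
# gradient matrix performs the averaging.
import numpy as np

def merge_matrix_sketch(num_filters, clusters):
    m = np.zeros((num_filters, num_filters))
    for clst in clusters:
        for i in clst:
            m[i, clst] = 1.0 / len(clst)
    return m

# e.g. 4 filters with filters 0 and 2 clustered together:
M = merge_matrix_sketch(4, [[0, 2], [1], [3]])
grad = np.arange(4, dtype=float).reshape(4, 1)   # flattened per-filter gradients
print(M @ grad)   # rows 0 and 2 now share the averaged gradient value 1.0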
def ding_train(cfg:BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, convbuilder=None, beginning_msg=None, init_hdf5=None, no_l2_keywords=None, gradient_mask=None, use_nesterov=False): # LOCAL_RANK = 0 # # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 # is_distributed = num_gpus > 1 # # if is_distributed: # torch.cuda.set_device(LOCAL_RANK) # torch.distributed.init_process_group( # backend="nccl", init_method="env://" # ) # synchronize() # # torch.backends.cudnn.benchmark = True ensure_dir(cfg.output_dir) ensure_dir(cfg.tb_dir) with Engine() as engine: is_main_process = (engine.world_rank == 0) #TODO correct? logger = engine.setup_log( name='train', log_dir=cfg.output_dir, file_name='log.txt') # -- typical model components model, opt, scheduler, dataloder --# if net is None: net = get_model_fn(cfg.dataset_name, cfg.network_type) if convbuilder is None: convbuilder = ConvBuilder(base_config=cfg) model = net(cfg, convbuilder).cuda() if train_dataloader is None: train_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size) if cfg.val_epoch_period > 0 and val_dataloader is None: val_dataloader = create_dataset(cfg.dataset_name, 'val', batch_size=100) #TODO 100? print('NOTE: Data prepared') print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'.format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated())) # device = torch.device(cfg.device) # model.to(device) # model.cuda() if no_l2_keywords is None: no_l2_keywords = [] optimizer = get_optimizer(cfg, model, no_l2_keywords=no_l2_keywords, use_nesterov=use_nesterov) scheduler = get_lr_scheduler(cfg, optimizer) criterion = get_criterion(cfg).cuda() # model, optimizer = amp.initialize(model, optimizer, opt_level="O0") engine.register_state( scheduler=scheduler, model=model, optimizer=optimizer) if engine.distributed: print('Distributed training, engine.world_rank={}'.format(engine.world_rank)) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[engine.world_rank], broadcast_buffers=False, ) # model = DistributedDataParallel(model, delay_allreduce=True) elif torch.cuda.device_count() > 1: print('Single machine multiple GPU training') model = torch.nn.parallel.DataParallel(model) # for k, v in model.named_parameters(): # if v.dim() in [2, 4]: # torch.nn.init.xavier_normal_(v) # print('init {} as xavier_normal'.format(k)) # if 'bias' in k and 'bn' not in k.lower(): # torch.nn.init.zeros_(v) # print('init {} as zero'.format(k)) if cfg.init_weights: engine.load_checkpoint(cfg.init_weights, is_restore=True) if init_hdf5: engine.load_hdf5(init_hdf5) if show_variables: engine.show_variables() # ------------ do training ---------------------------- # if beginning_msg: engine.log(beginning_msg) logger.info("\n\nStart training with pytorch version {}".format(torch.__version__)) iteration = engine.state.iteration # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name) iters_per_epoch = num_iters_per_epoch(cfg) max_iters = iters_per_epoch * cfg.max_epochs tb_writer = SummaryWriter(cfg.tb_dir) tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss'] model.train() done_epochs = iteration // iters_per_epoch engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5')) # summary(model=model, input_size=(224, 224) if cfg.dataset_name == 'imagenet' else (32, 32), batch_size=cfg.global_batch_size) recorded_train_time = 0 recorded_train_examples = 0 if gradient_mask is not None: 
gradient_mask_tensor = {} for name, value in gradient_mask.items(): gradient_mask_tensor[name] = torch.Tensor(value).cuda() else: gradient_mask_tensor = None for epoch in range(done_epochs, cfg.max_epochs): pbar = tqdm(range(iters_per_epoch)) top1 = AvgMeter() top5 = AvgMeter() losses = AvgMeter() discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs) pbar.set_description('Train' + discrip_str) if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0: model.eval() val_iters = 500 if cfg.dataset_name == 'imagenet' else 100 # use batch_size=100 for val on ImagenNet and CIFAR eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion, discrip_str, dataset_name=cfg.dataset_name) val_top1_value = eval_dict['top1'].item() val_top5_value = eval_dict['top5'].item() val_loss_value = eval_dict['loss'].item() for tag, value in zip(tb_tags, [val_top1_value, val_top5_value, val_loss_value]): tb_writer.add_scalars(tag, {'Val': value}, iteration) engine.log('validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'.format(epoch, val_top1_value, val_top5_value, val_loss_value)) model.train() for _ in pbar: start_time = time.time() data, label = load_cuda_data(train_dataloader, cfg.dataset_name) data_time = time.time() - start_time if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0) train_net_time_start = time.time() acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion, if_accum_grad, gradient_mask_tensor=gradient_mask_tensor) train_net_time_end = time.time() if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters: recorded_train_examples += cfg.global_batch_size recorded_train_time += train_net_time_end - train_net_time_start scheduler.step() if iteration % cfg.tb_iter_period == 0 and is_main_process: for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]): tb_writer.add_scalars(tag, {'Train': value}, iteration) top1.update(acc.item()) top5.update(acc5.item()) losses.update(loss.item()) pbar_dic = OrderedDict() pbar_dic['data-time'] = '{:.2f}'.format(data_time) pbar_dic['cur_iter'] = iteration pbar_dic['lr'] = scheduler.get_lr()[0] pbar_dic['top1'] = '{:.5f}'.format(top1.mean) pbar_dic['top5'] = '{:.5f}'.format(top5.mean) pbar_dic['loss'] = '{:.5f}'.format(losses.mean) pbar.set_postfix(pbar_dic) if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0: engine.update_iteration(iteration) if (not engine.distributed) or (engine.distributed and is_main_process): engine.save_and_link_checkpoint(cfg.output_dir) iteration += 1 if iteration >= max_iters: break # do something after an epoch? if iteration >= max_iters: break # do something after the training if recorded_train_time > 0: exp_per_sec = recorded_train_examples / recorded_train_time else: exp_per_sec = 0 engine.log( 'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}' .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size, recorded_train_examples, recorded_train_time, exp_per_sec)) if cfg.save_weights: engine.save_checkpoint(cfg.save_weights) print('NOTE: training finished, saved to {}'.format(cfg.save_weights)) engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
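# A minimal sketch of how the gradient_mask_tensor dict built in ding_train could be
# applied inside a training step. The real train_one_step is defined elsewhere in
# the repository; this only illustrates the masking idea (function name and argument
# order here are assumptions):
import torch

def masked_step_sketch(model, loss, optimizer, gradient_mask_tensor=None):
    optimizer.zero_grad()
    loss.backward()
    if gradient_mask_tensor is not None:
        for name, param in model.named_parameters():
            if name in gradient_mask_tensor and param.grad is not None:
                # zero (or scale) the frozen entries before the update
                param.grad.data.mul_(gradient_mask_tensor[name])
    optimizer.step()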
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utils.strategies import AdvancedStrat
from utils.engine import Engine

#board_name = "match"
board_name = "test4"

strategy = AdvancedStrat()
engine = Engine(board_name)
#engine.setGoals([(16,10), (2,10)])
engine.setGoals([(18, 7), (1, 11)])
#engine.setGoals([(18,9), (1,9)])
engine.play(strategy)
default='xtqa', required=True)
    args.add_argument('--run_mode', dest='run_mode', choices=['train', 'test'], type=str, default='train', required=True)
    args = args.parse_args()
    return args


if __name__ == '__main__':
    args = parse_input_args()
    cfg_file = 'configs/{}/{}.yml'.format(args.dataset_use, args.model)
    with open(cfg_file, 'r') as f:
        yml_dict = yaml.load(stream=f, Loader=yaml.BaseLoader)

    cfgs = CfgLoader(args.dataset_use, args.model).load()
    args_dict = cfgs.parse_to_dict(args)
    args_dict = {**args_dict, **yml_dict}
    cfgs.add_attr(args_dict)
    cfgs.proc()

    print('Configurations of Networks:')
    print(cfgs)

    engine = Engine(cfgs=cfgs)
    engine.load_method()
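# Note on the merge above: in {**args_dict, **yml_dict} the right-hand dict wins,
# so keys present in the YAML config override same-named command-line arguments
# (and arrive as strings under yaml.BaseLoader). A tiny illustration:
args_dict = {'lr': 0.01, 'run_mode': 'train'}
yml_dict = {'lr': '0.001'}
print({**args_dict, **yml_dict})   # {'lr': '0.001', 'run_mode': 'train'}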
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utils.strategies import SingleStrat
from utils.engine import Engine

board_name = "pathfindingWorld3"

strategy = SingleStrat()
engine = Engine(board_name)
engine.play(strategy)