Example 1
def blogJobDay(self):
    logger.info('blogJob-startTime:%s' %
                (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    messages = Engine().getYesterdayUrls()
    MailUtil().senHtml(messages)
    logger.info('blogJob-endTime:%s' %
                (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
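MailUtil is not shown in this listing; the sketch below is only a plausible senHtml helper (method name kept as in the example) built on smtplib, with the SMTP host and addresses as placeholder assumptions:

import smtplib
from email.mime.text import MIMEText

class MailUtil:
    # Hypothetical sketch: host, sender and recipient are placeholders.
    def senHtml(self, html_body):
        msg = MIMEText(html_body, 'html', 'utf-8')
        msg['Subject'] = 'Yesterday blog urls'
        msg['From'] = 'sender@example.com'
        msg['To'] = 'receiver@example.com'
        with smtplib.SMTP('smtp.example.com', 25) as server:
            server.send_message(msg)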
Example 2
def ding_test(cfg: BaseConfigByEpoch, net=None, val_dataloader=None, show_variables=False,
              convbuilder=None, init_hdf5=None, extra_msg=None, weights_dict=None):

    with Engine(local_rank=0, for_val_only=True) as engine:

        engine.setup_log(
            name='test', log_dir='./', file_name=DETAIL_LOG_FILE)

        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)

        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder).cuda()
        else:
            model = net.cuda()

        if val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                      global_batch_size=cfg.global_batch_size, distributed=False)
        else:
            val_data = val_dataloader
        num_examples = num_val_examples(cfg.dataset_name)
        assert num_examples % cfg.global_batch_size == 0
        val_iters = num_examples // cfg.global_batch_size
        print('batchsize={}, {} iters'.format(cfg.global_batch_size, val_iters))

        criterion = get_criterion(cfg).cuda()

        engine.register_state(
            scheduler=None, model=model, optimizer=None)

        if show_variables:
            engine.show_variables()

        assert not engine.distributed

        if weights_dict is not None:
            engine.load_from_weights_dict(weights_dict)
        else:
            if cfg.init_weights:
                engine.load_checkpoint(cfg.init_weights)
            if init_hdf5:
                engine.load_hdf5(init_hdf5)

        # engine.save_by_order('smi2_by_order.hdf5')
        # engine.load_by_order('smi2_by_order.hdf5')
        # engine.save_hdf5('model_files/stami2_lrs4Z.hdf5')

        model.eval()
        eval_dict, total_net_time = run_eval(val_data, val_iters, model, criterion, 'TEST', dataset_name=cfg.dataset_name)
        val_top1_value = eval_dict['top1'].item()
        val_top5_value = eval_dict['top5'].item()
        val_loss_value = eval_dict['loss'].item()

        msg = '{},{},{},top1={:.5f},top5={:.5f},loss={:.7f},total_net_time={}'.format(
            cfg.network_type, init_hdf5 or cfg.init_weights, cfg.dataset_subset,
            val_top1_value, val_top5_value, val_loss_value, total_net_time)
        if extra_msg is not None:
            msg += ', ' + extra_msg
        log_important(msg, OVERALL_LOG_FILE)
        return eval_dict
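For context, a minimal invocation of this test entry point might look like the sketch below; the config construction is elided because BaseConfigByEpoch is not part of this listing, and the hdf5 file name is a placeholder:

# Hypothetical usage; the config and file name are placeholders.
cfg = ...  # a BaseConfigByEpoch with dataset_name, network_type,
           # dataset_subset, global_batch_size and init_weights set
eval_dict = ding_test(cfg, init_hdf5='model.hdf5', show_variables=True)
print('top1 = {:.5f}'.format(eval_dict['top1'].item()))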
Example 3
def ding_test(cfg: BaseConfigByEpoch, net=None, val_dataloader=None, show_variables=False,
              convbuilder=None, init_hdf5=None):

    with Engine() as engine:

        engine.setup_log(
            name='test', log_dir='./', file_name=DETAIL_LOG_FILE)

        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)

        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)

        model = net(cfg, convbuilder).cuda()

        if val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset, batch_size=cfg.global_batch_size)
        val_iters = 50000 // cfg.global_batch_size if cfg.dataset_name == 'imagenet' else 10000 // cfg.global_batch_size

        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'.format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated()))

        criterion = get_criterion(cfg).cuda()

        engine.register_state(
            scheduler=None, model=model, optimizer=None, cfg=cfg)

        if show_variables:
            engine.show_variables()

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[engine.world_rank],
                broadcast_buffers=False, )
            # model = DistributedDataParallel(model, delay_allreduce=True)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights, just_weights=True)

        if init_hdf5:
            engine.load_hdf5(init_hdf5)

        model.eval()
        eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion, 'TEST', dataset_name=cfg.dataset_name)
        val_top1_value = eval_dict['top1'].item()
        val_top5_value = eval_dict['top5'].item()
        val_loss_value = eval_dict['loss'].item()

        msg = '{},{},{},top1={:.5f},top5={:.5f},loss={:.7f}'.format(cfg.network_type, init_hdf5 or cfg.init_weights, cfg.dataset_subset,
                                                                    val_top1_value, val_top5_value, val_loss_value)
        log_important(msg, OVERALL_LOG_FILE)
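This variant hard-codes the validation set sizes (50000 for the ImageNet validation split, 10000 otherwise) that the previous example wraps in num_val_examples; a helper consistent with both examples could look like this (an assumption, since only these two dataset families appear here):

def num_val_examples(dataset_name):
    # 50000 ImageNet val images; CIFAR-style test sets have 10000
    return 50000 if dataset_name == 'imagenet' else 10000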
Example 4
def blogJob(self):
    logger.info('blogJob-startTime:%s' %
                (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    engine = Engine()
    urls = engine.get_url()
    logger.info('urls : ' + str(len(urls)))
    deduplicationUrls = engine.deduplication()
    logger.info('deduplicationUrls : ' + str(len(deduplicationUrls)))
    if len(deduplicationUrls) > 0:
        engine.download_file()
        # engine.senMail()
    logger.info('blogJob-endTime:%s' %
                (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
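Both blog jobs read like periodic tasks; one plausible way to run them daily is APScheduler (an assumption about the surrounding project, which is not shown here):

from apscheduler.schedulers.blocking import BlockingScheduler

# Hypothetical wiring: 'job' is an instance of the class defining blogJob.
scheduler = BlockingScheduler()
scheduler.add_job(job.blogJob, 'cron', hour=8, minute=0)  # daily at 08:00
scheduler.start()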
Example 5
def csgd_train_and_prune(cfg: BaseConfigByEpoch,
                         target_deps,
                         centri_strength,
                         pacesetter_dict,
                         succeeding_strategy,
                         pruned_weights,
                         extra_cfg,
                         net=None,
                         train_dataloader=None,
                         val_dataloader=None,
                         show_variables=False,
                         beginning_msg=None,
                         init_weights=None,
                         no_l2_keywords=None,
                         use_nesterov=False,
                         tensorflow_style_init=False,
                         iter=None):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy')
    print("cluster save path:{}".format(clusters_save_path))
    config = extra_cfg

    with Engine() as engine:

        is_main_process = (engine.world_rank == 0)  #TODO correct?

        logger = engine.setup_log(name='train',
                                  log_dir=cfg.output_dir,
                                  file_name='log.txt')

        saveName = "%s-%s.yaml" % (config['note'], config['dataset'])
        modelName = config['modelName']
        os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu_available']
        device_ids = range(config['gpu_num'])
        trainSet = GoProDataset(sharp_root=config['train_sharp'],
                                blur_root=config['train_blur'],
                                resize_size=config['resize_size'],
                                patch_size=config['crop_size'],
                                phase='train')
        testSet = GoProDataset(sharp_root=config['test_sharp'],
                               blur_root=config['test_blur'],
                               resize_size=config['resize_size'],
                               patch_size=config['crop_size'],
                               phase='test')

        train_loader = DataLoader(trainSet,
                                  batch_size=config['batchsize'],
                                  shuffle=True,
                                  num_workers=4,
                                  drop_last=True,
                                  pin_memory=True)
        test_loader = DataLoader(testSet,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=1,
                                 drop_last=False,
                                 pin_memory=True)

        print('NOTE: Data prepared')
        print(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(config['batchsize'], torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))

        model = net

        optimizer = get_optimizer(cfg, model, use_nesterov=use_nesterov)
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=config['step'],
                                             gamma=0.5)  # learning rates
        criterion = get_criterion(cfg).cuda()

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer,
                              cfg=cfg)

        model = torch.nn.DataParallel(model.cuda(), device_ids=device_ids)

        # load weights from the last prune iteration, or from the unpruned model
        if init_weights:
            engine.load_pth(init_weights)

        # for UNet, the last outconv will not be pruned
        kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list(
            remove='out')

        # cluster filters
        if os.path.exists(clusters_save_path):
            layer_idx_to_clusters = np.load(clusters_save_path,
                                            allow_pickle=True).item()
            print("cluster exist, load from {}".format(clusters_save_path))
        else:
            layer_idx_to_clusters = get_layer_idx_to_clusters(
                kernel_namedvalue_list=kernel_namedvalue_list,
                target_deps=target_deps,
                pacesetter_dict=pacesetter_dict)
            if pacesetter_dict is not None:
                for follower_idx, pacesetter_idx in pacesetter_dict.items():
                    if pacesetter_idx in layer_idx_to_clusters:
                        layer_idx_to_clusters[
                            follower_idx] = layer_idx_to_clusters[
                                pacesetter_idx]

            # print(layer_idx_to_clusters)

            np.save(clusters_save_path, layer_idx_to_clusters)

        csgd_save_file = os.path.join(cfg.output_dir, 'finish.pth')

        # if this prune iter has a trained model, then load it
        if os.path.exists(csgd_save_file):
            engine.load_pth(csgd_save_file)
        else:
            param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list)
            param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list,
                weight_decay=cfg.weight_decay,
                centri_strength=centri_strength)
            # if pacesetter_dict is not None:
            #     for follower_idx, pacesetter_idx in pacesetter_dict.items():
            #         follower_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         pacesetter_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         if pacesetter_kernel_name in param_name_to_merge_matrix:
            #             param_name_to_merge_matrix[follower_kernel_name] = param_name_to_merge_matrix[
            #                 pacesetter_kernel_name]
            #             param_name_to_decay_matrix[follower_kernel_name] = param_name_to_decay_matrix[
            #                 pacesetter_kernel_name]

            # add the bn and conv.bias parameters to the matrix dicts to enable the C-SGD update rule
            add_vecs_to_mat_dicts(param_name_to_merge_matrix)

            if show_variables:
                engine.show_variables()

            if beginning_msg:
                engine.log(beginning_msg)

            logger.info("\n\nStart training with pytorch version {}".format(
                torch.__version__))

            iteration = engine.state.iteration
            startEpoch = config['start_epoch']
            max_epochs = config['max_epochs']

            engine.save_pth(os.path.join(cfg.output_dir, 'init.pth'))

            viz = Visdom(env=saveName)
            bestPSNR = config['bestPSNR']

            itr = '' if iter is None else str(iter)
            for epoch in range(startEpoch, max_epochs):
                # eval
                if epoch % config['save_epoch'] == 0:
                    with torch.no_grad():
                        model.eval()
                        avg_PSNR = 0
                        idx = 0
                        for test_data in test_loader:
                            idx += 1
                            test_data['L'] = test_data['L'].cuda()
                            sharp = model(test_data['L'])
                            sharp = sharp.detach().float().cpu()
                            sharp = util.tensor2uint(sharp)
                            test_data['H'] = util.tensor2uint(test_data['H'])
                            current_psnr = util.calculate_psnr(sharp,
                                                               test_data['H'],
                                                               border=0)

                            avg_PSNR += current_psnr
                            if idx % 100 == 0:
                                print("epoch {}: tested {}".format(epoch, idx))
                        avg_PSNR = avg_PSNR / idx
                        print("total PSNR : {:<4.2f}".format(avg_PSNR))
                        viz.line(X=[epoch],
                                 Y=[avg_PSNR],
                                 win='testPSNR-' + itr,
                                 opts=dict(title='psnr',
                                           legend=['valid_psnr']),
                                 update='append')
                        if avg_PSNR > bestPSNR:
                            bestPSNR = avg_PSNR
                            save_path = os.path.join(cfg.output_dir,
                                                     'finish.pth')
                            engine.save_pth(save_path)

                # train
                avg_loss = 0.0
                idx = 0
                model.train()
                for i, train_data in enumerate(train_loader):
                    idx += 1
                    train_data['L'] = train_data['L'].cuda()
                    train_data['H'] = train_data['H'].cuda()
                    optimizer.zero_grad()
                    loss = train_one_step(model, train_data['L'], train_data['H'],
                                          criterion, optimizer,
                                          param_name_to_merge_matrix,
                                          param_name_to_decay_matrix)

                    avg_loss += loss.item()
                    if idx % 100 == 0:
                        print("epoch {}: trained {}".format(epoch, idx))

                scheduler.step()
                avg_loss = avg_loss / idx
                print("epoch {}: total loss : {:<4.2f}, lr : {}".format(
                    epoch, avg_loss,
                    scheduler.get_lr()[0]))
                viz.line(X=[epoch],
                         Y=[avg_loss],
                         win='trainMSELoss-' + itr,
                         opts=dict(title='mse', legend=['train_mse']),
                         update='append')
            # engine.save_pth(os.path.join(cfg.output_dir, 'finish.pth'))

        csgd_prune_and_save(engine=engine,
                            layer_idx_to_clusters=layer_idx_to_clusters,
                            save_file=pruned_weights,
                            succeeding_strategy=succeeding_strategy,
                            new_deps=target_deps)
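For intuition about the matrices driving the C-SGD update in train_one_step: a merge matrix averages gradients within each cluster of filters so clustered filters receive identical updates and collapse together, while the decay matrix adds a centripetal pull toward the cluster mean. A minimal NumPy illustration of the merge matrix (it mirrors the role of generate_merge_matrix_for_kernel but is not the repository's implementation):

import numpy as np

# Three filters: filters 0 and 1 share a cluster, filter 2 stays alone.
clusters = [[0, 1], [2]]
merge = np.zeros((3, 3))
for cl in clusters:
    for i in cl:
        for j in cl:
            merge[i, j] = 1.0 / len(cl)
# merge @ grad replaces each clustered row of the gradient with the
# cluster mean, so clustered filters move in lockstep.
print(merge)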
Example 6
def train_main(local_rank,
               cfg: BaseConfigByEpoch,
               net=None,
               train_dataloader=None,
               val_dataloader=None,
               show_variables=False,
               convbuilder=None,
               init_hdf5=None,
               no_l2_keywords='depth',
               gradient_mask=None,
               use_nesterov=False,
               tensorflow_style_init=False,
               load_weights_keyword=None,
               keyword_to_lr_mult=None,
               auto_continue=False,
               lasso_keyword_to_strength=None,
               save_hdf5_epochs=10000):

    if no_l2_keywords is None:
        no_l2_keywords = []
    if not isinstance(no_l2_keywords, list):
        no_l2_keywords = [no_l2_keywords]

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train',
                         log_dir=cfg.output_dir,
                         file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data -------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name,
                                        cfg.dataset_subset,
                                        cfg.global_batch_size,
                                        distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name,
                                      'val',
                                      global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------------ prepare optimizer, scheduler, criterion -------
        optimizer = get_optimizer(engine,
                                  cfg,
                                  model,
                                  no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov,
                                  keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done -------------------------------

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                broadcast_buffers=False,
            )
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if tensorflow_style_init:
            init_as_tensorflow(model)
        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_hdf5(init_hdf5,
                             load_weights_keyword=load_weights_keyword)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        engine.log("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0

        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        for epoch in range(done_epochs, cfg.max_epochs):

            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_data,
                                             dataset_name=cfg.dataset_name)

                # load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(
                    model,
                    data,
                    label,
                    optimizer,
                    criterion,
                    if_accum_grad,
                    gradient_mask_tensor=gradient_mask_tensor,
                    lasso_keyword_to_strength=lasso_keyword_to_strength)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(tb_tags,
                                          [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and
                                                    engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(
                    os.path.join(cfg.output_dir,
                                 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and \
                    cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            if iteration >= max_iters:
                break

        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters),
                    int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(
                cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log(
                'TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                    COLLECT_TRAIN_LOSS_EPOCHS,
                    collected_train_loss_sum / collected_train_loss_count))
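AvgMeter is used above but not defined in this listing; a minimal sketch consistent with its usage (update() takes a scalar, .mean reads the running average) might be:

class AvgMeter:
    # Hypothetical minimal version; the real class may track more state.
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n

    @property
    def mean(self):
        return self.sum / max(self.count, 1)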
Example 7
class LearningPouring:
    def __init__(self):

        self.head_Joint = ["head_p", "head_y"]

        self.arm_Joint = [
            "l_arm_el_y", "l_arm_sh_p1", "l_arm_sh_p2", "l_arm_sh_r",
            "l_arm_wr_p", "l_arm_wr_r", "l_arm_wr_y", "r_arm_el_y",
            "r_arm_sh_p1", "r_arm_sh_p2", "r_arm_sh_r", "r_arm_wr_p",
            "r_arm_wr_r", "r_arm_wr_y"
        ]

        self.leg_Joint = [
            "l_leg_an_p", "l_leg_an_r", "l_leg_hip_p", "l_leg_hip_r",
            "l_leg_hip_y", "l_leg_kn_p", "r_arm_el_y", "r_leg_an_p",
            "r_leg_an_r", "r_leg_hip_p", "r_leg_hip_r", "r_leg_hip_y",
            "r_leg_kn_p", "torso_y"
        ]

        self.grip_Joint = ["l_arm_grip", "r_arm_grip"]

        # Initialize publisher objects
        self.initPublishers()

        # Initialize Subscriber objects
        self.initSubscribe()

        ### init engine
        name2num = {
            'gripON_L': "gripON_L",
            'gripOFF_L': "gripOFF_L",
            'gripON_R': "gripON_R",
            'gripOFF_R': "gripOFF_R",
            'fLL': 34,
            'bLL': 35,
            'fRR': 36,
            'bRR': 37,
            'fML': 38,
            'bML': 39,
            'fMR': 40,
            'bMR': 41,
            'init': 44,
            'DMPPourRtoL': 28,
            'removeBall': 29
        }

        self.engine = Engine(name2num)

        self.dmp_y0 = np.array([-1.52017496, 0.04908739, 1.41433029])
        self.dmp_goal = np.array([-1.50848603, 0.0591503, 1.44347592])

        load_file_name = "w_0_1_right_3_100_1000.0_0.01_2"
        #load_file_name = raw_input('file name: ')
        load_file_name_list = load_file_name.split('_')
        ### learning ep
        self.ep = int(load_file_name_list[1])
        ### pouring number of ball to the other tube
        self.numofball = int(load_file_name_list[2])
        ### which arm do the pouring motion
        self.pour_arm = load_file_name_list[3]
        n_dmps = int(load_file_name_list[4])
        n_bfs = int(load_file_name_list[5])
        decay = float(load_file_name_list[6])
        dt = float(load_file_name_list[7])
        self.total_ball = float(load_file_name_list[8])

        ### initial DMP
        self.rl = RLDMPs(n_dmps=n_dmps,
                         n_bfs=n_bfs,
                         decay=decay,
                         y0=self.dmp_y0,
                         goal=self.dmp_goal,
                         ay=np.ones(n_dmps) * 10.0,
                         dt=dt)

        self.rl.load_weight(load_file_name)

        print(self.rl.predict().y)
        print("load npy file weight success:")
        print("ep: " + str(self.ep))
        print("pouring " + str(self.numofball) +
              " ball to other tube. Total: " + str(self.total_ball))
        print("using  " + self.pour_arm + " pouring the ball")

        self.costT_list = []

    def initPublishers(self):
        self.pub_joint_ctrl_module = rospy.Publisher(
            '/robotis/set_joint_ctrl_modules', JointCtrlModule, queue_size=10)
        self.pub_action = rospy.Publisher('/robotis/action/page_num',
                                          Int32,
                                          queue_size=10)
        self.pub_IK = rospy.Publisher(
            '/robotis/manipulation/kinematics_pose_msg',
            KinematicsPose,
            queue_size=1)
        self.pub_joint_value = rospy.Publisher('/robotis/set_joint_states',
                                               JointState,
                                               queue_size=10)

        self.fk_pub = rospy.Publisher('/thormang3/fk_set_joint_states',
                                      JointState,
                                      queue_size=10)

        # Wait a bit for the publishers to initialize
        sleep(1)

    def initSubscribe(self):
        #rospy.Subscriber('/robotis/present_joint_states', JointState, self.callback)
        pass

    #def callback(self,msg):
    #    self.joint_pose = dict(zip(msg.name, msg.position))

    # calculate IK via the ROBOTIS IK engine
    def cal_IK(self, name, x, y, z, qx, qy, qz, qw):

        self.set_manipulation_module()
        sleep(0.5)

        pose_msg = KinematicsPose()
        pose_msg.name = name
        pose_msg.pose.position.x = x
        pose_msg.pose.position.y = y
        pose_msg.pose.position.z = z
        pose_msg.pose.orientation.x = qx
        pose_msg.pose.orientation.y = qy
        pose_msg.pose.orientation.z = qz
        pose_msg.pose.orientation.w = qw
        self.pub_IK.publish(pose_msg)

    def get_tube_position(self, arm_type, j):
        rospy.wait_for_service("/thormang3_eureka/cal_fk")

        # Create service object
        fk_srv = rospy.ServiceProxy("/thormang3_eureka/cal_fk", CalFK)

        try:
            # Call service and get response
            fk_resp = fk_srv(arm_type, j[0], j[1], j[2], j[3], j[4], j[5],
                             j[6])
        except rospy.ServiceException as exc:
            print("Failed to call service: " + str(exc))
            raise  # fk_resp would be undefined below, so re-raise here

        oR, oP, oY = euler_from_quaternion(
            [fk_resp.ox, fk_resp.oy, fk_resp.oz, fk_resp.ow])

        # rotation about x; sign fixed so Rx is a proper rotation matrix
        Rx = np.array([[1, 0, 0, 0],
                       [0, np.cos(-oR), -np.sin(-oR), 0],
                       [0, np.sin(-oR), np.cos(-oR), 0],
                       [0, 0, 0, 1]])

        Ry = np.array([[np.cos(oP), 0, np.sin(oP), 0], [0, 1, 0, 0],
                       [-np.sin(oP), 0, np.cos(oP), 0], [0, 0, 0, 1]])

        Rz = np.array([[np.cos(oY), -np.sin(oY), 0, 0],
                       [np.sin(oY), np.cos(oY), 0, 0], [0, 0, 1, 0],
                       [0, 0, 0, 1]])

        bottom = np.array([fk_resp.px, fk_resp.py, fk_resp.pz])
        up = np.array([fk_resp.px, fk_resp.py, fk_resp.pz])

        tube_length = 0.07

        pos = np.array([0, 0, tube_length, 1])
        # rotate j4
        pos = Rx.dot(pos)
        # rotate j5
        pos = Ry.dot(pos)
        # rotate j6
        pos = Rz.dot(pos)
        up[0] += pos[0]
        up[1] += pos[1]
        up[2] += pos[2]
        return bottom, up

    def detect_collision(self, pose, name):

        dic_pos = dict(zip(name, pose))

        right_arm_name = [
            "r_arm_sh_p1", "r_arm_sh_r", "r_arm_sh_p2", "r_arm_el_y",
            "r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p"
        ]
        right_arm_pose = [
            bin2float(3124),
            bin2float(1614),
            bin2float(1029),
            bin2float(1525),
            bin2float(1063),
            bin2float(2084),
            bin2float(2987)
        ]
        r_pose = dict(zip(right_arm_name, right_arm_pose))
        j = []
        for n in right_arm_name:
            if n in name:
                j.append(dic_pos[n])
            else:
                j.append(r_pose[n])

        #j = [self.joint_pose["r_arm_sh_p1"],self.joint_pose["r_arm_sh_r"],self.joint_pose["r_arm_sh_p2"],self.joint_pose["r_arm_el_y"],self.joint_pose["r_arm_wr_r"],self.joint_pose["r_arm_wr_y"],self.joint_pose["r_arm_wr_p"]]
        # Create service object

        # Rb is the point bottom of the right tube, Rt is the point top of the right tube
        Rb, Rt = self.get_tube_position("right_arm", j)

        left_arm_pose = [
            bin2float(1650),
            bin2float(3129),
            bin2float(2560),
            bin2float(2343),
            bin2float(3543),
            bin2float(2137),
            bin2float(1167)
        ]

        # Lb is the point bottom of the left tube, Lt is the point top of the left tube
        Lb, Lt = self.get_tube_position("left_arm", left_arm_pose)

        pA, pB, dis = closestDistanceBetweenLines(Rb,
                                                  Rt,
                                                  Lb,
                                                  Lt,
                                                  clampAll=True)

        return dis
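    # Note: bin2float is not defined in this listing. Given the 0..4095 tick
    # values above, a plausible reading is a Dynamixel-style conversion from
    # a 12-bit position tick (center 2048) to radians, e.g.
    #
    #     def bin2float(tick):
    #         return (tick - 2048) * (np.pi / 2048.0)
    #
    # This is an assumption; the real helper may differ.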

    def execute_path(self, joint_name, traj, delay_time=0.0025):
        assert len(joint_name) == len(traj[0])

        joint = JointState()
        joint.name = joint_name
        joint.velocity = [0.0 for _ in range(len(joint_name))]
        joint.effort = [0.0 for _ in range(len(joint_name))]

        self.set_none_module()

        start = time.time()
        for i in range(len(traj)):
            ts = time.time()
            joint.position = traj[i]
            self.pub_joint_value.publish(joint)
            te = time.time()
            while (te - ts) < delay_time:
                te = time.time()

        end = time.time()
        print("execute_path time is : ", end - start)

    def cal_cost(self, bnum, traj):
        cost = np.zeros((self.rl.timesteps)) + 1e-8
        costT = (self.numofball - bnum)**2
        return cost, costT

    def gripperOpen(self, name):

        self.set_none_module()
        sleep(0.5)
        # open gripper
        joint_msg = JointState()
        joint_msg.name = [name]
        joint_msg.position = [0.0]
        self.pub_joint_value.publish(joint_msg)
        self.set_manipulation_module()

    def gripperClose(self, name):
        self.set_none_module()
        # close gripper
        joint_msg = JointState()
        joint_msg.name = [name]
        joint_msg.position = [1.0]
        self.pub_joint_value.publish(joint_msg)
        self.set_manipulation_module()

    def set_action_modules(self):
        # Set arm to manipulation module
        j = JointCtrlModule()
        j.joint_name = self.arm_Joint + self.leg_Joint + self.head_Joint
        j.module_name = ["action_module" for _ in range(len(j.joint_name))]
        self.pub_joint_ctrl_module.publish(j)

        # Wait a bit for the publishers to set_joint_ctrl_modules
        sleep(0.2)

    def set_manipulation_module(self):

        # Set arm to manipulation module
        j = JointCtrlModule()
        j.joint_name = self.arm_Joint
        j.module_name = [
            "manipulation_module" for _ in range(len(self.arm_Joint))
        ]
        self.pub_joint_ctrl_module.publish(j)

        # Wait a bit for the publishers to set_joint_ctrl_modules
        sleep(0.2)

        # Set gripper to gripper module
        j = JointCtrlModule()
        j.joint_name = self.grip_Joint
        j.module_name = ["gripper_module" for _ in range(len(self.grip_Joint))]
        self.pub_joint_ctrl_module.publish(j)

        # Wait a bit for the publishers to set_joint_ctrl_modules
        sleep(0.2)

    def set_none_module(self):

        # Set arm to none module
        j = JointCtrlModule()
        j.joint_name = self.arm_Joint + self.leg_Joint
        j.module_name = ["none" for _ in range(len(j.joint_name))]
        self.pub_joint_ctrl_module.publish(j)

        # Wait a bit for the publishers to set_joint_ctrl_modules
        sleep(0.2)

        # Set gripper to gripper module
        j = JointCtrlModule()
        j.joint_name = self.grip_Joint
        j.module_name = ["none" for _ in range(len(self.grip_Joint))]
        self.pub_joint_ctrl_module.publish(j)

        # Wait a bit for the publishers to set_joint_ctrl_modules
        sleep(0.2)

    def run(self):
        rospy.init_node('Learning_pouring', anonymous=True)
        rate = rospy.Rate(10)  # 10hz

        while not rospy.is_shutdown():
            page = raw_input('Command: ')

            if page == "pickRR":
                self.set_action_modules()
                plan = ["init"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()

                self.cal_IK(name="right_arm",
                            x=0.460,
                            y=-0.180,
                            z=1.100,
                            qx=0,
                            qy=0,
                            qz=0,
                            qw=1)
                sleep(3)
                self.cal_IK(name="right_arm",
                            x=0.460,
                            y=-0.180,
                            z=0.89,
                            qx=0,
                            qy=0,
                            qz=0,
                            qw=1)
                sleep(3)

                self.gripperOpen("r_arm_grip")

                raw_input("wait the turb")

                self.gripperClose("r_arm_grip")

                self.cal_IK(name="right_arm",
                            x=0.460,
                            y=-0.180,
                            z=1.100,
                            qx=0,
                            qy=0,
                            qz=0,
                            qw=1)
                sleep(3)

                self.set_action_modules()
                plan = ["init"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()

            if page == "pickLL":
                self.set_action_modules()
                plan = ["init"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()

                self.cal_IK(name="left_arm",
                            x=0.460,
                            y=0.180,
                            z=1.100,
                            qx=0,
                            qy=0,
                            qz=0,
                            qw=1)
                sleep(3)
                self.cal_IK(name="left_arm",
                            x=0.460,
                            y=0.180,
                            z=0.89,
                            qx=0,
                            qy=0,
                            qz=0,
                            qw=1)
                sleep(3)

                self.gripperOpen("l_arm_grip")

                raw_input("wait the turb")

                self.gripperClose("l_arm_grip")

                self.cal_IK(name="left_arm",
                            x=0.460,
                            y=0.180,
                            z=1.100,
                            qx=0,
                            qy=0,
                            qz=0,
                            qw=1)
                sleep(3)

                self.set_action_modules()
                plan = ["init"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()

            if page == "cost":
                print(np.load(os.path.dirname(__file__) + "/costT.npy"))

            if page == "save":
                save_name = 'w_'
                save_name = save_name + str(self.ep) + '_'
                save_name = save_name + str(self.numofball) + '_'
                save_name = save_name + self.pour_arm + '_'
                save_name = save_name + str(self.rl.n_dmps) + '_'
                save_name = save_name + str(self.rl.n_bfs) + '_'
                save_name = save_name + str(self.rl.decay) + '_'
                save_name = save_name + str(self.rl.dt) + '_'
                save_name = save_name + str(self.total_ball)
                self.rl.save_weight(save_name)
                np.save(
                    os.path.dirname(__file__) + "/costT.npy",
                    np.array(self.costT_list))
                print("save npy file weight success")

            ### learning pouring
            if page == "l":

                ### prepare
                plan = ["DMPPourRtoL"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()

                print("ep: " + str(self.ep))
                track = self.rl.rollout()
                cost = np.zeros(
                    (self.rl.n_stochastic, self.rl.timesteps)) + 1e-8
                costT = np.zeros((self.rl.n_stochastic)) + 1e-8

                for i in range(self.rl.n_stochastic):
                    raw_input("wait_ball: ")
                    ### detect collision
                    print("random try: " + str(i))
                    #min_dis = 10
                    #for j in range(len(track.y[i])):
                    #    dis = self.detect_collision(track.y[i][j],["r_arm_wr_y"])
                    #    if min_dis > dis:
                    #        min_dis = dis
                    min_dis = 1
                    if min_dis > 0.001:

                        ### execute

                        self.execute_path(
                            ["r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p"],
                            track.y[i])
                        ### calculate cost
                        bnum = float(raw_input("ball number: "))
                        cost[i], costT[i] = self.cal_cost(bnum, track.y[i])
                        if bnum != 0:
                            plan = ["removeBall", "DMPPourRtoL"]
                            self.engine.setPlan(plan)
                            while self.engine.isRunning:
                                self.engine.run()
                            sleep(3)

                    else:
                        print("error: min_dis is ", min_dis)
                        raw_input()
                        costT[i] = -1

                print("total cost:", np.sum(cost) + np.sum(costT))
                self.rl.updatePI(cost, costT)
                self.ep += 1
                self.costT_list.append(costT)

            if page == "p":
                track = self.rl.predict()
                ### prepare
                plan = ["DMPPourRtoL"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()
                sleep(2)

                ### execute
                self.execute_path(["r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p"],
                                  track.y[0])

                plan = ["DMPPourRtoL"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()

                sleep(2)
                plan = ["removeBall", "DMPPourRtoL"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()
                sleep(3)

            if page == "t":
                path = []

                wr_r_p0 = bin2float(1057)
                wr_y_p0 = bin2float(2080)
                wr_p_p0 = bin2float(2970)

                wr_r_p1 = bin2float(948)
                wr_y_p1 = bin2float(1918)
                wr_p_p1 = bin2float(2940)

                wr_r_p2 = bin2float(1067)
                wr_y_p2 = bin2float(2090)
                wr_p_p2 = bin2float(2990)

                genPath(path, wr_r_p0, wr_r_p1, wr_y_p0, wr_y_p1, wr_p_p0,
                        wr_p_p1, 30)
                genPath(path, wr_r_p1, wr_r_p1, wr_y_p1, wr_y_p1, wr_p_p1,
                        wr_p_p1, 20)
                genPath(path, wr_r_p1, wr_r_p2, wr_y_p1, wr_y_p2, wr_p_p1,
                        wr_p_p2, 50)
                path = np.array(path)
                self.execute_path(["r_arm_wr_r", "r_arm_wr_y", "r_arm_wr_p"],
                                  path)

                plan = ["removeBall", "DMPPourRtoL"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()
                sleep(3)

            if page == "init":
                plan = ["init"]
                self.engine.setPlan(plan)
                while self.engine.isRunning:
                    self.engine.run()

            if page == "c":
                self.cal_turb_collision(0)
            if page == "q":
                break
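The rollout / cal_cost / updatePI loop in run() follows the usual PI2-style policy-improvement pattern: exploration rollouts are scored by cost and combined with exponentiated-cost weights. A minimal sketch of that weighting (an assumption about what RLDMPs.updatePI does, since it is not shown here):

import numpy as np

def pi2_update(eps, costs, lam=1.0):
    # eps: [n_rollouts, n_params] exploration noise; costs: [n_rollouts]
    w = np.exp(-(costs - costs.min()) / lam)  # lower cost -> larger weight
    w /= w.sum()
    return (w[:, None] * eps).sum(axis=0)     # cost-weighted noise average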
Example 8
def csgd_train_main(local_rank,
                    cfg: BaseConfigByEpoch,
                    target_deps,
                    succeeding_strategy,
                    pacesetter_dict,
                    centri_strength,
                    pruned_weights,
                    net=None,
                    train_dataloader=None,
                    val_dataloader=None,
                    show_variables=False,
                    convbuilder=None,
                    init_hdf5=None,
                    no_l2_keywords='depth',
                    use_nesterov=False,
                    load_weights_keyword=None,
                    keyword_to_lr_mult=None,
                    auto_continue=False,
                    save_hdf5_epochs=10000):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy')

    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train',
                         log_dir=cfg.output_dir,
                         file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data -------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name,
                                        cfg.dataset_subset,
                                        cfg.global_batch_size,
                                        distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name,
                                      'val',
                                      global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------------ prepare optimizer, scheduler, criterion -------
        if no_l2_keywords is None:
            no_l2_keywords = []
        if not isinstance(no_l2_keywords, list):
            no_l2_keywords = [no_l2_keywords]
        # For a target parameter, cancel its weight decay in the optimizer,
        # because weight decay is later encoded in the decay matrix
        conv_idx = 0
        for k, v in model.named_parameters():
            if v.dim() != 4:
                continue
            print('prune {} from {} to {}'.format(conv_idx,
                                                  cfg.deps[conv_idx],
                                                  target_deps[conv_idx]))
            if target_deps[conv_idx] < cfg.deps[conv_idx]:
                no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'conv'))
                no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'bn'))
            conv_idx += 1
        print('no l2: ', no_l2_keywords)
        optimizer = get_optimizer(engine,
                                  cfg,
                                  model,
                                  no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov,
                                  keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done -------------------------------

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                broadcast_buffers=False,
            )
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_hdf5(init_hdf5,
                             load_weights_keyword=load_weights_keyword)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        #   ===================================== prepare the clusters and matrices for C-SGD ==========
        kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list()

        if os.path.exists(clusters_save_path):
            layer_idx_to_clusters = np.load(clusters_save_path,
                                            allow_pickle=True).item()
        else:
            if local_rank == 0:
                layer_idx_to_clusters = get_layer_idx_to_clusters(
                    kernel_namedvalue_list=kernel_namedvalue_list,
                    target_deps=target_deps,
                    pacesetter_dict=pacesetter_dict)
                if pacesetter_dict is not None:
                    for follower_idx, pacesetter_idx in pacesetter_dict.items():
                        if pacesetter_idx in layer_idx_to_clusters:
                            layer_idx_to_clusters[
                                follower_idx] = layer_idx_to_clusters[
                                    pacesetter_idx]

                np.save(clusters_save_path, layer_idx_to_clusters)
            else:
                while not os.path.exists(clusters_save_path):
                    time.sleep(10)
                    print('sleeping, waiting for process 0 to calculate clusters')
                layer_idx_to_clusters = np.load(clusters_save_path,
                                                allow_pickle=True).item()

        param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
            deps=cfg.deps,
            layer_idx_to_clusters=layer_idx_to_clusters,
            kernel_namedvalue_list=kernel_namedvalue_list)
        add_vecs_to_merge_mat_dicts(param_name_to_merge_matrix)
        param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
            deps=cfg.deps,
            layer_idx_to_clusters=layer_idx_to_clusters,
            kernel_namedvalue_list=kernel_namedvalue_list,
            weight_decay=cfg.weight_decay,
            weight_decay_bias=cfg.weight_decay_bias,
            centri_strength=centri_strength)
        print(param_name_to_decay_matrix.keys())
        print(param_name_to_merge_matrix.keys())

        conv_idx = 0
        param_to_clusters = {}
        for k, v in model.named_parameters():
            if v.dim() != 4:
                continue
            if conv_idx in layer_idx_to_clusters:
                for clsts in layer_idx_to_clusters[conv_idx]:
                    if len(clsts) > 1:
                        param_to_clusters[v] = layer_idx_to_clusters[conv_idx]
                        break
            conv_idx += 1
        #   ============================================================================================

        # ------------ do training ---------------------------- #
        engine.log("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0

        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        for epoch in range(done_epochs, cfg.max_epochs):

            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_data,
                                             dataset_name=cfg.dataset_name)

                # load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(
                    model,
                    data,
                    label,
                    optimizer,
                    criterion,
                    param_name_to_merge_matrix=param_name_to_merge_matrix,
                    param_name_to_decay_matrix=param_name_to_decay_matrix)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(
                            tb_tags,
                        [acc.item(), acc5.item(),
                         loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)
                    deviation_sum = 0
                    for param, clusters in param_to_clusters.items():
                        pvalue = param.detach().cpu().numpy()
                        for cl in clusters:
                            if len(cl) == 1:
                                continue
                            selected = pvalue[cl, :, :, :]
                            mean_kernel = np.mean(selected,
                                                  axis=0,
                                                  keepdims=True)
                            diff = selected - mean_kernel
                            deviation_sum += np.sum(diff**2)
                    tb_writer.add_scalars('deviation_sum',
                                          {'Train': deviation_sum}, iteration)
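                    # deviation_sum is the total squared distance of clustered
                    # kernels from their cluster means; under C-SGD it should
                    # decay toward zero as the filters in each cluster merge.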

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and
                                                    engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(
                    os.path.join(cfg.output_dir,
                                 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and \
                    cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            if iteration >= max_iters:
                break

        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters),
                    int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(
                cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log(
                'TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                    COLLECT_TRAIN_LOSS_EPOCHS,
                    collected_train_loss_sum / collected_train_loss_count))

    if local_rank == 0:
        csgd_prune_and_save(engine=engine,
                            layer_idx_to_clusters=layer_idx_to_clusters,
                            save_file=pruned_weights,
                            succeeding_strategy=succeeding_strategy,
                            new_deps=target_deps)
Example 9
def main():
    """Create the ConResNet model and then start the training."""
    parser = get_arguments()
    print(parser)
    # os.environ["CUDA_VISIBLE_DEVICES"] = '0'

    with Engine(custom_parser=parser) as engine:
        args = parser.parse_args()
        if args.num_gpus > 1:
            torch.cuda.set_device(args.local_rank)

        writer = SummaryWriter(args.snapshot_dir)

        d, h, w = map(int, args.input_size.split(','))
        input_size = (d, h, w)

        cudnn.benchmark = True
        seed = args.random_seed
        if engine.distributed:
            seed = args.local_rank
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)

        model = ConResNet(input_size,
                          num_classes=args.num_classes,
                          weight_std=True)
        model.train()
        device = torch.device('cuda:{}'.format(args.local_rank))
        model.to(device)

        optimizer = optim.Adam(
            [{
                'params': filter(lambda p: p.requires_grad,
                                 model.parameters()),
                'lr': args.learning_rate
            }],
            lr=args.learning_rate,
            weight_decay=args.weight_decay)

        if args.num_gpus > 1:
            model = engine.data_parallel(model)

        # load checkpoint...
        if args.reload_from_checkpoint:
            print('loading from checkpoint: {}'.format(args.reload_path))
            if os.path.exists(args.reload_path):
                model.load_state_dict(
                    torch.load(args.reload_path,
                               map_location=torch.device('cpu')))
            else:
                print('File does not exist at the reload path: {}'.format(
                    args.reload_path))

        loss_D = loss.DiceLoss4BraTS().to(device)
        loss_BCE = loss.BCELoss4BraTS().to(device)

        loss_B = loss.BCELossBoud().to(device)

        if not os.path.exists(args.snapshot_dir):
            os.makedirs(args.snapshot_dir)

        trainloader, train_sampler = engine.get_train_loader(
            BraTSDataSet(args.data_dir,
                         args.train_list,
                         max_iters=args.num_steps * args.batch_size,
                         crop_size=input_size,
                         scale=args.random_scale,
                         mirror=args.random_mirror))
        valloader, val_sampler = engine.get_test_loader(
            BraTSValDataSet(args.data_dir, args.val_list))

        for i_iter, batch in enumerate(trainloader):
            i_iter += args.start_iters
            images, images_res, labels, labels_res = batch
            images = images.cuda()
            images_res = images_res.cuda()
            labels = labels.cuda()
            labels_res = labels_res.cuda()

            optimizer.zero_grad()
            lr = adjust_learning_rate(optimizer, i_iter, args.learning_rate,
                                      args.num_steps, args.power)
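            # (adjust_learning_rate is not shown in this snippet; given the
            #  (i_iter, base_lr, num_steps, power) signature it is presumably
            #  the usual polynomial decay,
            #  lr = base_lr * (1 - i_iter / num_steps) ** power,
            #  applied to every param group and returned for logging.)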

            preds = model([images, images_res])
            preds_seg = preds[0]
            preds_res = preds[1]
            preds_resx2 = preds[2]
            preds_resx4 = preds[3]

            term_seg_Dice = loss_D.forward(preds_seg, labels)
            term_seg_BCE = loss_BCE.forward(preds_seg, labels)

            term_res_BCE = loss_B.forward(preds_res, labels_res)
            term_resx2_BCE = loss_B.forward(preds_resx2, labels_res)
            term_resx4_BCE = loss_B.forward(preds_resx4, labels_res)

            term_all = term_seg_Dice + term_seg_BCE + term_res_BCE + 0.5 * (
                term_resx2_BCE + term_resx4_BCE)
            term_all.backward()

            optimizer.step()

            if i_iter % 100 == 0 and (args.local_rank == 0):
                writer.add_scalar('learning_rate', lr, i_iter)
                writer.add_scalar('loss', term_all.cpu().data.numpy(), i_iter)

            print(
                'iter = {} of {} completed, lr = {:.4}, seg_loss = {:.4}, res_loss = {:.4}'
                .format(i_iter, args.num_steps, lr,
                        (term_seg_Dice + term_seg_BCE).cpu().data.numpy(),
                        (term_res_BCE + term_resx2_BCE +
                         term_resx4_BCE).cpu().data.numpy()))

            if i_iter >= args.num_steps - 1 and (args.local_rank == 0):
                print('save last model ...')
                torch.save(
                    model.state_dict(),
                    osp.join(args.snapshot_dir,
                             'ConResNet_' + str(args.num_steps) + '.pth'))
                break

            if i_iter % args.val_pred_every == 0 and i_iter != 0 and (
                    args.local_rank == 0):
                print('save model ...')
                torch.save(
                    model.state_dict(),
                    osp.join(args.snapshot_dir,
                             'ConResNet_' + str(i_iter) + '.pth'))

            # val
            if i_iter % args.val_pred_every == 0:
                print('validate ...')
                val_ET, val_WT, val_TC = validate(input_size, model, valloader,
                                                  args.num_classes)
                if (args.local_rank == 0):
                    writer.add_scalar('Val_ET_Dice', val_ET, i_iter)
                    writer.add_scalar('Val_WT_Dice', val_WT, i_iter)
                    writer.add_scalar('Val_TC_Dice', val_TC, i_iter)
                    print(
                        'Validate iter = {}, ET = {:.2}, WT = {:.2}, TC = {:.2}'
                        .format(i_iter, val_ET, val_WT, val_TC))

    end = timeit.default_timer()
    print(end - start, 'seconds')
Example 10
from utils.board import Board  # assumed import; Board's module is not shown in the original snippet
from utils.engine import Engine

if __name__ == '__main__':
    size = input("Entrez la taille du plateau (inf à 26 et pair): ")
    while not size.isdigit() \
            or int(size) < 4 or int(size) > 26 \
            or int(size) % 2 == 1:
        size = input("Err, entrez la taille du plateau (inf à 26 et pair): ")

    width = height = int(size)

    players = input("Joueur contre une IA ? [O/n] ")
    while players.lower() not in [
            'yes', 'y', 'oui', 'o', 'no', 'n', 'non', ''
    ]:
        players = input("Err, joueur contre une IA ? [O/n] ")

    # lower() so that e.g. "N" or "Non" also select two human players,
    # consistent with the case-insensitive validation above
    players = 2 if players.lower() in ['no', 'n', 'non'] else 1

    board = Board(width, height)
    board.make_board()

    engine = Engine(board, players)
    engine.start()

    try:
        while engine.is_playing:
            engine.get_action()
    except (KeyboardInterrupt, EOFError):
        engine.stop()
Example 11
def ding_train(cfg: BaseConfigByEpoch,
               net=None,
               train_dataloader=None,
               val_dataloader=None,
               show_variables=False,
               convbuilder=None):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(cfg) as engine:

        is_main_process = (engine.world_rank == 0)  #TODO correct?

        logger = engine.setup_log(name='train',
                                  log_dir=cfg.output_dir,
                                  file_name='log.txt')

        # -- typical model components: model, opt, scheduler, dataloader --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)

        if convbuilder is None:
            convbuilder = ConvBuilder()

        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name,
                                              cfg.dataset_subset,
                                              cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name,
                                            'val',
                                            batch_size=100)  #TODO 100?

        print('NOTE: Data prepared')
        print(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))

        # device = torch.device(cfg.device)
        # model.to(device)
        # model.cuda()

        optimizer = get_optimizer(cfg, model)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(
                engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[engine.world_rank],
                broadcast_buffers=False,
            )
            # model = DistributedDataParallel(model, delay_allreduce=True)

        if engine.continue_state_object:
            engine.restore_checkpoint()
        else:
            if cfg.init_weights:
                engine.load_checkpoint(cfg.init_weights, is_restore=False)

        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        logger.info("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch

        for epoch in range(done_epochs, cfg.max_epochs):

            pbar = tqdm(range(iters_per_epoch))
            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                model.eval()
                val_iters = 500 if cfg.dataset_name == 'imagenet' else 100  # use batch_size=100 for val on ImageNet and CIFAR
                eval_dict = run_eval(val_dataloader,
                                     val_iters,
                                     model,
                                     criterion,
                                     discrip_str,
                                     dataset_name=cfg.dataset_name)
                val_top1_value = eval_dict['top1'].item()
                val_top5_value = eval_dict['top5'].item()
                val_loss_value = eval_dict['loss'].item()
                for tag, value in zip(
                        tb_tags,
                    [val_top1_value, val_top5_value, val_loss_value]):
                    tb_writer.add_scalars(tag, {'Val': value}, iteration)
                engine.log(
                    'validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
                    .format(epoch, val_top1_value, val_top5_value,
                            val_loss_value))
                model.train()

            for _ in pbar:

                scheduler.step()

                start_time = time.time()
                data, label = load_cuda_data(train_dataloader,
                                             cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)
                acc, acc5, loss = train_one_step(model, data, label, optimizer,
                                                 criterion, if_accum_grad)
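                # (train_one_step is defined elsewhere; judging by if_accum_grad,
                #  it presumably calls loss.backward() on every iteration but only
                #  runs optimizer.step() / zero_grad() when if_accum_grad is False,
                #  i.e. gradients are accumulated over cfg.grad_accum_iters batches.)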

                if iteration % cfg.tb_iter_period == 0 and is_main_process:
                    for tag, value in zip(
                            tb_tags,
                        [acc.item(), acc5.item(),
                         loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed
                                                    and is_main_process):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                iteration += 1
                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            if iteration >= max_iters:
                break
        #   do something after the training
        engine.save_checkpoint(cfg.save_weights)
        print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
Example 12
from utils.file_manager import FileManager
from utils.engine import Engine

B, L, D, libraries, books_scores, picked_books = FileManager.read_file('a_example.txt')


engine = Engine(libraries=libraries, D=D, books_scores=books_scores, picked_books=picked_books)
output: list = engine.start()

FileManager.write_file('a.txt', output)
Example 13
def aofp_train_main(local_rank,
                    target_layers,
                    succ_strategy,
                    warmup_iterations,
                    aofp_batches_per_half,
                    flops_func,
                    cfg: BaseConfigByEpoch,
                    net=None,
                    train_dataloader=None,
                    val_dataloader=None,
                    show_variables=False,
                    convbuilder=None,
                    init_hdf5=None,
                    no_l2_keywords='depth',
                    gradient_mask=None,
                    use_nesterov=False,
                    tensorflow_style_init=False,
                    keyword_to_lr_mult=None,
                    auto_continue=False,
                    lasso_keyword_to_strength=None,
                    save_hdf5_epochs=10000,
                    remain_flops_ratio=0):

    if no_l2_keywords is None:
        no_l2_keywords = []
    if type(no_l2_keywords) is not list:
        no_l2_keywords = [no_l2_keywords]

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train',
                         log_dir=cfg.output_dir,
                         file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data -------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name,
                                        cfg.dataset_subset,
                                        cfg.global_batch_size,
                                        distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name,
                                      'val',
                                      global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------------ prepare optimizer, scheduler, criterion -------
        optimizer = get_optimizer(engine,
                                  cfg,
                                  model,
                                  no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov,
                                  keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done -------------------------------

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                broadcast_buffers=False,
            )
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if tensorflow_style_init:
            init_as_tensorflow(model)
        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_part('base_path.', init_hdf5)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        engine.log("\n\nStart training with pytorch version {}".format(
            torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0

        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        #########################   aofp
        _init_interval = aofp_batches_per_half // len(target_layers)
        layer_to_start_iter = {
            i: (_init_interval * i + warmup_iterations)
            for i in target_layers
        }
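        #   e.g. with warmup_iterations=1000, aofp_batches_per_half=2000 and
        #   target_layers=[0, 1, 2, 3], _init_interval = 500 and the searches
        #   start at iterations 1000, 1500, 2000 and 2500 respectively, which
        #   staggers the extra cost of the AOFP search across layers.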
        print(
            'the initial layer_to_start_iter = {}'.format(layer_to_start_iter))
        #   0.  get all the AOFPLayers
        layer_idx_to_module = {}
        for submodule in model.modules():
            if hasattr(submodule, 'score_mask') or hasattr(
                    submodule, 't_value'):
                layer_idx_to_module[submodule.conv_idx] = submodule
        print(layer_idx_to_module)
        ######################################

        for epoch in range(done_epochs, cfg.max_epochs):

            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_data,
                                             dataset_name=cfg.dataset_name)

                # load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()

                ############    aofp
                #   1.  see if it is time to start on every layer
                #   2.  forward and accumulate
                #   3.  if a half on some layer is finished, do something
                #   ----    fetch its accumulated t vectors, analyze the first 'granu' elements
                #   ----    if good enough, set the base mask, reset the search space
                #   ----    elif granu == 1, do nothing
                #   ----    else, granu /= 2, reset the search space
                for layer_idx, start_iter in layer_to_start_iter.items():
                    if start_iter == iteration:
                        layer_idx_to_module[layer_idx].start_aofp(iteration)
                acc, acc5, loss = train_one_step(
                    model,
                    data,
                    label,
                    optimizer,
                    criterion,
                    if_accum_grad,
                    gradient_mask_tensor=gradient_mask_tensor,
                    lasso_keyword_to_strength=lasso_keyword_to_strength)
                for layer_idx, aofp_layer in layer_idx_to_module.items():
                    #   accumulate
                    if layer_idx not in succ_strategy:
                        continue
                    follow_layer_idx = succ_strategy[layer_idx]
                    if follow_layer_idx not in layer_idx_to_module:
                        continue
                    t_value = layer_idx_to_module[follow_layer_idx].t_value
                    aofp_layer.accumulate_t_value(t_value)
                    if aofp_layer.finished_a_half(iteration):
                        aofp_layer.halve_or_stop(iteration)
                ###################################

                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(
                            tb_tags,
                        [acc.item(), acc5.item(),
                         loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and
                                                    engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(
                    os.path.join(cfg.output_dir,
                                 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and \
                    cfg.val_epoch_period > 0 and (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch,
                                 iteration=iteration,
                                 tb_tags=tb_tags,
                                 engine=engine,
                                 model=model,
                                 val_data=val_data,
                                 criterion=criterion,
                                 descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE,
                                 tb_writer=tb_writer)

            cur_deps = np.array(cfg.deps)
            for submodule in model.modules():
                if hasattr(submodule, 'base_mask'):
                    cur_deps[submodule.conv_idx] = np.sum(
                        submodule.base_mask.cpu().numpy() == 1)
            origin_flops = flops_func(cfg.deps)
            cur_flops = flops_func(cur_deps)
            remain_ratio = cur_flops / origin_flops
            if local_rank == 0:
                print('##########################')
                print('origin deps ', cfg.deps)
                print('cur deps ', cur_deps)
                print('remain flops ratio = ', remain_ratio, 'the target is ',
                      remain_flops_ratio)
                print('##########################')
            if remain_ratio < remain_flops_ratio:
                break
            if iteration >= max_iters:
                break

        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters),
                    int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(
                cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log(
                'TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                    COLLECT_TRAIN_LOSS_EPOCHS,
                    collected_train_loss_sum / collected_train_loss_count))

        final_deps = aofp_prune(model,
                                origin_deps=cfg.deps,
                                succ_strategy=succ_strategy,
                                save_path=os.path.join(cfg.output_dir,
                                                       'finish_pruned.hdf5'))
        origin_flops = flops_func(cfg.deps)
        cur_flops = flops_func(final_deps)
        engine.log(
            '##################################################################'
        )
        engine.log(cfg.network_type)
        engine.log('origin width: {} , flops {} '.format(
            cfg.deps, origin_flops))
        engine.log('final width: {}, flops {} '.format(final_deps, cur_flops))
        engine.log('flops reduction: {}'.format(1 - cur_flops / origin_flops))
        return final_deps
Example 14
    def __init__(self):

        self.head_Joint = ["head_p", "head_y"]

        self.arm_Joint = [
            "l_arm_el_y", "l_arm_sh_p1", "l_arm_sh_p2", "l_arm_sh_r",
            "l_arm_wr_p", "l_arm_wr_r", "l_arm_wr_y", "r_arm_el_y",
            "r_arm_sh_p1", "r_arm_sh_p2", "r_arm_sh_r", "r_arm_wr_p",
            "r_arm_wr_r", "r_arm_wr_y"
        ]

        self.leg_Joint = [
            "l_leg_an_p", "l_leg_an_r", "l_leg_hip_p", "l_leg_hip_r",
            "l_leg_hip_y", "l_leg_kn_p", "r_arm_el_y", "r_leg_an_p",
            "r_leg_an_r", "r_leg_hip_p", "r_leg_hip_r", "r_leg_hip_y",
            "r_leg_kn_p", "torso_y"
        ]

        self.grip_Joint = ["l_arm_grip", "r_arm_grip"]

        # Initialize publisher objects
        self.initPublishers()

        # Initialize Subscriber objects
        self.initSubscribe()

        ### init engine
        name2num = {
            'gripON_L': "gripON_L",
            'gripOFF_L': "gripOFF_L",
            'gripON_R': "gripON_R",
            'gripOFF_R': "gripOFF_R",
            'fLL': 34,
            'bLL': 35,
            'fRR': 36,
            'bRR': 37,
            'fML': 38,
            'bML': 39,
            'fMR': 40,
            'bMR': 41,
            'init': 44,
            'DMPPourRtoL': 28,
            'removeBall': 29
        }

        self.engine = Engine(name2num)

        self.dmp_y0 = np.array([-1.52017496, 0.04908739, 1.41433029])
        self.dmp_goal = np.array([-1.50848603, 0.0591503, 1.44347592])

        load_file_name = "w_0_1_right_3_100_1000.0_0.01_2"
        #load_file_name = raw_input('file name: ')
        load_file_name_list = load_file_name.split('_')
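        # filename layout: w_{ep}_{numofball}_{pour_arm}_{n_dmps}_{n_bfs}_{decay}_{dt}_{total_ball},
        # so "w_0_1_right_3_100_1000.0_0.01_2" decodes to ep=0, 1 ball,
        # right arm, 3 DMPs, 100 basis functions, decay=1000.0, dt=0.01,
        # 2 balls in total.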
        ### learning episode
        self.ep = int(load_file_name_list[1])
        ### number of balls to pour into the other tube
        self.numofball = int(load_file_name_list[2])
        ### which arm performs the pouring motion
        self.pour_arm = load_file_name_list[3]
        n_dmps = int(load_file_name_list[4])
        n_bfs = int(load_file_name_list[5])
        decay = float(load_file_name_list[6])
        dt = float(load_file_name_list[7])
        self.total_ball = float(load_file_name_list[8])

        ### initialize the DMP
        self.rl = RLDMPs(n_dmps=n_dmps,
                         n_bfs=n_bfs,
                         decay=decay,
                         y0=self.dmp_y0,
                         goal=self.dmp_goal,
                         ay=np.ones(n_dmps) * 10.0,
                         dt=dt)

        self.rl.load_weight(load_file_name)

        print(self.rl.predict().y)
        print("load npy file weight success:")
        print("ep: " + str(self.ep))
        print("pouring " + str(self.numofball) +
              " ball to other tube. Total: " + str(self.total_ball))
        print("using  " + self.pour_arm + " pouring the ball")

        self.costT_list = []
Example 15
def csgd_prune_and_save(engine:Engine, layer_idx_to_clusters, save_file, succeeding_strategy, new_deps):
    result = OrderedDict()

    succeeding_map = parse_succeeding_strategy(succeeding_strategy=succeeding_strategy, layer_idx_to_clusters=layer_idx_to_clusters)

    kernel_namedvalues = engine.get_all_kernel_namedvalue_as_list()

    for layer_idx, namedvalue in enumerate(kernel_namedvalues):
        if layer_idx not in layer_idx_to_clusters:
            continue

        k_name = namedvalue.name
        k_value = namedvalue.value
        if k_name in result:                    # this kernel may already have been modified because it follows another pruned layer
            k_value = result[k_name]

        clusters = layer_idx_to_clusters[layer_idx]

        #   Prune the kernel
        idx_to_delete = []
        for clst in clusters:
            idx_to_delete += clst[1:]
        kernel_value_pruned = delete_or_keep(k_value, idx_to_delete, axis=0)
        print('cur kernel name: {}, from {} to {}'.format(k_name, k_value.shape, kernel_value_pruned.shape))
        result[k_name] = kernel_value_pruned
        assert new_deps[layer_idx] == kernel_value_pruned.shape[0]
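        #   e.g. with clusters [[0, 2], [1], [3]], idx_to_delete = [2]: filter 2
        #   has been driven to the same value as filter 0 by C-SGD, so deleting
        #   it (delete_or_keep presumably wraps np.delete along the given axis)
        #   leaves the layer's function unchanged.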

        #   Prune the related vector params
        def handle_vecs(key_name):
            vec_name = k_name.replace('conv.weight', key_name)      # Assume the names of conv kernel and bn params follow such a pattern.
            vec_value = engine.get_param_value_by_name(vec_name)
            if vec_value is not None:
                vec_value_pruned = delete_or_keep(vec_value, idx_to_delete)
                result[vec_name] = vec_value_pruned

        handle_vecs('conv.bias')
        handle_vecs('bn.weight')
        handle_vecs('bn.bias')
        handle_vecs('bn.running_mean')
        handle_vecs('bn.running_var')

        #   Handle the succeeding kernels
        if layer_idx not in succeeding_map:
            continue

        follows = succeeding_map[layer_idx]
        print('{} follows {}'.format(follows, layer_idx))
        if type(follows) is not list:
            follows = [follows]

        for follow_idx in follows:
            follow_kernel_value = kernel_namedvalues[follow_idx].value
            follow_kernel_name = kernel_namedvalues[follow_idx].name
            if follow_kernel_name in result:
                follow_kernel_value = result[follow_kernel_name]
            print('following kernel name: ', follow_kernel_name, 'origin shape: ', follow_kernel_value.shape)

            if follow_kernel_value.ndim == 2:  # The following is a FC layer
                fc_idx_to_delete = []
                num_filters = k_value.shape[0]
                fc_neurons_per_conv_kernel = follow_kernel_value.shape[1] // num_filters
                print('{} filters, {} neurons per kernel'.format(num_filters, fc_neurons_per_conv_kernel))
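                # e.g. a (512 x 256*49) FC weight following a 256-filter conv
                # over 7x7 feature maps gives fc_neurons_per_conv_kernel = 49;
                # below, the 49 columns of every pruned filter are summed into
                # the columns of the filter kept from its cluster, then deleted.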
                for clst in clusters:
                    if len(clst) == 1:
                        continue
                    for i in clst[1:]:
                        fc_idx_to_delete.append(np.arange(i * fc_neurons_per_conv_kernel, (i + 1) * fc_neurons_per_conv_kernel))
                    to_concat = []
                    for i in clst:
                        corresponding_neurons_idx = np.arange(i * fc_neurons_per_conv_kernel, (i + 1) * fc_neurons_per_conv_kernel)
                        to_concat.append(np.expand_dims(follow_kernel_value[:, corresponding_neurons_idx], axis=0))
                    summed = np.sum(np.concatenate(to_concat, axis=0), axis=0)
                    reserved_idx = np.arange(clst[0] * fc_neurons_per_conv_kernel, (clst[0] + 1) * fc_neurons_per_conv_kernel)
                    follow_kernel_value[:, reserved_idx] = summed
                if len(fc_idx_to_delete) > 0:
                    follow_kernel_value = delete_or_keep(follow_kernel_value, np.concatenate(fc_idx_to_delete, axis=0), axis=1)
                result[follow_kernel_name] = follow_kernel_value
                print('shape of pruned following kernel: ', follow_kernel_value.shape)
            elif follow_kernel_value.ndim == 4:     # The following is a conv layer
                for clst in clusters:
                    selected_k_follow = follow_kernel_value[:, clst, :, :]
                    summed_k_follow = np.sum(selected_k_follow, axis=1)
                    follow_kernel_value[:, clst[0], :, :] = summed_k_follow
                follow_kernel_value = delete_or_keep(follow_kernel_value, idx_to_delete, axis=1)
                result[follow_kernel_name] = follow_kernel_value
                print('shape of pruned following kernel: ', follow_kernel_value.shape)
            else:
                raise ValueError('wrong ndim of kernel')

    key_variables = engine.state_values()
    for name, value in key_variables.items():
        if name not in result:
            result[name] = value

    result['deps'] = new_deps

    print('save {} values to {} after pruning'.format(len(result), save_file))
    save_hdf5(result, save_file)
Example 16
def main(cfg, cuda_avail=torch.cuda.is_available()):
    ### flush cfg to the output log file:
    tqdm.write(str(cfg), file=cfg['logfile'])
    tqdm.write('-' * 80, file=cfg['logfile'])

    ### define function that returns a data loader:
    def get_iterator(mode='train'):
        # choose between train/valid data based on `mode`:
        if mode == 'train':
            datasets = cfg['train_data_paths']
            pin_memory_flag = (cuda_avail and cfg['cuda'])
            num_workers_setting = 4
        elif mode == 'valid':
            datasets = cfg['valid_data_paths']
            pin_memory_flag = False
            num_workers_setting = 1
        else:
            raise ValueError('unknown mode: {}'.format(mode))

        # form a (possibly concatenated) dataset:
        ds = SeqTensorDataset(torch.load(datasets[0][0]),
                              torch.load(datasets[0][1]),
                              torch.load(datasets[0][2]),
                              torch.load(datasets[0][3]))
        for dataset in datasets[1:]:
            ds += SeqTensorDataset(torch.load(dataset[0]),
                                   torch.load(dataset[1]),
                                   torch.load(dataset[2]),
                                   torch.load(dataset[3]))

        # return a loader that iterates over the dataset of choice; pagelock the memory location if GPU detected:
        return DataLoader(ds,
                          batch_size=cfg['batch_size'],
                          shuffle=True,
                          num_workers=num_workers_setting,
                          collate_fn=sequence_collate_fn,
                          pin_memory=pin_memory_flag)

    ### build RawCTCNet model:
    in_dim = 1
    layers = [(256, 256, d, 3)
              for d in [1, 2, 4, 8, 16, 32, 64]] * cfg['num_stacks']
    num_labels = 5
    out_dim = 512
    network = RawCTCNet(in_dim,
                        num_labels,
                        layers,
                        out_dim,
                        input_kw=1,
                        input_dil=1,
                        positions=True,
                        softmax=False,
                        causal=False,
                        batch_norm=True)
    print("Constructed network.")
    if (cuda_avail and cfg['cuda']):
        print("CUDA detected; placed network on GPU.")
        network.cuda()
    if cfg['model'] is not None:
        print("Loading model file...")
        try:
            network.load_state_dict(torch.load(cfg['model']))
        except Exception:
            print(
                "ERR: could not restore model. Check model datatype/dimensions."
            )

    ### build CTCLoss and model evaluation function:
    ctc_loss_fn = CTCLoss()
    print("Constructed CTC loss function.")
    maybe_gpu = lambda tsr, has_cuda: tsr if not has_cuda else tsr.cuda()

    #--- this function performs the gradient descent in synchronous batched mode:
    def batch_model_loss(sample):
        # unpack inputs and wrap as `torch.autograd.Variable`s:
        signals_, signal_lengths_, sequences_, sequence_lengths_ = sample
        signals = Variable(
            maybe_gpu(signals_.permute(0, 2, 1),
                      (cuda_avail and cfg['cuda'])))  # BxTxD => BxDxT
        signal_lengths = Variable(signal_lengths_)
        sequences = Variable(concat_labels(sequences_, sequence_lengths_))
        sequence_lengths = Variable(sequence_lengths_)
        # compute predicted labels:
        transcriptions = network(signals).permute(2, 0,
                                                  1)  # Permute: BxDxT => TxBxD
        # compute CTC loss and return:
        loss = ctc_loss_fn(transcriptions, sequences.int(),
                           signal_lengths.int(), sequence_lengths.int())
        loss.backward()
        return loss, transcriptions
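    # (note: backward() already runs inside batch_model_loss; the Engine used
    #  here presumably only performs zero_grad()/step() around each call,
    #  unlike the stock torchnet engine, which calls backward() itself.)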

    #--- for evaluation-mode, batch-parallel:
    def batch_model_eval(sample):
        # unpack inputs and wrap as `torch.autograd.Variable`s:
        signals_, signal_lengths_, sequences_, sequence_lengths_ = sample
        signals = Variable(maybe_gpu(signals_.permute(0, 2, 1),
                                     (cuda_avail and cfg['cuda'])),
                           volatile=True)  # BxTxD => BxDxT
        signal_lengths = Variable(signal_lengths_, volatile=True)
        sequences = Variable(concat_labels(sequences_, sequence_lengths_),
                             volatile=True)
        sequence_lengths = Variable(sequence_lengths_, volatile=True)
        # compute predicted labels:
        transcriptions = network(signals).permute(2, 0,
                                                  1)  # Permute: BxDxT => TxBxD
        # compute CTC loss and return:
        loss = ctc_loss_fn(transcriptions, sequences.int(),
                           signal_lengths.int(), sequence_lengths.int())
        return loss, transcriptions

    #--- asynchronous gradient accumulation mode
    # compute target seqs/losses sequentially over each example, average gradients
    def async_model_loss(sample):
        # unpack inputs, optionally place on CUDA:
        signals_, signal_lengths_, sequences_, sequence_lengths_ = sample
        signals = maybe_gpu(signals_.permute(0, 2, 1),
                            (cuda_avail and cfg['cuda']))  # BxTxD => BxDxT

        # sequential compute over the batch:
        total_loss = 0.0
        transcriptions_list = []
        bsz = signals.size(0)
        for k in range(bsz):
            # fetch k-th input from batched sample and wrap as Variable:
            sig_k_scalar = signal_lengths_[k]
            seq_k_scalar = sequence_lengths_[k]
            sig_k_length = Variable(torch.IntTensor([sig_k_scalar]))
            seq_k_length = Variable(torch.IntTensor([seq_k_scalar]))
            signal_k = Variable(signals[k, :, :sig_k_scalar].unsqueeze(0))
            sequence_k = Variable(sequences_[k, :seq_k_scalar].unsqueeze(0))

            # compute transcription output:
            trans_k = network(signal_k).permute(2, 0,
                                                1)  # Permute: 1xDxT => Tx1xD

            # compute normalized CTC loss and accumulate gradient:
            loss = ctc_loss_fn(trans_k, sequence_k.int(), sig_k_length.int(),
                               seq_k_length.int())
            loss.backward()
            total_loss += loss
            transcriptions_list.append(trans_k)

        # combine transcriptions back into a batch and return:
        max_length = max([t.size(0) for t in transcriptions_list])
        transcriptions = Variable(torch.zeros(max_length, bsz, num_labels))
        for j, tr in enumerate(transcriptions_list):
            transcriptions[0:tr.size(0), j, :] = tr[:, 0, :]
        return total_loss, transcriptions

    #--- asynchronous gradient accumulation mode
    # compute target seqs/losses sequentially over each example, average gradients
    def async_model_eval(sample):
        # unpack inputs, optionally place on CUDA:
        signals_, signal_lengths_, sequences_, sequence_lengths_ = sample
        signals = maybe_gpu(signals_.permute(0, 2, 1),
                            (cuda_avail and cfg['cuda']))  # BxTxD => BxDxT

        # sequential compute over the batch:
        total_loss = 0.0
        transcriptions_list = []
        bsz = signals.size(0)
        for k in range(bsz):
            # fetch k-th input from batched sample and wrap as Variable:
            sig_k_scalar = signal_lengths_[k]
            seq_k_scalar = sequence_lengths_[k]
            sig_k_length = Variable(torch.IntTensor([sig_k_scalar]),
                                    volatile=True)
            seq_k_length = Variable(torch.IntTensor([seq_k_scalar]),
                                    volatile=True)
            signal_k = Variable(signals[k, :, :sig_k_scalar].unsqueeze(0),
                                volatile=True)
            sequence_k = Variable(sequences_[k, :seq_k_scalar].unsqueeze(0),
                                  volatile=True)

            # compute transcription output:
            trans_k = network(signal_k).permute(2, 0,
                                                1)  # Permute: 1xDxT => Tx1xD

            # compute CTC loss and accumulate the running total (no backward in eval):
            loss = ctc_loss_fn(trans_k, sequence_k.int(), sig_k_length.int(),
                               seq_k_length.int())
            total_loss += loss
            transcriptions_list.append(trans_k)

        # combine transcriptions back into a batch and return:
        max_length = max([t.size(0) for t in transcriptions_list])
        transcriptions = Variable(torch.zeros(max_length, bsz, num_labels),
                                  volatile=True)
        for j, tr in enumerate(transcriptions_list):
            transcriptions[0:tr.size(0), j, :] = tr[:, 0, :]
        return total_loss, transcriptions

    #--- choose appropriate model loss/eval functions depending on command line argument:
    model_loss = async_model_loss if cfg['async'] else batch_model_loss
    model_eval = async_model_eval if cfg['async'] else batch_model_eval

    ### build optimizer and LR scheduler:
    if (cfg['optim'] == 'adamax'):
        opt = optim.Adamax(network.parameters(), lr=cfg['lr'])
    elif (cfg['optim'] == 'adam'):
        opt = optim.Adam(network.parameters(), lr=cfg['lr'])
    else:
        raise Exception("Optimizer not recognized!")
    sched = ReduceLROnPlateau(opt, mode='min', patience=5)
    print("Constructed {} optimizer.".format(cfg['optim']))

    ### build beam search decoder:
    beam_labels = [' ', 'A', 'G', 'C', 'T']
    beam_blank_id = 0
    beam_decoder = CTCBeamDecoder(beam_labels,
                                  beam_width=100,
                                  blank_id=beam_blank_id,
                                  num_processes=4)
    print("Constructed CTC beam search decoder.")

    ### build engine, meters, and hooks:
    engine = Engine()
    loss_meter = tnt.meter.MovingAverageValueMeter(windowsize=5)
    print("Constructed engine. Running training loop...")

    #-- hook: reset all meters
    def reset_all_meters():
        loss_meter.reset()

    #-- hook: don't do anything for now when obtaining a data sample
    def on_sample(state):
        pass

    #-- hook: don't do anything on gradient update for now
    def on_update(state):
        pass

    #-- hook: update loggers at each forward pass
    def on_forward(state):
        loss_meter.add(state['loss'].data[0])
        if (state['t'] % cfg['print_every'] == 0):
            tqdm.write("Step: {0} | Loss: {1}".format(state['t'],
                                                      state['loss'].data[0]),
                       file=cfg['logfile'])

    #-- hook: reset all meters at the start of the epoch
    def on_start_epoch(state):
        reset_all_meters()
        network.train()  # set to training mode for batch norm
        state['iterator'] = tqdm(state['iterator'])

    #-- hook: perform validation and beam-search-decoding at end of each epoch:
    def on_end_epoch(state):
        network.eval()  # set to validation mode for batch-norm
        # K steps of validation; average the loss:
        val_losses = []
        base_seqs = []
        val_data_iterator = get_iterator('valid')
        for k, val_sample in enumerate(val_data_iterator):
            if k > cfg['num_valid_steps']: break
            val_loss, transcriptions = model_eval(val_sample)
            val_losses.append(val_loss.data[0])
            sequences = val_sample[2]
            # mask out the padding & permute (TxBxD => BxTxD):
            scores = mask_padding(transcriptions.permute(1, 0, 2),
                                  val_sample[1],
                                  fill_logit_idx=0)
            logits = F.softmax(scores, dim=2)
            base_seqs.append((sequences, logits))
        avg_val_loss = np.mean(val_losses)
        # log to both logfile and stdout:
        tqdm.write("EPOCH {0} | Avg. Val Loss: {1}".format(
            state['epoch'], avg_val_loss),
                   file=cfg['logfile'])
        print("EPOCH {0} | Avg. Val Loss: {1}".format(state['epoch'],
                                                      avg_val_loss))

        # send average val. loss to learning rate scheduler:
        sched.step(avg_val_loss)

        # beam search decoding:
        # (wrapped in try-excepts to prevent a thrown error from aborting training)
        _nt_dict_ = {0: ' ', 1: 'A', 2: 'G', 3: 'C', 4: 'T'}

        def convert_to_string(toks, voc, num):
            try:
                nt = ''.join([voc[t] for t in toks[0:num]])
            except:
                nt = ''
            return nt

        for true_seqs, logits in base_seqs:
            try:
                true_nts = labels2strings(true_seqs, lookup=_nt_dict_)
                amax_nts = labels2strings(argmax_decode(logits),
                                          lookup=_nt_dict_)
                beam_result, beam_scores, beam_times, beam_lengths = beam_decoder.decode(
                    logits.data)
                pred_nts = [
                    convert_to_string(beam_result[k][0], _nt_dict_,
                                      beam_lengths[k][0])
                    for k in range(len(beam_result))
                ]
                for i in range(min(len(true_nts), len(pred_nts))):
                    tqdm.write("True Seq: {0}".format(true_nts[i]),
                               file=cfg['logfile'])
                    tqdm.write("Beam Seq: {0}".format(pred_nts[i]),
                               file=cfg['logfile'])
                    tqdm.write("Amax Seq: {0}".format(amax_nts[i]),
                               file=cfg['logfile'])
                    tqdm.write(
                        ("- " * 10 + "Local Beam Alignment" + " -" * 10),
                        file=cfg['logfile'])
                    tqdm.write(ssw(true_nts[i], pred_nts[i]),
                               file=cfg['logfile'])
                    tqdm.write("= " * 40, file=cfg['logfile'])
            except Exception:
                tqdm.write("(WARN: Could not parse batch; skipping...)",
                           file=cfg['logfile'])
                continue

        # save model:
        try:
            mdl_dtype = "cuda" if (cuda_avail and cfg['cuda']) else "cpu"
            mdl_path = os.path.join(
                cfg['save_dir'],
                "ctc_encoder.{0}.{1}.pth".format(state['epoch'], mdl_dtype))
            torch.save(network.state_dict(), mdl_path)
            tqdm.write("Saved model.", file=cfg['logfile'])
        except Exception:
            print("Unable to serialize model; moving on. Traceback:")
            traceback.print_exc()
            tqdm.write("Unable to serialize model; moving on...",
                       file=cfg['logfile'])

        # reset all meters for next epoch:
        reset_all_meters()

    ### engine setup & training:
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_update'] = on_update
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.train(model_loss,
                 get_iterator('train'),
                 maxepoch=cfg['max_epochs'],
                 optimizer=opt)
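
The hooks above lean on project helpers (argmax_decode, labels2strings, mask_padding) that are defined elsewhere. For reference, a minimal greedy CTC decode over the same (B, T, V) probabilities could look like the sketch below; the collapse-repeats-then-drop-blanks rule is standard CTC, but the function name and shapes here are assumptions, not this project's actual helpers.

import torch

def greedy_ctc_decode(logits, blank_id=0):
    """Minimal greedy CTC decoding sketch: argmax per frame, collapse
    consecutive repeats, then drop blanks. logits: (B, T, V) tensor."""
    ids = logits.argmax(dim=2)  # best label per time step, shape (B, T)
    decoded = []
    for seq in ids:
        out, prev = [], None
        for t in seq.tolist():
            if t != prev and t != blank_id:  # collapse repeats, skip blanks
                out.append(t)
            prev = t
        decoded.append(out)
    return decoded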
Example n. 17
def csgd_train_and_prune(cfg: BaseConfigByEpoch,
                         target_deps,
                         centri_strength,
                         pacesetter_dict,
                         succeeding_strategy,
                         pruned_weights,
                         net=None,
                         train_dataloader=None,
                         val_dataloader=None,
                         show_variables=False,
                         convbuilder=None,
                         beginning_msg=None,
                         init_hdf5=None,
                         no_l2_keywords=None,
                         use_nesterov=False,
                         tensorflow_style_init=False):

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy')

    with Engine() as engine:

        is_main_process = (engine.world_rank == 0)  #TODO correct?

        logger = engine.setup_log(name='train',
                                  log_dir=cfg.output_dir,
                                  file_name='log.txt')

        # -- typical model components: model, optimizer, scheduler, dataloader --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)

        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)

        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name,
                                              cfg.dataset_subset,
                                              cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name,
                                            'val',
                                            batch_size=100)  #TODO 100?

        print('NOTE: Data prepared')
        print(
            'NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {} bytes'
            .format(cfg.global_batch_size, torch.cuda.device_count(),
                    torch.cuda.memory_allocated()))

        if no_l2_keywords is None:
            no_l2_keywords = []
        optimizer = get_optimizer(cfg, model, no_l2_keywords=no_l2_keywords, use_nesterov=use_nesterov)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer,
                              cfg=cfg)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(
                engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[engine.world_rank],
                broadcast_buffers=False,
            )
            # model = DistributedDataParallel(model, delay_allreduce=True)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        if tensorflow_style_init:
            for k, v in model.named_parameters():
                if v.dim() in [2, 4]:
                    torch.nn.init.xavier_uniform_(v)
                    print('init {} as xavier_uniform'.format(k))
                if 'bias' in k and 'bn' not in k.lower():
                    torch.nn.init.zeros_(v)
                    print('init {} as zero'.format(k))

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)

        if init_hdf5:
            engine.load_hdf5(init_hdf5)

        kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list()

        if os.path.exists(clusters_save_path):
            layer_idx_to_clusters = np.load(clusters_save_path,
                                            allow_pickle=True).item()
        else:
            layer_idx_to_clusters = get_layer_idx_to_clusters(
                kernel_namedvalue_list=kernel_namedvalue_list,
                target_deps=target_deps,
                pacesetter_dict=pacesetter_dict)
            if pacesetter_dict is not None:
                for follower_idx, pacesetter_idx in pacesetter_dict.items():
                    if pacesetter_idx in layer_idx_to_clusters:
                        layer_idx_to_clusters[
                            follower_idx] = layer_idx_to_clusters[
                                pacesetter_idx]

            np.save(clusters_save_path, layer_idx_to_clusters)

        csgd_save_file = os.path.join(cfg.output_dir, 'finish.hdf5')

        if os.path.exists(csgd_save_file):
            engine.load_hdf5(csgd_save_file)
        else:
            param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list)
            param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list,
                weight_decay=cfg.weight_decay,
                centri_strength=centri_strength)
            # if pacesetter_dict is not None:
            #     for follower_idx, pacesetter_idx in pacesetter_dict.items():
            #         follower_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         pacesetter_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         if pacesetter_kernel_name in param_name_to_merge_matrix:
            #             param_name_to_merge_matrix[follower_kernel_name] = param_name_to_merge_matrix[
            #                 pacesetter_kernel_name]
            #             param_name_to_decay_matrix[follower_kernel_name] = param_name_to_decay_matrix[
            #                 pacesetter_kernel_name]

            add_vecs_to_mat_dicts(param_name_to_merge_matrix)

            if show_variables:
                engine.show_variables()

            if beginning_msg:
                engine.log(beginning_msg)

            logger.info("\n\nStart training with pytorch version {}".format(
                torch.__version__))

            iteration = engine.state.iteration
            # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
            iters_per_epoch = num_iters_per_epoch(cfg)
            max_iters = iters_per_epoch * cfg.max_epochs
            tb_writer = SummaryWriter(cfg.tb_dir)
            tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

            model.train()

            done_epochs = iteration // iters_per_epoch

            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

            recorded_train_time = 0
            recorded_train_examples = 0

            for epoch in range(done_epochs, cfg.max_epochs):

                pbar = tqdm(range(iters_per_epoch))
                top1 = AvgMeter()
                top5 = AvgMeter()
                losses = AvgMeter()
                discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
                pbar.set_description('Train' + discrip_str)

                if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                    model.eval()
                    val_iters = 500 if cfg.dataset_name == 'imagenet' else 100  # use batch_size=100 for val on ImageNet and CIFAR
                    eval_dict, _ = run_eval(val_dataloader,
                                            val_iters,
                                            model,
                                            criterion,
                                            discrip_str,
                                            dataset_name=cfg.dataset_name)
                    val_top1_value = eval_dict['top1'].item()
                    val_top5_value = eval_dict['top5'].item()
                    val_loss_value = eval_dict['loss'].item()
                    for tag, value in zip(
                            tb_tags,
                        [val_top1_value, val_top5_value, val_loss_value]):
                        tb_writer.add_scalars(tag, {'Val': value}, iteration)
                    engine.log(
                        'validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
                        .format(epoch, val_top1_value, val_top5_value,
                                val_loss_value))
                    model.train()

                for _ in pbar:

                    start_time = time.time()
                    data, label = load_cuda_data(train_dataloader,
                                                 cfg.dataset_name)
                    data_time = time.time() - start_time

                    train_net_time_start = time.time()
                    acc, acc5, loss = train_one_step(
                        model,
                        data,
                        label,
                        optimizer,
                        criterion,
                        param_name_to_merge_matrix=param_name_to_merge_matrix,
                        param_name_to_decay_matrix=param_name_to_decay_matrix)
                    train_net_time_end = time.time()

                    if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                        recorded_train_examples += cfg.global_batch_size
                        recorded_train_time += train_net_time_end - train_net_time_start

                    scheduler.step()

                    if iteration % cfg.tb_iter_period == 0 and is_main_process:
                        for tag, value in zip(
                                tb_tags,
                            [acc.item(), acc5.item(),
                             loss.item()]):
                            tb_writer.add_scalars(tag, {'Train': value},
                                                  iteration)

                    top1.update(acc.item())
                    top5.update(acc5.item())
                    losses.update(loss.item())

                    pbar_dic = OrderedDict()
                    pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                    pbar_dic['cur_iter'] = iteration
                    pbar_dic['lr'] = scheduler.get_lr()[0]
                    pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                    pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                    pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                    pbar.set_postfix(pbar_dic)

                    if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                        engine.update_iteration(iteration)
                        if (not engine.distributed) or (engine.distributed
                                                        and is_main_process):
                            engine.save_and_link_checkpoint(cfg.output_dir)

                    iteration += 1
                    if iteration >= max_iters:
                        break

                #   do something after an epoch?
                if iteration >= max_iters:
                    break
            #   do something after the training
            if recorded_train_time > 0:
                exp_per_sec = recorded_train_examples / recorded_train_time
            else:
                exp_per_sec = 0
            engine.log(
                'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
                .format(int(TRAIN_SPEED_START * max_iters),
                        int(TRAIN_SPEED_END * max_iters),
                        cfg.global_batch_size, recorded_train_examples,
                        recorded_train_time, exp_per_sec))
            if cfg.save_weights:
                engine.save_checkpoint(cfg.save_weights)
                print('NOTE: training finished, saved to {}'.format(
                    cfg.save_weights))
            engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))

        csgd_prune_and_save(engine=engine,
                            layer_idx_to_clusters=layer_idx_to_clusters,
                            save_file=pruned_weights,
                            succeeding_strategy=succeeding_strategy,
                            new_deps=target_deps)
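
The merge and decay matrices built above encode the centripetal-SGD update: gradients are averaged within each cluster of filters, and a decay term pulls every filter toward its cluster mean so the cluster can later be pruned to a single representative. A minimal sketch of that gradient transform, with illustrative names (the real logic lives inside train_one_step):

import torch

def csgd_transform_grad(kernel, grad, merge_mat, decay_mat):
    # kernel, grad: (out_ch, in_ch, kh, kw); merge/decay matrices: (out_ch, out_ch)
    flat_grad = grad.reshape(grad.size(0), -1)      # one row per filter
    flat_kernel = kernel.reshape(kernel.size(0), -1)
    # average gradients within each cluster, then add the centripetal pull
    new_grad = merge_mat.matmul(flat_grad) + decay_mat.matmul(flat_kernel)
    return new_grad.reshape_as(grad)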
Example n. 18
def ding_train(cfg:BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None, show_variables=False, convbuilder=None, beginning_msg=None,
               init_hdf5=None, no_l2_keywords=None, gradient_mask=None, use_nesterov=False):

    # LOCAL_RANK = 0
    #
    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    # is_distributed = num_gpus > 1
    #
    # if is_distributed:
    #     torch.cuda.set_device(LOCAL_RANK)
    #     torch.distributed.init_process_group(
    #         backend="nccl", init_method="env://"
    #     )
    #     synchronize()
    #
    # torch.backends.cudnn.benchmark = True

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine() as engine:

        is_main_process = (engine.world_rank == 0) #TODO correct?

        logger = engine.setup_log(
            name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # -- typical model components: model, optimizer, scheduler, dataloader --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)

        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)

        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset, cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, 'val', batch_size=100)    #TODO 100?

        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {} bytes'.format(cfg.global_batch_size, torch.cuda.device_count(), torch.cuda.memory_allocated()))

        # device = torch.device(cfg.device)
        # model.to(device)
        # model.cuda()

        if no_l2_keywords is None:
            no_l2_keywords = []
        optimizer = get_optimizer(cfg, model, no_l2_keywords=no_l2_keywords, use_nesterov=use_nesterov)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(
            scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[engine.world_rank],
                broadcast_buffers=False, )
            # model = DistributedDataParallel(model, delay_allreduce=True)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        # for k, v in model.named_parameters():
        #     if v.dim() in [2, 4]:
        #         torch.nn.init.xavier_normal_(v)
        #         print('init {} as xavier_normal'.format(k))
        #     if 'bias' in k and 'bn' not in k.lower():
        #         torch.nn.init.zeros_(v)
        #         print('init {} as zero'.format(k))

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights, is_restore=True)

        if init_hdf5:
            engine.load_hdf5(init_hdf5)


        if show_variables:
            engine.show_variables()

        # ------------ do training ---------------------------- #
        if beginning_msg:
            engine.log(beginning_msg)
        logger.info("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch

        engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        # summary(model=model, input_size=(224, 224) if cfg.dataset_name == 'imagenet' else (32, 32), batch_size=cfg.global_batch_size)

        recorded_train_time = 0
        recorded_train_examples = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        for epoch in range(done_epochs, cfg.max_epochs):

            pbar = tqdm(range(iters_per_epoch))
            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)


            if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                model.eval()
                val_iters = 500 if cfg.dataset_name == 'imagenet' else 100  # use batch_size=100 for val on ImageNet and CIFAR
                eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion, discrip_str, dataset_name=cfg.dataset_name)
                val_top1_value = eval_dict['top1'].item()
                val_top5_value = eval_dict['top5'].item()
                val_loss_value = eval_dict['loss'].item()
                for tag, value in zip(tb_tags, [val_top1_value, val_top5_value, val_loss_value]):
                    tb_writer.add_scalars(tag, {'Val': value}, iteration)
                engine.log('validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'.format(epoch, val_top1_value, val_top5_value, val_loss_value))
                model.train()

            for _ in pbar:

                start_time = time.time()
                data, label = load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion, if_accum_grad, gradient_mask_tensor=gradient_mask_tensor)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                if iteration % cfg.tb_iter_period == 0 and is_main_process:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)


                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and is_main_process):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                iteration += 1
                if iteration >= max_iters:
                    break

            #   do something after an epoch?
            if iteration >= max_iters:
                break
        #   do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters), cfg.global_batch_size,
                    recorded_train_examples, recorded_train_time, exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
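
gradient_mask above is converted to CUDA tensors and handed to train_one_step, but the snippet does not show how it is consumed. Presumably it zeroes selected gradient entries before the optimizer step, along the lines of this hypothetical helper:

def apply_gradient_mask(model, gradient_mask_tensor):
    # multiply each parameter's gradient by its mask (a 0 entry freezes that weight)
    if gradient_mask_tensor is None:
        return
    for name, param in model.named_parameters():
        if name in gradient_mask_tensor and param.grad is not None:
            param.grad.mul_(gradient_mask_tensor[name])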
Example n. 19
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utils.strategies import AdvancedStrat
from utils.engine import Engine

#board_name = "match"
board_name = "test4"

strategy = AdvancedStrat()
engine = Engine(board_name)
#engine.setGoals([(16,10), (2,10)])
engine.setGoals([(18, 7), (1, 11)])
#engine.setGoals([(18,9), (1,9)])
engine.play(strategy)
Example n. 20
                      default='xtqa',
                      required=True)

    args.add_argument('--run_mode',
                      dest='run_mode',
                      choices=['train', 'test'],
                      type=str,
                      default='train',
                      required=True)

    args = args.parse_args()
    return args


if __name__ == '__main__':
    args = parse_input_args()
    cfg_file = 'configs/{}/{}.yml'.format(args.dataset_use, args.model)
    with open(cfg_file, 'r') as f:
        yml_dict = yaml.load(stream=f, Loader=yaml.BaseLoader)

    cfgs = CfgLoader(args.dataset_use, args.model).load()
    args_dict = cfgs.parse_to_dict(args)
    args_dict = {**args_dict, **yml_dict}
    cfgs.add_attr(args_dict)
    cfgs.proc()

    print('Configurations of Networks:')
    print(cfgs)
    engine = Engine(cfgs=cfgs)
    engine.load_method()
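
One detail worth noting above: in {**args_dict, **yml_dict} the YAML values win whenever a key appears in both dicts, because later unpackings override earlier ones:

defaults = {'lr': 0.1, 'optim': 'sgd'}
from_yaml = {'lr': 0.01}
merged = {**defaults, **from_yaml}  # -> {'lr': 0.01, 'optim': 'sgd'}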
Example n. 21
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utils.strategies import SingleStrat
from utils.engine import Engine

board_name = "pathfindingWorld3"

strategy = SingleStrat()
engine = Engine(board_name)
engine.play(strategy)