# Example 1
            if not os.path.isdir('%s/checkpoint' %save_root):
                os.makedirs('%s/checkpoint' %save_root)
            torch.save(source_net.state_dict(), '%s/checkpoint/%s-temp.pth' %(save_root, source_dataset_name))
            torch.save(target_net.state_dict(), '%s/checkpoint/%s-temp.pth' %(save_root, target_dataset_name))

        if loss_t < small_train_loss:
            small_train_loss = loss_t
            descend_count = 0

        else:
            descend_count += 1

        print('\nTraining loss: %.3f, descend count: %d' % (loss_t, descend_count))

        if descend_count >= 3:

            descend_count = 0
            optimizer_t.param_groups[0]['lr'] *= 0.1
            optimizer_s.param_groups[0]['lr'] *= 0.1
            print('Learning rate: %e' % optimizer_t.param_groups[0]['lr'])
            if optimizer_t.param_groups[0]['lr'] <= 1e-6:
                stop_flag = True
                break

    print('Best test acc: %.3f' % best_test_acc)
    best_acc_list.append(best_test_acc)

# Report the best accuracy collected for every run, then release all
# open record files.
print(best_acc_list)
for handle in (source_recorder, target_recorder, alpha_change_point_file):
    handle.close()
    if args.lr_adjust == 'adaptive':

        if train_loss < min_train_loss:
            min_train_loss = train_loss
            ascent_count = 0
        else:
            ascent_count += 1

        print('Current Loss: %.3f [%.3f], ascent count: %d' % (train_loss, min_train_loss, ascent_count))

        if ascent_count >= 3:
            optimizer.param_groups[0]['lr'] *= 0.1
            ascent_count = 0
            if (optimizer.param_groups[0]['lr']) < (args.lr * 1e-3):
                break

    elif (epoch + 1) % args.lr_adjust == 0:

        optimizer.param_groups[0]['lr'] *= 0.1
        if (optimizer.param_groups[0]['lr']) < (args.lr * 1e-3):
            break

        print('Learning rate decrease to %e' % optimizer.param_groups[0]['lr'])

# Close the top-level recorder, then every per-layer recorder stored in
# the quantization-error and bit-allocation collections.
recorder.close()
all_collections = [
    weight_quantization_error_recorder_collection,
    input_quantization_error_recorder_collection,
    weight_bit_allocation_collection,
    input_bit_allocation_collection,
]
for coll in all_collections:
    for recorder in coll.values():
        recorder.close()
# Example 3
class Task():
    """One model/dataset training task used by the meta-learning driver.

    A task bundles a backbone network, its optimizer, train/test data
    loaders, meta-pruning / soft-quantization state, and a ``Recorder``
    for metrics.  ``task_name`` has the form ``'<model>-<dataset>'``,
    e.g. ``'ResNet20-CIFAR10'``.

    Fixes vs. previous revision:
      * ``raise ('Task type not defined.')`` raised a ``str`` (itself a
        ``TypeError`` in Python 3) — now raises ``ValueError``.
      * bare ``except:`` narrowed to ``except Exception:``.
      * ``flush()`` no longer crashes when called with its default
        ``file_list=None``.
      * unknown soft-quantize datasets now raise instead of silently
        leaving ``self.net`` unset.
    """

    def __init__(self, task_name, task_type='prune', optimizer_type='adam',
                 save_root=None, SummaryPath=None, use_cuda=True, **kwargs):
        """Build the network, load pre-trained weights, and set up
        the optimizer, data loaders and recorder.

        Args:
            task_name: '<model>-<dataset>' identifier, split on '-'.
            task_type: 'prune' or 'soft-quantize'.
            optimizer_type: 'adam'/'Adam' selects Adam, anything else SGD.
            save_root: directory holding '<task_name>-pretrain.pth'.
            SummaryPath: directory the Recorder writes into.
            use_cuda: move the network to GPU when True.

        Raises:
            ValueError: unknown ``task_type``.
            NotImplementedError: unsupported model/dataset combination.
        """
        self.task_name = task_name
        self.task_type = task_type  # 'prune' or 'soft-quantize'
        self.model_name, self.dataset_name = task_name.split('-')
        # 'CIFARS' is a sampled subset; every other dataset uses the full set.
        self.ratio = 'sample' if self.dataset_name in ['CIFARS'] else -1

        #######
        # Net #
        #######
        self.net = self._build_net(task_type)

        self.meta_opt_flag = True  # True enables meta learning

        ##############
        # Meta Prune #
        ##############
        self.mask_dict = dict()
        self.meta_grad_dict = dict()
        self.meta_hidden_state_dict = dict()

        ######################
        # Meta Soft Quantize #
        ######################
        self.quantized = 0  # quantized type
        self.alpha_dict = dict()
        self.alpha_hidden_dict = dict()
        self.sq_rate = 0
        self.s_rate = 0
        self.q_rate = 0

        ##########
        # Record #
        ##########
        self.dataset_type = 'large' if self.dataset_name in ['ImageNet'] else 'small'
        self.SummaryPath = SummaryPath
        self.save_root = save_root
        self.recorder = Recorder(self.SummaryPath, self.dataset_name, self.task_name)

        ####################
        # Load Pre-trained #
        ####################
        self.pretrain_path = '%s/%s-pretrain.pth' % (self.save_root, self.task_name)
        self.net.load_state_dict(torch.load(self.pretrain_path))
        print('Load pre-trained model from %s' % self.pretrain_path)

        if use_cuda:
            self.net.cuda()

        # Optimizer for this task.
        if optimizer_type in ['Adam', 'adam']:
            self.optimizer = Adam(self.net.parameters(), lr=1e-3)
        else:
            # NOTE(review): SGD is constructed without an explicit lr —
            # confirm the project's SGD provides a default learning rate.
            self.optimizer = SGD(self.net.parameters())

        if self.dataset_name == 'ImageNet':
            try:
                self.train_loader = get_lmdb_imagenet('train', 128)
                self.test_loader = get_lmdb_imagenet('test', 100)
            except Exception:
                # Fall back to the plain dataloader when the LMDB copy
                # is unavailable.
                self.train_loader = get_dataloader(self.dataset_name, 'train', 128)
                self.test_loader = get_dataloader(self.dataset_name, 'test', 100)
        else:
            self.train_loader = get_dataloader(self.dataset_name, 'train', 128, ratio=self.ratio)
            self.test_loader = get_dataloader(self.dataset_name, 'test', 128)

        self.iter_train_loader = yielder(self.train_loader)

    def _build_net(self, task_type):
        """Return a freshly constructed backbone for this model/dataset pair.

        Raises:
            NotImplementedError: unsupported model/dataset combination.
            ValueError: unknown ``task_type``.
        """
        if task_type == 'prune':
            if self.model_name == 'ResNet20':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    return resnet20_cifar()
                if self.dataset_name == 'STL10':
                    return resnet20_stl()
                raise NotImplementedError
            if self.model_name == 'ResNet32':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    return resnet32_cifar()
                if self.dataset_name == 'STL10':
                    return resnet32_stl()
                raise NotImplementedError
            if self.model_name == 'ResNet56':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    return resnet56_cifar()
                if self.dataset_name == 'CIFAR100':
                    return resnet56_cifar(num_classes=100)
                if self.dataset_name == 'STL10':
                    return resnet56_stl()
                raise NotImplementedError
            if self.model_name == 'ResNet18':
                if self.dataset_name == 'ImageNet':
                    return resnet18()
                raise NotImplementedError
            if self.model_name == 'vgg11':
                return vgg11() if self.dataset_name == 'CIFAR10' else vgg11_stl10()
            print(self.model_name, self.dataset_name)
            raise NotImplementedError
        if task_type == 'soft-quantize':
            if self.model_name == 'ResNet20':
                if self.dataset_name in ['CIFAR10', 'CIFARS']:
                    return soft_quantized_resnet20_cifar()
                if self.dataset_name in ['STL10']:
                    return soft_quantized_resnet20_stl()
                # Previously this fell through silently, leaving the net unset.
                raise NotImplementedError
            raise NotImplementedError
        raise ValueError('Task type not defined.')

    def train(self):
        """Put the network in training mode."""
        self.net.train()

    def eval(self):
        """Put the network in evaluation mode."""
        self.net.eval()

    def zero_grad(self):
        """Clear the gradients of this task's optimizer."""
        self.optimizer.zero_grad()

    def step(self):
        """Apply one optimizer step."""
        self.optimizer.step()

    def update_record_performance(self, loss, acc, batch_size=0, lr=1e-3, end=None, is_train=True):
        """Forward one batch's metrics to the recorder."""
        self.recorder.update(loss=loss, acc=acc, batch_size=batch_size, cur_lr=lr, end=end, is_train=is_train)

    def reset_performance(self):
        """Reset the recorder's running performance statistics."""
        self.recorder.reset_performance()

    def save(self, save_root):
        """Save the network weights to '<save_root>/<task_name>-net.pth'."""
        torch.save(self.net.state_dict(), '%s/%s-net.pth' % (save_root, self.task_name))

    def get_best_test_acc(self):
        """Return the best test accuracy seen so far (per the recorder)."""
        return self.recorder.get_best_test_acc()

    def flush(self, file_list=None):
        """Flush every open file in ``file_list`` (no-op when None/empty)."""
        for file in (file_list or []):
            file.flush()

    def close(self):
        """Close the recorder's output files."""
        self.recorder.close()

    def adjust_lr(self, adjust_type):
        """Delegate learning-rate adjustment to the recorder.

        ``adjust_type`` is currently unused; kept for interface
        compatibility with callers.
        """
        self.recorder.adjust_lr(self.optimizer)
# Example 4
def main():
    """Program entry point.

    Loads configuration (preferring a previously saved config.yaml over the
    in-code defaults), prepares checkpoint/log/excel/config directories,
    builds the Unity environment and the RL algorithm selected in the
    config, then either trains (on- or off-policy) or runs inference, and
    finally closes the environment and recorder.

    NOTE(review): if train_config['algorithm'] matches none of the branches
    below, 'model' is never assigned and later use raises NameError —
    confirm the config is validated upstream.
    """
    if sys.platform.startswith('win'):
        # Add the _win_handler function to the windows console's handler function list
        win32api.SetConsoleCtrlHandler(_win_handler, True)
    # Reuse a config.yaml saved by a previous run when present; otherwise
    # fall back to the defaults declared in config_file.
    if os.path.exists(
            os.path.join(config_file.config['config_file'], 'config.yaml')):
        config = sth.load_config(config_file.config['config_file'])
    else:
        config = config_file.config
        print(f'load config from config.')

    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    # All output locations derive from '<project_name>/<remark><run_id>'.
    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
        + record_config['remark'] \
        + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')

    # NOTE(review): '\log.txt' hard-codes a backslash separator ('\l' is a
    # literal backslash + 'l', not an escape), so this path only looks right
    # on Windows — consider os.path.join; confirm intended platforms.
    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        file_name=log_dir + '\log.txt',
        file_level=logging.WARNING,
        file_format=
        '%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s'
    )
    if train_config['train']:
        # Snapshot the effective config next to the run so a later run can
        # reload it via the config.yaml branch above.
        sth.save_config(config_dir, config)

    # unity_mode connects to the running Unity editor; otherwise launch the
    # built player binary (headless while training).
    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(
                train_config['algorithm'].name))
            # Build the selected algorithm; imports are deferred so only the
            # chosen algorithm's module is loaded.
            if train_config['algorithm'] == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_SEP initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_COM initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('SAC initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                logger.info('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                logger.info('DDPG initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('TD3 initialize success.')
            recorder = Recorder(log_dir,
                                excel_dir,
                                record_config,
                                logger,
                                max_to_keep=5,
                                pad_step_number=True,
                                graph=g)
            # Restore session variables from the checkpoint (or initialize
            # them) and get the episode number to resume from.
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)
            try:
                if train_config['train']:
                    # Dispatch to the on-policy or replay-buffer (off-policy)
                    # training loop based on the config flag.
                    train_OnPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=episode,
                        model=model,
                        recorder=recorder,
                        cp_file=cp_file,
                        hyper_config=hyper_config,
                        train_config=train_config) if not train_config[
                            'use_replay_buffer'] else train_OffPolicy(
                                sess=sess,
                                env=env,
                                brain_name=brain_name,
                                begin_episode=episode,
                                model=model,
                                recorder=recorder,
                                cp_file=cp_file,
                                hyper_config=hyper_config,
                                train_config=train_config)
                    # Export the trained graph for the Unity runtime.
                    tf.train.write_graph(g,
                                         cp_dir,
                                         'raw_graph_def.pb',
                                         as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                # Broad catch so the environment is still closed on failure;
                # the error is only logged, not re-raised.
                logger.error(e)
            finally:
                env.close()
    recorder.close()
    sys.exit()
# Example 5
        losses.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        optimizer.step()

        # ------
        # Record
        # ------
        if recoder is not None:
            recoder.update(losses.item(),
                           batch_size=args.batch_size,
                           cur_lr=optimizer.param_groups[0]['lr'])
            recoder.print_training_result(batch_idx, len(train_loader))
        else:
            train_loss += losses.item()
            progress_bar(batch_idx, len(train_loader),
                         "Loss: %.3f" % (train_loss / (batch_idx + 1)))

    # -----
    # Test
    # -----
    eval_loss = evaluate(model, test_loader, criterion)
    if recoder is not None:
        recoder.update(eval_loss, is_train=False)
    print('[%2d] Test loss: %.3f' % (epoch_idx, eval_loss))

# Close the recorder's output files when recording was enabled
# (recoder is None when no recorder was configured).
if recoder is not None:
    recoder.close()