Example #1
    def __init__(self, args, data, opts):
        '''
        Several things need to be done here: first, decide how to handle the
        initial task; second, build the choice table and complete the sampling
        process from it; third, derive the model size from the sampled result
        and pick out the parameters that need to be used.
        :param args:
        :param data:
        :param opts:
        '''
        self.args = args
        self.data = data
        self.opts = opts
        self.controller = Controller(args=self.args,
                                     task_num=self.opts.num_task)
        self.controller_optim = Adam(self.controller.parameters(),
                                     lr=args.controller_lr)
        cuda_condition = torch.cuda.is_available() and args.with_cuda
        self.device = torch.device("cuda" if cuda_condition else "cpu")
        self.controller = self.controller.to(self.device)

        self.tasks_config = []
        self.task_acc = []
        self.model_dict = []
        self.task_scope = 1  # =>reuse
        self.general_scope = 1  # =>new
        if self.args.adapt: self.task_scope += 1
        if self.args.fuse: self.general_scope += 1
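        # task_scope counts the per-task choices (reuse, plus adapt when
        # enabled); general_scope counts the task-independent choices (new,
        # plus fuse when enabled). Together they define the controller's
        # sampling range for each layer.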
        self.tensorboard_writer = SummaryWriter()
        self.iter = 0
Example #2
    def __init__(self, mdir, device, time_limit, explorer=False):
        """ Build vae, rnn, controller and environment. """
        self.explorer = explorer

        # Load controllers
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        if self.explorer:
            ctrl_file = join(mdir, 'exp', 'best.tar')

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        # MDRNNCell
        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v
             for k, v in rnn_state['state_dict'].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

        self.mdrnn_notcell = MDRNN(LSIZE, ASIZE, RSIZE, 5)
        self.mdrnn_notcell.to(device)
        self.mdrnn_notcell.load_state_dict(rnn_state['state_dict'])
Example #3
    def get_data(self, request, id):
        response = api.get_controller(request, id)
        data = json.loads(response.text)

        controller = Controller(data["id"], data["controller_name"],
                                data["class_name"], data["enabled"])
        return controller
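A minimal, self-contained sketch of the parsing step above, using a stand-in Controller class and a hard-coded payload instead of the live api.get_controller call; all names and values here are illustrative, not part of the original example.

# Illustrative only: parse the same JSON shape into a lightweight Controller.
import json
from collections import namedtuple

Controller = namedtuple("Controller", "id controller_name class_name enabled")

payload = '{"id": 1, "controller_name": "demo", "class_name": "DemoCtrl", "enabled": true}'
data = json.loads(payload)
controller = Controller(data["id"], data["controller_name"],
                        data["class_name"], data["enabled"])
print(controller)  # Controller(id=1, controller_name='demo', ...)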
Example #4
    def __init__(self, device, time_limit, discrete_VAE):
        """ Build vae, rnn, controller and environment. """

        self.env = gym.make('CarRacing-v0')

        self.device = device

        self.time_limit = time_limit

        self.discrete_VAE = discrete_VAE

        # Because the representation is discrete, we increase the size of the
        # latent vector. Note: assigning to LSIZE here would make it local to
        # __init__ (and raise UnboundLocalError when discrete_VAE is False),
        # so a local copy is used instead.
        lsize = 128 if self.discrete_VAE else LSIZE

        self.vae = VAE(3, lsize, 1024)
        self.mdrnn = MDRNNCell(lsize, ASIZE, RSIZE, 5)
        self.controller = Controller(lsize, RSIZE, ASIZE)
Example #5
    def __init__(self, mdir, device, time_limit):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(
                      m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file, map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit
Example #6
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)


################################################################################
#                           Launch CMA                                         #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE,
                        is_gate=args.is_gate)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = -state['reward']

    # Changes so that we can load previous models even if the controller source
    # code has changed (e.g. new parameters were added).
    load_model_safe_(controller, state['state_dict'])
    try:
        print(controller.gates, controller.is_gate)
    except AttributeError:
Example #7
class Mutator:
    def __init__(self, args, data, opts):
        '''
        Several things need to be done here: first, decide how to handle the
        initial task; second, build the choice table and complete the sampling
        process from it; third, derive the model size from the sampled result
        and pick out the parameters that need to be used.
        :param args:
        :param data:
        :param opts:
        '''
        self.args = args
        self.data = data
        self.opts = opts
        self.controller = Controller(args=self.args,
                                     task_num=self.opts.num_task)
        self.controller_optim = Adam(self.controller.parameters(),
                                     lr=args.controller_lr)
        cuda_condition = torch.cuda.is_available() and args.with_cuda
        self.device = torch.device("cuda" if cuda_condition else "cpu")
        self.controller = self.controller.to(self.device)

        self.tasks_config = []
        self.task_acc = []
        self.model_dict = []
        self.task_scope = 1  # =>reuse
        self.general_scope = 1  # =>new
        if self.args.adapt: self.task_scope += 1
        if self.args.fuse: self.general_scope += 1
        self.tensorboard_writer = SummaryWriter()
        self.iter = 0

    def run(self):
        print('Experiment use {}'.format(self.args.base))
        if self.args.base == 'mlp':
            report_final_eval_acc, final_log, all_acc = self.run_mlp()
        elif self.args.base == 'cnn':
            report_final_eval_acc, final_log, all_acc = self.run_cnn()
        print('Acc:')
        for items in report_final_eval_acc:
            s = ''
            for item in items:
                s += '%.3f\t' % item
            print(s)
        print(all_acc)
        print(final_log)
        print(self.args)

    def controller_sample(self, task):
        if self.args.base == 'mlp':
            steps = self.args.mlp_linear
        elif self.args.base == 'cnn':
            steps = self.args.cnn_cnn_linear + self.args.cnn_mlp_linear
        else:
            steps = 0
            raise NotImplementedError
        step_probs = []
        step_idx = []
        step_losses = []
        sample_idx = torch.tensor(0).view(-1).to(self.device)

        hidden = None
        for idx, step in enumerate(range(steps)):

            logit, hidden = self.controller(input=sample_idx,
                                            task=task,
                                            hidden=hidden)
            if self.args.greedy > 0 and random.random() < self.args.greedy:
                sample_idx = torch.tensor(
                    random.randint(
                        0, task * self.task_scope + self.general_scope -
                        1)).to(self.device)
                if self.args.base == 'cnn':
                    raise NotImplementedError  # greedy sampling still needs to be adapted for the cnn model
            else:
                sample_idx = torch.multinomial(F.softmax(logit, dim=-1),
                                               1).view(-1)
                if idx >= self.args.cnn_cnn_linear:
                    if sample_idx == 0:
                        pass
                    elif self.general_scope > 1 and step == self.general_scope - 1:
                        pass
                    else:
                        if self.args.adapt:
                            while (sample_idx - self.general_scope
                                   ) % self.task_scope + 1 == 2:
                                sample_idx = torch.multinomial(
                                    F.softmax(logit, dim=-1), 1).view(-1)
                            assert (sample_idx - self.general_scope
                                    ) % self.task_scope + 1 != 2
            assert sample_idx < task * self.task_scope + self.general_scope
            step_probs.append(F.softmax(logit, dim=-1).tolist())
            step_idx.append(sample_idx.item())
            step_losses.append(
                F.cross_entropy(logit.view(1, -1), sample_idx.view(-1)))
        step_losses = torch.stack(step_losses, dim=0)
        return step_probs, step_idx, torch.mean(step_losses)
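    # Worked example for the sampling range above (illustrative, not from the
    # original code): with adapt and fuse both enabled, task_scope = 2 and
    # general_scope = 2, so when sampling for task = 2 each layer has
    # task * task_scope + general_scope = 6 valid indices, 0..5, read as
    # [new, fuse, reuse 0, adapt 0, reuse 1, adapt 1].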

    def crop_model(self, step_idx, default_config):
        def get_layer_dict(cur_model_dict, use_dict, layer):
            # Extract the parameters of a given layer from a model's state dict.
            for key, value in use_dict.items():
                if 'Stack{}'.format(layer) in key:
                    cur_model_dict[key] = value
            return cur_model_dict

        def init_dict(last_model_dict, cur_model_dict):
            # Inherit the classify parameters from the most recent model.
            for key, value in last_model_dict.items():
                if 'classify' in key:
                    cur_model_dict[key] = value
            return cur_model_dict

        def fuse(cur_model_dict, layer):
            temp = dict()
            for use_dict in self.model_dict:
                for key, value in use_dict.items():
                    if 'Stack{}'.format(layer) in key:
                        if key in temp.keys():
                            temp[key].append(value)
                        else:
                            temp[key] = [value]
            for key, value in temp.items():
                cur_model_dict[key] = torch.mean(
                    torch.stack(value, dim=0),
                    dim=0)  # we assert all model shape equal
            return cur_model_dict

        def adapt_config(source_config):
            '''
            {'conv': [(64, 128, 3)]}
            :param source_config:
            :return:
            '''
            key = source_config.keys()
            assert len(key) == 1
            key = list(key)[0]
            assert key == 'conv'  # only conv layers can be adapted; this is ensured by the <controller_sample> method
            source_config = deepcopy(source_config)
            value = source_config[key]
            assert isinstance(value, list)
            original_tuple = value[0]
            adapt_tuple = (original_tuple[1], original_tuple[1], 1)
            value.append(adapt_tuple)
            return source_config

        cur_model_dict = dict()
        cur_model_dict = init_dict(self.model_dict[-1], cur_model_dict)

        cur_model_config = []
        create_log = ''
        for layer, step in enumerate(step_idx):
            # Choice space: [new, reuse 0, adapt 0, reuse 1, adapt 1, ...]
            # step = step.item()

            if step == 0:
                create_log += 'NEW      '.format(layer)
                cur_model_config.append(default_config[layer])
            elif self.general_scope > 1 and step == self.general_scope - 1:
                create_log += 'Fuse from task above        '.format(layer)
                cur_model_config.append(
                    default_config[layer])  # we assert all shape equal
                cur_model_dict = fuse(cur_model_dict, layer)
            else:
                '''
                    test case1:
                        general_scope=2 task_scope=1
                        then  [0,1,2,3,4]
                        we get[new,fuse,reuse0,reuse1,reuse2]
                    test case2:
                        general_scope=1 task_scope=1
                        then  [0,1,2,3,4]
                        we get[new,reuse0,reuse1,reuse2,reuse3]
                '''
                task_num = (step - self.general_scope) // self.task_scope
                choice = (step - self.general_scope
                          ) % self.task_scope + 1  # adapt maybe wrong!
                use_dict = self.model_dict[task_num]
                use_config = self.tasks_config[task_num]
                if choice == 1:
                    create_log += 'REUSE from task {}        '.format(task_num)
                    cur_model_dict = get_layer_dict(cur_model_dict, use_dict,
                                                    layer)
                    cur_model_config.append(use_config[layer])
                elif self.args.adapt and choice == 2:
                    create_log += 'ADAPT from task {}        '.format(task_num)
                    assert layer < 3
                    cur_model_dict = get_layer_dict(cur_model_dict, use_dict,
                                                    layer)
                    cur_model_config.append(adapt_config(use_config[layer]))
                else:
                    raise NotImplementedError
        assert len(cur_model_config) == len(step_idx)
        return cur_model_dict, cur_model_config, create_log

    def count_reward(self, cur_acc_lis, back_acc_list):
        '''
        Compute the reward for the current sample; a worked example follows
        this method.
        :param cur_acc_lis: accuracies reached on the current task across the
            sampling rounds so far
        :param back_acc_list: backward-evaluation accuracies on the previous
            tasks under the current sample
        :return: scalar reward
        '''
        if len(cur_acc_lis) > 1:
            beta = cur_acc_lis[-1] / max(cur_acc_lis[:-1])
        else:
            beta = 0
        alpha = []
        assert len(back_acc_list) == len(self.task_acc)
        # for origin_acc, eval_back_acc in zip(self.task_acc, back_acc_list):
        #     acc_drop = max(0, origin_acc - eval_back_acc)
        #     # acc_drop = origin_acc - eval_back_acc  # TODO, find better reward
        #     alpha.append(acc_drop / origin_acc)
        # noise = 0.001
        # alpha = 1 / (torch.mean(torch.tensor(alpha)) + noise)
        # alpha = -1 * (torch.mean(torch.tensor(alpha))) #TODO, find better reward
        # alpha =  torch.sigmoid(-1 * (torch.mean(torch.tensor(alpha)))) - 0.5
        # alpha = -1 * (torch.mean(torch.tensor(alpha))) + 0.05
        # alpha = -1 * (torch.mean(torch.tensor(alpha))) + 0.5
        # alpha = -1 * (torch.max(torch.tensor(alpha))) + 0.1
        # reward = alpha + beta

        for origin_acc, eval_back_acc in zip(self.task_acc, back_acc_list):
            # acc_drop = max(0, origin_acc - eval_back_acc)
            acc_drop = eval_back_acc / origin_acc
            alpha.append(acc_drop)
        alpha = torch.mean(torch.tensor(alpha))
        reward = alpha
        if self.args.beta:
            reward += beta

        self.tensorboard_writer.add_scalar('Reward/Sum', reward, self.iter)
        self.tensorboard_writer.add_scalar('Reward/Alpha', alpha, self.iter)
        self.tensorboard_writer.add_scalar('Reward/Beta', beta, self.iter)
        self.iter += 1
        if self.args.baseline > 0:
            reward = reward - self.args.baseline
        return reward.item()
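    # Worked example for the reward above (numbers are illustrative only):
    # with task_acc = [0.90, 0.80], back_acc_list = [0.85, 0.78] and
    # cur_acc_lis = [0.70, 0.75], alpha = mean(0.85/0.90, 0.78/0.80) ~= 0.96
    # and beta = 0.75/0.70 ~= 1.07, giving a reward of ~0.96, or ~2.03 when
    # --beta is enabled (before any baseline subtraction).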

    def run_mlp(self):
        final_log = ''
        report_final_eval_acc = [[0.0] * self.opts.num_task
                                 for _ in range(self.opts.num_task)]

        if self.args.dataset == 'mnist':
            input_feature = 28 * 28
        elif self.args.dataset == 'cifar10':
            input_feature = 32 * 32
        else:
            input_feature = 0
            raise NotImplementedError
        default_config = [{
            'mlp': (input_feature, self.args.mlp_size)
        }] + [{
            'mlp': (self.args.mlp_size, self.args.mlp_size)
        }] * (self.args.mlp_linear - 1)
        controller_dic = deepcopy(self.controller.state_dict())
        for task in range(self.opts.num_task):
            print(
                '--------------Create Config and Dict for task {}--------------'
                .format(task))
            if self.args.random:
                self.controller.load_state_dict(deepcopy(controller_dic))
            elif self.args.gaussian > 0:
                temp = deepcopy(self.controller.state_dict())
                for key, value in temp.items():
                    temp[key] = value + torch.randn_like(value) * (
                        self.args.gaussian**0.5)
                self.controller.load_state_dict(temp)
            elif self.args.random_c:
                temp = deepcopy(self.controller.state_dict())
                for key, value in temp.items():
                    if 'choice' in key:
                        temp[key] = controller_dic[key]
                self.controller.load_state_dict(temp)
            if task == 0:
                cur_model = MLP(default_config, self.args.mlp_size, self.opts)
                trainer = Trainer(model=cur_model,
                                  task=task,
                                  args=self.args,
                                  data=self.data)
                cur_acc, cur_model_dic = trainer.run()
                self.tasks_config.append(default_config)
                self.task_acc.append(cur_acc)
                self.model_dict.append(cur_model_dic)
                print('Task{} Best Acc is {}'.format(task, cur_acc))
                report_final_eval_acc[task][:task + 1] = [cur_acc]
            else:
                best_reward = float('-inf')
                cur_acc_lis = []
                cur_best_acc, cur_best_dic, cur_best_config, best_create_log, step_probs = 0, None, None, None, None
                report_back_acc_list = None
                if self.args.upper_bound:
                    valid_idx = list(range(task + 1))
                    total_choice = list(
                        itertools.product(valid_idx,
                                          repeat=self.args.mlp_linear)) * 5
                    total_step = len(total_choice)
                elif self.args.base_model:
                    total_choice = [[task] * self.args.mlp_linear]
                    total_step = 1
                else:
                    total_step = self.args.controller_steps
                for steps in range(total_step):
                    if self.args.upper_bound or self.args.base_model:
                        step_idx = list(total_choice[steps])
                    else:
                        self.controller.train()
                        step_probs, step_idx, sample_loss = self.controller_sample(
                            task)
                    cur_model_dict, cur_model_config, create_log = self.crop_model(
                        step_idx, default_config)
                    cur_model = MLP(cur_model_config, self.args.mlp_size,
                                    self.opts)
                    trainer = Trainer(model=cur_model,
                                      task=task,
                                      args=self.args,
                                      data=self.data)
                    trainer.reload_checkpoint(cur_model_dict)
                    cur_acc, cur_model_dic = trainer.run(
                        task_list=list(range(0, task)))
                    cur_acc_lis.append(cur_acc)
                    back_acc_list = trainer.history_eval(
                        task_list=list(range(0, task)))
                    reward = self.count_reward(cur_acc_lis, back_acc_list)
                    if steps % self.args.controller_logging_step == 0:
                        print(
                            '-------Logging at {} step for controller-------'.
                            format(steps))
                        print(create_log)
                        print('Reward:{}. '.format(reward))
                        if step_probs:
                            for step_prob in step_probs:
                                print(step_prob)
                    if reward > best_reward:
                        best_reward = reward
                        cur_best_dic = cur_model_dic
                        cur_best_acc = cur_acc
                        cur_best_config = cur_model_config
                        report_back_acc_list = back_acc_list
                        best_create_log = create_log
                    if self.args.upper_bound or self.args.base_model:
                        pass
                    else:
                        self.controller_optim.zero_grad()
                        loss = sample_loss * reward
                        loss.backward()
                        self.controller_optim.step()
                print('\033[95mAfter task {}'.format(task))
                print(best_create_log)
                final_log = final_log + best_create_log + '\n'
                print('best reward :{}\033[0m'.format(best_reward))
                self.tasks_config.append(cur_best_config)
                self.task_acc.append(cur_best_acc)
                self.model_dict.append(cur_best_dic)
                report_final_eval_acc[task][:len(report_back_acc_list) +
                                            1] = report_back_acc_list + [
                                                cur_best_acc
                                            ]
                if task == self.opts.num_task - 1:
                    all_acc = torch.mean(
                        torch.tensor(report_back_acc_list +
                                     [cur_best_acc])).item()
        return report_final_eval_acc, final_log, all_acc

    def run_cnn(self):
        final_log = ''
        report_final_eval_acc = [[0.0] * self.opts.num_task
                                 for _ in range(self.opts.num_task)]

        if self.args.dataset == 'mnist':
            input_size = 28
            input_channel = 1
        elif self.args.dataset == 'cifar10':
            input_size = 32
            input_channel = 3
        else:
            input_feature = 0
            raise NotImplementedError
        #  (((inputsize-3)//2 -2)//2-1)//2
        final_size = ((
            (input_size - input_size // 8 + 1) // 2 - input_size // 10 + 1) //
                      2 - 1) // 2

        default_config = [{
            'conv': [(input_channel, 64, input_size // 8)]
        }, {
            'conv': [(64, 128, input_size // 10)]
        }, {
            'conv': [(128, 256, 2)]
        }, {
            'mlp': (final_size**2 * 256, 2048)
        }, {
            'mlp': (2048, 2048)
        }]

        controller_dic = deepcopy(self.controller.state_dict())
        for task in range(self.opts.num_task):
            print(
                '--------------Create Config and Dict for task {}--------------'
                .format(task))
            if self.args.random:
                self.controller.load_state_dict(deepcopy(controller_dic))
            elif self.args.gaussian > 0:
                temp = deepcopy(self.controller.state_dict())
                for key, value in temp.items():
                    temp[key] = value + torch.randn_like(value) * (
                        self.args.gaussian**0.5)
                self.controller.load_state_dict(temp)
            elif self.args.random_c:
                temp = deepcopy(self.controller.state_dict())
                for key, value in temp.items():
                    if 'choice' in key:
                        temp[key] = controller_dic[key]
                self.controller.load_state_dict(temp)
            if task == 0:
                cur_model = CNN(default_config, self.args.cnn_linear_size,
                                self.opts)
                trainer = Trainer(model=cur_model,
                                  task=task,
                                  args=self.args,
                                  data=self.data)
                cur_acc, cur_model_dic = trainer.run()
                self.tasks_config.append(default_config)
                self.task_acc.append(cur_acc)
                self.model_dict.append(cur_model_dic)
                print('Task{} Best Acc is {}'.format(task, cur_acc))
                report_final_eval_acc[task][:task + 1] = [cur_acc]
            else:
                best_reward = float('-inf')
                cur_acc_lis = []
                cur_best_acc, cur_best_dic, cur_best_config, best_create_log, step_probs = 0, None, None, None, None
                report_back_acc_list = None

                total_step = self.args.controller_steps
                for steps in range(total_step):
                    self.controller.train()
                    step_probs, step_idx, sample_loss = self.controller_sample(
                        task)
                    cur_model_dict, cur_model_config, create_log = self.crop_model(
                        step_idx, default_config)
                    cur_model = CNN(cur_model_config,
                                    self.args.cnn_linear_size, self.opts)
                    trainer = Trainer(model=cur_model,
                                      task=task,
                                      args=self.args,
                                      data=self.data)
                    trainer.reload_checkpoint(cur_model_dict)
                    cur_acc, cur_model_dic = trainer.run(
                        task_list=list(range(0, task)))
                    cur_acc_lis.append(cur_acc)
                    back_acc_list = trainer.history_eval(
                        task_list=list(range(0, task)))
                    reward = self.count_reward(cur_acc_lis, back_acc_list)
                    if steps % self.args.controller_logging_step == 0:
                        print(
                            '-------Logging at {} step for controller-------'.
                            format(steps))
                        print(create_log)
                        print('Reward:{}. '.format(reward))
                        if step_probs:
                            for step_prob in step_probs:
                                print(step_prob)
                    if reward > best_reward:
                        best_reward = reward
                        cur_best_dic = cur_model_dic
                        cur_best_acc = cur_acc
                        cur_best_config = cur_model_config
                        report_back_acc_list = back_acc_list
                        best_create_log = create_log
                    if self.args.upper_bound or self.args.base_model:
                        pass
                    else:
                        self.controller_optim.zero_grad()
                        loss = sample_loss * reward
                        loss.backward()
                        self.controller_optim.step()
                print('\033[95mAfter task {}'.format(task))
                print(best_create_log)
                final_log = final_log + best_create_log + '\n'
                print('best reward :{}\033[0m'.format(best_reward))
                self.tasks_config.append(cur_best_config)
                self.task_acc.append(cur_best_acc)
                self.model_dict.append(cur_best_dic)
                report_final_eval_acc[task][:len(report_back_acc_list) +
                                            1] = report_back_acc_list + [
                                                cur_best_acc
                                            ]
                if task == self.opts.num_task - 1:
                    all_acc = torch.mean(
                        torch.tensor(report_back_acc_list +
                                     [cur_best_acc])).item()
        return report_final_eval_acc, final_log, all_acc
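The index arithmetic shared by controller_sample and crop_model above can be decoded with a short standalone sketch. It assumes both adapt and fuse are enabled (task_scope = 2, general_scope = 2); the helper below is illustrative and not part of the original class.

# Illustrative decoder mirroring the arithmetic in Mutator.crop_model.
def decode_step(step, task_scope=2, general_scope=2):
    if step == 0:
        return 'new'
    if general_scope > 1 and step == general_scope - 1:
        return 'fuse'
    task_num = (step - general_scope) // task_scope
    choice = (step - general_scope) % task_scope + 1  # 1 = reuse, 2 = adapt
    return ('reuse' if choice == 1 else 'adapt', task_num)

# For a layer sampled while learning task 2, indices 0..5 decode to:
print([decode_step(s) for s in range(6)])
# ['new', 'fuse', ('reuse', 0), ('adapt', 0), ('reuse', 1), ('adapt', 1)]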
Example #8
# Fix numeric divergence due to bug in Cudnn
torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if cuda else "cpu")

trained = 0
#model = VAE(3, LSIZE).to(device)
vae_model = VAE(3, LSIZE)
vae_model = torch.nn.DataParallel(vae_model, device_ids=[7])
vae_model.cuda(7)
vae_model.eval()
mdrnn_model = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
mdrnn_model = torch.nn.DataParallel(mdrnn_model, device_ids=[7])
mdrnn_model.cuda(7)
mdrnn_model.eval()
controller = torch.nn.DataParallel(Controller(LSIZE, RSIZE, ASIZE)).cuda()

vis = visdom.Visdom(env='dream')
image_window = vis.image(
    np.random.rand(RED_SIZE * 10, RED_SIZE * 10),
    opts=dict(title='dream!', caption='dream.'),
)

# dream and vae directories; the best vae checkpoint is reloaded below
dream_dir = join(args.logdir, 'dream')
vae_dir = join(args.logdir, 'vae')
reload_file = join(vae_dir, 'best.tar')
state = torch.load(reload_file)
print("Reloading model at epoch {}"
      ", with test error {}".format(state['epoch'], state['precision']))
vae_model.load_state_dict(state['state_dict'])
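One hedged caveat about the snippet above: torch.nn.DataParallel registers its parameters under a 'module.' prefix, so a checkpoint saved from an unwrapped VAE can fail to load into the wrapped vae_model with missing/unexpected key errors. A prefix-adding helper along these lines (illustrative, not from the original code) is a common workaround.

# Hypothetical helper: add the 'module.' prefix expected by DataParallel to
# keys that were saved from an unwrapped model.
def add_module_prefix(state_dict):
    return {(k if k.startswith('module.') else 'module.' + k): v
            for k, v in state_dict.items()}

# e.g. vae_model.load_state_dict(add_module_prefix(state['state_dict']))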
Example #9
def train_explorer(logdir,
                   epochs=10,
                   n_samples=4,
                   pop_size=4,
                   display=True,
                   max_workers=10):
    results = {}
    results['best'] = []
    # multiprocessing variables
    num_workers = min(max_workers, n_samples * pop_size)
    time_limit = 1000

    # create the tmp dir if it does not exist, and clean it if it does
    tmp_dir = join(logdir, 'tmp_exp')
    if not exists(tmp_dir):
        mkdir(tmp_dir)
    else:
        for fname in listdir(tmp_dir):
            unlink(join(tmp_dir, fname))

    # create the explore dir if it does not exist
    explore_dir = join(logdir, 'explore')
    if not exists(explore_dir):
        mkdir(explore_dir)

    ################################################################################
    #                           Thread routines                                    #
    ################################################################################
    def slave_routine(p_queue, r_queue, e_queue, p_index):
        """ Thread routine.

        Threads interact with p_queue, the parameters queue, r_queue, the result
        queue and e_queue the end queue. They pull parameters from p_queue, execute
        the corresponding rollout, then place the result in r_queue.

        Each parameter has its own unique id. Parameters are pulled as tuples
        (s_id, params) and results are pushed as (s_id, result). The same
        parameter can appear multiple times in p_queue, carrying the same id
        each time.

        As soon as e_queue is non-empty, the thread terminates.

        When multiple gpus are involved, the assigned gpu is determined by the
        process index p_index (gpu = p_index % n_gpus).

        :args p_queue: queue containing couples (s_id, parameters) to evaluate
        :args r_queue: where to place results (s_id, results)
        :args e_queue: as soon as not empty, terminate
        :args p_index: the process index
        """
        # init routine
        gpu = p_index % torch.cuda.device_count()
        device = torch.device(
            'cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu')
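        # e.g. with 4 visible GPUs and 10 workers, p_index 0..9 maps to
        # cuda:0..3, cuda:0..3, cuda:0, cuda:1.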

        # redirect streams
        sys.stdout = open(join(tmp_dir, str(getpid()) + '.out'), 'a')
        sys.stderr = open(join(tmp_dir, str(getpid()) + '.err'), 'a')

        with torch.no_grad():
            r_gen = RolloutGenerator(logdir, device, time_limit)

            while e_queue.empty():
                if p_queue.empty():
                    sleep(.1)
                else:
                    s_id, params = p_queue.get()
                    r_queue.put((s_id, r_gen.rollout(params)))

    ################################################################################
    #                Define queues and start workers                               #
    ################################################################################
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    for p_index in range(num_workers):
        Process(target=slave_routine,
                args=(p_queue, r_queue, e_queue, p_index)).start()

    ################################################################################
    #                           Evaluation                                         #
    ################################################################################
    def evaluate(solutions, results, rollouts=100):
        """ Give current controller evaluation.

        Evaluation is minus the cumulated reward averaged over rollout runs.

        :args solutions: CMA set of solutions
        :args results: corresponding results
        :args rollouts: number of rollouts

        :returns: minus averaged cumulated reward
        """
        index_min = np.argmin(results)
        best_guess = solutions[index_min]
        restimates = []

        for s_id in range(rollouts):
            p_queue.put((s_id, best_guess))

        print("Evaluating...")
        for _ in tqdm(range(rollouts)):
            while r_queue.empty():
                sleep(.1)
            restimates.append(r_queue.get()[1])

        return best_guess, np.mean(restimates), np.std(restimates)

    ################################################################################
    #                           Launch CMA                                         #
    ################################################################################
    controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

    # define current best and load parameters
    cur_best = None
    ctrl_file = join(explore_dir, 'best.tar')
    print("Attempting to load previous best...")
    if exists(ctrl_file):
        state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])
        print("Previous best was {}...".format(-cur_best))

    parameters = controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                  {'popsize': pop_size})

    epoch = 0
    log_step = 3
    while not es.stop():
        if cur_best is not None and -cur_best > target_return:
            print("Already better than target, breaking...")
            break

        r_list = [0] * pop_size  # result list
        solutions = es.ask()

        # push parameters to queue
        for s_id, s in enumerate(solutions):
            for _ in range(n_samples):
                p_queue.put((s_id, s))

        # retrieve results
        if display:
            pbar = tqdm(total=pop_size * n_samples)
        for _ in range(pop_size * n_samples):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / n_samples
            if display:
                pbar.update(1)
        if display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = evaluate(solutions, r_list)

            # log the best
            results['best'].append(best)

            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    }, join(explore_dir, 'best.tar'))

            if -best > target_return:
                print(
                    "Terminating controller training with value {}...".format(
                        best))
                break

        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')

    return results
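A hedged usage sketch for train_explorer; the log directory is a placeholder and the keyword values simply restate the defaults shown in the signature.

# Illustrative invocation only; 'exp_logdir' is a placeholder path.
if __name__ == '__main__':
    results = train_explorer('exp_logdir', epochs=10, n_samples=4,
                             pop_size=4, display=True, max_workers=10)
    print('Best evaluation per logging step:', results['best'])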
Example #10
    def __init__(self,
                 mdir,
                 device,
                 time_limit,
                 number_goals,
                 Forward_model,
                 hiddengoals: bool,
                 curiosityreward: bool,
                 static: bool):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file, Dtild_file, hiddenvae_file = [
            join(mdir, m, 'best.tar')
            for m in ['vae', 'mdrnn', 'ctrl', 'dtild', 'hiddenvae']
        ]

        assert exists(vae_file) and exists(
            rnn_file), "Either vae or mdrnn is untrained."

        vae_state, rnn_state, hiddenvae_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file, hiddenvae_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state),
                     ('HiddenVAE', hiddenvae_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.HiddenVAE = HiddenVAE(256, LSIZE).to(device)
        self.HiddenVAE.load_state_dict(hiddenvae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v
             for k, v in rnn_state['state_dict'].items()})

        self.mdrnnBIG = MDRNN(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnnBIG.load_state_dict(rnn_state["state_dict"])

        self.controller = Controller(256, 256, 6).to(device)

        self.env = gym.make('MiniGrid-MultiRoom-N6-v0')

        self.device = device
        self.number_goals = number_goals
        self.time_limit = time_limit

        self.vae_state = vae_state
        self.rnn_state = rnn_state
        self.hiddenvae_state = hiddenvae_state

        self.hiddengoals = hiddengoals
        self.curiosityreward = curiosityreward
        self.static = static
        self.Forward_model = Forward_model

        self.fmodel = Dtild(32, 256, 1, 32).to(device)
Example #11
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNN model loaded from mdir/mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl or randomly
        initialized
    :attr env: instance of the CarRacing-v0 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    """
    def __init__(self, mdir, device, time_limit):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(
                      m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file, map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

    def get_action_and_transition(self, obs, hidden):
        """ Get action and transition.

        Encode obs to latent using the VAE, then obtain estimation for next
        latent and next hidden state using the MDRNN and compute the controller
        corresponding action.

        :args obs: current observation (1 x 3 x 64 x 64) torch tensor
        :args hidden: current hidden state (1 x 256) torch tensor

        :returns: (action, next_hidden)
            - action: 1D np array
            - next_hidden (1 x 256) torch tensor
        """
        _, latent_mu, _ = self.vae(obs)
        action = self.controller(latent_mu, hidden[0])
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        return action.squeeze().cpu().numpy(), next_hidden

    def rollout(self, params, render=False):
        """ Execute a rollout and returns minus cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array

        :returns: minus cumulative reward
        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        obs = self.env.reset()

        # This first render is required !
        self.env.render()

        hidden = [
            torch.zeros(1, RSIZE).to(self.device)
            for _ in range(2)]

        cumulative = 0
        i = 0
        while True:
            obs = transform(obs).unsqueeze(0).to(self.device)
            action, hidden = self.get_action_and_transition(obs, hidden)
            obs, reward, done, _ = self.env.step(action)

            if render:
                self.env.render()

            cumulative += reward
            if done or i > self.time_limit:
                return - cumulative
            i += 1
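A minimal usage sketch for the class above; the model directory and time limit are placeholders, and passing params=None leaves the controller at whatever weights were loaded (or its random initialization).

# Illustrative only: run a single rollout with the current controller weights.
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    generator = RolloutGenerator('exp_dir', device, time_limit=1000)
    with torch.no_grad():
        neg_reward = generator.rollout(params=None, render=True)
    print('Cumulative reward:', -neg_reward)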
Example #12
    def __init__(self,
                 mdir,
                 device,
                 time_limit,
                 iteration_num=None,
                 video_dir=None):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = [
            join(mdir, m, "best.tar") for m in ["vae", "mdrnn", "ctrl"]
        ]

        if iteration_num is not None:
            vae_file, rnn_file, ctrl_file = [
                join(mdir, m, "iter_{}".format(iteration_num), "best.tar")
                for m in ["vae", "mdrnn", "ctrl"]
            ]

        assert exists(vae_file) and exists(
            rnn_file), "Either vae or mdrnn is untrained."

        print("\nRollout Generator")

        vae_state, rnn_state = [
            torch.load(fname, map_location={"cuda:0": str(device)})
            for fname in (vae_file, rnn_file)
        ]

        print("Loading VAE from {}".format(vae_file))
        print("Loading RNN from {}".format(rnn_file))
        for m, s in (("VAE", vae_state), ("MDRNN", rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s["epoch"], s["precision"]))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state["state_dict"])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip("_l0"): v
             for k, v in rnn_state["state_dict"].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            print("Loading Controller from {}".format(ctrl_file))
            ctrl_state = torch.load(ctrl_file,
                                    map_location={"cuda:0": str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state["reward"]))
            self.controller.load_state_dict(ctrl_state["state_dict"])

        self.env = gym.make("BipedalWalkerHardcore-v2")
        self.device = device

        self.time_limit = time_limit
Example #13
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNN model loaded from mdir/mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl or randomly
        initialized
    :attr env: instance of the CarRacing-v0 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    """
    def __init__(self,
                 mdir,
                 device,
                 time_limit,
                 iteration_num=None,
                 video_dir=None):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = [
            join(mdir, m, "best.tar") for m in ["vae", "mdrnn", "ctrl"]
        ]

        if iteration_num is not None:
            vae_file, rnn_file, ctrl_file = [
                join(mdir, m, "iter_{}".format(iteration_num), "best.tar")
                for m in ["vae", "mdrnn", "ctrl"]
            ]

        assert exists(vae_file) and exists(
            rnn_file), "Either vae or mdrnn is untrained."

        print("\nRollout Generator")

        vae_state, rnn_state = [
            torch.load(fname, map_location={"cuda:0": str(device)})
            for fname in (vae_file, rnn_file)
        ]

        print("Loading VAE from {}".format(vae_file))
        print("Loading RNN from {}".format(rnn_file))
        for m, s in (("VAE", vae_state), ("MDRNN", rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s["epoch"], s["precision"]))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state["state_dict"])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip("_l0"): v
             for k, v in rnn_state["state_dict"].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            print("Loading Controller from {}".format(ctrl_file))
            ctrl_state = torch.load(ctrl_file,
                                    map_location={"cuda:0": str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state["reward"]))
            self.controller.load_state_dict(ctrl_state["state_dict"])

        self.env = gym.make("BipedalWalkerHardcore-v2")
        self.device = device

        self.time_limit = time_limit

    def get_action_and_transition(self, obs, hidden):
        """ Get action and transition.

        Encode obs to latent using the VAE, then obtain estimation for next
        latent and next hidden state using the MDRNN and compute the controller
        corresponding action.

        :args obs: current observation (1 x 3 x 64 x 64) torch tensor
        :args hidden: current hidden state (1 x 256) torch tensor

        :returns: (action, next_hidden)
            - action: 1D np array
            - next_hidden (1 x 256) torch tensor
        """
        _, latent_mu, _ = self.vae(obs)
        action = self.controller(latent_mu, hidden[0])
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        return action.squeeze().cpu().numpy(), next_hidden

    def rollout(self,
                params,
                render=False,
                rollout_dir=None,
                rollout_num=0,
                video_dir=None):
        """ Execute a rollout and returns minus cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array

        :returns: minus cumulative reward
        """
        if video_dir is not None:
            self.env = wrappers.Monitor(
                self.env, "./{}/rollout_{}/".format(video_dir, rollout_num))
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        self.env.reset()

        # This first render is required !
        obs = self.env.render(mode='rgb_array')

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        cumulative = 0
        i = 0

        s_rollout = []
        r_rollout = []
        d_rollout = []
        a_rollout = []

        print('Starting to create the rollouts')

        while True:
            if i % 100 == 0:
                print("{} steps done of rollout".format(i))
            obs = transform(obs).unsqueeze(0).to(self.device)
            action, hidden = self.get_action_and_transition(obs, hidden)
            _, reward, done, _ = self.env.step(action)

            # Save rollout data
            im_frame = self.env.render(mode="rgb_array")
            img = PIL.Image.fromarray(im_frame)
            img = img.resize((64, 64))
            obs = np.array(img)
            s_rollout += [obs]
            r_rollout += [reward]
            d_rollout += [done]
            a_rollout += [action]

            if render:
                self.env.render()

            cumulative += reward
            if done or i > self.time_limit:
                print('Completed rollout with {} steps'.format(i))
                if rollout_dir is not None:
                    print("> End of rollout {}, {} frames...".format(
                        rollout_num, len(s_rollout)))
                    np.savez(
                        join(rollout_dir, "rollout_{}".format(rollout_num)),
                        observations=np.array(s_rollout),
                        rewards=np.array(r_rollout),
                        actions=np.array(a_rollout),
                        terminals=np.array(d_rollout),
                    )
                self.env.reset()
                return -cumulative
            i += 1
Example #14
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(0.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)


################################################################################
#                           Launch CMA                                         #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(prev_ctrl_dir, "best.tar")
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={"cuda:0": "cpu"})
    # cur_best = -state["reward"]
    print("Loading Controller from {}".format(ctrl_file))
    controller.load_state_dict(state["state_dict"])
    # print("Previous best was {}...".format(-cur_best))

parameters = controller.parameters()
es = cma.CMAEvolutionStrategy(
    flatten_parameters(parameters), 0.1, {"popsize": pop_size}
)
Example #15
                elif arg == 'fuzzy':
                    controller_fuzzy = True
                    mono = True
                    simulations /= 2
                else:
                    raise GetoptError()
    except GetoptError:
        usage()
        sys.exit(-1)

    #simulations = 5 if len(sys.argv) == 1 else int(sys.argv[1])
    for i in range(int(simulations * 2)):
        # create a controller
        if mono:
            control = FuzzyLogicController(
                log=log) if controller_fuzzy else Controller(log=log)
        else:
            control = Controller(
                log=log) if i % 2 == 0 else FuzzyLogicController(log=log)

        # create North-to-South and West-to-East lanes
        north2south = Lane(control,
                           S=15,
                           D=7,
                           name='North to South',
                           init_state=State.green)
        west2east = Lane(control,
                         S=15,
                         D=7,
                         name='West to East',
                         init_state=State.red)
Example #16
M = buildMemory('weights/2019.12.07/mdn_rnn_weights')
get_hidden = K.function(M.layers[0].input, M.layers[0].output)


# In[ ]:


print(M.summary())


# In[ ]:


controller = Controller(32+256, 3)
controller.set_weights(np.load('./weights/C_weights.npy'))


# $$\text{Controller}: \mathbb R^{288} \rightarrow \mathbb R^3 $$
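# (Presumably 288 = 32 + 256: the 32-dimensional latent vector concatenated
# with the 256-dimensional RNN hidden state, matching Controller(32+256, 3)
# above.)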

# In[ ]:


print('controller shape:')


# In[ ]:


print(controller.shape)
Example #17
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)

################################################################################
#                           Launch CMA                                         #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = - state['reward']
    controller.load_state_dict(state['state_dict'])
    print("Previous best was {}...".format(-cur_best))

parameters = controller.parameters()
es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                              {'popsize': pop_size})
Example #18
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNN model loaded from mdir/mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl or randomly
        initialized
    :attr env: instance of the CarRacing-v0 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    """
    def __init__(self, mdir, device, time_limit, explorer=False):
        """ Build vae, rnn, controller and environment. """
        self.explorer = explorer

        # Load controllers
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        if self.explorer:
            ctrl_file = join(mdir, 'exp', 'best.tar')

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        # MDRNNCell
        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v
             for k, v in rnn_state['state_dict'].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

        self.mdrnn_notcell = MDRNN(LSIZE, ASIZE, RSIZE, 5)
        self.mdrnn_notcell.to(device)
        self.mdrnn_notcell.load_state_dict(rnn_state['state_dict'])


    ############################################################################
    # NOTE: the helpers below are duplicated (with small edits) from the other
    # trainmdrnn file rather than imported:
    # from trainmdrnn import get_loss, to_latent
    ############################################################################

    def to_latent(self, obs, next_obs):
        """ Transform observations to latent space.

        :args obs: 5D torch tensor (BSIZE, SEQ_LEN, 3, SIZE, SIZE)
        :args next_obs: 5D torch tensor (BSIZE, SEQ_LEN, 3, SIZE, SIZE)

        :returns: (latent_obs, latent_next_obs)
            - latent_obs: 3D torch tensor (BSIZE, SEQ_LEN, LSIZE)
            - latent_next_obs: 3D torch tensor (BSIZE, SEQ_LEN, LSIZE)
        """

        with torch.no_grad():
            obs, next_obs = [
                f.upsample(x.view(-1, 3, SIZE, SIZE),
                           size=RED_SIZE,
                           mode='bilinear',
                           align_corners=True) for x in (obs, next_obs)
            ]

            (obs_mu, obs_logsigma), (next_obs_mu, next_obs_logsigma) = [
                self.vae(x)[1:] for x in (obs, next_obs)
            ]

            SEQ_LEN = 1

            latent_obs, latent_next_obs = [
                (x_mu + x_logsigma.exp() * torch.randn_like(x_mu)).view(
                    BSIZE, SEQ_LEN, LSIZE)
                for x_mu, x_logsigma in [(
                    obs_mu, obs_logsigma), (next_obs_mu, next_obs_logsigma)]
            ]

        return latent_obs, latent_next_obs

    def mdrnn_exp_reward(self, latent_obs, action, reward, latent_next_obs,
                         hidden):
        """  # REMOVE TERMINAL

        Compute losses.

        The loss that is computed is:
        (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward) +
             BCE(terminal, logit_terminal)) / (LSIZE + 2)
        The LSIZE + 2 factor is here to counteract the fact that the GMMLoss scales
        approximately linearily with LSIZE. All losses are averaged both on the
        batch and the sequence dimensions (the two first dimensions).

        :args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
        :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor
        :args reward: (BSIZE, SEQ_LEN) torch tensor
        :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor

        :returns: dictionary of losses, containing the gmm, the mse, the bce and
            the averaged loss.
        """

        mus, sigmas, logpi, rs, ds, next_hidden = self.mdrnn(
            action, latent_obs, hidden)
        gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
        # bce = f.binary_cross_entropy_with_logits(ds, terminal)
        mse = f.mse_loss(rs, reward)
        loss = (gmm + mse) / (LSIZE + 2)
        return loss.squeeze().cpu().numpy()

    # def recon_error_reward(self, obs, hidden, obs_new):
    #     print('recon_error_reward')
    #     """Find out how good the reconstruction was.
    #     Encoding the vae to get mu and the controller action is deterministic, so its fine to be duplicated
    #     ??? maybe remove this and the above function because of unnecessary duplication
    #     """
    #     # obs_new = torch.from_numpy(np.moveaxis(obs_new, 2, 0).copy()).unsqueeze(0).to(self.device).type(torch.cuda.FloatTensor)
    #     # obs = obs.to(self.device).type(torch.cuda.FloatTensor)

    #     _, latent_mu, _ = self.vae(obs)
    #     action = self.controller(latent_mu, hidden[0])

    #     mus, sigmas, logpi, r, d, next_hidden = self.mdrnn(action, latent_mu, hidden)
    #     print('mus.size()', mus.size())
    #     print('sigmas.size()', sigmas.size())
    #     print('logpi.size()', logpi.size())
    #     print('r.size()', r.size())
    #     print('d.size()', d.size())
    #     print('next_hidden.size() [0], [1]', next_hidden[0].size(), next_hidden[1].size())

    #     recon_x = self.vae.decoder(mus.squeeze()).type(torch.cuda.FloatTensor) # ??? this is just mu, right? Still a bit confused
    #     print('obs_new.size()', obs_new.size())
    #     print('recon_x.size()', recon_x.size())

    #     # reward = -1*((recon_x - obs_new) ** 2).mean()
    #     reward = -1*F.mse_loss(recon_x, obs_new).item()

    def rollout(self, params, render=False):
        """ Execute a rollout and return reward

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array

        :returns: minus the cumulative environment reward in ctrl mode; minus
            the cumulative MDRNN prediction error in exp (explorer) mode
        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        obs = self.env.reset()

        # This first render is required !
        self.env.render()

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        cumulative = 0
        i = 0
        while True:
            obs = transform(obs).unsqueeze(0).to(self.device)

            # GET ACTION
            _, latent_mu, _ = self.vae(obs)
            action = self.controller(latent_mu, hidden[0])
            _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
            action = action.squeeze().cpu().numpy()

            next_obs, reward, done, _ = self.env.step(action)

            if self.explorer:
                latent_obs, latent_next_obs = self.to_latent(
                    obs.unsqueeze(0),
                    transform(next_obs).unsqueeze(0).to(self.device))
                action = torch.from_numpy(action).unsqueeze(0)
                latent_obs = latent_obs.to(self.device).squeeze().unsqueeze(0)
                latent_next_obs = latent_next_obs.to(
                    self.device).squeeze().unsqueeze(0)
                action = action.to(self.device)
                reward = torch.from_numpy(np.array(reward)).unsqueeze(0).type(
                    torch.cuda.FloatTensor)
                reward = self.mdrnn_exp_reward(latent_obs, action, reward,
                                               latent_next_obs, hidden)

            obs = next_obs
            hidden = next_hidden

            if render:
                self.env.render()

            cumulative += reward
            if done or i > self.time_limit:
                return -cumulative
            i += 1
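
A minimal usage sketch of this class (the directory name and time limit are
hypothetical; `mdir` is assumed to contain the usual vae/, mdrnn/, ctrl/ and, for
the explorer, exp/ subdirectories with best.tar checkpoints):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator = RolloutGenerator("exp_dir", device, time_limit=1000, explorer=True)
# params=None keeps the controller weights loaded from disk; passing a flat 1D
# numpy array (e.g. a CMA-ES candidate) overwrites them via load_parameters.
score = generator.rollout(params=None, render=False)
print(score)  # minus the cumulative MDRNN prediction error in explorer mode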
Ejemplo n.º 19
0
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNN model loaded from mdir/mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl or randomly
        initialized
    :attr env: instance of the CarRacing-v0 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    """
    def __init__(self, mdir, device, time_limit):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v
             for k, v in rnn_state['state_dict'].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

    def get_action_and_transition(self, obs, hidden):
        """ Get action and transition.

        Encode obs to latent using the VAE, then obtain estimation for next
        latent and next hidden state using the MDRNN and compute the controller
        corresponding action.

        :args obs: current observation (1 x 3 x 64 x 64) torch tensor
        :args hidden: current hidden state (1 x 256) torch tensor

        :returns: (action, next_hidden)
            - action: 1D np array
            - next_hidden (1 x 256) torch tensor
        """
        _, latent_mu, _ = self.vae(obs)
        action = self.controller(latent_mu, hidden[0])
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        return action.squeeze().cpu().numpy(), next_hidden

    def rollout(self, params, render=False):
        """ Execute a rollout and returns minus cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array

        :returns: minus cumulative reward
        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        obs = self.env.reset()

        # This first render is required !
        self.env.render()

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        cumulative = 0
        i = 0
        while True:
            obs = transform(obs).unsqueeze(0).to(self.device)
            action, hidden = self.get_action_and_transition(obs, hidden)
            obs, reward, done, _ = self.env.step(action)

            if render:
                self.env.render()

            cumulative += reward
            if done or i > self.time_limit:
                return -cumulative
            i += 1
Ejemplo n.º 20
0
def train_C_given_M(mdrnnCell, latent_dim, hidden_dim, action_dim):

    # Parameters
    num_episode = 1
    batch_size = 1
    learning_rate = 0.01
    gamma = 0.99
    done_threshold = np.log(0.5)

    interim_policy = Controller(latent_dim, hidden_dim, action_dim)
    optimizer = torch.optim.RMSprop(interim_policy.parameters(),
                                    lr=learning_rate)

    # Batch History
    state_pool = []
    action_pool = []
    reward_pool = []
    steps = 0

    for e in range(num_episode):

        # initial latent and hidden states
        z_t = torch.randn(1, LSIZE)
        h_t = 2 * [torch.zeros(1, RSIZE)]

        for t in range(1000):

            # pick action using policy net given z_t, h_t
            mean_a_t = interim_policy(z_t, h_t[0])
            action_policy_std = 0.1
            cov = action_policy_std * torch.eye(action_dim)
            stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                   covariance_matrix=cov)
            a_t = stochastic_policy.sample()

            mu, sigma, pi, r, d, n_h = mdrnnCell(a_t, z_t, h_t)
            # sample next z_t from N(mu, sigma)
            pi = pi.squeeze()
            mixt = Categorical(torch.exp(pi)).sample().item()

            z_t = mu[:,
                     mixt, :]  # + sigma[:, mixt, :] * torch.randn_like(mu[:, mixt, :])
            h_t = n_h

            reward = -0.1
            if d >= done_threshold:
                done = True
            else:
                done = False

            state_pool.append((z_t, h_t))
            action_pool.append(a_t)
            reward_pool.append(reward)

            steps += 1
            if done:
                break

        # Update policy
        if e > 0 and e % batch_size == 0:

            # Discount reward
            running_add = 0
            for i in reversed(range(steps)):
                if reward_pool[i] == 0:
                    running_add = 0
                else:
                    running_add = running_add * gamma + reward_pool[i]
                    reward_pool[i] = running_add

            # Normalize reward
            reward_mean = np.mean(reward_pool)
            reward_std = np.std(reward_pool)
            for i in range(steps):
                reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

            # Gradient Descent
            optimizer.zero_grad()

            for i in range(steps):
                z_t, h_t = state_pool[i]
                action = action_pool[i]
                reward = reward_pool[i]

                mean_a_t = interim_policy(z_t, h_t[0])
                action_policy_std = 0.1
                cov = action_policy_std * torch.eye(action_dim)
                stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                       covariance_matrix=cov)
                # negative score function x reward (REINFORCE)
                loss = -stochastic_policy.log_prob(action) * reward
                # retain_graph=True is needed because the stored z_t / h_t all
                # belong to the single autograd graph built during the rollout;
                # without it, that graph would be freed after the first backward.
                # (Detaching the states when storing them would avoid this.)
                loss.backward(retain_graph=True)
                optimizer.step()

            state_pool = []
            action_pool = []
            reward_pool = []
            steps = 0

    return interim_policy
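
A minimal usage sketch (the MDRNNCell construction mirrors the other examples in
this document; loading its trained weights is assumed to have happened already):

mdrnn_cell = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
policy = train_C_given_M(mdrnn_cell, latent_dim=LSIZE, hidden_dim=RSIZE,
                         action_dim=ASIZE)
# `policy` is a Controller trained entirely inside the learned model ("dream"),
# via REINFORCE with a fixed-std Gaussian policy around the controller output.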
Ejemplo n.º 22
0
def ctrl_exp_gen_data(rollouts,
                      datadir,
                      logdir,
                      noise_type,
                      device,
                      use_ctrl_exp,
                      exp_prob=.5,
                      randomness_factor=.1):
    """ 
    randomness factor is the multiple we will multiply the current standard deviation by to get the std for the normal disnt std.
    This help because it is more resonable. Really should be updating based on parameter distances over updates, but whatever.
    ** read the openai parameter thing

    Uses fixed parameters for vae and mdrnn, but maybe change

    All the if use_ctrl_exp: should be switched to having the random thing inside a module, or at least consistent with the explorer way.
    """
    assert exists(logdir), "The directory does not exist..."
    exp_prob = float(exp_prob)

    env = gym.make("CarRacing-v0")
    seq_len = 1000

    if use_ctrl_exp:
        a_rollout = []

        #### Load controller and explorer
        ctrl_file = join(logdir, 'ctrl', 'best.tar')
        exp_file = join(logdir, 'exp', 'best.tar')

        controller = Controller(LSIZE, RSIZE, ASIZE).to(device)
        explorer = Controller(LSIZE, RSIZE, ASIZE).to(device)

        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            controller.load_state_dict(ctrl_state['state_dict'])

        if exists(exp_file):
            exp_state = torch.load(exp_file,
                                   map_location={'cuda:0': str(device)})
            print("Loading Explorer with reward {}".format(
                exp_state['reward']))
            explorer.load_state_dict(exp_state['state_dict'])

        # Make the generators (this is redundant and should be organized some other way)
        ctrl_gen = RolloutGeneratorSingle(logdir, device, controller)
        exp_gen = RolloutGeneratorSingle(logdir, device, explorer)

        # for parameter noise exploration
        def update_params_noise(model, randomness_factor):
            import copy  # local import, since the snippet's header is not shown

            # estimate the spread of the current parameters
            flat = np.concatenate(
                [p.detach().cpu().numpy().ravel() for p in model.parameters()])
            std = np.std(flat)
            print('Parameter mean: ', np.mean(flat))
            print('Parameter std: ', std)

            # perturb a copy of the model with Gaussian noise scaled by that spread
            model_new = copy.deepcopy(model)
            with torch.no_grad():
                for param in model_new.parameters():
                    param.add_(torch.randn_like(param) * std * randomness_factor)
            return model_new

    for i in range(rollouts):
        obs = env.reset()
        env.env.viewer.window.dispatch_events()

        s_rollout = []
        r_rollout = []
        d_rollout = []

        if use_ctrl_exp:
            # reset the per-rollout action log
            a_rollout = []

            # randomize the explorer and controller (parameter-space noise) and
            # rebuild the generators around the perturbed copies
            explorer_new = update_params_noise(explorer, randomness_factor)
            controller_new = update_params_noise(controller, randomness_factor)
            ctrl_gen = RolloutGeneratorSingle(logdir, device, controller_new)
            exp_gen = RolloutGeneratorSingle(logdir, device, explorer_new)

            # initialize the hidden state for the model:
            hidden = [torch.zeros(1, RSIZE).to(device) for _ in range(2)]

        else:
            if noise_type == 'white':
                a_rollout = [env.action_space.sample() for _ in range(seq_len)]
            elif noise_type == 'brown':
                a_rollout = sample_continuous_policy(env.action_space, seq_len,
                                                     1. / 50)

        t = 0
        while True:

            if use_ctrl_exp:
                # explore or exploit:
                if random.uniform(0, 1) < exp_prob:
                    action, obs, hidden = ctrl_gen(obs, hidden)
                else:
                    action, obs, hidden = exp_gen(obs, hidden)
                a_rollout.append(action)
            else:
                action = a_rollout[t]

            t += 1
            s, r, done, _ = env.step(action)
            env.env.viewer.window.dispatch_events()
            s_rollout += [s]
            r_rollout += [r]
            d_rollout += [done]
            if done:
                print("> End of rollout {}, {} frames...".format(
                    i, len(s_rollout)))
                np.savez(join(datadir, 'rollout_{}'.format(i)),
                         observations=np.array(s_rollout),
                         rewards=np.array(r_rollout),
                         actions=np.array(a_rollout),
                         terminals=np.array(d_rollout))
                break