import time

import numpy as np

# ALPGMM import path as used in the other examples below; gmm_plot_gif is a
# project-specific plotting helper whose module path is not shown in these excerpts.
from teachDRL.teachers.algos.alp_gmm import ALPGMM


def test_alpgmm(env,
                nb_episodes,
                gif=True,
                nb_dims=2,
                score_step=1000,
                verbose=True,
                params={}):
    # Init teacher
    task_generator = ALPGMM([0] * nb_dims, [1] * nb_dims, params=params)

    # Init book keeping
    rewards = []
    scores = []
    bk = {
        'weights': [],
        'covariances': [],
        'means': [],
        'tasks_lps': [],
        'episodes': [],
        'comp_grids': [],
        'comp_xs': [],
        'comp_ys': []
    }

    # Launch run
    for i in range(nb_episodes + 1):
        if (i % score_step) == 0:
            scores.append(env.get_score())
            if nb_dims == 2:
                if verbose:
                    print(env.cube_competence)
            else:
                if verbose:
                    print("it:{}, score:{}".format(i, scores[-1]))

        # Book keeping if ALP-GMM updated its GMM
        if i > 100 and (i % task_generator.fit_rate) == 0 and (gif is True):
            bk['weights'].append(task_generator.gmm.weights_.copy())
            bk['covariances'].append(task_generator.gmm.covariances_.copy())
            bk['means'].append(task_generator.gmm.means_.copy())
            bk['tasks_lps'] = task_generator.tasks_alps
            bk['episodes'].append(i)
            if nb_dims == 2:
                bk['comp_grids'].append(env.cube_competence.copy())
                bk['comp_xs'].append(env.bnds[0].copy())
                bk['comp_ys'].append(env.bnds[1].copy())

        task = task_generator.sample_task()
        reward = env.episode(task)
        task_generator.update(np.array(task), reward)
        rewards.append(reward)

    if gif and nb_dims == 2:
        print('Creating gif...')
        gmm_plot_gif(bk,
                     gifname='alpgmm_' + str(time.time()),
                     gifdir='toy_env_gifs/')
        print('Done (see graphics/toy_env_gifs/ folder)')
    return scores
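The snippet above assumes a toy environment object; below is a minimal, hypothetical stub for smoke-testing the loop (not part of the original code), implementing only the interface test_alpgmm relies on: episode(task), get_score() and, for the 2D case, cube_competence and bnds.

class DummyToyEnv:
    """Hypothetical stand-in for the toy env expected by test_alpgmm (assumed interface)."""

    def __init__(self, nb_cells=10):
        # 2D competence grid and per-dimension bin edges, mirroring the attributes read above
        self.cube_competence = np.zeros((nb_cells, nb_cells))
        self.bnds = [np.linspace(0, 1, nb_cells + 1) for _ in range(2)]

    def episode(self, task):
        # toy reward: pretend tasks closer to the origin are easier
        return float(1.0 - np.mean(task))

    def get_score(self):
        return float(self.cube_competence.mean())

# scores = test_alpgmm(DummyToyEnv(), nb_episodes=2000, gif=False, verbose=False)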
Example #2
    def __init__(self, mins, maxs, seed=None, params=dict()):
        self.seed = seed
        if not seed:
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        # Task space boundaries
        self.mins = np.array(mins)
        self.maxs = np.array(maxs)

        self.use_alpgmm = params.get('use_alpgmm', False)
        #self.decorelate_alpgmm = False if "decorelate_alpgmm" not in params else params['decorelate_alpgmm']

        self.nb_alpgmm_gaussians = None
        if self.use_alpgmm:
            print("Using ALP-GMM with EGT")
            self.alpgmm = ALPGMM(mins, maxs, seed=seed, params=params)
            self.is_new_alpgmm = False  # boolean used to track alpgmm's periodic updates
            self.random_task_ratio = params.get('random_task_ratio', 0.02)
            self.sampled_gaussian_idx = None

            self.stop_R = params.get('stop_R', False)
            self.nb_eps_after_R = 0

        assert 'expert_gmms' in params
        self.expert_means, self.expert_covs, self.expert_mean_rewards = params['expert_gmms']
        self.expert_type = params.get('expert_type', 'P')
        self.r_list_len = params.get('r_list_len', 50)
        self.tol_ratio = params.get('tol_ratio', 1.0)

        if self.expert_type == 'R':
            self.reward_list = deque(maxlen=self.r_list_len)

        self.expert_idx = -1
        self.episode_nb = 0
        self.current_means = None
        self.current_covs = None
        self.current_mean_r = None

        # Boring book-keeping
        self._update()
        self.bk = {
            'egt_covariances': [self.current_covs.copy()],
            'egt_means': [self.current_means.copy()],
            'egt_episodes': [self.episode_nb],
            'egt_tasks_origin': [],
            'egt_nb_alpgmm_gaussians': [self.nb_alpgmm_gaussians],
            'egt_expert_idx': [self.expert_idx]
        }
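A hedged sketch of the 'expert_gmms' entry required above, inferred from how it is unpacked and later consumed (GMM means appear to carry one extra ALP dimension); the shapes follow a 2D task space and the numbers are purely illustrative.

import numpy as np

# Two expert checkpoints, each with a single 3D Gaussian (2 task dims + 1 assumed ALP dim).
dummy_expert_gmms = (
    [[np.array([0.2, 0.2, 0.5])], [np.array([0.7, 0.7, 0.5])]],  # per-checkpoint means
    [[np.eye(3) * 0.05], [np.eye(3) * 0.05]],                    # per-checkpoint covariances
    [0.1, 0.4],                                                  # per-checkpoint mean reward
)
# teacher = EGT([0, 0], [1, 1], params={'expert_gmms': dummy_expert_gmms, 'expert_type': 'R'})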
Example #3
    def __init__(self, env, **kwargs):

        from teachDRL.teachers.algos.alp_gmm import ALPGMM

        super(ALPGMMTeacher, self).__init__(env)
        self.cond_bounds = self.env.unwrapped.cond_bounds
        self.midep_trgs = False
        env_param_lw_bounds = [self.cond_bounds[k][0] for k in self.usable_metrics]
        env_param_hi_bounds = [self.cond_bounds[k][1] for k in self.usable_metrics]
        self.alp_gmm = ALPGMM(env_param_lw_bounds, env_param_hi_bounds)
        self.trg_vec = None
        self.trial_reward = 0
        self.n_trial_steps = 0
Example #4
    def __init__(self):
        # have to do above before call to parent to initialize Evaluator correctly
        super(Teacher, self).__init__()
        # dictionary of param names to target histories as set by alp_gmm
        self.param_hist = {}
        envs = self.envs
        args = self.args
        env_param_bounds = envs.get_param_bounds()
        # in case we want to change this dynamically in the future (e.g., we may
        # not know how much traffic the agent can possibly produce in Micropolis)
        envs.set_param_bounds(env_param_bounds) # start with default bounds
        num_env_params = 4
        env_param_ranges = []
        env_param_lw_bounds = []
        env_param_hi_bounds = []
        i = 0
        for k, v in env_param_bounds.items():
            if i < num_env_params:
                env_param_ranges += [abs(v[1] - v[0])]
                env_param_lw_bounds += [v[0]]
                env_param_hi_bounds += [v[1]]
                i += 1
            else:
                break
        alp_gmm = None
        if self.checkpoint:
            alp_gmm = self.checkpoint['alp_gmm']
        if alp_gmm is None:
            alp_gmm = ALPGMM(env_param_lw_bounds, env_param_hi_bounds)
        params_vec = alp_gmm.sample_task()
        self.alp_gmm = alp_gmm

        params = OrderedDict()
        print('\n env_param_bounds', env_param_bounds)
        print(params_vec)
        trial_remaining = args.max_step
        trial_reward = 0

        self.env_param_bounds = env_param_bounds
        self.num_env_params = num_env_params
        self.env_param_ranges = env_param_ranges
        self.params_vec = params_vec
        self.params = params
        self.trial_remaining = args.max_step
        self.trial_reward = trial_reward
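The loop that consumes these attributes is not part of this excerpt; below is a hedged sketch of a per-update step for the class above, mirroring the trial logic of main() further down (set_params on self.envs is an assumption borrowed from the envs.venv.venv.set_params pattern there).

    def end_of_update(self, num_steps):
        # Hypothetical helper, not in the original code: once a trial's step budget is
        # spent, report its reward to ALP-GMM and sample a new parameter vector.
        self.trial_remaining -= num_steps
        if self.trial_remaining <= 0:
            self.alp_gmm.update(self.params_vec, self.trial_reward)
            self.trial_reward = 0
            self.trial_remaining = self.args.max_step
            self.params_vec = self.alp_gmm.sample_task()
            for i, k in enumerate(self.env_param_bounds.keys()):
                if i >= self.num_env_params:
                    break
                self.params[k] = self.params_vec[i]
            self.envs.set_params(self.params)  # assumed setter on the vectorized envs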
Example #5
class ALPGMMTeacher(gym.Wrapper):
    def __init__(self, env, **kwargs):

        from teachDRL.teachers.algos.alp_gmm import ALPGMM

        super(ALPGMMTeacher, self).__init__(env)
        self.cond_bounds = self.env.unwrapped.cond_bounds
        self.midep_trgs = False
        env_param_lw_bounds = [self.cond_bounds[k][0] for k in self.usable_metrics]
        env_param_hi_bounds = [self.cond_bounds[k][1] for k in self.usable_metrics]
        self.alp_gmm = ALPGMM(env_param_lw_bounds, env_param_hi_bounds)
        self.trg_vec = None
        self.trial_reward = 0
        self.n_trial_steps = 0

    def reset(self):
        if self.trg_vec is not None:
            if self.n_trial_steps == 0:
                # Edge case: happens when we reset manually from the inference script
                # before any environment steps were taken.
                rew = 0
            else:
                rew = self.trial_reward / self.n_trial_steps
            self.alp_gmm.update(self.trg_vec, rew)
        trg_vec = self.alp_gmm.sample_task()
        self.trg_vec = trg_vec
        trgs = {k: trg_vec[i] for (i, k) in enumerate(self.usable_metrics)}
        #       print(trgs)
        self.set_trgs(trgs)
        self.trial_reward = 0
        self.n_trial_steps = 0

        return self.env.reset()

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        self.trial_reward += rew
        self.n_trial_steps += 1

        return obs, rew, done, info
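A hedged usage sketch for the wrapper above (commented out because it depends on names outside this excerpt): the wrapped env must expose unwrapped.cond_bounds and set_trgs(), and usable_metrics must come from another wrapper or subclass since it is not defined here; make_conditional_env is a hypothetical constructor for such an env.

# env = ALPGMMTeacher(make_conditional_env())     # hypothetical env constructor
# obs = env.reset()                               # samples a target vector via ALP-GMM and applies it
# for _ in range(1000):
#     obs, rew, done, info = env.step(env.action_space.sample())
#     if done:
#         # reset() feeds the trial's mean per-step reward back to ALP-GMM
#         # and samples new targets before resetting the wrapped env
#         obs = env.reset()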
def main():
    import random
    import gym_micropolis
    import game_of_life

    args = get_args()
    args.log_dir = args.save_dir + '/logs'
    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'
    args.poet = True  # hacky

    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    graph_name = args.save_dir.split('trained_models/')[1].replace('/', ' ')

    actor_critic = False
    agent = False
    past_steps = 0
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        if args.overwrite:
            for f in files:
                os.remove(f)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None
    if 'GameOfLife' in args.env_name:
        print('env name: {}'.format(args.env_name))
        num_actions = 1
    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         args.add_timestep,
                         device,
                         False,
                         None,
                         args=args)

    if isinstance(envs.observation_space, gym.spaces.Discrete):
        num_inputs = envs.observation_space.n
    elif isinstance(envs.observation_space, gym.spaces.Box):
        if len(envs.observation_space.shape) == 3:
            in_w = envs.observation_space.shape[1]
            in_h = envs.observation_space.shape[2]
        else:
            in_w = 1
            in_h = 1
        num_inputs = envs.observation_space.shape[0]
    if isinstance(envs.action_space, gym.spaces.Discrete):
        out_w = 1
        out_h = 1
        if 'Micropolis' in args.env_name:  #otherwise it's set
            if args.power_puzzle:
                num_actions = 1
            else:
                num_actions = 19  # TODO: have this already from env
        elif 'GameOfLife' in args.env_name:
            num_actions = 1
        else:
            num_actions = envs.action_space.n
    elif isinstance(envs.action_space, gym.spaces.Box):
        if len(envs.action_space.shape) == 3:
            out_w = envs.action_space.shape[1]
            out_h = envs.action_space.shape[2]
        elif len(envs.action_space.shape) == 1:
            out_w = 1
            out_h = 1
        num_actions = envs.action_space.shape[-1]
    print('num actions {}'.format(num_actions))

    if args.auto_expand:
        args.n_recs -= 1
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'map_width': args.map_width,
                              'num_actions': num_actions,
                              'recurrent': args.recurrent_policy,
                              'prebuild': args.prebuild,
                              'in_w': in_w,
                              'in_h': in_h,
                              'num_inputs': num_inputs,
                              'out_w': out_w,
                              'out_h': out_h
                          },
                          curiosity=args.curiosity,
                          algo=args.algo,
                          model=args.model,
                          args=args)
    if args.auto_expand:
        args.n_recs += 1

    evaluator = None

    if not agent:
        agent = init_agent(actor_critic, args)

#saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    if args.load_dir:
        saved_model = os.path.join(args.load_dir, args.env_name + '.tar')
    else:
        saved_model = os.path.join(args.save_dir, args.env_name + '.tar')
    vec_norm = get_vec_normalize(envs)
    alp_gmm = None
    if os.path.exists(saved_model) and not args.overwrite:
        checkpoint = torch.load(saved_model)
        saved_args = checkpoint['args']
        actor_critic.load_state_dict(checkpoint['model_state_dict'])
        actor_critic.to(device)  # already covers the CUDA case when args.cuda is set
        #agent = init_agent(actor_critic, saved_args)
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if args.auto_expand:
            if args.n_recs - saved_args.n_recs != 1:
                raise Exception(
                    'can only expand by 1 rec from the saved model, not {}'.format(
                        args.n_recs - saved_args.n_recs))
            actor_critic.base.auto_expand()
            print('expanded net: \n{}'.format(actor_critic.base))
        past_steps = checkpoint['past_steps']
        ob_rms = checkpoint['ob_rms']

        past_steps = next(iter(
            agent.optimizer.state_dict()['state'].values()))['step']
        print('Resuming from step {}'.format(past_steps))

        #print(type(next(iter((torch.load(saved_model))))))
        #actor_critic, ob_rms = \
        #        torch.load(saved_model)
        #agent = \
        #    torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
        #if not agent.optimizer.state_dict()['state'].values():
        #    past_steps = 0
        #else:

        #    raise Exception
        alp_gmm = checkpoint['alp_gmm']

        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
        saved_args.num_frames = args.num_frames
        saved_args.vis_interval = args.vis_interval
        saved_args.eval_interval = args.eval_interval
        saved_args.overwrite = args.overwrite
        saved_args.n_recs = args.n_recs
        saved_args.intra_shr = args.intra_shr
        saved_args.inter_shr = args.inter_shr
        saved_args.map_width = args.map_width
        saved_args.render = args.render
        saved_args.print_map = args.print_map
        saved_args.load_dir = args.load_dir
        saved_args.experiment_name = args.experiment_name
        saved_args.log_dir = args.log_dir
        saved_args.save_dir = args.save_dir
        saved_args.num_processes = args.num_processes
        saved_args.n_chan = args.n_chan
        saved_args.prebuild = args.prebuild
        args = saved_args
    actor_critic.to(device)

    if 'LSTM' in args.model:
        recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size()
    else:
        recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size
    if args.curiosity:
        rollouts = CuriosityRolloutStorage(
            args.num_steps,
            args.num_processes,
            envs.observation_space.shape,
            envs.action_space,
            recurrent_hidden_state_size,
            actor_critic.base.feature_state_size(),
            args=args)
    else:
        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  recurrent_hidden_state_size,
                                  args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    model = actor_critic.base
    reset_eval = False
    plotter = None
    if args.model == 'FractalNet' or args.model == 'fractal':
        n_cols = model.n_cols
        if args.rule == 'wide1' and args.n_recs > 3:
            col_step = 3
        else:
            col_step = 1
    else:
        n_cols = 0
        col_step = 1
    env_param_bounds = envs.venv.venv.get_param_bounds()
    envs.venv.venv.set_param_ranges(env_param_bounds)
    num_env_params = len(env_param_bounds)
    env_param_ranges = [abs(v[1] - v[0]) for k, v in env_param_bounds.items()]
    env_param_lw_bounds = [v[0] for k, v in env_param_bounds.items()]
    env_param_hi_bounds = [v[1] for k, v in env_param_bounds.items()]
    if alp_gmm is None:
        alp_gmm = ALPGMM(env_param_lw_bounds, env_param_hi_bounds)
    params_vec = alp_gmm.sample_task()
    params = OrderedDict()
    print('\n env_param_bounds', env_param_bounds)
    print(params_vec)
    trial_remaining = args.max_step
    trial_reward = 0
    for j in range(past_steps, num_updates):
        if trial_remaining == 0:
            trial_reward = trial_reward / args.num_processes
            alp_gmm.update(params_vec, trial_reward)
            trial_reward = 0
            trial_remaining = args.max_step
            # sample new environment parameters from ALP-GMM
            params_vec = alp_gmm.sample_task()
            prm_i = 0
            for k, v in env_param_bounds.items():
                params[k] = params_vec[prm_i]
                prm_i += 1
            envs.venv.venv.set_params(params)
        trial_remaining -= args.num_steps
        if reset_eval:
            print('post eval reset')
            obs = envs.reset()
            rollouts.obs[0].copy_(obs)
            rollouts.to(device)
            reset_eval = False
    #if np.random.rand(1) < 0.1:
    #    envs.venv.venv.remotes[1].send(('setRewardWeights', None))
        if args.model == 'FractalNet' and args.drop_path:
            #if args.intra_shr and args.inter_shr:
            #    n_recs = np.randint
            #    model.set_n_recs()
            model.set_drop_path()
        if args.model == 'fixed' and model.RAND:
            model.num_recursions = random.randint(1, model.map_width * 2)
        player_act = None
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                if args.render:
                    if args.num_processes == 1:
                        if not ('Micropolis' in args.env_name
                                or 'GameOfLife' in args.env_name):
                            envs.venv.venv.render()
                        else:
                            pass
                    else:
                        if not ('Micropolis' in args.env_name
                                or 'GameOfLife' in args.env_name):
                            envs.render()
                            envs.venv.venv.render()
                        else:
                            pass
                        #envs.venv.venv.remotes[0].send(('render', None))
                        #envs.venv.venv.remotes[0].recv()
                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    player_act=player_act,
                    icm_enabled=args.curiosity,
                    deterministic=False)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            player_act = None
            if args.render:
                if infos[0] and 'player_move' in infos[0]:
                    player_act = infos[0]['player_move']
            if args.curiosity:
                # run icm
                with torch.no_grad():

                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                        (rollouts.obs[step], obs, action_bin))

                intrinsic_reward = args.eta * (
                    (feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()

            for info in infos:
                if 'episode' in info.keys():
                    epi_reward = info['episode']['r']
                    episode_rewards.append(epi_reward)
                    trial_reward += epi_reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin,
                                action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(
                rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \
dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".format(
                    j, total_num_steps,
                    int((total_num_steps -
                         past_steps * args.num_processes * args.num_steps) /
                        (end - start)), len(episode_rewards),
                    np.mean(episode_rewards), np.median(episode_rewards),
                    np.min(episode_rewards), np.max(episode_rewards),
                    dist_entropy, value_loss, action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".format(
                    fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args,
                                      actor_critic,
                                      device,
                                      envs=envs,
                                      vec_norm=vec_norm)

            model = evaluator.actor_critic.base

            col_idx = [-1, *range(0, n_cols, col_step)]
            for i in col_idx:
                evaluator.evaluate(column=i)
        #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes *  args.max_step
        # making sure the evaluator plots the '-1'st column (the overall net)

            if args.vis:  #and j % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win_eval = evaluator.plotter.visdom_plot(
                        viz,
                        win_eval,
                        evaluator.eval_log_dir,
                        graph_name,
                        args.algo,
                        args.num_frames,
                        n_graphs=col_idx)
                except IOError:
                    pass
        #elif args.model == 'fixed' and model.RAND:
        #    for i in model.eval_recs:
        #        evaluator.evaluate(num_recursions=i)
        #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
        #                           args.algo, args.num_frames, n_graphs=model.eval_recs)
        #else:
        #    evaluator.evaluate(column=-1)
        #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
        #                  args.algo, args.num_frames)
            reset_eval = True

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None)
            save_model = copy.deepcopy(actor_critic)
            save_agent = copy.deepcopy(agent)
            if args.cuda:
                save_model.cpu()
            optim_save = save_agent.optimizer.state_dict()

            # experimental:
            torch.save(
                {
                    'past_steps': next(iter(
                        agent.optimizer.state_dict()['state'].values()))['step'],
                    'model_state_dict': save_model.state_dict(),
                    'optimizer_state_dict': optim_save,
                    'ob_rms': ob_rms,
                    'args': args,
                    'alp_gmm': alp_gmm
                }, os.path.join(save_path, args.env_name + ".tar"))

        #save_model = [save_model,
        #              getattr(get_vec_normalize(envs), 'ob_rms', None)]

        #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
        #save_agent = copy.deepcopy(agent)

        #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))
        #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt"))

        if args.vis and j % args.vis_interval == 0:
            if plotter is None:
                plotter = Plotter(n_cols, args.log_dir, args.num_processes)
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = plotter.visdom_plot(viz, win, args.log_dir, graph_name,
                                          args.algo, args.num_frames)
            except IOError:
                pass
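For completeness, a hedged sketch of reloading the checkpoint that main() writes above (the keys match the torch.save call; the function name and map_location choice are assumptions).

def load_training_checkpoint(path, actor_critic, agent, device='cpu'):
    # Keys mirror the dict saved in main(): past_steps, model_state_dict,
    # optimizer_state_dict, ob_rms, args and alp_gmm.
    checkpoint = torch.load(path, map_location=device)
    actor_critic.load_state_dict(checkpoint['model_state_dict'])
    agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return (checkpoint['past_steps'], checkpoint['ob_rms'],
            checkpoint['args'], checkpoint['alp_gmm'])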
    def __init__(self,
                 teacher,
                 nb_test_episodes,
                 param_env_bounds,
                 seed=None,
                 teacher_params={},
                 custom_test_param_vec=None):
        self.teacher = teacher

        self.nb_custom_tests = 0
        if custom_test_param_vec is not None:
            self.nb_custom_tests = len(custom_test_param_vec)
            custom_test_param_dicts = [
                param_vec_to_param_dict(param_env_bounds, vec)
                for vec in custom_test_param_vec
            ]
            self.custom_test_env_list = custom_test_param_dicts
            print('number of generated custom tests {}'.format(
                len(self.custom_test_env_list)))

        self.nb_test_episodes = nb_test_episodes
        self.test_ep_counter = 0
        self.eps = 1e-03
        self.param_env_bounds = copy.deepcopy(param_env_bounds)

        # figure out parameters boundaries vectors
        mins, maxs = [], []
        for name, bounds in param_env_bounds.items():
            if len(bounds) == 2:
                mins.append(bounds[0])
                maxs.append(bounds[1])
            # the third value is the number of dimensions sharing these bounds
            elif len(bounds) == 3:
                mins.extend([bounds[0]] * bounds[2])
                maxs.extend([bounds[1]] * bounds[2])
            else:
                print(
                    "ill-defined boundaries: use the [min, max, nb_dims] format, or [min, max] if nb_dims=1"
                )
                exit(1)
        self.task_dim = len(mins)

        # setup tasks generator
        if teacher == 'Oracle':
            self.task_generator = GaussianOracleTeacher(
                mins, maxs, teacher_params['window_step_vector'], seed=seed)
        elif teacher == 'Random':
            self.task_generator = RandomTeacher(mins, maxs, seed=seed)
        elif teacher == 'RIAC':
            self.task_generator = RIAC(mins,
                                       maxs,
                                       seed=seed,
                                       params=teacher_params)
        elif teacher == 'ALP-GMM':
            self.task_generator = ALPGMM(mins,
                                         maxs,
                                         seed=seed,
                                         params=teacher_params)
        elif teacher == 'Covar-GMM':
            self.task_generator = CovarGMM(mins,
                                           maxs,
                                           seed=seed,
                                           params=teacher_params)
        elif teacher == 'EGT':
            self.task_generator = EGT(mins,
                                      maxs,
                                      seed=seed,
                                      params=teacher_params)
        elif teacher == 'AGAIN':
            self.task_generator = AGAIN(mins,
                                        maxs,
                                        seed=seed,
                                        params=teacher_params)
        elif teacher == 'ADR':
            self.task_generator = ADR(mins,
                                      maxs,
                                      seed=seed,
                                      params=teacher_params)
        else:
            print('Unknown teacher')
            raise NotImplementedError

        self.test_mode = None
        if self.task_dim == 2:
            self.test_mode = "uniform"  # "fixed_set"
        else:
            self.test_mode = "fixed_set"

        test_param_vec = None
        if self.test_mode == "fixed_set":  # WARNING only works for hexagon env
            test_param_vec = np.array(
                pickle.load(
                    open("teachDRL/teachers/test_sets/hexagon_test_set.pkl",
                         "rb")))
            #name = get_test_set_name(self.param_env_bounds)
            print('fixed set of {} tasks loaded'.format(len(test_param_vec)))
            #self.test_env_list = pickle.load( open("teachDRL/teachers/test_sets/"+name+".pkl", "rb" ) )
            #print('fixed set of {} tasks loaded: {}'.format(len(self.test_env_list),name))
        elif self.test_mode == "uniform":
            # select <nb_test_episodes> parameters chosen uniformly from the task space
            nb_steps = int(nb_test_episodes**(1 / self.task_dim))
            print(maxs[0])
            d1 = np.linspace(mins[0], maxs[0], nb_steps, endpoint=True)
            d2 = np.linspace(mins[1], maxs[1], nb_steps, endpoint=True)
            test_param_vec = np.transpose(
                [np.tile(d1, len(d2)),
                 np.repeat(d2, len(d1))])  # cartesian product
        test_param_dicts = [
            param_vec_to_param_dict(param_env_bounds, vec)
            for vec in test_param_vec
        ]
        self.test_env_list = test_param_dicts
        print('number of generated tests {}'.format(len(self.test_env_list)))
        # print(test_param_dicts)

        #data recording
        self.env_params_train = []
        self.env_train_rewards = []
        self.env_train_norm_rewards = []
        self.env_train_len = []

        self.env_params_test = []
        self.env_test_rewards = []
        self.env_test_len = []

        self.custom_env_params_test = []
        self.custom_env_test_rewards = []
        self.custom_env_test_len = []
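A hedged example of the param_env_bounds format parsed above: an (ordered) dict mapping a parameter name to [min, max] or [min, max, nb_dims]; the parameter names and ranges below are illustrative only.

from collections import OrderedDict

example_param_env_bounds = OrderedDict([
    ('stump_height', [0.0, 3.0]),          # [min, max] -> contributes 1 task dimension
    ('obstacle_spacing', [0.0, 6.0, 10]),  # [min, max, nb_dims] -> 10 dimensions, same bounds
])
# With these bounds the loop above builds mins/maxs of length 11, i.e. task_dim == 11.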
Example #8
train_rewards = []
episode_all_mastered = -1

# setup tasks generator
teacher_params['is_toy_env'] = True
if teacher_name == 'Oracle':
    Teacher = GaussianOracleTeacher(mins,
                                    maxs,
                                    teacher_params['window_step_vector'],
                                    seed=seed)
elif teacher_name == 'Random':
    Teacher = RandomTeacher(mins, maxs, seed=seed)
elif teacher_name == 'RIAC':
    Teacher = RIAC(mins, maxs, seed=seed, params=teacher_params)
elif teacher_name == 'ALP-GMM':
    Teacher = ALPGMM(mins, maxs, seed=seed, params=teacher_params)
elif teacher_name == 'Covar-GMM':
    Teacher = CovarGMM(mins, maxs, seed=seed, params=teacher_params)
elif teacher_name == 'EGT':
    Teacher = EGT(mins, maxs, seed=seed, params=teacher_params)
elif teacher_name == 'AGAIN':
    Teacher = AGAIN(mins, maxs, seed=seed, params=teacher_params)
elif teacher_name == 'ADR':
    Teacher = ADR(mins, maxs, seed=seed, params=teacher_params)
else:
    print('Unknown teacher')
    raise NotImplementedError

print('launching {} for {} on toy env with {} cubes and {} 90rot'.format(
    teacher_name, nb_episodes, nb_cubes, nb_task_space_rot))
# Main loop: collect experience in env and update/log each epoch
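The excerpt stops at the main-loop comment; what follows is a hedged sketch of such a loop, reusing the sample/episode/update pattern of test_alpgmm above (toy_env, nb_episodes and np are assumed to be defined in the omitted part of the script).

for i in range(nb_episodes):
    task = Teacher.sample_task()
    reward = toy_env.episode(task)
    Teacher.update(np.array(task), reward)
    train_rewards.append(reward)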
Example #9
    def __init__(self,
                 teacher,
                 nb_test_episodes,
                 param_env_bounds,
                 seed=None,
                 teacher_params={}):
        self.teacher = teacher
        self.nb_test_episodes = nb_test_episodes
        self.test_ep_counter = 0
        self.eps = 1e-03
        self.param_env_bounds = copy.deepcopy(param_env_bounds)

        # figure out parameters boundaries vectors
        mins, maxs = [], []
        for name, bounds in param_env_bounds.items():
            if len(bounds) == 2:
                mins.append(bounds[0])
                maxs.append(bounds[1])
            # the third value is the number of dimensions sharing these bounds
            elif len(bounds) == 3:
                mins.extend([bounds[0]] * bounds[2])
                maxs.extend([bounds[1]] * bounds[2])
            else:
                print(
                    "ill-defined boundaries: use the [min, max, nb_dims] format, or [min, max] if nb_dims=1"
                )
                exit(1)

        # setup tasks generator
        if teacher == 'Oracle':
            self.task_generator = OracleTeacher(
                mins, maxs, teacher_params['window_step_vector'], seed=seed)
        elif teacher == 'Random':
            self.task_generator = RandomTeacher(mins, maxs, seed=seed)
        elif teacher == 'RIAC':
            self.task_generator = RIAC(mins,
                                       maxs,
                                       seed=seed,
                                       params=teacher_params)
        elif teacher == 'ALP-GMM':
            self.task_generator = ALPGMM(mins,
                                         maxs,
                                         seed=seed,
                                         params=teacher_params)
        elif teacher == 'Covar-GMM':
            self.task_generator = CovarGMM(mins,
                                           maxs,
                                           seed=seed,
                                           params=teacher_params)
        else:
            print('Unknown teacher')
            raise NotImplementedError

        self.test_mode = "fixed_set"
        if self.test_mode == "fixed_set":
            name = get_test_set_name(self.param_env_bounds)
            self.test_env_list = pickle.load(
                open("teachDRL/teachers/test_sets/" + name + ".pkl", "rb"))
            print('fixed set of {} tasks loaded: {}'.format(
                len(self.test_env_list), name))

        #data recording
        self.env_params_train = []
        self.env_train_rewards = []
        self.env_train_norm_rewards = []
        self.env_train_len = []

        self.env_params_test = []
        self.env_test_rewards = []
        self.env_test_len = []
Example #10
    def __init__(self, mins, maxs, seed=None, params=dict()):
        self.seed = seed
        if not seed:
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        # Task space boundaries
        self.mins = np.array(mins)
        self.maxs = np.array(maxs)
        self.classroom_filename = "student_history" if "classroom_filename" not in params else params[
            'classroom_filename']
        self.classroom_portion = 100 if "classroom_portion" not in params else params[
            'classroom_portion']
        self.use_alpgmm = False if "use_alpgmm" not in params else params[
            'use_alpgmm']
        self.pre_test_epoch_idx = 2 if "pretrain_epochs" not in params else params[
            'pretrain_epochs']
        self.restart_after_pretrain = False if "restart_after_pretrain" not in params else params[
            'restart_after_pretrain']
        self.k = 5 if "k" not in params else params['k']
        self.random_expert = False if "random_expert" not in params else params[
            'random_expert']
        self.nb_test_epochs = 0
        self.use_ground_truth = False if 'use_ground_truth' not in params else params[
            'use_ground_truth']
        self.is_toy_env = False if "is_toy_env" not in params else params[
            'is_toy_env']
        self.current_student_params = params['student_params']
        #self.decorelate_alpgmm = False if "decorelate_alpgmm" not in params else params['decorelate_alpgmm']

        self.nb_alpgmm_gaussians = None

        # setting up alpgmm for pre-test phase
        self.alpgmm = ALPGMM(mins, maxs, seed=seed, params=params)
        self.is_new_alpgmm = False  # boolean used to track alpgmm's periodic updates
        self.random_task_ratio = 0.1
        self.post_pre_test_task_ratio = params.get('random_task_ratio', 0.02)
        self.in_end_rnd = params.get('in_end_rnd', self.post_pre_test_task_ratio)

        self.sampled_gaussian_idx = None

        self.stop_R = params.get('stop_R', False)
        self.nb_eps_after_R = 0

        self.expert_means, self.expert_covs, self.expert_mean_rewards = None, None, None  # will be defined after pre test
        self.expert_type = "P" if "expert_type" not in params else params[
            "expert_type"]
        self.r_list_len = 50 if "r_list_len" not in params else params[
            "r_list_len"]
        self.tol_ratio = 1.0 if "tol_ratio" not in params else params[
            "tol_ratio"]

        if self.expert_type == 'R':
            self.reward_list = deque(maxlen=self.r_list_len)

        self.expert_idx = -1
        self.episode_nb = 0
        self.current_means = None
        self.current_covs = None
        self.current_mean_r = None

        # Boring book-keeping
        #self._update()
        self.bk = {
            'cegt_k': self.k,
            'cegt_pt': self.pre_test_epoch_idx,
            'cegt_expert_type': self.expert_type,
            'cegt_cf': self.classroom_filename,
            'cegt_rap': self.restart_after_pretrain,
            'stop_R': self.stop_R,
            'cegt_covariances': [],
            'cegt_means': [],
            'cegt_episodes': [self.episode_nb],
            'cegt_tasks_origin': [],
            'cegt_nb_alpgmm_gaussians': [],
            'cegt_expert_idx': [],
            'cegt_test_vectors': []
        }

        if self.pre_test_epoch_idx == 0:
            self.send_test_info(None, epoch_0=True)
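The classroom file referenced by classroom_filename is read back by send_test_info (shown in the next example) as a 5-tuple; below is a hedged sketch of that layout with purely illustrative contents and an assumed grouping of the test-vector lists.

import pickle

# (student_ids, initial_test_vectors_list, last_test_vector, last_perfs, student_params),
# matching the unpacking in send_test_info(); every value here is made up.
classroom = (
    ['student_0', 'student_1'],        # one id per student
    [[[0.1, 0.2], [0.3, 0.1]]],        # one per-student list per recorded pre-test (grouping assumed)
    [[0.5, 0.6], [0.4, 0.7]],          # final test vector per student
    [0.8, 0.6],                        # final performance per student
    {'start_cube_idx': [3, 7]},        # key seen in the toy-env v2 branch
)
# pickle.dump(classroom, open("teachDRL/data/elders_knowledge/student_history.pkl", "wb"))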
Example #11
class AGAIN():
    def __init__(self, mins, maxs, seed=None, params=dict()):
        self.seed = seed
        if not seed:
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        # Task space boundaries
        self.mins = np.array(mins)
        self.maxs = np.array(maxs)
        self.classroom_filename = params.get('classroom_filename', "student_history")
        self.classroom_portion = params.get('classroom_portion', 100)
        self.use_alpgmm = params.get('use_alpgmm', False)
        self.pre_test_epoch_idx = params.get('pretrain_epochs', 2)
        self.restart_after_pretrain = params.get('restart_after_pretrain', False)
        self.k = params.get('k', 5)
        self.random_expert = params.get('random_expert', False)
        self.nb_test_epochs = 0
        self.use_ground_truth = params.get('use_ground_truth', False)
        self.is_toy_env = params.get('is_toy_env', False)
        self.current_student_params = params['student_params']
        #self.decorelate_alpgmm = False if "decorelate_alpgmm" not in params else params['decorelate_alpgmm']

        self.nb_alpgmm_gaussians = None

        # setting up alpgmm for pre-test phase
        self.alpgmm = ALPGMM(mins, maxs, seed=seed, params=params)
        self.is_new_alpgmm = False  # boolean used to track alpgmm's periodic updates
        self.random_task_ratio = 0.1
        self.post_pre_test_task_ratio = params.get('random_task_ratio', 0.02)
        self.in_end_rnd = params.get('in_end_rnd', self.post_pre_test_task_ratio)

        self.sampled_gaussian_idx = None

        self.stop_R = params.get('stop_R', False)
        self.nb_eps_after_R = 0

        self.expert_means, self.expert_covs, self.expert_mean_rewards = None, None, None  # will be defined after pre test
        self.expert_type = "P" if "expert_type" not in params else params[
            "expert_type"]
        self.r_list_len = 50 if "r_list_len" not in params else params[
            "r_list_len"]
        self.tol_ratio = 1.0 if "tol_ratio" not in params else params[
            "tol_ratio"]

        if self.expert_type == 'R':
            self.reward_list = deque(maxlen=self.r_list_len)

        self.expert_idx = -1
        self.episode_nb = 0
        self.current_means = None
        self.current_covs = None
        self.current_mean_r = None

        # Boring book-keeping
        #self._update()
        self.bk = {
            'cegt_k': self.k,
            'cegt_pt': self.pre_test_epoch_idx,
            'cegt_expert_type': self.expert_type,
            'cegt_cf': self.classroom_filename,
            'cegt_rap': self.restart_after_pretrain,
            'stop_R': self.stop_R,
            'cegt_covariances': [],
            'cegt_means': [],
            'cegt_episodes': [self.episode_nb],
            'cegt_tasks_origin': [],
            'cegt_nb_alpgmm_gaussians': [],
            'cegt_expert_idx': [],
            'cegt_test_vectors': []
        }

        if self.pre_test_epoch_idx == 0:
            self.send_test_info(None, epoch_0=True)

    def send_test_info(self, test_vec, epoch_0=False):
        self.bk['cegt_test_vectors'].append(test_vec)
        #print('len test vec is')
        #print(len(test_vec))
        #print(test_vec.shape)
        if epoch_0:  #do not increment if called from init
            assert (self.random_expert or self.use_ground_truth)
        else:
            self.nb_test_epochs += 1
        if self.nb_test_epochs == self.pre_test_epoch_idx:  # time to find an expert from classroom
            self.bk['pre_test_vec'] = test_vec
            # load classroom history
            path = "teachDRL/data/elders_knowledge/{}.pkl".format(
                self.classroom_filename)
            print("loading from {}".format(path))
            is_v2 = False
            if "v2" in self.classroom_filename:
                is_v2 = True

            student_ids, initial_test_vectors_list, last_test_vector, last_perfs, student_params = pickle.load(
                open(path, "rb"))
            if self.classroom_portion != 100:
                # take a random sample subpart of classroom
                sample_len = int(
                    len(student_ids) * (self.classroom_portion / 100))
                print('using a random sample of {} classroom entries'.format(
                    sample_len))
                old_rnd_state = random.getstate()
                random.seed(self.seed)
                sampled_student_ids = random.sample(student_ids, sample_len)
                sampled_initial_test_vectors_list = []
                for kc_v in initial_test_vectors_list:
                    random.seed(self.seed)
                    sampled_initial_test_vectors_list.append(
                        random.sample(kc_v, sample_len))
                random.seed(self.seed)
                sampled_last_test_vector = random.sample(
                    last_test_vector, sample_len)
                random.seed(self.seed)
                sampled_last_perfs = random.sample(last_perfs, sample_len)
                if self.is_toy_env and is_v2:
                    random.seed(self.seed)
                    student_params['start_cube_idx'] = random.sample(
                        student_params['start_cube_idx'], sample_len)
                else:
                    print(
                        'classroom sub-sampling is only supported for toy env v2 data'
                    )
                    exit(1)
                random.setstate(old_rnd_state)  # restore random state
                # set classroom to classroom sample
                initial_test_vectors_list = sampled_initial_test_vectors_list
                student_ids = sampled_student_ids
                last_test_vector = sampled_last_test_vector
                last_perfs = sampled_last_perfs

            if self.random_expert:
                print('choosing expert randomly!')
                expert_id = np.random.choice(student_ids)
            else:
                expert_id = get_k_experts(
                    self.current_student_params,
                    test_vec,
                    initial_test_vectors_list,
                    last_test_vector,
                    student_ids,
                    student_params,
                    last_perfs,
                    k=self.k,
                    use_ground_truth=self.use_ground_truth,
                    test_vec_idx=self.pre_test_epoch_idx - 1,
                    is_toy_env=self.is_toy_env,
                    is_v2=is_v2)
            self.bk['selected_expert'] = expert_id
            print('expert selected is: {}'.format(expert_id))
            # loading expert
            folder_path = 'teachDRL/data/elders_knowledge/' + expert_id.rsplit(
                '_s', 1)[0] + '/' + expert_id
            print(folder_path)
            self.expert_means, self.expert_covs, self.expert_mean_rewards = load_expert_trajectory(
                folder_path, is_toy_env=self.is_toy_env)
            self._update()

            # add alpgmm gaussians
            if self.use_alpgmm and self.alpgmm.gmm is not None:
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
            return self.restart_after_pretrain
        return False

    def _update(self):
        if self.expert_type == 'P':  # Pool type, single GMM out of all expert GMMs
            #print('P-updating')
            self.current_means = [
                sub_item for sub_list in self.expert_means
                for sub_item in sub_list
            ]  # flatten
            self.current_covs = [
                sub_item for sub_list in self.expert_covs
                for sub_item in sub_list
            ]  # same
        elif self.expert_type == 'T':  # Time type, expert trajectory is stepped every 250 episodes
            #print('T-updating')
            self.expert_idx = min(self.episode_nb // 250,
                                  len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()
            self.current_covs = self.expert_covs[self.expert_idx].copy()
        # Reward type: the expert trajectory is stepped once the student's mean reward
        # reaches the current checkpoint's mean reward
        elif self.expert_type == 'R':
            #print('R-updating')
            self.expert_idx = min(self.expert_idx + 1,
                                  len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()
            self.current_covs = self.expert_covs[self.expert_idx].copy()
            self.current_mean_r = self.expert_mean_rewards[
                self.expert_idx] * self.tol_ratio
            self.reward_list = deque(maxlen=self.r_list_len)
        else:
            print('Unknown expert type')
            exit(1)

    def update(self, task, reward):
        #print("current means: {}, covs {}".format(len(self.current_means), len(self.current_covs)))
        #print("expert_idx: {}".format(self.expert_idx))
        self.episode_nb += 1

        if self.nb_test_epochs < self.pre_test_epoch_idx:  # pre-test phase, only use alp-gmm
            self.is_new_alpgmm = self.alpgmm.update(task, reward)
            if self.is_new_alpgmm:
                self.bk['cegt_covariances'].append(
                    self.alpgmm.gmm.covariances_.copy())
                self.bk['cegt_means'].append(self.alpgmm.gmm.means_.copy())
                self.bk['cegt_episodes'].append(self.episode_nb)
                self.bk['cegt_expert_idx'].append(self.expert_idx)
            return self.is_new_alpgmm

        just_updated_gmm = False
        # handle AGAIN-R/T to ALP-GMM transition after finishing expert curriculum
        if (self.use_alpgmm and self.expert_type in ("R", "T") and self.stop_R
                and self.expert_idx == (len(self.expert_means) - 1)
                and self.nb_alpgmm_gaussians is not None):
            if self.nb_eps_after_R == 0:  # when AGAIN reaches the end of the expert curriculum, it can change rnd sampling
                self.random_task_ratio = 0.1
                self.post_pre_test_task_ratio = self.in_end_rnd  # switch back to high-exploration strategy
                print(
                    'switching to rnd of {} since last IN idx reached'.format(
                        self.in_end_rnd))
                if self.expert_type == 'R':
                    self.expert_type = "stoppedR"
                    self.bk['stoppedR_episode'] = self.episode_nb
                elif self.expert_type == 'T':
                    self.expert_type = "stoppedT"
                    self.bk['stoppedT_episode'] = self.episode_nb

        # handle AGAIN-R/T smooth re-update of last IN gaussian
        if (self.use_alpgmm and self.expert_type in ("stoppedR", "stoppedT")
                and self.stop_R and self.expert_idx == (len(self.expert_means) - 1)
                and self.nb_alpgmm_gaussians is not None):
            if self.nb_eps_after_R == 0:  # first time, init last IN GMM gaussian tracking to update ALP periodically
                self.last_IN_gaussians_alps = [
                    deque(maxlen=100) for _ in range(
                        len(self.current_means) - self.nb_alpgmm_gaussians)
                ]
                self.added_since_fit = 0
                assert ((len(self.current_means) -
                         self.nb_alpgmm_gaussians) == len(
                             self.expert_means[-1]))
                print('TIME TO START POST IN, last expert has len {} -->  {}'.
                      format(len(self.expert_means[-1]),
                             self.expert_means[-1]))
            elif self.added_since_fit == 100:  # time to re-update the final IN lps gmm
                print('last in update time')
                #print(self.last_IN_gaussians_alps)
                just_updated_gmm = True
                self.added_since_fit = 0
                for i, alp_window in enumerate(self.last_IN_gaussians_alps):
                    if len(alp_window) == 0:
                        self.current_means[i][-1] = 0.0
                    else:
                        self.current_means[i][-1] = np.mean(alp_window)
                # remove alp-gmm gaussians to fit the update pipeline (they will be re-added)
                self.current_means = self.current_means[:-self.nb_alpgmm_gaussians]
                self.current_covs = self.current_covs[:-self.nb_alpgmm_gaussians]
                print('post in update to {}'.format(self.current_means))

            # the last sampled task came from an IN (expert) Gaussian
            if self.sampled_gaussian_idx < (len(self.current_means)
                                            - self.nb_alpgmm_gaussians):
                #print('adding alp to IN idx {} out of {}'.format(self.sampled_gaussian_idx,len(self.current_means) - self.nb_alpgmm_gaussians))
                self.last_IN_gaussians_alps[self.sampled_gaussian_idx].append(
                    self.alpgmm.alps[-1])
                self.added_since_fit += 1
            self.nb_eps_after_R += 1

        # handle IN-R to ALP-GMM transition after finishing expert curriculum
        if self.expert_type == "R" and self.stop_R and self.expert_idx == (
                len(self.expert_means) - 1):
            self.use_alpgmm = True
            self.nb_eps_after_R += 1
            if self.nb_eps_after_R == 250:  # after a long time in last expert index, change strategy
                self.expert_type = "stoppedR"
                self.random_task_ratio = 0.1
                self.bk['stoppedR_episode'] = self.episode_nb
                # replace last expert idx by alpgmm gaussians
                self.current_means = []
                self.current_covs = []
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
                just_updated_gmm = True

        # PROCESS DATA FOR R or T variants
        # add the reward to the list if the last task came from egt
        if self.expert_type == 'R' and self.bk['cegt_tasks_origin'][-1] == 'egt':
            self.reward_list.append(reward)
        # check whether a GMM update is necessary, depending on the expert type
        if (self.expert_type == 'T' and (self.episode_nb % 250) == 0) \
           or (self.expert_type == 'R' and len(self.reward_list) == self.r_list_len
               and np.mean(self.reward_list) >= self.current_mean_r):
            # if not already at the end of the expert curriculum
            if self.expert_idx != (len(self.expert_means) - 1):
                self._update()
                just_updated_gmm = True

        if self.use_alpgmm:
            if just_updated_gmm and self.nb_alpgmm_gaussians is not None:  # expert changed, add alpgmm part
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()

            # send data to alpgmm
            self.is_new_alpgmm = self.alpgmm.update(task, reward)

            if self.is_new_alpgmm:
                # update current GMM by replacing old gaussians from alpgmm with new ones
                if self.nb_alpgmm_gaussians is not None:
                    # remove old gaussians
                    self.current_means = self.current_means[:-self.nb_alpgmm_gaussians]
                    self.current_covs = self.current_covs[:-self.nb_alpgmm_gaussians]
                # add new gaussians
                #print('adding stuff')
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
                just_updated_gmm = True

        # book-keeping
        if just_updated_gmm:
            self.bk['cegt_covariances'].append(self.current_covs.copy())
            self.bk['cegt_means'].append(self.current_means.copy())
            self.bk['cegt_episodes'].append(self.episode_nb)
            self.bk['cegt_expert_idx'].append(self.expert_idx)
            self.bk['cegt_nb_alpgmm_gaussians'].append(
                self.nb_alpgmm_gaussians)
        return just_updated_gmm

    def sample_task(self):
        new_task = None
        task_origin = None
        #print(self.episode_nb)
        # pre-test phase, only use alp-gmm
        if self.nb_test_epochs < self.pre_test_epoch_idx:
            #print('pre-test-task-sampling')
            if (self.episode_nb < 250) or (np.random.random() <
                                           self.random_task_ratio):
                # Random task sampling
                new_task = self.alpgmm.random_task_generator.sample()
                task_origin = 'random'
            else:
                # alp-gmm task sampling
                task_origin = 'alpgmm'
                # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
                alp_means = []
                for pos in self.alpgmm.gmm.means_:
                    alp_means.append(pos[-1])

                # 2 - Sample Gaussian proportionally to its mean ALP
                idx = proportional_choice(alp_means, eps=0.0)

                # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
                new_task = np.random.multivariate_normal(
                    self.alpgmm.gmm.means_[idx],
                    self.alpgmm.gmm.covariances_[idx])[:-1]
                new_task = np.clip(new_task, self.mins,
                                   self.maxs).astype(np.float32)
            self.bk['cegt_tasks_origin'].append(task_origin)
            return new_task

        if self.use_alpgmm and np.random.random() < self.post_pre_test_task_ratio:
            # Random task sampling
            new_task = self.alpgmm.random_task_generator.sample()
            task_origin = 'random'
        else:
            # ALP-based task sampling

            # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
            alp_means = []
            for means in self.current_means:
                alp_means.append(means[-1])

            # 2 - Sample Gaussian proportionally to its mean ALP
            idx = proportional_choice(alp_means, eps=0.0)
            self.sampled_gaussian_idx = idx

            # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
            new_task = np.random.multivariate_normal(
                self.current_means[idx], self.current_covs[idx])[:-1]
            new_task = np.clip(new_task, self.mins,
                               self.maxs).astype(np.float32)
            task_origin = 'egt'
            if self.use_alpgmm and self.alpgmm.gmm is not None:
                if idx >= len(self.current_means) - self.nb_alpgmm_gaussians:
                    task_origin = 'alpgmm'
        # boring book-keeping
        self.bk['cegt_tasks_origin'].append(task_origin)

        return new_task

    def dump(self, dump_dict):
        self.bk['cegt_initial_expert_means'] = self.expert_means
        self.bk['cegt_initial_expert_covs'] = self.expert_covs
        self.bk['cegt_student_param'] = self.current_student_params
        if self.expert_type == 'R' or self.expert_type == "stoppedR":
            self.bk['cegt_initial_expert_mean_rewards'] = self.expert_mean_rewards
        dump_dict.update(self.bk)
        if self.use_alpgmm:
            dump_dict.update(self.alpgmm.bk)
        return dump_dict
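

# Note: the teachers in this listing pick a Gaussian through a `proportional_choice`
# helper that is not shown here. The function below is only a minimal sketch of the
# assumed behaviour (an index drawn with probability proportional to the non-negative
# ALP values, with a uniform fallback when every value is zero or with probability
# `eps`); the actual helper used by the code may differ in its details.
import numpy as np

def sketch_proportional_choice(values, eps=0.0):
    values = np.asarray(values, dtype=np.float64)
    if values.sum() == 0 or np.random.random() < eps:
        # no ALP signal yet (all zeros) or forced exploration: uniform draw
        return np.random.randint(len(values))
    # otherwise sample an index with probability proportional to its value
    return int(np.random.choice(len(values), p=values / values.sum()))
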
Example #12
class EGT():
    def __init__(self, mins, maxs, seed=None, params=dict()):
        self.seed = seed
        if seed is None:
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        # Task space boundaries
        self.mins = np.array(mins)
        self.maxs = np.array(maxs)

        self.use_alpgmm = params.get('use_alpgmm', False)
        #self.decorelate_alpgmm = False if "decorelate_alpgmm" not in params else params['decorelate_alpgmm']

        self.nb_alpgmm_gaussians = None
        if self.use_alpgmm:
            print("Using ALP-GMM with EGT")
            self.alpgmm = ALPGMM(mins, maxs, seed=seed, params=params)
            self.is_new_alpgmm = False  # boolean used to track alpgmm's periodic updates
            self.random_task_ratio = params.get('random_task_ratio', 0.02)
            self.sampled_gaussian_idx = None

            self.stop_R = params.get('stop_R', False)
            self.nb_eps_after_R = 0

        assert 'expert_gmms' in params
        self.expert_means, self.expert_covs, self.expert_mean_rewards = params['expert_gmms']
        # expert_type: 'P' pools all expert GMMs, 'T' steps the expert trajectory over time,
        # 'R' steps it when the recent mean reward exceeds the expert's reference reward (see _update)
        self.expert_type = params.get('expert_type', 'P')
        self.r_list_len = params.get('r_list_len', 50)
        self.tol_ratio = params.get('tol_ratio', 1.0)

        if self.expert_type == 'R':
            self.reward_list = deque(maxlen=self.r_list_len)

        self.expert_idx = -1
        self.episode_nb = 0
        self.current_means = None
        self.current_covs = None
        self.current_mean_r = None

        # Boring book-keeping
        self._update()
        self.bk = {
            'egt_covariances': [self.current_covs.copy()],
            'egt_means': [self.current_means.copy()],
            'egt_episodes': [self.episode_nb],
            'egt_tasks_origin': [],
            'egt_nb_alpgmm_gaussians': [self.nb_alpgmm_gaussians],
            'egt_expert_idx': [self.expert_idx]
        }

    def _update(self):
        if self.expert_type == 'P':  # Pool type, single GMM out of all expert GMMs
            self.current_means = [
                sub_item for sub_list in self.expert_means
                for sub_item in sub_list
            ]  # flatten
            self.current_covs = [
                sub_item for sub_list in self.expert_covs
                for sub_item in sub_list
            ]  # same
        elif self.expert_type == 'T':  # Time type, expert trajectory is stepped every 250 episodes
            self.expert_idx = min(self.episode_nb // 250,
                                  len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()
            self.current_covs = self.expert_covs[self.expert_idx].copy()
        # Reward type: the expert trajectory is stepped once the recent mean reward
        # exceeds the current expert's reference reward
        elif self.expert_type == 'R':
            self.expert_idx = min(self.expert_idx + 1,
                                  len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()
            self.current_covs = self.expert_covs[self.expert_idx].copy()
            self.current_mean_r = self.expert_mean_rewards[
                self.expert_idx] * self.tol_ratio
            self.reward_list = deque(maxlen=self.r_list_len)
        else:
            raise ValueError('Unknown expert type: {}'.format(self.expert_type))

    def update(self, task, reward):
        #print("current means: {}, covs {}".format(len(self.current_means), len(self.current_covs)))
        #print("expert_idx: {}".format(self.expert_idx))
        self.episode_nb += 1
        just_updated_gmm = False
        if self.use_alpgmm and self.expert_type == "R" and self.stop_R \
           and self.expert_idx == (len(self.expert_means) - 1) \
           and self.nb_alpgmm_gaussians is not None:
            self.nb_eps_after_R += 1
            if self.nb_eps_after_R == 250:  # after a long time in last expert index, change strategy
                self.expert_type = "stoppedR"
                self.random_task_ratio = 0.1
                self.current_means = []
                self.current_covs = []
                self.bk['stoppedR_episode'] = self.episode_nb
                just_updated_gmm = True

        # process new data
        # record the reward only if the task came from the expert (egt)
        if self.expert_type == 'R' and self.bk['egt_tasks_origin'][-1] == 'egt':
            self.reward_list.append(reward)
        # check whether a GMM update is necessary, depending on the expert type
        if (self.expert_type == 'T' and (self.episode_nb % 250) == 0)\
           or (self.expert_type == 'R' and len(self.reward_list) == self.r_list_len and np.mean(self.reward_list) > self.current_mean_r):
            # step the expert curriculum only if not already at its end
            if self.expert_idx != (len(self.expert_means) - 1):
                self._update()
                just_updated_gmm = True

        if self.use_alpgmm:
            # the expert GMM just changed: re-append the current ALP-GMM Gaussians
            if just_updated_gmm and self.nb_alpgmm_gaussians is not None:
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()

            # send data to alpgmm
            self.is_new_alpgmm = self.alpgmm.update(task, reward)

            if self.is_new_alpgmm:
                # update current GMM by replacing old gaussians from alpgmm with new ones
                if self.nb_alpgmm_gaussians is not None:
                    # remove old gaussians
                    self.current_means = self.current_means[:-self.nb_alpgmm_gaussians]
                    self.current_covs = self.current_covs[:-self.nb_alpgmm_gaussians]
                # add new gaussians
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
                just_updated_gmm = True

        # # smoothly update the ALP value of expert gaussians if at last IEC index
        # if self.expert_idx == (len(self.expert_means) - 1) and self.use_alpgmm and self.nb_alpgmm_gaussians is not None:
        #     if self.expert_type == 'T' or (self.expert_type == 'R' and np.mean(self.reward_list) > self.current_mean_r):
        #         if self.bk['egt_tasks_origin'][-1] == 'egt':
        #             assert(self.sampled_gaussian_idx < (len(self.current_means) - self.nb_alpgmm_gaussians))
        #             cur_alp = self.current_means[self.sampled_gaussian_idx][-1] # update alp of corresponding Gaussian
        #             self.current_means[self.sampled_gaussian_idx][-1] = cur_alp * (49/50) + (self.alpgmm.alps[-1]/50)

        # book-keeping
        if just_updated_gmm:
            self.bk['egt_covariances'].append(self.current_covs.copy())
            self.bk['egt_means'].append(self.current_means.copy())
            self.bk['egt_episodes'].append(self.episode_nb)
            self.bk['egt_expert_idx'].append(self.expert_idx)
            self.bk['egt_nb_alpgmm_gaussians'].append(self.nb_alpgmm_gaussians)

    def sample_task(self):
        new_task = None
        task_origin = None
        if self.use_alpgmm and np.random.random() < self.random_task_ratio:
            # Random task sampling
            new_task = self.alpgmm.random_task_generator.sample()
            task_origin = 'random'
        else:
            # ALP-based task sampling

            # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
            alp_means = []
            for means in self.current_means:
                alp_means.append(means[-1])

            # 2 - Sample Gaussian proportionally to its mean ALP
            idx = proportional_choice(alp_means, eps=0.0)
            self.sampled_gaussian_idx = idx

            # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
            new_task = np.random.multivariate_normal(
                self.current_means[idx], self.current_covs[idx])[:-1]
            new_task = np.clip(new_task, self.mins,
                               self.maxs).astype(np.float32)
            task_origin = 'egt'
            if self.use_alpgmm and self.alpgmm.gmm is not None:
                if idx >= len(self.current_means) - self.nb_alpgmm_gaussians:
                    task_origin = 'alpgmm'
        # boring book-keeping
        self.bk['egt_tasks_origin'].append(task_origin)

        return new_task

    def dump(self, dump_dict):
        self.bk['egt_initial_expert_means'] = self.expert_means
        self.bk['egt_initial_expert_covs'] = self.expert_covs
        if self.expert_type == 'R' or self.expert_type == "stoppedR":
            self.bk['egt_initial_expert_mean_rewards'] = self.expert_mean_rewards
        dump_dict.update(self.bk)
        if self.use_alpgmm:
            dump_dict.update(self.alpgmm.bk)
        return dump_dict
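

# A minimal usage sketch for the EGT teacher above. Everything other than EGT itself
# is hypothetical: `env` stands in for any environment exposing an
# `episode(task) -> reward` interface, and the expert GMMs are dummy placeholders in
# which each mean carries an extra trailing ALP dimension (the sampler strips that
# last dimension before clipping the sampled task to the bounds).
def sketch_run_egt(env, nb_episodes=2000, nb_dims=2):
    # two fake "expert" GMM snapshots, each holding a single Gaussian over (task, ALP)
    expert_means = [[[0.2] * nb_dims + [0.5]], [[0.8] * nb_dims + [0.5]]]
    expert_covs = [[(np.eye(nb_dims + 1) * 0.05).tolist()],
                   [(np.eye(nb_dims + 1) * 0.05).tolist()]]
    expert_mean_rewards = [0.3, 0.6]  # reference rewards, only used by the 'R' type
    params = {
        'expert_gmms': (expert_means, expert_covs, expert_mean_rewards),
        'expert_type': 'T',   # step the expert trajectory every 250 episodes
        'use_alpgmm': True    # mix in ALP-GMM Gaussians once a GMM has been fitted
    }
    teacher = EGT([0] * nb_dims, [1] * nb_dims, params=params)
    for _ in range(nb_episodes):
        task = teacher.sample_task()
        reward = env.episode(task)  # `env` is assumed to return a scalar reward
        teacher.update(np.array(task), reward)
    return teacher.dump(dict())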