Example #1
def rollout(env, agent, max_path_length=10000, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
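
A minimal usage sketch for the rollout helper above (illustrative only: it assumes an rllab-style environment and policy, and the specific env and policy classes named below are assumptions, not part of the example):

from rllab.envs.normalized_env import normalize
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(CartpoleEnv())
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

path = rollout(env, policy, max_path_length=500)
print(path["rewards"].sum())       # undiscounted return of this trajectory
print(path["observations"].shape)  # (T, flat_obs_dim) after stacking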
Example #2
def rollout_policy(agent, env, max_path_length=200, speedup=1, get_image_observations=False, animated=False):
    """
    Mostly taken from https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
    Generate a rollout for a given policy
    """
    observations = []
    im_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    path_length = 0

    while path_length <= max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))

        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if get_image_observations:
            if not animated:
                pixel_array = env.render(mode="rgb_array")
            else:
                pixel_array = env.render()

            if pixel_array is None and not animated:
                # Not convinced that behaviour works for all environments, so until
                # such a time as I'm convinced of this, drop into a debug shell
                print("Problem! Couldn't get pixels! Dropping into debug shell.")
                import pdb; pdb.set_trace()
            im_observations.append(pixel_array)
        rewards.append(r)
        if d:
            break

    if animated:
        env.render(close=True)

    im_observations = tensor_utils.stack_tensor_list(im_observations)
    observations = tensor_utils.stack_tensor_list(observations)
    rewards = tensor_utils.stack_tensor_list(rewards)

    return dict(
        observations=observations,
        im_observations=im_observations,
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=rewards,
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
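
If get_image_observations is set, the stacked im_observations can be dumped to disk for inspection. A short sketch, assuming the frames come back as (T, H, W, 3) uint8 arrays (the helper name is illustrative):

import os
from PIL import Image

def dump_frames(path, out_dir="frames"):
    # Write each collected pixel frame as a PNG; assumes uint8 RGB frames.
    os.makedirs(out_dir, exist_ok=True)
    for t, frame in enumerate(path["im_observations"]):
        Image.fromarray(frame).save(os.path.join(out_dir, "frame_%04d.png" % t))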
Example #3
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, save_video=True, video_filename='sim_out.mp4', reset_arg=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d: # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0])
                images.append(np.flipud(np.array(pil_image)))

    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20*speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20*speedup)
            else:
                clip.write_videofile(video_filename, fps=20*speedup)
        #return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
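
A hedged usage sketch for the video-saving variant above; note that a clip is only written when the rollout actually reaches max_path_length, because of the len(images) >= max_path_length guard. The wrapper helper below is illustrative:

def record_rollout(env, policy, filename='sim_out.gif', horizon=200):
    # Runs one animated rollout and, if it reaches the horizon, writes a gif
    # (the '.gif' suffix selects the write_gif branch above).
    return rollout(env, policy,
                   max_path_length=horizon,
                   animated=True,
                   speedup=4,
                   save_video=True,
                   video_filename=filename,
                   reset_arg=None)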
Example #4
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
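
Samplers like the ones further down this page typically call such a rollout helper in a loop until a sample budget is met. A minimal sketch (the batch-size numbers are illustrative):

def collect_batch(env, policy, batch_size=4000, max_path_length=500):
    # Accumulate whole trajectories until at least batch_size timesteps are gathered.
    paths, n_samples = [], 0
    while n_samples < batch_size:
        path = rollout(env, policy, max_path_length=max_path_length,
                       always_return_paths=True)
        paths.append(path)
        n_samples += len(path["rewards"])
    return paths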
Example #5
def ed_dec_rollout(env,
                   agents,
                   max_path_length=np.inf,
                   animated=False,
                   speedup=1):
    """Decentralized rollout"""
    if agents.recurrent:
        assert isinstance(
            agents,
            GSMDPRecurrentPolicy), 'Recurrent policy is not a GSMDP class'
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    offset_t_sojourn = [[] for _ in range(n_agents)]
    olist = env.reset()
    assert len(olist) == n_agents, "{} != {}".format(len(olist), n_agents)

    agents.reset(dones=[True for _ in range(n_agents)])
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        agents_to_act = [
            i for i, j in enumerate(olist) if j != [None] * len(j)
        ]
        if (not agents.recurrent):
            alist, agent_info_list = agents.get_actions(
                [olist[i] for i in agents_to_act])
            agent_info_list = tensor_utils.split_tensor_dict_list(
                agent_info_list)
        else:
            alist, agent_info_list = agents.get_actions(olist)
            alist = [a for a in alist if a is not None]
            agent_info_list = tensor_utils.split_tensor_dict_list(
                agent_info_list)
            agent_info_list = [
                ainfo for i, ainfo in enumerate(agent_info_list)
                if i in agents_to_act
            ]

        next_actions = [None] * n_agents  # will fill in in the loop

        # For each agent
        for ind, o in enumerate([olist[j] for j in agents_to_act]):
            # ind refers to non-None indices
            # i refers to indices with Nones
            i = agents_to_act[ind]
            observations[i].append(env.observation_space.flatten(o))
            # observations[i].append(o) # REMOVE THIS AND UNCOMMENT THE ABOVE LINE
            actions[i].append(env.action_space.flatten(alist[ind]))
            next_actions[i] = alist[ind]
            if agent_info_list is None:
                agent_infos[i].append({})
            else:
                agent_infos[i].append(agent_info_list[ind])

        # take next actions

        next_olist, rlist, d, env_info = env.step(np.asarray(next_actions))

        # update sojourn time (we should associate ts from next_olist to r, not current)

        for i, r in enumerate(rlist):
            if r is None: continue
            # skip reward if agent has not acted yet
            if (len(observations[i]) > 0):
                rewards[i].append(r)
                offset_t_sojourn[i].append(
                    env.observation_space.flatten(next_olist[i])[-1])
                env_infos[i].append(env_info)
        path_length = max([len(o) for o in observations])
        if d:
            break
        olist = next_olist
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    if (path_length == max_path_length):
        # probably have some paths that aren't the right length
        for ind, o in enumerate(observations):
            r = rewards[ind]
            if (len(o) > len(r)):
                assert(len(o) <= (len(r) + 1)), \
                 'len(o) %d, len(r) %d' % (len(o), len(r))
                # delete last elem of obs, actions, agent infos
                del observations[ind][-1]
                del actions[ind][-1]
                del agent_infos[ind][-1]

    if animated:
        env.render()

    # remove empty agent trajectories
    observations = [o for o in observations if len(o) > 0]
    actions = [a for a in actions if len(a) > 0]
    rewards = [r for r in rewards if len(r) > 0]
    agent_infos = [i for i in agent_infos if len(i) > 0]
    env_infos = [e for e in env_infos if len(e) > 0]
    offset_t_sojourn = [o for o in offset_t_sojourn if len(o) > 0]

    if (any(
            map(lambda x: x < n_agents, [
                len(observations),
                len(actions),
                len(rewards),
                len(agent_infos),
                len(env_infos)
            ]))):
        print('\nWARNING: \n')
        print('n_agents: ', n_agents)
        print('len(observations): ', len(observations))
        print('len(actions): ', len(actions))
        print('len(rewards): ', len(rewards))
        print('len(agent_infos): ', len(agent_infos))
        print('len(env_infos): ', len(env_infos))

    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
            offset_t_sojourn=tensor_utils.stack_tensor_list(
                offset_t_sojourn[i]),
        ) for i in range(len(observations))
    ]
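
Unlike the single-agent helpers, ed_dec_rollout returns one path dict per agent that actually acted. A small sketch of consuming that list (env and agents are assumed to exist):

# Sketch: per-agent undiscounted returns from one decentralized rollout.
agent_paths = ed_dec_rollout(env, agents, max_path_length=500)
for i, p in enumerate(agent_paths):
    print("agent %d: steps=%d, return=%.3f"
          % (i, len(p["rewards"]), p["rewards"].sum()))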
Example #6
    def obtain_samples(self, itr, determ=False):
        # logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            if determ:
                actions = agent_infos['mean']

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        # logger.record_tabular("PolicyExecTime", policy_time)
        # logger.record_tabular("EnvExecTime", env_time)
        # logger.record_tabular("ProcessExecTime", process_time)

        return paths
Example #7
def rollout(env, agent, line_params, max_path_length=np.inf, animated=False):
    """
    Modified rollout function from rllab.sampler.utils to run
    arbitrary straight trajectories.
    """
    observations = []
    rewards = []
    actions = []
    agent_infos = []
    env_infos = []

    projected_trajectory = []
    x0, y0, angle = line_params
    env.reset()
    agent.reset()

    # Force start state to be zeros
    # Note: Because env is an instance of NormalizedEnv, there is no
    #   way of writing a custom function that I can use to set the
    #   initial state. Consequently we just force set it here.
    start_yaw = angle
    start_state = np.array([x0, y0, start_yaw, 0, 0, 0])
    env._wrapped_env._state = start_state
    o = np.zeros(5)

    path_length = 0
    if animated:
        env.render()
    print('--------------------')
    while path_length < max_path_length:
        print('')
        state = env._wrapped_env._state
        print('State = ', state)
        projected_o = StraightEnv.project_line(state, x0, y0, angle)
        print('Projected state = ', projected_o)
        _, agent_info = agent.get_action(projected_o[1:])
        a = agent_info['mean']
        print('Computed action = ', a)
        next_o, r, d, env_info = env.step(a)
        print('Next observation = ', next_o)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        projected_trajectory.append(projected_o)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    print('--------------------')

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    ), projected_trajectory
Example #8
    def process_samples(self, itr, paths, update_baseline=True):
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if hasattr(self.algo, 'epopt_epsilon'):
            if self.algo.epopt_epsilon < 1.0 and self.algo.epopt_after_iter <= itr:
                # prune the paths
                target_path_size = len(paths) * self.algo.epopt_epsilon
                sorted_indices = np.argsort(
                    [path["returns"][0] for path in paths])
                idx = 0
                si_idx = 0
                while True:
                    if sorted_indices[si_idx] > target_path_size:
                        paths.pop(idx)
                        idx -= 1
                    idx += 1
                    si_idx += 1
                    if idx >= len(paths):
                        break

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = []
            ct = 0
            for path in paths:
                if path['env_infos']['dyn_model_id'][-1] == 0:
                    undiscounted_returns.append(sum(path["rewards"]))
                if path['env_infos']['dyn_model_id'][-1] == 1:
                    ct += 1
            print('path count with fake dynamics: ', ct,
                  len(undiscounted_returns), len(paths))

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        if update_baseline:
            logger.log("fitting baseline...")
            if hasattr(self.algo.baseline, 'fit_with_samples'):
                self.algo.baseline.fit_with_samples(paths, samples_data)
            else:
                self.algo.baseline.fit(paths)
            logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
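
The advantage computation above relies on special.discount_cumsum. A self-contained NumPy sketch of the same recurrence (illustrative, not the rllab implementation):

import numpy as np

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * y[t + 1], computed backwards over the trajectory.
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

# As used above: deltas[t] = r[t] + gamma * V(s[t+1]) - V(s[t]),
# advantages = discount_cumsum(deltas, gamma * gae_lambda),
# returns = discount_cumsum(rewards, gamma).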
Example #9
    def obtain_agent_info_offpolicy(self,
                                    itr,
                                    expert_trajs_dir=None,
                                    offpol_trajs=None,
                                    treat_as_expert_traj=False,
                                    log_prefix=''):
        assert expert_trajs_dir is None, "deprecated"
        start = time.time()
        if offpol_trajs is None:
            assert expert_trajs_dir is not None, "neither offpol_trajs nor expert_trajs_dir is provided"
            if self.use_pooled_goals:
                for t, taskidx in enumerate(self.goals_idxs_for_itr_dict[itr]):
                    assert np.array_equal(
                        self.goals_pool[taskidx],
                        self.goals_to_use_dict[itr][t]), "fail"
                offpol_trajs = {
                    t: joblib.load(expert_trajs_dir + str(taskidx) +
                                   self.expert_trajs_suffix + ".pkl")
                    for t, taskidx in enumerate(
                        self.goals_idxs_for_itr_dict[itr])
                }
            else:
                offpol_trajs = joblib.load(expert_trajs_dir + str(itr) +
                                           self.expert_trajs_suffix + ".pkl")

            offpol_trajs = {
                tasknum: offpol_trajs[tasknum]
                for tasknum in range(self.meta_batch_size)
            }

        # some initial rearrangement
        tasknums = offpol_trajs.keys(
        )  # tasknums is range(self.meta_batch_size) as can be seen above
        for t in tasknums:
            for path in offpol_trajs[t]:
                if 'expert_actions' not in path.keys(
                ) and treat_as_expert_traj:
                    # print("copying expert actions, you should do this only 1x per metaitr")
                    path['expert_actions'] = np.clip(deepcopy(path['actions']),
                                                     -1.0, 1.0)

                if treat_as_expert_traj:
                    path['agent_infos'] = dict(
                        mean=[[0.0] * len(path['actions'][0])] *
                        len(path['actions']),
                        log_std=[[0.0] * len(path['actions'][0])] *
                        len(path['actions']))
                else:
                    path['agent_infos'] = [None] * len(path['rewards'])

        if not treat_as_expert_traj:
            print("debug12, running offpol on own previous samples")
            running_path_idx = {t: 0 for t in tasknums}
            running_intra_path_idx = {t: 0 for t in tasknums}
            while max([running_path_idx[t] for t in tasknums
                       ]) > -0.5:  # we cycle until all indices are -1
                observations = [
                    offpol_trajs[t][running_path_idx[t]]['observations'][
                        running_intra_path_idx[t]] for t in tasknums
                ]
                actions, agent_infos = self.policy.get_actions(observations)
                agent_infos = split_tensor_dict_list(agent_infos)
                for t, action, agent_info in zip(itertools.count(), actions,
                                                 agent_infos):
                    offpol_trajs[t][running_path_idx[t]]['agent_infos'][
                        running_intra_path_idx[t]] = agent_info
                    # INDEX JUGGLING:
                    if -0.5 < running_intra_path_idx[t] < len(offpol_trajs[t][
                            running_path_idx[t]]['rewards']) - 1:
                        # if we haven't reached the end:
                        running_intra_path_idx[t] += 1
                    else:

                        if -0.5 < running_path_idx[t] < len(
                                offpol_trajs[t]) - 1:
                            # we wrap up the agent_infos
                            offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \
                                stack_tensor_dict_list(offpol_trajs[t][running_path_idx[t]]['agent_infos'])
                            # if we haven't reached the last path:
                            running_intra_path_idx[t] = 0
                            running_path_idx[t] += 1
                        elif running_path_idx[t] == len(offpol_trajs[t]) - 1:
                            offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \
                                stack_tensor_dict_list(offpol_trajs[t][running_path_idx[t]]['agent_infos'])
                            running_intra_path_idx[t] = -1
                            running_path_idx[t] = -1
                        else:
                            # otherwise we set the running index to -1 to signal a stop
                            running_intra_path_idx[t] = -1
                            running_path_idx[t] = -1
        total_time = time.time() - start
        # logger.record_tabular(log_prefix+"TotalExecTime", total_time)
        return offpol_trajs
Example #10
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            init_state=None,
            no_action=False,
            using_gym=False,
            noise=0,
            o=None,
            plan=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    # no_action = True
    if o is None:
        if init_state is not None:
            o = env.reset(init_state)
        else:
            o = env.reset()
        agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        if not using_gym:
            a, agent_info = agent.get_action(o)
        else:
            if hasattr(agent, 'relative_goals') and agent.relative_goals:
                ag = env_info['xy_pos'] if len(
                    env_infos) > 0 else env.init_goal_obs
                goal = plan(
                    ag,
                    env.current_goal) if plan is not None else env.current_goal
                a = agent.get_actions([o], ag, goal, noise_eps=noise)
                agent_infos = None
            else:
                a = agent.get_actions([o],
                                      env.transform_to_goal_space(o),
                                      env.current_goal,
                                      noise_eps=noise)
                # a = agent.get_actions([o], np.zeros_like(env.current_goal), np.zeros_like(env.current_goal), noise_eps=noise)
                agent_infos = None

        if no_action:
            a = np.zeros_like(a)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        if agent_infos is not None:
            agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=False)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        # agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos) if agent_infos is not None else None,
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.asarray(dones),
        last_obs=o,
    )
Example #11
    def process_samples(self, itr, paths):

        if self.normalize_reward:
            # Update reward mean/std Q.
            rewards = []
            for i in xrange(len(paths)):
                rewards.append(paths[i]['rewards'])
            rewards_flat = np.hstack(rewards)
            self._reward_mean.append(np.mean(rewards_flat))
            self._reward_std.append(np.std(rewards_flat))

            # Normalize rewards.
            reward_mean = np.mean(np.asarray(self._reward_mean))
            reward_std = np.mean(np.asarray(self._reward_std))
            for i in xrange(len(paths)):
                paths[i]['rewards'] = (paths[i]['rewards'] -
                                       reward_mean) / (reward_std + 1e-8)

        if itr > 0:
            kls = []
            for i in xrange(len(paths)):
                kls.append(paths[i]['KL'])

            kls_flat = np.hstack(kls)

            logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
            logger.record_tabular('Expl_StdKL', np.std(kls_flat))
            logger.record_tabular('Expl_MinKL', np.min(kls_flat))
            logger.record_tabular('Expl_MaxKL', np.max(kls_flat))

            # Perform normalization of the intrinsic rewards.
            if self.use_kl_ratio:
                if self.use_kl_ratio_q:
                    # Update kl Q
                    self.kl_previous.append(np.median(np.hstack(kls)))
                    previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                    for i in xrange(len(kls)):
                        kls[i] = kls[i] / previous_mean_kl

            # Add KL as intrinsic reward to the external reward
            for i in xrange(len(paths)):
                paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i]

            # Discount eta
            self.eta *= self.eta_discount

        else:
            logger.record_tabular('Expl_MeanKL', 0.)
            logger.record_tabular('Expl_StdKL', 0.)
            logger.record_tabular('Expl_MinKL', 0.)
            logger.record_tabular('Expl_MaxKL', 0.)

        baselines = []
        returns = []
        for path in paths:
            path_baselines = np.append(self.baseline.predict(path), 0)
            deltas = path["rewards"] + \
                self.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards_orig"],
                                                      self.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if not self.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.center_adv:
                advantages = util.center_advantages(advantages)

            if self.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [
                sum(path["rewards_orig"]) for path in paths
            ]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array(
                [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array(
                [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array(
                [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Example #12
    def obtain_samples(self,
                       itr,
                       num_samples=None,
                       log=True,
                       log_prefix='RandomSampler-'):
        if num_samples is None:
            num_samples = self.algo.batch_size

        paths = []
        n_samples_collected = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(num_samples)
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples_collected < num_samples:
            # random actions
            t = time.time()
            actions = np.stack([
                self.vec_env.action_space.sample() for _ in range(len(obses))
            ],
                               axis=0)
            policy_time = time.time() - t
            agent_infos = {}

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples_collected += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        if log:
            logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
            logger.record_tabular(log_prefix + "EnvExecTime", env_time)
            logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        return paths
Example #13
def sample_paths(N,
                 policy,
                 baseline,
                 env_mode='train',
                 T=1e6,
                 gamma=1,
                 mujoco_env=True,
                 normalized_env=False,
                 env=None):
    # Directly specifying env works only when sampling in series

    # set random seed (needed for multiprocessing)
    np.random.seed()

    if env is None:
        env = get_environment(env_mode)
    T = min(T, env.horizon)
    T = max(1, T)
    # sometimes, env is not initialized correctly in multiprocessing
    # this is just a sanity check and step size should essentially be zero.

    print("####### Worker started #######")

    paths = []

    for ep in range(N):

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []
        qpos = []
        qvel = []

        o = env.reset()
        if mujoco_env:
            if normalized_env:
                qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
                qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
            else:
                qpos.append(env.env.model.data.qpos.reshape(-1))
                qvel.append(env.env.model.data.qvel.reshape(-1))
        done = False
        t = 0

        while t < T and not done:
            a, agent_info = policy.get_action(o)
            next_o, r, done, env_info = env.step(a)
            observations.append(env.observation_space.flatten(o))
            actions.append(env.action_space.flatten(a))
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            if mujoco_env:
                if normalized_env:
                    qpos.append(
                        env.wrapped_env.env.model.data.qpos.reshape(-1))
                    qvel.append(
                        env.wrapped_env.env.model.data.qvel.reshape(-1))
                else:
                    qpos.append(env.env.model.data.qpos.reshape(-1))
                    qvel.append(env.env.model.data.qvel.reshape(-1))
            o = next_o
            t += 1

        # make a path dictionary
        # Also store the path belief and env data used in the trajectory
        #try:
        #    path_belief = env.env.belief
        #except Exception as e:
        #    path_belief = str(e)
        # path_model = env.env

        qpos_flat = tensor_utils.stack_tensor_list(qpos)
        qvel_flat = tensor_utils.stack_tensor_list(qvel)

        path = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            qpos=qpos_flat,
            qvel=qvel_flat,
            #path_belief=path_belief,
            #path_model=path_model,
        )

        # TODO: Storing the path model is too space inefficient. Need to find alternative

        # compute returns using the path
        path_baseline = baseline.predict(path)
        advantages = []
        returns = []
        return_so_far = 0
        for t in range(len(rewards) - 1, -1, -1):
            return_so_far = rewards[t] + gamma * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - path_baseline[t]
            advantages.append(advantage)

        # advantages and returns are stored backward in time
        advantages = np.array(advantages[::-1])
        returns = np.array(returns[::-1])

        # normalize advantages
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) +
                                                           1e-8)

        path["advantages"] = advantages
        path["returns"] = returns

        paths.append(path)

    #print "Env body_mass : ", env.env.model.body_mass[1]
    print("====== Worker finished ======")

    return paths
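
The backward accumulation of returns in the loop above is equivalent to the usual discounted-sum definition. A tiny self-check with illustrative values:

import numpy as np

rewards = np.array([1.0, 0.0, 2.0])
gamma = 0.9

# Forward definition: G[t] = sum_{k >= t} gamma**(k - t) * r[k]
forward = np.array([sum(gamma ** (k - t) * rewards[k]
                        for k in range(t, len(rewards)))
                    for t in range(len(rewards))])

# Backward accumulation as in the loop above (built reversed, then flipped).
returns, g = [], 0.0
for r in rewards[::-1]:
    g = r + gamma * g
    returns.append(g)
backward = np.array(returns[::-1])

assert np.allclose(forward, backward)  # [2.62, 1.8, 2.0]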
Example #14
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='', extra_input=None, extra_input_dim=None, preupdate=False, save_img_obs=False):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        if extra_input is not None:
            if extra_input == "onehot_exploration":
                if preupdate:
                    print("debug, using extra_input onehot")
                    def expand_obs(obses, path_nums):
                        extra = [special.to_onehot(path_num % extra_input_dim, extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)
                else:
                    print("debug, using extra_input zeros")
                    def expand_obs(obses, path_nums):
                        extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra),axis=1)
            elif extra_input == "onehot_hacked":
                if preupdate:
                    print("debug, using extra_input onehot")
                    def expand_obs(obses, path_nums):
                        extra = [special.to_onehot(3, extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)
                else:
                    print("debug, using extra_input zeros")
                    def expand_obs(obses, path_nums):
                        extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra),axis=1)
            elif extra_input == "gaussian_exploration":
                if preupdate:
                    print("debug, using extra_input gaussian")

                    def expand_obs(obses, path_nums):
                        extra = [np.random.normal(0.,1.,size=(extra_input_dim,)) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)
                else:
                    print("debug, using extra_input zeros")
                    def expand_obs(obses, path_nums):
                        extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)


            else:
                def expand_obs(obses, path_nums):
                    return obses
        else:
            def expand_obs(obses, path_nums):
                return obses
        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            assert False, "debug, should we be using this?"
            print("WARNING, will vectorize reset_args")
            reset_args = [reset_args]*self.vec_env.num_envs


        n_samples = 0
        path_nums = [0] * self.vec_env.num_envs # keeps track of which rollout we are on for each environment instance
        obses = self.vec_env.reset(reset_args)
        obses = expand_obs(obses, path_nums)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            # print("debug, agent_infos", agent_infos)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)   # TODO: instead of receiving obs from the env, we'll receive it from the policy as a feed_dict
            next_obses = expand_obs(next_obses,path_nums)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])  # TODO: let's also add the incomplete running_paths to paths
                    running_paths[idx] = None
                    path_nums[idx] += 1
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        # adding the incomplete paths
        # for idx in range(self.vec_env.num_envs):
        #     if running_paths[idx] is not None:
        #         paths[idx].append(dict(
        #             observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
        #             actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
        #             rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
        #             env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
        #             agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
        #         ))


        pbar.stop()

      #  logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
      #  logger.record_tabular(log_prefix + "EnvExecTime", env_time)
       # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
Example #15
def rarl_rollout(env,
                 agent1,
                 agent2,
                 policy_num,
                 max_path_length=np.inf,
                 animated=False,
                 speedup=1,
                 always_return_paths=False):
    #logger.log("rollout~~~~~~~~~~~~~~~~~~~")
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent1.reset()
    agent2.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a1, agent1_info = agent1.get_action(o)
        a2, agent2_info = agent2.get_action(o)
        action_true = np.append(a1, a2)
        Action = {}
        Action['action'] = np.append(a1, a2)
        # Action['dist1'] = agent1_info
        # Action['dist2'] = agent2_info
        Action['policy_num'] = policy_num
        next_o, r, d, env_info = env.step(Action)
        # print(' ')
        # print('policy_num: ',policy_num,' a1: ',a1,' a2: ',a2,' reward: ',r)

        if policy_num == 1:
            observations.append(agent1._env_spec.observation_space.flatten(o))
            rewards.append(r)
            actions.append(agent1._env_spec.action_space.flatten(a1))
            agent_infos.append(agent1_info)
        else:
            observations.append(agent2._env_spec.observation_space.flatten(o))
            rewards.append(r)
            actions.append(agent2._env_spec.action_space.flatten(a2))
            agent_infos.append(agent2_info)

        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
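
A minimal usage sketch of rarl_rollout (protagonist, adversary and env are placeholder names for two rllab-style policies and their shared environment); policy_num selects whose observations, actions and rewards get recorded:

protagonist_paths = [rarl_rollout(env, protagonist, adversary, policy_num=1,
                                  max_path_length=200) for _ in range(10)]
adversary_paths = [rarl_rollout(env, protagonist, adversary, policy_num=2,
                                max_path_length=200) for _ in range(10)]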
Exemple #16
0
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            self.algo.policy.reset(dones)
            actions, agent_infos = self.algo.policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
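
A small downstream sketch (summarize_paths is a hypothetical helper) of what is typically done with the paths returned above, mirroring the undiscounted-return logging used elsewhere in this document:

import numpy as np

def summarize_paths(paths):
    undiscounted_returns = [np.sum(p["rewards"]) for p in paths]
    return dict(n_trajs=len(paths),
                mean_return=np.mean(undiscounted_returns),
                max_return=np.max(undiscounted_returns))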
Exemple #17
0
def rollout_torch(env,
                  agent,
                  max_path_length=np.inf,
                  animated=False,
                  speedup=1,
                  always_return_paths=False,
                  extra_clip=False,
                  terminate_only_max_path=False):
    observations = []
    next_observations = []
    normalized_observations = []
    normalized_next_observations = []
    unscaled_actions = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    mask = []
    o = env.reset()
    try:
        agent.reset()
    except AttributeError:
        pass
    path_length = 0
    t = 0

    def handle_obs(o):
        # get list with bools if output of env is normalized
        if isinstance(env, TorchModel):
            normalized_obs = env.normalized_output
        else:
            normalized_obs = [False] * len(o)

        unnormalized_idx = [i for i, x in enumerate(normalized_obs) if not x]
        normalized_idx = [i for i, x in enumerate(normalized_obs) if x]

        lb, ub = env.observation_space.bounds
        # normalize the unnormalized idx
        normalized_unnormalized_val = (
            2 * (o[unnormalized_idx] - lb[unnormalized_idx]) /
            (ub[unnormalized_idx] - lb[unnormalized_idx])) - 1
        normalized_unnormalized_val = np.clip(normalized_unnormalized_val, -1,
                                              1)
        # unnormalize the normalized idx
        unnormalized_normalized_val = lb[normalized_idx] + (
            o[normalized_idx] + 1.) * 0.5 * (ub[normalized_idx] -
                                             lb[normalized_idx])
        unnormalized_normalized_val = np.clip(unnormalized_normalized_val,
                                              lb[normalized_idx],
                                              ub[normalized_idx])

        # put everything together
        normalized_obs = np.zeros(o.shape)
        normalized_obs[normalized_idx] = o[normalized_idx]
        normalized_obs[unnormalized_idx] = normalized_unnormalized_val
        unnormalized_obs = np.zeros(o.shape)
        unnormalized_obs[unnormalized_idx] = o[unnormalized_idx]
        unnormalized_obs[normalized_idx] = unnormalized_normalized_val
        # do extra clipping since original values could be out of bounds
        if extra_clip:
            normalized_obs = np.clip(normalized_obs, -1, 1)
            unnormalized_obs = np.clip(unnormalized_obs, lb, ub)

        # TODO: build own function for this
        # select the right observations for the agent
        normalized_policy_input = agent.normalized_input
        normalized_policy_input_idx = [
            i for i, x in enumerate(normalized_policy_input) if x
        ]
        unnormalized_policy_input_idx = [
            i for i, x in enumerate(normalized_policy_input) if not x
        ]

        policy_input = np.zeros(o.shape)
        policy_input[normalized_policy_input_idx] = normalized_obs[
            normalized_policy_input_idx]
        policy_input[unnormalized_policy_input_idx] = unnormalized_obs[
            unnormalized_policy_input_idx]
        agent_obs_torch_var = (torch.from_numpy(policy_input.astype(
            np.float32))).unsqueeze(0)

        # select the right observations for the env
        if isinstance(env, TorchModel):
            normalized_env_input = env.normalized_input_obs
        else:
            normalized_env_input = [False] * len(o)
        normalized_env_input_idx = [
            i for i, x in enumerate(normalized_env_input) if x
        ]
        unnormalized_env_input_idx = [
            i for i, x in enumerate(normalized_env_input) if not x
        ]
        env_input = np.zeros(o.shape)
        env_input[normalized_env_input_idx] = normalized_obs[
            normalized_env_input_idx]
        env_input[unnormalized_env_input_idx] = unnormalized_obs[
            unnormalized_env_input_idx]
        env_obs_torch_var = (torch.from_numpy(env_input.astype(np.float32)))

        return normalized_obs, unnormalized_obs, agent_obs_torch_var, env_obs_torch_var

    def handle_action(a):
        normalized_a = agent.normalized_output
        # scale only the normalized action outputs
        unnormalized_idx = [i for i, x in enumerate(normalized_a) if not x]
        normalized_idx = [i for i, x in enumerate(normalized_a) if x]

        lb, ub = env.action_space.bounds

        # normalize the unnormalized idx
        normalized_unnormalized_val = (
            2 * (a[unnormalized_idx] - lb[unnormalized_idx]) /
            (ub[unnormalized_idx] - lb[unnormalized_idx])) - 1
        normalized_unnormalized_val = np.clip(normalized_unnormalized_val, -1,
                                              1)
        # unnormalize the normalized idx
        unnormalized_normalized_val = lb[normalized_idx] + (
            a[normalized_idx] + 1.) * 0.5 * (ub[normalized_idx] -
                                             lb[normalized_idx])
        unnormalized_normalized_val = np.clip(unnormalized_normalized_val,
                                              lb[normalized_idx],
                                              ub[normalized_idx])

        # put everything together
        normalized_a = np.zeros(a.shape)
        normalized_a[normalized_idx] = a[normalized_idx]
        normalized_a[unnormalized_idx] = normalized_unnormalized_val
        unnormalized_a = np.zeros(a.shape)
        unnormalized_a[unnormalized_idx] = a[unnormalized_idx]
        unnormalized_a[normalized_idx] = unnormalized_normalized_val

        # do extra clipping since original values could be out of bounds
        if extra_clip:
            normalized_a = np.clip(normalized_a, -1, 1)
            unnormalized_a = np.clip(unnormalized_a, lb, ub)

        unscaled_a = normalized_a
        action = unnormalized_a

        # select the right actions for the env
        if isinstance(env, TorchModel):
            normalized_env_input = env.normalized_input_a
        else:
            normalized_env_input = [False] * len(a)
        normalized_env_input_idx = [
            i for i, x in enumerate(normalized_env_input) if x
        ]
        unnormalized_env_input_idx = [
            i for i, x in enumerate(normalized_env_input) if not x
        ]
        env_input = np.zeros(a.shape)
        env_input[normalized_env_input_idx] = normalized_a[
            normalized_env_input_idx]
        env_input[unnormalized_env_input_idx] = unnormalized_a[
            unnormalized_env_input_idx]
        env_a_np_var = env_input

        return action, unscaled_a, env_a_np_var

    if animated:
        env.render()
    while path_length < max_path_length:
        # TODO: it might be the case that the env is not giving a numpy array
        normalized_o, o, agent_obs_torch, env_obs_torch = handle_obs(o)
        a, agent_info = agent.select_action(agent_obs_torch, t)
        #print(a, agent_obs_torch)
        a, unscaled_a, env_a_torch = handle_action(a)
        if isinstance(env, TorchModel):
            #print(env_a_torch, env_obs_torch, o)
            #print(a, unscaled_a, env_a_torch)
            next_orig_o, r, d, env_info = env.step(env_a_torch, env_obs_torch,
                                                   o)
        else:
            next_orig_o, r, d, env_info = env.step(a)
        normalized_next_o, next_o, _, _ = handle_obs(next_orig_o)
        observations.append(env.observation_space.flatten(o))
        normalized_observations.append(
            env.observation_space.flatten(normalized_o))
        next_observations.append(next_o)
        normalized_next_observations.append(normalized_next_o)
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        unscaled_actions.append(env.action_space.flatten(unscaled_a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            print(o, r, a, next_o)
        path_length += 1
        if d and not terminate_only_max_path:
            mask.append(0)
            break
        elif path_length == max_path_length:
            mask.append(0)  # add termination when we reached max time
            break
        elif not d:
            mask.append(1)
        else:
            mask.append(0)
        o = next_orig_o
        t += 1
    if animated:
        try:
            env.close()
        except AttributeError:
            pass
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        next_observations=tensor_utils.stack_tensor_list(next_observations),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        mask=tensor_utils.stack_tensor_list(mask),
        normalized_observations=tensor_utils.stack_tensor_list(
            normalized_observations),
        normalized_next_observations=tensor_utils.stack_tensor_list(
            normalized_next_observations),
        unscaled_actions=tensor_utils.stack_tensor_list(unscaled_actions),
    )
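
The per-index (un)normalisation inside handle_obs and handle_action is an affine map between the box bounds [lb, ub] and the unit range [-1, 1]; a self-contained numpy sketch of that core transform (function names are hypothetical):

import numpy as np

def to_unit_range(x, lb, ub):
    # map values from [lb, ub] to [-1, 1], clipped as in handle_obs/handle_action
    return np.clip(2.0 * (x - lb) / (ub - lb) - 1.0, -1.0, 1.0)

def from_unit_range(x, lb, ub):
    # map values from [-1, 1] back to [lb, ub], clipped to the bounds
    return np.clip(lb + (x + 1.0) * 0.5 * (ub - lb), lb, ub)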
Exemple #18
0
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            save_video=True,
            video_filename='sim_out.mp4',
            reset_arg=None,
            use_maml=False,
            maml_task_index=None,
            maml_num_tasks=None,
            use_rl2=False,
            new_trial=True):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)
    if use_rl2:
        agent.reset(new_trial=new_trial)
    else:
        agent.reset()
    path_length = 0
    if animated:
        env1 = env
        while hasattr(env1, "wrapped_env"):
            env1 = env1.wrapped_env
        if hasattr(env1, "viewer_setup"):
            env1.viewer_setup()
        env.render()
    while path_length < max_path_length:
        if not use_maml and not use_rl2:
            a, agent_info = agent.get_action(observation=o)
        else:
            a, agent_info = agent.get_action_single_env(
                observation=o, idx=maml_task_index, num_tasks=maml_num_tasks)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]),
                                            image[0])
                images.append(np.flipud(np.array(pil_image)))

    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20 * speedup)
            else:
                clip.write_videofile(video_filename, fps=20 * speedup)
        #return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
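
A hedged usage sketch of the rollout above (env, policy and the output file are placeholders; saving video assumes the wrapped MuJoCo viewer used inside the function is available):

path = rollout(env, policy, max_path_length=500, animated=True,
               save_video=True, video_filename='rollout.mp4')
print(path['rewards'].sum())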
Exemple #19
0
    def obtain_samples(self, itr, oracle_policy):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        agent_only_paths = []
        oracle_only_paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        agent_only_running_paths = [None] * self.vec_env.num_envs
        oracle_only_running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)

            agent_actions, binary_actions, agent_infos = policy.get_actions(
                obses)
            oracle_actions, oracle_agent_infos = oracle_policy.get_actions(
                obses)
            sigma = np.round(binary_actions)

            actions_1 = np.array([
                sigma[0, 0] * agent_actions[0, :] +
                sigma[0, 1] * oracle_actions[0, :]
            ])
            actions_2 = np.array([
                sigma[1, 0] * agent_actions[1, :] +
                sigma[1, 1] * oracle_actions[1, :]
            ])

            actions = np.concatenate((actions_1, actions_2), axis=0)

            policy_time += time.time() - t
            t = time.time()

            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, itr)

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]

            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            if sigma[0, 0] == 1 or sigma[1, 0] == 1:

                for idx, observation, action, reward, env_info, agent_info, done in zip(
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if agent_only_running_paths[idx] is None:
                        agent_only_running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    agent_only_running_paths[idx]["observations"].append(
                        observation)
                    agent_only_running_paths[idx]["actions"].append(action)
                    agent_only_running_paths[idx]["rewards"].append(reward)
                    agent_only_running_paths[idx]["env_infos"].append(env_info)
                    agent_only_running_paths[idx]["agent_infos"].append(
                        agent_info)

                    if done:
                        agent_only_paths.append(
                            dict(
                                observations=self.env_spec.observation_space.
                                flatten_n(agent_only_running_paths[idx]
                                          ["observations"]),
                                actions=self.env_spec.action_space.flatten_n(
                                    agent_only_running_paths[idx]["actions"]),
                                rewards=tensor_utils.stack_tensor_list(
                                    agent_only_running_paths[idx]["rewards"]),
                                env_infos=tensor_utils.stack_tensor_dict_list(
                                    agent_only_running_paths[idx]
                                    ["env_infos"]),
                                agent_infos=tensor_utils.
                                stack_tensor_dict_list(
                                    agent_only_running_paths[idx]
                                    ["agent_infos"]),
                            ))
                        n_samples += len(
                            agent_only_running_paths[idx]["rewards"])
                        agent_only_running_paths[idx] = None
            """
            To get paths taken by the oracle
            """
            # elif sigma[0] == 0. or sigma[1] == 0.:

            #     for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
            #                                                                             rewards, env_infos, agent_infos,
            #                                                                             dones):
            #         if oracle_only_running_paths[idx] is None:
            #             oracle_only_running_paths[idx] = dict(
            #                 observations=[],
            #                 actions=[],
            #                 rewards=[],
            #                 env_infos=[],
            #                 agent_infos=[],
            #             )
            #         oracle_only_running_paths[idx]["observations"].append(observation)
            #         oracle_only_running_paths[idx]["actions"].append(action)
            #         oracle_only_running_paths[idx]["rewards"].append(reward)
            #         oracle_only_running_paths[idx]["env_infos"].append(env_info)
            #         oracle_only_running_paths[idx]["agent_infos"].append(agent_info)

            #         if done:
            #             oracle_only_paths.append(dict(
            #                 observations=self.env_spec.observation_space.flatten_n(oracle_only_running_paths[idx]["observations"]),
            #                 actions=self.env_spec.action_space.flatten_n(oracle_only_running_paths[idx]["actions"]),
            #                 rewards=tensor_utils.stack_tensor_list(oracle_only_running_paths[idx]["rewards"]),
            #                 env_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["env_infos"]),
            #                 agent_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["agent_infos"]),
            #             ))
            #             n_samples += len(oracle_only_running_paths[idx]["rewards"])
            #             oracle_only_running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        #return paths, agent_only_paths, oracle_only_paths
        return paths, agent_only_paths
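
The gating above hard-codes exactly two parallel environments (rows 0 and 1 of sigma); a minimal numpy sketch of the same mix for any number of envs, assuming sigma has one row per env with columns (learner, oracle):

import numpy as np

def mix_actions(sigma, agent_actions, oracle_actions):
    # per-env selection between the learner's and the oracle's action
    sigma = np.round(sigma)
    return sigma[:, :1] * agent_actions + sigma[:, 1:] * oracle_actions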
Exemple #20
0
def rollout(env, agent, max_path_length=np.inf, reset_start_rollout=True, keep_rendered_rgbs=False,
            animated=False, speedup=1):
    """
    :param reset_start_rollout: whether to reset the env when calling this function
    :param keep_rendered_rgbs: whether to keep a list of all rgb_arrays (for future video making)
    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    terminated = []
    if reset_start_rollout:
        o = env.reset()  # otherwise it will never advance!!
    else:
        if isinstance(env, NormalizedEnv):
            o = env.wrapped_env.get_current_obs()
        else:
            o = env.get_current_obs()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    if keep_rendered_rgbs:  # will return a new entry to the path dict with all rendered images
        rendered_rgbs = [env.render(mode='rgb_array')]
    while path_length < max_path_length:
        # print("next_o", len(o))
        # print("env", env)
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        # print("next_obs", next_o.shape)
        # print("env", env)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            terminated.append(1)
            break
        terminated.append(0)
        o = next_o
        if keep_rendered_rgbs:  # will return a new entry to the path dict with all rendered images
            rendered_rgbs.append(env.render(mode='rgb_array'))
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    # if animated:   # this is off as in the case of being an inner rollout, it will close the outer renderer!
        # env.render(close=True)

    path_dict = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),  # here it concatenates all lower-level paths!
        # terminated indicates whether the rollout ended via the done signal or simply hit the step limit: important
        # when BOTH happen at the same time, to still be able to tell it was a real done (for hierarchized envs)
        terminated=tensor_utils.stack_tensor_list(terminated),
    )
    if keep_rendered_rgbs:
        path_dict['rendered_rgbs'] = tensor_utils.stack_tensor_list(rendered_rgbs)

    return path_dict
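
When keep_rendered_rgbs=True, the returned 'rendered_rgbs' array can be written out the same way Exemple #18 does; a short sketch (the fps value and the file name are assumptions):

import moviepy.editor as mpy

path = rollout(env, policy, max_path_length=200, keep_rendered_rgbs=True)
clip = mpy.ImageSequenceClip(list(path['rendered_rgbs']), fps=20)
clip.write_videofile('rollout_frames.mp4', fps=20)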
Exemple #21
0
    def process_samples(self, itr, paths, prefix='', log=True, fast_process=False, testitr=False, metalearn_baseline=False, isExpertTraj=False):
        baselines = []
        returns = []
        if testitr:
            metalearn_baseline = False
        train_baseline = (itr in BASELINE_TRAINING_ITRS)
        if not fast_process:
            for idx, path in enumerate(paths):
                path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        if not fast_process and not metalearn_baseline:
            if log:
                pass
                #logger.log("fitting baseline...")
            if hasattr(self.algo.baseline, 'fit_with_samples'):
                self.algo.baseline.fit_with_samples(paths, samples_data)  # TODO: doesn't seem like this is ever used
            else:
                # print("debug21 baseline before fitting",self.algo.baseline.predict(paths[0])[0:2], "...",self.algo.baseline.predict(paths[0])[-3:-1])
                # print("debug23 predloss before fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths]))

                self.algo.baseline.fit(paths, log=log)
                # print("debug25 predloss AFTER  fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths]))
                # print("debug22 returns                ",paths[0]['returns'][0:2], "...",paths[0]['returns'][-3:-1])
                # print("debug24 baseline after  fitting",self.algo.baseline.predict(paths[0])[0:2], "...", self.algo.baseline.predict(paths[0])[-3:-1])
            if log:
                pass
                #logger.log("fitted")

            if 'switch_to_init_dist' in dir(self.algo.baseline):
                self.algo.baseline.switch_to_init_dist()

            if train_baseline:
                self.algo.baseline.fit_train_baseline(paths)

            if hasattr(self.algo.baseline, "predict_n"):
                all_path_baselines = self.algo.baseline.predict_n(paths)
            else:
                all_path_baselines = [self.algo.baseline.predict(path) for path in paths]


        for idx, path in enumerate(paths):
            if not fast_process and not metalearn_baseline:
                # if idx==0:
                    # print("debug22", all_path_baselines[idx])
                    # print("debug23", path['returns'])

                path_baselines = np.append(all_path_baselines[idx], 0)
                deltas = path["rewards"] + \
                         self.algo.discount * path_baselines[1:] - \
                         path_baselines[:-1]
                path["advantages"] = special.discount_cumsum(
                    deltas, self.algo.discount * self.algo.gae_lambda)
                baselines.append(path_baselines[:-1])
            if not fast_process:
                returns.append(path["returns"])
            if "expert_actions" not in path.keys():
                if ("expert_actions" in path["env_infos"].keys()):
                    path["expert_actions"] = path["env_infos"]["expert_actions"]
              
    
                else:
                    # assert False, "you shouldn't need expert_actions"
                    path["expert_actions"] = np.array([[None]*len(path['actions'][0])] * len(path['actions']))


        if not fast_process and not metalearn_baseline: # TODO: we want the ev eventually
            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )
            l2 = np.linalg.norm(np.array(baselines)-np.array(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])

            if not fast_process:
                rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
                returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])

            if "env_infos" in paths[0].keys():
                env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])

            if not fast_process and not metalearn_baseline:
                advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
                # print("debug, advantages are", advantages,)
                # print("debug, shape of advantages is", type(advantages), np.shape(advantages))

            expert_actions = tensor_utils.concat_tensor_list([path["expert_actions"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

            if not fast_process and not metalearn_baseline:
                if self.algo.center_adv:
                    advantages = util.center_advantages(advantages)
                if self.algo.positive_adv:
                    advantages = util.shift_advantages_to_positive(advantages)
                if "meta_predict" in dir(self.algo.baseline):
                    # print("debug, advantages are", advantages, )
                    advantages = advantages + self.algo.baseline.meta_predict(observations)
                    print("debug, metalearned baseline constant is", self.algo.baseline.meta_predict(observations)[0:2],"...",self.algo.baseline.meta_predict(observations)[-3:-1])
                    # print("debug, metalearned baseline constant shape is", np.shape(self.algo.baseline.meta_predict(observations)))
                # print("debug, advantages are", advantages[0:2],"...", advantages[-3:-1])
                # print("debug, advantages shape is", np.shape(advantages))

            # average_discounted_return = \
            #     np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path.get("rewards",[0])) for path in paths]

            # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
            if fast_process:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
            elif metalearn_baseline:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
                if 'agent_infos_orig' in paths[0].keys():
                    agent_infos_orig = tensor_utils.concat_tensor_dict_list([path["agent_infos_orig"] for path in paths])
                    samples_data["agent_infos_orig"] = agent_infos_orig
            else:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    advantages=advantages,
                    env_infos=env_infos,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
                if 'agent_infos_orig' in paths[0].keys():
                    agent_infos_orig = tensor_utils.concat_tensor_dict_list([path["agent_infos_orig"] for path in paths])
                    samples_data["agent_infos_orig"] = agent_infos_orig

        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate([path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
            )

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
            )

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path.get("rewards",[0])) for path in paths]

            # ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )
        if log:
            # logger.record_tabular('Iteration', itr)
            # logger.record_tabular('AverageDiscountedReturn',
            #                      average_discounted_return)
            logger.record_tabular(prefix + 'NumTrajs', len(paths))
            if testitr and prefix == "1":
             # TODO make this functional for more than 1 iteration
                self.memory["AverageReturnLastTest"]=np.mean(undiscounted_returns)
                self.memory["AverageReturnBestTest"]=max(self.memory["AverageReturnLastTest"],self.memory["AverageReturnBestTest"])
                if self.memory["AverageReturnBestTest"] == 0.0:
                    self.memory["AverageReturnBestTest"] = self.memory["AverageReturnLastTest"]

            if not testitr and prefix == '1':
                logger.record_tabular(prefix + 'AverageExpertReturn', np.mean(undiscounted_returns))

            #if testitr:

            logger.record_tabular(prefix + 'AverageReturn', np.mean(undiscounted_returns))
            logger.record_tabular(prefix + 'StdReturn', np.std(undiscounted_returns))
            logger.record_tabular(prefix + 'MaxReturn', np.max(undiscounted_returns))
            logger.record_tabular(prefix + 'MinReturn', np.min(undiscounted_returns))


            if not fast_process and not metalearn_baseline:
                logger.record_tabular(prefix + 'ExplainedVariance', ev)
                logger.record_tabular(prefix + 'BaselinePredLoss', l2)

            # logger.record_tabular(prefix + 'Entropy', ent)
            # logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
            # if "env_infos" in paths[0].keys() and "success_left" in paths[0]["env_infos"].keys():
            #     logger.record_tabular(prefix + 'success_left', eval_success_left(paths))
            #     logger.record_tabular(prefix + 'success_right', eval_success_right(paths))
            # else:
                # logger.record_tabular(prefix + 'success_left', -1.0)
                # logger.record_tabular(prefix + 'success_right', -1.0)
        # if metalearn_baseline:
        #     if hasattr(self.algo.baseline, "revert"):
        #         self.algo.baseline.revert()

        return samples_data
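
The advantage computation above reduces to TD residuals followed by a discounted cumulative sum; a self-contained numpy sketch equivalent to the special.discount_cumsum path used in process_samples (function names are hypothetical):

import numpy as np

def discount_cumsum(x, discount):
    # y[t] = sum_{k >= t} discount**(k - t) * x[k]
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

def gae_advantages(rewards, baselines, discount, gae_lambda):
    # baselines has one value per step; append 0 for the terminal state as above
    b = np.append(baselines, 0)
    deltas = rewards + discount * b[1:] - b[:-1]
    return discount_cumsum(deltas, discount * gae_lambda)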
Exemple #22
0
    def obtain_samples(self,
                       itr,
                       max_path_length,
                       batch_size,
                       max_n_trajs=None):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        dones = np.asarray([True] * self.vec_env.n_envs)
        obses = self.vec_env.reset(dones)
        running_paths = [None] * self.vec_env.n_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.policy
        import time
        while n_samples < batch_size:
            t = time.time()
            if hasattr(self.vec_env, "handle_policy_reset"):
                self.vec_env.handle_policy_reset(policy, dones)
            else:
                policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, max_path_length=max_path_length)

            if np.any(dones):
                new_obses = self.vec_env.reset(dones)
                reset_idx = 0
                for idx, done in enumerate(dones):
                    if done:
                        next_obses[idx] = new_obses[reset_idx]
                        reset_idx += 1

            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.n_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.n_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))

                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            if max_n_trajs is not None and len(paths) >= max_n_trajs:
                break

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
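
The per-environment partial reset above (only finished envs receive fresh observations) can be factored into a small helper; a sketch under the same assumption that vec_env.reset(dones) returns one observation per done env, in order:

import numpy as np

def apply_partial_reset(vec_env, next_obses, dones):
    # overwrite the observations of finished envs with their reset observations
    if np.any(dones):
        new_obses = vec_env.reset(dones)
        reset_idx = 0
        for idx, done in enumerate(dones):
            if done:
                next_obses[idx] = new_obses[reset_idx]
                reset_idx += 1
    return next_obses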
Exemple #23
0
def rollout_hide(env,
                 agents,
                 max_path_length=np.inf,
                 animated=False,
                 speedup=1,
                 always_return_paths=False,
                 mode=None,
                 hide_tmax=None,
                 init_state=None,
                 init_goal=None,
                 return_states_as_list=False):

    ## HIDE AGENT
    # animated = True
    # Reset the model configuration
    # print('Init goal: ', init_goal)
    # reload the model with the given pose/goal (same call for Blocks and other envs)
    env.reset()
    obs = env.env.env.reload_model(pose=init_state, goal=init_goal)

    # time.sleep(1)
    # if animated:
    #     print('rollout: HIDE')
    #     frame_skip_prev = env.env.unwrapped.frame_skip
    #     env.env.unwrapped.frame_skip = 20

    hide_observations = []
    hide_states = []
    hide_actions = []
    hide_rewards = []
    hide_agent_infos = []
    hide_env_infos = []

    # Hide is capable of stopping so let's set stop if available
    # WARNING: It is important to do all this stuff after reset, since
    # blocks dependent stuff could be reset from config file as well
    if mode is not None:
        if mode == 'seek_force_only':
            env.env.env.use_stop = True
            env.env.env.add_mnist_reward(False)
            env.env.env.use_mnist_stop_criteria(False)
        elif mode == 'reach_center_and_stop':
            env.env.env.use_stop = True
            env.env.env.use_distance2center_stop_criteria = False
            prev_set_limit = env.env.unwrapped.step_limit
            if hide_tmax is not None:
                env.env.unwrapped.step_limit = hide_tmax

    agents['hide'].reset()
    hide_path_length = 0

    if animated:
        env.render()

    while hide_path_length < max_path_length:
        a, agent_info = agents['hide'].get_action(obs)
        if animated:
            env.render()

        # need to do it before the step, to match states to observations in the vector
        hide_states.append(env.env.unwrapped.get_all_pose())

        obs_next, r, d, env_info = env.step(a)
        # print('action:', a)

        hide_observations.append(obs)
        hide_rewards.append(r)
        hide_actions.append(env.action_space.flatten(a))
        hide_agent_infos.append(agent_info)
        hide_env_infos.append(env_info)
        hide_path_length += 1
        obs = obs_next
        if d:
            print('Hide | path_length:', hide_path_length)
            break

    if mode is not None:
        if mode == 'seek_force_only':
            env.env.env.use_stop = False
            env.env.env.add_mnist_reward(True)
            env.env.env.use_mnist_stop_criteria(True)
        elif mode == 'reach_center_and_stop':
            env.env.env.use_stop = False
            env.env.env.use_distance2center_stop_criteria = True
            if hide_tmax is not None:
                env.env.unwrapped.step_limit = prev_set_limit

    if not return_states_as_list:
        hide_states = tensor_utils.stack_tensor_list(hide_states)

    hide_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(hide_observations),
        actions=tensor_utils.stack_tensor_list(hide_actions),
        rewards=tensor_utils.stack_tensor_list(hide_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(hide_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(hide_env_infos),
        states=hide_states,
    )
    # print('Episode done:', hide_path_length)
    return hide_paths
Exemple #24
0
    def obtain_samples(self,
                       itr,
                       init_state=None,
                       reset_args=None,
                       return_dict=False,
                       log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []
        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list
                                       and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs
        if init_state is not None:
            init_state = [init_state] * self.vec_env.num_envs

            n_samples = 0
            obses = self.vec_env.reset(init_state, reset_args)
            dones = np.asarray([True] * self.vec_env.num_envs)
            running_paths = [None] * self.vec_env.num_envs
        else:

            n_samples = 0
            obses = self.vec_env.reset(reset_args)
            dones = np.asarray([True] * self.vec_env.num_envs)
            running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        #policy = self.algo.policy
        import time

        while n_samples < self.algo.max_path_length:
            t = time.time()
            #self.env_spec.reset(reset_args = reset_args)
            #policy.reset(dones)
            actions, agent_infos = self.get_MPC_action(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(
                        dict(
                            observations=running_paths[idx]["observations"],
                            actions=running_paths[idx]["actions"],
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
Exemple #25
0
def rollout_hide_seek(env,
                      agents,
                      max_path_length=np.inf,
                      animated=False,
                      speedup=1,
                      always_return_paths=False,
                      mode=None,
                      hide_tmax=None):
    # animated = True
    ## HIDE AGENT
    #Reset the model configuration
    env.reset()
    obs = env.env.env.reload_model()
    last_goal = env.env.unwrapped.get_all_pose()

    # print('-----------------------------------------------------')
    # print('goal hide: ', env.env.env.goal, 'obs:', obs)

    # if animated:
    #     print('rollout: HIDE')
    # print('Frame skip = ', env.env.unwrapped.frame_skip)
    # frame_skip_prev = env.env.unwrapped.frame_skip
    # env.env.unwrapped.frame_skip = 20

    hide_observations = []
    hide_actions = []
    hide_rewards = []
    hide_agent_infos = []
    hide_env_infos = []

    # Hide is capable of stopping so let's set stop if available
    # WARNING: It is important to do all this stuff after reset, since
    # blocks dependent stuff could be reset from config file as well
    if mode is not None:
        if mode == 'seek_force_only':
            env.env.env.use_stop = True
            env.env.env.add_mnist_reward(False)
            env.env.env.use_mnist_stop_criteria(False)
        elif mode == 'reach_center_and_stop':
            env.env.env.use_stop = True
            env.env.env.use_distance2center_stop_criteria = False
            prev_set_limit = env.env.unwrapped.step_limit
            if hide_tmax is not None:
                env.env.unwrapped.step_limit = hide_tmax
            # print('rollout: hide step_limit = ', env.env.unwrapped.step_limit)

    agents['hide'].reset()
    hide_path_length = 0

    if animated:
        env.render()
    # print('rollout: HIDE')
    while hide_path_length < max_path_length:
        a, agent_info = agents['hide'].get_action(obs)
        # print('hide action: ', a)
        if animated:
            env.render()
        obs_next, r, d, env_info = env.step(a)
        hide_observations.append(obs)
        hide_rewards.append(r)
        hide_actions.append(env.action_space.flatten(a))
        hide_agent_infos.append(agent_info)
        hide_env_infos.append(env_info)
        hide_path_length += 1
        last_pose = env.env.unwrapped.get_all_pose()
        # last_goal = copy.deepcopy(env.env.env.goal)
        obs = obs_next
        # print('hide obs: ', obs_next)
        # time.sleep(0.5)
        # if r > 0:
        #     print('!!!!!!!!!!!!!! r:', r, 'stop crit: ', env.env.unwrapped.use_distance2center_stop_criteria)
        if d:
            break
            # print('step hide')
    # print('-------------------------')
    # print('goal hide last: ', env.env.env.goal, 'obs:', obs[1])

    hide_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(hide_observations),
        actions=tensor_utils.stack_tensor_list(hide_actions),
        rewards=tensor_utils.stack_tensor_list(hide_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(hide_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(hide_env_infos),
    )

    if animated:
        time.sleep(1)

    ##############################################
    # SEEK AGENT
    # print('last obs: ', obs[1])
    if env.spec.id[:6] != 'Blocks' or env.spec.id[:12] == 'BlocksSimple':
        # Avoiding randomization for blocks env
        env.reset()  # must do reset for reacher otherwise it freaks out
    obs = env.env.env.reload_model(pose=last_pose, goal=last_goal)
    # print('goal seek: ', env.env.env.goal, 'obs:', obs)

    # print('Timelen max = ', env.env.unwrapped.step_limit)
    # print('Prev limit = ', prev_set_limit)
    if animated:
        print('rollout: SEEK')
        # env.env.unwrapped.frame_skip = 10

    # print('rollout: SEEK')
    if mode is not None:
        if mode == 'seek_force_only':
            env.env.env.use_stop = False
            env.env.env.add_mnist_reward(True)
            env.env.env.use_mnist_stop_criteria(True)
        elif mode == 'reach_center_and_stop':
            env.env.env.use_stop = False
            env.env.env.use_distance2center_stop_criteria = True
            if hide_tmax is not None:
                env.env.unwrapped.step_limit = prev_set_limit

    seek_observations = []
    seek_actions = []
    seek_rewards = []
    seek_agent_infos = []
    seek_env_infos = []

    # obs = env.reset()
    agents['seek'].reset()
    seek_path_length = 0

    if animated:
        env.render()
    while seek_path_length < max_path_length:
        # if seek_path_length < 2: print('seek obs: ', obs)
        a, agent_info = agents['seek'].get_action(obs)
        if animated:
            # print('Seek obs: ', obs, 'action:', a)
            # print('action:', a)
            env.render()
        obs_next, r, d, env_info = env.step(a)
        seek_observations.append(obs)
        seek_rewards.append(r)
        seek_actions.append(env.action_space.flatten(a))
        seek_agent_infos.append(agent_info)
        seek_env_infos.append(env_info)
        seek_path_length += 1

        if d:
            # print('break ...')
            break
        obs = obs_next
        # print('step seek')

    # if animated:
    #     env.env.unwrapped.frame_skip = frame_skip_prev

    if animated and not always_return_paths:
        return

    seek_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(seek_observations),
        actions=tensor_utils.stack_tensor_list(seek_actions),
        rewards=tensor_utils.stack_tensor_list(seek_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos),
    )

    hide_paths['actions'] = hide_paths['actions'].astype(glob_config.dtype)
    seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype)
    hide_paths['rewards'] = hide_paths['rewards'].astype(glob_config.dtype)
    seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype)

    return {'hide': hide_paths, 'seek': seek_paths}
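
A minimal usage sketch for the two-phase result returned above, assuming only the {'hide': ..., 'seek': ...} structure built in this example (the helper name summarize_hide_seek and the rollout function name are placeholders):

import numpy as np

def summarize_hide_seek(paths):
    # Placeholder helper: per-phase length and undiscounted return.
    summary = {}
    for phase in ('hide', 'seek'):
        rewards = paths[phase]['rewards']  # 1-D array stacked above
        summary[phase] = dict(length=len(rewards),
                              undiscounted_return=float(np.sum(rewards)))
    return summary

# paths = rollout_hide_seek(env, agents, max_path_length=500)  # function name assumed
# print(summarize_hide_seek(paths))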
Exemple #26
0
def rollout_w_truth(env,
                    agent,
                    max_path_length=np.inf,
                    animated=False,
                    save_gif=False,
                    speedup=1,
                    mean=np.zeros(2),
                    std=np.ones(2),
                    seed=-1,
                    **kwargs):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset(seed=seed)

    truth = defaultdict(list)
    ef = env.wrapped_env.j.rollout_ego_features(env.wrapped_env.simparams)
    for d in ef:
        for key, val in d.items():
            truth[key].append(val)

    agent.reset()
    path_length = 0
    if animated:
        env.render()
    if save_gif:
        initial_simparams0 = env.wrapped_env.copy_simparams()
        initial_simparams1 = env.wrapped_env.copy_simparams()

    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        a = (a * std) + mean

        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if save_gif:
        actions = [
            np.clip(action,
                    *env.wrapped_env.j.action_space_bounds(initial_simparams0))
            for action in actions
        ]
        env.wrapped_env.save_gif(initial_simparams0,
                                 np.column_stack(actions),
                                 kwargs['filename'],
                                 truth_simparams=initial_simparams1)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    ), truth
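
Because `truth` above is a defaultdict of per-timestep lists keyed by feature name, here is a small sketch (numpy only, helper name invented) of stacking it into arrays alongside the returned path:

import numpy as np

def stack_truth(truth):
    # Invented helper: defaultdict-of-lists -> dict of stacked numpy arrays.
    return {key: np.asarray(vals) for key, vals in truth.items()}

# path, truth = rollout_w_truth(env, agent, max_path_length=200)
# print({k: v.shape for k, v in stack_truth(truth).items()})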
Exemple #27
0
def rollout_seek(env,
                 agents,
                 max_path_length=np.inf,
                 animated=False,
                 speedup=1,
                 always_return_paths=False,
                 mode=None):
    ##############################################
    # SEEK AGENT
    # env.env.unwrapped.reload_model(pose=last_pose)

    seek_observations = []
    seek_actions = []
    seek_rewards = []
    seek_agent_infos = []
    seek_env_infos = []

    if mode == 'mnist_stop':
        env.env.env.use_stop = False
        env.env.env.use_mnist_reward(True)
        env.env.env.use_mnist_stop_criteria(True)
    else:
        env.env.env.use_stop = False

    obs = env.reset()
    agents['seek'].reset()
    seek_path_length = 0
    # print('obs: ', obs[1])

    if animated:
        env.render()
    while seek_path_length < max_path_length:
        a, agent_info = agents['seek'].get_action(obs)
        if animated:
            env.render()
        obs_next, r, d, env_info = env.step(a)
        seek_observations.append(obs)
        seek_rewards.append(r)
        seek_actions.append(env.action_space.flatten(a))
        seek_agent_infos.append(agent_info)
        seek_env_infos.append(env_info)
        seek_path_length += 1
        obs = obs_next
        if d:
            print('SEEK Test | path_length:', seek_path_length)
            break
    if animated and not always_return_paths:
        return

    seek_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(seek_observations),
        actions=tensor_utils.stack_tensor_list(seek_actions),
        rewards=tensor_utils.stack_tensor_list(seek_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos),
    )

    seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype)
    seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype)

    return {'seek': seek_paths}
Exemple #28
0
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])
            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)
            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
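
The advantage step above hinges on `special.discount_cumsum`; the following is a minimal numpy sketch of an equivalent loop-based helper and of the GAE delta/advantage computation, with toy numbers, just to make the formulas concrete:

import numpy as np

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

# Mirrors the per-path computation above with toy numbers:
rewards = np.array([1.0, 0.0, 1.0])
path_baselines = np.array([0.5, 0.4, 0.3, 0.0])   # V(s_0..s_T), appended terminal 0
discount, gae_lambda = 0.99, 0.95
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum(deltas, discount * gae_lambda)
returns = discount_cumsum(rewards, discount)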
Exemple #29
0
def rollout_debug(env,
                  agents,
                  max_path_length=np.inf,
                  animated=False,
                  speedup=1,
                  always_return_paths=False):
    ##############################################
    # SEEK AGENT
    # env.env.unwrapped.reload_model(pose=last_pose)

    animated = True
    always_return_paths = True

    seek_observations = []
    seek_actions = []
    seek_rewards = []
    seek_agent_infos = []
    seek_env_infos = []

    env.env.env.use_stop = False
    env.env.env.use_mnist_reward(True)
    env.env.env.use_mnist_stop_criteria(True)

    obs = env.reset()
    agents['seek'].reset()
    seek_path_length = 0

    if animated:
        env.render()
    while seek_path_length < max_path_length:
        a, agent_info = agents['seek'].get_action(obs)
        if animated:
            env.render()
        obs_next, r, d, env_info = env.step(a)
        seek_observations.append(obs)
        seek_rewards.append(r)
        seek_actions.append(env.action_space.flatten(a))
        seek_agent_infos.append(agent_info)
        seek_env_infos.append(env_info)
        seek_path_length += 1
        obs = obs_next

        print('Distance = ', env_info['act_min_dist'], ' Max_dist = ',
              env_info['act_dist_max'])
        time.sleep(0.5)
        if d:
            break
            # print('step seek')
    if animated and not always_return_paths:
        return

    seek_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(seek_observations),
        actions=tensor_utils.stack_tensor_list(seek_actions),
        rewards=tensor_utils.stack_tensor_list(seek_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos),
    )

    seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype)
    seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype)

    return {'seek': seek_paths}
Exemple #30
0
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            controller=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        # import ipdb; ipdb.set_trace()
        # To test if the weights are correct, we need to use our local
        # get_action function
        # controller = control.StraightController()
        a, agent_info = controller.get_action(o.T)

        # a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if isinstance(env.observation_space, list):
            n = len(env.shadow_envs)
            observations.append([
                env.shadow_envs[i].observation_space.flatten_n(o[i])
                for i in range(n)
            ])
            rewards.append(r)
            actions.append([
                env.shadow_envs[i].action_space.flatten_n(a[i])
                for i in range(n)
            ])
        else:
            observations.append(env.observation_space.flatten(o))
            rewards.append(r)
            actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=True)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.asarray(dones),
        last_obs=o,
    )
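
This rollout queries `controller.get_action(o.T)` rather than the agent; below is a hedged stub that satisfies the interface the loop expects (class name and zero-action behaviour are invented purely for illustration):

import numpy as np

class ZeroController:
    # Invented stand-in: returns a zero action and an empty info dict.
    def __init__(self, action_dim):
        self.action_dim = action_dim

    def get_action(self, obs):
        # obs arrives transposed (o.T) in the loop above; only the action matters here.
        return np.zeros(self.action_dim), dict()

# path = rollout(env, agent, max_path_length=100,
#                controller=ZeroController(action_dim=2))  # action_dim chosen by hand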
Exemple #31
0
def rollout_brownian(env,
                     agents,
                     max_path_length=np.inf,
                     animated=False,
                     speedup=1,
                     always_return_paths=False,
                     mode=None,
                     hide_tmax=None):

    ##############################################
    ## HIDE AGENT
    env.reset()
    # Randomly sample one start pose from the current agent['hide'].starts.
    start_pose, start_pose_id = agents['hide'].sample_one_start()
    start_pose = np.array(start_pose)
    # Sample one goal at random: from agent['hide'].starts with probability p, from agent['hide'].starts_old with probability 1 - p.
    goal, goal_id = agents['hide'].sample_one_goal()
    obs = env.env.env.reload_model(pose=start_pose, goal=goal)
    # print("++++++++++++++++++++++++++++++++++++")
    # print('start_pose:', start_pose[0][0:2], '     goal:', goal[0][0:2])
    # print('start_pose:', np.array(obs[0][0:2]) * 2.4, '     goal:',np.array(obs[0][-2:]) * 2.4)

    ##############################################
    ## SEEK AGENT
    # print('rollout: Student')
    if animated:
        env.render()
        # env.env.unwrapped.frame_skip = 10

    if mode is not None:
        if mode == 'seek_force_only':
            env.env.env.use_stop = False
            env.env.env.add_mnist_reward(True)
            env.env.env.use_mnist_stop_criteria(True)
        elif mode == 'reach_center_and_stop':
            env.env.env.use_stop = False
            env.env.env.use_distance2center_stop_criteria = True

    seek_observations = []
    seek_actions = []
    seek_rewards = []
    seek_agent_infos = []
    seek_env_infos = []

    agents['seek'].reset()
    seek_path_length = 0

    if animated:
        env.render()
    while seek_path_length < max_path_length:
        # if seek_path_length < 2: print('seek obs: ', obs)
        a, agent_info = agents['seek'].get_action(obs)
        # print('action:',a)
        if animated:
            env.render()
        obs_next, r, d, env_info = env.step(a)
        seek_observations.append(obs)
        seek_rewards.append(r)
        seek_actions.append(env.action_space.flatten(a))
        seek_agent_infos.append(agent_info)
        seek_env_infos.append(env_info)
        seek_path_length += 1

        if d:
            print('SEEK| path_length:', len(seek_rewards))
            break
        obs = obs_next
        # print('step seek')

    ## Here we assigning if the goal was reached for a particular goal
    step_limit = env.env.unwrapped.step_limit
    goal_reached = float(seek_path_length < step_limit)

    # Store whether the goal was reached for the sampled start/goal.
    if agents['hide'].reverse_mode:
        agents['hide'].rewards[start_pose_id].append(goal_reached)
    else:
        agents['hide'].rewards[goal_id].append(goal_reached)

    if animated and not always_return_paths:
        return

    seek_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(seek_observations),
        actions=tensor_utils.stack_tensor_list(seek_actions),
        rewards=tensor_utils.stack_tensor_list(seek_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos),
    )

    seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype)
    seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype)

    return {'seek': seek_paths}
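
rollout_brownian records a binary `goal_reached` per sampled start/goal id in `agents['hide'].rewards`; a short sketch (helper name invented) of turning those lists into per-id success rates, assuming only a mapping from id to a list of 0/1 outcomes:

import numpy as np

def success_rates(rewards_by_id, min_samples=1):
    # Invented helper: mean goal_reached per start/goal id.
    return {idx: float(np.mean(outcomes))
            for idx, outcomes in rewards_by_id.items()
            if len(outcomes) >= min_samples}

# rates = success_rates(agents['hide'].rewards)
# e.g. keep ids whose rate lies in (0.1, 0.9) when filtering starts of intermediate difficulty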
Exemple #32
0
def ed_simpy_dec_rollout(env,
                         agents,
                         max_path_length=np.inf,
                         animated=False,
                         speedup=1):
    """Decentralized rollout"""
    if agents.recurrent:
        assert isinstance(
            agents,
            GSMDPRecurrentPolicy), 'Recurrent policy is not a GSMDP class'
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    offset_t_sojourn = [[] for _ in range(n_agents)]

    agents.reset(dones=[True for _ in range(n_agents)])
    agent_policies = [None] * n_agents
    for i in range(n_agents):
        # Bind i through a default argument so each policy keeps its own index;
        # a plain closure over the loop variable would see only the final i.
        agent_policies[i] = lambda obs, i=i: get_actions_wrapper(
            agents, i, n_agents, obs)
        # if(not agents.recurrent):
        # 	agent_policies[i] = lambda obs: agents.get_actions([obs])
        # else:
        # 	agent_policies[i] = lambda obs: agents.get_actions(obs_to_ith_loc(obs, i, n_agents))

    observations, actions, rewards, agent_infos, env_infos, offset_t_sojourn = env.wrapped_env.reset_and_sim(
        agent_policies)

    # remove empty agent trajectories
    observations = [o for o in observations if len(o) > 0]
    actions = [a for a in actions if len(a) > 0]
    rewards = [r for r in rewards if len(r) > 0]
    agent_infos = [i for i in agent_infos if len(i) > 0]
    env_infos = [e for e in env_infos if len(e) > 0]
    offset_t_sojourn = [o for o in offset_t_sojourn if len(o) > 0]

    if (any(
            map(lambda x: x < n_agents, [
                len(observations),
                len(actions),
                len(rewards),
                len(agent_infos),
                len(env_infos)
            ]))):
        print('\nWARNING: \n')
        print('n_agents: ', n_agents)
        print('len(observations): ', len(observations))
        print('len(actions): ', len(actions))
        print('len(rewards): ', len(rewards))
        print('len(agent_infos): ', len(agent_infos))
        print('len(env_infos): ', len(env_infos))

    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
            offset_t_sojourn=tensor_utils.stack_tensor_list(
                offset_t_sojourn[i]),
        ) for i in range(len(observations))
    ]
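
The per-agent policies above bind `i` through a default argument (see the comment in the loop); here is a two-line demonstration of the late-binding behaviour that motivates it, using plain Python closures:

late = [lambda: i for i in range(3)]
bound = [lambda i=i: i for i in range(3)]
assert [f() for f in late] == [2, 2, 2]    # every closure sees the final value of i
assert [f() for f in bound] == [0, 1, 2]   # the default argument freezes i per lambda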
Exemple #33
0
    def obtain_samples(self,
                       itr,
                       reset_args=None,
                       task_idxs=None,
                       return_dict=False,
                       log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list
                                       and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        curr_noises = [
            np.random.normal(0, 1, size=(self.latent_dim, ))
            for _ in range(self.vec_env.num_envs)
        ]

        #curr_noises = [np.ones(size = (self.latent_dim)) for _ in range(self.vec_env.num_envs)]
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size:
            t = time.time()
            #print(obses.shape,task_idxs.shape,curr_noises[0].shape)
            policy.reset(dones)  #TODO: What the hell does this do?
            actions, agent_infos = policy.get_actions(obses, task_idxs,
                                                      curr_noises)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done, noise in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones, curr_noises):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(observations=[],
                                              actions=[],
                                              rewards=[],
                                              env_infos=[],
                                              agent_infos=[],
                                              noises=[])
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                running_paths[idx]["noises"].append(noise)

                if done:
                    paths[idx].append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            noises=self.flatten_n(
                                running_paths[idx]["noises"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
                    curr_noises[idx] = np.random.normal(
                        0, 1, size=(self.latent_dim, ))
                    #curr_noises[idx] = np.ones(size=(self.latent_dim))

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
Exemple #34
0
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args)!=np.ndarray):
            reset_args = [reset_args]*self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time


        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix+"PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix+"EnvExecTime", env_time)
        logger.record_tabular(log_prefix+"ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
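
Both obtain_samples variants collect paths in a dict keyed by vectorized-env index and flatten it when return_dict is False; a toy illustration of that flattening step, with stand-in strings instead of real path dicts:

paths = {0: ['path_a', 'path_b'], 1: ['path_c']}          # shape produced by the samplers above
flat = [p for per_env in paths.values() for p in per_env]
assert flat == ['path_a', 'path_b', 'path_c']             # list form returned when return_dict is False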
Exemple #35
0
def rollout_policy(agent,
                   env,
                   max_path_length=200,
                   speedup=1,
                   get_image_observations=False,
                   animated=False):
    """
    Mostly taken from https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
    Generate a rollout for a given policy
    """
    observations = []
    im_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    path_length = 0

    while path_length <= max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))

        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if get_image_observations:
            if not animated:
                pixel_array = env.render(mode="rgb_array")
            else:
                pixel_array = env.render()

            if pixel_array is None and not animated:
                # Not convinced that behaviour works for all environments, so until
                # such a time as I'm convinced of this, drop into a debug shell
                print(
                    "Problem! Couldn't get pixels! Dropping into debug shell.")
                import pdb
                pdb.set_trace()
            im_observations.append(pixel_array)
        if d:
            rewards.append(r)
            break
        else:
            rewards.append(r)

    if animated:
        env.render(close=True)

    im_observations = tensor_utils.stack_tensor_list(im_observations)
    observations = tensor_utils.stack_tensor_list(observations)
    rewards = tensor_utils.stack_tensor_list(rewards)

    return dict(
        observations=observations,
        im_observations=im_observations,
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=rewards,
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Exemple #36
0
    def process_samples(self, itr, paths):
        baselines = []
        returns = []
        for path in paths:
            path_baselines = np.append(self.algo.baseline.predict(path), 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
            advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array([tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.algo.center_adv:
                raw_adv = np.concatenate([path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array([tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
            )

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
            )

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array([tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Exemple #37
0
def rollout_policy(agent,
                   env,
                   max_path_length=200,
                   reward_extractor=None,
                   speedup=1,
                   get_image_observations=False,
                   num_frames=4,
                   concat_timesteps=True,
                   animated=False):
    """
    Mostly taken from https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
    Generate a rollout for a given policy
    """
    observations = []
    im_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    path_length = 0

    while path_length <= max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))

        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if get_image_observations:
            if not animated:
                pixel_array = env.render(mode="rgb_array")
            else:
                pixel_array = env.render()

            if pixel_array is None and not animated:
                # Not convinced that behaviour works for all environments, so until
                # such a time as I'm convinced of this, drop into a debug shell
                print(
                    "Problem! Couldn't get pixels! Dropping into debug shell.")
                import pdb
                pdb.set_trace()
            im_observations.append(pixel_array)
        if d:
            rewards.append(r)
            break
        else:
            rewards.append(r)
    # if animated:
    # env.render(close=True)

    im_observations = tensor_utils.stack_tensor_list(im_observations)

    observations = tensor_utils.stack_tensor_list(observations)

    if reward_extractor is not None:
        #TODO: remove/replace this
        if concat_timesteps:
            true_rewards = tensor_utils.stack_tensor_list(rewards)
            obs_pls_three = np.zeros(
                (observations.shape[0], num_frames, observations.shape[1]))
            # import pdb; pdb.set_trace()
            for iter_step in range(0, obs_pls_three.shape[0]):
                for i in range(num_frames):
                    # Note: idx_plus_three does not depend on i, so every frame
                    # slot receives the same (clipped) future observation.
                    idx_plus_three = min(iter_step + num_frames,
                                         obs_pls_three.shape[0] - 1)
                    obs_pls_three[iter_step,
                                  i, :] = observations[idx_plus_three, :]
            rewards = reward_extractor.get_reward(obs_pls_three)
        else:
            true_rewards = tensor_utils.stack_tensor_list(rewards)
            rewards = reward_extractor.get_reward(observations)
    else:
        rewards = tensor_utils.stack_tensor_list(rewards)
        true_rewards = rewards

    return dict(
        observations=observations,
        im_observations=im_observations,
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=rewards,
        true_rewards=true_rewards,
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Exemple #38
0
def sample_paths(N,
    policy, 
    baseline, 
    env_mode='train', 
    T=1e6, 
    gamma=1, 
    normalized_env=False,
    env=None):
    # Directly specifying env works only when sampling in series

    # set random seed (needed for multiprocessing)
    np.random.seed()

    if env is None:
        env = get_environment(env_mode)
    T = min(T, env.horizon)
    T = max(1, T)  
    # sometimes, env is not initialized correctly in multiprocessing
    # this is just a sanity check and step size should essentially be zero.

    print("####### Worker started #######")

    paths = []

    for ep in range(N):
        
        observations=[]
        actions=[]
        rewards=[]
        agent_infos = []
        env_infos = []
        qpos = []
        qvel = []

        o = env.reset()
        if normalized_env:
            qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
            qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
        else:
            qpos.append(env.env.model.data.qpos.reshape(-1))
            qvel.append(env.env.model.data.qvel.reshape(-1))
        done = False
        t = 0

        while t < T and done != True:
            a, agent_info = policy.get_action(o)
            next_o, r, done, env_info = env.step(a)
            observations.append(env.observation_space.flatten(o))
            actions.append(env.action_space.flatten(a))
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            if normalized_env:
                qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
                qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
            else:
                qpos.append(env.env.model.data.qpos.reshape(-1))
                qvel.append(env.env.model.data.qvel.reshape(-1))
            o = next_o
            t += 1

        # make a path dictionary
        # Also store the path belief and env data used in the trajectory
        try:
            path_belief = env.env.belief
        except Exception as e:
            path_belief = str(e)

        # path_model = env.env

        path = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            qpos=tensor_utils.stack_tensor_list(qpos),
            qvel=tensor_utils.stack_tensor_list(qvel),
            #path_belief=path_belief,
            #path_model=path_model,
        )

        # TODO: Storing the path model is too space inefficient. Need to find alternative
        
        # compute returns using the path
        path_baseline = baseline.predict(path)
        advantages = []
        returns = []
        return_so_far = 0
        for t in range(len(rewards) - 1, -1, -1):
            return_so_far = rewards[t] + gamma * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - path_baseline[t]
            advantages.append(advantage)

        # advantages and returns are stored backward in time
        advantages = np.array(advantages[::-1])
        returns = np.array(returns[::-1])
        
        # normalize advantages
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
        
        path["advantages"] = advantages
        path["returns"] = returns

        paths.append(path)

    # print("Env body_mass : ", env.env.model.body_mass[1])
    print("====== Worker finished ======")

    return paths
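
The backward loop in sample_paths accumulates discounted returns in reverse and then flips them; here is a small self-contained check that this matches the direct definition returns[t] = sum_k gamma**k * rewards[t+k] (toy numbers, no environment needed):

import numpy as np

rewards = [1.0, 2.0, 3.0]
gamma = 0.9

returns_loop = []
return_so_far = 0.0
for t in range(len(rewards) - 1, -1, -1):
    return_so_far = rewards[t] + gamma * return_so_far
    returns_loop.append(return_so_far)
returns_loop = np.array(returns_loop[::-1])   # stored backward in time, then reversed

returns_direct = np.array([sum(gamma ** k * rewards[t + k]
                               for k in range(len(rewards) - t))
                           for t in range(len(rewards))])
assert np.allclose(returns_loop, returns_direct)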
Exemple #39
0
    def process_samples(self, itr, paths):

        if self.normalize_reward:
            # Update reward mean/std Q.
            rewards = []
            for i in range(len(paths)):
                rewards.append(paths[i]['rewards'])
            rewards_flat = np.hstack(rewards)
            self._reward_mean.append(np.mean(rewards_flat))
            self._reward_std.append(np.std(rewards_flat))

            # Normalize rewards.
            reward_mean = np.mean(np.asarray(self._reward_mean))
            reward_std = np.mean(np.asarray(self._reward_std))
            for i in range(len(paths)):
                paths[i]['rewards'] = (
                    paths[i]['rewards'] - reward_mean) / (reward_std + 1e-8)

        if itr > 0:
            kls = []
            for i in range(len(paths)):
                kls.append(paths[i]['KL'])

            kls_flat = np.hstack(kls)

            logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
            logger.record_tabular('Expl_StdKL', np.std(kls_flat))
            logger.record_tabular('Expl_MinKL', np.min(kls_flat))
            logger.record_tabular('Expl_MaxKL', np.max(kls_flat))

            # Perform normalization of the intrinsic rewards.
            if self.use_kl_ratio:
                if self.use_kl_ratio_q:
                    # Update KL Q
                    self.kl_previous.append(np.median(np.hstack(kls)))
                    previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                    for i in range(len(kls)):
                        kls[i] = kls[i] / previous_mean_kl

            # Add KL as an intrinsic reward to the external reward
            for i in range(len(paths)):
                paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i]

            # Discount eta
            self.eta *= self.eta_discount

        else:
            logger.record_tabular('Expl_MeanKL', 0.)
            logger.record_tabular('Expl_StdKL', 0.)
            logger.record_tabular('Expl_MinKL', 0.)
            logger.record_tabular('Expl_MaxKL', 0.)

        baselines = []
        returns = []
        for path in paths:
            path_baselines = np.append(self.baseline.predict(path), 0)
            deltas = path["rewards"] + \
                self.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path["returns"] = special.discount_cumsum(
                path["rewards_orig"], self.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if not self.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.center_adv:
                advantages = util.center_advantages(advantages)

            if self.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [
                sum(path["rewards_orig"]) for path in paths]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array(
                [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [
                    (path["advantages"] - adv_mean) / adv_std for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array(
                [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(
                    p, max_path_length) for p in agent_infos]
            )

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(
                    p, max_path_length) for p in env_infos]
            )

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array(
                [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
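
The intrinsic-reward mixing above simply adds eta-scaled KL to the extrinsic reward and discounts eta once per iteration; a toy numeric sketch of exactly that step (values invented):

import numpy as np

eta, eta_discount = 1e-3, 0.99
extrinsic = np.array([0.0, 1.0, 0.0])          # path['rewards'] before mixing
kl = np.array([0.2, 0.1, 0.4])                 # per-step KL (information gain)
mixed = extrinsic + eta * kl                   # what the loop above stores back into the path
eta *= eta_discount                            # eta is discounted once per iteration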