Example 1
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            save_video=True,
            video_filename='sim_out.mp4',
            reset_args=None,
            policy_contexts=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_args, policy_contexts=policy_contexts)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]),
                                            image[0])
                images.append(np.flipud(np.array(pil_image)))

    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20 * speedup)
            else:
                clip.write_videofile(video_filename, fps=20 * speedup)
        #return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
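The rollouts in this collection hand their per-step lists to tensor_utils.stack_tensor_list and tensor_utils.stack_tensor_dict_list before returning. As a rough sketch of the semantics these rollouts assume (not the library's actual implementation, which also handles nested dicts), the first stacks per-step arrays along a new leading time axis and the second turns a list of per-step info dicts into one dict of stacked arrays:

import numpy as np

def stack_tensor_list_sketch(tensor_list):
    # list of per-step arrays with identical shapes -> array of shape (T, ...)
    return np.array(tensor_list)

def stack_tensor_dict_list_sketch(tensor_dict_list):
    # list of per-step info dicts -> single dict of arrays with a leading time axis
    keys = tensor_dict_list[0].keys() if tensor_dict_list else []
    return {k: np.array([d[k] for d in tensor_dict_list]) for k in keys}

# e.g. stack_tensor_list_sketch([np.zeros(3)] * 5).shape == (5, 3)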
Example 2
def rollout(env, agent, max_path_length=10000, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a, o)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 3
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
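A rollout like the one in Example 3 relies only on a small interface: env.reset(), env.step(a) returning (obs, reward, done, info), flatten() on the observation and action spaces, and agent.reset() / agent.get_action(obs). The stubs below are purely illustrative (they are not part of any example) and just make that contract explicit:

import numpy as np

class IdentitySpace:
    # stand-in for a space object: flatten() just passes the array through
    def flatten(self, x):
        return np.asarray(x, dtype=np.float64)

class RandomWalkEnv:
    # minimal environment exposing exactly the members the rollout touches
    observation_space = IdentitySpace()
    action_space = IdentitySpace()

    def reset(self):
        self._t = 0
        return np.zeros(2)

    def step(self, action):
        self._t += 1
        next_obs = np.random.randn(2)
        reward = -float(np.linalg.norm(next_obs))
        done = self._t >= 10
        return next_obs, reward, done, {}

class RandomAgent:
    # policy exposing reset() and get_action(obs) -> (action, agent_info)
    def reset(self):
        pass

    def get_action(self, obs):
        return np.random.uniform(-1.0, 1.0, size=2), {'mean': np.zeros(2)}

# path = rollout(RandomWalkEnv(), RandomAgent(), max_path_length=10)
# would yield path['rewards'] of shape (10,) once tensor_utils is importable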
Example 4
def rollout_snn(env,
                agent,
                max_path_length=np.inf,
                reset_start_rollout=True,
                switch_lat_every=0,
                animated=False,
                speedup=1):
    """
    :param reset_start_rollout: whether to reset at the start of every rollout
    :param switch_lat_every: potential change in latents (by resetting the agent with forced resample lat)
    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    if reset_start_rollout:
        o = env.reset(
        )  # in case rollout is called to produce parts of a trajectory: otherwise it will never advance!!
    else:
        if isinstance(env, NormalizedEnv):
            o = env.wrapped_env.get_current_obs()
        else:
            o = env.get_current_obs()
    agent.reset()  # this resamples a latent in SNNs!
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        if switch_lat_every > 0 and path_length % switch_lat_every == 0:
            agent.reset(
                force_resample_lat=True)  # here forced to resample a latent
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(
            env_infos),  # here it concatenates all lower-level paths!
        #  So all elements are np.arrays of shape (max_path_length, time_steps_agg, corresp_dim);
        #  hence the concatenation done next by the higher-level sampler fails, because the
        #  mismatched dimension is 1 instead of 0!
    )
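rollout_snn above assumes a policy whose reset() resamples a latent code and which accepts a force_resample_lat flag, so the rollout can switch latents every switch_lat_every steps. A hypothetical stub of that contract (not the actual SNN policy class):

import numpy as np

class LatentPolicyStub:
    # illustrative only: keeps a discrete latent that reset() may resample
    def __init__(self, n_latents=6):
        self.n_latents = n_latents
        self.latent = None

    def reset(self, force_resample_lat=False):
        # rollout_snn calls reset() once at the start and
        # reset(force_resample_lat=True) every switch_lat_every steps
        if self.latent is None or force_resample_lat:
            self.latent = np.random.randint(self.n_latents)

    def get_action(self, obs):
        # a real SNN policy would condition the action on (obs, latent)
        action = np.random.uniform(-1.0, 1.0, size=2)
        return action, {'latents': self.latent}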
Example 5
def rollout_record(env,
                   agent,
                   max_path_length=np.inf,
                   animated=False,
                   speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()

    # environment descriptions
    descs = defaultdict(list)
    for key, val in env.describe().items():
        descs[key].append(val)

    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)

        # NOTE: this calls describe() after the environment is done, so
        #       describe() must be able to handle that.
        # NOTE: this appending is dangerous! It assumes that values for all
        #       keys are always simultaneously present, which might not be true;
        #       otherwise the values of different keys end up misaligned further down!
        for key, val in env.describe().items():
            descs[key].append(val)

        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    results = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
    for key, val in descs.items():
        # Keys ending with _varlist should be concatenated, not stacked
        if len(key) > 8 and key[-8:] == '_varlist':
            results[key[:-8]] = np.concatenate(val, axis=0)
        else:
            results[key] = tensor_utils.stack_tensor_list(val)

    return results
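The _varlist convention in rollout_record means: values whose leading dimension varies from step to step are concatenated along axis 0 and stored under the key without the suffix, while everything else is stacked per step. A small illustration under that assumption (the keys below are hypothetical):

import numpy as np

descs = {
    'goal': [np.zeros(2), np.zeros(2), np.zeros(2)],                              # fixed size -> stacked
    'contacts_varlist': [np.zeros((1, 4)), np.zeros((3, 4)), np.zeros((2, 4))],   # ragged -> concatenated
}

results = {}
for key, val in descs.items():
    if key.endswith('_varlist'):
        results[key[:-len('_varlist')]] = np.concatenate(val, axis=0)
    else:
        results[key] = np.array(val)

print(results['goal'].shape)      # (3, 2)
print(results['contacts'].shape)  # (6, 4)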
Example 6
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=True):
    violation_cost = 0
    boundary_violation_cost = 0
    observations = []
    actions = []
    rewards = []
    succ_rewards = []
    succ_rate = 0
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        succ_rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    succ_rate = 1

    episode_reward = sum(rewards)  # use the built-in sum instead of shadowing it
    print("Episode Reward : ", episode_reward)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        violation_cost=violation_cost,
        boundary_violation_cost=boundary_violation_cost,
        succ_return=succ_rewards,
        succ_rate=succ_rate,
    )
Example 7
def rollout_policy(agent, env, max_path_length=200, speedup=1, get_image_observations=False, animated=False):
    """
    Mostly taken from https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
    Generate a rollout for a given policy
    """
    observations = []
    im_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    path_length = 0

    while path_length <= max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))

        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if get_image_observations:
            if not animated:
                pixel_array = env.render(mode="rgb_array")
            else:
                pixel_array = env.render()

            if pixel_array is None and not animated:
                # Not convinced that behaviour works for all environments, so until
                # such a time as I'm convinced of this, drop into a debug shell
                print("Problem! Couldn't get pixels! Dropping into debug shell.")
                import pdb; pdb.set_trace()
            im_observations.append(pixel_array)
        if d:
            rewards.append(r)
            break
        else:
            rewards.append(r)

    if animated:
        env.render(close=True)

    im_observations = tensor_utils.stack_tensor_list(im_observations)
    observations = tensor_utils.stack_tensor_list(observations)
    rewards = tensor_utils.stack_tensor_list(rewards)

    return dict(
        observations=observations,
        im_observations=im_observations,
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=rewards,
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 8
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()

    count = 0
    # Cycle of four goal offsets, repeated 500 times -> array of shape (2000, 3)
    goal_offsets = np.array([(0.09, 0.0, 0.0),
                             (0.0, 0.09, 0.0),
                             (-0.09, 0.0, 0.0),
                             (0.0, -0.09, 0.0)])
    indexes = np.tile(goal_offsets, (500, 1))
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)

        if r == 10.0:
            env.wrapped_env.env.goal = env.wrapped_env.env.goal + indexes[count]
            count = count + 1
            print("newnew: {0} {1}".format(env.wrapped_env.env.goal, path_length))
            print(np.array(
                [env.wrapped_env.env.sim.data.qvel[x] for x in env.wrapped_env.env._ref_joint_vel_indexes]
            ))

        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            env.wrapped_env.env.viewer.viewer.add_marker(pos=env.wrapped_env.env.goal,
                                                             size=np.array((0.02, 0.02, 0.02)), label='goal',
                                                             rgba=[1, 0, 0, 0.5])
            # timestep = 0.05
            # time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 9
def rarl_rollout(env, agent1, agent2, policy_num, policy2_num,
                 max_path_length=np.inf, animated=False, speedup=1,
                 always_return_paths=False):
    #logger.log("rollout~~~~~~~~~~~~~~~~~~~")
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent1.reset()
    agent2.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a1, agent1_info = agent1.get_action(o)
        a2, agent2_info = agent2.get_action(o)
        action_true = np.append(a1,a2)
        Action = {}
        Action['action'] = np.append(a1,a2)
        # Action['dist1'] = agent1_info
        # Action['dist2'] = agent2_info
        Action['policy_num'] = policy_num
        Action['policy2_num'] = policy2_num
        next_o, r, d, env_info = env.step(Action)
        # print(' ')
        # print('policy_num: ',policy_num,' a1: ',a1,' a2: ',a2,' reward: ',r)
        observations.append(env.observation_space.flatten(o))

        if policy_num == 1:
            rewards.append(r)
            actions.append(env.action_space.flatten(a1))
            agent_infos.append(agent1_info)
        else:
            rewards.append(r)
            actions.append(env.action_space.flatten(a2))
            agent_infos.append(agent2_info)

        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 10
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []

    if hasattr(env._wrapped_env, 'generate_grid'):
        env._wrapped_env.generate_grid = False
    if hasattr(env._wrapped_env, 'generate_b0_start_goal'):
        env._wrapped_env.generate_b0_start_goal = False
    # print(env._wrapped_env.env_img)
    o = env.reset()
    if hasattr(agent, 'prob_network') and hasattr(
            agent.prob_network, '_l_gru') and hasattr(
                agent.prob_network._l_gru, 'map'):
        agent.reset(env._wrapped_env.env_img, env._wrapped_env.goal_img,
                    env._wrapped_env.b0_img)
    else:
        agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        # print('action: ',a)
        # print('env ob: ',o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 11
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False,
            env_start_state=None,
            agent_start_state=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    if env_start_state is not None:  # not all envs support init_state
        o = env.reset(init_state=env_start_state)
    else:
        o = env.reset()
    if agent_start_state is not None:
        agent.reset(init_state=agent_start_state)
    else:
        agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if np.any(~np.isfinite(a) | (np.abs(a) > 1e3)):
            warnings.warn("Invalid action detected")
            rewards[-1] = -1000.0
            break
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
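Example 12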
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    observations = []
    next_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        try:
            if agent.scale_action:  # check if action needs to be scaled
                if isinstance(env.action_space, Box):
                    # rescale the action
                    lb, ub = env.action_space.bounds
                    a = lb + (a + 1.) * 0.5 * (ub - lb)
                    a = np.clip(a, lb, ub)
        except AttributeError:
            pass
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        next_observations.append(env.observation_space.flatten(next_o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            print(o, r, a, next_o, d)
        path_length += 1
        if d:
            break
        o = next_o
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        next_observations=tensor_utils.stack_tensor_list(next_observations),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 13
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False,
            include_original_frames=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    if include_original_frames:
        bare_env = _get_bare_env(env)
        if not hasattr(bare_env, 'get_original_frames'):
            include_original_frames = False
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    next_o = None
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    result = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        last_observation=next_o,
    )
    if include_original_frames:
        original_frames = bare_env.get_original_frames()
        if original_frames is not None:
            result['original_frames'] = original_frames
    return result
Example 14
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            init_state=None,
            no_action=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    # no_action = True
    if init_state is not None:
        o = env.reset(init_state)
    else:
        o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        #print(np.clip(a, -1., 1.))
        if no_action:
            a = np.zeros_like(a)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=False)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.asarray(dones),
        last_obs=o,
    )
Example 15
def dec_rollout(env,
                agents,
                max_path_length=np.inf,
                animated=False,
                speedup=1):
    """Decentralized rollout"""
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    olist = env.reset()
    assert len(olist) == n_agents, "{} != {}".format(len(olist), n_agents)
    agents.reset(dones=[True for _ in range(n_agents)])
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        alist, agent_info_list = agents.get_actions(olist)
        agent_info_list = tensor_utils.split_tensor_dict_list(agent_info_list)
        # For each agent
        for i, o in enumerate(olist):
            observations[i].append(env.observation_space.flatten(o))
            actions[i].append(env.action_space.flatten(alist[i]))
            if agent_info_list is None:
                agent_infos[i].append({})
            else:
                agent_infos[i].append(agent_info_list[i])

        next_olist, rlist, d, env_info = env.step(np.asarray(alist))
        for i, r in enumerate(rlist):
            rewards[i].append(r)
            env_infos[i].append(env_info)
        path_length += 1
        if d:
            break
        olist = next_olist
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render()

    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
        ) for i in range(n_agents)
    ]
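dec_rollout relies on tensor_utils.split_tensor_dict_list to turn the joint agent_info (one dict of batched arrays) into a per-agent list of dicts. A rough sketch of the assumed semantics (the real helper also handles nested dicts):

import numpy as np

def split_tensor_dict_list_sketch(tensor_dict):
    # {'k': array of shape (n_agents, ...)} -> [{'k': array[i]} for each agent]
    if tensor_dict is None:
        return None
    keys = list(tensor_dict.keys())
    n_agents = len(tensor_dict[keys[0]])
    return [{k: tensor_dict[k][i] for k in keys} for i in range(n_agents)]

# split_tensor_dict_list_sketch({'mean': np.zeros((3, 2))}) -> list of 3 per-agent dicts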
Example 16
def conc_rollout(env,
                 agents,
                 max_path_length=np.inf,
                 animated=False,
                 speedup=1):
    """Concurrent rollout"""
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    olist = env.reset()
    for agent in agents:
        agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        alist = []
        # For each agent
        for i, o in enumerate(olist):
            a, ainfo = agents[i].get_action(o)
            alist.append(a)
            observations[i].append(env.observation_space.flatten(o))
            actions[i].append(env.action_space.flatten(a))
            if ainfo is None:
                agent_infos[i].append({})
            else:
                agent_infos[i].append(ainfo)
        next_olist, rlist, d, env_info = env.step(np.asarray(alist))
        for i, r in enumerate(rlist):
            rewards[i].append(r)
            env_infos[i].append(env_info)
        path_length += 1
        if d:
            break
        olist = next_olist
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render()

    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
        ) for i in range(n_agents)
    ]
Example 17
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if isinstance(env.observation_space, list):
            n = len(env.shadow_envs)
            observations.append([
                env.shadow_envs[i].observation_space.flatten_n(o[i])
                for i in range(n)
            ])
            rewards.append(r)
            actions.append([
                env.shadow_envs[i].action_space.flatten_n(a[i])
                for i in range(n)
            ])
        else:
            observations.append(env.observation_space.flatten(o))
            rewards.append(r)
            actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=True)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.asarray(dones),
        last_obs=o,
    )
Example 18
def rollout(env,
            agents,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    ##############################################
    # SEEK AGENT
    seek_observations = []
    seek_actions = []
    seek_rewards = []
    seek_agent_infos = []
    seek_env_infos = []

    obs = env.reset()
    agents['seek'].reset()
    seek_path_length = 0

    if animated:
        env.render()
    while seek_path_length < max_path_length:
        # print('rollout: obs shape = ', obs[0].shape)
        a, agent_info = agents['seek'].get_action(obs)
        if animated:
            env.render()
        obs_next, r, d, env_info = env.step(a)
        seek_observations.append(obs)
        seek_rewards.append(r)
        seek_actions.append(env.action_space.flatten(a))
        seek_agent_infos.append(agent_info)
        seek_env_infos.append(env_info)
        seek_path_length += 1
        obs = obs_next

        if d:
            break

    if animated and not always_return_paths:
        return

    seek_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(seek_observations),
        actions=tensor_utils.stack_tensor_list(seek_actions),
        rewards=tensor_utils.stack_tensor_list(seek_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos),
    )

    seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype)
    seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype)

    return {'seek': seek_paths}
Example 19
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, save_video=True, video_filename='sim_out.mp4', reset_arg=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d: # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0])
                images.append(np.flipud(np.array(pil_image)))

    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20*speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20*speedup)
            else:
                clip.write_videofile(video_filename, fps=20*speedup)
        #return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 20
def rollout_tendon(env, agent, always_return_paths=True,
                   render_mode="", save_frames=False,
                   lengths=None, goal=None, tangent_vec_goal=None):
    """adjusted rollout method
    agent=policy"""
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    # reset environment according to given starting point, and goal, or randomly
    if lengths is not None and goal is not None and tangent_vec_goal is not None:
        o = env._wrapped_env.reset(lengths, goal, tangent_vec_goal)
    elif lengths is None and goal is not None and tangent_vec_goal is not None:
        o = env._wrapped_env.reset(lengths, goal, tangent_vec_goal)
    elif lengths is None and goal is not None and tangent_vec_goal is None:
        o = env._wrapped_env.reset(lengths, goal, tangent_vec_goal)
    else:
        o = env.reset()
    agent.reset()
    if render_mode:
        env.render(mode=render_mode, save_frames=save_frames)
    while True:
        a, agent_info = agent.get_action(o) # agent = policy
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        if render_mode:
            env.render(mode=render_mode, save_frames=save_frames)
        if d:
            env.render(mode=render_mode, save_frames=save_frames)
            observations.append(env.observation_space.flatten(next_o)) # also append terminal observation
            env_infos.append(env_info) # only append terminal info
            break
        o = next_o

    if not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 21
def rollout(env, pi_low, pi_high, tx=700, ty=0, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    x, y, z = env.robot.body_xyz
    r, p, yaw = env.robot.body_rpy
    target_theta = np.arctan2(
        ty - y,
        tx - x)
    angle_to_target = target_theta - yaw
 
    print('direction: ', o[0], o[1])
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a_high = pi_high([x, y, tx, ty])
        feed_o = np.array(o, copy=True)  # assumed: low-level policy is fed the current observation
        feed_o[0] = np.cos(a_high)  # overwrite the direction components
        feed_o[1] = np.sin(a_high)
        a, agent_info = pi_low.get_action(feed_o)  # `agent` was undefined; use the low-level policy
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 22
def rollout_hide(env,
                 agents,
                 mode,
                 max_path_length=np.inf,
                 init_state=None,
                 init_goal=None):
    # Reset the model configuration
    env.reset()
    obs = env.reload_model(pose=init_state, goal=init_goal)

    hide_observations = []
    hide_states = []
    hide_actions = []
    hide_rewards = []
    hide_agent_infos = []
    hide_env_infos = []

    print("rollout : HIDE")
    agents['hide'].reset()
    hide_path_length = 0

    while hide_path_length < max_path_length:
        env.render()
        a, agent_info = agents['hide'].get_action(obs)
        hide_states.append(env.unwrapped.get_all_pose())
        #print('-------> state: ', obs[0:2])
        obs_next, r, d, env_info = env.step(a)

        hide_observations.append(obs)
        hide_rewards.append(r)
        hide_actions.append(a)
        hide_agent_infos.append(agent_info)
        hide_env_infos.append(env_info)
        hide_path_length += 1
        obs = obs_next
        if d:
            break

    hide_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(hide_observations),
        actions=tensor_utils.stack_tensor_list(hide_actions),
        rewards=tensor_utils.stack_tensor_list(hide_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(hide_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(hide_env_infos),
        states=hide_states,
    )
    print('Episode done:', hide_path_length)
    return hide_paths
Example 23
    def collect_trajs_for_cost(self, n_trajs, pol, env, dom, cls):
        paths = []
        #print(n_trajs)
        for iter_step in trange(0, n_trajs):
            paths.append(
                self.cyberpunk_rollout(agent=pol,
                                       env=env,
                                       max_path_length=self.horizon,
                                       reward_extractor=None))
        # for p in paths:
        #     print(p['im_observations'].shape)
        data_matrix = tensor_utils.stack_tensor_list(
            [p['im_observations'] for p in paths])
        class_matrix = np.tile(cls, (n_trajs, self.horizon, 1))
        dom_matrix = np.tile(dom, (n_trajs, self.horizon, 1))

        #data_matrix = np.zeros(shape=(n_trajs, self.horizon, self.im_height, self.im_width, self.im_channels))
        #class_matrix = np.zeros(shape=(n_trajs, self.horizon, 2))
        #dom_matrix = np.zeros(shape=(n_trajs, self.horizon, 2))
        #for path, path_step in zip(paths, range(0, len(paths))):
        #    for sub_path, time_step in zip(path['im_observations'], range(0, self.horizon)):
        #        data_matrix[path_step, time_step, :, :, :] = sub_path
        #        class_matrix[path_step, time_step, :] = path['class']
        #        dom_matrix[path_step, time_step, :] = path['dom']

        return dict(data=data_matrix, classes=class_matrix, domains=dom_matrix)
Example 24
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False,
            action_noise=0.0):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a_sampled, agent_info = agent.get_action(o)
        # use the mean here for eval.
        a = agent_info['mean']
        a += np.random.randn(len(a)) * action_noise
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            # No slowmotion
            # time.sleep(timestep / speedup)
    # if animated and not always_return_paths:
    #     return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 25
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        # print("action", a)
        # a = 4
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        # print("action", a)
        # print("env_infos", env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            # time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    # print("agent_infos", agent_infos)
    # print("env_infos", env_infos)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 26
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            env.wrapped_env.env.viewer.viewer.add_marker(
                pos=env.wrapped_env.env.goal,
                size=np.array((0.02, 0.02, 0.02)),
                label='goal',
                rgba=[1, 0, 0, 0.5])
            # timestep = 0.05
            # time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 27
def reset():
    sh.agent.reset()

    if sh.observations is not None:
        path = dict(
            observations=tensor_utils.stack_tensor_list(sh.observations),
            actions=tensor_utils.stack_tensor_list(sh.actions),
            rewards=tensor_utils.stack_tensor_list(sh.rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(sh.agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(sh.env_infos),
        )

        sh.paths.append(path)
        sh.count += len(sh.observations)

        # check if it is time to update
        if sh.count > sh.algor.batch_size:
            itr = sh.itera
            with logger.prefix('itr #%d | ' % itr):
                paths = sh.paths
                samples_data = sh.algor.sampler.process_samples(itr, paths)
                sh.algor.log_diagnostics(paths)
                sh.algor.optimize_policy(itr, samples_data)
                logger.log("saving snapshot...")
                params = sh.algor.get_itr_snapshot(itr, samples_data)
                sh.algor.current_itr = itr + 1
                params["algo"] = sh.algor
                if sh.algor.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)

                sh.paths = []

                if sh.algor.plot:
                    sh.algor.update_plot()
                    if sh.algor.pause_for_plot:
                        raw_input("Plotting evaluation run: Press Enter to "
                                  "continue...")

            sh.itera += 1
            sh.count = 0

    # reset arrays
    sh.observations, sh.actions, sh.rewards, sh.agent_infos, sh.env_infos = [], [], [], [], []
Example 28
def cent_rollout(env,
                 agent,
                 max_path_length=np.inf,
                 animated=False,
                 speedup=1):
    """Centralized rollout"""
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        if isinstance(r, (list, np.ndarray)):
            assert (r == r[0]).all()
            r = r[0]
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 29
def gif_rollout(env, agent, max_path_length=np.inf, save_gif=False, **kwargs):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    initial_simparams = env.initial_simparams
    #agent.reset()
    path_length = 0
    #if animated:
    #    env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(o.reshape(-1))
        rewards.append(r)
        actions.append(a.reshape(-1))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
    if save_gif:
        actions = [
            np.clip(action, *env.j.action_space_bounds(initial_simparams))
            for action in actions
        ]
        env.save_gif(initial_simparams, np.column_stack(actions),
                     kwargs['filename'])
        #env.render(close=True)
        #env.render()

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
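Example 30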
    def collect_trajs_for_cost(self, n_trajs, pol, env, cls):
        paths = []
        #print(n_trajs)
        for iter_step in range(0, n_trajs):
            paths.append(self.cyberpunk_rollout(agent=pol, env=env, max_path_length=self.horizon,
                                                reward_extractor=None))


        data_matrix = tensor_utils.stack_tensor_list([p['im_observations'] for p in paths])
        class_matrix = np.tile(cls, (n_trajs, self.horizon, 1))
        
        return dict(data=data_matrix, classes=class_matrix)
Example 31
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        #debug = open('debug.log', 'a'); debug.write('rllab/sampler/utils.py/agent.get_action(o)\n'); debug.close()
        a, agent_info = agent.get_action(o)
        #debug = open('debug.log', 'a'); debug.write('rllab/sampler/utils.py/env.step(a)\n'); debug.close()
        next_o, r, d, env_info = env.step(a)
        #debug = open('debug.log', 'a'); debug.write('rllab/sampler/utils.py/observations.append()\n'); debug.close()
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=True)
        # env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 32
def rollout(env,
            agent,
            max_path_length=np.inf,
            speedup=1,
            save_video=False,
            video_filename='sim_out.mp4',
            reset_arg=3,
            renderMode='human',
            return_images=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)

    agent.reset()
    path_length = 0

    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        if return_images or save_video:
            images.append(env.render(renderMode))
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o

    if save_video:
        import moviepy.editor as mpy
        clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
        if video_filename[-3:] == 'gif':
            clip.write_gif(video_filename, fps=20 * speedup)
        else:
            clip.write_videofile(video_filename, fps=20 * speedup)

    if return_images:
        return dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            images=np.array(images))
    else:
        return dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        )
Example 33
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
Example 34
def sample_paths(N, 
    policy, 
    baseline, 
    env_mode='train', 
    T=1e6, 
    gamma=1, 
    normalized_env=False,
    env=None):
    # Directly specifying env works only when sampling in series

    # set random seed (needed for multiprocessing)
    np.random.seed()

    if env is None:
        env = get_environment(env_mode)
    T = min(T, env.horizon)
    T = max(1, T)
    # sanity check: under multiprocessing the env is sometimes not initialized
    # correctly, so clamp the horizon to a valid range

    print "####### Worker started #######"

    paths = []

    for ep in range(N):
        
        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []
        qpos = []
        qvel = []
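        # qpos/qvel record the raw MuJoCo generalized coordinates at every step,
        # alongside the flattened observations; for normalized envs the MuJoCo
        # env sits one wrapper level deeper (env.wrapped_env.env)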

        o = env.reset()
        if normalized_env:
            qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
            qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
        else:
            qpos.append(env.env.model.data.qpos.reshape(-1))
            qvel.append(env.env.model.data.qvel.reshape(-1))
        done = False
        t = 0

        while t < T and not done:
            a, agent_info = policy.get_action(o)
            next_o, r, done, env_info = env.step(a)
            observations.append(env.observation_space.flatten(o))
            actions.append(env.action_space.flatten(a))
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            if normalized_env:
                qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
                qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
            else:
                qpos.append(env.env.model.data.qpos.reshape(-1))
                qvel.append(env.env.model.data.qvel.reshape(-1))
            o = next_o
            t += 1

        # make a path dictionary
        # Also store the path belief and env data used in the trajectory
        try:
            path_belief = env.env.belief
        except Exception as e:
            path_belief = str(e)

        # path_model = env.env

        path = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            qpos=tensor_utils.stack_tensor_list(qpos),
            qvel=tensor_utils.stack_tensor_list(qvel),
            #path_belief=path_belief,
            #path_model=path_model,
        )

        # TODO: Storing the path model is too space inefficient. Need to find alternative
        
        # compute returns using the path
        path_baseline = baseline.predict(path)
        advantages = []
        returns = []
        return_so_far = 0
        for t in xrange(len(rewards) - 1, -1, -1):
            return_so_far = rewards[t] + gamma * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - path_baseline[t]
            advantages.append(advantage)

        # advantages and returns are stored backward in time
        advantages = np.array(advantages[::-1])
        returns = np.array(returns[::-1])
        
        # normalize advantages
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
        
        path["advantages"] = advantages
        path["returns"] = returns

        paths.append(path)

    #print "Env body_mass : ", env.env.model.body_mass[1]
    print "====== Worker finished ======"

    return paths
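
The per-path return/advantage computation above walks the rewards backwards, accumulating the discounted return, subtracting the baseline prediction, and finally normalizing the advantages. A self-contained sketch of just that step (NumPy only; the helper name is illustrative, and rewards and baseline_values stand in for path['rewards'] and baseline.predict(path)):

import numpy as np

def discounted_returns_and_advantages(rewards, baseline_values, gamma=1.0):
    # G_t = r_t + gamma * G_{t+1}, computed backwards in time
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    advantages = returns - np.asarray(baseline_values)
    # normalize to zero mean / unit variance, as in the loop above
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns, advantages

# Toy check: constant reward of 1, zero baseline, gamma=0.9
rets, advs = discounted_returns_and_advantages(np.ones(4), np.zeros(4), gamma=0.9)
# rets == [3.439, 2.71, 1.9, 1.0]
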
Esempio n. 35
0
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
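        # dones starts out all True so that the first policy.reset(dones) call
        # below clears the policy's internal state for every environment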
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            self.algo.policy.reset(dones)
            actions, agent_infos = self.algo.policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in xrange(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in xrange(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths

    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and not isinstance(reset_args, (list, np.ndarray)):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time


        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix+"PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix+"EnvExecTime", env_time)
        logger.record_tabular(log_prefix+"ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
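
A hypothetical call sketch for the reset_args variant above (sampler and task_goal are stand-in names). A single non-list reset_args value is broadcast to every parallel environment, and return_dict=True keeps the collected paths keyed by environment index instead of flattening them into one list:

# Hypothetical usage sketch.
paths_by_env = sampler.obtain_samples(itr=0, reset_args=task_goal,
                                      return_dict=True, log_prefix='task0_')
for env_idx, env_paths in paths_by_env.items():
    total_reward = sum(p['rewards'].sum() for p in env_paths)
    print('env %d: %d paths, total reward %.2f'
          % (env_idx, len(env_paths), total_reward))

# With return_dict=False (the default) the same call returns a flat list of paths.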