Example 1
import numpy as np
import tqdm

# make, make_parallel, get_state and get_params are assumed to come from the
# project's own environment utilities, which are not shown in this snippet.


def test_model():
    env_name = 'DartHopperPT-v1'
    # batched environment that is called below as env(params, s, a)
    env = make_parallel(1, env_name, num=2)

    env2 = make(env_name, num=2, stochastic=False)
    batch_size = 30
    horizon = 100

    # collect a batch of initial states and replicate the current parameters
    s = []
    for i in range(batch_size):
        env2.reset()
        s.append(get_state(env2))

    param = get_params(env2)
    params = np.array([param for i in range(batch_size)])
    env2.env.noisy_input = False

    s = np.array(s)
    a = [[env2.action_space.sample() for j in range(horizon)]
         for i in range(batch_size)]
    a = np.array(a)

    # sanity check: step the single environment with the first actions of the last sequence
    for i in range(3):
        obs, _, done, _ = env2.step(a[-1][i])
        if done:
            break

    # roll the whole batch through the parallel environment
    for i in tqdm.trange(1):
        r, obs, mask = env(params, s, a)
    print(obs[-1][:3])
Example 2
def test():
    env = make('DartHopperPT-v1', num=5)
    """
    env.reset()
    for i in tqdm.trange(10000):
        env.step(env.action_space.sample())
        """

    env.reset()
    state = get_state(env)
    for i in tqdm.trange(10000):
        env.reset()
        # perturb the saved state slightly, restore it, then take one random step
        state = state + np.random.normal(size=state.shape)
        set_state(env, state)
        env.step(env.action_space.sample())
        state = get_state(env)
Example 3
def collect_trajectories(eval_env, policy, num_traj, max_horizon, use_state, use_done=True, random_policy=True):
    gt_params = get_params(eval_env)

    for i in range(num_traj):
        obs = eval_env.reset()
        set_params(eval_env, gt_params)

        init_state = get_state(eval_env)
        observations = None
        actions = None
        masks = None

        if use_state:
            obs = init_state

        policy.reset()
        for j in range(max_horizon):
            if random_policy:  # explore with uniformly sampled actions
                action = eval_env.action_space.sample()
            else:
                action = policy(obs)
            obs, _, done, _ = eval_env.step(action)

            if observations is None:
                # lazily allocate fixed-size buffers; unfilled action slots keep
                # a large negative sentinel and a zero mask
                observations = np.zeros((max_horizon, len(obs)))
                actions = np.zeros((max_horizon, len(action)))
                actions -= 10000000
                masks = np.zeros(max_horizon)

            observations[j] = obs
            actions[j] = action
            masks[j] = 1

            if use_state:
                # feed the policy the full simulator state instead of the observation
                obs = get_state(eval_env)
            if done and use_done:
                break

        if j == 0:
            # discard trajectories that terminated on the very first step
            continue

        yield init_state, observations, actions, masks
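
A minimal consumption sketch (assuming an eval_env built with make(...) and a policy object exposing reset() and __call__(obs), as in the other snippets; the helper name stack_trajectories is made up here for illustration):

import numpy as np

def stack_trajectories(eval_env, policy, num_traj=8, max_horizon=20):
    # collect_trajectories yields fixed-length, mask-padded windows,
    # so the pieces can simply be stacked into batch arrays
    init_states, obs_batch, act_batch, mask_batch = [], [], [], []
    for init_state, observations, actions, masks in collect_trajectories(
            eval_env, policy, num_traj, max_horizon, use_state=True):
        init_states.append(init_state)
        obs_batch.append(observations)
        act_batch.append(actions)
        mask_batch.append(masks)
    return (np.array(init_states), np.array(obs_batch),
            np.array(act_batch), np.array(mask_batch))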
Example 4
def eval_policy(policy, eval_env, eval_episodes=10, save_video=0, video_path="video{}.avi", timestep=int(1e9), use_state=True, set_gt_params=False, print_timestep=10000):

    avg_reward = 0.
    acc = []

    trajectories = []
    rewards = []
    for episode_id in tqdm.trange(eval_episodes):
        state, done = eval_env.reset(), False

        out = None  # optional video writer handle; never created in this snippet
        if hasattr(policy, 'reset'):
            policy.reset()

        if set_gt_params:
            policy.set_params(get_params(eval_env))

        #while not done:
        states = []
        actions = []
        for i in tqdm.trange(timestep):
            if i % print_timestep == print_timestep-1:
                print('\n\n', avg_reward, "past: ", rewards, '\n\n')

            if use_state:
                # evaluate the policy on the full simulator state
                state = get_state(eval_env)
            states.append(state.tolist())
            action = policy(state)
            actions.append(action.tolist())
            state, reward, done, info = eval_env.step(action)
            avg_reward += reward
            if done:
                break
        states.append(state.tolist())

        if out is not None:
            out.release()
        trajectories.append([states, actions])

        rewards.append(avg_reward)
        avg_reward = 0


    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {np.mean(rewards):.3f},  std: {np.std(rewards)}")
    if len(acc) > 0:
        print(f"Evaluation success rate over {eval_episodes} episodes: {np.mean(acc):.3f}")
    print("---------------------------------------")
    return trajectories, rewards
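
A sketch of how eval_policy might be called; the environment follows the other snippets, while the policy object and the argument values here are placeholders rather than project defaults:

eval_env = make('DartHopperPT-v1', num=5)
trajectories, rewards = eval_policy(policy, eval_env, eval_episodes=10,
                                    timestep=1000, use_state=True)
print(len(trajectories), np.mean(rewards))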
Example 5
    def _get_data(self, path):
        # return the cached dataset if it has already been generated
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return pickle.load(f)

        gt_params = []

        train_traj_offline = []  # trajs at random position..
        train_traj_online = []  # recent trajs
        test_traj = []

        eval_env = self.eval_env
        eval_env.env.resample_MP = False

        for i in tqdm.trange(self.total):
            # resample the dynamics parameters exactly once per trajectory,
            # then freeze them
            eval_env.env.resample_MP = True
            eval_env.reset()
            eval_env.env.resample_MP = False
            gt_param = get_params(eval_env)
            gt_params.append(gt_param)
            self.policy.set_params(gt_param)
            self.policy.reset()

            # collect_train_traj
            states = []
            observations = []
            actions = []

            obs = eval_env.reset()

            while True:
                # collect the whole trajectories
                # policy 1
                action = self.policy(obs)
                states.append(get_state(eval_env))
                actions.append(action)

                obs, r, done, _ = eval_env.step(action)
                observations.append(obs)
                if done:
                    break

            # need strictly more than 2 * max_horizon steps so that a test window
            # can be cut from the middle of the trajectory (np.random.randint
            # below needs a non-empty range)
            if len(observations) <= self.max_horizon * 2:
                continue

            # each test trajectory: (state, observations, actions)
            test_idx = np.random.randint(self.max_horizon,
                                         len(states) - self.max_horizon)
            test_traj.append(
                (states[test_idx],
                 np.array(observations[test_idx:test_idx + self.max_horizon]),
                 np.array(actions[test_idx:test_idx + self.max_horizon])))

            train = []
            train_online = []
            # offline set: the first num_train single-step transitions
            for idx in range(self.num_train):
                train.append((states[idx], observations[idx:idx + 1],
                              np.array(actions[idx:idx + 1])))

            # online set: num_train single-step transitions sampled uniformly
            # from the part of the trajectory before the test window
            for i in range(self.num_train):
                idx = np.random.randint(test_idx)
                train_online.append((states[idx], observations[idx:idx + 1],
                                     np.array(actions[idx:idx + 1])))
            train = [np.array([j[i] for j in train]) for i in range(3)]
            train_online = [
                np.array([j[i] for j in train_online]) for i in range(3)
            ]

            train_traj_offline.append(train)
            train_traj_online.append(train_online)

        print(np.array(gt_params).shape)
        data = [test_traj, train_traj_offline, train_traj_online, gt_params]
        with open(path, 'wb') as f:
            pickle.dump(data, f)
        return data
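
The cached file written above can be read back directly; a small sketch (the path below is a placeholder) that unpacks the four lists in the order they were dumped:

import pickle

with open('dataset.pkl', 'rb') as f:  # placeholder path
    test_traj, train_traj_offline, train_traj_online, gt_params = pickle.load(f)

# each test_traj entry is a (state, observations, actions) window, while the
# train lists hold stacked single-step transitions for every trajectory
print(len(test_traj), len(gt_params))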
Example 6
def online_osi(eval_env, osi, policy, num_init_traj, max_horizon, eval_episodes, use_state=True, print_timestep=1000, resample_MP=True, online=True, ensemble=1, gt=False):
    # fix the seed
    from osi import seed
    seed(eval_env, 0)
    # pre-sample one parameter vector per evaluation episode so runs are comparable
    parameters = []
    for i in range(100):
        eval_env.reset()
        parameters.append(get_params(eval_env))

    resample_MP_init = eval_env.env.resample_MP
    rewards = []
    for episode in tqdm.trange(eval_episodes):
        osi.reset()

        # resample the dynamics parameters once, then freeze them for this episode
        eval_env.env.resample_MP = resample_MP
        eval_env.reset()
        eval_env.env.resample_MP = False
        if parameters is not None:
            set_params(eval_env, parameters[episode])

        # warm up the OSI estimator with a few short trajectories
        for init_state, observations, actions, masks in collect_trajectories(eval_env, policy, num_init_traj, max_horizon, use_state):
            osi.update(init_state, observations, actions, masks)

        #params = osi.get_params()
        print('gt', get_params(eval_env))
        if gt:
            params = get_params(eval_env)
        else:
            params = osi.find_min(ensemble, method='all') # get a good initialization
        policy.set_params(params)
        #print(params, get_params(eval_env))
        # parameter-estimation error for this episode (only the last value is returned)
        dist = np.linalg.norm((params - get_params(eval_env)), axis=-1)

        total_rewards = []

        # evaluate the policy with the estimated parameters over a few rollouts
        for xx in range(5):
            reward = 0
            obs, state = eval_env.reset(), get_state(eval_env)
            policy.reset()

            states = []
            observations = []
            actions = []
            states.append(state)  # record the initial state

            for i in range(1000):
                if use_state:
                    action = policy(state)
                else:
                    action = policy(obs)

                obs, r, done, _ = eval_env.step(action)
                state = get_state(eval_env)
                states.append(state)


                observations.append(obs)
                actions.append(action)

                if i % print_timestep == print_timestep - 1:
                    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')

                if i % max_horizon == max_horizon - 1 and i > max_horizon + 3 and online:
                    # every `online` windows, refine the parameter estimate
                    # with the most recent max_horizon-step window
                    chunk = i // max_horizon
                    if chunk % online == online - 1:
                        idx = i - max_horizon - 1
                        osi.update(states[idx], observations[idx:idx+max_horizon], actions[idx:idx+max_horizon], 1, maxlen=3)
                        tmp = osi.cem.iter_num
                        osi.cem.iter_num = 10  # we need at least 10 iterations??
                        osi.cem.std = 0.1
                        osi.cem.num_mutation = 100
                        osi.cem.num_elite = 5
                        # blend the old and new estimates; don't know if this is ok
                        params = params * 0.5 + osi.get_params() * 0.5
                        policy.set_params(params)
                    print(params, get_params(eval_env))
                    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')


                reward += r
                #if i % print_timestep == print_timestep-1 or done:
                #    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')
                if done:
                    break
            rewards.append(reward)


    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {np.mean(rewards):.3f},  std: {np.std(rewards)}")
    print("---------------------------------------")
    return rewards, dist
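
A hedged usage sketch: online_osi expects an environment, an OSI estimator and a policy from the surrounding project, so the objects and the argument values below are placeholders only:

rewards, dist = online_osi(eval_env, osi, policy,
                           num_init_traj=3, max_horizon=20,
                           eval_episodes=10, use_state=True,
                           online=2, ensemble=3)
print(np.mean(rewards), dist)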