def test_model():
    env_name = 'DartHopperPT-v1'
    env = make_parallel(1, env_name, num=2)
    env2 = make(env_name, num=2, stochastic=False)
    batch_size = 30
    horizon = 100

    # sample a batch of initial states from the single (non-parallel) environment
    s = []
    for i in range(batch_size):
        env2.reset()
        s.append(get_state(env2))
    param = get_params(env2)
    params = np.array([param for i in range(batch_size)])
    env2.env.noisy_input = False
    s = np.array(s)

    # random action sequences of shape (batch_size, horizon, action_dim)
    a = [[env2.action_space.sample() for j in range(horizon)] for i in range(batch_size)]
    a = np.array(a)

    # roll the last action sequence in the single environment for comparison
    for i in range(3):
        obs, _, done, _ = env2.step(a[-1][i])
        if done:
            break

    # roll the whole batch through the parallel model
    for i in tqdm.trange(1):
        r, obs, mask = env(params, s, a)
    print(obs[-1][:3])
def test():
    env = make('DartHopperPT-v1', num=5)
    """
    env.reset()
    for i in tqdm.trange(10000):
        env.step(env.action_space.sample())
    """
    env.reset()
    state = get_state(env)
    for i in tqdm.trange(10000):
        # reset to a perturbed version of the previously saved state
        env.reset()
        set_state(env, state)
        state = state + np.random.normal(size=state.shape)
        env.step(env.action_space.sample())
        state = get_state(env)
def collect_trajectories(eval_env, policy, num_traj, max_horizon, use_state,
                         use_done=True, random_policy=True):
    gt_params = get_params(eval_env)
    for i in range(num_traj):
        obs = eval_env.reset()
        set_params(eval_env, gt_params)
        init_state = get_state(eval_env)

        observations = None
        actions = None
        masks = None
        if use_state:
            obs = init_state
        policy.reset()

        for j in range(max_horizon):
            # np.random.random() > 0 is (almost) always true, so a random action
            # is taken whenever random_policy is set
            if np.random.random() > 0 and random_policy:  # explore
                action = eval_env.action_space.sample()
            else:
                action = policy(obs)
            obs, _, done, _ = eval_env.step(action)

            if observations is None:
                observations = np.zeros((max_horizon, len(obs)))
                actions = np.zeros((max_horizon, len(action)))
                actions -= 10000000
                masks = np.zeros(max_horizon)

            observations[j] = obs
            actions[j] = action
            masks[j] = 1
            if use_state:
                # we always record the observation; the state is only fed to the policy
                obs = get_state(eval_env)
            if done and use_done:
                break
        if j == 0:
            continue
        yield init_state, observations, actions, masks
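# Usage sketch (illustrative only, not part of the original pipeline): `env` and
# `policy` are placeholders for an environment created with make(...) and a policy
# object exposing __call__ and reset(). It simply stacks the short trajectories
# yielded by collect_trajectories into arrays.
def example_collect(env, policy):
    data = list(collect_trajectories(env, policy, num_traj=5, max_horizon=50, use_state=True))
    init_states = np.array([d[0] for d in data])
    observations = np.array([d[1] for d in data])
    actions = np.array([d[2] for d in data])
    masks = np.array([d[3] for d in data])
    return init_states, observations, actions, masks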
def eval_policy(policy, eval_env, eval_episodes=10, save_video=0,
                video_path="video{}.avi", timestep=int(1e9), use_state=True,
                set_gt_params=False, print_timestep=10000):
    avg_reward = 0.
    acc = []
    trajectories = []
    rewards = []
    for episode_id in tqdm.trange(eval_episodes):
        state, done = eval_env.reset(), False
        out = None

        if hasattr(policy, 'reset'):
            policy.reset()
        if set_gt_params:
            policy.set_params(get_params(eval_env))

        #while not done:
        states = []
        actions = []
        for i in tqdm.trange(timestep):
            if i % print_timestep == print_timestep - 1:
                print('\n\n', avg_reward, "past: ", rewards, '\n\n')
            if use_state:
                state = get_state(eval_env)
            states.append(state.tolist())
            action = policy(state)
            actions.append(action.tolist())
            state, reward, done, info = eval_env.step(action)
            avg_reward += reward
            if done:
                break
        states.append(state.tolist())

        if out is not None:
            out.release()
        trajectories.append([states, actions])
        rewards.append(avg_reward)
        avg_reward = 0

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {np.mean(rewards):.3f}, std: {np.std(rewards)}")
    if len(acc) > 0:
        print(f"Evaluation success rate over {eval_episodes} episodes: {np.mean(acc):.3f}")
    print("---------------------------------------")
    return trajectories, rewards
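# Usage sketch (illustrative only): evaluate a policy for a few short episodes and
# report the mean return. `env` and `policy` are placeholder names; `env` is assumed
# to come from make(...) and `policy` to be callable on a state.
def example_eval(env, policy):
    trajectories, rewards = eval_policy(policy, env, eval_episodes=5,
                                        timestep=1000, use_state=True)
    print(len(trajectories), 'episodes, mean return', np.mean(rewards))
    return rewards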
def _get_data(self, path):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    gt_params = []
    train_traj_offline = []  # trajs at random positions
    train_traj_online = []   # recent trajs
    test_traj = []

    eval_env = self.eval_env
    eval_env.env.resample_MP = False
    for i in tqdm.trange(self.total):
        # resample the model parameters once per collected trajectory
        eval_env.env.resample_MP = True
        eval_env.reset()
        eval_env.env.resample_MP = False
        gt_param = get_params(eval_env)
        gt_params.append(gt_param)
        self.policy.set_params(gt_param)
        self.policy.reset()

        # collect_train_traj
        states = []
        observations = []
        actions = []
        obs = eval_env.reset()
        while True:  # collect the whole trajectory
            # policy 1
            action = self.policy(obs)
            states.append(get_state(eval_env))
            actions.append(action)
            obs, r, done, _ = eval_env.step(action)
            observations.append(obs)
            if done:
                break
        if len(observations) < self.max_horizon * 2:
            continue

        # traj: states, observations, actions, mask
        test_idx = np.random.randint(self.max_horizon, len(states) - self.max_horizon)
        test_traj.append(
            (states[test_idx],
             np.array(observations[test_idx:test_idx + self.max_horizon]),
             np.array(actions[test_idx:test_idx + self.max_horizon])))

        train = []
        train_online = []
        #for i in range(self.num_train):
        #    idx = np.random.randint(len(states))
        for idx in range(self.num_train):
            train.append((states[idx], observations[idx:idx + 1],
                          np.array(actions[idx:idx + 1])))
        #for idx in range(test_idx - self.max_horizon, test_idx - 1):
        for _ in range(self.num_train):
            idx = np.random.randint(test_idx)
            train_online.append((states[idx], observations[idx:idx + 1],
                                 actions[idx:idx + 1]))

        # stack the (state, observation, action) tuples into three arrays each
        train = [np.array([j[i] for j in train]) for i in range(3)]
        train_online = [np.array([j[i] for j in train_online]) for i in range(3)]
        train_traj_offline.append(train)
        train_traj_online.append(train_online)

    print(np.array(gt_params).shape)
    data = [test_traj, train_traj_offline, train_traj_online, gt_params]
    with open(path, 'wb') as f:
        pickle.dump(data, f)
    return data
def online_osi(eval_env, osi, policy, num_init_traj, max_horizon, eval_episodes,
               use_state=True, print_timestep=1000, resample_MP=True, online=True,
               ensemble=1, gt=False):
    # fix the seed...
    from osi import seed
    seed(eval_env, 0)

    # pre-sample a set of ground-truth parameters, one per evaluation episode
    parameters = []
    for i in range(100):
        eval_env.reset()
        parameters.append(get_params(eval_env))

    resample_MP_init = eval_env.env.resample_MP
    rewards = []
    for episode in tqdm.trange(eval_episodes):
        osi.reset()
        eval_env.env.resample_MP = resample_MP
        eval_env.reset()
        eval_env.env.resample_MP = False
        if parameters is not None:
            set_params(eval_env, parameters[episode])

        # warm up the OSI with a few short trajectories
        for init_state, observations, actions, masks in collect_trajectories(
                eval_env, policy, num_init_traj, max_horizon, use_state):
            osi.update(init_state, observations, actions, masks)

        #params = osi.get_params()
        print('gt', get_params(eval_env))
        if gt:
            params = get_params(eval_env)
        else:
            params = osi.find_min(ensemble, method='all')  # get a good initialization
        policy.set_params(params)
        #print(params, get_params(eval_env))
        dist = np.linalg.norm((params - get_params(eval_env)), axis=-1)

        total_rewards = []
        for xx in range(5):
            reward = 0
            obs, state = eval_env.reset(), get_state(eval_env)
            policy.reset()
            states = []
            observations = []
            actions = []
            states.append(state)  # record the initial state

            for i in range(1000):
                if use_state:
                    action = policy(state)
                else:
                    action = policy(obs)
                obs, r, done, _ = eval_env.step(action)
                state = get_state(eval_env)
                states.append(state)
                observations.append(obs)
                actions.append(action)

                if i % print_timestep == print_timestep - 1:
                    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')

                # periodically refine the parameter estimate online
                if i % max_horizon == max_horizon - 1 and i > max_horizon + 3 and online:
                    xx = i // max_horizon
                    if xx % online == online - 1:
                        idx = i - max_horizon - 1
                        osi.update(states[idx], observations[idx:idx + max_horizon],
                                   actions[idx:idx + max_horizon], 1, maxlen=3)
                        tmp = osi.cem.iter_num
                        #osi.cem.iter_num = 5  # we need at least 10 iterations??
                        osi.cem.iter_num = 10  # we need at least 10 iterations??
                        osi.cem.std = 0.1
                        osi.cem.num_mutation = 100
                        osi.cem.num_elite = 5
                        params = params * 0.5 + osi.get_params() * 0.5  # don't know if this is ok
                        policy.set_params(params)
                        print(params, get_params(eval_env))
                        print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')

                reward += r
                #if i % print_timestep == print_timestep - 1 or done:
                #    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')
                if done:
                    break
            rewards.append(reward)

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {np.mean(rewards):.3f}, std: {np.std(rewards)}")
    print("---------------------------------------")
    return rewards, dist
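# Usage sketch (illustrative only): run online system identification. `env`, `osi`
# and `policy` are placeholders; `osi` is assumed to expose reset()/update()/
# find_min()/get_params() and a `cem` attribute, exactly as used by online_osi
# above, and `online=1` treats every max_horizon chunk as an update point.
def example_online_osi(env, osi, policy):
    rewards, dist = online_osi(env, osi, policy, num_init_traj=3, max_horizon=50,
                               eval_episodes=10, use_state=True, online=1)
    print('mean reward', np.mean(rewards), 'param error', dist)
    return rewards, dist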