def populate_task(env, policy):
    logger.log("Populating workers...")
    singleton_pool.run_each(
        _worker_populate_task,
        [(pickle.dumps(env), pickle.dumps(policy))] * singleton_pool.n_parallel
    )
    logger.log("Populated")
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None):
    """
    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all trajectories
     will be rolled out either until termination or until max_path_length is
     reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, scope)] * singleton_pool.n_parallel
    )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=(max_path_length, scope),
        show_prog_bar=True
    )
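# A minimal usage sketch for sample_paths() above (hedged: `env` and `policy`
# are hypothetical caller-supplied rllab-style objects, the sample sizes are
# arbitrary, and this collection contains several variants of these helpers).
# It chains the helpers defined in this module: initialize the worker pool,
# broadcast the env/policy, collect paths, then tear the workers down.
def _example_sample_paths_usage(env, policy, n_parallel=4):
    initialize(n_parallel)
    populate_task(env, policy)
    paths = sample_paths(
        policy_params=policy.get_param_values(),
        max_samples=10000,
        max_path_length=500,
    )
    terminate_task()
    return paths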
def populate_task(env, policy, dynamics):
    logger.log("Populating workers...")
    singleton_pool.run_each(
        _worker_populate_task,
        [(env, policy, dynamics)] * singleton_pool.n_parallel
    )
    logger.log("Populated")
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)]
            * singleton_pool.n_parallel
        )
    else:
        # avoid unnecessary copying
        G = _get_scoped_G(singleton_pool.G, scope)
        G.env = env
        G.policy = policy
    logger.log("Populated")
def step(self, action_n):
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = list(zip(*results))
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]
    items = list(zip(ids, obs, rewards, dones, env_infos))
    items = sorted(items, key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = list(zip(*items))
    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    self.ts += 1
    dones[self.ts >= self.max_path_length] = True
    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(list(env_infos))
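# A self-contained sketch (plain numpy, hypothetical data) of the two
# bookkeeping tricks in step() above: per-worker results come back in
# arbitrary env order and are re-sorted by env id, and an env is forced
# `done` once its step counter reaches max_path_length.
import numpy as np

ids = np.array([2, 0, 3, 1])          # env ids as returned by the workers
rewards = np.array([.2, .0, .3, .1])  # rewards in the same (worker) order
order = np.argsort(ids)
rewards = rewards[order]              # now aligned with env ids 0..3

ts = np.array([4, 9, 1, 9])           # per-env step counters
max_path_length = 10
ts += 1
dones = np.zeros(4, dtype=bool)
dones[ts >= max_path_length] = True   # envs 1 and 3 are cut off here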
def sample_paths(
        policy_params,
        dynamics_params,
        max_samples,
        max_path_length=np.inf,
        itr=None,
        normalize_reward=None,
        reward_mean=None,
        reward_std=None,
        kl_batch_size=None,
        n_itr_update=None,
        use_replay_pool=None,
        obs_mean=None,
        obs_std=None,
        act_mean=None,
        act_std=None,
        second_order_update=None
):
    """
    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all trajectories
     will be rolled out either until termination or until max_path_length is
     reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params,)] * singleton_pool.n_parallel
    )
    # Set dynamics params.
    # --------------------
    singleton_pool.run_each(
        _worker_set_dynamics_params,
        [(dynamics_params,)] * singleton_pool.n_parallel
    )
    # --------------------
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=(max_path_length, itr, normalize_reward, reward_mean, reward_std,
              kl_batch_size, n_itr_update, use_replay_pool, obs_mean, obs_std,
              act_mean, act_std, second_order_update),
        show_prog_bar=True
    )
def __init__(self, env, n, max_path_length, scope=None):
    if scope is None:
        # initialize random scope
        scope = str(uuid.uuid4())
    envs_per_worker = int(np.ceil(n * 1.0 / singleton_pool.n_parallel))
    alloc_env_ids = []
    rest_alloc = n
    start_id = 0
    for _ in range(singleton_pool.n_parallel):
        n_allocs = min(envs_per_worker, rest_alloc)
        alloc_env_ids.append(list(range(start_id, start_id + n_allocs)))
        start_id += n_allocs
        rest_alloc = max(0, rest_alloc - envs_per_worker)
    singleton_pool.run_each(
        worker_init_envs,
        [(alloc, scope, env) for alloc in alloc_env_ids])
    self._alloc_env_ids = alloc_env_ids
    self._action_space = env.action_space
    self._observation_space = env.observation_space
    self._num_envs = n
    self.scope = scope
    self.ts = np.zeros(n, dtype='int')
    self.max_path_length = max_path_length
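# A self-contained sketch (plain Python/numpy, hypothetical sizes) of the
# allocation loop in __init__ above: 10 envs split over 4 workers with a
# ceiling split gives contiguous id chunks of sizes 3, 3, 3, 1.
import numpy as np

n, n_parallel = 10, 4
envs_per_worker = int(np.ceil(n * 1.0 / n_parallel))  # ceil(10 / 4) == 3
alloc_env_ids, rest_alloc, start_id = [], n, 0
for _ in range(n_parallel):
    n_allocs = min(envs_per_worker, rest_alloc)
    alloc_env_ids.append(list(range(start_id, start_id + n_allocs)))
    start_id += n_allocs
    rest_alloc = max(0, rest_alloc - envs_per_worker)
print(alloc_env_ids)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]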
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        reset_arg=None,
        show_prog_bar=True,
        multi_task=False):
    """
    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all trajectories
     will be rolled out either until termination or until max_path_length is
     reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    if multi_task:
        assert len(policy_params) == singleton_pool.n_parallel
        all_params = [(params, scope) for params in policy_params]
        singleton_pool.run_each(
            _worker_set_policy_params,
            all_params,
        )
    else:
        singleton_pool.run_each(
            _worker_set_policy_params,
            [(policy_params, scope)] * singleton_pool.n_parallel
        )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )
    if multi_task:
        args = [(max_path_length, scope, arg) for arg in reset_arg]
        return singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=args,
            show_prog_bar=show_prog_bar,
            multi_task=multi_task,
        )
    else:
        return singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=(max_path_length, scope, reset_arg),
            show_prog_bar=show_prog_bar,
            multi_task=multi_task,
        )
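# A minimal sketch of the multi-task path through sample_paths() above
# (hedged: `policies` and `task_resets` are hypothetical; in multi-task mode
# the function asserts one parameter set per worker, and one reset argument
# is passed to each worker/task).
def _example_multi_task_usage(policies, task_resets):
    return sample_paths(
        policy_params=[p.get_param_values() for p in policies],
        max_samples=5000,
        max_path_length=200,
        reset_arg=task_resets,  # one reset arg per worker/task
        multi_task=True,
    )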
def _run_reset(self, dones):
    dones = np.asarray(dones)
    results = singleton_pool.run_each(
        worker_run_reset,
        [(dones, self.scope) for _ in self._alloc_env_ids],
    )
    ids, flat_obs = list(map(np.concatenate, list(zip(*results))))
    zipped = list(zip(ids, flat_obs))
    sorted_obs = np.asarray([x[1] for x in sorted(zipped, key=lambda x: x[0])])
    done_ids, = np.where(dones)
    done_flat_obs = sorted_obs[done_ids]
    done_unflat_obs = self.observation_space.unflatten_n(done_flat_obs)
    all_obs = [None] * self.num_envs
    done_cursor = 0
    for idx, done in enumerate(dones):
        if done:
            all_obs[idx] = done_unflat_obs[done_cursor]
            done_cursor += 1
    return all_obs
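# A self-contained sketch (plain numpy, hypothetical data) of the scatter-back
# at the end of _run_reset() above: only envs flagged done receive a fresh
# observation; the rest stay None.
import numpy as np

dones = np.array([False, True, False, True])
fresh_obs = np.array([[1.0, 1.0], [2.0, 2.0]])  # one row per done env, in id order
all_obs, cursor = [None] * len(dones), 0
for idx, done in enumerate(dones):
    if done:
        all_obs[idx] = fresh_obs[cursor]
        cursor += 1
# all_obs == [None, array([1., 1.]), None, array([2., 2.])]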
def initialize(n_parallel):
    singleton_pool.initialize(n_parallel)
    singleton_pool.run_each(
        _worker_init, [(id,) for id in xrange(singleton_pool.n_parallel)])
def start_worker(self):
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(worker_init_tf)
    parallel_sampler.populate_task(self.algo.env, self.algo.policy)
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(worker_init_tf_vars)
def terminate_task(scope=None):
    singleton_pool.run_each(
        _worker_terminate_task,
        [(scope,)] * singleton_pool.n_parallel
    )
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        iter=0,
        env=None,
        policy=None,
        baseline=None,
        sim_percentage=1.0 / 3.0,
        target_task=None):
    """
    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all trajectories
     will be rolled out either until termination or until max_path_length is
     reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, scope)] * singleton_pool.n_parallel
    )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )
    if target_task is not None:
        singleton_pool.run_each(
            _worker_update_dyn,
            [('target_task', target_task, scope)] * singleton_pool.n_parallel)

    if singleton_pool.G.ensemble_dynamics['use_ens_dyn'] and iter > 0:
        # Collect the first sim_percentage of the samples with dynamics model
        # choice 0, then the remainder with choice 1 (after shipping the base
        # paths and the baseline to the workers).
        singleton_pool.run_each(
            _worker_update_dyn,
            [('dyn_model_choice', 0, scope)] * singleton_pool.n_parallel)
        result1 = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples * sim_percentage,
            args=(max_path_length, scope),
            show_prog_bar=True
        )
        singleton_pool.run_each(
            _worker_update_dyn,
            [('dyn_model_choice', 1, scope)] * singleton_pool.n_parallel)
        singleton_pool.run_each(
            _worker_update_dyn,
            [('base_paths', result1, scope)] * singleton_pool.n_parallel)
        singleton_pool.run_each(
            _worker_update_dyn,
            [('baseline', baseline, scope)] * singleton_pool.n_parallel)
        result2 = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples * (1 - sim_percentage),
            args=(max_path_length, scope),
            show_prog_bar=True
        )
        result = result1 + result2
        # result = result1
    else:
        result = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=(max_path_length, scope),
            show_prog_bar=True
        )
    logger.log('Collected Traj Num: ' + str(len(result)))

    if 'model_parameters' in result[0]['env_infos'] and logger._snapshot_dir is not None:
        # Group paths by their model parameters, average the return per
        # parameter setting, and dump the sorted list to disk.
        mp_rew_raw = []
        for path in result:
            mp_rew_raw.append([np.array(path['env_infos']['model_parameters'][-1]),
                               path['rewards'].sum()])
        mp_rew_raw.sort(key=lambda x: str(x[0]))
        # print(mp_rew_raw)
        mp_rew = []
        i = 0
        while True:
            if i >= len(mp_rew_raw) - 1:
                break
            cur_mp = mp_rew_raw[i][0]
            cur_rew = mp_rew_raw[i][1]
            cur_mp_num = 1
            for j in range(i + 1, len(mp_rew_raw)):
                if (mp_rew_raw[j][0] - cur_mp).any():
                    break
                cur_rew += mp_rew_raw[j][1]
                cur_mp_num += 1
            i += cur_mp_num
            mp_rew.append([np.array(cur_mp), cur_rew * 1.0 / cur_mp_num])
        mp_rew.sort(key=lambda x: x[1])
        filename = logger._snapshot_dir + '/mp_rew_' + str(iter) + '.pkl'
        pickle.dump(mp_rew, open(filename, 'wb'))

    if singleton_pool.G.ensemble_dynamics['use_ens_dyn']:
        # Append (state, action) -> next_state pairs to a bounded training
        # buffer, refit the dynamics model, and broadcast it to the workers.
        dyn_training_x = []
        dyn_training_y = []
        dyn_training_result = result
        if iter > 0:
            dyn_training_result = result1
        for path in dyn_training_result:
            for state_act in path['env_infos']['state_act']:
                dyn_training_x.append(state_act)
            for next_state in path['env_infos']['next_state']:
                dyn_training_y.append(next_state)
        singleton_pool.G.ensemble_dynamics['training_buffer_x'] += dyn_training_x
        singleton_pool.G.ensemble_dynamics['training_buffer_y'] += dyn_training_y
        if len(singleton_pool.G.ensemble_dynamics['training_buffer_x']) > 10000:
            singleton_pool.G.ensemble_dynamics['training_buffer_x'] = \
                singleton_pool.G.ensemble_dynamics['training_buffer_x'][-10000:]
            singleton_pool.G.ensemble_dynamics['training_buffer_y'] = \
                singleton_pool.G.ensemble_dynamics['training_buffer_y'][-10000:]
        if iter % 1 == 0:
            optimize_iter = 100
            if iter != 0:
                optimize_iter = 5
            singleton_pool.G.ensemble_dynamics['dyn_models'][0].fit(
                singleton_pool.G.ensemble_dynamics['training_buffer_x'],
                singleton_pool.G.ensemble_dynamics['training_buffer_y'],
                iter=optimize_iter)
            # singleton_pool.G.ensemble_dynamics['transition_locator'].fit(
            #     singleton_pool.G.ensemble_dynamics['training_buffer_x'],
            #     singleton_pool.G.ensemble_dynamics['training_buffer_y'])
            print('fitted dynamic models and transition locator')
            singleton_pool.run_each(
                _worker_update_dyn,
                [('dyn_models', singleton_pool.G.ensemble_dynamics['dyn_models'],
                  scope)] * singleton_pool.n_parallel)
            # singleton_pool.run_each(_worker_update_dyn, [('transition_locator',
            #     singleton_pool.G.ensemble_dynamics['transition_locator'],
            #     scope)] * singleton_pool.n_parallel)
            if logger._snapshot_dir is not None:
                joblib.dump(singleton_pool.G.ensemble_dynamics['dyn_models'],
                            logger._snapshot_dir + '/dyn_models.pkl', compress=True)

    # augment the data with synthetic data
    '''if iter > 0:
        logger.log('Synthesizing data...')
        bg = time.time()
        dartenv = env._wrapped_env.env.env
        dartenv.dyn_model_id = 1
        dartenv.reset()
        if env._wrapped_env.monitoring:
            dartenv = dartenv.env
        data_size = int(max_samples * (1 - sim_percentage))
        random_state = []
        for i in range(data_size):
            path = result[np.random.randint(len(result))]
            state_act = path['env_infos']['state_act'][
                np.random.randint(len(path['env_infos']['state_act']))]
            state = state_act[0:singleton_pool.G.ensemble_dynamics['dyn_models'][0].state_dim]
            random_state.append(state + np.random.uniform(low=0.01, high=0.01, size=len(state)))
        obs = []
        for i in range(data_size):
            dartenv.set_state_vector(random_state[i])
            obs.append(dartenv._get_obs())
        raw_actions = policy.get_actions(obs)
        actions = raw_actions[0]
        next_state = []
        for i in range(data_size):
            next_state.append(singleton_pool.G.ensemble_dynamics['dyn_models'][0].do_simulation(
                random_state[i], actions[i], 4))
        rewards = []
        for i in range(data_size):
            rewards.append(dartenv.get_reward(random_state[i], actions[i], next_state[i], 0.2))
        for i in range(data_size):
            newpath = {}
            newpath['rewards'] = np.array([rewards[i]])
            newpath['env_infos'] = {}
            newpath['env_infos']['dyn_model_id'] = np.array([1])
            env_info_keys = list(result[0]['env_infos'].keys())
            for key in env_info_keys:
                if key not in newpath['env_infos']:
                    newpath['env_infos'][key] = np.copy(result[0]['env_infos'][key][[-1]])
            newpath['observations'] = np.array([obs[i]])
            newpath['actions'] = np.array([actions[i]])
            newpath['agent_infos'] = {}
            newpath['agent_infos']['log_std'] = raw_actions[1]['log_std'][[i]]
            newpath['agent_infos']['mean'] = raw_actions[1]['mean'][[i]]
            result.append(newpath)
        dartenv.dyn_model_id = 0
        ed = time.time()
        logger.log('Synthesize done, created: ' + str(ed - bg))'''
    return result
def update_env_params(env_params, scope=None):
    singleton_pool.run_each(
        _worker_set_env_params,
        [(env_params, scope)] * singleton_pool.n_parallel
    )
def terminate_task(scope=None):
    singleton_pool.run_each(
        _worker_terminate_task,
        [(scope,)] * singleton_pool.n_parallel)
    del _cached_populate_env[scope]
    del _cached_populate_policy[scope]
def set_seed(seed):
    singleton_pool.run_each(
        _worker_set_seed,
        [(seed + i,) for i in xrange(singleton_pool.n_parallel)]
    )
def terminate_task():
    singleton_pool.run_each(
        _worker_terminate_task,
        [tuple()] * singleton_pool.n_parallel
    )
def set_seed(seed):
    singleton_pool.run_each(
        _worker_set_seed,
        [(seed + i,) for i in range(singleton_pool.n_parallel)]
    )
def evaluate(env, agent, max_path_length, n_paths, ma_mode, disc):
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(worker_init_tf)
    ma_sampler.populate_task(env, agent, ma_mode)
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(worker_init_tf_vars)
    curr_policy_params = agent.get_param_values() if ma_mode != 'concurrent' \
        else [ag.get_param_values() for ag in agent]
    logger.log('Collecting paths...')
    paths = sample_paths(
        policy_params=curr_policy_params,
        env_params=None,
        max_samples=n_paths * max_path_length,
        max_path_length=max_path_length,
        ma_mode=ma_mode,
        scope=None)
    ma_sampler.terminate_task(scope=None)

    if ma_mode == 'centralized':
        ret = []
        discret = []
        envinfo = []
        for path in paths:
            pathret = path['rewards'].sum()
            # Take the discounted return from t=0, as in the decentralized
            # branch below (the original averaged the full cumsum array).
            pathdiscret = special.discount_cumsum(path['rewards'], disc)[0]
            info = path['env_infos']
            ret.append(pathret)
            discret.append(pathdiscret)
            envinfo.append(info)
        logger.log('Done!')
        # for n_path in range(n_paths):
        #     path = cent_rollout(env, agent, max_path_length)
        #     pathret = path['rewards'].sum()
        #     pathdiscret = special.discount_cumsum(path['rewards'], disc)
        #     info = path['env_infos']
        #     ret.append(pathret)
        #     discret.append(pathdiscret)
        #     envinfo.append(info)
        dictinfo = {k: np.mean(v)
                    for k, v in tensor_utils.stack_tensor_dict_list(envinfo).items()}
        return dict(ret=np.mean(ret), discret=np.mean(discret), **dictinfo)
    elif ma_mode == 'decentralized':
        agent2paths = {}
        for agid in range(len(env.agents)):
            agent2paths[agid] = []
        for agpaths in paths:
            for agid, agpath in enumerate(agpaths):
                agent2paths[agid].append(agpath)
        # for n_path in range(n_paths):
        #     paths = dec_rollout(env, agent, max_path_length)
        #     for agid, agpath in enumerate(paths):
        #         agent2paths[agid].append(agpath)
        rets, retsstd, discrets, infos = [], [], [], []
        retlist = []
        for agid, paths in agent2paths.items():
            agent_rets = [np.sum(path['rewards']) for path in paths]
            retlist.append(agent_rets)
            rets.append(np.mean([path['rewards'].sum() for path in paths]))
            retsstd.append(np.std([path['rewards'].sum() for path in paths]))
            discrets.append(np.mean([
                special.discount_cumsum(path['rewards'], disc)[0]
                for path in paths]))
            infos.append({k: np.mean(v)
                          for k, v in tensor_utils.stack_tensor_dict_list(
                              [path['env_infos'] for path in paths]).items()})
        logger.log('Done!')
        dictinfos = tensor_utils.stack_tensor_dict_list(infos)
        retlist = np.mean(retlist, axis=0)
        # return dict(ret=rets, retstd=retsstd, discret=discrets,
        #             path_reward=path_reward, retlist=retlist, **dictinfos)
        return dict(ret=rets, retstd=retsstd, discret=discrets,
                    retlist=retlist, **dictinfos)
    elif ma_mode == 'concurrent':
        raise NotImplementedError()
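# A self-contained sketch (plain numpy, hypothetical rewards) of the
# discounted return used in evaluate() above: discount_cumsum(x, d)[0]
# equals sum_t d**t * x[t], here computed directly.
import numpy as np

rewards = np.array([1.0, 1.0, 1.0])
disc = 0.9
discounted_return = np.sum(disc ** np.arange(len(rewards)) * rewards)
print(discounted_return)  # 1 + 0.9 + 0.81 == 2.71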
def initialize(n_parallel):
    print(("parallel_sampler:initialize n_parallel", n_parallel))
    singleton_pool.initialize(n_parallel)
    singleton_pool.run_each(
        _worker_init, [(id,) for id in range(singleton_pool.n_parallel)])