def sample_random_paths(max_samples, ma_mode, sampler, max_path_length=np.inf, scope=None):
    """
    Collect paths through the worker pool with
    ``_worker_collect_path_random_one_env``.

    :param max_samples: passed to the pool as the collection threshold
    :param ma_mode: forwarded unchanged to the worker
    :param sampler: forwarded unchanged to the worker
    :param max_path_length: forwarded unchanged to the worker (presumably the
     horizon of a single trajectory, as in ``sample_paths`` -- confirm)
    :param scope: forwarded unchanged to the worker
    :return: the value returned by ``singleton_pool.run_collect`` (presumably
     the list of collected paths, as for the other samplers in this file)
    """
    # The pool's result used to be dropped, so this function always returned
    # None; hand it back like every other sampler in this file does.
    return singleton_pool.run_collect(
        _worker_collect_path_random_one_env,
        threshold=max_samples,
        args=(max_path_length, ma_mode, sampler, scope),
        show_prog_bar=True,
    )
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        reset_arg=None,
        show_prog_bar=True,
        multi_task=False):
    """
    Push parameters to every worker, then collect paths from the pool.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process. With ``multi_task`` it must hold exactly one entry
     per worker (asserted).
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :param reset_arg: forwarded to the collecting worker; with ``multi_task``
     it is iterated and one argument tuple is built per element
    :param show_prog_bar: forwarded to ``run_collect``
    :param multi_task: selects per-worker parameters / reset arguments and is
     also forwarded to ``run_collect``
    :return: a list of collected paths
    """
    # Decide which policy parameters each worker receives.
    if multi_task:
        assert len(policy_params) == singleton_pool.n_parallel
        policy_payload = [(params, scope) for params in policy_params]
    else:
        policy_payload = [(policy_params, scope)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_policy_params, policy_payload)

    if env_params is not None:
        env_payload = [(env_params, scope)] * singleton_pool.n_parallel
        singleton_pool.run_each(_worker_set_env_params, env_payload)

    # One argument tuple per reset argument in multi-task mode, otherwise a
    # single tuple shared by all rollouts.
    if multi_task:
        collect_args = [(max_path_length, scope, arg) for arg in reset_arg]
    else:
        collect_args = (max_path_length, scope, reset_arg)

    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=collect_args,
        show_prog_bar=show_prog_bar,
        multi_task=multi_task,
    )
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        reset_arg=None,
        show_prog_bar=True,
        multi_task=False):
    """
    Synchronise worker parameters and gather rollouts from the pool.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process. In multi-task mode one entry per worker is required
     (asserted).
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :param reset_arg: forwarded to the collecting worker; in multi-task mode
     it is iterated to build one argument tuple per element
    :param show_prog_bar: forwarded to ``run_collect``
    :param multi_task: switches to per-worker parameters / reset arguments and
     is also forwarded to ``run_collect``
    :return: a list of collected paths
    """
    if multi_task:
        assert len(policy_params) == singleton_pool.n_parallel
        per_worker_policy = [(params, scope) for params in policy_params]
    else:
        per_worker_policy = [(policy_params, scope)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_policy_params, per_worker_policy)

    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel,
        )

    # Multi-task mode supplies a list of argument tuples, single-task mode a
    # single tuple.
    rollout_args = (
        [(max_path_length, scope, arg) for arg in reset_arg]
        if multi_task
        else (max_path_length, scope, reset_arg)
    )
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=rollout_args,
        show_prog_bar=show_prog_bar,
        multi_task=multi_task,
    )
def sample_paths(policy_params, max_samples, max_path_length=np.inf,
                 low_policy_params=None, env_params=None, scope=None):
    """
    Update the workers' parameters and collect paths from the pool.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param low_policy_params: if not None, sent to every worker through
     ``_worker_set_low_policy_params``
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :return: a list of collected paths
    """

    def for_every_worker(params):
        # One identical (params, scope) argument tuple per worker.
        return [(params, scope)] * singleton_pool.n_parallel

    singleton_pool.run_each(
        _worker_set_policy_params,
        for_every_worker(policy_params),
    )
    if low_policy_params is not None:
        singleton_pool.run_each(
            _worker_set_low_policy_params,
            for_every_worker(low_policy_params),
        )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            for_every_worker(env_params),
        )

    collect_args = (max_path_length, scope)
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=collect_args,
        show_prog_bar=True,
    )
def sample_paths(policy_params, dynamics_params, max_samples,
                 max_path_length=np.inf, itr=None, obs_mean=None,
                 obs_std=None, act_mean=None, act_std=None):
    """
    Send policy and dynamics parameters to the workers, then collect paths.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param dynamics_params: sent to every worker through
     ``_worker_set_dynamics_params``
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param itr: forwarded unchanged to the collecting worker
    :param obs_mean: forwarded unchanged to the collecting worker
    :param obs_std: forwarded unchanged to the collecting worker
    :param act_mean: forwarded unchanged to the collecting worker
    :param act_std: forwarded unchanged to the collecting worker
    :return: a list of collected paths
    """
    # Policy parameters first, dynamics parameters second.
    policy_payload = [(policy_params,)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_policy_params, policy_payload)

    dynamics_payload = [(dynamics_params,)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_dynamics_params, dynamics_payload)

    worker_args = (max_path_length, itr, obs_mean, obs_std, act_mean, act_std)
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=worker_args,
        show_prog_bar=True,
    )
def sample_paths(policy_params, max_samples, max_path_length=np.inf,
                 env_params=None, scope=None, useImitationPolicy=False,
                 useImitationEnv=False, count_traj=False,
                 terminate_only_max_path=False):
    """
    Optionally update worker parameters, then collect paths from the pool.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process, unless ``useImitationPolicy`` is truthy or the value
     is None
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :param useImitationPolicy: suppresses the policy parameter update and is
     forwarded to the collecting worker
    :param useImitationEnv: forwarded unchanged to the collecting worker
    :param count_traj: if true then max_samples is the desired maximum number
     of trajectories to be collected.
    :param terminate_only_max_path: forwarded unchanged to the collecting
     worker
    :return: a list of collected paths
    """
    skip_policy_update = useImitationPolicy or policy_params is None
    if not skip_policy_update:
        singleton_pool.run_each(
            _worker_set_policy_params,
            [(policy_params, scope)] * singleton_pool.n_parallel,
        )

    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel,
        )

    worker_args = (
        max_path_length,
        scope,
        useImitationPolicy,
        useImitationEnv,
        count_traj,
        terminate_only_max_path,
    )
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=worker_args,
        show_prog_bar=True,
    )
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None):
    """
    Update the workers' parameters and collect paths from the pool.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :return: a list of collected paths
    """
    # (setter, parameters) pairs are applied in order; the environment update
    # only happens when parameters were supplied.
    policy_payload = [(policy_params, scope)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_policy_params, policy_payload)

    if env_params is not None:
        env_payload = [(env_params, scope)] * singleton_pool.n_parallel
        singleton_pool.run_each(_worker_set_env_params, env_payload)

    collected = singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=(max_path_length, scope),
        show_prog_bar=True,
    )
    return collected
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        dyn_model=None,
        env_params=None,
        scope=None,
        policy=None,
        rau=None,
        delta=0,
        constraint_fn=None,
        constraint_cost_fn=None,
        HCMPC_Activation=False,
        Constrained=False,
):
    """
    Update the workers' parameters and collect paths, optionally with the
    constrained rollout worker.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param dyn_model: forwarded to the constrained worker only
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :param policy: forwarded to the constrained worker only
    :param rau: forwarded to the constrained worker only
    :param delta: forwarded to the constrained worker only
    :param constraint_fn: forwarded to the constrained worker only
    :param constraint_cost_fn: forwarded to the constrained worker only
    :param HCMPC_Activation: forwarded to the constrained worker only
    :param Constrained: when truthy, paths are collected with
     ``_worker_collect_one_path_constrained`` instead of
     ``_worker_collect_one_path``
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, scope)] * singleton_pool.n_parallel,
    )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel,
        )

    # Unconstrained algorithms use the plain rollout worker.
    if not Constrained:
        return singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=(max_path_length, scope),
            show_prog_bar=True,
        )

    constrained_args = (
        dyn_model,
        max_path_length,
        scope,
        policy,
        rau,
        delta,
        constraint_fn,
        constraint_cost_fn,
        HCMPC_Activation,
    )
    return singleton_pool.run_collect(
        _worker_collect_one_path_constrained,
        threshold=max_samples,
        args=constrained_args,
        show_prog_bar=True,
    )
def sample_paths(
        policy_params,
        dynamics_params,
        max_samples,
        max_path_length=np.inf,
        itr=None,
        normalize_reward=None,
        reward_mean=None,
        reward_std=None,
        kl_batch_size=None,
        n_itr_update=None,
        use_replay_pool=None,
        obs_mean=None,
        obs_std=None,
        act_mean=None,
        act_std=None,
        second_order_update=None,
        use_hide=True,
        use_hide_alg='my',
        show_rollout_chance=0.01,
        hide_tmax=10,
        mode=None
):
    """
    Send policy and dynamics parameters to the workers, then collect paths.

    Apart from the parameters documented below, every argument is forwarded
    unchanged to ``_worker_collect_one_path``.

    :param policy_params: (dict) parameters for policies. This will be updated
     on each worker process
    :param dynamics_params: sent to every worker through
     ``_worker_set_dynamics_params``
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    policy_payload = [(policy_params,)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_policy_params, policy_payload)

    # Set dynamics params.
    dynamics_payload = [(dynamics_params,)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_dynamics_params, dynamics_payload)

    # The worker receives `mode` before `show_rollout_chance` and `hide_tmax`,
    # which differs from the order in this function's signature.
    worker_args = (
        max_path_length,
        itr,
        normalize_reward,
        reward_mean,
        reward_std,
        kl_batch_size,
        n_itr_update,
        use_replay_pool,
        obs_mean,
        obs_std,
        act_mean,
        act_std,
        second_order_update,
        use_hide,
        use_hide_alg,
        mode,
        show_rollout_chance,
        hide_tmax,
    )

    # Original note (translated): roll out a random number of paths without
    # exceeding max_samples (params['batch_size']).
    # NOTE(review): the docstring says the sample count may exceed
    # max_samples -- confirm which statement is accurate.
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=worker_args,
        show_prog_bar=False,
    )
def sample_paths_a2c(policy_params, max_samples, ma_mode,
                     max_path_length=np.inf, env_params=None, scope=None):
    """
    Update the workers' parameters and collect paths with
    ``_worker_collect_path_one_env_a2c``.

    :param policy_params: sent to every worker, together with ``ma_mode`` and
     ``scope``, through ``_worker_set_policy_params``
    :param max_samples: passed to the pool as the collection threshold
    :param ma_mode: forwarded unchanged to the policy setter and to the
     collecting worker
    :param max_path_length: forwarded unchanged to the collecting worker
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :return: the value returned by ``singleton_pool.run_collect``
    """
    policy_payload = [(policy_params, ma_mode, scope)] * singleton_pool.n_parallel
    singleton_pool.run_each(_worker_set_policy_params, policy_payload)

    if env_params is not None:
        env_payload = [(env_params, scope)] * singleton_pool.n_parallel
        singleton_pool.run_each(_worker_set_env_params, env_payload)

    worker_args = (max_path_length, ma_mode, scope)
    return singleton_pool.run_collect(
        _worker_collect_path_one_env_a2c,
        threshold=max_samples,
        args=worker_args,
        show_prog_bar=True,
    )
def sample_paths(
        policy_params,
        dynamics_params,
        max_samples,
        max_path_length=np.inf,
        itr=None,
        normalize_reward=None,
        reward_mean=None,
        reward_std=None,
        kl_batch_size=None,
        n_itr_update=None,
        use_replay_pool=None,
        obs_mean=None,
        obs_std=None,
        act_mean=None,
        act_std=None,
        second_order_update=None
):
    """
    Send policy and dynamics parameters to the workers, then collect paths.

    Apart from the parameters documented below, every argument is forwarded
    unchanged, in signature order, to ``_worker_collect_one_path``.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param dynamics_params: sent to every worker through
     ``_worker_set_dynamics_params``
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    # Each setter gets one single-element argument tuple per worker.
    for setter, params in (
            (_worker_set_policy_params, policy_params),
            (_worker_set_dynamics_params, dynamics_params),
    ):
        singleton_pool.run_each(setter, [(params,)] * singleton_pool.n_parallel)

    worker_args = (
        max_path_length,
        itr,
        normalize_reward,
        reward_mean,
        reward_std,
        kl_batch_size,
        n_itr_update,
        use_replay_pool,
        obs_mean,
        obs_std,
        act_mean,
        act_std,
        second_order_update,
    )
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=worker_args,
        show_prog_bar=True,
    )
def sample_paths(policy_params, max_samples, ma_mode, max_path_length=np.inf,
                 env_params=None, scope=None):
    """
    Update the workers' parameters and collect paths with
    ``_worker_collect_path_one_env``.

    :param policy_params: sent to every worker, together with ``ma_mode`` and
     ``scope``, through ``_worker_set_policy_params``; must be a list when
     ``ma_mode`` is ``'concurrent'`` (asserted)
    :param max_samples: passed to the pool as the collection threshold
    :param ma_mode: forwarded unchanged to the policy setter and to the
     collecting worker
    :param max_path_length: forwarded unchanged to the collecting worker
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :return: the value returned by ``singleton_pool.run_collect``
    """
    if ma_mode == 'concurrent':
        assert isinstance(policy_params, list)

    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, ma_mode, scope)] * singleton_pool.n_parallel,
    )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel,
        )

    worker_args = (max_path_length, ma_mode, scope)
    return singleton_pool.run_collect(
        _worker_collect_path_one_env,
        threshold=max_samples,
        args=worker_args,
        show_prog_bar=True,
    )
def sample_paths(policy_params, max_samples, max_path_length=np.inf,
                 env_params=None, scope=None, reset_arg=None,
                 show_prog_bar=True, multi_task=False, extra_infos=None,
                 taskIdx=0):
    """
    Update the workers' policy parameters and collect a fixed number of paths.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected;
     here it is divided by ``max_path_length`` to obtain the ``numPaths``
     value handed to the pool
    :param max_path_length: horizon / maximum length of a single trajectory
    :param env_params: accepted but not used by this function
    :param scope: forwarded unchanged to the workers
    :param reset_arg: wrapped in ``np.array`` and forwarded to the collecting
     worker
    :param show_prog_bar: forwarded to ``run_collect``
    :param multi_task: forwarded to ``run_collect``
    :param extra_infos: forwarded unchanged to the collecting worker
    :param taskIdx: forwarded unchanged to the collecting worker
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, scope)] * singleton_pool.n_parallel,
    )

    # NOTE(review): with the default max_path_length of np.inf this quotient
    # is 0.0 -- callers presumably always pass a finite horizon; confirm.
    path_count = max_samples / max_path_length

    # Every worker receives the very same argument tuple object.
    shared_args = (max_path_length, scope, np.array(reset_arg), taskIdx, extra_infos)
    per_worker_args = [shared_args] * singleton_pool.n_parallel

    return singleton_pool.run_collect(
        _worker_collect_one_path,
        numPaths=path_count,
        args_list=per_worker_args,
        show_prog_bar=show_prog_bar,
        multi_task=multi_task,
    )
def _mean_reward_by_model_params(paths):
    """
    Average the total path reward over paths whose final
    ``env_infos['model_parameters']`` entry is identical.

    :param paths: collected paths; each must provide
     ``env_infos['model_parameters']`` and ``rewards``
    :return: list of ``[model_parameters, mean_total_reward]`` pairs sorted by
     ascending mean reward
    """
    entries = [
        [np.array(path['env_infos']['model_parameters'][-1]), path['rewards'].sum()]
        for path in paths
    ]
    # Sorting on the string form puts identical parameter vectors next to
    # each other so that they can be grouped in a single pass.
    entries.sort(key=lambda entry: str(entry[0]))

    averaged = []
    start = 0
    # Run up to and including the last entry: stopping at len - 1 (as the
    # previous loop did) silently dropped a trailing group of size one.
    while start < len(entries):
        params, reward_sum = entries[start]
        count = 1
        for idx in range(start + 1, len(entries)):
            if (entries[idx][0] - params).any():
                break
            reward_sum += entries[idx][1]
            count += 1
        averaged.append([np.array(params), reward_sum * 1.0 / count])
        start += count

    averaged.sort(key=lambda entry: entry[1])
    return averaged


def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        iter=0,
        env=None,
        policy=None,
        baseline=None,
        sim_percentage=1.0/3.0,
        target_task=None):
    """
    Collect paths from the worker pool and, when ensemble dynamics are
    enabled, refit the first dynamics model on the collected transitions.

    :param policy_params: parameters for the policy. This will be updated on
     each worker process
    :param max_samples: desired maximum number of samples to be collected. The
     actual number of collected samples might be greater since all
     trajectories will be rolled out either until termination or until
     max_path_length is reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param env_params: if not None, sent to every worker through
     ``_worker_set_env_params``
    :param scope: forwarded unchanged to the workers
    :param iter: iteration index; selects the two-phase collection (``> 0``
     with ensemble dynamics enabled), the number of fit iterations and the
     name of the reward snapshot file
    :param env: not used by the active code
    :param policy: not used by the active code
    :param baseline: sent to the workers under the key ``'baseline'`` during
     two-phase collection
    :param sim_percentage: fraction of ``max_samples`` used as threshold for
     the first collection phase; the second phase uses the remainder
    :param target_task: if not None, sent to the workers under the key
     ``'target_task'``
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, scope)] * singleton_pool.n_parallel
    )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )
    if target_task is not None:
        singleton_pool.run_each(
            _worker_update_dyn,
            [('target_task', target_task, scope)] * singleton_pool.n_parallel
        )

    if singleton_pool.G.ensemble_dynamics['use_ens_dyn'] and iter > 0:
        # Phase one: collect with dynamics model choice 0.
        singleton_pool.run_each(
            _worker_update_dyn,
            [('dyn_model_choice', 0, scope)] * singleton_pool.n_parallel
        )
        result1 = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples * sim_percentage,
            args=(max_path_length, scope),
            show_prog_bar=True
        )
        # Phase two: switch to model choice 1 and hand the phase-one paths
        # and the baseline to the workers before collecting the remainder.
        singleton_pool.run_each(
            _worker_update_dyn,
            [('dyn_model_choice', 1, scope)] * singleton_pool.n_parallel
        )
        singleton_pool.run_each(
            _worker_update_dyn,
            [('base_paths', result1, scope)] * singleton_pool.n_parallel
        )
        singleton_pool.run_each(
            _worker_update_dyn,
            [('baseline', baseline, scope)] * singleton_pool.n_parallel
        )
        result2 = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples * (1 - sim_percentage),
            args=(max_path_length, scope),
            show_prog_bar=True
        )
        result = result1 + result2
    else:
        result = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=(max_path_length, scope),
            show_prog_bar=True
        )
    logger.log('Collected Traj Num: '+str(len(result)))

    # Snapshot the mean reward per model-parameter vector. The `result`
    # check avoids an IndexError when no path was collected.
    if (result and 'model_parameters' in result[0]['env_infos']
            and logger._snapshot_dir is not None):
        mp_rew = _mean_reward_by_model_params(result)
        filename = logger._snapshot_dir + '/mp_rew_' + str(iter) + '.pkl'
        # Context manager so the file handle is closed (and flushed) promptly.
        with open(filename, 'wb') as snapshot_file:
            pickle.dump(mp_rew, snapshot_file)

    if singleton_pool.G.ensemble_dynamics['use_ens_dyn']:
        ens_dyn = singleton_pool.G.ensemble_dynamics
        max_buffer_size = 10000

        # After the first iteration only the phase-one paths are used as
        # training data for the dynamics model.
        dyn_training_result = result1 if iter > 0 else result
        dyn_training_x = []
        dyn_training_y = []
        for path in dyn_training_result:
            dyn_training_x.extend(path['env_infos']['state_act'])
            dyn_training_y.extend(path['env_infos']['next_state'])

        ens_dyn['training_buffer_x'] += dyn_training_x
        ens_dyn['training_buffer_y'] += dyn_training_y
        # Keep only the most recent transitions.
        if len(ens_dyn['training_buffer_x']) > max_buffer_size:
            ens_dyn['training_buffer_x'] = ens_dyn['training_buffer_x'][-max_buffer_size:]
            ens_dyn['training_buffer_y'] = ens_dyn['training_buffer_y'][-max_buffer_size:]

        # `iter % 1 == 0` holds for every integer iter, so the model is refit
        # on each call (presumably kept as a hook for refitting every k-th
        # iteration).
        if iter % 1 == 0:
            # Long initial fit, short refinement afterwards.
            optimize_iter = 100 if iter == 0 else 5
            ens_dyn['dyn_models'][0].fit(
                ens_dyn['training_buffer_x'],
                ens_dyn['training_buffer_y'],
                iter=optimize_iter
            )
            # Fitting and broadcasting the 'transition_locator' was disabled
            # in the original code; only the dynamics models are updated.
            print('fitted dynamic models and transition locator')
            singleton_pool.run_each(
                _worker_update_dyn,
                [('dyn_models', ens_dyn['dyn_models'], scope)] * singleton_pool.n_parallel
            )
            if logger._snapshot_dir is not None:
                joblib.dump(
                    ens_dyn['dyn_models'],
                    logger._snapshot_dir+'/dyn_models.pkl',
                    compress=True
                )

    # A disabled (string-quoted) step that augmented `result` with synthetic
    # single-step paths used to live here; it never executed and was removed.
    return result