def sample_paths_one_core(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None,
                          mode='sample'):
    """
    params:
    N            : number of sample points
    policy       : policy to be used to sample the data
    T            : maximum length of trajectory
    env          : env object to sample from
    env_name     : name of env to be sampled from
                   (one of env or env_name must be specified)
    pegasus_seed : seed for environment (numpy seed must be set externally)
    """
    if env_name is None and env is None:
        print("No environment specified! Error will be raised")
    if env is None:
        env = get_environment(env_name)
    if pegasus_seed is not None:
        env.env._seed(pegasus_seed)
    T = min(T, env.horizon)

    start_time = timer.time()
    print("####### Gathering Samples #######")
    sampled_so_far = 0
    paths = []
    seed = pegasus_seed if pegasus_seed is not None else 0

    # Keep collecting single rollouts (incrementing the seed each time) until
    # at least N transitions have been gathered.
    while sampled_so_far < N:
        if mode == 'sample':
            this_path = base_sampler.do_rollout(1, policy, T, env, env_name,
                                                seed)  # do 1 rollout
        elif mode == 'evaluation':
            this_path = eval_sampler.do_evaluation_rollout(1, policy, env,
                                                           env_name, seed)
        else:
            print("Mode has to be either 'sample' for training time or "
                  "'evaluation' for test time performance")
            break
        paths.append(this_path[0])
        seed += 1
        sampled_so_far += len(this_path[0]["rewards"])

    print("======= Samples Gathered ======= | >>>> Time taken = %f " %
          (timer.time() - start_time))
    print("................................. | >>>> # samples = %i "
          "# trajectories = %i " % (sampled_so_far, len(paths)))
    return paths
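# Usage sketch (not part of the library API; assumes the RandomPolicy helper
# from base_sampler and the Pendulum-v0 environment used in the __main__
# blocks of the other modules). Kept as a comment so importing this module
# stays side-effect free:
#
#   pol = base_sampler.RandomPolicy((-2, 2))
#   paths = sample_paths_one_core(N=500, policy=pol, T=200,
#                                 env_name="Pendulum-v0", pegasus_seed=123)
#   # at least N transitions are collected, one whole rollout at a time
#   print(sum(len(p["rewards"]) for p in paths), len(paths))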
def sample_paths_parallel(N,
                          policy,
                          T=1e2,
                          env_name=None,
                          pegasus_seed=None,
                          num_cpu='max',
                          max_process_time=300,
                          max_timeouts=4,
                          suppress_print=False,
                          mode='sample'):
    """
    Gathers samples in parallel by splitting N across num_cpu worker
    processes; each worker performs ceil(N / num_cpu) rollouts. When a
    pegasus_seed is given, each worker receives a distinct seed offset.
    """
    if num_cpu is None or num_cpu == 'max':
        num_cpu = mp.cpu_count()
    elif num_cpu == 1:
        return base_sampler.do_rollout(N, policy, T, None, env_name,
                                       pegasus_seed)
    else:
        num_cpu = min(mp.cpu_count(), num_cpu)

    paths_per_cpu = int(np.ceil(N / num_cpu))
    args_list = []
    for i in range(num_cpu):
        if pegasus_seed is None:
            args_list_cpu = [paths_per_cpu, policy, T, None, env_name,
                             pegasus_seed]
        else:
            args_list_cpu = [paths_per_cpu, policy, T, None, env_name,
                             pegasus_seed + i * paths_per_cpu]
        args_list.append(args_list_cpu)

    # Do multiprocessing
    if not suppress_print:
        start_time = timer.time()
        print("####### Gathering Samples #######")

    results = _try_multiprocess(args_list, num_cpu, max_process_time,
                                max_timeouts, mode)

    # each result is itself a list of paths; flatten into a single list
    paths = []
    for result in results:
        for path in result:
            paths.append(path)

    if not suppress_print:
        print("======= Samples Gathered ======= | >>>> Time taken = %f " %
              (timer.time() - start_time))

    return paths
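# Usage sketch (hedged): gather samples across all available cores. When a
# pegasus_seed is given, each worker's seed is offset by i * paths_per_cpu so
# parallel rollouts do not reuse the same seed:
#
#   pol = base_sampler.RandomPolicy((-2, 2))
#   paths = sample_paths_parallel(N=100, policy=pol, T=100,
#                                 env_name="Pendulum-v0", pegasus_seed=123,
#                                 num_cpu='max')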
def sample_paths(N,
                 policy,
                 T=1e2,
                 env=None,
                 env_name=None,
                 pegasus_seed=None,
                 mode='sample'):
    """
    Function to sample paths.
    :param mode : 'sample' uses base_sampler rollouts (training time),
                  'evaluation' uses eval_sampler rollouts (test time)
    :return     : list of path dictionaries, each holding all data for one path
    """
    if mode == 'sample':
        return base_sampler.do_rollout(N, policy, T, env, env_name,
                                       pegasus_seed)
    elif mode == 'evaluation':
        return eval_sampler.do_evaluation_rollout(N, policy, env, env_name,
                                                  pegasus_seed)
    else:
        print("Mode has to be either 'sample' for training time or "
              "'evaluation' for test time performance")
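if __name__ == "__main__":
    # Minimal smoke test, mirroring the __main__ blocks of the baseline and
    # utility modules. RandomPolicy is assumed to live in base_sampler, as it
    # is used elsewhere in this repo; adjust the import if your layout differs.
    pol = base_sampler.RandomPolicy((-2, 2))
    train_paths = sample_paths(N=2, policy=pol, T=5,
                               env_name="Pendulum-v0", pegasus_seed=123)
    # mode='evaluation' would route through eval_sampler.do_evaluation_rollout
    # instead of base_sampler.do_rollout.
    print(len(train_paths), [len(p["rewards"]) for p in train_paths])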
                                               observations.shape[0]),
                                               size=2,
                                               replace=False)
        first_obs = observations[first_obs]
        second_obs = observations[second_obs]
        pairwise_dist = np.linalg.norm(first_obs - second_obs)
        array[i] = pairwise_dist
    mean = np.mean(array)
    return mean


if __name__ == "__main__":
    # Smoke test: roll out a random policy on Pendulum-v0, compute returns and
    # GAE advantages with a linear baseline, then print the average pairwise
    # distance between randomly sampled observations.
    # x = [0, 0, 0, 0, 1]
    # gamma = 0.9
    # y = discount_sum(x, gamma)
    # print(y)
    N = 1
    pol = base_sampler.RandomPolicy((-2, 2))
    T = 5
    env_name = "Pendulum-v0"
    env = GymEnv(env_name)
    gamma = 0.9
    paths = base_sampler.do_rollout(N, pol, T, env)
    compute_returns(paths, gamma)
    baseline = linear_baseline.LinearBaseline(env.spec)
    compute_advantages(paths, baseline, gamma, gae_lambda=0.9)
    mean = get_avg_step_distance(paths)
    print(mean)
        Predicts the value function for each state using the fitted weights.
        :param path : path dictionary as returned by base_sampler
        :return     : array with the predicted value for each state in the path
        """
        if self._coeffs is None:
            return np.zeros(len(path["rewards"]))
        return self._features(path).dot(self._coeffs)


if __name__ == "__main__":
    # Sanity check: sample two rollouts with a random policy, compute their
    # returns, fit the linear baseline on the first batch, and inspect the
    # feature matrix (the commented lines probe predictions on held-out paths).
    N = 1
    pol = RandomPolicy((-2, 2))
    T = 5
    env_name = 'Pendulum-v0'
    y = base_sampler.do_rollout(N=N, policy=pol, T=T, env_name=env_name)
    y2 = base_sampler.do_rollout(N=N, policy=pol, T=T, env_name=env_name)
    # print(y)
    env = GymEnv(env_name)
    base_line = LinearBaseline(env.spec)
    features = base_line._features(y[0])
    print(features)
    compute_returns(y, 1)
    compute_returns(y2, 1)
    # print('returns: ', y[0]['returns'])
    # print(base_line.predict(y[0]))
    errors = base_line.fit(y, True)
    # print('y : ', y)
    # print('returns: ', y2[4]['returns'])
    # print('predict: ', base_line.predict(y2[4]))