def load_and_run_model(env, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)
def work_acer(self):
    b_states = [None]
    done = True
    step = 0
    print(self.name, " using ", self.offline_steps, " offline steps per online step")
    while step < self.MAX_STEPS:
        self.agent.update_target()
        # n-step rollout from the environment, with n = RETURN_STEPS or until done.
        b_states, b_actions, b_rewards, b_mus, done = rollout(
            self.agent, self.env, [b_states[-1]], done, self.RETURN_STEPS)
        pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)
        importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
        importance_weights_a = np.take(
            np.reshape(importance_weights, [-1]),
            np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions)
        # Calculate retrace targets.
        retrace_targets = q_retrace(b_rewards, done, q_a, val, importance_weights_a, self.DISCOUNT)
        # Update step; returns the current global step (the summary is not used here).
        _, step = self.agent.update_step(b_states[:-1], b_actions, retrace_targets, importance_weights)
        # Append the trajectory to the replay buffer.
        self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))
        # Offline (replay) version: instead of a rollout, a trajectory is sampled from memory.
        if self.offline_steps > 0 and self.memory.can_sample():
            for _ in range(self.offline_steps):
                mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)
                importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                importance_weights_a = np.take(
                    np.reshape(importance_weights, [-1]),
                    np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions)
                retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val, importance_weights_a, self.DISCOUNT)
                _, step = self.agent.update_step(mem_states[:-1], mem_actions, retrace_targets, importance_weights)
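# ---------------------------------------------------------------------------
# The workers above call a q_retrace() helper that is not shown here. The
# sketch below is an assumption about what such a helper typically computes:
# Retrace targets (Munos et al., 2016), built backwards over the collected
# n-step segment with truncated importance weights c_t = min(1, rho_t). The
# argument names mirror the calls above, but `values` is assumed here to
# include a bootstrap value for the state after the segment; the real helper
# in this codebase may organise its inputs differently.
import numpy as np


def q_retrace(rewards, done, q_a, values, rho_a, discount):
    """Hypothetical Retrace target computation for one n-step segment.

    rewards : (n,)   rewards r_t
    done    : True if the segment ended in a terminal state
    q_a     : (n,)   Q(s_t, a_t) for the behaviour actions
    values  : (n+1,) state values V(s_t), including the bootstrap state
    rho_a   : (n,)   importance weights pi(a_t|s_t) / mu(a_t|s_t)
    """
    n = len(rewards)
    c = np.minimum(rho_a, 1.0)          # truncated importance weights
    targets = np.zeros(n)
    # Bootstrap from V(s_n); zero if the episode terminated.
    q_ret = 0.0 if done else values[-1]
    for t in reversed(range(n)):
        q_ret = rewards[t] + discount * q_ret
        targets[t] = q_ret
        # Correct the return towards Q/V before stepping one step back.
        q_ret = c[t] * (q_ret - q_a[t]) + values[t]
    return targets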
def eval(self):
    """Evaluate the Gaussian policy deterministically.

    Returns:
        np.array: Expected accumulated reward
    """
    # Put models in evaluation mode
    for model in self.trainable_models:
        model.eval()

    for rr in range(self.eval_rollouts):
        rollout_info = rollout(
            self.env, self.policy,
            max_horizon=self.max_horizon,
            fixed_horizon=self.fixed_horizon,
            render=self.render,
            return_info_dict=True,
            device=self.torch_device,
            deterministic=True,
        )
        self.logging_eval_rewards[rr] = torch.tensor(rollout_info['reward']).mean()
        self.logging_eval_returns[rr] = torch.tensor(rollout_info['reward']).sum()
        self.num_eval_interactions += 1

    gt.stamp('eval')
    return self.logging_eval_returns.mean().item()
def run():
    env = gym.make('CorridorSmall-v10')
    action_space = list(range(env.action_space.n))
    q = Approximator_ResidualBoosting(action_space)
    initial_learning_rate = 0.15
    learning_rate = initial_learning_rate
    initial_epsilon = 0.15
    epsilon = initial_epsilon
    batch_size = 10

    for learning_iteration in range(1000):
        policy = Policy_EpsilonGreedy(q, epsilon)
        episodes = [rollout(policy, env) for _ in range(batch_size)]
        targets = TD0_targets(episodes, q)
        X, Y_target = zip(*targets)
        Y_target = np.reshape(Y_target, (-1, 1))
        learning_rate = decay(initial_learning_rate, learning_iteration)
        epsilon = decay(initial_epsilon, learning_iteration)
        q.learn(learning_rate, X, Y_target)

        if learning_iteration % 1 == 0:  # evaluate every iteration
            greedy_policy = Policy_Greedy(q)
            reward_sum = avg(test_policy(greedy_policy, env) for _ in range(10))
            print(f"Episode {learning_iteration*batch_size} Reward {reward_sum} "
                  f"lr {learning_rate} epsilon {epsilon}")
def rollout_plans(env: LegacyEnv, plans: np.ndarray, states: np.ndarray):
    returns = np.empty((plans.shape[0], plans.shape[1]))
    assert len(returns.shape) == 2
    assert len(plans.shape) == 4
    for i in range(plans.shape[0]):
        for j in range(plans.shape[1]):
            returns[i, j] = rollout(plans[i, j], env, states[j])
    return returns
def work_and_eval_acer(self, net_saver, TB_DIR, evalrewards=[]):
    b_states = [None]
    done = True
    step = 0
    runningreward = 1
    bestreward = 0
    rewardlist = []
    if evalrewards != []:
        runningreward = evalrewards[-1]
        print(runningreward)
    next_verbose = 0
    summary_writer = tf.summary.FileWriter(TB_DIR + "/tb", self.sess.graph, flush_secs=30)
    print(self.name, " using ", self.offline_steps, " offline steps per online step")
    while step < self.MAX_STEPS:
        self.agent.update_target()
        # n-step rollout from the environment, with n = RETURN_STEPS or until done.
        b_states, b_actions, b_rewards, b_mus, done = rollout(
            self.agent, self.env, [b_states[-1]], done, self.RETURN_STEPS)
        pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)
        rewardlist.append(np.sum(b_rewards))
        importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
        importance_weights_a = np.take(
            np.reshape(importance_weights, [-1]),
            np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions)
        retrace_targets = q_retrace(b_rewards, done, q_a, val, importance_weights_a, self.DISCOUNT)
        summary, step = self.agent.update_step(b_states[:-1], b_actions, retrace_targets, importance_weights)
        self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))

        if done:
            # Track best and exponentially smoothed episode returns.
            bestreward = np.maximum(bestreward, np.sum(rewardlist))
            runningreward = 0.9 * runningreward + 0.1 * np.sum(rewardlist)
            evalrewards.append(runningreward)
            np.savetxt(TB_DIR + "reward.out", evalrewards)
            rewardlist = []
            if step > next_verbose:
                print("Worker ", self.name, " at ", step, " running/max reward: ",
                      runningreward, bestreward, " frames: ", self.memory.counter)
                print("pi:", self.agent.get_pi(b_states[-1]))
                print("Saving model")
                next_verbose += self.MAX_STEPS / 100
                net_saver.save(self.sess, TB_DIR + "checkpoints/model" + str(step) + ".cptk")
        if summary is not None:
            summary_writer.add_summary(summary, step)

        if self.offline_steps > 0 and self.memory.can_sample():
            for _ in range(self.offline_steps):
                mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)
                importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                importance_weights_a = np.take(
                    np.reshape(importance_weights, [-1]),
                    np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions)
                retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val, importance_weights_a, self.DISCOUNT)
                summary, step = self.agent.update_step(mem_states[:-1], mem_actions, retrace_targets, importance_weights)
def run(env, config):
    action_space = list(range(env.action_space.n))
    replay_buffer = Replay_buffer()
    q = Approximator_ResidualBoosting(action_space)
    learning_rate = config.initial_learning_rate
    epsilon = config.initial_epsilon
    interaction_count = 0

    for learning_iteration in range(config.learning_iterations):
        if learning_iteration % 1 == 0:  # evaluate every iteration
            greedy_policy = Policy_Greedy(q)
            reward_sum = avg(test_rollout(greedy_policy, env)
                             for _ in range(config.test_rollouts))
            print(f"Episode {learning_iteration*config.rollout_batch_size:05d} "
                  f"Reward {reward_sum:05f} lr {learning_rate:05f} epsilon {epsilon:05f}")
            yield interaction_count, reward_sum

        policy = Policy_EpsilonGreedy(q, epsilon=epsilon)
        episodes = [list(rollout(policy, env))
                    for _ in range(config.rollout_batch_size)]
        interaction_count += sum(map(len, episodes))
        replay_buffer += episodes

        sampled_episodes = replay_buffer.sample(config.replay_batch_size)
        targets = TD0_targets(sampled_episodes, q, config.discount)
        X, Y_target = zip(*targets)
        Y_target = np.reshape(Y_target, (-1, 1))
        learning_rate = decay(config.initial_learning_rate,
                              learning_iteration * config.rollout_batch_size)
        epsilon = decay(config.initial_epsilon,
                        learning_iteration * config.rollout_batch_size)
        q.learn(learning_rate, X, Y_target)
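# ---------------------------------------------------------------------------
# Hypothetical usage of the generator above: run() yields
# (interaction_count, reward_sum) pairs, so a driver can consume it lazily and
# record a learning curve. The SimpleNamespace config and the environment id
# below are illustrative assumptions, not part of the original code.
from types import SimpleNamespace

import gym

config = SimpleNamespace(
    learning_iterations=1000,
    rollout_batch_size=10,
    replay_batch_size=32,
    test_rollouts=10,
    initial_learning_rate=0.15,
    initial_epsilon=0.15,
    discount=0.99,
)
env = gym.make('CorridorSmall-v10')  # any discrete-action gym env with this API
learning_curve = list(run(env, config))  # [(interaction_count, reward_sum), ...]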
parser.add_argument('--max_length', type=int, default=1000,
                    help='Max length of rollout')
parser.add_argument('--speedup', type=int, default=1, help='Speedup')
parser.add_argument('--loop', type=int, default=1, help='# of loops')
args = parser.parse_args()

policy = None
env = None
while True:
    if ':' in args.file:
        # Fetch the snapshot file over ssh.
        os.system("rsync -avrz %s /tmp/%s.pkl" % (args.file, filename))
        data = joblib.load("/tmp/%s.pkl" % filename)
        if policy:
            # Reuse the existing policy object, only updating its parameters.
            new_policy = data['policy']
            policy.set_param_values(new_policy.get_param_values())
        else:
            policy = data['policy']
            env = data['env']
        path = rollout(env, policy, max_path_length=args.max_length,
                       animated=True, speedup=args.speedup)
    else:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        path = rollout(env, policy)
        break
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)
    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])
    th = 0.2

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
                  m_init=m_init, S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        # Predict the trajectory and the constraint-violation risk at every step.
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (1 - estimate_risk3)

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
        _, _, r = pilco.predict(m_init, S_init, T)
def safe_cars(seed=0):
    T = 25
    th = 0.10
    np.random.seed(seed)
    J = 5
    N = 5
    eval_runs = 5
    env = LinearCars()

    # Initial random rollouts to generate a dataset
    X1, Y1, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1, 5):
        X1_, Y1_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X1 = np.vstack((X1, X1_))
        Y1 = np.vstack((Y1, Y1_))

    env = Normalised_Env(np.mean(X1[:, :4], 0), np.std(X1[:, :4], 0))
    X, Y, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    m_init = np.transpose(X[0, :-1, None])
    S_init = 0.1 * np.eye(state_dim)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=40, max_action=0.2)

    # w1 = np.diag([1.5, 0.001, 0.001, 0.001])
    # t1 = np.divide(np.array([3.0, 1.0, 3.0, 1.0]) - env.m, env.std)
    # R1 = ExponentialReward(state_dim=state_dim, t=t1, W=w1)
    # R1 = LinearReward(state_dim=state_dim, W=np.array([0.1, 0.0, 0.0, 0.0]))
    R1 = LinearReward(state_dim=state_dim,
                      W=np.array([1.0 * env.std[0], 0., 0., 0.]))

    bound_x1 = 1 / env.std[0]
    bound_x2 = 1 / env.std[2]
    B = RiskOfCollision(
        2,
        [-bound_x1 - env.m[0] / env.std[0], -bound_x2 - env.m[2] / env.std[2]],
        [bound_x1 - env.m[0] / env.std[0], bound_x2 - env.m[2] / env.std[2]])

    pilco = SafePILCO((X, Y), controller=controller, mu=-300.0, reward_add=R1,
                      reward_mult=B, horizon=T, m_init=m_init, S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    # define tolerance
    new_data = True
    # init = tf.global_variables_initializer()
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("***ITERATION**** ", rollouts)
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=20, restarts=2)

        # check safety
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risks = np.zeros(T)
        predicted_rewards = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risks[h], _ = B.compute_reward(m_h, S_h)
            predicted_rewards[h], _ = R1.compute_reward(m_h, S_h)
        overall_risk = 1 - np.prod(1.0 - predicted_risks)

        print("Predicted episode's return: ", sum(predicted_rewards))
        print("Overall risk ", overall_risk)
        print("Mu is ", pilco.mu.numpy())
        print("bound1 ", bound_x1, " bound2 ", bound_x2)

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            new_data = True
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            pilco.mgpr.set_data((X, Y))
            if overall_risk < (th / 4):
                pilco.mu.assign(0.75 * pilco.mu.numpy())
        else:
            X_new, Y_new, _, _ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            print(m_p[:, 0] - X_new[:, 0])
            print(m_p[:, 2] - X_new[:, 2])
            print("*********CHANGING***********")
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
            # To verify this actually changes, run the reward wrapper before and
            # after on the same trajectory.
            pilco.mu.assign(1.5 * pilco.mu.numpy())
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
args.epsilon_decay_factor = 0.99
args.lr = 0.001
args.gamma = 0.90

policy = DQNPolicy(make_dqn(statesize, actionsize), statesize, actionsize,
                   lr=args.lr, gamma=args.gamma)
utils.qlearn(env, policy, args)
torch.save(policy.model, args.model)

# From here on, taken from mp7.py
# Environment (a Markov Decision Process model)
# Q Model
model = utils.loadmodel(args.model, env, statesize, actionsize)
print("Model: {}".format(model))

# Rollout
_, rewards = utils.rollout(env, model, args.episodes, args.epsilon, render=True)

# Report: evaluate total rewards for the MountainCar environment
score = np.array([np.array(rewards) > -200.0]).sum()
print('Score: ' + str(score) + '/' + str(args.episodes))
def compare_grad(args):
    set_seed(args.seed)
    env = LQR(
        N=args.xu_dim[0],
        M=args.xu_dim[1],
        lims=100,
        init_scale=1.0,
        max_steps=args.H,
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=args.noise,
    )
    # K = env.optimal_controller()
    K = np.random.randn(env.M, env.N)
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1], mean_network,
                            learn_std=False, gate_output=False)
    out_set = set()
    Sigma_a = np.diag(np.ones(env.M))

    # Plain Monte Carlo gradient estimates.
    mc_grads = []
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        states, actions, rewards, _, _ = rollout(env, policy, noises)
        if len(states) < args.H:
            out_set.add('mc')
            break
        mc_grads.append(get_gaussian_policy_gradient(
            states, actions, rewards, policy, variance_reduced_loss))
    mc_grads = np.asarray(mc_grads)
    mc_means = np.cumsum(mc_grads, axis=0) / np.arange(
        1, len(mc_grads) + 1)[:, np.newaxis, np.newaxis]

    # RQMC gradient estimates.
    rqmc_grads = []
    # loc = torch.zeros(env.max_steps * env.M)
    # scale = torch.ones(env.max_steps * env.M)
    # rqmc_noises = Normal_RQMC(loc, scale).sample(torch.Size([args.n_trajs])).data.numpy()
    rqmc_noises = uniform2normal(
        random_shift(
            ssj_uniform(args.n_trajs, args.H * env.M).reshape(args.n_trajs, args.H, env.M),
            0,
        ))
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        states, actions, rewards, _, _ = rollout(
            env, policy, rqmc_noises[i].reshape(env.max_steps, env.M))
        if len(states) < args.H:
            out_set.add('rqmc')
            break
        rqmc_grads.append(get_gaussian_policy_gradient(
            states, actions, rewards, policy, variance_reduced_loss))
    rqmc_grads = np.asarray(rqmc_grads)
    rqmc_means = np.cumsum(rqmc_grads, axis=0) / np.arange(
        1, len(rqmc_grads) + 1)[:, np.newaxis, np.newaxis]

    # Array-RQMC gradient estimates, one run per sorter.
    arqmc_means_dict = {}
    # arqmc_noises = get_rqmc_noises(args.n_trajs, args.H, env.M, 'array')
    uniform_noises = ssj_uniform(args.n_trajs, env.M)  # n_trajs, action_dim
    arqmc_noises = uniform2normal(
        random_shift(np.expand_dims(uniform_noises, 1).repeat(args.H, 1),
                     0))  # n_trajs, horizon, action_dim
    for sorter in args.sorter:
        arqmc_grads = []
        sort_f = get_sorter(sorter, env, K)
        data = ArrayRQMCSampler(env, args.n_trajs, sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            states, actions, rewards = (np.asarray(traj['states']),
                                        np.asarray(traj['actions']),
                                        np.asarray(traj['rewards']))
            if len(states) < args.H:
                out_set.add('arqmc_{}'.format(sorter))
                break
            arqmc_grads.append(get_gaussian_policy_gradient(
                states, actions, rewards, policy, variance_reduced_loss))
        arqmc_grads = np.asarray(arqmc_grads)
        arqmc_means = np.cumsum(arqmc_grads, axis=0) / np.arange(
            1, len(arqmc_grads) + 1)[:, np.newaxis, np.newaxis]
        arqmc_means_dict[sorter] = arqmc_means

    expected_grad = env.expected_policy_gradient(K, Sigma_a)
    mc_errors = [np.nan] if 'mc' in out_set else (
        (mc_means - expected_grad)**2).reshape(mc_means.shape[0], -1).mean(1)  # why is the sign reversed?
    rqmc_errors = [np.nan] if 'rqmc' in out_set else (
        (rqmc_means - expected_grad)**2).reshape(rqmc_means.shape[0], -1).mean(1)
    arqmc_errors_dict = {
        sorter: [np.nan] if 'arqmc_{}'.format(sorter) in out_set else
        ((arqmc_means - expected_grad)**2).reshape(arqmc_means.shape[0], -1).mean(1)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    info = {
        **vars(args),
        'out': out_set,
        'expected_grad': expected_grad,
        'means': {
            'mc': mc_means,
            'rqmc': rqmc_means,
            **arqmc_means_dict,
        },
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors,
                     rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict,
                     info=info), f)
    if args.show_fig:
        mc_data = pd.DataFrame({
            'name': 'mc',
            'x': np.arange(len(mc_errors)),
            'error': mc_errors,
        })
        rqmc_data = pd.DataFrame({
            'name': 'rqmc',
            'x': np.arange(len(rqmc_errors)),
            'error': rqmc_errors,
        })
        arqmc_data = pd.concat([
            pd.DataFrame({
                'name': 'arqmc_{}'.format(sorter),
                'x': np.arange(len(arqmc_errors)),
                'error': arqmc_errors,
            }) for sorter, arqmc_errors in arqmc_errors_dict.items()
        ])
        plot = sns.lineplot(x='x', y='error', hue='name',
                            data=pd.concat([mc_data, rqmc_data, arqmc_data]))
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info
def compare_cost(args):
    set_seed(args.seed)
    env = LQR(
        # N=20,
        # M=12,
        init_scale=1.0,
        max_steps=args.H,  # 10, 20
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=0.0,
    )
    K = env.optimal_controller()
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1], mean_network,
                            learn_std=False, gate_output=False)

    # mc
    mc_costs = []   # individual
    mc_means = []   # cumulative
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        _, _, rewards, _, _ = rollout(env, policy, noises)
        mc_costs.append(-rewards.sum())
        mc_means.append(np.mean(mc_costs))

    # rqmc
    rqmc_costs = []
    rqmc_means = []
    rqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'trajwise')
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        _, _, rewards, _, _ = rollout(env, policy, rqmc_noises[i])
        rqmc_costs.append(-rewards.sum())
        rqmc_means.append(np.mean(rqmc_costs))

    # array rqmc
    arqmc_costs_dict = {}
    arqmc_means_dict = {}
    arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'ssj')
    # arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'array')
    for sorter in args.sorter:
        arqmc_costs = []
        arqmc_means = []
        sort_f = get_sorter(sorter, env)
        data = ArrayRQMCSampler(env, args.n_trajs, sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            rewards = np.asarray(traj['rewards'])
            arqmc_costs.append(-rewards.sum())
            arqmc_means.append(np.mean(arqmc_costs))
        arqmc_costs_dict[sorter] = arqmc_costs
        arqmc_means_dict[sorter] = arqmc_means

    expected_cost = env.expected_cost(K, np.diag(np.ones(env.M)))
    mc_errors = np.abs(mc_means - expected_cost)
    rqmc_errors = np.abs(rqmc_means - expected_cost)
    arqmc_errors_dict = {
        sorter: np.abs(arqmc_means - expected_cost)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    logger.info('mc: {}, rqmc: {} '.format(mc_errors[-1], rqmc_errors[-1]) +
                ' '.join(['arqmc ({}): {}'.format(sorter, arqmc_errors[-1])
                          for sorter, arqmc_errors in arqmc_errors_dict.items()]))
    info = {
        **vars(args),
        'mc_costs': mc_costs,
        'rqmc_costs': rqmc_costs,
        'arqmc_costs': arqmc_costs_dict,
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors,
                     rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict,
                     info=info), f)
    if args.show_fig:
        data = pd.concat([
            pd.DataFrame({
                'name': 'mc',
                'x': np.arange(len(mc_errors)),
                'error': mc_errors,
            }),
            pd.DataFrame({
                'name': 'rqmc',
                'x': np.arange(len(rqmc_errors)),
                'error': rqmc_errors,
            }),
            pd.concat([
                pd.DataFrame({
                    'name': 'arqmc_{}'.format(sorter),
                    'x': np.arange(len(arqmc_errors)),
                    'error': arqmc_errors,
                }) for sorter, arqmc_errors in arqmc_errors_dict.items()
            ]),
        ])
        plot = sns.lineplot(x='x', y='error', hue='name', data=data)
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info
config.gpu_options.allow_growth = True

# Reward function parameters: lin_pos[3] + ang_pos[3] + lin_vel[3] + ang_vel[3]
#                   x    y    z       r    p    q    x.   y.   z.   r.   p.   q.
target = np.array([0.0, 0.0, 0.4075, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
weights = np.diag([0.3, 0.3, 2.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0, 1.0, 1.0, 0.2])
subs = 2
m_init = np.random.randn(12) * 0.01
S_init = m_init * 0.0 + 0.02

with tf.Session(config=config, graph=tf.Graph()) as sess:
    env = gym.make('VrepBalanceBot2-v0')

    # Initial random rollouts to generate a dataset
    X, Y = rollout(env=env, pilco=None, random=True, timesteps=80, SUBS=subs, render=False)
    for i in range(1, 12):  # uniform action sampling
        X_, Y_ = rollout(env=env, pilco=None, random=True, timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)
    for i in range(1, 24):  # Gaussian/normal action sampling
        X_, Y_ = rollout(env=env, pilco=None, random="Normal", timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)
    for i in range(1, 4):  # no action sampling; u := 0
        X_, Y_ = rollout(env=env, pilco=None, random=None, timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)

    state_dim = Y.shape[1]
# Load data into arrays
all_obs = np.zeros((args.num_rollouts, max_path_length, flat_obs))
all_rewards = np.zeros((args.num_rollouts, max_path_length))
rew = []

if args.weight:
    func = args.weight
    controller = control.StraightController(func)

for j in range(args.num_rollouts):
    # run a single rollout of the experiment
    path = rollout(env=env, agent=policy, controller=controller)

    # collect the observations and rewards from the rollout
    new_obs = path['observations']
    all_obs[j, :new_obs.shape[0], :new_obs.shape[1]] = new_obs
    new_rewards = path['rewards']
    all_rewards[j, :len(new_rewards)] = new_rewards

    # print the cumulative reward of the most recent rollout
    print("Round {}, return: {}".format(j, sum(new_rewards)))
    rew.append(sum(new_rewards))

# print the average cumulative reward across rollouts
print("Average, std return: {}, {}".format(np.mean(rew), np.std(rew)))

# ensure that a reward_plots folder exists in the directory, and if not,
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
env.seed(seed)

# Get models from file
itr_dir = 'itr_%03d' % args.iteration if args.iteration > -1 else 'last_itr'
models_dir = osp.join(args.log_dir, 'models', itr_dir)
policy_file = osp.join(models_dir, 'policy.pt')
policy = torch.load(policy_file, map_location=lambda storage, loc: storage)

print('\n' * 5)
print('--->horizon', horizon)

rollout(env, policy,
        max_horizon=horizon,
        fixed_horizon=True,
        render=True,
        return_info_dict=False,
        scale_pol_output=True,
        device='cpu',
        record_video_name=None,
        deterministic=not args.stochastic)

if not args.record:
    input("Press a key to close the script")
env.close()
    hidden_sizes=(32, 32),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=3000,
    max_path_length=env.horizon,
    n_itr=100,
    discount=0.995,
    step_size=0.01,
    plot=False,
)
algo.train()
rollout(env, policy)

# Pickle the trained policy (binary mode; file objects have no dump method).
import pickle
with open("models/rc_gradient/agentturn" + "policy.pkl", "wb") as f:
    pickle.dump(policy, f)

# run_experiment_lite(
#     algo.train(),
#     # Number of parallel workers for sampling
#     n_parallel=4,
#     # Only keep the snapshot parameters for the last iteration
#     snapshot_mode="last",
#     script="scripts/run_experiment_lite_rl.py",
#     # script="scripts/run_experiment_lite.py",
#     log_dir="Results/Tmp",
#     # Specifies the seed for the experiment. If this is not provided, a random seed
#     # will be used
import numpy as np
import gym
from pilco.models import PILCO
from pilco.controllers import RbfController, LinearController
from pilco.rewards import ExponentialReward
import tensorflow as tf
from tensorflow import logging

from utils import rollout, policy

np.random.seed(0)

with tf.Session(graph=tf.Graph()) as sess:
    env = gym.make('Pendulum-v0')

    # Initial random rollouts to generate a dataset
    X, Y = rollout(env=env, pilco=None, random=True, timesteps=40)
    for i in range(1, 3):
        X_, Y_ = rollout(env=env, pilco=None, random=True, timesteps=40)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=5)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim)

    pilco = PILCO(X, Y, controller=controller, horizon=40)
    # Example of a user-provided reward function with a custom target state:
    # R = ExponentialReward(state_dim=state_dim, t=np.array([0.1, 0, 0, 0]))
    # pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)
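# ---------------------------------------------------------------------------
# The PILCO scripts above import rollout (and policy) from a local utils
# module that is not shown here. The following is a minimal sketch of what
# such a helper typically does in these examples, assuming a gym-style env
# and a pilco.compute_action interface; the real utils.rollout may differ
# (some snippets above unpack three, four or five return values, e.g. with
# per-episode returns included).
import numpy as np


def rollout(env, pilco, timesteps, random=False, SUBS=1, render=False, verbose=False):
    """Hypothetical data-collection helper: returns GP inputs X = (state, action)
    and GP targets Y = state differences."""
    X, Y = [], []
    x = env.reset()
    for _ in range(timesteps):
        if render:
            env.render()
        # Random exploration before a controller exists; otherwise query the
        # (assumed) pilco.compute_action interface.
        if random or pilco is None:
            u = env.action_space.sample()
        else:
            u = pilco.compute_action(x[None, :])[0, :]
        # Repeat each action SUBS times to sub-sample the dynamics.
        for _ in range(SUBS):
            x_new, _, done, _ = env.step(u)
            if done:
                break
        if verbose:
            print("Action: ", u, " State: ", x_new)
        X.append(np.hstack((x, u)))   # GP input: state and action
        Y.append(x_new - x)           # GP target: state difference
        x = x_new
        if done:
            break
    return np.stack(X), np.stack(Y)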
weights[0, 0] = 1.0
weights[3, 3] = 1.0

m_init = np.zeros(state_dim)[None, :]
S_init = 0.005 * np.eye(state_dim)

T = 40
J = 5
N = 12
T_sim = 130
restarts = True
lens = []

env = DoublePendWrapper()

# Initial random rollouts to generate a dataset
X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, render=True)
for i in range(1, J):
    X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                           verbose=True, render=True)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
import numpy as np
import gym
import tensorflow as tf
from pilco.controllers import RbfController
from pilco.models import PILCO
from utils import rollout

np.random.seed(0)

with tf.Session(graph=tf.Graph()) as sess:
    env = gym.make('InvertedPendulum-v2')

    # Evaluate random actions first, so we know how bad random is
    random_rewards = []
    for i in range(1, 100):
        _, Y_, rewards = rollout(env=env, pilco=None, random=True, timesteps=40)
        random_rewards.append(sum(rewards))

    # Initial random rollouts to generate a dataset
    X, Y, _ = rollout(env=env, pilco=None, random=True, timesteps=40)
    random_rewards = []
    for i in range(1, 3):
        X_, Y_, rewards = rollout(env=env, pilco=None, random=True, timesteps=40)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
        random_rewards.append(sum(rewards))
max_action = 2.0  # used by the controller, but really defined by the environment

# Reward function parameters
target = np.array([1.0, 0.0, 0.0])
weights = np.diag([2.0, 2.0, 0.3])

# Environment-defined initial state distribution
m_init = np.reshape([-1.0, 0.0, 0.0], (1, 3))
S_init = np.diag([0.01, 0.01, 0.01])

# Initial random rollouts
X, Y = rollout(env, None, timesteps=T, verbose=False, random=True, SUBS=SUBS, render=True)
for i in range(1, J):
    X_, Y_ = rollout(env, None, timesteps=T, verbose=False, random=True, SUBS=SUBS, render=True)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))
print(X)
import numpy as np
import gym
from pilco.models import PILCO
from pilco.controllers import RbfController, LinearController
from pilco.rewards import ExponentialReward
import tensorflow as tf

from utils import policy, rollout, Normalised_Env

np.random.seed(0)

SUBS = 5
T = 25

env = gym.make('MountainCarContinuous-v0')

# Initial random rollouts to generate a dataset
X1, Y1, _, _ = rollout(env=env, pilco=None, random=True, timesteps=T, SUBS=SUBS, render=True)
for i in range(1, 5):
    X1_, Y1_, _, _ = rollout(env=env, pilco=None, random=True, timesteps=T, SUBS=SUBS, render=True)
    X1 = np.vstack((X1, X1_))
    Y1 = np.vstack((Y1, Y1_))
env.close()

# Normalise the environment using statistics from the random rollouts
env = Normalised_Env('MountainCarContinuous-v0',
                     np.mean(X1[:, :2], 0), np.std(X1[:, :2], 0))
# weights[0, 0] = 0.5
# weights[3, 3] = 0.5
m_init = np.zeros(state_dim)[None, :]
S_init = 0.01 * np.eye(state_dim)

T = 100
J = 7
N = 15
T_sim = 100
restarts = True
lens = []

with tf.Session() as sess:
    env = DriftCarWrapper()

    # Initial random rollouts to generate a dataset
    X, Y = rollout(env, None, timesteps=T, random=True, SUBS=SUBS)
    for i in range(1, J):
        X_, Y_ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = PILCO(X, Y, controller=controller, horizon=T, reward=R,
                  m_init=m_init, S_init=S_init)

    # for numerical stability
env = gym.make('Pendulum-v0')
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
ddpg = DDPG(state_dim, action_dim, [env.action_space.low, env.action_space.high])

if args.type == 'train':
    get_state = lambda x: x
    for i in range(10000):
        total_reward = ddpg.update(env, get_state)
        print("Iteration " + str(i) + " reward: " + str(total_reward))
        if i % 20 == 0:
            [_, _, rewards] = rollout(env, ddpg.curr_policy(), get_state, render=True)
            total_reward = np.sum(np.array(rewards))
            print("Test reward: " + str(total_reward))
        if i % 100 == 0:
            ddpg.save_model(args.file)
    policy = ddpg.curr_policy()
    rollout(env, policy, get_state, render=True)
    ddpg.save_model(args.file)
elif args.type == 'test':
    ddpg.load_model(args.file)
    get_state = lambda x: x
    for i in range(20):
        [_, _, rewards] = rollout(env,
        self.env.render()


if __name__ == '__main__':
    env = TendonGymEnv()
    e = np.array([[1]])  # Max control input. Setting it too low can lead to Cholesky failures.
    T = 10
    maxiter = 10
    T_sim = 300
    buffer_size = 600
    verbose = True

    X, Y, _, _, _ = rollout(env=env, pilco=None, random=True, timesteps=T_sim,
                            render=False, verbose=verbose)
    for i in range(1, 1):
        X_, Y_, _, _, _ = rollout(env=env, pilco=None, random=True, timesteps=T_sim,
                                  render=False, verbose=verbose)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    m_init = np.reshape(np.zeros(state_dim),
def evaluate_prob_success(env, policy):
    rolls = [rollout(env, policy, show=False) for i in range(100)]
    reward, successes = zip(*rolls)
    print(np.mean(reward))
    return sum(successes) * 1. / 100