def sampler_adj(pi_adj, pi_s, env, samples_n, render=True):
    """Roll out the source policy pi_s with the learned adjustment pi_adj
    applied, collecting (state, action, next_state) transitions."""
    D = []
    state = env.reset()
    for i in range(samples_n):
        if render:
            env.render()
        u_action = utils.policy(env, pi_s, state, False)
        # Feature vector for the adjustment model: the state concatenated
        # with the source policy's action.
        a = state.tolist()
        a.extend(u_action.tolist())
        pi_adj_action = pi_adj.predict(np.array(a).reshape(1, -1))[0]
        # Exploration noise on top of the adjusted action.
        action = u_action + pi_adj_action + np.random.normal(0, SIGMA)
        state_next, reward, terminal, info = env.step(action)
        D.append([state, action, state_next])
        state = state_next
        if terminal:
            state = env.reset()
            break
    return np.array([np.array(xi) for xi in D])
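# How the transitions gathered by sampler_adj/sampler are turned into an
# adjustment policy is not shown in this file, so the following is a hedged
# sketch rather than the original training code. It assumes a scikit-learn
# MLPRegressor (consistent with the pickled objects exposing .predict) fit to
# map the same [state, u_action] feature vector built above to an action
# correction; `fit_pi_adjust` and `corrections` are hypothetical names.
from sklearn.neural_network import MLPRegressor

def fit_pi_adjust(D, corrections):
    # Each row of D is [state, action, state_next]; the regressor input
    # mirrors the feature vector used at prediction time in sampler_adj.
    X = np.array([np.concatenate([np.ravel(s), np.ravel(a)]) for s, a, _ in D])
    pi_adj = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=2000)
    pi_adj.fit(X, np.array(corrections))
    return pi_adj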
def plot_pilco_source_learning_curve():
    """Run each saved PILCO source controller for one episode, plotting its
    (cart position, pole angle) trajectory and the per-model episode lengths."""
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)
    pilcos = ['initial'] + [str(i) for i in range(6)]
    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller, reward=R, sparse=False)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        xs = []
        angles = []
        while True:
            xs.append(state[0])      # cart position
            angles.append(state[2])  # pole angle
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_next, reward, terminal, info = env.step(u_action)
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches='tight')
        plt.close()
    env.close()
    plt.plot(range(len(pilcos)), rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches='tight')
    plt.close()
    return rewards, xs, angles
def loader(name):
    """Load a saved PILCO source controller and run it unadjusted on the
    target environment indefinitely, logging each run's score."""
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            action = utils.policy(env, pilco, state, False)
            state_next, reward, terminal, info = env.step(action)
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(run, step))
                score_logger.add_score(step, run)
                break
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl', adjust=True):
    """Replay each saved adjustment policy on the target environment and
    return the episode length achieved by each."""
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name, controller=controller, reward=R, sparse=False)
    rewards = []
    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            a = state.tolist()
            a.extend(u_action.tolist())
            if adjust:
                pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # run the source policy unadjusted
            state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
    env.close()
    return rewards
def run_transfer_model_and_plot_pos(env_name, pilco_name, fig_file_name, fig_title,
                                    transfer_name=None, save=True):
    """Run a PILCO source policy, optionally with a pickled adjustment model,
    and plot the cart position vs. pole angle trajectory."""
    env = gym.make(env_name)
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(pilco_name),
                       controller=controller, reward=R, sparse=False)
    if transfer_name is not None:
        with open(transfer_name, 'rb') as inp2:
            pi_adjust = pickle.load(inp2)
    xs = []
    angles = []
    state = env.reset()
    for _ in range(1000):
        xs.append(state[0])
        angles.append(state[2])
        env.render()
        u_action = policy(env, pilco, state, False)
        a = state.tolist()
        a.extend(u_action.tolist())
        if transfer_name is not None:
            pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
        else:
            pi_adjust_action = 0
        state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
        state = state_next
        if terminal:
            break
    env.close()
    xs = np.array(xs)
    angles = np.array(angles)
    plt.plot(xs, angles)
    # Arrows show the direction of motion through the (position, angle) plane.
    plt.quiver(xs[:-1], angles[:-1], xs[1:] - xs[:-1], angles[1:] - angles[:-1],
               scale_units='xy', angles='xy', scale=1, color='blue', width=1e-2)
    plt.xlabel('position')
    plt.ylabel('angle')
    plt.title(fig_title)
    plt.xlim(-0.2, 0.2)
    plt.ylim(-0.2, 0.2)
    if save:
        plt.savefig(fig_file_name, bbox_inches='tight')
    plt.close()
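# A hedged usage sketch for the plotting helper above; the pickle file name is
# taken from true_loader below and the source-model name from
# see_progression's default, but the exact calls are assumptions, not code
# from the original file.
def example_transfer_plots():
    run_transfer_model_and_plot_pos('continuous-cartpole-v99', '5',
                                    'transfer_states_plot',
                                    'source policy + adjustment',
                                    transfer_name='9true_dyn_pi_adj.pkl')
    run_transfer_model_and_plot_pos('continuous-cartpole-v99', '5',
                                    'no_transfer_states_plot',
                                    'source policy only')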
def source_loader(name):
    """Run the unadjusted PILCO source policy on the target environment for
    101 episodes and return the average episode length."""
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if run % 20 == 0:
            print('run: ', run)
        state = env.reset()
        step = 0
        while True:
            step += 1
            u_action = utils.policy(env, pilco, state, False)
            state_next, reward, terminal, info = env.step(u_action)
            state = state_next
            if terminal:
                score_logger.add_score(step, run)
                avg_reward += step
                break
    avg_reward /= run
    env.close()
    return avg_reward
def true_loader(name):
    """Run the source policy plus the final adjustment model learned under
    the true target dynamics, logging each run's score."""
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            a = state.tolist()
            a.extend(u_action.tolist())
            # The adjustment model predicts a correction to the source action.
            adj_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            state_next, reward, terminal, info = env.step(u_action + adj_action)
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(run, step))
                score_logger.add_score(step, run)
                break
def test(env_info, total_episodes=3, noise_std=0.2):
    ou_noise = OUActionNoise(mean=np.zeros(1),
                             std_deviation=float(noise_std) * np.ones(1))
    for _ in range(total_episodes):
        prev_state = env.reset()
        while True:
            env.render()
            tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
            action = policy(actor_model, tf_prev_state, ou_noise, **env_info)
            state, reward, done, info = env.step(action)
            if done:
                break
            prev_state = state
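# OUActionNoise is used by test() and train() but not defined in this section.
# A minimal sketch, assuming the standard Ornstein-Uhlenbeck process from the
# Keras DDPG tutorial: temporally correlated noise that decays toward the mean.
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape))
        self.x_prev = x  # store for the next correlated draw
        return x

    def reset(self):
        self.x_prev = self.x_initial if self.x_initial is not None else np.zeros_like(self.mean)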
def train(env_info, buffer, total_episodes=100, noise_std=0.2, gamma=0.99, tau=0.005):
    """DDPG training loop: act with OU exploration noise, store transitions,
    learn from the replay buffer, and soft-update the targets every step."""
    ou_noise = OUActionNoise(mean=np.zeros(1),
                             std_deviation=float(noise_std) * np.ones(1))
    # Make the target weights equal to the online weights initially.
    target_actor.set_weights(actor_model.get_weights())
    target_critic.set_weights(critic_model.get_weights())
    # To store the reward history of each episode.
    ep_reward_list = []
    for ep in tqdm(range(total_episodes)):
        prev_state = env.reset()
        episodic_reward = 0
        while True:
            tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
            action = policy(actor_model, tf_prev_state, ou_noise, **env_info)
            # Receive the next state and reward from the environment.
            state, reward, done, info = env.step(action)
            buffer.record((prev_state, action, reward, state))
            episodic_reward += reward
            buffer.learn(gamma)
            update_target(target_actor.variables, actor_model.variables, tau)
            update_target(target_critic.variables, critic_model.variables, tau)
            if reward >= 40:
                print(f"-- Episode: {ep} " + "-" * 20)
                print(f"Action: {action}")
                print(f"Reward: {reward}\tDone: {done}\nState: {state}")
            # End this episode when `done` is True.
            if done:
                break
            prev_state = state
        ep_reward_list.append(episodic_reward)
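# `update_target` is referenced above but not defined in this section. A
# minimal sketch, assuming the usual DDPG Polyak soft update matching the
# call signature used in train(): each target variable moves a fraction tau
# toward its online counterpart.
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))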
def sampler(pi, env, samples_n, trials=1, render=True):
    """Collect (state, action, next_state) transitions by rolling out policy
    pi with Gaussian exploration noise."""
    D = None
    for t in range(trials):
        state = env.reset()
        for i in range(samples_n):
            if render:
                env.render()
            action = utils.policy(env, pi, state, False) + np.random.normal(0, SIGMA)
            state_next, reward, terminal, info = env.step(action)
            if D is None:
                D = np.array([state, action, state_next]).reshape(1, -1)
            else:
                D = np.vstack((D, [state, action, state_next]))
            state = state_next
            if terminal:
                break
    return D
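# A hedged sketch of how the two samplers might alternate during transfer:
# collect source-policy rollouts first, then rollouts with the current
# adjustment applied. `pilco` and `pi_adjust` are assumed to be loaded as in
# the loader functions in this file; this loop is illustrative, not original
# code.
def example_collect_rollouts(pilco, pi_adjust, env):
    D = sampler(pilco, env, samples_n=100, trials=5, render=False)
    D_adj = sampler_adj(pi_adjust, pilco, env, samples_n=100, render=False)
    return np.vstack((D, D_adj))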
return "arn:aws:s3:::mozilla-releng-%s-archiver" % rgn else: return "arn:aws:s3:::mozilla-releng-staging-%s-archiver" % rgn cft.resources.add(Resource( name, 'AWS::IAM::User', Properties({ 'Policies': [ policy("tooltoolbucketaccess", { "Effect": "Allow", "Action": [ "s3:ListBucket", "s3:ListBucketMultipartUploads", "s3:ListBucketVersions", ], "Resource": [ bucket_arn_tooltool('use1'), bucket_arn_tooltool('usw1'), bucket_arn_tooltool('usw2') ] }), policy("tooltoolobjectaccess", { "Effect": "Allow", "Action": [ "s3:AbortMultipartUpload", "s3:DeleteObject", "s3:DeleteObjectVersion", "s3:GetObject", "s3:GetObjectAcl", "s3:GetObjectTorrent",
if is_prod: return "arn:aws:s3:::mozilla-releng-%s-tooltool" % rgn else: return "arn:aws:s3:::mozilla-releng-staging-%s-tooltool" % rgn cft.resources.add(Resource( name, 'AWS::IAM::User', Properties({ 'Policies': [ policy("tooltoolbucketaccess", { "Effect": "Allow", "Action": [ "s3:ListBucket", "s3:ListBucketMultipartUploads", "s3:ListBucketVersions", ], "Resource": [ bucket_arn('use1'), bucket_arn('usw1'), bucket_arn('usw2') ] }), policy("tooltoolobjectaccess", { "Effect": "Allow", "Action": [ "s3:AbortMultipartUpload", "s3:DeleteObject", "s3:DeleteObjectVersion", "s3:GetObject", "s3:GetObjectAcl", "s3:GetObjectTorrent",
def loader(name):
    """Evaluate each of the ten saved adjustment policies over 101 episodes
    and return their average episode lengths."""
    Rs = np.empty((1, 10))
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    for pick in range(1, 11):
        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('PI ADJUST ANALYSIS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if run % 20 == 0:
                print('run: ', run)
            state = env.reset()
            step = 0
            while True:
                step += 1
                u_action = utils.policy(env, pilco, state, False)
                a = state.tolist()
                a.extend(u_action.tolist())
                # Here the adjustment model predicts the full action directly.
                action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
                # Clip to the actuator limits [-1, 1].
                action[0] = np.clip(action[0], -1, 1)
                state_next, reward, terminal, info = env.step(action)
                state = state_next
                if terminal:
                    score_logger.add_score(step, run)
                    avg_reward += step
                    break
        avg_reward /= run
        Rs[0][pick - 1] = avg_reward
    env.close()
    return Rs
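# Hedged driver sketch (not original code): compare the ten adjustment
# policies evaluated by the `loader` defined immediately above against the
# unadjusted source baseline from source_loader. The model name '5' follows
# see_progression's default source controller.
if __name__ == '__main__':
    Rs = loader('5')
    baseline = source_loader('5')
    plt.plot(range(1, 11), Rs[0], label='pi_adjust')
    plt.axhline(baseline, linestyle='--', label='source only')
    plt.xlabel('adjustment model index')
    plt.ylabel('average episode length (101 runs)')
    plt.legend()
    plt.savefig('pi_adjust_vs_source', bbox_inches='tight')
    plt.close()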