def test_dagger(filename='imitation_output.txt', dataname='dagger_data.csv'):
    """Get metrics for the DAGGER algorithm.

    Gets the data needed to answer q1 and q2 in the extra credit
    portion (DAGGER) of Question 2.

    Parameters
    ----------
    filename: str
        Name of file to append DAGGER performance on the wrapper environment to.
    dataname: str
        Name of file to write evaluation data on the base env to.
    """
    with tf.Session() as sess:
        # Load expert.
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')

        # Initialize environments.
        env = gym.make('CartPole-v0')
        eval_env = gym.make('CartPole-v0')
        eval_env = imit.wrap_cartpole(eval_env)

        # Initialize policy model.
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam', loss='binary_crossentropy',
                       metrics=['accuracy'])

        # Run DAGGER.
        mean_rewards, min_rewards, max_rewards = imit.dagger(
            expert, policy, env, eval_env)

        # Test on wrapper environment.
        rewards = imit.test_cloned_policy(eval_env, policy, render=False)
        hard_mean = np.mean(rewards)
        hard_std = np.std(rewards)

        # Append to file.
        f = open(filename, 'a+')
        f.write(DAGGER_OUTPUT % (hard_mean, hard_std))
        f.close()

        # Convert data to .csv format.
        data_string = "Mean,Min,Max\n"
        for i in range(len(mean_rewards)):
            data_string += "%.4f,%.4f,%.4f\n" % (
                mean_rewards[i], min_rewards[i], max_rewards[i])

        # Write data to file.
        f = open(dataname, 'w')
        f.write(data_string)
        f.close()
def test_policy(num_episodes):
    """Train and test an imitation-based policy.

    Parameters
    ----------
    num_episodes: int
        Number of expert episodes used to generate data for the imitation policy.

    Returns
    -------
    final loss, final accuracy, mean reward, reward std,
    wrapper mean reward, wrapper reward std
    """
    with tf.Session() as sess:
        # Load expert and policy model.
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam', loss='binary_crossentropy',
                       metrics=['accuracy'])

        # Initialize environment.
        env = gym.make('CartPole-v0')

        # Generate data from expert.
        states, actions = imit.generate_expert_training_data(
            expert, env, num_episodes=num_episodes, render=False)

        # Train policy.
        history = policy.fit(states, actions, epochs=50, verbose=2)

        # Get performance values.
        final_loss = history.history['loss'][-1]
        final_accuracy = history.history['acc'][-1]

        rewards = imit.test_cloned_policy(env, policy, render=False)
        mean = np.mean(rewards)
        std = np.std(rewards)

        env = imit.wrap_cartpole(env)
        hard_rewards = imit.test_cloned_policy(env, policy, render=False)
        hard_mean = np.mean(hard_rewards)
        hard_std = np.std(hard_rewards)

        return final_loss, final_accuracy, mean, std, hard_mean, hard_std
def main():
    model_config_path = "CartPole-v0_config.yaml"
    model_weight_path = "CartPole-v0_weights.h5f"

    env = gym.make('CartPole-v0')
    #env = wrap_cartpole(env)

    clone_model = load_model(model_config_path=model_config_path)
    expert_model = load_model(model_config_path=model_config_path,
                              model_weights_path=model_weight_path)

    states, actions = generate_expert_training_data(expert_model, env,
                                                    num_episodes=100,
                                                    render=True)

    optimizer = keras.optimizers.Adam()
    clone_model.compile(optimizer, loss='binary_crossentropy',
                        metrics=['accuracy'])
    clone_model.fit(states, actions, epochs=50)

    test_cloned_policy(env, expert_model, num_episodes=5, render=False)
    test_cloned_policy(env, clone_model, num_episodes=5, render=False)
def evaluate_expert():
    """Evaluate the expert on the wrapper environment.

    Returns
    -------
    mean(rewards), std(rewards)
    """
    with tf.Session() as sess:
        env = gym.make('CartPole-v0')
        env = imit.wrap_cartpole(env)

        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        rewards = imit.test_cloned_policy(env, expert, render=False)

        return np.mean(rewards), np.std(rewards)
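# Hypothetical driver (an added sketch, not part of the original assignment
# scripts): one way evaluate_expert() and test_policy() above could be
# combined to collect the Question 2 numbers. Assumes the same module-level
# imports used by those functions (gym, numpy as np, tensorflow as tf, and
# deeprl_hw3.imitation as imit).
if __name__ == '__main__':
    expert_mean, expert_std = evaluate_expert()
    print('Expert on hard CartPole: mean=%.2f, std=%.2f'
          % (expert_mean, expert_std))

    # Sweep the same expert-episode counts used elsewhere in the assignment.
    for num_episodes in [1, 10, 50, 100]:
        loss, acc, mean, std, hard_mean, hard_std = test_policy(num_episodes)
        print('%d expert episodes: loss=%.4f acc=%.4f '
              'reward=%.2f+/-%.2f hard reward=%.2f+/-%.2f'
              % (num_episodes, loss, acc, mean, std, hard_mean, hard_std))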
f.write("%s\n" % each_thing) if __name__ == '__main__': # fancy printing RED = '\033[91m' BOLD = '\033[1m' ENDC = '\033[0m' LINE = "%s%s##############################################################################%s" % ( RED, BOLD, ENDC) env = gym.make('CartPole-v0') env_wrap = gym.make('CartPole-v0') env_wrap = imitation.wrap_cartpole(env_wrap) expert = imitation.load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f') # test_cloned_policy(env, cloned_policy) episode_length_list = [1, 10, 50, 100] loss_all, accuracy_all = [], [] mean_reward_clones_list, mean_reward_clones_wrap_list = [], [] std_reward_clones_list, std_reward_clones_wrap_list = [], [] for curr_num_episodes in episode_length_list: str_1 = "Imitator with number of episodes = {}".format( curr_num_episodes) msg = "\n%s\n" % (LINE) + "%s%s\n" % (BOLD, str_1) + "%s\n" % (LINE) print(str(msg)) # train on vanilla env states_arr, actions_arr = imitation.generate_expert_training_data( expert, env, num_episodes=curr_num_episodes, render=False)
import os
import time

import gym
import tensorflow as tf

from deeprl_hw3.imitation import load_model
from deeprl_hw3.reinforce2 import run_one_episode, train_nn, get_total_reward
from keras.optimizers import Adam
from keras import backend as K

MAX_TRAIN_EPOCHS = 10000
EVA_INTERVAL = 10
gamma = 0.99
LR = 0.001
STEP_SIZE = 0.001

env = gym.make('CartPole-v0')
EVAL_EPISODES = 100

sess = K.get_session()
nn = load_model('CartPole-v0_config.yaml', None)
nn.compile('SGD', 'mse', metrics=['accuracy'])
sess.run(tf.global_variables_initializer())

file_path = 'Q3.txt'
f = open(file_path, 'w')

train_cnt = 0
eval_cnt = 0
end_cnt = 0
train_flag = True

while train_flag:
    train_start = time.time()
    states, actions, rewards, softmaxes, dones = run_one_episode(env, nn)
    #print(len(states))
    '''
from deeprl_hw3 import imitation
import gym
import argparse
import os

expert_yaml = os.path.join(os.getcwd(), 'CartPole-v0_config.yaml')
expert_h5f = os.path.join(os.getcwd(), 'CartPole-v0_weights.h5f')

expert = imitation.load_model(expert_yaml, expert_h5f)
env = gym.make('CartPole-v0')

cmdline = argparse.ArgumentParser()
cmdline.add_argument("-e", "--episodes", dest="num_episodes", default=100,
                     help="Number of episodes from expert")

if __name__ == '__main__':
    args = cmdline.parse_args()

    # Problem 2.
    print("===== Problem 2.1 =====")
    obz, act = imitation.generate_expert_training_data(
        expert, env, num_episodes=int(args.num_episodes), render=False)
    model = imitation.load_model(expert_yaml)
    imitation.behavior_cloning(model, obz, act)

    print("===== Problem 2.2 =====")
    imitation.test_cloned_policy(env, model, render=False)
def reinforce(env, sess, gamma=0.98, alpha=0.00025, callback=None):
    """Policy gradient algorithm.

    Parameters
    ----------
    env: gym.core.Env
        Environment being run on.
    sess: tf.Session
        Tensorflow session, for convenience.
    gamma: float
        Discount factor used in the policy gradient algorithm.
    alpha: float
        Learning rate used in the policy gradient algorithm.
    callback: function
        Callback used to log learning metrics.

    Returns
    -------
    model: Keras model trained with policy gradient descent.
    """
    # Initialize model and its variables.
    model = imit.load_model('CartPole-v0_config.yaml')
    sess.run(tf.global_variables_initializer())

    # Gradient of the log-probability of the chosen action w.r.t. the weights.
    action_input = tf.placeholder(dtype=tf.int32, shape=(1,))
    loss = tf.log(tf.gather(tf.reshape(model.output, (2,)), [action_input]))
    grads = tf.gradients(loss, model.weights)

    # Helper to compute the gradient for a state-action pair.
    def get_gradient(state, action):
        return sess.run(grads, feed_dict={
            model.input: state,
            action_input: [action]
        })

    rewards = []
    iteration = 0
    while True:
        # Run model for an episode.
        S, A, R = run_episode(env, model)
        reward = sum(R)
        rewards.append(reward)

        if callback is not None:
            callback(iteration, reward, model)
        print("REWARD (%d): %.4f" % (iteration, reward))

        # Convergence condition: a sufficiently small std and a sufficiently
        # large mean over the recent rewards.
        if (len(rewards) > 20 and np.std(np.array(rewards[-20:])) < 3.
                and np.mean(np.array(rewards[-5:])) > 50.):
            print("CONVERGED")
            return model

        # Get discounted returns.
        G = process_rewards(R, gamma)

        # Update weights w.r.t. gradients.
        weights = model.get_weights()
        for t in range(len(S)):
            gradients = get_gradient(S[t].reshape((1, 4)), A[t])
            assert len(weights) == len(gradients)
            for i in range(len(weights)):
                weights[i] += alpha * G[t] * (gamma ** t) * gradients[i]
        model.set_weights(weights)

        iteration += 1
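# Added usage sketch (an assumption, not part of the original module): shows
# the intended call pattern for reinforce() above. Assumes gym, tensorflow as
# tf, and the run_episode/process_rewards helpers referenced by reinforce()
# are available in this module; the callback simply records rewards.
if __name__ == '__main__':
    reward_log = []

    def log_reward(iteration, reward, model):
        # Keep a simple (iteration, total episode reward) trace.
        reward_log.append((iteration, reward))

    with tf.Session() as sess:
        env = gym.make('CartPole-v0')
        trained_model = reinforce(env, sess, gamma=0.98, alpha=0.00025,
                                  callback=log_reward)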
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    file_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.txt'
    f = open(file_path, 'w')
    f.write('Parameters:\n')
    f.write('EXPERT_EPISODES:' + str(EXPERT_EPISODES) + '\n')
    f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

    # Load and compile the expert and learner models.
    expert = load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f')
    learner = load_model('CartPole-v0_config.yaml', None)
    adam = Adam()
    expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
    learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

    print('Prepare expert data with episodes num:', EXPERT_EPISODES)
    expert_states, expert_actions = generate_expert_training_data(
        expert, env, num_episodes=EXPERT_EPISODES, render=False)

    print('Expert data is ready. Start to train learner with epoch num:',
          TRAIN_EPOCHS)
    history = LossHistory()
    learner.fit(expert_states, expert_actions, epochs=TRAIN_EPOCHS,
                callbacks=[history])

    weights_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.h5'
    learner.save_weights(weights_path)

    print('Test expert in normal env.........................................')
    expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
        env, expert, num_episodes=100, render=False)
    print('Test learner in normal env.........................................')
    learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
        env, learner, num_episodes=100, render=False)
    print('Test expert in hard env.........................................')
    hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
        env_hard, expert, num_episodes=100, render=False)
    print('Test learner in hard env.........................................')
    hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
        env_hard, learner, num_episodes=100, render=False)

    f.write('Expert Test in Normal Env:\n')
    f.write(str(expert_reward_avg) + ' ' + str(expert_reward_std) + '\n')
    f.write('Learner Test in Normal Env:\n')
    f.write(str(learner_reward_avg) + ' ' + str(learner_reward_std) + '\n')
    f.write('Expert Test in Hard Env:\n')
    f.write(str(hard_expert_reward_avg) + ' ' + str(hard_expert_reward_std) + '\n')
    f.write('Learner Test in Hard Env:\n')
    f.write(str(hard_learner_reward_avg) + ' ' + str(hard_learner_reward_std) + '\n')

    f.write('Learner Training History:\n')
    for i in range(TRAIN_EPOCHS):
        f.write(str(history.losses[i]) + ' ' + str(history.accues[i]) + '\n')

    f.write('Evaluate History:\n')
    for i in range(100):
        f.write(str(expert_reward_summary[i]) + ';' +
                str(learner_reward_summary[i]) + ';' +
                str(hard_expert_reward_summary[i]) + ';' +
                str(hard_learner_reward_summary[i]) + '\n')
    f.close()
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    file_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.txt'
    f = open(file_path, 'w')
    f.write('Parameters:\n')
    f.write('EXPERT_EPISODES:' + str(EXPERT_EPISODES) + '\n')
    f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

    # Load and compile the expert and learner models.
    expert = load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f')
    learner = load_model('CartPole-v0_config.yaml', None)
    adam = Adam()
    expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
    learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

    print('Generate initial data from learner')
    data, _ = generate_expert_training_data(learner, env, num_episodes=1,
                                            render=False)

    print('Query expert for labels')
    q_values = expert.predict(data)
    labels = np.argmax(q_values, axis=1)
    onehot_labels = np.zeros((labels.shape[0], 2))
    for i in range(labels.shape[0]):
        onehot_labels[i, labels[i]] = 1
    #print(onehot_labels)
    #print(onehot_labels.shape)

    print('Expert query is ready. Start to train learner with epoch num:',
          TRAIN_EPOCHS)
    history = LossHistory()
    train_cnt = 0
    while train_cnt < TRAIN_EPOCHS:
        learner.fit(data, onehot_labels, epochs=1, callbacks=[history])

        # Generate new data for DAGGER: use the same function as for the
        # expert data, but roll out the learner model instead.
        new_data, _ = generate_expert_training_data(learner, env,
                                                    num_episodes=1,
                                                    render=False)
        print('Query expert for labels')
        new_q_values = expert.predict(new_data)
        new_labels = np.argmax(new_q_values, axis=1)
        new_onehot_labels = np.zeros((new_labels.shape[0], 2))
        for i in range(new_labels.shape[0]):
            new_onehot_labels[i, new_labels[i]] = 1

        data = np.vstack((data, new_data))
        onehot_labels = np.vstack((onehot_labels, new_onehot_labels))
        print(onehot_labels.shape)
        train_cnt = train_cnt + 1

    weights_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.h5'
    learner.save_weights(weights_path)

    print('Test expert in normal env.........................................')
    expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
        env, expert, num_episodes=100, render=False)
    print('Test learner in normal env.........................................')
    learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
        env, learner, num_episodes=100, render=False)
    print('Test expert in hard env.........................................')
    hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
        env_hard, expert, num_episodes=100, render=False)
    print('Test learner in hard env.........................................')
    hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
        env_hard, learner, num_episodes=100, render=False)

    f.write('Expert Test in Normal Env:\n')
    f.write(str(expert_reward_avg) + ' ' + str(expert_reward_std) + '\n')
    f.write('Learner Test in Normal Env:\n')
    f.write(str(learner_reward_avg) + ' ' + str(learner_reward_std) + '\n')
    f.write('Expert Test in Hard Env:\n')
    f.write(str(hard_expert_reward_avg) + ' ' + str(hard_expert_reward_std) + '\n')
    f.write('Learner Test in Hard Env:\n')
    f.write(str(hard_learner_reward_avg) + ' ' + str(hard_learner_reward_std) + '\n')

    f.write('Learner Training History:\n')
    for i in range(TRAIN_EPOCHS):
        f.write(str(history.losses[i]) + ' ' + str(history.accues[i]) + '\n')

    f.write('Evaluate History:\n')
    for i in range(100):
        f.write(str(expert_reward_summary[i]) + ' ' +
                str(learner_reward_summary[i]) + ' ' +
                str(hard_expert_reward_summary[i]) + ' ' +
                str(hard_learner_reward_summary[i]) + '\n')
    f.close()