def main():
    """Entry point: build a DQN agent, then evaluate a checkpoint or train.

    Relies on module-level ``cf`` (config) and ``args`` (CLI flags): when
    ``args.e`` is truthy the agent is evaluated from a saved model,
    otherwise it is trained from scratch.
    """
    # The context manager guarantees the session is closed even if
    # learn()/evaluate() raises; the original unconditional sess.close()
    # was skipped on any exception.
    with tf.Session(config=cf.tf_config) as sess:
        dqn = DQN(cf, sess)
        sess.run(tf.global_variables_initializer())
        # NOTE(review): assumes args.e is a flag/int; a non-empty string
        # such as "0" would also be truthy — confirm against the arg parser.
        if args.e:
            dqn.evaluate(load_model=True)
        else:
            dqn.learn()
def _evaluate(agent, env, test_ep):
    """Run ``test_ep`` greedy episodes and return the mean episode reward."""
    total_reward = 0
    for _ in range(test_ep):
        state = env.reset()
        for _ in range(env.spec.max_episode_steps):
            action = agent.sample_action(state, policy='greedy')
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
    return total_reward / test_ep


def main(args):
    """Train a DQN agent on CartPole-v0, checkpointing the best eval score.

    Restores from ``args.model_path`` when given — the episode base and the
    best mean reward are parsed back out of the checkpoint filename, which
    is expected to look like ``<reward>_<episode>`` — otherwise starts from
    freshly initialized variables.  Plots the running-average reward curve
    when training finishes.
    """
    set_random_seed(args.seed)
    env = gym.make("CartPole-v0")
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # Load a pre-trained model or initialize a new one.
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.max_episode_steps):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execute action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # Modified reward to speed up learning: small living bonus,
            # fixed penalty on termination.
            reward = 0.1 if not done else -1
            # learn and update net parameters
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        # Running exponential moving average of episode rewards.
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(
                rewards_history[-1] * 0.9 + ep_rewards * 0.1)

        # Linearly decay epsilon toward its final value.
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon - args.final_epsilon) / args.max_ep

        # Evaluate during training.
        if ep % args.log_every == args.log_every - 1:
            current_mean_rewards = _evaluate(agent, env, args.test_ep)
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # Save model if the current model outperforms the old one.
            if best_mean_rewards is None or (current_mean_rewards >= best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                # exist_ok avoids the isdir/makedirs race of the original.
                os.makedirs(args.save_path, exist_ok=True)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
def main(args):
    """Train a DQN agent on CartPole-v0, checkpointing the best eval score.

    Restores from ``args.model_path`` when given — the episode base and the
    best mean reward are parsed back out of the checkpoint filename, which
    is expected to look like ``<reward>_<episode>`` — otherwise starts from
    freshly initialized variables.  Plots the running-average reward curve
    when training finishes.
    """
    set_random_seed(args.seed)
    env = gym.make('CartPole-v0')
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # Load a pre-trained model or initialize a new one.
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        # env.spec.timestep_limit was removed from gym; max_episode_steps is
        # the supported attribute for the per-episode step cap.
        for step in range(env.spec.max_episode_steps):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # Execute action.
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # Modified reward to speed up learning: small living bonus,
            # fixed penalty on termination.
            reward = 0.1 if not done else -1
            # Learn and update net parameters.
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        # Running exponential moving average of episode rewards.
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(
                rewards_history[-1] * 0.9 + ep_rewards * 0.1)

        # Linearly decay epsilon toward its final value.
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon - args.final_epsilon) / args.max_ep

        # Evaluate during training.
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.max_episode_steps):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # Save model if the current model outperforms the old one.
            if best_mean_rewards is None or (current_mean_rewards >= best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                # exist_ok avoids the isdir/makedirs race of the original.
                os.makedirs(args.save_path, exist_ok=True)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    # plot training rewards
    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
def main(args):
    """Train a (optionally double) DQN on CartPole-v0, saving periodically.

    Hyper-parameters are fixed local constants below; ``args`` supplies the
    GPU id, the double-DQN switch, the logging interval, the save path and
    an optional checkpoint to resume from (named ``<reward>_<episode>``).
    """
    # Hyper parameters
    MAX_EPISODE = 10000      # training episodes
    INITIAL_EPSILON = 0.5    # starting value of epsilon
    FINAL_EPSILON = 0.01     # final value of epsilon
    TEST_EPISODE = 100       # episodes per evaluation pass

    env = gym.make('CartPole-v0')
    agent = DQN(env, double_q=args.double)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=2)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        # Recover episode base and last mean reward from the checkpoint name.
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # Training
    for ep in range(MAX_EPISODE):
        state = env.reset()
        # env.spec.timestep_limit was removed from gym; max_episode_steps is
        # the supported attribute for the per-episode step cap.
        for step in range(env.spec.max_episode_steps):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # Execute action.
            next_state, reward, done, debug = env.step(action)
            # Modified reward to speed up learning: small living bonus,
            # fixed penalty on termination.
            reward = 0.1 if not done else -1
            # Learn and update net parameters.
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # Linearly decay epsilon toward FINAL_EPSILON.
        if agent.epsilon > FINAL_EPSILON:
            agent.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / MAX_EPISODE

        # Evaluate during training.
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(TEST_EPISODE):
                state = env.reset()
                for j in range(env.spec.max_episode_steps):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            mean_rewards = total_reward / float(TEST_EPISODE)
            print('Episode:', ep + 1, ' Average Reward:', mean_rewards)
            print('Global steps:', agent.global_step)

            # exist_ok avoids the isdir/makedirs race of the original.
            os.makedirs(args.save_path, exist_ok=True)
            save_name = args.save_path + str(round(mean_rewards, 2)) + '_' \
                + str(ep_base + ep + 1)
            saver.save(agent.sess, save_name)