import os

import gym
import matplotlib.pyplot as plt
import tensorflow as tf

# `DQN` is the agent class defined elsewhere in this repo; `set_random_seed`,
# `random_seed` and `get_model` (used in the variants below) are likewise
# repo-local helpers rather than gym/TensorFlow APIs.


def main(args):
    # load env
    env = gym.make('CartPole-v0')
    # load agent
    agent = DQN(env)
    agent.construct_model(args.gpu)

    # load a saved model or initialize a new one
    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # run episodes with the greedy policy (no learning updates happen here)
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # sample an action
            action = agent.sample_action(state, policy='greedy')
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s Reward: %s' % (ep + 1, total_rewards))
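# A minimal command-line entry point for the script above -- a sketch, not
# taken from the repo: the flag names mirror the three `args.*` attributes
# that `main` actually reads, and the defaults are illustrative.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default=None,
                        help='saved checkpoint to restore; a fresh model is built if omitted')
    parser.add_argument('--ep', type=int, default=100,
                        help='number of episodes to run')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU index to use; -1 runs on CPU')
    main(parser.parse_args())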
def main(args):
    set_random_seed(args.seed)
    env = gym.make("CartPole-v0")
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # load a pre-trained model or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.max_episode_steps):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execute action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # learn and update network parameters
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(
                rewards_history[-1] * 0.9 + ep_rewards * 0.1)

        # decay epsilon linearly toward final_epsilon
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon - args.final_epsilon) / args.max_ep

        # evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.max_episode_steps):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # save the model if it outperforms the previous best
            if best_mean_rewards is None or (current_mean_rewards >= best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    # plot the running average of episode rewards
    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
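# What a `learn` step of this shape typically does -- a sketch of the
# standard DQN update, not the repo's actual implementation: buffer the
# transition, then regress Q(s, a) toward the one-step TD target on a
# uniformly sampled minibatch. Capacity, batch size and gamma are
# illustrative defaults.
import random
from collections import deque

import numpy as np


class ReplayBuffer(object):
    def __init__(self, capacity=10000):
        self.transitions = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.transitions.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniform sampling breaks the correlation between consecutive steps
        batch = random.sample(self.transitions, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones


def td_targets(rewards, dones, q_next, gamma=0.99):
    # y = r + gamma * max_a' Q(s', a'), with bootstrapping cut off at terminals
    return rewards + gamma * (1.0 - dones.astype(np.float32)) * q_next.max(axis=1)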
def main(args): env = gym.make("CartPole-v0") if args.seed >= 0: random_seed(args.seed) env.seed(args.seed) agent = DQN(env, args) model = get_model(out_dim=env.action_space.n, lr=args.lr) agent.set_model(model) rewards_history, steps_history = [], [] train_steps = 0 # Training for ep in range(args.max_ep): state = env.reset() ep_rewards = 0 for step in range(env.spec.timestep_limit): # sample action action = agent.sample_action(state, policy="egreedy") # apply action next_state, reward, done, debug = env.step(action) train_steps += 1 ep_rewards += reward # modified reward to speed up learning reward = 0.1 if not done else -1 # train agent.train(state, action, reward, next_state, done) state = next_state if done: break steps_history.append(train_steps) if not rewards_history: rewards_history.append(ep_rewards) else: rewards_history.append(rewards_history[-1] * 0.9 + ep_rewards * 0.1) # Decay epsilon if agent.epsilon > args.final_epsilon: decay = (args.init_epsilon - args.final_epsilon) / args.max_ep agent.epsilon -= decay # Evaluate during training if ep % args.log_every == args.log_every - 1: total_reward = 0 for i in range(args.test_ep): state = env.reset() for j in range(env.spec.timestep_limit): if args.render: env.render() action = agent.sample_action(state, policy="greedy") state, reward, done, _ = env.step(action) total_reward += reward if done: break current_mean_rewards = total_reward / args.test_ep print("Episode: %d Average Reward: %.2f" % (ep + 1, current_mean_rewards)) # plot training rewards plt.plot(steps_history, rewards_history) plt.xlabel("steps") plt.ylabel("running avg rewards") plt.show()
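# One plausible shape for the `get_model` called above -- a sketch assuming
# a plain Keras MLP Q-network; the layer sizes are illustrative and the
# repo's actual architecture may differ. The signature matches the call
# site: one output per action, optimizer built from `lr`.
from tensorflow import keras


def get_model(out_dim, lr):
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(4,)),  # CartPole state is 4-dim
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(out_dim),  # one Q-value per action, no activation
    ])
    # MSE against the TD targets is the conventional DQN regression loss.
    model.compile(optimizer=keras.optimizers.Adam(lr), loss='mse')
    return model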
def main(args):
    # Hyperparameters
    MAX_EPISODE = 10000     # number of training episodes
    INITIAL_EPSILON = 0.5   # starting value of epsilon
    FINAL_EPSILON = 0.01    # final value of epsilon
    TEST_EPISODE = 100

    env = gym.make('CartPole-v0')
    agent = DQN(env, double_q=args.double)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=2)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # Training
    for ep in range(MAX_EPISODE):
        state = env.reset()
        for step in range(env.spec.timestep_limit):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execute action
            next_state, reward, done, debug = env.step(action)
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # learn and update network parameters
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # anneal epsilon linearly from INITIAL_EPSILON to FINAL_EPSILON
        if agent.epsilon > FINAL_EPSILON:
            agent.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / MAX_EPISODE

        # evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(TEST_EPISODE):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            mean_rewards = total_reward / float(TEST_EPISODE)
            print('Episode:', ep + 1, ' Average Reward:', mean_rewards)
            print('Global steps:', agent.global_step)

            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path + str(round(mean_rewards, 2)) + '_' \
                + str(ep_base + ep + 1)
            saver.save(agent.sess, save_name)
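# The `double_q` flag above switches the TD target from vanilla DQN to
# Double DQN. A minimal numpy sketch of the difference (the function and
# argument names here are hypothetical, not the repo's API):
import numpy as np


def dqn_target(rewards, dones, q_next_online, q_next_target,
               gamma=0.99, double_q=True):
    if double_q:
        # Double DQN: select the action with the online net, evaluate it
        # with the target net; this reduces the overestimation bias of the
        # max operator (van Hasselt et al., 2016).
        best = np.argmax(q_next_online, axis=1)
        q_next = q_next_target[np.arange(len(best)), best]
    else:
        # vanilla DQN: the target net both selects and evaluates
        q_next = q_next_target.max(axis=1)
    return rewards + gamma * (1.0 - dones) * q_next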