from collections import deque

import gym
import numpy as np
import tensorflow as tf

# PolicyGradientREINFORCE comes from this project; policy_network is sketched below.


def main():
    global state_dim, num_actions

    env_name = 'CartPole-v0'
    env = gym.make(env_name)

    sess = tf.Session()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
    writer = tf.summary.FileWriter("/tmp/{}-experiment-1".format(env_name))

    state_dim = env.observation_space.shape[0]
    num_actions = env.action_space.n

    pg_reinforce = PolicyGradientREINFORCE(sess,
                                           optimizer,
                                           policy_network,
                                           state_dim,
                                           num_actions,
                                           summary_writer=writer,
                                           discount_factor=1.0)

    MAX_EPISODES = 10000
    MAX_STEPS = 200

    episode_history = deque(maxlen=100)
    for i_episode in range(MAX_EPISODES):
        # initialize
        state = env.reset()
        total_rewards = 0

        for t in range(MAX_STEPS):
            # env.render()
            # print("state", state, state[np.newaxis, :])
            action = pg_reinforce.sampleAction(state[np.newaxis, :])
            next_state, reward, done, _ = env.step(action)

            total_rewards += reward
            # shaped reward: small bonus per surviving step, large penalty on termination
            reward = -10 if done else 0.1
            # print("action, next_state, reward, done", action, next_state, reward, done)
            pg_reinforce.storeRollout(state, action, reward)

            state = next_state
            if done:
                break

        pg_reinforce.updateModel()

        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)

        print("Episode {}".format(i_episode))
        print("Finished after {} timesteps".format(t + 1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last 100 episodes: {:.2f}".format(mean_rewards))
        if mean_rewards >= 195.0 and len(episode_history) >= 100:
            print("Environment {} solved after {} episodes".format(env_name,
                                                                   i_episode + 1))
            break
        print()
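# main() assumes a module-level policy_network(states) function. A minimal sketch,
# modeled on the two-layer tanh networks used in the other examples in this
# collection (the hidden width of 20 is an assumption, not taken from main()):
def policy_network(states):
    W1 = tf.get_variable("W1", [state_dim, 20],
                         initializer=tf.random_normal_initializer())
    b1 = tf.get_variable("b1", [20], initializer=tf.constant_initializer(0))
    h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [20, num_actions],
                         initializer=tf.random_normal_initializer(stddev=0.1))
    b2 = tf.get_variable("b2", [num_actions], initializer=tf.constant_initializer(0))
    # unnormalized action logits; the REINFORCE class is expected to apply the softmax
    return tf.matmul(h1, W2) + b2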
def policy_network(states):
    W1 = tf.get_variable("W1", [state_dim, 20],
                         initializer=tf.truncated_normal_initializer())
    b1 = tf.get_variable("b1", [20],
                         initializer=tf.constant_initializer(0))
    h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [20, num_actions],
                         initializer=tf.truncated_normal_initializer(stddev=0.1))
    b2 = tf.get_variable("b2", [num_actions],
                         initializer=tf.constant_initializer(0))
    p = tf.matmul(h1, W2) + b2
    return p

pg_reinforce = PolicyGradientREINFORCE(policy_session,
                                       policy_optimizer,
                                       policy_network,
                                       state_dim,
                                       entropy_bonus=entropy_bonus,
                                       summary_writer=policy_writer,
                                       summary_every=policy_summary_every)

# Initialize the sampler
sampler = Sampler(pg_reinforce, env)

# Q-network parameters
q_session = tf.Session()
q_optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
q_writer = tf.train.SummaryWriter("/home/drl/DRL/tensorflow-reinforce/tmp/q/")
q_summary_every = 10

def action_masker(array):
""" define policy neural network """ W1 = tf.get_variable("W1", [state_dim, 20], initializer=tf.random_normal_initializer()) b1 = tf.get_variable("b1", [20], initializer=tf.constant_initializer(0)) h1 = tf.nn.tanh(tf.matmul(states, W1) + b1) W2 = tf.get_variable("W2", [20, num_actions], initializer=tf.random_normal_initializer(stddev=0.1)) b2 = tf.get_variable("b2", [num_actions], initializer=tf.constant_initializer(0)) p = tf.matmul(h1, W2) + b2 return p pg_reinforce = PolicyGradientREINFORCE(sess, optimizer, policy_network, state_dim, summary_writer=writer, summary_every=1) sampler = Sampler(pg_reinforce, env) reward = [] for _ in tqdm(range(30)): batch = sampler.collect_one_batch() pg_reinforce.update_parameters(batch["states"], batch["actions"], batch["monte_carlo_returns"]) reward.append(batch["rewards"].sum()/200) show_image(reward) # batch = sampler.collect_one_batch()
    b_value = tf.get_variable("b_value", shape=[1],
                              initializer=tf.constant_initializer(0))
    value = (tf.matmul(tf.reshape(output, [-1, gru_unit_size]), w_value)
             + b_value)
    return logit, final_state, value

pg_reinforce = PolicyGradientREINFORCE(sess,
                                       optimizer,
                                       policy_network,
                                       state_dim,
                                       num_actions,
                                       gru_unit_size,
                                       num_step,
                                       num_layers,
                                       save_path + env.spec.id,
                                       global_step,
                                       max_gradient=max_gradient,
                                       entropy_bonus=entropy_bonus,
                                       summary_writer=writer,
                                       summary_every=10,
                                       loss_function=loss_function)

sampler = Sampler(pg_reinforce,
                  env,
                  gru_unit_size,
                  num_step,
                  num_layers,
                  max_step,
                  batch_size,
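# The recurrent policy network whose tail appears above (it produces `output`,
# `logit`, and `final_state`) is cut off. A minimal sketch of such a network,
# assuming a stacked GRU followed by linear policy and value heads; names not
# used above (recurrent_policy_network, init_states, w_policy, b_policy) are
# hypothetical:
def recurrent_policy_network(states, init_states):
    # states: [batch, num_step, state_dim]; init_states: tuple of per-layer GRU states
    cell = tf.nn.rnn_cell.MultiRNNCell(
        [tf.nn.rnn_cell.GRUCell(gru_unit_size) for _ in range(num_layers)])
    output, final_state = tf.nn.dynamic_rnn(cell, states, initial_state=init_states)
    flat = tf.reshape(output, [-1, gru_unit_size])

    w_policy = tf.get_variable("w_policy", shape=[gru_unit_size, num_actions])
    b_policy = tf.get_variable("b_policy", shape=[num_actions],
                               initializer=tf.constant_initializer(0))
    logit = tf.matmul(flat, w_policy) + b_policy

    w_value = tf.get_variable("w_value", shape=[gru_unit_size, 1])
    b_value = tf.get_variable("b_value", shape=[1],
                              initializer=tf.constant_initializer(0))
    value = tf.matmul(flat, w_value) + b_value
    return logit, final_state, value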
def play_ner():
    actions = 2
    global AGENT
    global robot_data
    env_ctl = initialise_game(robot_data, True)  # initialise a decision robot
    episode = 1
    sess = tf.Session()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
    episode_history = deque(maxlen=100)
    total_rewards = 0
    mean_rw = []
    print(">>>>>> Playing game ..")

    if AGENT == "random":
        robot = RobotRandom(actions)

    elif AGENT == "DQN":
        env = env_ctl.get_new_environment()
        observation = env.first_observation()
        robot = RobotCNNDQN(actions)
        while episode <= MAX_EPISODE:
            # print('>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE)
            action = robot.get_action(observation)
            # print('> Action', action)
            observation2, reward, terminal, info = env.step(action)
            total_rewards += reward
            # print('> Reward', reward)
            robot.update(observation, action, reward, observation2, terminal)
            observation = observation2
            if terminal:
                env = env_ctl.get_new_environment()
                observation = env.first_observation()
                episode_history.append(total_rewards)
                mean_rewards = np.mean(episode_history)
                print(mean_rewards)
                mean_rw.append(mean_rewards)
                total_rewards = 0
                episode += 1
                print('> Terminal <')
        # mean_rw = np.array(mean_rw)
        # np.save('./data/mean_rw.npy', mean_rw)

    elif AGENT == "RDQN":
        env = env_ctl.get_new_PR_environment()
        observation = env.first_observation()
        robot = RobotRDQN(actions)
        while episode <= MAX_EPISODE:
            # print('>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE)
            action = robot.get_action(observation)
            print('> Action', action)
            observation2, reward, terminal, info = env.step(action)
            print('> Reward', reward)
            robot.update(observation[0], action, reward, observation2[0], terminal)
            observation = observation2
            if terminal:
                env = env_ctl.get_new_PR_environment()
                observation = env.first_observation()
                episode += 1
                print('> Terminal <')

    elif AGENT == "PGRL":
        env = env_ctl.get_new_environment()
        observation = np.reshape(env.first_observation(), (1, state_dim))
        robot = PolicyGradientREINFORCE(sess, optimizer, policy_network,
                                        state_dim, actions, summary_writer=None)
        saver = tf.train.Saver(max_to_keep=None)
        while episode <= MAX_EPISODE:
            print('>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE)
            action = robot.sampleAction(observation)
            print('> Action', action)
            observation2, reward, terminal, info = env.step(action)
            print('> Reward', reward)
            robot.storeRollout(observation, action, reward)
            observation = np.reshape(observation2, (1, state_dim))
            if terminal:
                robot.updateModel()
                env = env_ctl.get_new_environment()
                if env is None:
                    env_ctl = initialise_game(robot_data, True)
                    env = env_ctl.get_new_PR_environment()
                observation = np.reshape(env.first_observation(), (1, state_dim))
                episode += 1
                if episode % 1000 == 0:
                    print("saving model" + "\n")
                    path = saver.save(sess, "./model/%s_robot_model" % AGENT,
                                      global_step=episode)
                    tempstr = 'have saved model to ' + path
                    print(tempstr)
                print('> Terminal <')

    elif AGENT == "PRPGRL":
        env = env_ctl.get_new_PR_environment()
        observation = (np.reshape(env.first_observation()[0], (1, state_dim)),
                       env.first_observation()[1])
        robot = PRPolicyGradientREINFORCE(sess, optimizer, policy_network,
                                          state_dim, actions, summary_writer=None)
        saver = tf.train.Saver(max_to_keep=None)
        while episode <= MAX_EPISODE:
            print('>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE)
            action = robot.sampleAction(observation)
            print('> Action', action)
            observation2, reward, terminal, info = env.step(action)
            print('> Reward', reward)
            robot.storeRollout(observation[0], action, reward)
            observation = (np.reshape(observation2[0], (1, state_dim)),
                           observation2[1])
            if terminal:
                robot.updateModel()
                env = env_ctl.get_new_PR_environment()
                if env is None:
                    env_ctl = initialise_game(robot_data, True)
                    env = env_ctl.get_new_PR_environment()
                observation = (np.reshape(env.first_observation()[0], (1, state_dim)),
                               env.first_observation()[1])
                episode += 1
                if episode % 1000 == 0:
                    print("saving model" + "\n")
                    path = saver.save(sess, './model/%s_robot_model' % AGENT,
                                      global_step=episode)
                    tempstr = 'have saved model to ' + path
                    print(tempstr)
                print('> Terminal <')

    else:
        print("** There is no robot.")
        raise SystemExit

    return robot
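# play_ner() relies on module-level globals (AGENT, robot_data, MAX_EPISODE,
# state_dim, policy_network). A hypothetical driver, with placeholder values that
# are not taken from the original script:
if __name__ == "__main__":
    AGENT = "PGRL"                  # one of "random", "DQN", "RDQN", "PGRL", "PRPGRL"
    MAX_EPISODE = 10000
    robot_data = load_robot_data()  # hypothetical loader for the game data
    trained_robot = play_ner()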
def policy_network(states):
    W1 = tf.get_variable(name="W1", shape=[state_dim, 20],
                         initializer=tf.random_normal_initializer())
    b1 = tf.get_variable(name="b1", shape=[20],
                         initializer=tf.constant_initializer(0))
    h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable(name="W2", shape=[20, num_actions],
                         initializer=tf.random_normal_initializer(stddev=0.1))
    b2 = tf.get_variable(name="b2", shape=[num_actions],
                         initializer=tf.constant_initializer(0))
    p = tf.matmul(h1, W2) + b2
    return p

pg_reinforce = PolicyGradientREINFORCE(sess,
                                       optimizer,
                                       policy_network,
                                       state_dim,
                                       num_actions,
                                       summary_writer=writer)

MAX_EPISODES = 10000
MAX_STEPS = 200

episode_history = deque(maxlen=100)
for e in range(MAX_EPISODES):
    # initialize
    state = env.reset()
    total_rewards = 0

    for t in range(MAX_STEPS):
        env.render()
        action = pg_reinforce.sampleAction(state[np.newaxis, :])
W1 = tf.get_variable("W1", [state_dim, 20], initializer=tf.random_normal_initializer()) b1 = tf.get_variable("b1", [20], initializer=tf.constant_initializer(0)) h1 = tf.nn.tanh(tf.matmul(states, W1) + b1) W2 = tf.get_variable("W2", [20, num_actions], initializer=tf.random_normal_initializer(stddev=0.1)) b2 = tf.get_variable("b2", [num_actions], initializer=tf.constant_initializer(0)) p = tf.matmul(h1, W2) + b2 return p pg_reinforce = PolicyGradientREINFORCE(sess, optimizer, policy_network, state_dim, num_actions, discount_factor=discount_factor, summary_writer=writer) NUM_ITR = 1000 BATCH_SIZE = 100 MAX_STEPS = 200 episode_history = deque(maxlen=100) for i_itr in xrange(NUM_ITR): episodes = [] total_rewards = 0 for i_batch in xrange(BATCH_SIZE): # initialize state = env.reset() rewards, states, actions, returns = [], [], [], []