Example #1
# Imports used by this snippet (TensorFlow 1.x API); PolicyGradientREINFORCE
# and policy_network are defined elsewhere in the project.
from collections import deque

import gym
import numpy as np
import tensorflow as tf


def main():
    global state_dim, num_actions

    env_name = 'CartPole-v0'
    env = gym.make(env_name)

    sess = tf.Session()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
    writer = tf.summary.FileWriter("/tmp/{}-experiment-1".format(env_name))

    state_dim = env.observation_space.shape[0]
    num_actions = env.action_space.n

    pg_reinforce = PolicyGradientREINFORCE(sess,
                                           optimizer,
                                           policy_network,
                                           state_dim,
                                           num_actions,
                                           summary_writer=writer,
                                           discount_factor=1.0)

    MAX_EPISODES = 10000
    MAX_STEPS = 200

    episode_history = deque(maxlen=100)
    for i_episode in range(MAX_EPISODES):

        # initialize
        state = env.reset()
        total_rewards = 0

        for t in range(MAX_STEPS):
            #env.render()
            print("state", state, state[np.newaxis, :])
            action = pg_reinforce.sampleAction(state[np.newaxis, :])
            next_state, reward, done, _ = env.step(action)

            total_rewards += reward
            reward = -10 if done else 0.1  # shape reward: -10 on failure, +0.1 per surviving step
            #print("action, next_state, reward, done", action, next_state, reward, done)
            pg_reinforce.storeRollout(state, action, reward)

            state = next_state
            if done: break

        pg_reinforce.updateModel()

        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)

        print("Episode {}".format(i_episode))
        print("Finished after {} timesteps".format(t + 1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last 100 episodes: {:.2f}".format(
            mean_rewards))
        if mean_rewards >= 195.0 and len(episode_history) >= 100:
            print("Environment {} solved after {} episodes".format(
                env_name, i_episode + 1))
            break
        print()
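Example #1 drives training entirely through sampleAction, storeRollout, and updateModel. For orientation, the update a REINFORCE agent like this typically performs minimizes the negative log-probability of the sampled actions weighted by their returns; the sketch below illustrates that objective in the same TF 1.x style, with `logits`, `taken_actions`, and `discounted_returns` as assumed placeholder tensors rather than names taken from the actual PolicyGradientREINFORCE implementation.

# Illustrative REINFORCE objective (not the library's actual code):
# the cross-entropy against the sampled actions equals -log pi(a_t | s_t).
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=taken_actions)
pg_loss = tf.reduce_mean(cross_entropy * discounted_returns)
train_op = optimizer.minimize(pg_loss)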
Example #2
                         initializer=tf.truncated_normal_initializer())
    b1 = tf.get_variable("b1", [20], initializer=tf.constant_initializer(0))
    h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable(
        "W2", [20, num_actions],
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    b2 = tf.get_variable("b2", [num_actions],
                         initializer=tf.constant_initializer(0))
    p = tf.matmul(h1, W2) + b2
    return p


pg_reinforce = PolicyGradientREINFORCE(policy_session,
                                       policy_optimizer,
                                       policy_network,
                                       state_dim,
                                       entropy_bonus=entropy_bonus,
                                       summary_writer=policy_writer,
                                       summary_every=policy_summary_every)

# Initializing Sampler
sampler = Sampler(pg_reinforce, env)

# Q-network parameters
q_session = tf.Session()
q_optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
q_writer = tf.summary.FileWriter("/home/drl/DRL/tensorflow-reinforce/tmp/q/")
q_summary_every = 10


def action_masker(array):
Example #3
   """ define policy neural network """
   W1 = tf.get_variable("W1", [state_dim, 20],
                        initializer=tf.random_normal_initializer())
   b1 = tf.get_variable("b1", [20],
                        initializer=tf.constant_initializer(0))
   h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)
   W2 = tf.get_variable("W2", [20, num_actions],
                        initializer=tf.random_normal_initializer(stddev=0.1))
   b2 = tf.get_variable("b2", [num_actions],
                        initializer=tf.constant_initializer(0))
   p = tf.matmul(h1, W2) + b2
   return p

pg_reinforce = PolicyGradientREINFORCE(sess,
                                       optimizer,
                                       policy_network,
                                       state_dim,
                                       summary_writer=writer,
                                       summary_every=1)

sampler = Sampler(pg_reinforce, env)

reward = []
for _ in tqdm(range(30)):
    batch = sampler.collect_one_batch()
    pg_reinforce.update_parameters(batch["states"], batch["actions"],
                                   batch["monte_carlo_returns"])
    reward.append(batch["rewards"].sum()/200)

show_image(reward)


# batch = sampler.collect_one_batch()
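The monte_carlo_returns passed to update_parameters in Example #3 are per-step discounted returns over each sampled episode. A minimal sketch of how such returns can be computed from a reward sequence (the function name and `gamma` value are illustrative; the actual Sampler implementation may differ):

def discounted_returns(rewards, gamma=0.99):
    # Walk the rewards backwards, accumulating G_t = r_t + gamma * G_{t+1}.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns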
Example #4
                                  shape=[1],
                                  initializer=tf.constant_initializer(0))

    value = (tf.matmul(tf.reshape(output, [-1, gru_unit_size]), w_value) +
             b_value)
    return logit, final_state, value


pg_reinforce = PolicyGradientREINFORCE(sess,
                                       optimizer,
                                       policy_network,
                                       state_dim,
                                       num_actions,
                                       gru_unit_size,
                                       num_step,
                                       num_layers,
                                       save_path + env.spec.id,
                                       global_step,
                                       max_gradient=max_gradient,
                                       entropy_bonus=entropy_bonus,
                                       summary_writer=writer,
                                       summary_every=10,
                                       loss_function=loss_function)

sampler = Sampler(pg_reinforce,
                  env,
                  gru_unit_size,
                  num_step,
                  num_layers,
                  max_step,
                  batch_size,
Example #5
def play_ner():
    actions = 2
    global AGENT

    global robot_data
    env_ctl = initialise_game(robot_data, True)
    # initialise a decision robot
    episode = 1
    sess = tf.Session()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
    episode_history = deque(maxlen=100)
    total_rewards = 0

    mean_rw = []
    print(">>>>>> Playing game ..")
    if AGENT == "random":
        robot = RobotRandom(actions)
    elif AGENT == "DQN":
        env = env_ctl.get_new_environment()
        observation = env.first_observation()
        robot = RobotCNNDQN(actions)
        while episode <= MAX_EPISODE:
            #print '>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE
            action = robot.get_action(observation)
            #print '> Action', action
            observation2, reward, terminal, info = env.step(action)

            total_rewards += reward
            #print '> Reward', reward
            robot.update(observation, action, reward, observation2, terminal)
            observation = observation2
            if terminal:
                env = env_ctl.get_new_environment()
                observation = env.first_observation()
                episode_history.append(total_rewards)
                mean_rewards = np.mean(episode_history)
                print(mean_rewards)
                mean_rw.append(mean_rewards)
                total_rewards = 0
                episode += 1
                print('> Terminal <')
    #mean_rw=np.array(mean_rw)
    #np.save('./data/mean_rw.npy',mean_rw)
    elif AGENT == "RDQN":
        env = env_ctl.get_new_PR_environment()

        observation = env.first_observation()
        robot = RobotRDQN(actions)
        while episode <= MAX_EPISODE:
            #print '>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE
            action = robot.get_action(observation)
            print('> Action', action)
            observation2, reward, terminal, info = env.step(action)
            print('> Reward', reward)
            robot.update(observation[0], action, reward, observation2[0],
                         terminal)
            observation = observation2
            if terminal:
                env = env_ctl.get_new_PR_environment()
                observation = env.first_observation()
                episode += 1
                print('> Terminal <')
    elif AGENT == "PGRL":
        env = env_ctl.get_new_environment()
        observation = np.reshape(env.first_observation(), (1, state_dim))
        robot = PolicyGradientREINFORCE(sess,
                                        optimizer,
                                        policy_network,
                                        state_dim,
                                        actions,
                                        summary_writer=None)
        saver = tf.train.Saver(max_to_keep=None)
        while episode <= MAX_EPISODE:
            print('>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE)
            action = robot.sampleAction(observation)
            print('> Action', action)
            observation2, reward, terminal, info = env.step(action)
            print('> Reward', reward)
            robot.storeRollout(observation, action, reward)
            observation = np.reshape(observation2, (1, state_dim))
            if terminal:
                robot.updateModel()
                env = env_ctl.get_new_environment()
                if env is None:
                    env_ctl = initialise_game(robot_data, True)
                    env = env_ctl.get_new_PR_environment()
                observation = np.reshape(env.first_observation(),
                                         (1, state_dim))
                episode += 1
                if episode % 1000 == 0:
                    print("saving model" + "\n")
                    path = saver.save(sess,
                                      "./model/%s_robot_model" % AGENT,
                                      global_step=episode)
                    tempstr = 'have saved model to ' + path
                    print(tempstr)
                print('> Terminal <')

    elif AGENT == "PRPGRL":
        env = env_ctl.get_new_PR_environment()

        observation = (np.reshape(env.first_observation()[0],
                                  (1, state_dim)), env.first_observation()[1])
        robot = PRPolicyGradientREINFORCE(sess,
                                          optimizer,
                                          policy_network,
                                          state_dim,
                                          actions,
                                          summary_writer=None)
        saver = tf.train.Saver(max_to_keep=None)
        while episode <= MAX_EPISODE:
            print('>>>>>>> Current game round ', episode, 'Maximum ', MAX_EPISODE)
            action = robot.sampleAction(observation)
            print('> Action', action)
            observation2, reward, terminal, info = env.step(action)
            print('> Reward', reward)
            robot.storeRollout(observation[0], action, reward)
            observation = (np.reshape(observation2[0],
                                      (1, state_dim)), observation2[1])
            if terminal:
                robot.updateModel()
                env = env_ctl.get_new_PR_environment()
                if env is None:
                    env_ctl = initialise_game(robot_data, True)
                    env = env_ctl.get_new_PR_environment()
                observation = (np.reshape(env.first_observation()[0],
                                          (1, state_dim)),
                               env.first_observation()[1])
                episode += 1
                if episode % 1000 == 0:
                    print("saving model" + "\n")
                    path = saver.save(sess,
                                      './model/%s_robot_model' % AGENT,
                                      global_step=episode)
                    tempstr = 'have saved model to ' + path
                    print(tempstr)
                print('> Terminal <')
    else:
        print("** There is no robot.")
        raise SystemExit
    return robot
Example #6
    h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)

    W2 = tf.get_variable(name="W2",
                         shape=[20, num_actions],
                         initializer=tf.random_normal_initializer(stddev=0.1))
    b2 = tf.get_variable(name="b2",
                         shape=[num_actions],
                         initializer=tf.constant_initializer(0))
    p = tf.matmul(h1, W2) + b2

    return p


pg_reinforce = PolicyGradientREINFORCE(sess,
                                       optimizer,
                                       policy_network,
                                       state_dim,
                                       num_actions,
                                       summary_writer=writer)

MAX_EPISODES = 10000
MAX_STEPS = 200
episode_history = deque(maxlen=100)

for e in range(MAX_EPISODES):
    # initialize
    state = env.reset()
    total_rewards = 0

    for t in range(MAX_STEPS):
        env.render()
        action = pg_reinforce.sampleAction(state[np.newaxis, :])
Example #7
    W1 = tf.get_variable("W1", [state_dim, 20],
                         initializer=tf.random_normal_initializer())
    b1 = tf.get_variable("b1", [20], initializer=tf.constant_initializer(0))
    h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [20, num_actions],
                         initializer=tf.random_normal_initializer(stddev=0.1))
    b2 = tf.get_variable("b2", [num_actions],
                         initializer=tf.constant_initializer(0))
    p = tf.matmul(h1, W2) + b2
    return p


pg_reinforce = PolicyGradientREINFORCE(sess,
                                       optimizer,
                                       policy_network,
                                       state_dim,
                                       num_actions,
                                       discount_factor=discount_factor,
                                       summary_writer=writer)
NUM_ITR = 1000
BATCH_SIZE = 100
MAX_STEPS = 200

episode_history = deque(maxlen=100)
for i_itr in range(NUM_ITR):
    episodes = []
    total_rewards = 0
    for i_batch in range(BATCH_SIZE):
        # initialize
        state = env.reset()
        rewards, states, actions, returns = [], [], [], []