def replay_train(critic: Critic, critic_copy: Critic, actor: Actor,
                 actor_copy: Actor, train_batch):
    # Seed each stack with a dummy first row; it is dropped again before training.
    state_stack = np.empty(input_size).reshape(1, input_size)
    action_stack = np.empty(output_size).reshape(1, output_size)
    sampled_action_stack = np.empty(output_size).reshape(1, output_size)
    y_stack = np.empty(output_size).reshape(1, output_size)

    for state, action, reward, next_state, done in train_batch:
        # Single-row buffers for this transition's action, policy action and target.
        a = np.empty(output_size).reshape(1, output_size)
        s_a = np.empty(output_size).reshape(1, output_size)
        y = np.empty(output_size).reshape(1, output_size)

        # The target (copy) networks provide the bootstrapped action and Q-value.
        sampled_action_copy = actor_copy.action(next_state)
        # The online actor's action is kept for the deterministic policy gradient.
        sampled_action = actor.action(state)
        sampled_q_value = critic_copy.q_value(next_state, sampled_action_copy)
        state = np.reshape(state, newshape=(1, input_size))

        # Bellman target: the raw reward on terminal transitions,
        # otherwise reward plus the discounted target-critic estimate.
        if done:
            y[0, output_size - 1] = reward
        else:
            y[0, output_size - 1] = reward + dis * sampled_q_value[0][0]

        a[0, output_size - 1] = action
        s_a[0, output_size - 1] = sampled_action

        state_stack = np.vstack([state_stack, state])
        action_stack = np.vstack([action_stack, a])
        sampled_action_stack = np.vstack([sampled_action_stack, s_a])
        y_stack = np.vstack([y_stack, y])

    # Remove the dummy first row from each stack.
    state_stack = np.delete(state_stack, 0, 0)
    action_stack = np.delete(action_stack, 0, 0)
    sampled_action_stack = np.delete(sampled_action_stack, 0, 0)
    y_stack = np.delete(y_stack, 0, 0)

    # Fit the critic to the Bellman targets, then push the actor along dQ/da.
    loss, _ = critic.update(state_stack, action_stack, y_stack)
    gradient = critic.get_gradient(state_stack, sampled_action_stack)
    actor.update(state_stack, gradient)

    return loss
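
The Actor and Critic classes themselves are not shown in this example. As a rough sketch only (function names, shapes, and the learning rate are assumptions, not the original API), the ops behind critic.get_gradient() and actor.update() in a DDPG setup usually look like this: the critic exposes dQ/da with respect to its action input, and the actor back-propagates that gradient through its own output.

import tensorflow as tf

def build_ddpg_actor_ops(actor_output, actor_vars, n_action, learning_rate=1e-4):
    # Placeholder for the action gradient dQ/da delivered by the critic.
    action_gradient = tf.placeholder(tf.float32, [None, n_action])
    # Chain rule: dJ/dtheta = dQ/da * da/dtheta; negated so Adam ascends Q.
    actor_gradients = tf.gradients(actor_output, actor_vars, -action_gradient)
    train_op = tf.train.AdamOptimizer(learning_rate).apply_gradients(
        list(zip(actor_gradients, actor_vars)))
    return action_gradient, train_op

def build_ddpg_critic_gradient(q_output, action_placeholder):
    # dQ(s, a)/da with respect to the action input, fed to the actor ops above.
    return tf.gradients(q_output, action_placeholder)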
def bot_play(actor: Actor):
    # See our trained network in action
    s = env.reset()
    reward_sum = 0

    while True:
        env.render()
        a = actor.action(s)
        s, reward, done, _ = env.step(a)
        reward_sum += reward

        if done:
            print("Total score: {}".format(reward_sum))
            break
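
Main() in the next example also relies on a get_copy_var_ops() helper that is not included here. A minimal sketch, assuming it builds hard-copy ops for initialization and Polyak ("soft") update ops for the target networks; the default tau value is an assumption.

import tensorflow as tf

def get_copy_var_ops(dest_vars, src_vars, op_name="hard", tau=0.001):
    ops = []
    for dest, src in zip(dest_vars, src_vars):
        if op_name == "soft":
            # Slowly track the online network: dest <- tau*src + (1 - tau)*dest
            ops.append(dest.assign(tau * src.value() + (1.0 - tau) * dest.value()))
        else:
            # Hard copy used once after initialization: dest <- src
            ops.append(dest.assign(src.value()))
    return ops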
Example #3
def Main():
    max_episodes = 50000
    replay_buffer = deque()

    with tf.name_scope("network"):
        actor = Actor(n_state=input_size,
                      n_action=output_size,
                      n_layers=1,
                      n_units=400,
                      scope="actor")
        actor_copy = Actor(n_state=input_size,
                           n_action=output_size,
                           n_layers=1,
                           n_units=400,
                           scope="a_copy")
        critic = Critic(n_state=input_size,
                        n_action=output_size,
                        n_layers=1,
                        n_units=400,
                        scope="critic")
        critic_copy = Critic(n_state=input_size,
                             n_action=output_size,
                             n_layers=1,
                             n_units=400,
                             scope="c_copy")

    with tf.name_scope("train"):
        actor_copy_ops = get_copy_var_ops(actor_copy.get_variables(),
                                          actor.get_variables())
        # get_copy_var_ops(dest_scope_name="actor_copy", src_scope_name="actor")
        critic_copy_ops = get_copy_var_ops(critic_copy.get_variables(),
                                           critic.get_variables())
        # get_copy_var_ops(dest_scope_name="critic_copy", src_scope_name="critic")
        actor_soft_copy_ops = get_copy_var_ops(actor_copy.get_variables(),
                                               actor.get_variables(), "soft")
        # get_copy_var_ops(dest_scope_name="actor_copy", src_scope_name="actor", op_name="soft")
        critic_soft_copy_ops = get_copy_var_ops(critic_copy.get_variables(),
                                                critic.get_variables(), "soft")
        # get_copy_var_ops(dest_scope_name="critic_copy", src_scope_name="critic", op_name="soft")

    with tf.name_scope("miscellaneous"):
        init = tf.global_variables_initializer()
        noise_generator = Uhlenbeck(action_dimension=output_size, mu=0.6)
        saver = tf.train.Saver()

    with tf.Session() as sess:
        # initialize variables
        sess.run(init)
        # copy the variables
        sess.run([actor_copy_ops, critic_copy_ops])
        # set the current session to models
        actor.set_session(sess)
        actor_copy.set_session(sess)
        critic.set_session(sess)
        critic_copy.set_session(sess)
        # iterate through the episodes
        for episode in range(max_episodes):
            done = False
            step_count = 0
            state = env.reset()
            noise_generator.reset()
            loss = 0.0
            while not done:
                env.render()
                # Add temporally correlated exploration noise to the policy action.
                action = actor.action(state) + noise_generator.noise()
                next_state, reward, done, _ = env.step(action)

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count % 100 == 1:
                    print("Step {}, chosed action {}, reward {}".format(
                        step_count, action, reward))

                # Start training only once a full mini-batch is available.
                if len(replay_buffer) < 64:
                    continue

                mini_batch = random.sample(replay_buffer, 64)
                loss = replay_train(critic, critic_copy, actor, actor_copy,
                                    mini_batch)
                # Soft-update the target networks toward the online networks.
                sess.run([actor_soft_copy_ops, critic_soft_copy_ops])

                if done:
                    print("Loss : {}".format(loss))

            if episode % 10 == 1:
                print("Episode: {} steps: {}".format(episode, step_count))
                print("Loss : {}".format(loss))
                save_path = saver.save(sess, "./model.ckpt")
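
The Uhlenbeck noise generator instantiated in Main() is likewise defined outside this example. A minimal sketch of a standard Ornstein-Uhlenbeck process with the same reset()/noise() interface; the theta and sigma defaults are assumptions.

import numpy as np

class Uhlenbeck:
    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode from the mean so early actions are unbiased.
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # dx = theta*(mu - x) + sigma*N(0, 1): temporally correlated noise.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state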