Example #1
def train(sess, actor, critic):
    t = 0  # test counter

    sess.run(tf.global_variables_initializer())

    # initialize actor, critic and replay buffer
    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    s = Models(0, 0, -0.101485, 0.100951, 0.819996, -0.00146549, -1.27,
               4.11e-6, 2.26e-7, 0, 0, 0, 0, 0, 0, 0, 0, 0)

    print(s.current_state())

    for i in range(MAX_EPISODES):

        # run a test episode every 10 training episodes once the buffer is warm
        if i > 0 and i % 10 == 0 and replay_buffer.size() > MIN_BUFFER_SIZE:
            TEST = True
            t += 1
        else:
            TEST = False
        # initialize noise process
        noise = np.zeros(ACTION_DIMENSION)
        total_episode_reward = 0

        for j in range(MAX_EPISODE_LENGTH):
            s0 = s.current_state()
            a = compute_action(actor, s0, noise)
            # computing next step, reward and terminal
            s2 = s.next_states(s0, a)
            r = s.calc_reward(s2, s0)
            print(s.current_state())
            terminal = s.calc_terminal()

            if not TEST:
                replay_buffer.add(np.reshape(s0, (actor.s_dim, )),
                                  np.reshape(a, actor.a_dim), r, terminal,
                                  np.reshape(s2, (actor.s_dim, )))

            total_episode_reward += r

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if not TEST:
                if replay_buffer.size() > MIN_BUFFER_SIZE:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        MINIBATCH_SIZE)

                    # calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                    # ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

            if not TEST:
                write_csv_learn(s0, a, s2, terminal, r, total_episode_reward)

            else:
                write_csv_test(s0, a, s2, terminal, r, total_episode_reward)

            if terminal != 0:
                # print test counter, episode index, episode length and
                # total episode reward
                print(t, i, j, total_episode_reward)
                break

        s = s.reset()
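
Both examples call a compute_action helper that is not shown in the listings. Below is a minimal sketch of what it might look like, assuming the actor exposes the predict() method used above, that noise is an array of shape (ACTION_DIMENSION,), and that actions are clipped to a symmetric range (action_bound is a hypothetical parameter, not taken from the original code):

import numpy as np


def compute_action(actor, state, noise, action_bound=1.0):
    # query the deterministic policy for a single state (batch of one)
    action = actor.predict(np.reshape(state, (1, actor.s_dim)))[0]
    # add exploration noise and clip to the assumed action range
    return np.clip(action + noise, -action_bound, action_bound)
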
Example #2
def train(sess_2, actor, critic, mod, test, train_flag=False):
    t = 0  # test counter

    time = 0
    step = 0.03

    sess_2.run(tf.global_variables_initializer())

    # initialize actor, critic and replay buffer, and initial state
    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # the initial state also has to be changed in the init function below
    s = Models()

    # print s.current_state()

    for i in range(MAX_EPISODES):

        # run a test episode every 20 training episodes once the buffer is warm
        if (test and i > 0 and i % 20 == 0
                and replay_buffer.size() > MIN_BUFFER_SIZE):
            TEST = True
            t += 1
        else:
            TEST = False
        # initialize noise process
        noise = np.zeros(ACTION_DIMENSION)
        total_episode_reward = 0

        for j in range(MAX_EPISODE_LENGTH):
            s0 = s.current_state()
            a = compute_action(actor, s0, noise)
            model_input = (np.hstack([s0, a])).reshape(1, 24)
            s2 = mod.prediction(measured_input=model_input)
            s.import_state(s2[0])
            r = s.calc_reward(s2[0], s0)
            # print phase, s.current_state()
            terminal = s.calc_terminal(s2)

            if not TEST:
                replay_buffer.add(np.reshape(s0, (actor.s_dim, )),
                                  np.reshape(a, actor.a_dim), r, terminal,
                                  np.reshape(s2, (actor.s_dim, )))

            total_episode_reward += r

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MIN_BUFFER_SIZE:
                if not TEST:
                    train_flag = True
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        MINIBATCH_SIZE)

                    # calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                    # ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                # csv for animation
                if not TEST:
                    write_csv_animation_train(time, s0[0:9])
                    time = time + step
                    if terminal == 2:
                        time = 0
                else:
                    write_csv_animation_test(time, s0[0:9])
                    time = time + step
                    if terminal == 2:
                        time = 0

                if not TEST:
                    write_csv_learn(i - t, j, s0, a, s2[0], terminal, r,
                                    total_episode_reward)

                else:
                    write_csv_test(t, j, s0, a, s2[0], terminal, r,
                                   total_episode_reward)

            if terminal != 0:
                # print train flag, test counter, training episode index,
                # episode length and total episode reward
                print(train_flag, t, i - t, j, total_episode_reward)
                break

        s.reset()
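
Neither example includes the ReplayBuffer class itself. The following is a minimal sketch of the interface used above (add, size and sample_batch with uniform sampling), assuming a deque-backed buffer; the actual implementation may differ:

import random
from collections import deque

import numpy as np


class ReplayBuffer(object):

    def __init__(self, buffer_size, random_seed=123):
        # bounded FIFO buffer; oldest transitions are dropped when full
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        # store one transition (state, action, reward, terminal, next state)
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # uniformly sample a minibatch of stored transitions
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s_batch = np.array([e[0] for e in batch])
        a_batch = np.array([e[1] for e in batch])
        r_batch = np.array([e[2] for e in batch])
        t_batch = np.array([e[3] for e in batch])
        s2_batch = np.array([e[4] for e in batch])
        return s_batch, a_batch, r_batch, t_batch, s2_batch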