Example 1
import gym
import tensorflow as tf

epi = 100          # number of episodes to run
step_limit = 500   # cap on steps per episode
avg_steps = []     # per-episode step counts
avg_reward = []    # per-episode accumulated rewards

env = gym.make('CartPole-v0')  # only used by the commented-out Gym rollout below

with tf.Session() as sess:
    sess.run(init)

    for i_episode in range(epi):

        # obs = env.reset()   # original Gym rollout, replaced by the custom process model
        obs = model.reset()

        for step in range(1, model.Nsim + 1):

            action_val = action.eval(feed_dict={X: obs.reshape(1, num_inputs)})
            action_picked = action_list[action_val[0][0]]  # map network output to a control increment
            model.u[step, 0] = model.u[step - 1, 0] + action_picked  # apply increment to previous input

            model.x[step, :] = model.next_state(model.x[step - 1, :],
                                                model.u[step, :])  # simulate one plant step

            obs = model.x[step, :]  # next observation is the full plant state
            reward = reward_calc(model.x[step, 1], 324.5)  # reward relative to the 324.5 setpoint

            # obs, reward, done, _ = env.step(action_val[0][0])
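
The snippet above relies on several names defined outside it: init, X, action, action_list, num_inputs, and reward_calc. A minimal sketch of plausible definitions follows, assuming a small TF1 policy network that samples one action per observation (matching the action_val[0][0] indexing) and a simple setpoint-tracking reward; the layer sizes, action increments, and reward shape are illustrative assumptions, not the original code.

import tensorflow as tf

num_inputs = 4                  # assumed observation dimension
action_list = [-1.0, 0.0, 1.0]  # hypothetical control increments

# Small policy network; sizes are assumptions.
X = tf.placeholder(tf.float32, shape=[None, num_inputs])
hidden = tf.layers.dense(X, 16, activation=tf.nn.relu)
logits = tf.layers.dense(hidden, len(action_list))
# Sample one action index per row; result shape (1, 1) matches action_val[0][0].
action = tf.multinomial(logits, num_samples=1)

def reward_calc(measurement, setpoint):
    # Hypothetical reward: larger as the measurement approaches the setpoint.
    return -abs(measurement - setpoint)

init = tf.global_variables_initializer()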
Example 2
import numpy as np
import tensorflow as tf

trainer = tf.train.GradientDescentOptimizer(learning_rate=rl.learning_rate)
updateModel = trainer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

rList = []

with tf.Session() as sess:

    sess.run(init)
    saver.restore(sess, "/tmp/model.ckpt")  # restore overwrites the freshly initialized variables

    for i in range(num_of_episodes):

        model.reset(random_init=True)
        rAll = 0

        for j in range(1, model.Nsim + 1):

            if j % rl.eval_period == 0:
                s = rl.state_detection(model.x[j - 1, 1])  # discretize the measured state

                # Feed state s as a one-hot row; read back the greedy action and all Q-values.
                a, allQ = sess.run(
                    [predict, Qout],
                    feed_dict={inputs: np.identity(len(states))[s:s + 1]})

            # Epsilon-greedy exploration hook; with the threshold at 0.0 this branch
            # never fires, so the restored policy acts purely greedily.
            number = np.random.rand()
            if number < 0.0:
                a = [np.random.randint(0, len(actions))]
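
Example 2 likewise assumes a Q-network (inputs, Qout, predict) and a loss built before the snippet. The np.identity(len(states))[s:s + 1] feed implies the common one-hot-input Q-network pattern; a minimal sketch under that assumption follows, where the state set, action set, and the nextQ target placeholder are all illustrative.

import numpy as np
import tensorflow as tf

states = list(range(11))    # assumed discretized state set
actions = [-1.0, 0.0, 1.0]  # assumed action set

# One-hot state vector in, one Q-value per action out.
inputs = tf.placeholder(tf.float32, shape=[1, len(states)])
W = tf.Variable(tf.random_uniform([len(states), len(actions)], 0, 0.01))
Qout = tf.matmul(inputs, W)
predict = tf.argmax(Qout, axis=1)  # greedy action; a[0] indexes into it

# Squared error between a Bellman target fed via nextQ and the current estimate.
nextQ = tf.placeholder(tf.float32, shape=[1, len(actions)])
loss = tf.reduce_sum(tf.square(nextQ - Qout))

With these definitions, trainer.minimize(loss) in the snippet performs one gradient step toward whatever target Q-values are fed through nextQ.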