epi = 100
step_limit = 500
avg_steps = []
avg_reward = []
env = gym.make('CartPole-v0')  # gym environment kept for reference; the custom process model is used below

with tf.Session() as sess:
    sess.run(init)
    for i_episode in range(epi):
        # obs = env.reset()
        obs = model.reset()
        for step in range(1, model.Nsim + 1):
            # Evaluate the policy network to pick a discrete action for the current observation
            action_val = action.eval(feed_dict={X: obs.reshape(1, num_inputs)})
            action_picked = action_list[action_val[0][0]]
            # Apply the action as an increment to the manipulated input, then simulate one step
            model.u[step, 0] = model.u[step - 1, 0] + action_picked
            model.x[step, :] = model.next_state(model.x[step - 1, :], model.u[step, :])
            obs = model.x[step, :]
            reward = reward_calc(model.x[step, 1], 324.5)
            # obs, reward, done, _ = env.step(action_val[0][0])  # gym equivalent of the step above
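The helper reward_calc is called here but not defined in this listing. A minimal sketch of one plausible form, assuming the reward penalizes the squared deviation of the measured state from the 324.5 setpoint (this definition is an assumption, not the original implementation):

import numpy as np

def reward_calc(measurement, setpoint):
    # Hypothetical reward: negative squared tracking error, so the reward
    # is maximal (zero) when the measurement sits exactly on the setpoint.
    return -float((measurement - setpoint) ** 2)

Any shaping of this form keeps the reward dense, which helps the policy gradient updates early in training when the controller rarely reaches the setpoint.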
trainer = tf.train.GradientDescentOptimizer(learning_rate=rl.learning_rate)
updateModel = trainer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
rList = []

with tf.Session() as sess:
    sess.run(init)
    # Restore the previously trained Q-network weights
    saver.restore(sess, "/tmp/model.ckpt")
    for i in range(num_of_episodes):
        model.reset(random_init=True)
        rAll = 0
        for j in range(1, model.Nsim + 1):
            if j % rl.eval_period == 0:
                # Map the continuous measurement to a discrete state index
                s = rl.state_detection(model.x[j - 1, 1])
                # Greedy action and Q-values for the one-hot encoded state
                a, allQ = sess.run(
                    [predict, Qout],
                    feed_dict={inputs: np.identity(len(states))[s:s + 1]})
                number = np.random.rand()
                if number < 0.0:  # epsilon = 0: exploration disabled while evaluating the restored policy
                    a = [np.random.randint(0, len(actions))]
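Because the threshold is hard-coded to 0.0, the random branch never fires and the restored network is always evaluated greedily. During training one would normally decay the exploration rate over episodes instead. A minimal sketch of such a schedule, using a hypothetical epsilon_greedy helper and decay constants that are assumptions rather than values from the original code:

import numpy as np

def epsilon_greedy(q_values, n_actions, episode,
                   eps_start=1.0, eps_min=0.01, decay=0.995):
    # Hypothetical decaying epsilon-greedy rule: with probability epsilon
    # pick a uniformly random action, otherwise act greedily on Q-values.
    epsilon = max(eps_min, eps_start * decay ** episode)
    if np.random.rand() < epsilon:
        return np.random.randint(0, n_actions)
    return int(np.argmax(q_values))

Substituting this for the fixed 0.0 threshold would recover the usual exploration/exploitation trade-off during training while still converging to greedy behavior as epsilon approaches eps_min.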