Example 1
        mdp.add_frame(st_next)           # push the new raw observation onto the frame stack
        st_next = mdp.get_MDP_state()    # stacked frames form the next MDP state
        dt = 1 if Done else 0            # terminal flag (1 cuts off the Bellman backup)
        totalR += rt                     # accumulate the episode return

        # store transition
        R.StoreTransition(st, at, np.array([rt]), st_next, dt)
        st = st_next

        if episode_i > OBSERVATION_PHASE:
            for _ in range(BATCHES):
                # sample mini batch
                s_batch, a_batch, r_batch, stag_batch, terminal_batch, _ = R.SampleMiniBatch(
                    MINI_BATCH)

                # Bellman target: y = r + GAMMA * Q'(s', mu'(s')), zeroed on terminals
                Q_next = Critic.target_predict(
                    stag_batch, Actor.target_predict(stag_batch))
                Y = r_batch + GAMMA * Q_next * (1 - terminal_batch)

                Critic.train(Y, s_batch, a_batch)

                # deterministic policy gradient: dQ(s, a)/da evaluated at the
                # actor's own actions, not the replayed ones
                a_for_grad = Actor.predict(s_batch)
                grads = Critic.gradients(s_batch, a_for_grad)
                Actor.train(s_batch, grads)

                # soft-update the target networks
                Actor.target_train()
                Critic.target_train()

        if Done:
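Neither `Actor.target_train()` nor `Critic.target_train()` is shown in this excerpt; in DDPG these are conventionally Polyak (soft) updates that slowly track the online networks. A minimal NumPy sketch of that idea, assuming a hypothetical rate `TAU` and weights stored as lists of arrays (none of these names appear in the snippet above):

import numpy as np

TAU = 0.001  # hypothetical soft-update rate, not defined in the excerpt

def soft_update(target_weights, online_weights, tau=TAU):
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    return [tau * w + (1.0 - tau) * wt
            for w, wt in zip(online_weights, target_weights)]

# toy usage: the target weights drift slowly toward the online weights
online = [np.ones((2, 2))]
target = [np.zeros((2, 2))]
target = soft_update(target, online)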
Example 2
        st_next = mdp.get_MDP_state()    # stacked frames form the next MDP state
        dt = 1 if Done else 0            # terminal flag (1 cuts off the Bellman backup)
        totalR += rt                     # accumulate the episode return

        # store transition
        R.StoreTransition(st, np.array([a_index]), np.array([rt]), st_next, dt)
        st = st_next

        E_local = [0]    # per-episode record of training errors
        if episode_i > OBSERVATION_PHASE:
            for _ in range(BATCHES):
                # sample mini batch
                s_batch, a_batch, r_batch, stag_batch, terminal_batch, _ = R.SampleMiniBatch(
                    MINI_BATCH)

                # start from the online net's current predictions; only the
                # entries for the actions actually taken get overwritten below
                Y = Q.evaluate(sess, s_batch)

                # Double DQN variant (disabled): the online net picks the
                # greedy next action and the target net scores it.
                #Q_next_arg = Q.evaluate(sess, stag_batch)
                #Q_next_argmax = np.argmax(Q_next_arg, 1)
                #Q_next_target = Q_target.evaluate(sess, stag_batch)

                #a_batch = a_batch.astype(int)
                #for i in range(MINI_BATCH):
                #    Y[i, a_batch[i, 0]] = (r_batch[i, 0]
                #        + GAMMA * Q_next_target[i, Q_next_argmax[i]]
                #        * (1 - terminal_batch[i]))

                #error = Q.train(sess, s_batch, Y)

                # vanilla DQN: bootstrap from max_a Q_target(s', a)
                Q_next = Q_target.evaluate(sess, stag_batch)
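The disabled block and the live `Q_next` line compute two flavors of the same bootstrap. As a self-contained restatement, here is a vectorized sketch of both targets, assuming NumPy arrays `q_next_online` and `q_next_target` of shape `(batch, n_actions)` stand in for the `evaluate` outputs (the function names below are illustrative, not from the snippet):

import numpy as np

def dqn_target(Y, a, r, q_next_target, done, gamma):
    # vanilla DQN: bootstrap from max_a Q_target(s', a)
    idx = np.arange(len(Y))
    Y[idx, a] = r + gamma * q_next_target.max(axis=1) * (1 - done)
    return Y

def double_dqn_target(Y, a, r, q_next_online, q_next_target, done, gamma):
    # Double DQN: the online net selects the action,
    # the target net evaluates it
    idx = np.arange(len(Y))
    best = q_next_online.argmax(axis=1)
    Y[idx, a] = r + gamma * q_next_target[idx, best] * (1 - done)
    return Y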