Example #1
    for t in xrange(1, STEPS + 1):
        if DISPLAY:
            env.render()
        at = Actor.predict(st) + n.Sample()  # deterministic actor output plus exploration noise
        # execute action
        st_next, rt, Done, _ = env.step(at[0])
        mdp.add_frame(st_next)
        st_next = mdp.get_MDP_state()
        dt = 1 if Done else 0  # terminal flag: masks the bootstrap term in the target
        totalR += rt

        # store transition
        R.StoreTransition(st, at, np.array([rt]), st_next, dt)
        st = st_next

        if episode_i > OBSERVATION_PHASE:
            for mini_batch in xrange(BATCHES):
                # sample mini batch
                s_batch, a_batch, r_batch, stag_batch, terminal_batch, _ = R.SampleMiniBatch(
                    MINI_BATCH)

                # Bellman target: bootstrap from the target critic only for non-terminal transitions
                Q_next = Critic.target_predict(
                    stag_batch, Actor.target_predict(stag_batch))
                Y = r_batch + GAMMA * Q_next * (1 - terminal_batch)

                Critic.train(Y, s_batch, a_batch)

                a_for_grad = Actor.predict(s_batch)
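The snippet ends just after fetching `a_for_grad`; in DDPG the loop typically continues by training the actor along the critic's action gradients and then softly updating both target networks, which is what the `target_predict` calls above rely on. The sketch below only illustrates that soft (Polyak) target update; `TAU` and the weight lists are illustrative assumptions, not names from the project.

import numpy as np

TAU = 0.001  # assumed soft-update rate

# Hypothetical weight lists standing in for the online and target network parameters.
online_weights = [np.random.randn(3, 2), np.random.randn(2)]
target_weights = [w.copy() for w in online_weights]

# Polyak averaging: the target network slowly tracks the online network,
# which keeps the bootstrap targets Y = r + GAMMA * Q_target(s', a') stable.
target_weights = [TAU * w + (1.0 - TAU) * wt
                  for w, wt in zip(online_weights, target_weights)]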
Example #2
File: DQN.py  Project: ataitler/DQN
		# anneal the exploration probability until it reaches EPSILON
		if EXP_PROB > EPSILON:
			EXP_PROB -= ann_fric

		# execute action
		st_next, rt, Done, _ = env.step(at)
		mdp.add_frame(st_next)
		rt = rt + T
		st_next = mdp.get_MDP_state()
		dt = 1 if Done else 0  # terminal flag: masks the bootstrap term in the target
		totalR += rt
		
		# store transition
		R.StoreTransition(st, np.array([a_index]), np.array([rt]), st_next, dt)
		st = st_next

		if episode_i > OBSERVATION_PHASE:
			for mini_batch in xrange(BATCHES):
				# sample mini batch
				s_batch, a_batch, r_batch, stag_batch, terminal_batch = R.SampleMiniBatch(MINI_BATCH)
				
				# current Q-values are the regression base; only the taken action's entry changes
				Y = Q.evaluate(sess, s_batch)

				# greedy next-state value from the target network
				Q_next = Q_target.evaluate(sess, stag_batch)
				Q_next_max = np.amax(Q_next, 1)

				# Bellman targets for the actions actually taken in the batch
				a_batch = a_batch.astype(int)
				for i in range(MINI_BATCH):
					Y[i, a_batch[i, 0]] = r_batch[i, 0] + GAMMA * Q_next_max[i] * (1 - terminal_batch[i])
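The per-sample loop above can also be written as a single vectorized assignment. The sketch below is self-contained with made-up shapes (32 transitions, 4 discrete actions); the array names mirror the snippet, but the data is dummy.

import numpy as np

# Dummy data with the shapes the loop above assumes.
MINI_BATCH, N_ACTIONS, GAMMA = 32, 4, 0.99
Y = np.random.rand(MINI_BATCH, N_ACTIONS)            # stands in for Q.evaluate(sess, s_batch)
Q_next_max = np.random.rand(MINI_BATCH)              # stands in for max over actions of the target Q
a_batch = np.random.randint(N_ACTIONS, size=(MINI_BATCH, 1))
r_batch = np.random.rand(MINI_BATCH, 1)
terminal_batch = np.random.randint(2, size=MINI_BATCH)

# Vectorized Bellman targets: only the entries of the taken actions are overwritten.
rows = np.arange(MINI_BATCH)
Y[rows, a_batch[:, 0]] = r_batch[:, 0] + GAMMA * Q_next_max * (1 - terminal_batch)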