Example #1
                Q_next = Q_target.evaluate(sess, stag_batch)
                Q_next_max = np.amax(Q_next, 1)

                a_batch = a_batch.astype(int)
                for i in range(MINI_BATCH):
                    Y[i, a_batch[i, 0]] = r_batch[
                        i, 0] + GAMMA * Q_next_max[i] * (1 - terminal_batch[i])

                # train on estimated Q next and rewards
                error = Q.train(sess, s_batch, Y)

        if Done:
            break

    # run validation simulations
    L.AddRecord('network_left',
                simulator.SimulateNeuralEpisode(Q, sess, env_left, False))
    L.AddRecord('network_middle',
                simulator.SimulateNeuralEpisode(Q, sess, env_middle, False))
    L.AddRecord('network_right',
                simulator.SimulateNeuralEpisode(Q, sess, env_right, False))
    L.AddRecord(
        'policy_left',
        simulator.SimulatePolicyEpisode(policy, discretizer, env_left, False))
    L.AddRecord(
        'policy_middle',
        simulator.SimulatePolicyEpisode(policy, discretizer, env_middle,
                                        False))
    L.AddRecord(
        'policy_right',
        simulator.SimulatePolicyEpisode(policy, discretizer, env_right, False))
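The loop above fills the Bellman targets one sample at a time. The same update can be written in vectorized NumPy; a minimal sketch, assuming the shapes used in the snippet (2-D a_batch/r_batch, 1-D Q_next_max/terminal_batch) and illustrative values for the constants:

import numpy as np

GAMMA = 0.99      # discount factor (illustrative value)
MINI_BATCH = 32   # minibatch size (illustrative value)

def build_targets(Y, a_batch, r_batch, Q_next_max, terminal_batch):
    # y = r + gamma * max_a' Q_target(s', a'), zeroing the bootstrap on terminal steps
    rows = np.arange(MINI_BATCH)
    cols = a_batch[:, 0].astype(int)
    Y[rows, cols] = r_batch[:, 0] + GAMMA * Q_next_max * (1 - terminal_batch)
    return Y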
Example #2
					error = Q.train_output(sess, s_batch, Y)
				else:
					error = Q.train(sess, s_batch, Y)
				E_local.append(error)

		E_local = sum(E_local)/len(E_local)
		totalE += E_local

		if Done:
			break

	totalE = totalE/t

	# run validation simulations
	a = simulator.SimulateNeuralEpisode(Q, sess, env_left, False)
	L.AddRecord('network_left', a)
	L.AddRecord('network_middle',simulator.SimulateNeuralEpisode(Q, sess, env_middle, False))
	L.AddRecord('network_right',simulator.SimulateNeuralEpisode(Q, sess, env_right, False))
	L.AddRecord('policy_left',simulator.SimulatePolicyEpisode(policy,discretizer, env_left, False))
	L.AddRecord('policy_middle',simulator.SimulatePolicyEpisode(policy,discretizer, env_middle, False))
	L.AddRecord('policy_right',simulator.SimulatePolicyEpisode(policy,discretizer, env_right, False))
	L.AddRecord('total_reward', totalR)
	L.AddRecord('error', totalE)
	s_est, _, _, _, _, num = R.SampleMiniBatch(V_EST)
	Q_est_arg = Q.evaluate(sess, s_est)
	Q_est_argmax = np.argmax(Q_est_arg,1)*1.0
	V_est = Q_est_argmax.sum()/num*1.0
	L.AddRecord('estimated_value', V_est)
	
	# update target network
	if steps >= C_STEPS:
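Example #2 is cut off at the periodic target-network update (if steps >= C_STEPS:). A minimal sketch of such a hard update in the TF1 style used throughout these examples, assuming the online and target networks expose matching variable lists; the names online_vars, target_vars and copy_ops are hypothetical, not taken from the project:

import tensorflow as tf

def make_copy_ops(online_vars, target_vars):
    # one assign op per variable pair: target <- online
    return [tf.assign(t, o) for o, t in zip(online_vars, target_vars)]

# sketch of use inside the training loop:
# if steps >= C_STEPS:
#     sess.run(copy_ops)   # hard copy of the online weights into the target network
#     steps = 0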
Example #3
		temp = []
		for l in xrange(numlogs):
			temp.append(all_logs[l][i])
		avg.append(sum(temp)/len(temp)*1.0)			

	avgep = []
	var = 0
	avgep_up = []
	avgep_down = []
	for i in xrange(length-LEN):
		a = sum(avg[i:i+LEN])/LEN*1.0
		var = np.sqrt(np.var(avg[i:i+LEN]))
		avgep.append(a)
		avgep_up.append(avgep[i] + var)
		avgep_down.append(avgep[i] - var)
		L.AddRecord(logname,avgep[i])
		L.AddRecord(logname+'_up',avgep_up[i])
		L.AddRecord(logname+'_down',avgep_down[i])
	
L.Save(OUTLOG)
t = np.arange(1,len(avgep)+1)
var_up = np.array(avgep_up)
var_down = np.array(avgep_down)
plt.figure(1)
plt.fill_between(t, var_down, var_up, facecolor='blue', linewidth=0.0, alpha=0.5)
plt.plot(t,avgep,'b')
plt.show()
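Example #3 smooths the averaged log with a sliding window and plots the mean together with a one-standard-deviation band. The same computation in vectorized form, as a sketch; the window length and the stand-in data are illustrative, only the structure mirrors the snippet:

import numpy as np
import matplotlib.pyplot as plt

LEN = 100                              # smoothing window (illustrative)
avg = np.random.randn(1000).cumsum()   # stand-in for the per-episode averages built above

def smooth_with_band(series, window=LEN):
    # sliding-window mean and +/- one standard deviation band
    series = np.asarray(series, dtype=float)
    windows = np.array([series[i:i + window] for i in range(len(series) - window)])
    mean = windows.mean(axis=1)
    std = windows.std(axis=1)
    return mean, mean + std, mean - std

avgep, avgep_up, avgep_down = smooth_with_band(avg)
t = np.arange(1, len(avgep) + 1)
plt.fill_between(t, avgep_down, avgep_up, facecolor='blue', linewidth=0.0, alpha=0.5)
plt.plot(t, avgep, 'b')
plt.show()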



Example #4
File: ddpg.py Project: ataitler/DQN
def train(sess, env, actor, critic):

    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)
    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        ("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    n = OUnoise(INPUT)
    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()
        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
                terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops,
                               feed_dict={
                                   summary_vars[0]: ep_reward,
                                   summary_vars[1]: ep_ave_max_q / float(j)
                               })

        writer.add_summary(summary_str, i)
        writer.flush()

        print 'Episode %d | Reward: %.2i | Qmax: %.4f' % (
            i, int(ep_reward), ep_ave_max_q / float(j))

        # log statistics
        L.AddRecord(
            'network_left',
            simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord(
            'network_middle',
            simulator.SimulateContNeuralEpisode(actor, sess, env_middle,
                                                False))
        L.AddRecord(
            'network_right',
            simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(
                actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
            num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / num * 1.0
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
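The inner loop in Example #4 builds the critic targets y_i element by element. The same targets can be computed in one vectorized step; a minimal sketch, assuming target_q comes back as a (MINIBATCH_SIZE, 1) array and r_batch/t_batch are 1-D, which is not confirmed by the source:

import numpy as np

def ddpg_targets(r_batch, t_batch, target_q, gamma):
    # y = r + gamma * Q'(s2, mu'(s2)); the bootstrap term is dropped on terminal steps
    not_done = 1.0 - np.asarray(t_batch, dtype=float)
    y = np.asarray(r_batch, dtype=float) + gamma * target_q.reshape(-1) * not_done
    return y.reshape(-1, 1)   # the shape critic.train expects in the example

# usage sketch, replacing the k-loop:
# y_i = ddpg_targets(r_batch, t_batch, target_q, GAMMA)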
Example #5
                        i, 0] + GAMMA * Q_next_max[i] * (1 - terminal_batch[i])

                # train on estimated Q next and rewards
                error = Q.train(sess, s_batch, Y)
                E_local.append(error)

        E_local = sum(E_local) / len(E_local)
        totalE += E_local

        if Done:
            break

    totalE = totalE / t

    # run validation simulations
    L.AddRecord('network_left',
                simulator.SimulateNeuralEpisode(Q, sess, env_left, False))
    L.AddRecord('network_middle',
                simulator.SimulateNeuralEpisode(Q, sess, env_middle, False))
    L.AddRecord('network_right',
                simulator.SimulateNeuralEpisode(Q, sess, env_right, False))
    temp_r = 0
    for rand_i in xrange(10):
        temp_r = temp_r + simulator.SimulateNeuralEpisode(Q, sess, env,
                                                          False) * 0.1
    L.AddRecord('network_random', temp_r)
    L.AddRecord('total_reward', totalR)
    L.AddRecord('error', totalE)
    s_est, _, _, _, _, num = R.SampleMiniBatch(V_EST)
    Q_est_arg = Q.evaluate(sess, s_est)
    Q_est_argmax = np.argmax(Q_est_arg, 1) * 1.0
    V_est = Q_est_argmax.sum() / num * 1.0
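Example #5 ends by estimating a state value from a sampled batch. Note that np.argmax returns greedy action indices rather than Q-values; if the intent is the average greedy value over the sampled states, np.amax gives it directly. A sketch under that assumption, keeping the Q.evaluate(sess, s_est) call from the snippet:

import numpy as np

def estimate_value(Q, sess, s_est, num):
    # V ~ (1/num) * sum over sampled states of max_a Q(s, a)
    q_values = Q.evaluate(sess, s_est)        # (num, n_actions) per the snippet
    greedy_values = np.amax(q_values, axis=1)
    return greedy_values.sum() / float(num)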