Example No. 1
buf = R.LoadBuffer(OUT_DIR + BUFFER_FILE)
if buf:
    EXP_PROB = EPSILON
    populated = R.GetOccupency()
    print("Replay buffer loaded from disk, occupied: " + str(populated))
else:
    print("Creating new replay buffer")

# initialize logger
L = Logger()
log_not_empty = L.Load(OUT_DIR + LOG_FILE)
if log_not_empty:
    print("Log file loaded")
else:
    ("Creating new log file")
    L.AddNewLog('network_left')
    L.AddNewLog('network_middle')
    L.AddNewLog('network_right')
    L.AddNewLog('policy_left')
    L.AddNewLog('policy_middle')
    L.AddNewLog('policy_right')

# load saved model
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model loaded from disk")

# define action discretization
max_a = env.action_space.high[0]
min_a = env.action_space.low[0]
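The snippet stops right after reading the action bounds; the "define action discretization" comment suggests they are then turned into a discrete action set for the DQN head. A minimal sketch of that step, assuming numpy and a hypothetical grid size ACTIONS that is not taken from the original code:

import numpy as np

# Hypothetical continuation: turn the continuous range [min_a, max_a]
# into a uniform grid of candidate actions for a discrete-action network.
ACTIONS = 9  # assumed grid size, not part of the original snippet
action_grid = np.linspace(min_a, max_a, ACTIONS)

# Example: map a network output index back to a physical action.
a = action_grid[4]
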
Example No. 2
if buf:
	OBSERVATION_PHASE = 0
	EXP_PROB = EPSILON
	populated = R.GetOccupency()
	print("Replay buffer loaded from disk, occupied: " + str(populated))
else:
	print("Creating new replay buffer")

# initialize logger
L = Logger()
log_not_empty = L.Load(OUT_DIR+LOG_FILE)
if log_not_empty:
	print ("Log file loaded")
else:
	("Creating new log file")
	L.AddNewLog('network_left')
	L.AddNewLog('network_middle')
	L.AddNewLog('network_right')
	L.AddNewLog('policy_left')
	L.AddNewLog('policy_middle')
	L.AddNewLog('policy_right')
	L.AddNewLog('error')
	L.AddNewLog('total_reward')
	L.AddNewLog('estimated_value')

# load saved model
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
	saver.restore(sess,ckpt.model_checkpoint_path)
	print("Model loaded from disk")
Example No. 3
OUTLOG = 'statistics/log_gDQN'
Base = '/home/ayal/Documents/gym/Code/'
DQN_home = 'DQN_hockey/hockey_DDQN_deepmind/hockey_DQN_'+str(num)+'_V'
DDPG_home = 'DDPG2/results'
gDQN_home = 'DQN_hockey/hockey_numeric_3points/hockey_multinet1_decay_rate1.2_2_V'

gDQNd = [Base + gDQN_home + '7/logs', Base + gDQN_home + '6/logs', Base + gDQN_home + '5/logs', Base + gDQN_home + '2/logs']  # ,Base+gDQN_home+'4/logs']
DDPGd = [Base + DDPG_home + '/logs', Base + DDPG_home + '1/logs', Base + DDPG_home + '2/logs', Base + DDPG_home + '3/logs', Base + DDPG_home + '4/logs']
DQNd = [Base + DQN_home + '1/logs', Base + DQN_home + '2/logs', Base + DQN_home + '3/logs', Base + DQN_home + '4/logs', Base + DQN_home + '5/logs']


numlogs = len(gDQNd)
LEN = 100

L = Logger()
L.AddNewLog('network_left')
L.AddNewLog('network_left_up')
L.AddNewLog('network_left_down')
L.AddNewLog('network_middle')
L.AddNewLog('network_middle_up')
L.AddNewLog('network_middle_down')
L.AddNewLog('network_right')
L.AddNewLog('network_right_up')
L.AddNewLog('network_right_down')
L.AddNewLog('network_random')
L.AddNewLog('network_random_up')
L.AddNewLog('network_random_down')
L.AddNewLog('estimated_value')
L.AddNewLog('estimated_value_up')
L.AddNewLog('estimated_value_down')
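The *_up/*_down log names suggest this script aggregates several training runs (the directories in gDQNd, DDPGd and DQNd) into a mean curve with upper and lower bands. A sketch of that aggregation, assuming numpy and a hypothetical load_curve(path) helper that returns one reward curve per run directory; neither is part of the original file:

import numpy as np

def aggregate(curves):
    # Stack the per-run curves (truncated to a common length LEN)
    # and return the mean together with mean +/- one standard deviation.
    data = np.stack([np.asarray(c)[:LEN] for c in curves])
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    return mean, mean + std, mean - std

# Hypothetical usage with the gDQN runs:
# mid, up, down = aggregate([load_curve(d) for d in gDQNd])
# for m, u, d in zip(mid, up, down):
#     L.AddRecord('network_left', m)
#     L.AddRecord('network_left_up', u)
#     L.AddRecord('network_left_down', d)
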
Example No. 4
File: ddpg.py Project: ataitler/DQN
def train(sess, env, actor, critic):

    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)
    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        ("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    n = OUnoise(INPUT)
    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()
        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
                terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops,
                               feed_dict={
                                   summary_vars[0]: ep_reward,
                                   summary_vars[1]: ep_ave_max_q / float(j)
                               })

        writer.add_summary(summary_str, i)
        writer.flush()

        print 'episode ', i, ' | Reward: %.2i' % int(ep_reward), " | Episode", i, \
            '| Qmax: %.4f' % (ep_ave_max_q / float(j))

        # log statistics
        L.AddRecord(
            'network_left',
            simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord(
            'network_middle',
            simulator.SimulateContNeuralEpisode(actor, sess, env_middle,
                                                False))
        L.AddRecord(
            'network_right',
            simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(
                actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
            num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / num * 1.0
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
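train() draws its exploration noise from an OUnoise object with Reset and Sample methods; that class is defined elsewhere in the project. The sketch below is one plausible Ornstein-Uhlenbeck implementation matching that interface; the mu, theta and sigma defaults are assumptions, not values taken from the original code.

import numpy as np

class OUnoise(object):
    # Ornstein-Uhlenbeck process sketch with the Reset/Sample interface
    # used by train(); parameter defaults are assumed, not from the project.
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2):
        self.dim = dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.Reset()

    def Reset(self):
        self.state = np.ones(self.dim) * self.mu

    def Sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.dim)
        self.state = self.state + dx
        return self.state
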
Example No. 5
saver = tf.train.Saver()
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model loaded from disk")

# initialize logger
L = Logger()
log_not_empty = L.Load(OUT_DIR + LOG_FILE)
if log_not_empty:
    print("Log file loaded")
else:
    ("Creating new log file")
    if ENVIRONMENT_NAME == 'Hockey-v2':
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('network_random')
#	L.AddNewLog('error')
    L.AddNewLog('total_reward')
    L.AddNewLog('estimated_value')
    L.AddNewLog('network_random')

if ENVIRONMENT_NAME == 'Hockey-v2':
    simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, None)
steps = steps_counter.evaluate(sess)
C_steps_counter.evaluate(sess)
for ep in range(EPISODES):
    episodes_counter.increment(sess)
    # open up a game state