# -*- coding: utf-8 -*-
"""
Network class
@author: thomas
"""
import gym
import numpy as np
import tensorflow as tf
import logging
class Network():
''' A3C specification '''
def __init__(self,cluster,env,task_index,learning_rate=0.001):
''' Set-up network '''
action_dim, discrete = check_action_space(env) # detect action space
with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:{}".format(task_index),cluster=cluster)):
            import tflearn # must be imported inside the tf.device block so the tflearn.is_training variable is shared!
# Placeholders
            self.s = tf.placeholder("float32", [None] + list(env.observation_space.shape)) # states
            self.A = tf.placeholder("float32", (None,)) # advantage estimates
            self.V = tf.placeholder("float32", (None,)) # value targets
if discrete:
self.a = tf.placeholder("int32", (None,)) # discrete action space
self.a_one_hot = tf.one_hot(self.a,action_dim)
else:
                self.a = tf.placeholder("float32", (None, action_dim)) # continuous action space
# Network
            ff = encoder_s(self.s, scope='encoder', reuse=False)
            self.p_out = tflearn.fully_connected(ff, n_units=action_dim, activation='softmax') # policy head pi(a|s)
            self.v_out = tflearn.fully_connected(ff, n_units=1, activation='linear') # value head V(s)
##### A3C #######
# Compute log_pi
            log_probs = tf.log(tf.clip_by_value(self.p_out, 1e-20, 1.0)) # log pi, clipped to avoid log(0)
if discrete:
log_pi_given_a = tf.reduce_sum(log_probs * self.a_one_hot,reduction_indices=1)
else:
                raise NotImplementedError('continuous action spaces are not supported yet')
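            # A common continuous alternative (hypothetical sketch, not implemented
            # here) is a diagonal Gaussian policy whose log-density would replace
            # log_pi_given_a above:
            #   mu = tflearn.fully_connected(ff, n_units=action_dim, activation='linear')
            #   log_sigma = tf.Variable(tf.zeros([action_dim]))
            #   log_pi_given_a = -0.5 * tf.reduce_sum(
            #       tf.square((self.a - mu) / tf.exp(log_sigma))
            #       + 2 * log_sigma + np.log(2 * np.pi), reduction_indices=1)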
# Losses
            p_loss = -1 * log_pi_given_a * self.A # policy gradient loss per sample
            entropy = -1 * tf.reduce_sum(self.p_out * log_probs, reduction_indices=1, name="entropy") # policy entropy H(pi)
            v_loss = tf.nn.l2_loss(self.V - tf.reshape(self.v_out, [-1]), name="v_loss") # v_out is (None,1): reshape so shapes match
            # The entropy bonus is subtracted: maximizing entropy encourages exploration
            self.loss = tf.reduce_sum(p_loss - 0.01 * entropy) + v_loss
# Trainer
optimizer = tf.train.AdamOptimizer(learning_rate)
self.trainer = optimizer.minimize(self.loss)
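            # With replica_device_setter the variables above are placed on the
            # parameter servers, so each worker's minimize() applies gradients
            # directly to the shared weights (the asynchronous updates of A3C).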
            # Summaries, saving and initialization
self.summary_placeholders, self.update_ops, self.summary_op = setup_summaries() # Summary operations
self.saver = tf.train.Saver(max_to_keep=10)
self.init_op = tf.initialize_all_variables()
print('network initialized')
####################
# Smaller functions
####################
def encoder_s(s, scope, reuse):
    ''' Shared state encoder: two small fully-connected layers '''
    import tflearn
    layer1 = tflearn.fully_connected(s, n_units=4, activation='relu', scope='{}{}'.format(scope, 1), reuse=reuse)
    layer2 = tflearn.fully_connected(layer1, n_units=2, activation='relu', scope='{}{}'.format(scope, 2), reuse=reuse)
    return layer2
def check_action_space(env):
    ''' Check the action properties of the env '''
    if isinstance(env.action_space, gym.spaces.Box):
        action_dim = env.action_space.shape[0] # Box shape is a tuple; [0] gives the dimension of a flat action vector
        discrete = False
    elif isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n
        discrete = True
    else:
        raise NotImplementedError('Unsupported action space: {}'.format(env.action_space))
    return action_dim, discrete
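# For example (assuming a standard gym install): CartPole-v0 has action space
# Discrete(2), so check_action_space returns (2, True); Pendulum-v0 has a Box
# with shape (1,), returning (1, False).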
def setup_summaries():
''' Defines summary ops '''
    # Variables that hold the logged statistics
episode_reward = tf.Variable(0.)
episode_ave_max_q = tf.Variable(0.)
epsilon = tf.Variable(0.)
summary_vars = [episode_reward, episode_ave_max_q, epsilon]
# Summaries
tf.scalar_summary("Episode Reward", episode_reward)
tf.scalar_summary("Max Q Value", episode_ave_max_q)
tf.scalar_summary("Epsilon", epsilon)
summary_placeholders = [tf.placeholder("float") for i in range(len(summary_vars))]
# Update and merge ops
update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
summary_op = tf.merge_all_summaries()
return summary_placeholders, update_ops, summary_op
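# Usage sketch (hypothetical; this writer and loop are not part of this file):
# a worker would push its statistics through the update ops and then write the
# merged summary, e.g.
#   writer = tf.train.SummaryWriter('/tmp/a3c_logs', sess.graph)
#   stats = [episode_reward_val, max_q_val, epsilon_val]
#   for i in range(len(stats)):
#       sess.run(update_ops[i], {summary_placeholders[i]: float(stats[i])})
#   writer.add_summary(sess.run(summary_op), global_step=episode)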
#### Test ####
if __name__ == '__main__':
''' Test network setup '''
workers = ["localhost:{}".format(p) for p in range(2222,2222+2+1,1)]
ps = ["localhost:{}".format(p) for p in range(2222+2+1,2222+2+1+2,1)]
cluster_spec = {"ps":ps,"worker": workers}
cluster = tf.train.ClusterSpec(cluster_spec)
    env = gym.make('MsPacman-v0') # image observations; the toy encoder flattens them
mynet = Network(cluster,env,task_index=0,learning_rate=0.001)
print('Network works, with placeholder {}'.format(mynet.s))
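    # A single training step would then feed rollout data into the placeholders
    # (hypothetical sketch; states, actions, advantages and returns would come
    # from the workers' environment interaction):
    #   sess.run(mynet.trainer, feed_dict={mynet.s: states, mynet.a: actions,
    #                                      mynet.A: advantages, mynet.V: returns})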