def __init__(self, args, environment):
    """Create the session, learning-rate op and networks, then initialize.

    Args:
        args: Run configuration; stored on the instance and passed to
            ``start_session``.
        environment: Environment object; stored on the instance.

    Raises:
        AssertionError: If, after ``build_networks`` has run, any of
            ``train_op``, ``priority_op``, ``loss_op``,
            ``agent_output_action`` or ``agent_output`` is still ``None``.
    """
    self.args = args
    self.environment = environment
    self.tensorboard_process = None
    self.training_iterations = 0
    self.batch_loss = 0
    self.learning_rate = 0

    self.sess = self.start_session(args)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.learning_rate_op = self.build_learning_rate(step=self.global_step)

    # Reset every attribute build_networks() is expected to populate, so the
    # check below can tell which ones the implementation left undefined.
    self.train_op = None
    self.priority_op = None
    self.loss_op = None
    self.agent_output_action = None
    self.agent_output = None
    self.agent_network = None

    # NOTE(review): the original indentation was lost; this assumes only the
    # network construction runs inside the op context -- confirm.
    with op.context(floatx=tf.float32, floatsafe=False):
        self.build_networks()

    # Explicit raise instead of an `assert` statement: asserts are removed
    # when Python runs with -O, which would let an incomplete network
    # implementation pass silently. The exception type is kept.
    # `agent_network` is deliberately not in this list, matching the
    # original check.
    required_ops = (
        'train_op',
        'priority_op',
        'loss_op',
        'agent_output_action',
        'agent_output',
    )
    missing_ops = [name for name in required_ops
                   if getattr(self, name) is None]
    if missing_ops:
        raise AssertionError(
            'Network implementation must define the operations: '
            + ', '.join(missing_ops))

    self.assign_ops = self.build_assign_ops()
    self.initialize()
def build(self, states):
    """Build a three-convolution network followed by two linear layers.

    Args:
        states: Input handed to the first convolution.

    Returns:
        The result of the final linear layer, which is created with
        ``activation_fn='none'`` and sized by
        ``self.environment.get_num_actions()``.
    """
    with op.context(default_activation_fn='relu'):
        # Weights and biases returned by each layer are not used here.
        net, _, _ = op.conv2d(states, size=8, filters=32, stride=4, name='conv1')
        net, _, _ = op.conv2d(net, size=4, filters=64, stride=2, name='conv2')
        net, _, _ = op.conv2d(net, size=3, filters=64, stride=1, name='conv3')

        flat = op.flatten(net, name="fc4")
        hidden, _, _ = op.linear(flat, 512, name='fc4')

        output, _, _ = op.linear(
            hidden,
            self.environment.get_num_actions(),
            activation_fn='none',
            name='output',
        )
        return output
def build(self, states):
    """Build a fully connected network with value and advantage heads.

    Args:
        states: Input that is flattened and fed to the first linear layer.

    Returns:
        ``value + (advantages - mean(advantages))``, following the Dueling
        DQN combination (http://arxiv.org/pdf/1511.06581v3.pdf).
    """
    with op.context(default_activation_fn='relu'):
        flat_states = op.flatten(states, name="fc1_flatten")
        hidden, _, _ = op.linear(flat_states, 500, name='fc1')
        hidden, _, _ = op.linear(hidden, 500, name='fc2')

        # NOTE(review): the value head is created with one unit per action
        # here; the Dueling DQN paper uses a scalar state value -- confirm
        # this width is intended.
        value, _, _ = op.linear(
            hidden,
            self.environment.get_num_actions(),
            activation_fn='none',
            name='value',
        )
        advantages, _, _ = op.linear(
            hidden,
            self.environment.get_num_actions(),
            activation_fn='none',
            name='advantages',
        )

        # Dueling DQN - http://arxiv.org/pdf/1511.06581v3.pdf
        centered_advantages = advantages - op.mean(advantages, keep_dims=True)
        return value + centered_advantages
def build(self, states):
    """Build a convolutional network with an output head and a sigma head.

    Args:
        states: Input handed to the first convolution.

    Returns:
        A ``(output, sigma)`` pair. ``output`` comes from a linear layer with
        ``activation_fn='none'``; ``sigma`` is the exponential of the
        'variance' layer's result plus a small constant.
    """
    with op.context(default_activation_fn='relu'):
        net, _, _ = op.conv2d(states, size=8, filters=32, stride=4, name='conv1')
        net, _, _ = op.conv2d(net, size=4, filters=64, stride=2, name='conv2')
        net, _, _ = op.conv2d(net, size=3, filters=64, stride=1, name='conv3')
        hidden, _, _ = op.linear(op.flatten(net, name="fc4"), 512, name='fc4')

        output, _, _ = op.linear(
            hidden,
            self.environment.get_num_actions(),
            activation_fn='none',
            name='output',
        )

        # This layer passes no activation_fn, so it uses the context default.
        log_sigma, _, _ = op.linear(
            hidden,
            self.environment.get_num_actions(),
            name='variance',
        )
        # The original comment says the offset is "to avoid divide by zero".
        # NOTE(review): it is added before exp(), whose result is already
        # strictly positive -- confirm whether the offset is still needed.
        sigma = tf.exp(log_sigma + 0.0001)

        return output, sigma
def build(self, states):
    """Build the action network and a prediction-error term on lookaheads.

    The first part builds the network on ``states`` inside variable scope
    'net'. The second part re-applies the 'conv1'..'fc4' layers (with
    ``reuse=True``) to ``self.inputs.lookaheads`` and stores the mean squared
    difference between ``hhat`` and that result in ``self.constraint_error``.

    Args:
        states: Input handed to the first convolution.

    Returns:
        The result of the 'output' linear layer, sized by
        ``self.environment.get_num_actions()``.
    """
    # NOTE(review): the original indentation was lost; this assumes every
    # layer up to 'output' is created inside the 'net' variable scope --
    # confirm.
    with tf.variable_scope('net'), op.context(default_activation_fn='relu'):
        net, _, _ = op.conv2d(states, size=8, filters=32, stride=4, name='conv1')
        net, _, _ = op.conv2d(net, size=4, filters=64, stride=2, name='conv2')
        net, _, _ = op.conv2d(net, size=3, filters=64, stride=1, name='conv3')
        encoded, _, _ = op.linear(op.flatten(net), 256, name='fc4')

        h, _, _ = op.linear(encoded, 256, name='h')
        h1, _, _ = op.linear(h, 256, name='h1')
        hhat, _, _ = op.linear(h1, 256, name='hhat')

        merged = op.merge(h, hhat, name="fc8")
        fc8, _, _ = op.linear(merged, 256, name='fc8')
        output, _, _ = op.linear(
            fc8,
            self.environment.get_num_actions(),
            activation_fn='none',
            name='output',
        )

    # Same layer names under the same variable scope with reuse=True, applied
    # to the lookahead input.
    # NOTE(review): assumes the error op is created inside the 'prediction'
    # name scope -- confirm.
    with tf.name_scope('prediction'), tf.variable_scope('net', reuse=True), op.context(default_activation_fn='relu'):
        target, _, _ = op.conv2d(self.inputs.lookaheads, size=8, filters=32, stride=4, name='conv1')
        target, _, _ = op.conv2d(target, size=4, filters=64, stride=2, name='conv2')
        target, _, _ = op.conv2d(target, size=3, filters=64, stride=1, name='conv3')
        hhat_truth, _, _ = op.linear(op.flatten(target), 256, name='fc4')

        squared_error = (hhat - hhat_truth)**2
        self.constraint_error = tf.reduce_mean(
            squared_error,
            reduction_indices=1,
            name='prediction_error',
        )

    return output
def build(self, states):
    """Build a convolutional network with separate value/advantage streams.

    Args:
        states: Input handed to the first convolution.

    Returns:
        ``value + (advantages - mean(advantages))``, following the Dueling
        DQN combination (http://arxiv.org/pdf/1511.06581v3.pdf). The value
        stream ends in a single unit; the advantage stream ends in
        ``self.environment.get_num_actions()`` units.
    """
    with op.context(default_activation_fn='relu'):
        # Shared convolutional layers.
        net, _, _ = op.conv2d(states, size=8, filters=32, stride=4, name='conv1')
        net, _, _ = op.conv2d(net, size=4, filters=64, stride=2, name='conv2')
        net, _, _ = op.conv2d(net, size=3, filters=64, stride=1, name='conv3')
        shared = op.flatten(net, name="conv3_flatten")

        # Value stream.
        value_hidden, _, _ = op.linear(shared, 512, name='fc4_value')
        value, _, _ = op.linear(value_hidden, 1, activation_fn='none', name='value')

        # Advantage stream.
        advantage_hidden, _, _ = op.linear(shared, 512, name='fc4_advantages')
        advantages, _, _ = op.linear(
            advantage_hidden,
            self.environment.get_num_actions(),
            activation_fn='none',
            name='advantages',
        )

        # Dueling DQN - http://arxiv.org/pdf/1511.06581v3.pdf
        centered_advantages = advantages - op.mean(advantages, keep_dims=True)
        return value + centered_advantages
def build(self, states):
    """Build a two-branch network joined by a batched matrix product.

    One shared convolution feeds an "A" branch and a "B" branch. Each branch
    ends in a 32-unit linear layer; the two results are expanded, multiplied
    with ``tf.batch_matmul`` and flattened before the final linear layers.

    Args:
        states: Input handed to the shared first convolution.

    Returns:
        The result of the 'output' linear layer, created with
        ``activation_fn='none'`` and sized by
        ``self.environment.get_num_actions()``.
    """
    with op.context(default_activation_fn='relu'):
        # Common perception
        shared, _, _ = op.conv2d(states, size=8, filters=32, stride=4, name='conv1')

        # A side
        a_conv, _, _ = op.conv2d(shared, size=4, filters=64, stride=2, name='a_conv2')
        a_flat = op.flatten(a_conv, name="a_fc4")
        a_fc, _, _ = op.linear(a_flat, 32, activation_fn='none', name='a_fc3')

        # B side
        b_conv, _, _ = op.conv2d(shared, size=4, filters=64, stride=2, name='b_conv2')
        b_flat = op.flatten(b_conv, name="b_fc4")
        b_fc, _, _ = op.linear(b_flat, 32, activation_fn='none', name='b_fc3')

        # Causal matrix. Shapes below are from the original author's notes.
        a_column = op.expand(a_fc, 2, name='a')  # now ?x32x1
        b_row = op.expand(b_fc, 1, name='b')  # now ?x1x32
        # NOTE(review): tf.batch_matmul was removed in TensorFlow 1.0
        # (tf.matmul replaces it); kept because the file targets the older
        # API -- confirm before upgrading.
        product = tf.batch_matmul(a_column, b_row, name='causes')
        causes = op.flatten(product)

        hidden, _, _ = op.linear(causes, 512, name='l4')
        output, _, _ = op.linear(
            hidden,
            self.environment.get_num_actions(),
            activation_fn='none',
            name='output',
        )
        return output