def vgg_backbone(image, qw=1):
    """Build the quantized VGG-style backbone and return its 1000-way logits.

    Every ``Conv2DQuant`` in the stack inherits an identity non-linearity, no
    bias, FAN_IN variance-scaling initialisation, the data format currently
    registered for ``Conv2D`` in the arg scope, and ``nbit=qw``.

    :param image: input handed to ``LinearWrap`` (presumably an image batch
        tensor -- confirm against the caller).
    :param qw: forwarded as ``nbit`` to ``Conv2DQuant`` (presumably the weight
        bit-width -- confirm against ``Conv2DQuant``).
    :return: output of the final ``FullyConnected('fc7', out_dim=1000)`` layer.
    """
    # One row per repeated stage:
    #   (pre-activation name, conv names, channels, pool name)
    # The trailing number is the size note carried over from the original code.
    stages = (
        ('bnquant2_0', ('conv2_1', 'conv2_2', 'conv2_3'), 256, 'pool2'),  # 28
        ('bnquant3_0', ('conv3_1', 'conv3_2', 'conv3_3'), 512, 'pool3'),  # 14
        ('bnquant4_0', ('conv4_1', 'conv4_2', 'conv4_3'), 512, 'pool4'),  # 7
    )
    conv_defaults = dict(
        nl=tf.identity,
        use_bias=False,
        W_init=variance_scaling_initializer(mode='FAN_IN'),
        data_format=get_arg_scope()['Conv2D']['data_format'],
        nbit=qw,
    )
    with argscope(Conv2DQuant, **conv_defaults):
        net = LinearWrap(image)
        # The first convolution is explicitly excluded from quantization.
        net = net.Conv2DQuant('conv1', 96, 7, stride=2, nl=tf.nn.relu, is_quant=False)
        net = net.MaxPooling('pool1', shape=2, stride=2, padding='VALID')  # 56

        for bn_name, conv_names, channels, pool_name in stages:
            net = net.BNReLUQuant(bn_name)
            # All but the last conv of a stage get the BN+ReLU+quant activation;
            # the last one stays linear and is activated after pooling.
            for conv_name in conv_names[:-1]:
                net = net.Conv2DQuant(conv_name, channels, 3, nl=getBNReLUQuant)
            net = net.Conv2DQuant(conv_names[-1], channels, 3)
            net = net.MaxPooling(pool_name, shape=2, stride=2, padding='VALID')

        net = net.BNReLUQuant('bnquant5')
        # The "fc" layers are implemented as VALID convolutions.
        net = net.Conv2DQuant('fc5', 4096, 7, nl=getfcBNReLUQuant, padding='VALID', use_bias=True)
        net = net.Conv2DQuant('fc6', 4096, 1, nl=getfcBNReLU, padding='VALID', use_bias=True)
        net = net.FullyConnected('fc7', out_dim=1000, nl=tf.identity,
                                 W_init=variance_scaling_initializer(mode='FAN_IN'))
        logits = net()
    return logits
def make_fingerprint(x, is_train, fc_dropout, seed):
    """
    Calculates 'fingerprint' of timeseries, to feed into attention layer.

    Three conv/conv/max-pool stages (16, 32 and 64 filters) are applied with
    ``tf.layers.conv1d``; the result is flattened, optionally passed through
    dropout, and compressed by two SELU dense layers (512 -> 16 units).

    :param x: input to the first ``tf.layers.conv1d`` (presumably
        [batch, time, features] -- confirm against the caller)
    :param is_train: dropout is applied only when this is truthy
    :param fc_dropout: value passed as the second argument of ``tf.nn.dropout``;
        dropout is skipped when it is >= 1.0
    :param seed: seed for the initializers and for dropout
    :return: output of the 16-unit 'out_encoder' dense layer
    """
    # (filters, kernel size of the first conv in the stage); the second conv of
    # every stage always uses kernel size 3.
    stage_specs = ((16, 7), (32, 3), (64, 3))

    with tf.variable_scope("fingerpint"):
        conv_init = layers.variance_scaling_initializer(seed=seed)
        with tf.variable_scope('convnet', initializer=conv_init):
            net = x
            for filters, first_kernel in stage_specs:
                net = tf.layers.conv1d(net, filters=filters, kernel_size=first_kernel,
                                       activation=tf.nn.relu, padding='same')
                net = tf.layers.conv1d(net, filters=filters, kernel_size=3,
                                       activation=tf.nn.relu, padding='same')
                net = tf.layers.max_pooling1d(net, 2, 2, padding='same')

            # Flatten everything after the batch axis using the static shape.
            dims = net.shape.dims
            flat = tf.reshape(net, [-1, dims[1].value * dims[2].value])

            apply_dropout = is_train and fc_dropout < 1.0
            cnn_out = tf.nn.dropout(flat, fc_dropout, seed=seed) if apply_dropout else flat

        fc_init = layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN', seed=seed)
        with tf.variable_scope('fc_convnet', initializer=fc_init):
            fc_encoder = tf.layers.dense(cnn_out, 512, activation=selu, name='fc_encoder')
            out_encoder = tf.layers.dense(fc_encoder, 16, activation=selu, name='out_encoder')
    return out_encoder
def googlenet_backbone(image, qw=1):
    """Build the quantized GoogLeNet-style backbone and return its logits.

    :param image: input handed to ``LinearWrap`` (presumably an image batch
        tensor -- confirm against the caller).
    :param qw: forwarded as ``nbit`` to ``Conv2DQuant``; quantization is
        switched on by default only when ``qw > 0``.
    :return: output of the final ``FullyConnected('linear', out_dim=1000)``.
    """
    # (scope name, positional widths given to inception_block, extra kwargs).
    # The scope names (including their spelling) are runtime identifiers and
    # are kept exactly as in the original.
    inception_specs = (
        ('incpetion_3a', (96, 128, 32), {}),
        ('incpetion_3b', (192, 192, 96), {'is_last_block': True}),
        ('incpetion_4a', (256, 208, 48), {}),
        ('incpetion_4b', (224, 224, 64), {}),
        ('incpetion_4c', (192, 256, 64), {}),
        ('incpetion_4d', (176, 288, 64), {}),
        ('incpetion_4e', (384, 320, 128), {'is_last_block': True}),
        ('incpetion_5a', (384, 320, 128), {}),
        ('incpetion_5b', (512, 384, 128), {'is_last_block': True, 'is_last': True}),
    )
    conv_defaults = dict(
        nl=tf.identity,
        use_bias=False,
        W_init=variance_scaling_initializer(mode='FAN_IN'),
        data_format=get_arg_scope()['Conv2D']['data_format'],
        nbit=qw,
        is_quant=bool(qw > 0),
    )
    with argscope(Conv2DQuant, **conv_defaults):
        net = LinearWrap(image)
        # The stem convolution is explicitly excluded from quantization.
        net = net.Conv2DQuant('conv1', 64, 7, stride=2, is_quant=False)
        net = net.MaxPooling('pool1', shape=3, stride=2, padding='SAME')
        net = net.BNReLUQuant('pool1/out')
        net = net.Conv2DQuant('conv2/3x3_reduce', 192, 1, nl=getBNReLUQuant)
        net = net.Conv2DQuant('conv2/3x3', 192, 3)
        net = net.MaxPooling('pool2', shape=3, stride=2, padding='SAME')
        net = net.BNReLUQuant('pool2/out')

        for block_name, widths, extra in inception_specs:
            net = net.apply(inception_block, block_name, *widths, **extra)

        net = net.GlobalAvgPooling('pool5')
        net = net.FullyConnected('linear', out_dim=1000, nl=tf.identity)
        logits = net()
    return logits
def create_dc_actor_critic(self, h_size, num_layers):
    """
    Creates the discrete-control actor-critic heads on a single observation stream.

    Sets ``policy``, ``all_probs``, ``output``, ``value``, ``entropy``,
    ``action_holder``, ``selected_actions``, ``all_old_probs``, ``probs`` and
    ``old_probs`` on ``self``; when ``self.use_recurrent`` is set it also
    creates ``prev_action``, ``prev_action_oh``, ``memory_in`` and ``memory_out``.

    :param h_size: Hidden size forwarded to ``create_new_obs``.
    :param num_layers: Layer count forwarded to ``create_new_obs``.
    """
    hidden = self.create_new_obs(1, h_size, num_layers)[0]

    if self.use_recurrent:
        # The Python handle is not kept; only the named graph variable is created.
        tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
        self.prev_action = tf.placeholder(shape=[None], dtype=tf.int32, name='prev_action')
        self.prev_action_oh = c_layers.one_hot_encoding(self.prev_action, self.a_size)
        hidden = tf.concat([hidden, self.prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        hidden, raw_memory_out = self.create_recurrent_encoder(hidden, self.memory_in)
        self.memory_out = tf.identity(raw_memory_out, name='recurrent_out')

    # Policy head: bias-free logits with a small-variance initializer.
    policy_init = c_layers.variance_scaling_initializer(factor=0.01)
    self.policy = tf.layers.dense(hidden, self.a_size, activation=None, use_bias=False,
                                  kernel_initializer=policy_init)
    self.all_probs = tf.nn.softmax(self.policy, name="action_probs")
    sampled_action = tf.multinomial(self.policy, 1)
    self.output = tf.identity(sampled_action, name="action")

    # Value head.
    value_estimate = tf.layers.dense(hidden, 1, activation=None)
    self.value = tf.identity(value_estimate, name="value_estimate")

    # The 1e-10 offset keeps the log finite when a probability is zero.
    log_probs = tf.log(self.all_probs + 1e-10)
    self.entropy = -tf.reduce_sum(self.all_probs * log_probs, axis=1)

    self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
    self.selected_actions = c_layers.one_hot_encoding(self.action_holder, self.a_size)
    self.all_old_probs = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32,
                                        name='old_probabilities')

    # We reshape these tensors to [batch x 1] in order to be of the same rank
    # as continuous control probabilities.
    chosen_prob = tf.reduce_sum(self.all_probs * self.selected_actions, axis=1)
    self.probs = tf.expand_dims(chosen_prob, 1)
    chosen_old_prob = tf.reduce_sum(self.all_old_probs * self.selected_actions, axis=1)
    self.old_probs = tf.expand_dims(chosen_old_prob, 1)
def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128, normalize=False, use_recurrent=False):
    """
    Builds the imitation model: observation encoder, dropout, optional
    recurrent encoder, a bias-free policy layer, a loss against the teacher
    action and an Adam update op.

    :param brain: Passed to ``LearningModel.__init__``; its
        ``vector_action_space_type`` selects the discrete or continuous head.
    :param h_size: Hidden size forwarded to ``create_new_obs``.
    :param lr: Learning rate of the Adam optimizer.
    :param n_layers: Layer count forwarded to ``create_new_obs``.
    :param m_size: Forwarded to ``LearningModel.__init__``.
    :param normalize: Forwarded to ``LearningModel.__init__``.
    :param use_recurrent: Forwarded to ``LearningModel.__init__``.
    """
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)

    encoded = self.create_new_obs(1, h_size, n_layers)[0]
    self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
    features = tf.layers.dropout(encoded, self.dropout_rate)

    if self.use_recurrent:
        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        features, raw_memory_out = self.create_recurrent_encoder(features, self.memory_in)
        self.memory_out = tf.identity(raw_memory_out, name='recurrent_out')

    policy_init = c_layers.variance_scaling_initializer(factor=0.01)
    self.policy = tf.layers.dense(features, self.a_size, activation=None, use_bias=False,
                                  kernel_initializer=policy_init)

    is_discrete = brain.vector_action_space_type == "discrete"
    if not is_discrete:
        # Continuous control: the policy output is the action; squared-error loss.
        self.sample_action = tf.identity(self.policy, name="action")
        self.true_action = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32,
                                          name="teacher_action")
        self.loss = tf.reduce_sum(tf.squared_difference(self.true_action, self.sample_action))
    else:
        # Discrete control: sample from the logits; cross-entropy against the teacher.
        self.action_probs = tf.nn.softmax(self.policy)
        sampled = tf.multinomial(self.policy, 1)
        self.sample_action_float = tf.identity(sampled, name="action")
        self.sample_action = tf.cast(self.sample_action_float, tf.int32)
        self.true_action = tf.placeholder(shape=[None], dtype=tf.int32, name="teacher_action")
        self.action_oh = tf.one_hot(self.true_action, self.a_size)
        neg_log_probs = -tf.log(self.action_probs + 1e-10)
        self.loss = tf.reduce_sum(neg_log_probs * self.action_oh)

        # NOTE(review): `sample_action` comes from `tf.multinomial(..., 1)` while
        # `greedy_action` is an argmax over axis 1; their ranks may differ, in
        # which case `tf.equal` broadcasts. Confirm this metric is what is intended.
        greedy_action = tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32)
        matches = tf.equal(greedy_action, self.sample_action)
        self.action_percent = tf.reduce_mean(tf.cast(matches, tf.float32))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    self.update = optimizer.minimize(self.loss)
def _build_graph(self, inputs):
    """Build the quantized conv net, its loss and its summaries.

    The result is stored in ``self.cost`` (cross-entropy plus weight decay);
    nothing is returned.

    :param inputs: pair ``(image, label)``. The image is transposed with
        ``[0, 3, 1, 2]`` before use, i.e. it is converted to NCHW here.
    """
    def quantize_if_enabled(name, tensor):
        # Activation quantization is inserted only when self.qa > 0.
        if self.qa > 0:
            return QuantizedActiv(name, tensor, self.qa)
        return tensor

    image, label = inputs
    image = image / 128.0
    assert tf.test.is_gpu_available()
    image = tf.transpose(image, [0, 3, 1, 2])

    with argscope([Conv2DQuant, MaxPooling, BatchNorm], data_format='NCHW'):
        with argscope(Conv2DQuant, nl=tf.identity, use_bias=False, kernel_shape=3,
                      W_init=variance_scaling_initializer(mode='FAN_IN'),
                      nbit=self.qw, is_quant=bool(self.qw > 0)):
            # The first convolution is explicitly excluded from quantization.
            net = Conv2DQuant('conv0', image, 128, nl=BNReLU, is_quant=False)
            net = quantize_if_enabled('quant1', net)
            net = Conv2DQuant('conv1', net, 128)
            # 32
            net = MaxPooling('pool2', net, shape=2, stride=2, padding='VALID')
            net = BNReLU('bn2', net)
            net = quantize_if_enabled('quant2', net)

            net = Conv2DQuant('conv2', net, 256, nl=BNReLU)
            net = quantize_if_enabled('quant3', net)
            net = Conv2DQuant('conv3', net, 256)
            # 16
            net = MaxPooling('pool4', net, shape=2, stride=2, padding='VALID')
            net = BNReLU('bn4', net)
            net = quantize_if_enabled('quant4', net)

            net = Conv2DQuant('conv4', net, 512, nl=BNReLU)
            net = quantize_if_enabled('quant5', net)
            net = Conv2DQuant('conv5', net, 512)
            # 8
            net = MaxPooling('pool6', net, shape=2, stride=2, padding='VALID')
            net = BNReLU('bn6', net)
            # 4

    logits = FullyConnected('linear', net, out_dim=10, nl=tf.identity)
    # Not used below, but it registers the op named 'output' in the graph.
    prob = tf.nn.softmax(logits, name='output')

    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(xent, name='cross_entropy_loss')

    # monitor training error
    wrong = prediction_incorrect(logits, label)
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    weight_l2 = regularize_cost('.*/W', tf.nn.l2_loss)
    wd_cost = tf.multiply(WEIGHT_DECAY, weight_l2, name='wd_cost')
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W
    self.cost = tf.add_n([cost, wd_cost], name='cost')
def create_cc_actor_critic(self, h_size, num_layers):
    """
    Creates Continuous control actor-critic model.

    Two observation streams are built, one for the policy and one for the
    value head. With ``self.use_recurrent`` set, each stream gets its own
    recurrent encoder fed with one half of ``memory_in``.

    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    """
    hidden_streams = self.create_observation_streams(2, h_size, num_layers)

    if not self.use_recurrent:
        hidden_policy = hidden_streams[0]
        hidden_value = hidden_streams[1]
    else:
        # The Python handle is not kept; only the named graph variable is created.
        tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        # First half of the memory feeds the policy LSTM, second half the value LSTM.
        _half_point = int(self.m_size / 2)
        hidden_policy, memory_policy_out = self.create_recurrent_encoder(
            hidden_streams[0], self.memory_in[:, :_half_point], self.sequence_length,
            name='lstm_policy')
        hidden_value, memory_value_out = self.create_recurrent_encoder(
            hidden_streams[1], self.memory_in[:, _half_point:], self.sequence_length,
            name='lstm_value')
        self.memory_out = tf.concat([memory_policy_out, memory_value_out], axis=1,
                                    name='recurrent_out')

    mu = tf.layers.dense(hidden_policy, self.a_size, activation=None,
                         kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
    log_sigma_sq = tf.get_variable("log_sigma_squared", [self.a_size], dtype=tf.float32,
                                   initializer=tf.zeros_initializer())
    sigma_sq = tf.exp(log_sigma_sq)
    epsilon = tf.random_normal(tf.shape(mu), dtype=tf.float32)

    # Clip and scale output to ensure actions are always within [-1, 1] range.
    self.output_pre = mu + tf.sqrt(sigma_sq) * epsilon
    output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(output_post, name='action')
    self.selected_actions = tf.stop_gradient(output_post)

    # Compute probability of model output: Gaussian density of the unclipped
    # sample, with the sample itself excluded from the gradient.
    deviation = tf.stop_gradient(self.output_pre) - mu
    squared_deviation = tf.pow(deviation, 2)
    a = tf.exp(-1 * squared_deviation / (2 * sigma_sq))
    b = 1 / tf.sqrt(2 * sigma_sq * np.pi)
    all_probs = tf.multiply(a, b)
    self.all_probs = tf.identity(all_probs, name='action_probs')

    self.entropy = tf.reduce_mean(0.5 * tf.log(2 * np.pi * np.e * sigma_sq))

    value = tf.layers.dense(hidden_value, 1, activation=None)
    self.value = tf.identity(value, name="value_estimate")

    self.all_old_probs = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32,
                                        name='old_probabilities')

    # We keep these tensors the same name, but use new nodes to keep code
    # parallelism with discrete control.
    self.probs = tf.identity(self.all_probs)
    self.old_probs = tf.identity(self.all_old_probs)
def variance_scaling(factor=2.0, mode='FAN_IN', uniform=False, seed=None,
                     dtype=tf.float32, name='Xavier'):
    """Variance Scaling.

    Thin wrapper that opens ``get_name_scope(name)`` and returns
    ``tflayers.variance_scaling_initializer`` built from the given arguments.

    When initializing a deep network, it is in principle advantageous to keep
    the scale of the input variance constant, so it does not explode or
    diminish by reaching the final layer. The wrapped initializer is documented
    as using the following formula:

    ```
    if mode='FAN_IN':     # Count only number of input connections.
        n = fan_in
    elif mode='FAN_OUT':  # Count only number of output connections.
        n = fan_out
    elif mode='FAN_AVG':  # Average number of inputs and output connections.
        n = (fan_in + fan_out)/2.0

    truncated_normal(shape, 0.0, stddev=sqrt(factor / n))
    ```

    Common presets:

    - http://arxiv.org/pdf/1502.01852v1.pdf (the default):
      factor=2.0 mode='FAN_IN' uniform=False
    - http://arxiv.org/abs/1408.5093:
      factor=1.0 mode='FAN_IN' uniform=True
    - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf:
      factor=1.0 mode='FAN_AVG' uniform=True
    - xavier_initializer, either of:
      factor=1.0 mode='FAN_AVG' uniform=True
      factor=1.0 mode='FAN_AVG' uniform=False

    Args:
        factor: Float. A multiplicative factor.
        mode: String. 'FAN_IN', 'FAN_OUT', 'FAN_AVG'.
        uniform: Whether to use uniform or normal distributed random
            initialization.
        seed: A Python integer. Used to create random seeds. See
            `set_random_seed` for behavior.
        dtype: The data type. Only floating point types are supported.
        name: Name of the scope the initializer is created in.

    Returns:
        The initializer returned by `tflayers.variance_scaling_initializer`.

    Raises:
        ValueError: if `dtype` is not a floating point type (raised by the
            wrapped initializer, per its documentation).
        TypeError: if `mode` is not in ['FAN_IN', 'FAN_OUT', 'FAN_AVG']
            (raised by the wrapped initializer, per its documentation).
    """
    with get_name_scope(name):
        initializer = tflayers.variance_scaling_initializer(
            factor=factor,
            mode=mode,
            uniform=uniform,
            seed=seed,
            dtype=dtype,
        )
        return initializer
def densenet_backbone(image, qw=1):
    """Build the quantized DenseNet-style backbone and return its logits.

    :param image: input handed to ``LinearWrap`` (presumably an image batch
        tensor -- confirm against the caller).
    :param qw: forwarded as ``nbit`` to ``Conv2DQuant``; quantization is
        switched on by default only when ``qw > 0``.
    :return: output of the final ``FullyConnected('linear', out_dim=1000)``.
    """
    # (scope name, count given to add_dense_block, extra kwargs). The trailing
    # number is the size note carried over from the original code.
    dense_blocks = (
        ('block0', 6, {}),  # 28
        ('block1', 12, {}),  # 14
        ('block2', 24, {}),  # 7
        ('block3', 16, {'last': True}),
    )
    conv_defaults = dict(
        nl=tf.identity,
        use_bias=False,
        W_init=variance_scaling_initializer(mode='FAN_IN'),
        data_format=get_arg_scope()['Conv2D']['data_format'],
        nbit=qw,
        is_quant=bool(qw > 0),
    )
    with argscope(Conv2DQuant, **conv_defaults):
        net = LinearWrap(image)
        # The stem convolution is explicitly excluded from quantization.
        net = net.Conv2DQuant('conv1', 2 * GROWTH_RATE, 7, stride=2, nl=BNReLU, is_quant=False)
        net = net.MaxPooling('pool1', shape=3, stride=2, padding='SAME')  # 56

        for block_name, block_size, extra in dense_blocks:
            net = net.apply(add_dense_block, block_name, block_size, **extra)

        net = net.BNReLU('bnrelu_last')
        net = net.GlobalAvgPooling('gap')
        net = net.FullyConnected('linear', out_dim=1000, nl=tf.identity,
                                 W_init=variance_scaling_initializer(mode='FAN_IN'))
        logits = net()
    return logits
def create_continuous_state_encoder(self, h_size, activation, num_layers):
    """
    Stacks ``num_layers`` dense layers on top of ``self.normalized_state``.

    :param h_size: Hidden layer size.
    :param activation: What type of activation function to use for layers.
    :param num_layers: Number of hidden layers to create.
    :return: Output tensor of the last dense layer (``self.normalized_state``
        itself when ``num_layers`` is 0).
    """
    net = self.normalized_state
    for _ in range(num_layers):
        # A fresh initializer is built for every layer, as in the original.
        dense_init = c_layers.variance_scaling_initializer(1.0)
        net = tf.layers.dense(net, h_size, activation=activation,
                              kernel_initializer=dense_init)
    return net
def conv1d(self, net, num_ker, ker_size, stride):
    """1D-convolution, expressed as a 2-D convolution with a [ker_size, 1] kernel.

    :param net: input passed to ``convolution2d``.
    :param num_ker: number of output channels (``num_outputs``).
    :param ker_size: kernel extent along the first spatial axis.
    :param stride: stride along the first spatial axis.
    :return: result of ``convolution2d`` (no activation, no normalizer).
    """
    conv_kwargs = dict(
        num_outputs=num_ker,
        # The second spatial axis is left untouched: kernel and stride of 1.
        kernel_size=[ker_size, 1],
        stride=[stride, 1],
        padding='SAME',
        activation_fn=None,
        normalizer_fn=None,
        weights_initializer=variance_scaling_initializer(),
        weights_regularizer=l2_regularizer(self.weight_decay),
        # NOTE(review): the initializer is passed uncalled, exactly as in the
        # original -- confirm this matches the TensorFlow version in use.
        biases_initializer=tf.zeros_initializer,
    )
    return convolution2d(net, **conv_kwargs)
def resnet_backbone(image, num_blocks, group_func, block_func, qw=1):
    """Build the quantized ResNet-style backbone and return its logits.

    :param image: input handed to ``LinearWrap`` (presumably an image batch
        tensor -- confirm against the caller).
    :param num_blocks: indexable with 0..3; entry ``i`` is passed to
        ``group_func`` for 'group<i>'.
    :param group_func: callable applied once per group through ``.apply``.
    :param block_func: forwarded unchanged to ``group_func``.
    :param qw: forwarded as ``nbit`` to ``Conv2DQuant``.
    :return: output of the final ``FullyConnected('linear', 1000)`` layer.
    """
    # (scope name, width, index into num_blocks, stride, extra kwargs).
    # num_blocks is indexed inside the loop so that lookups happen in the same
    # order, and at the same point, as in a chained expression.
    group_specs = (
        ('group0', 64, 0, 1, {}),
        ('group1', 128, 1, 2, {}),
        ('group2', 256, 2, 2, {}),
        ('group3', 512, 3, 2, {'is_last': True}),
    )
    conv_defaults = dict(
        nl=tf.identity,
        use_bias=False,
        W_init=variance_scaling_initializer(mode='FAN_OUT'),
        data_format=get_arg_scope()['Conv2D']['data_format'],
        nbit=qw,
    )
    with argscope(Conv2DQuant, **conv_defaults):
        net = LinearWrap(image)
        # The stem convolution is explicitly excluded from quantization.
        net = net.Conv2DQuant('conv0', 64, 7, stride=2, nl=BNReLU, is_quant=False)
        net = net.MaxPooling('pool0', shape=3, stride=2, padding='SAME')

        for group_name, width, block_idx, stride, extra in group_specs:
            net = net.apply(group_func, group_name, block_func, width,
                            num_blocks[block_idx], stride, **extra)

        net = net.GlobalAvgPooling('gap')
        net = net.FullyConnected('linear', 1000, nl=tf.identity)
        logits = net()
    return logits
def compressed_readout(rnn_out, hparams, dropout, seed):
    """
    FC compression layer, reduces RNN output depth to hparams.attention_depth

    :param rnn_out: tensor fed to the dense layer (after optional dropout)
    :param hparams: object providing ``attention_depth``, the number of output units
    :param dropout: value passed as the second argument of ``tf.nn.dropout``;
        dropout is skipped when it is >= 1.0
    :param seed: seed for dropout and for the kernel initializer
    :return: output of the 'compress_readout' dense layer (SELU activation)
    """
    readout_input = tf.nn.dropout(rnn_out, dropout, seed=seed) if dropout < 1.0 else rnn_out
    kernel_init = layers.variance_scaling_initializer(factor=1.0, seed=seed)
    return tf.layers.dense(
        readout_input,
        hparams.attention_depth,
        use_bias=True,
        activation=selu,
        kernel_initializer=kernel_init,
        name='compress_readout',
    )
def create_continuous_observation_encoder(observation_input, h_size, activation, num_layers, scope, reuse):
    """
    Stacks ``num_layers`` dense layers named "hidden_<i>" on the observation input.

    :param observation_input: Input vector.
    :param h_size: Hidden layer size.
    :param activation: What type of activation function to use for layers.
    :param num_layers: Number of hidden layers to create.
    :param scope: Graph scope for the encoder ops.
    :param reuse: Whether to re-use the weights within the same scope.
    :return: Output tensor of the last dense layer (the input itself when
        ``num_layers`` is 0).
    """
    with tf.variable_scope(scope):
        net = observation_input
        for layer_idx in range(num_layers):
            layer_name = "hidden_{}".format(layer_idx)
            # A fresh initializer is built for every layer, as in the original.
            dense_init = c_layers.variance_scaling_initializer(1.0)
            net = tf.layers.dense(net, h_size, activation=activation, reuse=reuse,
                                  name=layer_name, kernel_initializer=dense_init)
    return net
def create_cc_actor_critic(self, h_size, num_layers):
    """
    Creates the continuous-control actor-critic heads.

    Two observation streams are built, one for the policy and one for the
    value head. With ``self.use_recurrent`` set, each stream gets its own
    recurrent encoder fed with one half of ``memory_in``.

    :param h_size: Hidden size forwarded to ``create_new_obs``.
    :param num_layers: Layer count forwarded to ``create_new_obs``.
    """
    hidden_streams = self.create_new_obs(2, h_size, num_layers)

    if not self.use_recurrent:
        hidden_policy = hidden_streams[0]
        hidden_value = hidden_streams[1]
    else:
        # The Python handle is not kept; only the named graph variable is created.
        tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        # First half of the memory feeds the policy LSTM, second half the value LSTM.
        _half_point = int(self.m_size / 2)
        hidden_policy, memory_policy_out = self.create_recurrent_encoder(
            hidden_streams[0], self.memory_in[:, :_half_point], name='lstm_policy')
        hidden_value, memory_value_out = self.create_recurrent_encoder(
            hidden_streams[1], self.memory_in[:, _half_point:], name='lstm_value')
        self.memory_out = tf.concat([memory_policy_out, memory_value_out], axis=1,
                                    name='recurrent_out')

    # Gaussian policy: state-dependent mean, state-independent log-variance.
    self.mu = tf.layers.dense(hidden_policy, self.a_size, activation=None, use_bias=False,
                              kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
    self.log_sigma_sq = tf.get_variable("log_sigma_squared", [self.a_size], dtype=tf.float32,
                                        initializer=tf.zeros_initializer())
    self.sigma_sq = tf.exp(self.log_sigma_sq)
    self.epsilon = tf.random_normal(tf.shape(self.mu), dtype=tf.float32)

    sampled_action = self.mu + tf.sqrt(self.sigma_sq) * self.epsilon
    self.output = tf.identity(sampled_action, name='action')

    # Gaussian density of the sampled action; the sample itself is excluded
    # from the gradient.
    deviation = tf.stop_gradient(self.output) - self.mu
    squared_deviation = tf.pow(deviation, 2)
    a = tf.exp(-1 * squared_deviation / (2 * self.sigma_sq))
    b = 1 / tf.sqrt(2 * self.sigma_sq * np.pi)
    self.all_probs = tf.multiply(a, b, name="action_probs")

    self.entropy = tf.reduce_mean(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))

    value_estimate = tf.layers.dense(hidden_value, 1, activation=None)
    self.value = tf.identity(value_estimate, name="value_estimate")

    self.all_old_probs = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32,
                                        name='old_probabilities')

    # We keep these tensors the same name, but use new nodes to keep code
    # parallelism with discrete control.
    self.probs = tf.identity(self.all_probs)
    self.old_probs = tf.identity(self.all_old_probs)
def _myForwardPass(self):
    """Score the predicted polygons with a small conv net.

    A 28x28 canvas is accumulated from the per-step predictions, concatenated
    with the CNN features, the mask images and the last slice of both cell
    tensors along axis 3, then passed through two 3x3 convolutions and one
    fully connected layer with a single output.

    :return: output of the "FC" fully connected layer.
    """
    placeholders = self._ph
    cnn_feats = placeholders.cnn_feats
    pred_polys = placeholders.pred_polys
    pred_mask_imgs = placeholders.pred_mask_imgs
    # Keep only the last entry along axis 1 of each cell tensor.
    last_cell_state_1 = placeholders.cells_1[:, -1, :, :, :]
    last_cell_state_2 = placeholders.cells_2[:, -1, :, :, :]
    weight_decay = 0.00001

    # Drawing the canvas
    canvas = tf.zeros(shape=(self.batch_size, 28, 28, 1))
    for step in range(self.seq_len):
        step_points = pred_polys[:, step]  # batch x
        batch_ids = tf.reshape(tf.range(0, self.batch_size), (self.batch_size, 1))
        # Prefix each point with its batch index so scatter_nd can address
        # (batch, row, col) cells of the canvas.
        indices = tf.concat([batch_ids, tf.cast(step_points, tf.int32)], axis=1)
        updates = tf.ones(shape=self.batch_size)
        step_canvas = tf.scatter_nd(indices, updates, shape=(self.batch_size, 28, 28))
        canvas = canvas + tf.expand_dims(step_canvas, axis=-1)

    xt = tf.concat(
        [cnn_feats, canvas, pred_mask_imgs, last_cell_state_1, last_cell_state_2],
        axis=3)

    batch_norm_params = {
        "is_training": self.is_training,
        "decay": 0.99,
        "center": True,
        "scale": True,
    }
    with slim.arg_scope(
            [slim.conv2d],
            kernel_size=[3, 3],
            stride=1,
            weights_regularizer=slim.l2_regularizer(weight_decay),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm_params,
            weights_initializer=layers.variance_scaling_initializer(
                factor=2.0, mode='FAN_IN', uniform=False)):
        self._conv1 = slim.conv2d(xt, scope="conv1", num_outputs=16)
        self._conv2 = slim.conv2d(self._conv1, scope="conv2", num_outputs=1)

    flattened = slim.flatten(self._conv2)
    output = layers.fully_connected(flattened, 1,
                                    weights_regularizer=layers.l2_regularizer(1e-5),
                                    scope="FC")
    return output
def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
    """
    Creates Continuous Control Actor-Critic model.

    :param lr: Learning rate, forwarded to ``create_ppo_optimizer``.
    :param brain: Brain description; its observation and state settings decide
        which encoders are built.
    :param h_size: Hidden layer size.
    :param epsilon: Forwarded to ``create_ppo_optimizer``.
    :param max_step: Forwarded to ``create_ppo_optimizer``.
    :param normalize: Stored as ``self.normalize``.
    :param num_layers: Forwarded to the encoder builders.
    :raises Exception: when the brain has neither observations nor states.
    """
    super(ContinuousControlModel, self).__init__()
    s_size = brain.state_space_size
    a_size = brain.action_space_size

    self.normalize = normalize
    self.create_global_steps()
    self.create_reward_encoder()

    hidden_visual = None
    hidden_state = None

    # One visual encoder per camera observation.
    if brain.number_observations > 0:
        encoders = []
        for i in range(brain.number_observations):
            camera = brain.camera_resolutions[i]
            height_size, width_size = camera['height'], camera['width']
            bw = camera['blackAndWhite']
            encoders.append(
                self.create_visual_encoder(height_size, width_size, bw, h_size, 2,
                                           tf.nn.tanh, num_layers))
        hidden_visual = tf.concat(encoders, axis=2)

    # State encoder, chosen by the state space type.
    if s_size > 0:
        if brain.state_space_type == "continuous":
            build_state_encoder = self.create_continuous_state_encoder
        else:
            build_state_encoder = self.create_discrete_state_encoder
        hidden_state = build_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)

    has_visual = hidden_visual is not None
    has_state = hidden_state is not None
    if not has_visual and not has_state:
        raise Exception(
            "No valid network configuration possible. "
            "There are no states or observations in this brain")
    if has_visual and has_state:
        # Entry 0 feeds the policy head, entry 1 the value head.
        hidden_policy = tf.concat([hidden_visual[0], hidden_state[0]], axis=1)
        hidden_value = tf.concat([hidden_visual[1], hidden_state[1]], axis=1)
    elif has_visual:
        hidden_policy, hidden_value = hidden_visual
    else:
        hidden_policy, hidden_value = hidden_state

    self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')

    # Gaussian policy: state-dependent mean, state-independent log-variance.
    self.mu = tf.layers.dense(
        hidden_policy, a_size, activation=None, use_bias=False,
        kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
    self.log_sigma_sq = tf.get_variable("log_sigma_squared", [a_size], dtype=tf.float32,
                                        initializer=tf.zeros_initializer())
    self.sigma_sq = tf.exp(self.log_sigma_sq)

    # The noise is fed from outside through a placeholder.
    self.epsilon = tf.placeholder(shape=[None, a_size], dtype=tf.float32, name='epsilon')

    sampled_action = self.mu + tf.sqrt(self.sigma_sq) * self.epsilon
    self.output = tf.identity(sampled_action, name='action')

    # Gaussian density of the sampled action; the sample itself is excluded
    # from the gradient.
    deviation = tf.stop_gradient(self.output) - self.mu
    squared_deviation = tf.pow(deviation, 2)
    a = tf.exp(-1 * squared_deviation / (2 * self.sigma_sq))
    b = 1 / tf.sqrt(2 * self.sigma_sq * np.pi)
    self.probs = tf.multiply(a, b, name="action_probs")

    self.entropy = tf.reduce_sum(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))

    value_estimate = tf.layers.dense(hidden_value, 1, activation=None, use_bias=False)
    self.value = tf.identity(value_estimate, name="value_estimate")

    self.old_probs = tf.placeholder(shape=[None, a_size], dtype=tf.float32,
                                    name='old_probabilities')

    self.create_ppo_optimizer(self.probs, self.old_probs, self.value, self.entropy,
                              0.0, epsilon, lr, max_step)
def create_dc_actor_critic(self, h_size, num_layers):
    """
    Creates Discrete control actor-critic model.

    One bias-free dense "branch" is built per entry of ``self.act_size``; the
    branches are concatenated, masked through
    ``create_discrete_action_masking_layer`` and used to derive the entropy and
    the (old) log-probabilities of the actions fed through ``action_holder``.

    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    """
    hidden = self.create_observation_streams(1, h_size, num_layers)[0]

    if self.use_recurrent:
        self.prev_action = tf.placeholder(shape=[None, len(self.act_size)], dtype=tf.int32,
                                          name='prev_action')
        prev_one_hots = [
            tf.one_hot(self.prev_action[:, i], self.act_size[i])
            for i in range(len(self.act_size))
        ]
        prev_action_oh = tf.concat(prev_one_hots, axis=1)
        hidden = tf.concat([hidden, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        hidden, memory_out = self.create_recurrent_encoder(
            hidden, self.memory_in, self.sequence_length)
        self.memory_out = tf.identity(memory_out, name='recurrent_out')

    # One set of logits per action branch.
    policy_branches = [
        tf.layers.dense(
            hidden, size, activation=None, use_bias=False,
            kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
        for size in self.act_size
    ]
    self.all_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

    self.action_masks = tf.placeholder(shape=[None, sum(self.act_size)], dtype=tf.float32,
                                       name="action_masks")
    output, normalized_logits = self.create_discrete_action_masking_layer(
        self.all_log_probs, self.action_masks, self.act_size)

    self.output = tf.identity(output)
    self.normalized_logits = tf.identity(normalized_logits, name='action')

    value = tf.layers.dense(hidden, 1, activation=None)
    self.value = tf.identity(value, name="value_estimate")

    self.action_holder = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32,
                                        name="action_holder")
    taken_one_hots = [
        tf.one_hot(self.action_holder[:, i], self.act_size[i])
        for i in range(len(self.act_size))
    ]
    self.action_oh = tf.concat(taken_one_hots, axis=1)
    self.selected_actions = tf.stop_gradient(self.action_oh)

    self.all_old_log_probs = tf.placeholder(shape=[None, sum(self.act_size)],
                                            dtype=tf.float32, name='old_probabilities')
    _, old_normalized_logits = self.create_discrete_action_masking_layer(
        self.all_old_log_probs, self.action_masks, self.act_size)

    # Column range [start, stop) of every branch inside the concatenated tensors.
    action_idx = [0] + list(np.cumsum(self.act_size))
    branch_bounds = list(zip(action_idx[:-1], action_idx[1:]))

    # Per-branch entropy: cross-entropy of each branch's softmax with itself.
    entropy_terms = [
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.nn.softmax(self.all_log_probs[:, start:stop]),
            logits=self.all_log_probs[:, start:stop])
        for start, stop in branch_bounds
    ]
    self.entropy = tf.reduce_sum(tf.stack(entropy_terms, axis=1), axis=1)

    # Log-probability of the taken action: negated cross-entropy per branch,
    # summed over the branches.
    log_prob_terms = [
        -tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.action_oh[:, start:stop],
            logits=normalized_logits[:, start:stop])
        for start, stop in branch_bounds
    ]
    self.log_probs = tf.reduce_sum(tf.stack(log_prob_terms, axis=1), axis=1, keepdims=True)

    old_log_prob_terms = [
        -tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.action_oh[:, start:stop],
            logits=old_normalized_logits[:, start:stop])
        for start, stop in branch_bounds
    ]
    self.old_log_probs = tf.reduce_sum(tf.stack(old_log_prob_terms, axis=1), axis=1,
                                       keepdims=True)
def _build_model(self, input_batch):
    """Build the 12-layer convolutional classifier and return its 2-way output.

    ``input_batch`` is split in two along axis 3 into an image part and a
    "Beta" part. Layer 1 convolves the image with a learned 3x3 kernel ``W``
    and adds the Beta part convolved with ``abs(W)``. Layers 2-7 are
    16-channel conv/batch-norm blocks (3-7 with additive shortcuts), layers
    8-11 are down-sampling blocks with a strided 1x1 shortcut, and layer 12
    ends in a mean over the spatial axes followed by a fully connected layer.

    :param input_batch: tensor whose axis 3 holds the image and Beta parts
        (it is transposed with [0, 3, 1, 2] when ``self.data_format`` is
        'NCHW', i.e. it arrives channels-last).
    :return: output of the 'ip' fully connected layer (also stored in
        ``self.outputs``).
    """
    inputs_image, inputs_Beta = tf.split(input_batch, num_or_size_splits=2, axis=3)
    if self.data_format == 'NCHW':
        # Spatial axes are 2 and 3 once channels are moved to axis 1.
        reduction_axis = [2, 3]
        _inputs_image = tf.cast(tf.transpose(inputs_image, [0, 3, 1, 2]), tf.float32)
        _inputs_Beta = tf.cast(tf.transpose(inputs_Beta, [0, 3, 1, 2]), tf.float32)
    else:
        reduction_axis = [1, 2]
        _inputs_image = tf.cast(inputs_image, tf.float32)
        _inputs_Beta = tf.cast(inputs_Beta, tf.float32)
    # Defaults for every layers.conv2d / batch_norm / avg_pool2d call below;
    # individual calls override num_outputs, kernel_size and stride as needed.
    with arg_scope([layers.conv2d], num_outputs=16,
                   kernel_size=3, stride=1, padding='SAME',
                   data_format=self.data_format,
                   activation_fn=None,
                   weights_initializer=layers.variance_scaling_initializer(),
                   weights_regularizer=layers.l2_regularizer(2e-4),
                   biases_initializer=tf.constant_initializer(0.2),
                   biases_regularizer=None),\
        arg_scope([layers.batch_norm],
                  decay=0.9, center=True, scale=True,
                  updates_collections=None, is_training=self.is_training,
                  fused=True, data_format=self.data_format),\
        arg_scope([layers.avg_pool2d],
                  kernel_size=[3,3], stride=[2,2], padding='SAME',
                  data_format=self.data_format):
        # The "# H*W" notes on the scopes below are the original author's
        # annotations (presumably the spatial resolution -- not verifiable here).
        with tf.variable_scope('Layer1'): # 256*256
            # Layer 1 is written by hand so that the same kernel W can be
            # applied to the image and, through abs(W), to the Beta input.
            W = tf.get_variable('W', shape=[3,3,1,64],\
                        initializer=layers.variance_scaling_initializer(), \
                        dtype=tf.float32, \
                        regularizer=layers.l2_regularizer(5e-4))
            b = tf.get_variable('b', shape=[64], dtype=tf.float32, \
                        initializer=tf.constant_initializer(0.2))
            conv = tf.nn.bias_add( \
                    tf.nn.conv2d(tf.cast(_inputs_image, tf.float32), \
                    W, [1,1,1,1], 'SAME', \
                    data_format=self.data_format), b, \
                    data_format=self.data_format, name='Layer1')
            actv = tf.nn.relu(conv)
            prob_map = tf.nn.conv2d(tf.cast(_inputs_Beta, tf.float32), \
                    tf.abs(W), [1,1,1,1], 'SAME', \
                    data_format=self.data_format)
            out_L1 = tf.add_n([actv, prob_map])
        with tf.variable_scope('Layer2'): # 256*256
            conv = layers.conv2d(out_L1)
            actv = tf.nn.relu(layers.batch_norm(conv))
        # Layers 3-7: conv-BN-ReLU-conv-BN added to the block input.
        with tf.variable_scope('Layer3'): # 256*256
            conv1 = layers.conv2d(actv)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1)
            bn2 = layers.batch_norm(conv2)
            res = tf.add(actv, bn2)
        with tf.variable_scope('Layer4'): # 256*256
            conv1 = layers.conv2d(res)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1)
            bn2 = layers.batch_norm(conv2)
            res = tf.add(res, bn2)
        with tf.variable_scope('Layer5'): # 256*256
            conv1 = layers.conv2d(res)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1)
            bn = layers.batch_norm(conv2)
            res = tf.add(res, bn)
        with tf.variable_scope('Layer6'): # 256*256
            conv1 = layers.conv2d(res)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1)
            bn = layers.batch_norm(conv2)
            res = tf.add(res, bn)
        with tf.variable_scope('Layer7'): # 256*256
            conv1 = layers.conv2d(res)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1)
            bn = layers.batch_norm(conv2)
            res = tf.add(res, bn)
        # Layers 8-11: the main path is average-pooled with stride 2 and added
        # to a stride-2 1x1 convolution of the block input.
        with tf.variable_scope('Layer8'): # 256*256
            convs = layers.conv2d(res, kernel_size=1, stride=2)
            convs = layers.batch_norm(convs)
            conv1 = layers.conv2d(res)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1)
            bn = layers.batch_norm(conv2)
            pool = layers.avg_pool2d(bn)
            res = tf.add(convs, pool)
        with tf.variable_scope('Layer9'):  # 128*128
            convs = layers.conv2d(res, num_outputs=64, kernel_size=1, stride=2)
            convs = layers.batch_norm(convs)
            conv1 = layers.conv2d(res, num_outputs=64)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1, num_outputs=64)
            bn = layers.batch_norm(conv2)
            pool = layers.avg_pool2d(bn)
            res = tf.add(convs, pool)
        with tf.variable_scope('Layer10'): # 64*64
            convs = layers.conv2d(res, num_outputs=128, kernel_size=1, stride=2)
            convs = layers.batch_norm(convs)
            conv1 = layers.conv2d(res, num_outputs=128)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1, num_outputs=128)
            bn = layers.batch_norm(conv2)
            pool = layers.avg_pool2d(bn)
            res = tf.add(convs, pool)
        with tf.variable_scope('Layer11'): # 32*32
            convs = layers.conv2d(res, num_outputs=256, kernel_size=1, stride=2)
            convs = layers.batch_norm(convs)
            conv1 = layers.conv2d(res, num_outputs=256)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1, num_outputs=256)
            bn = layers.batch_norm(conv2)
            pool = layers.avg_pool2d(bn)
            res = tf.add(convs, pool)
        with tf.variable_scope('Layer12'): # 16*16
            conv1 = layers.conv2d(res, num_outputs=512)
            actv1 = tf.nn.relu(layers.batch_norm(conv1))
            conv2 = layers.conv2d(actv1, num_outputs=512)
            bn = layers.batch_norm(conv2)
            # Mean over the spatial axes selected above for the data format.
            avgp = tf.reduce_mean(bn, reduction_axis,  keep_dims=True )
    # Final 2-way linear layer on the flattened pooled features.
    ip = layers.fully_connected(layers.flatten(avgp), num_outputs=2,
                activation_fn=None, normalizer_fn=None,
                weights_initializer=tf.random_normal_initializer(mean=0., stddev=0.01),
                biases_initializer=tf.constant_initializer(0.), scope='ip')
    self.outputs = ip
    return self.outputs
def cudnn_lstm_layer(inputs,
                     batch_size,
                     num_units,
                     lengths=None,
                     stack_size=1,
                     rnn_dropout_drop_amt=0,
                     is_training=True,
                     bidirectional=True):
    """Run `inputs` through a CudnnLSTM stack and return the top-level outputs.

    `inputs` is transposed with perm [1, 0, 2] before it reaches the LSTM and
    the result is transposed back the same way, so the caller receives the
    outputs in the axis order it passed in.

    Two code paths exist:

    * `lengths` is None: a single CudnnLSTM with `stack_size` layers
      (bidirectional when requested) handles the whole stack.
    * `lengths` is given: every stack level is built from separate
      unidirectional CudnnLSTM layers; the backward direction is obtained by
      reversing each sequence by its own length before and after the LSTM.

    Args:
      inputs: rank-3 tensor. After the transpose, axis 0 is used as the
        sequence axis and axis 1 as the batch axis.
      batch_size: batch dimension used to size the zero initial states.
      num_units: number of LSTM units per direction.
      lengths: optional per-example lengths passed to `tf.reverse_sequence`.
      stack_size: number of stacked LSTM levels.
      rnn_dropout_drop_amt: forwarded as the CudnnLSTM `dropout` argument.
      is_training: forwarded as the `training` flag of each LSTM call.
      bidirectional: whether a backward direction is added.

    Returns:
      Outputs of the top stack level, transposed back with perm [1, 0, 2].
    """
    time_major = tf.transpose(inputs, [1, 0, 2])

    if lengths is None:
        # No per-example lengths: let cudnn handle stacking and direction.
        fused_lstm = contrib_cudnn_rnn.CudnnLSTM(
            num_layers=stack_size,
            num_units=num_units,
            direction='bidirectional' if bidirectional else 'unidirectional',
            dropout=rnn_dropout_drop_amt,
            kernel_initializer=contrib_layers.variance_scaling_initializer(),
            bias_initializer=tf.zeros_initializer(),
        )
        directions = 2 if bidirectional else 1
        state_shape = [directions * stack_size, batch_size, num_units]
        c = tf.zeros(state_shape, tf.float32)
        h = tf.zeros(state_shape, tf.float32)
        fused_out, _ = fused_lstm(time_major, (h, c), training=is_training)
        return tf.transpose(fused_out, [1, 0, 2])

    level_input = time_major
    for level in range(stack_size):
        with tf.variable_scope('stack_' + str(level)):
            with tf.variable_scope('forward'):
                fw_lstm = contrib_cudnn_rnn.CudnnLSTM(
                    num_layers=1,
                    num_units=num_units,
                    direction='unidirectional',
                    dropout=rnn_dropout_drop_amt,
                    kernel_initializer=contrib_layers.variance_scaling_initializer(),
                    bias_initializer=tf.zeros_initializer(),
                )

            c_fw = tf.zeros([1, batch_size, num_units], tf.float32)
            h_fw = tf.zeros([1, batch_size, num_units], tf.float32)
            fw_out, _ = fw_lstm(level_input, (h_fw, c_fw), training=is_training)
            level_output = fw_out

            if bidirectional:
                with tf.variable_scope('backward'):
                    bw_lstm = contrib_cudnn_rnn.CudnnLSTM(
                        num_layers=1,
                        num_units=num_units,
                        direction='unidirectional',
                        dropout=rnn_dropout_drop_amt,
                        kernel_initializer=contrib_layers.variance_scaling_initializer(),
                        bias_initializer=tf.zeros_initializer(),
                    )

                c_bw = tf.zeros([1, batch_size, num_units], tf.float32)
                h_bw = tf.zeros([1, batch_size, num_units], tf.float32)

                # Reverse each sequence by its own length, run a forward LSTM,
                # then undo the reversal so outputs line up with fw_out.
                flipped = tf.reverse_sequence(
                    level_input, lengths, seq_axis=0, batch_axis=1)
                bw_out, _ = bw_lstm(flipped, (h_bw, c_bw), training=is_training)
                bw_out = tf.reverse_sequence(
                    bw_out, lengths, seq_axis=0, batch_axis=1)

                level_output = tf.concat([fw_out, bw_out], axis=2)

        level_input = level_output

    # for consistency with cudnn, here we just return the top of the stack,
    # although this can easily be altered to do other things, including be
    # more resnet like
    return tf.transpose(level_input, [1, 0, 2])
def _build_graph(self, inputs):
    """Assemble the classifier graph, its loss terms and its summaries.

    `inputs` is unpacked into `(inp, label)`. The feature network
    `self.net_fn` is followed by a linear layer with `self.n_spks` outputs.
    Besides the per-example cross-entropy and top-1 error, a majority vote
    over the per-example predictions is compared against `label[0]` and
    exposed as the tensor named 'utt-wrong'.

    Sets `self.cost` (tensor named 'cost'); with `self.regularize` an
    exponentially decayed L2 penalty on all '.*/W' variables is added.
    """
    inp, label = inputs

    with argscope([Conv2DWithTrackedMults, BatchNorm], data_format='NHWC'), \
            argscope([Conv2DWithTrackedMults], nl=tf.identity,
                     W_init=variance_scaling_initializer(mode='FAN_OUT')), \
            argscope([DepthwiseSeparableConvWithTrackedMults,
                      Conv2DWithTrackedMults,
                      FullyConnectedWithTrackedMults],
                     network_complexity=self.network_complexity):
        features = self.net_fn(inp, self.batchnorm, self.n_context)
        logits = FullyConnectedWithTrackedMults(
            'last_linear', features, out_dim=self.n_spks, nl=tf.identity,
            network_complexity=self.network_complexity)

    prob = tf.nn.softmax(logits, name='output')

    # used for validation accuracy of utterance: the class predicted most
    # often across the batch is the guess for the whole batch.
    votes = flatten(tf.argmax(prob, axis=1))
    candidates, _, vote_counts = tf.unique_with_counts(votes)
    winner = tf.gather(candidates, tf.argmax(vote_counts))
    # Only the named graph tensor is needed; no Python reference is kept.
    tf.expand_dims(tf.not_equal(winner, tf.cast(label[0], tf.int64)),
                   axis=0, name='utt-wrong')

    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(xent, name='cross_entropy_loss')
    add_moving_summary(cost)

    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

    with tf.name_scope('original-weight-summaries'):
        add_param_summary(('.*/W', ['rms', 'histogram']))
        add_param_summary(('.*/b', ['rms', 'histogram']))

    with tf.name_scope('activation-summaries'):
        def is_output_name(name):
            return name.endswith(('output', 'output:0'))

        tensors = get_tensors_from_graph(tf.get_default_graph(), is_output_name)
        print("Adding activation tensors to summary:", tensors)
        for tensor in tensors:
            add_tensor_summary(tensor, ['rms', 'histogram'])

    if not self.regularize:
        self.cost = tf.identity(cost, name='cost')
    else:
        # decreasing regularization on all W of fc layers
        wd_w = tf.train.exponential_decay(
            0.0002, get_global_step_var(), 480000, 0.2, True)
        wd_cost = tf.multiply(
            wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
        add_moving_summary(wd_cost)
        self.cost = tf.add_n([cost, wd_cost], name='cost')

    # Export the complexity counters as named constants in the graph.
    tf.constant([self.network_complexity['mults']], name='TotalMults')
    tf.constant([self.network_complexity['weights']], name='TotalWeights')
    logger.info("Parameter count: {}".format(self.network_complexity))
def _build_graph(self, inputs):
    """Build a three-stage residual classifier whose repeated blocks share weights.

    `inputs` is unpacked into `(image, label)`. The image is divided by 128
    and transposed with perm [0, 3, 1, 2] to match the 'NCHW' data format set
    for the layers below. Each stage has a head block ('resN.0') followed by
    `self.n - 1` applications of one block ('resN.1') whose variables are
    reused from the second application on. The residual branch is scaled by
    0.01 before it is added to the shortcut.

    Sets `self.cost` to cross-entropy plus a decayed L2 penalty on '.*/W'.
    """
    image, label = inputs
    image = image / 128.0
    assert tf.test.is_gpu_available()
    image = tf.transpose(image, [0, 3, 1, 2])

    def residual(name, l, increase_dim=False, first=False, reuse=False):
        in_channel = l.get_shape().as_list()[1]
        if increase_dim:
            out_channel, stride1 = in_channel * 2, 2
        else:
            out_channel, stride1 = in_channel, 1

        with tf.variable_scope(name, reuse=reuse):
            pre_act = l if first else BNReLU(l)
            c1 = Conv2D('conv1', pre_act, out_channel, stride=stride1, nl=BNReLU)
            c2 = Conv2D('conv2', c1, out_channel)
            shortcut = l
            if increase_dim:
                # Halve the spatial size and zero-pad the channel axis so the
                # shortcut matches the doubled channel count.
                shortcut = AvgPooling('pool', shortcut, 2)
                shortcut = tf.pad(
                    shortcut,
                    [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
            return 0.01 * c2 + shortcut

    # Original size notes per stage: res1 -> 32,c=16; res2 -> 16,c=32;
    # res3 -> 8,c=64.
    stage_heads = (
        ('res1', {'first': True}),
        ('res2', {'increase_dim': True}),
        ('res3', {'increase_dim': True}),
    )

    with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
            argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3,
                     W_init=variance_scaling_initializer(mode='FAN_OUT')):
        l = Conv2D('conv0', image, 16, nl=BNReLU)
        for prefix, head_args in stage_heads:
            l = residual(prefix + '.0', l, **head_args)
            for k in range(1, self.n):
                # First application creates the variables, later ones reuse them.
                l = residual(prefix + '.1', l, reuse=k > 1)
        l = BNReLU('bnlast', l)
        l = GlobalAvgPooling('gap', l)

    logits = FullyConnected('linear', l, out_dim=10, nl=tf.identity)
    # Named 'output' so the probabilities can be fetched from the graph.
    tf.nn.softmax(logits, name='output')

    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(xent, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label)
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    # weight decay on every variable matching '.*/W'
    wd_w = tf.train.exponential_decay(
        0.0002, get_global_step_var(), 480000, 0.2, True)
    wd_cost = tf.multiply(
        wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W
    self.cost = tf.add_n([cost, wd_cost], name='cost')
from copy import copy
from tensorflow.contrib.layers import variance_scaling_initializer, xavier_initializer
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
import tensorflow.contrib.slim as slim
from tensorflow.python.ops.parallel_for.gradients import batch_jacobian
import itertools

# Draw a random seed and fix NumPy's RNG with it so the value can be reported.
seed = np.random.randint(0, 100)
np.random.seed(seed)

## initializer
x_init = xavier_initializer()
v_init = variance_scaling_initializer()


## activation functions
def sigmoid(x):
    """Return `tf.nn.sigmoid(x)`."""
    return tf.nn.sigmoid(x)


def softmax(x):
    """Return `tf.nn.softmax(x)`."""
    return tf.nn.softmax(x)


def relu_dropout(x, keep_prob):
    """Apply ReLU to `x`, then dropout with keep probability `keep_prob`.

    The previous implementation passed the constant 1 to `tf.nn.dropout`,
    which silently ignored `keep_prob` and disabled dropout altogether.
    """
    return tf.nn.dropout(tf.nn.relu(x), keep_prob)


def lrelu(x, alpha=0.2, name="LeakyReLU"):
    """Leaky ReLU computed as `max(x, alpha * x)`.

    `name` is accepted for interface compatibility but is not used.
    """
    return tf.maximum(x, alpha * x)
def _build_graph(self, inputs):
    """Build a residual classifier with random block dropping and a strict-identity gate.

    `inputs` is unpacked into `(image, label)`. Every residual block draws a
    drop probability `p` that grows with the running block counter `self.k`;
    in training a random 0/1 mask decides whether the block is replaced by
    its shortcut. The block output is additionally gated by
    `strict_identity(...)`, and the number of gated-off blocks is summarised
    as 'discarded_cnt' / 'discarded_ratio'.

    Sets `self.cost` to the sum of the weighted side-output costs, the
    cross-entropy and a decayed L2 penalty on '.*/W'.
    """
    self.is_training = get_current_tower_context().is_training
    image, label = inputs
    image = image / 128.0
    assert tf.test.is_gpu_available()
    # Matches the 'NCHW' data_format configured in the argscope below.
    image = tf.transpose(image, [0, 3, 1, 2])
    # Denominator for the discarded-block ratio reported below.
    all_cnt = tf.constant(self.n * 3 + 2, tf.float32, name="all_cnt")
    # One 'is_discarded' scalar per residual block, filled in by residual().
    preds = []
    # The Python value is not used below; the call is kept for its variable
    # and summary (summary=True).
    epsilon = get_scalar_var('epsilon', self.EPSILON, summary=True)

    def residual_convs(l, first, out_channel, stride1):
        # Pre-activation (skipped for the first block) followed by two convs.
        b1 = l if first else BNReLU(l)
        c1 = Conv2D('conv1', b1, out_channel, stride=stride1, nl=BNReLU)
        c2 = Conv2D('conv2', c1, out_channel)
        return c2

    def residual(name, l, increase_dim=False, first=False):
        shape = l.get_shape().as_list()
        in_channel = shape[1]
        block_type = name[3]  # not used below
        # Running block index; the drop probability scales with it.
        self.k = self.k + 1
        if (first):
            p = 0.0
        else:
            numerator = self.k
            p = numerator * float(1 - self.p_l) / (3 * self.n)

        if increase_dim:
            out_channel = in_channel * 2
            stride1 = 2
            # Shortcut is pooled and zero-padded on the channel axis to match
            # the doubled channel count.
            short_cut = AvgPooling('pool', l, 2)
            short_cut = tf.pad(short_cut, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
        else:
            out_channel = in_channel
            stride1 = 1
            short_cut = l

        with tf.variable_scope(name) as scope:
            if self.is_training:
                print("Inside the Training Block in Model CLASS")
                means = p
                # mask is 1 when the uniform sample falls below p, else 0.
                mask = tf.where((tf.random_uniform([1]) - means < 0), tf.ones([1], dtype=tf.float32), tf.zeros([1], dtype=tf.float32), 'mask')
                # Rescale factor 1 / (1 - p); the small constant avoids
                # division by zero when p == 1.
                invrted_drop_out = 1 / ((1 - means) + 0.000001)
                is_dropped = tf.nn.relu(tf.reduce_sum(mask))
                is_dropped = tf.where(tf.equal(is_dropped, 1.0), 1.0, 0.0, 'is_dropped')
                add_moving_summary(is_dropped)
            else:
                print("Inside the Test Block in Model CLASS")
                mask = 0
                invrted_drop_out = 1

            l = residual_convs(l, first, out_channel, stride1)
            l = mask * short_cut + (1.0 - mask) * l
            # With mask == 1 the gate is forced to 0, so the block output
            # below reduces to the shortcut alone.
            identity_w = (1.0 - mask) * strict_identity(l, self.EPSILON)
            identity_w = tf.nn.relu(tf.reduce_sum(identity_w))
            l = identity_w * invrted_drop_out * l + short_cut
            # 1.0 when the gate is exactly zero, i.e. the residual branch
            # contributes nothing.
            is_discarded = tf.where(tf.equal(identity_w, 0.0), 1.0, 0.0, 'is_discarded')
            preds.append(is_discarded)
            add_moving_summary(is_discarded)
            return l

    side_output_cost = []
    with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
            argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3, W_init=variance_scaling_initializer(mode='FAN_OUT')):
        l = Conv2D('conv0', image, 16, nl=BNReLU)
        l = residual('res1.0', l, first=True)
        for k in range(1, self.n):
            l = residual('res1.{}'.format(k), l)
        # 32,c=16
        l = residual('res2.0', l, increase_dim=True)
        for k in range(1, self.n):
            l = residual('res2.{}'.format(k), l)
            # NOTE(review): `/` is true division on Python 3, so this only
            # matches when self.n is even - confirm that is intended.
            if k == self.n / 2:
                side_output_cost.append(
                    side_output('res2.{}'.format(k), l, label, self.NUM_CLASS))
        # 16,c=32
        l = residual('res3.0', l, increase_dim=True)
        for k in range(1, self.n):
            l = residual('res3.' + str(k), l)
        l = BNReLU('bnlast', l)
        # 8,c=64
        l = GlobalAvgPooling('gap', l)

    logits = FullyConnected('linear', l, out_dim=self.NUM_CLASS, nl=tf.identity)
    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label)
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    # Number (and fraction) of blocks whose gate was zero.
    discarded_cnt = tf.add_n(preds, name="discarded_cnt")
    discarded_ratio = tf.divide(discarded_cnt, all_cnt, name="discarded_ratio")
    add_moving_summary(discarded_cnt, discarded_ratio)

    # weight decay on every variable matching '.*/W'
    wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(), 480000, 0.2, True)
    wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W

    # One weight per side output; only a single side output is expected here.
    side_loss_w = [0.1]
    side_output_cost = [tf.multiply(side_loss_w[i], side_output_cost[i])\
                        for i in range(len(side_output_cost))]
    loss = side_output_cost + [cost, wd_cost]
    self.cost = tf.add_n(loss, name='cost')
import tensorflow as tf from tensor2tensor.layers.common_layers import conv1d, dense from tensorflow.contrib.cudnn_rnn import CudnnLSTM from tensorflow.contrib.keras import backend from tensorflow.contrib.layers import variance_scaling_initializer, l2_regularizer initializer = lambda: variance_scaling_initializer( factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32) initializer_relu = lambda: variance_scaling_initializer( factor=2.0, mode='FAN_IN', uniform=False, dtype=tf.float32) regularizer = l2_regularizer(scale=3e-7) # cudnnLSTM def BiLSTM(x, filters, dropout=0.0, name='BiLSTM', layers=1, return_state=False): cudnn_lstm = CudnnLSTM(layers, filters, direction='bidirectional', name=name) if type(x) == list: assert len(x) == 2 x1, x2 = x # cudnn compatibility: time first, batch second x1 = tf.transpose(x1, [1, 0, 2]) x2 = tf.transpose(x2, [1, 0, 2]) x1, x1_state = cudnn_lstm(x1) # state:[2, bs, dim]
def _build_graph(self, inputs):
    """Build a residual classifier with a configurable number of blocks per stage.

    `inputs` is unpacked into `(image, label)`. The image is divided by 128
    and transposed with perm [0, 3, 1, 2] to match the 'NCHW' data format of
    the layers. `self.structure[i]` gives the number of blocks in stage
    i + 1; `self.discard_first_block[g - 1] == 1` replaces the
    dimension-increasing head block of stage g by a plain pooled and padded
    shortcut.

    Sets `self.cost` to cross-entropy plus a decayed L2 penalty on '.*/W'.
    """
    image, label = inputs
    image = image / 128.0
    assert tf.test.is_gpu_available()
    image = tf.transpose(image, [0, 3, 1, 2])
    # NOTE(review): this placeholder is never read in this method; the call
    # is kept only so the 'x-input' node still exists for code that may look
    # it up by name. Remove it once that is confirmed unnecessary.
    tf.placeholder(tf.int32, [None, 784], name='x-input')

    def first_block(name, l):
        """Head block of a stage: full residual block or bare shortcut."""
        in_channel = l.get_shape().as_list()[1]
        grp = int(name[3])  # stage number taken from 'resN.0'
        if self.discard_first_block[grp - 1] == 1:
            # Block is discarded: only down-sample and zero-pad the channels.
            l = AvgPooling('pool', l, 2)
            l = tf.pad(l, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
        else:
            l = residual(name, l, increase_dim=True)
        return l

    def residual(name, l, increase_dim=False, first=False):
        """Pre-activation residual block (two convs plus shortcut)."""
        in_channel = l.get_shape().as_list()[1]
        if increase_dim:
            out_channel = in_channel * 2
            stride1 = 2
        else:
            out_channel = in_channel
            stride1 = 1

        # implement: full pre-activation
        with tf.variable_scope(name):
            b1 = l if first else BNReLU(l)
            c1 = Conv2D('conv1', b1, out_channel, stride=stride1, nl=BNReLU)
            c2 = Conv2D('conv2', c1, out_channel)
            if increase_dim:
                l = AvgPooling('pool', l, 2)
                l = tf.pad(l, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])

            l = c2 + l
            return l

    # A stray `l = AvgPooling('pool', l, 2)` used to follow the `return`
    # above. Depending on its nesting it was either unreachable or read `l`
    # before assignment, so it has been removed.

    with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
            argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3,
                     W_init=variance_scaling_initializer(mode='FAN_OUT')):
        l = Conv2D('conv0', image, 16, nl=BNReLU)
        l = residual('res1.0', l, first=True)
        for k in range(1, self.structure[0]):
            l = residual('res1.{}'.format(k), l)
        # 32,c=16

        l = first_block('res2.0', l)
        for k in range(1, self.structure[1]):
            l = residual('res2.{}'.format(k), l)
        # 16,c=32

        l = first_block('res3.0', l)
        for k in range(1, self.structure[2]):
            l = residual('res3.' + str(k), l)
        l = BNReLU('bnlast', l)
        # 8,c=64
        l = GlobalAvgPooling('gap', l)

    logits = FullyConnected('linear', l, out_dim=self.NUM_CLASS, nl=tf.identity)
    print("logits: " + str(logits.shape))

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    print("cost1: " + str(cost.shape))
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')
    print("cost2: " + str(cost.shape))

    wrong = prediction_incorrect(logits, label)
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    # weight decay on every variable matching '.*/W'
    wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
                                      480000, 0.2, True)
    wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W
    self.cost = tf.add_n([cost, wd_cost], name='cost')
def general_xavier_initializer(seed=None):
    """Return a FAN_IN, non-uniform variance-scaling initializer.

    The scaling factor is 2 / (1 + leak**2), where `leak` is read from the
    enclosing scope. `seed` is forwarded to the initializer unchanged.
    """
    gain = 2.0 / (1 + leak**2)
    return layers.variance_scaling_initializer(
        factor=gain,
        mode='FAN_IN',
        seed=seed,
        uniform=False,
        dtype=tf.float32,
    )
import logging logging.basicConfig(format="[%(asctime)s] %(message)s", datefmt="%m-%d %H:%M:%S") import numpy as np import tensorflow as tf from tensorflow.python.ops import rnn_cell from tensorflow.examples.tutorials.mnist import input_data from tensorflow.contrib.layers import variance_scaling_initializer WEIGHT_INITIALIZER = tf.contrib.layers.xavier_initializer() #WEIGHT_INITIALIZER = tf.uniform_unit_scaling_initializer() logger = logging.getLogger(__name__) he_uniform = variance_scaling_initializer(factor=2.0, mode="FAN_IN", uniform=False) data_format = "NCHW" def get_shape(layer): return layer.get_shape().as_list() #def get_shape(layer): # if data_format == "NHWC": # batch, height, width, channel = layer.get_shape().as_list() # elif data_format == "NCHW": # batch, channel, height, width = layer.get_shape().as_list() # else: # raise ValueError("Unknown data_format: %s" % data_format) # return batch, height, width, channel def skew(inputs, scope="skew"): with tf.name_scope(scope):
import tensorflow as tf # (1) scale inputs to zero mean and unit variance # (2) use SELUs def selu(x): with ops.name_scope('elu') as scope: alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x)) # (3) initialize weights with stddev sqrt(1/n) # e.g. use: initializer = layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN') # (4) use this dropout def dropout_selu(x, rate, alpha= -1.7580993408473766, fixedPointMean=0.0, fixedPointVar=1.0, noise_shape=None, seed=None, name=None, training=False): """Dropout to a value with rescaling.""" def dropout_selu_impl(x, rate, alpha, noise_shape, seed, name): keep_prob = 1.0 - rate x = ops.convert_to_tensor(x, name="x") if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1: raise ValueError("keep_prob must be a scalar tensor or a float in the " "range (0, 1], got %g" % keep_prob) keep_prob = ops.convert_to_tensor(keep_prob, dtype=x.dtype, name="keep_prob") keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
def default_init(seed):
    """Return a uniform FAN_AVG variance-scaling initializer with factor 1.0.

    Intended as a replica of tf.glorot_uniform_initializer(seed=seed).
    """
    init_args = {
        'factor': 1.0,
        'mode': "FAN_AVG",
        'uniform': True,
        'seed': seed,
    }
    return layers.variance_scaling_initializer(**init_args)
def _build_network(self):
    """Build the Q-network, its loss and its training op under `self.net_name`.

    Inputs (placeholders stored on `self`):
      _X:          float32 [None, seq_length, data_dim] sequence input.
      _MONEY:      float32 [None, 3] auxiliary input.
      _keep_prob:  dropout keep probability.
      _train_mode: bool switch passed to batch norm as `is_training`.
      _Y:          float32 [None, output_size] regression target.

    A 2-layer LSTM processes `_X`; its per-step outputs go through a dense
    layer, are flattened and projected to 128 units, summed with a 128-unit
    projection of `_MONEY`, and passed through 256- and 512-unit layers to
    `_Qpred`. `_loss` is the mean squared error against `_Y`, minimised with
    Adam at `self.l_rate`.

    Every dense layer is batch-normalised. The moving-average updates that
    batch norm registers in `tf.GraphKeys.UPDATE_OPS` are attached to
    `_train` so that the statistics used when `_train_mode` is False are
    actually maintained (assuming `batch_norm` is the contrib layer with its
    default `updates_collections`). Previously nothing ran them.
    """
    with tf.variable_scope(self.net_name) as net_scope:
        # input place holders
        self._X = tf.placeholder(tf.float32, [None, self.seq_length, self.data_dim], name="input_x")
        self._MONEY = tf.placeholder(tf.float32, [None, 3], name="input_money")
        self._keep_prob = tf.placeholder(tf.float32, name="kp")
        self._train_mode = tf.placeholder(tf.bool, name='train_mode')

        bn_params = {
            'is_training': self._train_mode,
            'decay': 0.9,
            'fused': False
        }

        def dense_bn(x, units, activation_fn=tf.nn.elu):
            """Dense layer with the shared initializer and batch-norm settings."""
            return fully_connected(
                x, units,
                activation_fn=activation_fn,
                weights_initializer=variance_scaling_initializer(dtype=tf.float32),
                normalizer_fn=batch_norm,
                normalizer_params=bn_params)

        def dropout(x):
            """Dropout driven by the `_keep_prob` placeholder."""
            return tf.nn.dropout(x, keep_prob=self._keep_prob)

        multi_cells = rnn.MultiRNNCell(
            [self.lstm_cell(self.data_dim, self._keep_prob) for _ in range(2)],
            state_is_tuple=True)
        outputs, _states = tf.nn.dynamic_rnn(multi_cells, self._X, dtype=tf.float32)

        rnn_output = dropout(dense_bn(outputs, self.data_dim))
        # Flatten the time axis so the sequence becomes one feature vector.
        rnn_output = tf.reshape(rnn_output, [-1, self.seq_length * self.data_dim])

        money = dropout(dense_bn(self._MONEY, 128))

        output = dropout(dense_bn(rnn_output, 128))
        output = dropout(dense_bn(output + money, 256))
        output = dropout(dense_bn(output, 512))

        self._Qpred = dense_bn(output, self.output_size, activation_fn=None)

        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))

        # Restrict to this network's own update ops: pulling in another
        # network's updates would force its placeholders to be fed as well.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       scope=net_scope.original_name_scope)
        with tf.control_dependencies(update_ops):
            self._train = tf.train.AdamOptimizer(learning_rate=self.l_rate).minimize(self._loss)
def _run_backend_specific_init(self):
    """Create the variance-scaling initializer and store it in `self._initializer`.

    Uses mode 'FAN_IN' with factor 1; `uniform` and `seed` come from
    `self.args`, and the dtype from `self._get_dtype()`.
    """
    init_kwargs = dict(
        mode='FAN_IN',
        factor=1,
        uniform=self.args['uniform'],
        seed=self.args['seed'],
        dtype=self._get_dtype(),
    )
    self._initializer = tf_layers.variance_scaling_initializer(**init_kwargs)
def _build_net(self, actor_inputs, msg_inputs, critic_inputs, actions_other, RNN_SIZE, TRAINING, a_size):
    """Build the recurrent actor and the feed-forward critic.

    Actor: two 3x3 convolutions over `actor_inputs`, flattened and
    concatenated with `msg_inputs`, fed as a single batch-1 sequence through
    `RNN_Cell`, then projected to a softmax policy over `a_size` actions and
    a sigmoid message head of width `msg_length`.

    Critic: two 3x3 convolutions over `critic_inputs`, flattened and
    concatenated with an embedding of `actions_other`, followed by two dense
    layers and a linear value head.

    Fixes relative to the previous version:
      * The critic's first dense layer now consumes `hidden_input_critic`.
        It was fed `flat_critic`, so `actions_other` never influenced the
        value estimate.
      * `sequence_length` is taken from the time axis of `rnn_input`. It was
        taken from axis 0, which is always 1 after `expand_dims`, so the RNN
        would stop after the first step of any longer sequence.

    Args:
      actor_inputs, critic_inputs: inputs for the NHWC convolutions.
      msg_inputs: concatenated to the flattened actor features on axis 1.
      actions_other: input embedded to `ACTION_REPR_SIZE` for the critic.
      RNN_SIZE: width of RNN output and critic layers; conv width is
        RNN_SIZE // 4.
      TRAINING: unused in this method.
      a_size: number of discrete actions.

    Returns:
      (policy, P_msg, value, state_out_actor, state_init_actor), where
      `state_init_actor` is a NumPy zero array of shape
      (1, HIDDEN_STATE_SIZE). The state placeholder is stored as
      `self.state_in_actor`.
    """
    w_init = layers.variance_scaling_initializer()

    with tf.variable_scope('actor'):
        conv1_actor = layers.conv2d(inputs=actor_inputs, padding="SAME", num_outputs=RNN_SIZE // 4,
                                    kernel_size=[3, 3], stride=1, data_format="NHWC",
                                    weights_initializer=w_init, activation_fn=tf.nn.relu)
        conv1a_actor = layers.conv2d(inputs=conv1_actor, padding="SAME", num_outputs=RNN_SIZE // 4,
                                     kernel_size=[3, 3], stride=1, data_format="NHWC",
                                     weights_initializer=w_init, activation_fn=tf.nn.relu)
        flat = layers.flatten(conv1a_actor)

        state_init_actor = np.zeros((1, HIDDEN_STATE_SIZE))

        # rnn_input: a concatenation of the processed observations and message
        # input, with a leading batch axis of size 1 (rows become time steps).
        rnn_input = tf.expand_dims(tf.concat([flat, msg_inputs], axis=1), [0])
        # Length of the single sequence = size of the time axis (axis 1).
        seq_length = tf.shape(rnn_input)[1:2]
        self.state_in_actor = tf.placeholder(shape=(1, HIDDEN_STATE_SIZE), dtype=tf.float32)

        rnn_cell = RNN_Cell(RNN_SIZE, self.num_agents, STATE_REPR_SIZE, a_size, msg_length, HIDDEN_STATE_SIZE)
        rnn_output, state_out_actor = tf.nn.dynamic_rnn(
            rnn_cell, rnn_input, initial_state=self.state_in_actor,
            sequence_length=seq_length, dtype=tf.float32, time_major=False)
        rnn_out = tf.reshape(rnn_output, [-1, RNN_SIZE])

        policy = layers.fully_connected(inputs=rnn_out, num_outputs=a_size, activation_fn=tf.nn.softmax)
        P_msg = layers.fully_connected(inputs=rnn_out, num_outputs=msg_length, activation_fn=tf.nn.sigmoid)

    with tf.variable_scope('critic'):
        conv1_critic = layers.conv2d(inputs=critic_inputs, padding="SAME", num_outputs=RNN_SIZE // 4,
                                     kernel_size=[3, 3], stride=1, data_format="NHWC",
                                     weights_initializer=w_init, activation_fn=tf.nn.relu)
        conv1a_critic = layers.conv2d(inputs=conv1_critic, padding="SAME", num_outputs=RNN_SIZE // 4,
                                      kernel_size=[3, 3], stride=1, data_format="NHWC",
                                      weights_initializer=w_init, activation_fn=tf.nn.relu)
        flat_critic = tf.nn.relu(layers.flatten(conv1a_critic))

        action_layer = layers.fully_connected(inputs=actions_other, num_outputs=ACTION_REPR_SIZE)
        hidden_input_critic = tf.concat([flat_critic, action_layer], 1)

        # Condition the value on the other agents' actions as well.
        h1_critic = layers.fully_connected(inputs=hidden_input_critic, num_outputs=RNN_SIZE)
        h2_critic = layers.fully_connected(inputs=h1_critic, num_outputs=RNN_SIZE)
        value = layers.fully_connected(
            inputs=h2_critic, num_outputs=1,
            weights_initializer=normalized_columns_initializer(1.0),
            biases_initializer=None, activation_fn=None)

    return policy, P_msg, value, state_out_actor, state_init_actor
def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers):
    """
    Creates Discrete Control Actor-Critic model.

    The hidden representation is built from up to two streams that are
    concatenated on axis 1 when both exist: one visual encoder per camera
    (when `brain.number_observations > 0`) and one state encoder (when
    `brain.state_space_size > 0`, continuous or discrete according to
    `brain.state_space_type`). A bias-free linear policy head, a sampled
    action, a bias-free linear value head and the PPO optimizer are created
    on top of it.

    :param lr: forwarded to `create_ppo_optimizer`.
    :param brain: object describing observations, state space and action space.
    :param h_size: Hidden layer size
    :param epsilon: forwarded to `create_ppo_optimizer`.
    :param beta: forwarded to `create_ppo_optimizer`.
    :param max_step: forwarded to `create_ppo_optimizer`.
    :param normalize: stored as `self.normalize`.
    :param num_layers: forwarded to the encoder constructors.
    :raises Exception: when the brain has neither observations nor states.
    """
    super(DiscreteControlModel, self).__init__()
    self.create_global_steps()
    self.create_reward_encoder()
    self.normalize = normalize

    hidden_visual = None
    if brain.number_observations > 0:
        visual_streams = []
        for cam in range(brain.number_observations):
            resolution = brain.camera_resolutions[cam]
            height_size, width_size = resolution['height'], resolution['width']
            bw = resolution['blackAndWhite']
            encoded = self.create_visual_encoder(
                height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)
            visual_streams.append(encoded[0])
        hidden_visual = tf.concat(visual_streams, axis=1)

    hidden_state = None
    if brain.state_space_size > 0:
        s_size = brain.state_space_size
        if brain.state_space_type == "continuous":
            make_encoder = self.create_continuous_state_encoder
        else:
            make_encoder = self.create_discrete_state_encoder
        hidden_state = make_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]

    # Use whichever streams exist; concatenate only when there are two.
    streams = [s for s in (hidden_visual, hidden_state) if s is not None]
    if not streams:
        raise Exception("No valid network configuration possible. "
                        "There are no states or observations in this brain")
    if len(streams) == 1:
        hidden = streams[0]
    else:
        hidden = tf.concat(streams, axis=1)

    a_size = brain.action_space_size

    self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')

    # Policy logits, their softmax, and one action sampled from the logits.
    self.policy = tf.layers.dense(
        hidden, a_size, activation=None, use_bias=False,
        kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
    self.probs = tf.nn.softmax(self.policy, name="action_probs")
    self.output = tf.identity(tf.multinomial(self.policy, 1), name="action")

    value_head = tf.layers.dense(
        hidden, 1, activation=None, use_bias=False,
        kernel_initializer=c_layers.variance_scaling_initializer(factor=1.0))
    self.value = tf.identity(value_head, name="value_estimate")

    # The 1e-10 offset keeps the log finite for zero probabilities.
    self.entropy = -tf.reduce_sum(self.probs * tf.log(self.probs + 1e-10), axis=1)

    self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
    self.selected_actions = c_layers.one_hot_encoding(self.action_holder, a_size)
    self.old_probs = tf.placeholder(shape=[None, a_size], dtype=tf.float32,
                                    name='old_probabilities')

    # Probability of the taken action under the current and the old policy.
    self.responsible_probs = tf.reduce_sum(self.probs * self.selected_actions, axis=1)
    self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)

    self.create_ppo_optimizer(self.responsible_probs, self.old_responsible_probs,
                              self.value, self.entropy, beta, epsilon, lr, max_step)
import functools as ft
import tensorflow as tf
import tensorflow.contrib.layers as tflayers

# Initializers
sigma = 1.0
weight_initializer = tflayers.variance_scaling_initializer(
    mode="FAN_AVG", uniform=True, factor=sigma)
bias_initializer = tf.zeros_initializer()


def mlp_input_layer(n_inputs):
    """Return a float32 placeholder of shape [None, n_inputs]."""
    input_shape = [None, n_inputs]
    return tf.placeholder(dtype=tf.float32, shape=input_shape)


def hidden_layer(inputs, n_input, n_neurons):
    """Return relu(inputs @ W + b) with freshly created W and b.

    W has shape [n_input, n_neurons] and comes from `weight_initializer`;
    b has shape [n_neurons] and comes from `bias_initializer`.
    """
    weights = tf.Variable(weight_initializer([n_input, n_neurons]))
    biases = tf.Variable(bias_initializer([n_neurons]))
    pre_activation = tf.add(tf.matmul(inputs, weights), biases)
    return tf.nn.relu(pre_activation)


def mlp_output_layer(inputs, n_inputs, n_output):
    """Return the transpose of the linear projection inputs @ W + b.

    W has shape [n_inputs, n_output] and b has shape [n_output]; no
    activation is applied.
    """
    weights = tf.Variable(weight_initializer([n_inputs, n_output]))
    biases = tf.Variable(bias_initializer([n_output]))
    projection = tf.add(tf.matmul(inputs, weights), biases)
    return tf.transpose(projection)
def scaled_init(scale):
    """Return a variance-scaling initializer built with `scale` as its first argument."""
    initializer = c_layers.variance_scaling_initializer(scale)
    return initializer
def vgg_backbone(image, qw=1):
    """Build the quantised VGG-style backbone and return its 1000-way logits.

    All `Conv2DQuant` layers default to identity non-linearity, no bias,
    FAN_IN variance-scaling initialisation, the data format currently
    configured for `Conv2D`, and `nbit=qw`. The first convolution is created
    with `is_quant=False`.

    Layout: conv1 + pool1, then three stages of three 3x3 convolutions
    (256, 512, 512 channels), each preceded by a BNReLUQuant layer and
    followed by 2x2 max-pooling, then the 'fc5'/'fc6' convolutions and the
    'fc7' fully connected layer.
    """
    conv_defaults = dict(
        nl=tf.identity,
        use_bias=False,
        W_init=variance_scaling_initializer(mode='FAN_IN'),
        data_format=get_arg_scope()['Conv2D']['data_format'],
        nbit=qw,
    )
    # (stage number, channels); original size notes: 56 -> 28 -> 14 -> 7.
    stages = ((2, 256), (3, 512), (4, 512))

    with argscope(Conv2DQuant, **conv_defaults):
        net = LinearWrap(image)
        net = net.Conv2DQuant('conv1', 96, 7, stride=2, nl=tf.nn.relu, is_quant=False)
        net = net.MaxPooling('pool1', shape=2, stride=2, padding='VALID')

        for stage, channels in stages:
            net = net.BNReLUQuant('bnquant{}_0'.format(stage))
            net = net.Conv2DQuant('conv{}_1'.format(stage), channels, 3, nl=getBNReLUQuant)
            net = net.Conv2DQuant('conv{}_2'.format(stage), channels, 3, nl=getBNReLUQuant)
            # The last conv of a stage has no non-linearity of its own; the
            # next BNReLUQuant layer follows the pooling.
            net = net.Conv2DQuant('conv{}_3'.format(stage), channels, 3)
            net = net.MaxPooling('pool{}'.format(stage), shape=2, stride=2, padding='VALID')

        net = net.BNReLUQuant('bnquant5')
        net = net.Conv2DQuant('fc5', 4096, 7, nl=getfcBNReLUQuant, padding='VALID', use_bias=True)
        net = net.Conv2DQuant('fc6', 4096, 1, nl=getfcBNReLU, padding='VALID', use_bias=True)
        net = net.FullyConnected('fc7', out_dim=1000, nl=tf.identity,
                                 W_init=variance_scaling_initializer(mode='FAN_IN'))
        # Calling the wrapper unwraps the underlying tensor.
        logits = net()
    return logits
from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import array_ops from tensorflow.python.layers import utils import tensorflow as tf def selu(x): with ops.name_scope('elu') as scope: alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 return scale * tf.where(x >= 0.0, x, alpha * tf.nn.elu(x)) initializer = layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN') # (4) use this dropout def dropout_selu(x, rate, alpha=-1.7580993408473766, fixedPointMean=0.0, fixedPointVar=1.0, noise_shape=None, seed=None, name=None, training=False): """Dropout to a value with rescaling.""" def dropout_selu_impl(x, rate, alpha, noise_shape, seed, name): keep_prob = 1.0 - rate
def _build_graph(self, inputs):
    """Build a residual classifier whose blocks are gated by an SE-based strict identity.

    `inputs` is unpacked into `(image, label)`. The image is divided by 128
    and transposed with perm [0, 3, 1, 2] to match the 'NCHW' data format of
    the layers. In every block the residual branch is multiplied by
    `strict_identity_se(se_module(...), self.EPSILON)` before the shortcut
    is added; the number of gate entries that are exactly zero is summed
    over all blocks and summarised as 'discarded_cnt'.

    Sets `self.cost` to cross-entropy plus a decayed L2 penalty on '.*/W'.
    """
    image, label = inputs
    image = image / 128.0
    assert tf.test.is_gpu_available()
    image = tf.transpose(image, [0, 3, 1, 2])

    # Kept for their graph side effects (named constant / scalar variable
    # with summary); the Python values are not used below.
    tf.constant(self.n * 3+2, tf.float32, name="all_cnt")
    get_scalar_var('epsilon', self.EPSILON, summary=True)

    discarded_per_block = []

    def residual(name, l, increase_dim=False, first=False):
        in_channel = l.get_shape().as_list()[1]
        if increase_dim:
            out_channel, stride1 = in_channel * 2, 2
            # Pool and zero-pad the shortcut to the doubled channel count.
            short_cut = AvgPooling('pool', l, 2)
            short_cut = tf.pad(
                short_cut,
                [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
        else:
            out_channel, stride1 = in_channel, 1
            short_cut = l

        with tf.variable_scope(name):
            # Pre-activation (skipped for the first block) and two convs.
            pre_act = l if first else BNReLU(l)
            c1 = Conv2D('conv1', pre_act, out_channel, stride=stride1, nl=BNReLU)
            branch = Conv2D('conv2', c1, out_channel)

            attention = se_module(branch, out_channel, 16)
            identity_w = strict_identity_se(attention, self.EPSILON)

            # apply strict identity
            out = identity_w * branch + short_cut

            # monitor is_discarded: 1 where the gate is exactly zero
            zeros = tf.zeros_like(identity_w)
            ones = tf.ones_like(identity_w)
            is_discarded = tf.where(tf.equal(identity_w, zeros), ones, zeros)
            add_moving_summary(tf.reduce_sum(is_discarded))
            discarded_per_block.append(tf.reduce_sum(is_discarded))
            return out

    # Original size notes per stage: res1 -> 32,c=16; res2 -> 16,c=32;
    # res3 -> 8,c=64.
    stage_heads = (
        ('res1', {'first': True}),
        ('res2', {'increase_dim': True}),
        ('res3', {'increase_dim': True}),
    )

    with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
            argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3,
                     W_init=variance_scaling_initializer(mode='FAN_OUT')):
        l = Conv2D('conv0', image, 16, nl=BNReLU)
        for prefix, head_args in stage_heads:
            l = residual(prefix + '.0', l, **head_args)
            for k in range(1, self.n):
                l = residual('{}.{}'.format(prefix, k), l)
        l = BNReLU('bnlast', l)
        l = GlobalAvgPooling('gap', l)

    logits = FullyConnected('linear', l, out_dim=self.NUM_CLASS, nl = tf.identity)

    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(xent, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label)
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    discarded_cnt = tf.add_n(discarded_per_block, name="discarded_cnt")
    add_moving_summary(discarded_cnt)

    # weight decay on every variable matching '.*/W'
    wd_w = tf.train.exponential_decay(
        0.0002, get_global_step_var(), 480000, 0.2, True)
    wd_cost = tf.multiply(
        wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W

    self.cost = tf.add_n([cost, wd_cost], name='cost')
def _build_graph(self, inputs):
    """Build the pre-activation residual classifier and set ``self.cost``.

    The graph is one 3x3 stem conv, three stages of ``self.n`` residual units
    each (stages two and three start with a width-doubling, stride-2 unit),
    a final BNReLU, global average pooling and a 10-way linear layer. The
    softmax over the logits is exposed as the tensor named ``output``.

    :param inputs: pair ``(image, label)``. The image is divided by 128 and
        transposed with ``[0, 3, 1, 2]`` before entering the network.
    """
    image, label = inputs
    image = tf.transpose(image / 128.0, [0, 3, 1, 2])

    def residual(name, l, increase_dim=False, first=False):
        """One two-conv residual unit built under variable scope ``name``."""
        in_channel = l.get_shape().as_list()[1]
        # A widening unit doubles the channels and strides by 2.
        out_channel, stride1 = (in_channel * 2, 2) if increase_dim else (in_channel, 1)

        with tf.variable_scope(name):
            pre_act = l if first else BNReLU(l)
            branch = Conv2D('conv1', pre_act, out_channel, stride=stride1, nl=BNReLU)
            branch = Conv2D('conv2', branch, out_channel)
            if increase_dim:
                # Shrink the identity path spatially and zero-pad its
                # channel axis so it can be added to the wider branch.
                pad = in_channel // 2
                l = AvgPooling('pool', l, 2)
                l = tf.pad(l, [[0, 0], [pad, pad], [0, 0], [0, 0]])
            return branch + l

    with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
            argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3,
                     W_init=variance_scaling_initializer(mode='FAN_OUT')):
        l = Conv2D('conv0', image, 16, nl=BNReLU)
        # Stage comments from the original: after stage 1 -> 32,c=16;
        # after stage 2 -> 16,c=32; after stage 3 -> 8,c=64.
        for stage in (1, 2, 3):
            l = residual('res{}.0'.format(stage), l,
                         increase_dim=(stage != 1), first=(stage == 1))
            for k in range(1, self.n):
                l = residual('res{}.{}'.format(stage, k), l)
        l = BNReLU('bnlast', l)
        l = GlobalAvgPooling('gap', l)

    logits = FullyConnected('linear', l, out_dim=10, nl=tf.identity)
    # Created for its name; the Python handle is not needed afterwards.
    tf.nn.softmax(logits, name='output')

    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(xent, name='cross_entropy_loss')

    # monitor training error
    wrong = prediction_incorrect(logits, label)
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    # L2 penalty on every variable matching '.*/W'; its coefficient follows
    # a staircase exponential schedule on the global step.
    wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
                                      480000, 0.2, True)
    wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram']))   # monitor W
    self.cost = tf.add_n([cost, wd_cost], name='cost')
def _build_model(self, inputs):
    """Build the convolutional classifier and return its two-way logits.

    Stages, in creation order: two conv-BN-ReLU layers ('Layer1', 'Layer2'),
    five identity residual units ('Layer3'-'Layer7'), four down-sampling
    residual units with a strided 1x1 conv-BN shortcut ('Layer8'-'Layer11'),
    a two-conv stage ('Layer12'), an 'ensemble' stage of three parallel
    conv-BN branches (ReLU, sigmoid, tanh) that are mean-pooled over the
    spatial axes, and a 'combine' stage mapping each pooled branch to two
    outputs that a final two-output layer ('ip') fuses.

    :param inputs: input batch; transposed with ``[0, 3, 1, 2]`` when
        ``self.data_format == 'NCHW'`` and cast to float32 in either case.
    :return: output of the 'ip' layer (also stored in ``self.outputs``).
    """
    self.inputs = inputs
    channels_first = self.data_format == 'NCHW'
    reduction_axis = [2, 3] if channels_first else [1, 2]
    _inputs = tf.transpose(inputs, [0, 3, 1, 2]) if channels_first else inputs
    _inputs = tf.cast(_inputs, tf.float32)

    def conv_bn(x, activation=None, **conv_kwargs):
        """conv2d -> batch_norm, optionally followed by ``activation``."""
        normed = layers.batch_norm(layers.conv2d(x, **conv_kwargs))
        return normed if activation is None else activation(normed)

    def two_conv_branch(x, **conv_kwargs):
        """conv-BN-ReLU followed by conv-BN (no trailing activation)."""
        hidden = conv_bn(x, tf.nn.relu, **conv_kwargs)
        return conv_bn(hidden, **conv_kwargs)

    def linear_head(x, name):
        """Two-output linear layer with N(0, 0.01) weights and zero bias."""
        return layers.fully_connected(
            x, num_outputs=2,
            activation_fn=None, normalizer_fn=None,
            weights_initializer=tf.random_normal_initializer(mean=0., stddev=0.01),
            biases_initializer=tf.constant_initializer(0.), scope=name)

    conv_defaults = dict(
        num_outputs=16, kernel_size=3, stride=1, padding='SAME',
        data_format=self.data_format,
        activation_fn=None,
        weights_initializer=layers.variance_scaling_initializer(),
        weights_regularizer=layers.l2_regularizer(2e-4),
        biases_initializer=tf.constant_initializer(0.2),
        biases_regularizer=None)
    bn_defaults = dict(
        decay=0.9, center=True, scale=True,
        updates_collections=None, is_training=self.is_training,
        fused=True, data_format=self.data_format)
    pool_defaults = dict(
        kernel_size=[3, 3], stride=[2, 2], padding='SAME',
        data_format=self.data_format)

    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; 'ensemble' and 'combine' are assumed to be sibling scopes
    # and 'ip' is assumed to sit outside both -- confirm against a checkpoint.
    with arg_scope([layers.conv2d], **conv_defaults), \
            arg_scope([layers.batch_norm], **bn_defaults), \
            arg_scope([layers.avg_pool2d], **pool_defaults):
        with tf.variable_scope('Layer1'):
            res = conv_bn(_inputs, tf.nn.relu, num_outputs=64, kernel_size=3)
        with tf.variable_scope('Layer2'):
            res = conv_bn(res, tf.nn.relu)

        # Identity residual units at the default width.
        for index in range(3, 8):
            with tf.variable_scope('Layer{}'.format(index)):
                res = tf.add(res, two_conv_branch(res))

        # Down-sampling units: the shortcut is a strided 1x1 conv-BN and the
        # branch is average-pooled. Layer8 keeps the arg-scope default width.
        down_widths = ((8, {}),
                       (9, {'num_outputs': 64}),
                       (10, {'num_outputs': 128}),
                       (11, {'num_outputs': 256}))
        for index, width in down_widths:
            with tf.variable_scope('Layer{}'.format(index)):
                shortcut = conv_bn(res, kernel_size=1, stride=2, **width)
                pooled = layers.avg_pool2d(two_conv_branch(res, **width))
                res = tf.add(shortcut, pooled)

        with tf.variable_scope('Layer12'):
            hidden = conv_bn(res, tf.nn.relu, num_outputs=512)
            res = conv_bn(hidden, tf.nn.relu, num_outputs=512)

        with tf.variable_scope('ensemble'):
            branches = [conv_bn(res, act, num_outputs=512)
                        for act in (tf.nn.relu, tf.nn.sigmoid, tf.nn.tanh)]
            pooled_branches = [
                tf.reduce_mean(branch, reduction_axis, keep_dims=True)
                for branch in branches]

        with tf.variable_scope('combine'):
            heads = []
            for index, pooled in enumerate(pooled_branches, 1):
                heads.append(
                    linear_head(layers.flatten(pooled), 'ip{}'.format(index)))
            combine = tf.concat(heads, 1)

    ip = linear_head(combine, 'ip')
    self.outputs = ip
    return self.outputs
def get_q_values_op(self, state, seq_len, scope, reuse=False):
    """
    Returns Q values for all actions

    The input sequences go through a stack of ``self.config.n_layers_rnn``
    GRU cells; the final (concatenated) recurrent state is optionally passed
    through one ReLU hidden layer of ``self.config.n_hidden_fc`` units and
    then through a linear layer with one output per action.

    Args:
        state: (tf tensor) input sequences handed to ``tf.nn.dynamic_rnn``
               (batch-major by default, i.e. first two axes are
               batch_size and max_timesteps)
        seq_len: (tf tensor) forwarded to ``dynamic_rnn`` as
               ``sequence_length``
        scope: (string) scope name, that specifies if target network or not
        reuse: (bool) reuse of variables in the scope

    Returns:
        out: (tf tensor) of shape = (batch_size, num_actions)
    """
    num_actions = self.train_env.action_space.n
    n_hidden_fc = self.config.n_hidden_fc

    cells = [
        tf.contrib.rnn.GRUCell(self.config.n_hidden_rnn, reuse=reuse)
        for _ in range(self.config.n_layers_rnn)
    ]
    multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=False)

    with tf.variable_scope(scope):
        # Only the final state is used; the per-step outputs are discarded.
        _, last_states = tf.nn.dynamic_rnn(
            multi_layer_cell, state,
            sequence_length=seq_len,
            dtype=tf.float32
        )

        # ``layers.fully_connected`` refuses ``reuse=True`` when no scope is
        # given, so each layer is named explicitly. The names reproduce what
        # the default naming generated before ('fully_connected' for the
        # first layer created, 'fully_connected_1' for the second), which
        # keeps existing checkpoints loadable.
        logits_input = last_states
        logits_scope = 'fully_connected'
        if n_hidden_fc:
            logits_input = layers.fully_connected(
                inputs=last_states,
                num_outputs=n_hidden_fc,
                activation_fn=tf.nn.relu,
                reuse=reuse,
                scope='fully_connected',
                weights_initializer=layers.variance_scaling_initializer()
            )
            logits_scope = 'fully_connected_1'

        logits = layers.fully_connected(
            inputs=logits_input,
            num_outputs=num_actions,
            activation_fn=None,
            reuse=reuse,
            scope=logits_scope,
            weights_initializer=layers.variance_scaling_initializer()
        )

    return logits
def _build_graph(self, inputs):
    """Build the Winograd-convolution residual classifier and set ``self.cost``.

    The visible network is written out by hand: a 7x7 stem conv, then eight
    residual blocks (res2a/b .. res5a/b) whose main branch is
    BatchNorm -> WinogradImTrans -> WinogradConv applied twice and closed by
    a BatchNorm, followed by ReLU, global average pooling, dropout and a
    1000-way linear layer. The "a" blocks add a 1x1 conv + BatchNorm on the
    other path before the sum.

    The helper functions ``shortcut``/``basicblock``/``bottleneck``/``layer``
    and the ``cfg`` lookup are defined first but are not called by the
    hand-written graph below.

    :param inputs: pair ``(image, label)``.
    """
    image, label = inputs
    image = tf.cast(image, tf.float32) * (1.0 / 255)

    # Wrong mean/std are used for compatibility with pre-trained models.
    # Should actually add a RGB-BGR conversion here.
    image_mean = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
    image_std = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)
    image = (image - image_mean) / image_std
    if self.data_format == 'NCHW':
        image = tf.transpose(image, [0, 3, 1, 2])

    def shortcut(l, n_in, n_out, stride):
        # Identity when the widths match, otherwise a 1x1 projection conv.
        if n_in != n_out:
            return Conv2D('convshortcut', l, n_out, 1, stride=stride)
        else:
            return l

    def basicblock(l, ch_out, stride, preact):
        # ``preact`` decides whether BNReLU is applied before the convs and
        # whether the shortcut sees the tensor before or after it.
        ch_in = l.get_shape().as_list()[1]
        if preact == 'both_preact':
            l = BNReLU('preact', l)
            input = l
        elif preact != 'no_preact':
            input = l
            l = BNReLU('preact', l)
        else:
            input = l
        l = Conv2D('conv1', l, ch_out, 3, stride=stride, nl=BNReLU)
        l = Conv2D('conv2', l, ch_out, 3)
        return l + shortcut(input, ch_in, ch_out, stride)

    def bottleneck(l, ch_out, stride, preact):
        # Same pre-activation handling as ``basicblock``; 1x1 -> 3x3 -> 1x1
        # convs with the last one widening to ``ch_out * 4``.
        ch_in = l.get_shape().as_list()[1]
        if preact == 'both_preact':
            l = BNReLU('preact', l)
            input = l
        elif preact != 'no_preact':
            input = l
            l = BNReLU('preact', l)
        else:
            input = l
        l = Conv2D('conv1', l, ch_out, 1, nl=BNReLU)
        l = Conv2D('conv2', l, ch_out, 3, stride=stride, nl=BNReLU)
        l = Conv2D('conv3', l, ch_out * 4, 1)
        return l + shortcut(input, ch_in, ch_out * 4, stride)

    def layer(layername, l, block_func, features, count, stride, first=False):
        # ``count`` blocks under ``layername``; only block0 gets ``stride``.
        with tf.variable_scope(layername):
            with tf.variable_scope('block0'):
                l = block_func(l, features, stride,
                               'no_preact' if first else 'both_preact')
            for i in range(1, count):
                with tf.variable_scope('block{}'.format(i)):
                    l = block_func(l, features, 1, 'default')
            return l

    cfg = {
        18: ([2, 2, 2, 2], basicblock),
        34: ([3, 4, 6, 3], basicblock),
        50: ([3, 4, 6, 3], bottleneck),
        101: ([3, 4, 23, 3], bottleneck)
    }
    # ``defs`` and ``block_func`` are not used by the graph below; the lookup
    # still raises KeyError when DEPTH is not one of the keys above.
    defs, block_func = cfg[DEPTH]

    # NOTE(review): ``mask_dict`` and ``use_mask`` are read from an enclosing
    # scope that is not visible here.
    with argscope(Conv2D, nl=tf.identity, use_bias=False,
                  W_init=variance_scaling_initializer(mode='FAN_OUT')), \
            argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format):
        l = Conv2D('conv1', image, 64, 7, stride=2, nl=tf.identity)
        # l = BatchNorm('conv1_bn', l)
        l = MaxPooling('pool1', l, 3, stride=2, padding='SAME')

        # ---- res2a: 64 -> 64, with 1x1 conv on the other path ----
        l_bra = BatchNorm('res2a_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_2a_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W2a_2a', l_bra, 64, 64,
            mask=mask_dict['Winograd_W2a_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res2a_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_2a_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W2a_2b', l_bra, 64, 64,
            mask=mask_dict['Winograd_W2a_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res2a_bn2c', l_bra)

        # l = tf.nn.relu(l)
        l = BNReLU('res2a_1_relu', l)
        l = Conv2D('res2a_1', l, 64, 1, nl=tf.identity)
        l = BatchNorm('res2a_bn1', l)
        l = l + l_bra

        # ---- res2b: 64 -> 64 ----
        l_bra = BatchNorm('res2b_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_2b_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W2b_2a', l_bra, 64, 64,
            mask=mask_dict['Winograd_W2b_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res2b_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_2b_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W2b_2b', l_bra, 64, 64,
            mask=mask_dict['Winograd_W2b_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res2b_bn2c', l_bra)

        l = tf.nn.relu(l)
        l = l + l_bra
        # Spatial down-sampling between stages is done by max pooling.
        l = MaxPooling('pool2', l, 3, stride=2, padding='SAME')

        # ---- res3a: 64 -> 128, with 1x1 conv on the other path ----
        l_bra = BatchNorm('res3a_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_3a_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W3a_2a', l_bra, 64, 128,
            mask=mask_dict['Winograd_W3a_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res3a_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_3a_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W3a_2b', l_bra, 128, 128,
            mask=mask_dict['Winograd_W3a_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res3a_bn2c', l_bra)

        l = tf.nn.relu(l)
        l = Conv2D('res3a_1', l, 128, 1, nl=tf.identity)
        l = BatchNorm('res3a_bn1', l)
        l = l + l_bra

        # ---- res3b: 128 -> 128 ----
        l_bra = BatchNorm('res3b_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_3b_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W3b_2a', l_bra, 128, 128,
            mask=mask_dict['Winograd_W3b_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res3b_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_3b_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W3b_2b', l_bra, 128, 128,
            mask=mask_dict['Winograd_W3b_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res3b_bn2c', l_bra)

        l = tf.nn.relu(l)
        l = l + l_bra
        l = MaxPooling('pool3', l, 3, stride=2, padding='SAME')

        # ---- res4a: 128 -> 256, with 1x1 conv on the other path ----
        l_bra = BatchNorm('res4a_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_4a_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W4a_2a', l_bra, 128, 256,
            mask=mask_dict['Winograd_W4a_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res4a_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_4a_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W4a_2b', l_bra, 256, 256,
            mask=mask_dict['Winograd_W4a_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res4a_bn2c', l_bra)

        l = tf.nn.relu(l)
        l = Conv2D('res4a_1', l, 256, 1, nl=tf.identity)
        l = BatchNorm('res4a_bn1', l)
        l = l + l_bra

        # ---- res4b: 256 -> 256 ----
        l_bra = BatchNorm('res4b_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_4b_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W4b_2a', l_bra, 256, 256,
            mask=mask_dict['Winograd_W4b_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res4b_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_4b_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W4b_2b', l_bra, 256, 256,
            mask=mask_dict['Winograd_W4b_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res4b_bn2c', l_bra)

        l = tf.nn.relu(l)
        l = l + l_bra
        # l = MaxPooling('pool4', l, 3, stride=2, padding='SAME')

        # ---- res5a: 256 -> 512, with 1x1 conv on the other path ----
        l_bra = BatchNorm('res5a_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_5a_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W5a_2a', l_bra, 256, 512,
            mask=mask_dict['Winograd_W5a_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res5a_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_5a_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W5a_2b', l_bra, 512, 512,
            mask=mask_dict['Winograd_W5a_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res5a_bn2c', l_bra)

        l = tf.nn.relu(l)
        l = Conv2D('res5a_1', l, 512, 1, nl=tf.identity)
        l = BatchNorm('res5a_bn1', l)
        l = l + l_bra

        # ---- res5b: 512 -> 512 ----
        l_bra = BatchNorm('res5b_bn2a', l)
        l_bra = WinogradImTrans('WinogradImTrans_5b_2a', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W5b_2a', l_bra, 512, 512,
            mask=mask_dict['Winograd_W5b_2a/W'] if use_mask else None)
        l_bra = BatchNorm('res5b_bn2b', l_bra)
        l_bra = WinogradImTrans('WinogradImTrans_5b_2b', l_bra, tf.nn.relu)
        l_bra = WinogradConv(
            'Winograd_W5b_2b', l_bra, 512, 512,
            mask=mask_dict['Winograd_W5b_2b/W'] if use_mask else None)
        l_bra = BatchNorm('res5b_bn2c', l_bra)

        l = tf.nn.relu(l)
        l = l + l_bra
        l = tf.nn.relu(l)
        l = GlobalAvgPooling('gap', l)
        l = Dropout('drop_fc', l, 0.85)
        # l = Dropout('drop_fc', l, 0.7)
        logits = FullyConnected('linear', l, 1000, nl=tf.identity)

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    loss = tf.reduce_mean(loss, name='xentropy-loss')

    # Top-1 and top-5 training error are tracked as moving summaries.
    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

    wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

    # L2 regularisation (1e-4) on every variable whose name matches '.*/W'.
    wd_cost = regularize_cost('.*/W', l2_regularizer(1e-4), name='l2_regularize_loss')
    add_moving_summary(loss, wd_cost)
    self.cost = tf.add_n([loss, wd_cost], name='cost')
DDPG_CFG.online_policy_net_var_scope = 'online_policy' DDPG_CFG.target_policy_net_var_scope = 'target_policy' # -- 1 input norm layers -- DDPG_CFG.actor_input_normalizer = batch_norm DDPG_CFG.actor_input_norm_params = { 'is_training': is_training, 'data_format': 'NHWC', 'updates_collections': None, 'scale': False, # not gamma. let next fc layer to scale. 'center': True # beta. } # -- 2 fc layers -- DDPG_CFG.actor_n_fc_units = [400, 300] DDPG_CFG.actor_fc_activations = [tf.nn.elu] * 2 DDPG_CFG.actor_fc_initializers = [variance_scaling_initializer()] * 2 DDPG_CFG.actor_fc_regularizers = [None] * 2 DDPG_CFG.actor_fc_normalizers = [batch_norm] * 2 DDPG_CFG.actor_fc_norm_params = [{ 'is_training': is_training, 'data_format': 'NHWC', 'updates_collections': None, 'scale': False, 'center': True }] * 2 # -- 1 output layer -- #TODO try actor no BN.use l2 reg on weights only. DDPG_CFG.actor_output_layer_normalizers = batch_norm DDPG_CFG.actor_output_layer_norm_params = { 'is_training': is_training,
def create_cc_actor_critic(self, h_size, num_layers):
    """
    Creates Continuous control actor-critic model.

    Two observation streams are created, one for the policy and one for the
    value estimate. The policy head outputs a mean per action dimension and
    shares a single learned log-variance vector; actions are sampled as
    ``mean + sqrt(variance) * epsilon`` where ``epsilon`` is fed through a
    placeholder. The method only sets attributes on ``self`` (placeholders,
    ``output``, ``value``, log-probabilities, ``entropy``, and the recurrent
    memory tensors when ``self.use_recurrent`` is set); it returns nothing.

    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    """
    hidden_streams = self.create_observation_streams(2, h_size, num_layers)

    if self.use_recurrent:
        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        # The memory vector is split in two: the first half feeds the policy
        # encoder, the second half the value encoder.
        _half_point = int(self.m_size / 2)
        hidden_policy, memory_policy_out = self.create_recurrent_encoder(
            hidden_streams[0], self.memory_in[:, :_half_point], self.sequence_length,
            name='lstm_policy')

        hidden_value, memory_value_out = self.create_recurrent_encoder(
            hidden_streams[1], self.memory_in[:, _half_point:], self.sequence_length,
            name='lstm_value')
        # Both updated halves are concatenated back into one memory tensor.
        self.memory_out = tf.concat([memory_policy_out, memory_value_out], axis=1,
                                    name='recurrent_out')
    else:
        hidden_policy = hidden_streams[0]
        hidden_value = hidden_streams[1]

    # Mean of the action distribution, one unit per action dimension.
    mu = tf.layers.dense(
        hidden_policy, self.act_size[0], activation=None,
        kernel_initializer=c_layers.variance_scaling_initializer(
            factor=0.01))

    # Log-variance is a free variable (initialised to zero), not a function
    # of the observation.
    log_sigma_sq = tf.get_variable("log_sigma_squared", [self.act_size[0]],
                                   dtype=tf.float32,
                                   initializer=tf.zeros_initializer())

    sigma_sq = tf.exp(log_sigma_sq)

    # Noise is supplied by the caller through this placeholder.
    self.epsilon = tf.placeholder(shape=[None, self.act_size[0]], dtype=tf.float32,
                                  name='epsilon')
    # Clip and scale output to ensure actions are always within [-1, 1] range.
    self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
    output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(output_post, name='action')
    self.selected_actions = tf.stop_gradient(output_post)

    # Compute probability of model output.
    # Per-dimension Gaussian log-density of the unclipped sample; the sample
    # is wrapped in stop_gradient so gradients only flow through mu and
    # the variance terms.
    all_probs = - 0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq \
                - 0.5 * tf.log(2.0 * np.pi) - 0.5 * log_sigma_sq

    self.all_log_probs = tf.identity(all_probs, name='action_probs')

    # 0.5 * mean over action dimensions of (log(2*pi*e) + log-variance).
    self.entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + log_sigma_sq)

    # Scalar value head on the value stream.
    value = tf.layers.dense(hidden_value, 1, activation=None)
    self.value = tf.identity(value, name="value_estimate")

    self.all_old_log_probs = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name='old_probabilities')

    # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
    # Per-dimension log-probabilities are summed over axis 1.
    self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)), axis=1,
                                   keepdims=True)
    self.old_log_probs = tf.reduce_sum(
        (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True)
class Posenet:
    """Pose-regression network built on an Inception-v3 trunk.

    The network outputs a 3-component position ``x`` followed by an
    orientation ``q`` (4 components for ``output_type='quat'``, 3 for
    ``'axis'``); ``q`` is L2-normalised.
    """

    # Building blocks
    weight_init = staticmethod(
        variance_scaling_initializer())  # "MSRA" initialization
    weight_decay = staticmethod(slim.l2_regularizer(5e-5))
    activation_function = staticmethod(tf.nn.relu)

    def __init__(self, endpoint='Mixed_7c', n_fc=2048, loss_type='standard',
                 output_type='quat'):
        """Store the configuration.

        :param endpoint: final Inception-v3 endpoint to build up to.
        :param n_fc: width of the two hidden fully connected layers.
        :param loss_type: ``'standard'`` or ``'min'``.
        :param output_type: ``'quat'`` or ``'axis'``.
        :raises ValueError: on an unknown ``loss_type`` or ``output_type``.
        """
        self.endpoint = endpoint
        self.n_fc = n_fc
        self.layers = {}

        if loss_type not in ('standard', 'min'):
            raise ValueError('Unknown loss')
        self.loss_type = loss_type

        if output_type not in ('quat', 'axis'):
            raise ValueError('Unknown output type')
        self.output_type = output_type

    def create_stream(self, data_input, dropout, trainable):
        """Build trunk, two hidden FC layers and the pose regressor.

        :param data_input: input tensor for ``inception_v3_base``.
        :param dropout: keep probability for both dropout layers, or ``None``
            to build the stream without dropout.
        :param trainable: forwarded to every layer created here.
        :return: ``(sliced_output, layers)`` where ``sliced_output`` is the
            dict produced by :meth:`slice_output` and ``layers`` is the
            endpoint dict extended with ``'last_output'``.
        """
        with slim.arg_scope([slim.conv2d], padding='SAME',
                            activation_fn=self.activation_function,
                            weights_initializer=self.weight_init,
                            weights_regularizer=self.weight_decay,
                            trainable=trainable):
            last_output, layers = inception.inception_v3_base(
                data_input, scope='Inception_V3',
                final_endpoint=self.endpoint)

            # Global average pooling
            last_output = tf.reduce_mean(last_output, [1, 2])
            # NOTE(review): this reshape presumes the endpoint has exactly
            # ``n_fc`` channels -- confirm before changing either default.
            last_output = tf.reshape(last_output, [-1, self.n_fc])

            last_output = slim.fully_connected(last_output, self.n_fc,
                                               scope='fc0',
                                               trainable=trainable)
            if dropout is not None:
                last_output = slim.dropout(last_output, keep_prob=dropout,
                                           scope='dropout_0')

            last_output = slim.fully_connected(last_output, self.n_fc,
                                               scope='fc1',
                                               trainable=trainable)
            if dropout is not None:
                last_output = slim.dropout(last_output, keep_prob=dropout,
                                           scope='dropout_1')

            # Pose Regression
            n_last = 7 if self.output_type == 'quat' else 6
            last_output = slim.fully_connected(
                last_output, n_last,
                normalizer_fn=None, scope='fc2',
                activation_fn=None,
                weights_initializer=self.weight_init,
                weights_regularizer=self.weight_decay,
                trainable=trainable)

            layers['last_output'] = last_output
            self.layers = layers
            return self.slice_output(last_output), layers

    def slice_output(self, output):
        """Split a pose tensor into position ``x`` and unit-norm rotation ``q``."""
        x = tf.slice(output, [0, 0], [-1, 3])
        k = 4 if self.output_type == 'quat' else 3
        q = tf.nn.l2_normalize(tf.slice(output, [0, 3], [-1, k]), 1)
        return {'x': x, 'q': q}

    def loss(self, outputs, gt, beta, learn_beta):
        """Return ``(x_loss, q_loss, total_loss)``.

        Both partial losses are the batch mean of the per-sample L1 distance.
        With quaternion output and ``loss_type='min'`` the orientation loss is
        the smaller of the distances to ``q`` and ``-q``. The total is
        ``x_loss / beta + q_loss * beta`` when ``learn_beta`` is true and
        ``x_loss + q_loss * beta`` otherwise.
        """
        # tf.subtract / tf.multiply replace tf.sub / tf.mul, which no longer
        # exist from TensorFlow 1.0 on.
        x_loss = tf.reduce_sum(
            tf.abs(tf.subtract(outputs["x"], gt["x"])), 1)
        q_loss = tf.reduce_sum(
            tf.abs(tf.subtract(outputs["q"], gt["q"])), 1)

        if self.output_type == 'quat' and self.loss_type == 'min':
            q_neg_loss = tf.reduce_sum(
                tf.abs(tf.subtract(tf.negative(outputs["q"]), gt["q"])), 1)
            q_loss = tf.minimum(q_loss, q_neg_loss)

        x_loss = tf.reduce_mean(x_loss)
        q_loss = tf.reduce_mean(q_loss)

        if learn_beta:
            total_loss = tf.add(tf.truediv(x_loss, beta),
                                tf.multiply(q_loss, beta))
        else:
            total_loss = tf.add(x_loss, tf.multiply(q_loss, beta))

        return x_loss, q_loss, total_loss

    def create_validation(self, inputs, labels, beta=None):
        """Build a weight-sharing validation stream with loss summaries.

        The tensor ``PoseNet/learned_beta:0`` is used as the weight when it
        exists in the default graph; otherwise ``beta`` must be given.

        :raises ValueError: if no learned beta exists and ``beta`` is None.
        :return: ``(outputs, total_loss, summaries)``.
        """
        summaries = []

        with tf.variable_scope('PoseNet', reuse=True):
            try:
                weight = tf.get_default_graph().get_tensor_by_name(
                    'PoseNet/learned_beta:0')
                learn_beta = True
                print('Using learned beta')
            except KeyError:
                if beta is None:
                    raise ValueError('The value of beta has to be specified')
                weight = tf.constant(beta, tf.float32)
                learn_beta = False

            outputs, layers = self.create_stream(inputs, dropout=None,
                                                 trainable=False)
            gt = self.slice_output(labels)
            x_loss, q_loss, total_loss = self.loss(outputs, gt, weight,
                                                   learn_beta)

            # Scalar summaries
            summaries.append(
                tf.summary.scalar('validation/Positional Loss', x_loss))
            summaries.append(
                tf.summary.scalar('validation/Orientation Loss', q_loss))
            summaries.append(
                tf.summary.scalar('validation/Total Loss', total_loss))

        return outputs, total_loss, summaries

    def create_testable(self, inputs, dropout=None):
        """Build an inference-only stream under the 'PoseNet' scope."""
        with tf.variable_scope('PoseNet'):
            outputs, _ = self.create_stream(inputs, dropout=dropout,
                                            trainable=False)
        return outputs

    def create_trainable(self, inputs, labels, dropout=0.7, beta=500,
                         learn_beta=False):
        """Build the training stream, its loss and scalar summaries.

        :param beta: loss weight; the initial value of the ``learned_beta``
            variable when ``learn_beta`` is true, a constant otherwise.
        :return: ``(outputs, total_loss, summaries)``.
        """
        summaries = []

        with tf.variable_scope('PoseNet'):
            outputs, layers = self.create_stream(inputs, dropout,
                                                 trainable=True)
            gt = self.slice_output(labels)

            if learn_beta:
                weight = tf.Variable(tf.constant(beta, tf.float32),
                                     name="learned_beta")
            else:
                weight = tf.constant(beta, tf.float32)

            x_loss, q_loss, total_loss = self.loss(outputs, gt, weight,
                                                   learn_beta)

            # Scalar summaries
            summaries.append(
                tf.summary.scalar('train/Positional Loss', x_loss))
            summaries.append(
                tf.summary.scalar('train/Orientation Loss', q_loss))
            summaries.append(tf.summary.scalar('train/Total Loss', total_loss))
            if learn_beta:
                summaries.append(tf.summary.scalar('train/Beta', weight))

        return outputs, total_loss, summaries
def _build_graph(self, input_vars):
    """Build the bottleneck residual network for the depth in ``MODEL_DEPTH``.

    The graph exposes three named tensors: ``prob`` (softmax over the 1000
    logits), ``wrong-top1`` and ``wrong-top5``.

    :param input_vars: pair ``(image, label)``.
    """
    image, label = input_vars

    def shortcut(l, n_in, n_out, stride):
        """Identity when widths match, otherwise a 1x1 conv followed by BN."""
        if n_in == n_out:
            return l
        projected = Conv2D('convshortcut', l, n_out, 1, stride=stride)
        return BatchNorm('bnshortcut', projected)

    def bottleneck(l, ch_out, stride, preact):
        """1x1 -> 3x3 -> 1x1 conv unit; the first 1x1 conv carries the stride."""
        ch_in = l.get_shape().as_list()[-1]
        if preact == 'both_preact':
            l = tf.nn.relu(l, name='preact-relu')
        # The skip path sees the ReLU output when pre-activation is applied.
        skip_in = l

        branch = Conv2D('conv1', l, ch_out, 1, stride=stride)
        branch = tf.nn.relu(BatchNorm('bn1', branch))
        branch = Conv2D('conv2', branch, ch_out, 3)
        branch = tf.nn.relu(BatchNorm('bn2', branch))
        branch = Conv2D('conv3', branch, ch_out * 4, 1)
        branch = BatchNorm('bn3', branch)  # put bn at the bottom
        return branch + shortcut(skip_in, ch_in, ch_out * 4, stride)

    def layer(l, layername, features, count, stride, first=False):
        """``count`` bottleneck units under ``layername``; only block0 strides."""
        first_mode = 'no_preact' if first else 'both_preact'
        with tf.variable_scope(layername):
            with tf.variable_scope('block0'):
                l = bottleneck(l, features, stride, first_mode)
            for i in range(1, count):
                with tf.variable_scope('block{}'.format(i)):
                    l = bottleneck(l, features, 1, 'both_preact')
            return l

    cfg = {
        50: ([3, 4, 6, 3]),
        101: ([3, 4, 23, 3]),
        152: ([3, 8, 36, 3])
    }
    defs = cfg[MODEL_DEPTH]

    # (width, stride) of the four residual groups, in order.
    group_specs = ((64, 1), (128, 2), (256, 2), (512, 2))

    with argscope(Conv2D, nl=tf.identity, use_bias=False,
                  W_init=variance_scaling_initializer(mode='FAN_OUT')):
        # tensorflow with padding=SAME will by default pad [2,3] here.
        # but caffe conv with stride will pad [3,3]
        image = tf.pad(image, [[0, 0], [3, 3], [3, 3], [0, 0]])

        net = LinearWrap(image)
        net = net.Conv2D('conv0', 64, 7, stride=2, nl=BNReLU, padding='VALID')
        net = net.MaxPooling('pool0', shape=3, stride=2, padding='SAME')
        for idx, (width, stride) in enumerate(group_specs):
            net = net.apply(layer, 'group{}'.format(idx), width, defs[idx],
                            stride, first=(idx == 0))
        net = net.tf.nn.relu()
        net = net.GlobalAvgPooling('gap')
        fc1000 = net.FullyConnected('fc1000', 1000, nl=tf.identity)()

    # These ops are created for their names; the handles are not used here.
    prob = tf.nn.softmax(fc1000, name='prob')
    nr_wrong = prediction_incorrect(fc1000, label, name='wrong-top1')
    nr_wrong = prediction_incorrect(fc1000, label, 5, name='wrong-top5')
# 2*2*512 max_pool_normal = tf.nn.max_pool(section4_normal, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') max_pool_eigen = tf.nn.max_pool(section4_eigen, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') sum_avg = tf.concat([max_pool_normal, max_pool_eigen], axis=3) with tf.variable_scope('Full_Connected_Layer'): sum_avg_flat = tf.reshape(sum_avg, [-1, int(sum_avg.get_shape()[3])]) weight = tf.get_variable('weight', [sum_avg_flat.get_shape()[1], 7], initializer=layers.variance_scaling_initializer(), regularizer=layers.l2_regularizer(weight_decay)) bias = tf.get_variable('bias', [7], initializer=tf.zeros_initializer()) y_conv = tf.nn.bias_add(tf.matmul(sum_avg_flat, weight), bias) tf.summary.histogram(' ', y_conv) with tf.name_scope('Loss'): cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=y_conv, labels=y_)) l2 = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) loss = cost + l2 tf.summary.scalar('Loss', loss) with tf.variable_scope('Learning_rate'): global_step = tf.Variable(0, trainable=False) learning_rate = tf.train.exponential_decay(
def build(inputs, labels, weights, is_training=True, needs_vgg=False):
    """Build the VGG-style segmentation network and its loss.

    Convolution groups ``conv1``-``conv3`` are plain 3x3 stacks separated by
    2x2 max pooling; ``conv4`` and ``conv5`` are run between
    ``tf.space_to_batch`` / ``tf.batch_to_space`` with block sizes 2 and 4.
    Two batch-normalised layers (``conv6_1``, ``unary_2``) produce the class
    scores, which are bilinearly resized to
    ``FLAGS.img_height`` x ``FLAGS.img_width``.

    :param inputs: input image batch.
    :param labels: forwarded to ``get_loss``.
    :param weights: forwarded to ``get_loss``.
    :param is_training: selects batch-norm training mode and is forwarded to
        ``get_loss``.
    :param needs_vgg: when true, VGG initial values are read from
        ``FLAGS.vgg_init_dir``.
    :return: ``(logits, loss, init_op, init_feed)`` when both ``is_training``
        and ``needs_vgg`` are true, otherwise ``(logits, loss)``.
    """
    if needs_vgg:
        vgg_layers, vgg_layer_names = read_vgg_init(FLAGS.vgg_init_dir)

    weight_decay = 5e-4
    bn_params = dict([
        # Decay for the moving averages.
        ('decay', 0.999),
        ('center', True),
        ('scale', True),
        # epsilon to prevent 0s in variance.
        ('epsilon', 0.001),
        # None to force the updates
        ('updates_collections', None),
        ('is_training', is_training),
    ])

    def conv_stack(x, depth, scopes):
        """Apply one ``convolution2d`` of ``depth`` filters per scope name."""
        for scope_name in scopes:
            x = layers.convolution2d(x, depth, scope=scope_name)
        return x

    no_pad = [[0, 0], [0, 0]]

    with tf.contrib.framework.arg_scope(
            [layers.convolution2d],
            kernel_size=3, stride=1, padding='SAME', rate=1,
            activation_fn=tf.nn.relu,
            # normalizer_fn=layers.batch_norm, normalizer_params=bn_params,
            # weights_initializer=layers.variance_scaling_initializer(),
            normalizer_fn=None, weights_initializer=None,
            weights_regularizer=layers.l2_regularizer(weight_decay)):
        net = conv_stack(inputs, 64, ('conv1_1', 'conv1_2'))
        net = layers.max_pool2d(net, 2, 2, scope='pool1')
        net = conv_stack(net, 128, ('conv2_1', 'conv2_2'))
        net = layers.max_pool2d(net, 2, 2, scope='pool2')
        net = conv_stack(net, 256, ('conv3_1', 'conv3_2', 'conv3_3'))

        # The last two groups run on a space-to-batch rearranged tensor
        # instead of being preceded by pooling.
        for block_size, group in ((2, 'conv4'), (4, 'conv5')):
            net = tf.space_to_batch(net, paddings=no_pad, block_size=block_size)
            net = conv_stack(
                net, 512, ['{}_{}'.format(group, i) for i in (1, 2, 3)])
            net = tf.batch_to_space(net, crops=no_pad, block_size=block_size)

    with tf.contrib.framework.arg_scope(
            [layers.convolution2d],
            stride=1, padding='SAME',
            weights_initializer=layers.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=layers.batch_norm, normalizer_params=bn_params,
            weights_regularizer=layers.l2_regularizer(FLAGS.weight_decay)):
        net = layers.convolution2d(
            net, 512, kernel_size=3, scope='conv6_1', rate=4)
        logits = layers.convolution2d(
            net, FLAGS.num_classes, 1, padding='SAME',
            activation_fn=None, scope='unary_2', rate=2)
        print('logits', logits.get_shape())
        logits = tf.image.resize_bilinear(
            logits, [FLAGS.img_height, FLAGS.img_width], name='resize_score')

    loss = get_loss(logits, labels, weights, is_training=is_training)

    if not (is_training and needs_vgg):
        return logits, loss
    init_op, init_feed = create_init_op(vgg_layers)
    return logits, loss, init_op, init_feed
def resnn(self, sequences):
    """Build the resnn model.

    The graph is: a first convolution with BN+ReLU, a max pool, the stacked
    residual units of every group in ``self.groups`` (with optional extra
    shortcuts controlled by ``self.ror_l1`` / ``self.ror_l2``), a global
    average pool, optional dropout and a final fully connected layer with
    ``self.num_cats`` outputs and no activation.

    Args:
        sequences: Sequences returned from inputs_train() or inputs_eval.
            A singleton axis is inserted at position 2 before the first
            convolution.

    Returns:
        Logits: the output of the final fully connected layer.
    """
    self.model_conf()

    # [batch_size, html_len, 1, we_dim]
    target_expanded = tf.expand_dims(sequences, 2)

    # First convolution
    with tf.variable_scope('conv_layer1'):
        net = self.conv1d(target_expanded, self.groups[0].num_ker, 7, 2)
        # BN+ReLU is applied unconditionally; the former
        # `if self.special_first:` guard is disabled.
        net = self.BN_ReLU(net)

    # Max pool: window 3 and stride 2 along axis 1 only
    net = tf.nn.max_pool(net,
                         [1, 3, 1, 1],
                         strides=[1, 2, 1, 1],
                         padding='SAME')

    # Keep the input of the whole stack for the level-1 shortcut.
    if self.ror_l1:
        net_l1 = net

    # stacking Residual Units
    for group_i, group in enumerate(self.groups):

        # Keep the input of this group for the level-2 shortcut.
        if self.ror_l2:
            net_l2 = net

        for unit_i in range(group.num_units):
            net = self.residual_unit(net, group_i, unit_i)

        if self.ror_l2:
            # this is necessary to prevent loss exploding
            net_l2 = self.BN_ReLU(net_l2)
            # NOTE(review): assumes the last argument of self.conv1d is the
            # stride, so the shortcut is reduced by 2 per group - confirm.
            net_l2 = self.conv1d(net_l2, self.groups[group_i].num_ker,
                                 self.bott_size13, 2)
            net = net + net_l2

    if self.ror_l1:
        net_l1 = self.BN_ReLU(net_l1)
        # 2 ** len(self.groups) is the product of the factor 2 used by the
        # per-group shortcut above, taken over all groups.
        net_l1 = self.conv1d(net_l1, self.groups[-1].num_ker,
                             self.bott_size13, 2 ** len(self.groups))
        net = net + net_l1

    # an extra activation before average pooling
    if self.special_first:
        with tf.variable_scope('special_BN_ReLU'):
            net = self.BN_ReLU(net)

    # padding should be VALID for global average pooling
    # output: batch*1*1*channels
    net_shape = net.get_shape().as_list()
    net = tf.nn.avg_pool(net,
                         ksize=[1, net_shape[1], net_shape[2], 1],
                         strides=[1, 1, 1, 1],
                         padding='VALID')

    # Flatten everything except the batch axis.
    net_shape = net.get_shape().as_list()
    softmax_len = net_shape[1] * net_shape[2] * net_shape[3]
    net = tf.reshape(net, [-1, softmax_len])

    # add dropout
    if self.dropout:
        with tf.name_scope("dropout"):
            net = tf.nn.dropout(net, self.dropout_keep_prob)

    # 1D fully connected neural network
    with tf.variable_scope('FC-layer'):
        net = fully_connected(
            net,
            num_outputs=self.num_cats,
            activation_fn=None,
            normalizer_fn=None,
            weights_initializer=variance_scaling_initializer(),
            weights_regularizer=l2_regularizer(self.weight_decay),
            biases_initializer=tf.zeros_initializer,
        )

    return net
# --*-- encoding: UTF-8 --*-- from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import numpy as np import tensorflow as tf tf.GraphKeys.VARIABLES = tf.GraphKeys.GLOBAL_VARIABLES from luna import FLAGS, batch_norm from tensorflow.contrib.layers import variance_scaling_initializer FLAGS.LABEL_NUMBER = 2 XAVIER_INIT = tf.contrib.layers.xavier_initializer(seed=FLAGS.SEED) RELU_INIT = variance_scaling_initializer() Wb = { 'W1': tf.get_variable('W1', [5, 5, 5, FLAGS.CHANNEL_NUMBER, 16], tf.float32, XAVIER_INIT), 'b1': tf.Variable(tf.zeros([16])), 'W2': tf.get_variable('W2', [3, 3, 3, 16, 24], tf.float32, XAVIER_INIT), 'b2': tf.Variable(tf.zeros([24])), 'W3': tf.get_variable('W3', [3, 3, 3, 24, 32], tf.float32, XAVIER_INIT), 'b3': tf.Variable(tf.zeros([32])), 'W4': tf.get_variable('W4', [3, 3, 3, 32, 48], tf.float32, XAVIER_INIT), 'b4': tf.Variable(tf.zeros([48])), 'W5': tf.get_variable('W5', [3, 3, 3, 48, 64], tf.float32, XAVIER_INIT), 'b5': tf.Variable(tf.zeros([64])), 'fcw1': tf.get_variable('fcw1', [36 * 64, 32], tf.float32, XAVIER_INIT), 'fcb1': tf.Variable(tf.zeros([32])), 'fcw2': tf.get_variable('fcw2', [32, FLAGS.LABEL_NUMBER], tf.float32, XAVIER_INIT),
def main(using_to_serve):
    """Configure the encoder Estimator and train it with periodic validation.

    All settings (file patterns, batch sizes, module layouts, learning-rate
    schedule) are fixed inside this function.  A validation hook is attached
    to the training call.  ``using_to_serve`` is accepted but not read here.
    """
    # Locations.
    model_dir = "/srv/tmp/encoder/pre_commit_test"  #with_moves_7_regularized"
    training_pattern = "/srv/databases/chess_engine/move_scoring_1/move_scoring_training_set_*.tfrecords"
    validation_pattern = "/srv/databases/chess_engine/move_scoring_1/move_scoring_validation_set_*.tfrecords"

    # Optimisation and logging settings.
    optimizer_name = 'Adam'
    base_learning_rate = 1e-3
    training_batch_size = 64
    validation_batch_size = 20000
    log_interval = 10000
    train_summaries = ["gradient_norm", "gradients"]

    # Model settings.
    num_input_filters = 15
    cnn_modules_trainable = True
    init_conv_layers_fn = None
    noise_factor = .5
    label_tensor_name = "Reshape:0"  #"inception_module_3_path_1/Relu:0"

    def make_regularizer():
        return layers.l2_regularizer(noise_factor)

    def make_initializer():
        return layers.variance_scaling_initializer(
            factor=1, mode='FAN_IN', uniform=True)

    encoder_module = [[
        [20, 3],   # 720
        [35, 3],   # 560
        [400, 4],  # 400
    ]]
    decoder_module = [[
        [100, 2, 1],  #400
        [45, 3, 1],   #800
        [55, 3, 1],   #2160 with final layer having 80*8*8=5120
    ]]

    def transposed_inception(tensor):
        built = ann_h.build_transposed_inception_module_with_batch_norm(
            tensor,
            decoder_module,
            kernel_initializer=make_initializer,
            mode=tf.estimator.ModeKeys.TRAIN,
            padding="valid",
            weight_regularizer=make_regularizer,
            num_previously_built_inception_modules=1)
        return built[0]

    def final_transposed_conv(tensor):
        return tf.layers.conv2d_transpose(
            tensor,
            80,
            3,
            strides=1,
            kernel_initializer=make_initializer(),
            kernel_regularizer=make_regularizer(),
            padding="valid",
            activation=None,
            use_bias=False)

    inception_modules = [
        encoder_module,
        transposed_inception,
        final_transposed_conv,
    ]

    batches_per_training_epoch = (7 * 2009392) // (training_batch_size)
    batches_per_validation_epoch = 2009392 // validation_batch_size

    def learning_decay_function(gs):
        return tf.train.exponential_decay(
            base_learning_rate,
            global_step=gs,
            decay_steps=batches_per_training_epoch,
            decay_rate=0.96,
            staircase=True)

    # Create the Estimator.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=.3)
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_steps=log_interval,
        save_summary_steps=log_interval,
        session_config=tf.ConfigProto(gpu_options=gpu_options))

    estimator_params = {
        "optimizer": optimizer_name,
        "log_interval": log_interval,
        "model_dir": model_dir,
        "inception_modules": inception_modules,
        "learning_rate": base_learning_rate,
        "train_summaries": train_summaries,
        "learning_decay_function": learning_decay_function,
        "trainable_cnn_modules": cnn_modules_trainable,
        "conv_init_fn": init_conv_layers_fn,
        "num_input_filters": num_input_filters,
        "kernel_regularizer": make_regularizer,
        "label_tensor_name": label_tensor_name,
        "kernel_initializer": make_initializer,
    }
    the_estimator = Estimator(
        model_fn=ann_h.encoder_builder_fn,
        model_dir=model_dir,
        config=run_config,
        params=estimator_params)

    def make_validation_input_fn():
        return ann_h.encoder_tf_records_input_data_fn(
            validation_pattern,
            validation_batch_size,
            include_unoccupied=num_input_filters == 16,
            repeat=False,
            shuffle=False)

    validation_hook = ann_h.ValidationRunHook(
        step_increment=batches_per_training_epoch,
        estimator=the_estimator,
        input_fn_creator=make_validation_input_fn,
        temp_num_steps_in_epoch=batches_per_validation_epoch,
        recall_input_fn_creator_after_evaluate=True)

    training_input_fn = ann_h.encoder_tf_records_input_data_fn(
        training_pattern,
        training_batch_size,
        shuffle_buffer_size=50000,
        include_unoccupied=num_input_filters == 16)

    the_estimator.train(
        input_fn=training_input_fn,
        hooks=[validation_hook],
    )
def build(inputs, num_classes=19):
    """Build the inference-only VGG-style network and return its logits.

    Layers conv1_1 through conv5_3 are created with no normalizer and with
    ``weights_initializer=None``; conv6_1 and unary_2 use batch normalization
    (with ``is_training`` fixed to False) and a variance-scaling initializer.
    The conv4 and conv5 groups run between ``tf.space_to_batch`` /
    ``tf.batch_to_space`` with block sizes 2 and 4.  No resizing is applied
    to the returned scores.

    Args:
      inputs: tensor passed to the first convolution.
      num_classes: number of output channels of the final ``unary_2`` layer.
        Defaults to 19, the value that used to be hard-coded.

    Returns:
      The logits tensor produced by ``unary_2``.
    """
    weight_decay = 5e-4
    bn_params = {
        # Decay for the moving averages.
        'decay': 0.999,
        'center': True,
        'scale': True,
        # epsilon to prevent 0s in variance.
        'epsilon': 0.001,
        # None to force the updates
        'updates_collections': None,
        'is_training': False,
    }
    paddings = [[0, 0], [0, 0]]
    crops = [[0, 0], [0, 0]]

    with tf.contrib.framework.arg_scope(
            [layers.convolution2d],
            kernel_size=3,
            stride=1,
            padding='SAME',
            rate=1,
            activation_fn=tf.nn.relu,
            normalizer_fn=None,
            weights_initializer=None,
            weights_regularizer=layers.l2_regularizer(weight_decay)):
        net = layers.convolution2d(inputs, 64, scope='conv1_1')
        net = layers.convolution2d(net, 64, scope='conv1_2')
        net = layers.max_pool2d(net, 2, 2, scope='pool1')

        net = layers.convolution2d(net, 128, scope='conv2_1')
        net = layers.convolution2d(net, 128, scope='conv2_2')
        net = layers.max_pool2d(net, 2, 2, scope='pool2')

        net = layers.convolution2d(net, 256, scope='conv3_1')
        net = layers.convolution2d(net, 256, scope='conv3_2')
        net = layers.convolution2d(net, 256, scope='conv3_3')

        # conv4 is wrapped in space_to_batch / batch_to_space rather than
        # being preceded by a pooling layer.
        block_size = 2
        net = tf.space_to_batch(net, paddings=paddings, block_size=block_size)
        net = layers.convolution2d(net, 512, scope='conv4_1')
        net = layers.convolution2d(net, 512, scope='conv4_2')
        net = layers.convolution2d(net, 512, scope='conv4_3')
        net = tf.batch_to_space(net, crops=crops, block_size=block_size)

        # Same wrapping for conv5, with a larger block size.
        block_size = 4
        net = tf.space_to_batch(net, paddings=paddings, block_size=block_size)
        net = layers.convolution2d(net, 512, scope='conv5_1')
        net = layers.convolution2d(net, 512, scope='conv5_2')
        net = layers.convolution2d(net, 512, scope='conv5_3')
        net = tf.batch_to_space(net, crops=crops, block_size=block_size)

    with tf.contrib.framework.arg_scope(
            [layers.convolution2d],
            stride=1,
            padding='SAME',
            weights_initializer=layers.variance_scaling_initializer(),
            activation_fn=tf.nn.relu,
            normalizer_fn=layers.batch_norm,
            normalizer_params=bn_params,
            weights_regularizer=layers.l2_regularizer(1e-3)):
        net = layers.convolution2d(
            net, 512, kernel_size=3, scope='conv6_1', rate=4)
        logits = layers.convolution2d(
            net,
            num_classes,
            1,
            padding='SAME',
            activation_fn=None,
            scope='unary_2',
            rate=2)

    print('logits', logits.get_shape())
    return logits
def _build_graph(self, inputs):
    """Build the ResNet graph for one (image, label) batch and set self.cost.

    The image is scaled to [0, 1], normalized per channel, and transposed to
    channels-first when ``self.data_format`` is ``'NCHW'``.  The network depth
    and block type are selected by the module-level ``DEPTH``.  Top-1 / top-5
    error summaries are registered, and ``self.cost`` is set to the sum of
    the mean cross-entropy loss and the L2 weight-decay cost.

    Args:
        inputs: a pair ``(image, label)``; the normalization constants are
            applied along the last axis of ``image``, so it is expected to
            arrive channels-last.

    Returns:
        None.  The result is stored in ``self.cost``.
    """
    image, label = inputs
    image = tf.cast(image, tf.float32) * (1.0 / 255)

    # Wrong mean/std are used for compatibility with pre-trained models.
    # Should actually add a RGB-BGR conversion here.
    image_mean = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
    image_std = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)
    image = (image - image_mean) / image_std
    if self.data_format == 'NCHW':
        image = tf.transpose(image, [0, 3, 1, 2])

    # The blocks below need the number of input channels.  Its position
    # depends on the layout: reading axis 1 unconditionally would pick up the
    # image height when the data is channels-last.
    channel_axis = 1 if self.data_format == 'NCHW' else 3

    def shortcut(l, n_in, n_out, stride):
        # Project with a 1x1 convolution only when the channel count changes.
        if n_in != n_out:
            return Conv2D('convshortcut', l, n_out, 1, stride=stride)
        return l

    def apply_preact(l, preact):
        """Return (tensor for the conv branch, tensor for the shortcut)."""
        if preact == 'both_preact':
            # Both branches see the activated tensor.
            l = BNReLU('preact', l)
            return l, l
        if preact != 'no_preact':
            # Only the conv branch is activated; the shortcut stays raw.
            return BNReLU('preact', l), l
        return l, l

    def basicblock(l, ch_out, stride, preact):
        ch_in = l.get_shape().as_list()[channel_axis]
        l, shortcut_in = apply_preact(l, preact)
        l = Conv2D('conv1', l, ch_out, 3, stride=stride, nl=BNReLU)
        l = Conv2D('conv2', l, ch_out, 3)
        return l + shortcut(shortcut_in, ch_in, ch_out, stride)

    def bottleneck(l, ch_out, stride, preact):
        ch_in = l.get_shape().as_list()[channel_axis]
        l, shortcut_in = apply_preact(l, preact)
        l = Conv2D('conv1', l, ch_out, 1, nl=BNReLU)
        l = Conv2D('conv2', l, ch_out, 3, stride=stride, nl=BNReLU)
        l = Conv2D('conv3', l, ch_out * 4, 1)
        return l + shortcut(shortcut_in, ch_in, ch_out * 4, stride)

    def layer(l, layername, block_func, features, count, stride, first=False):
        with tf.variable_scope(layername):
            # Only the first block of a group applies the group's stride.
            with tf.variable_scope('block0'):
                l = block_func(l, features, stride,
                               'no_preact' if first else 'both_preact')
            for i in range(1, count):
                with tf.variable_scope('block{}'.format(i)):
                    l = block_func(l, features, 1, 'default')
            return l

    # depth -> (number of blocks per group, block type)
    cfg = {
        18: ([2, 2, 2, 2], basicblock),
        34: ([3, 4, 6, 3], basicblock),
        50: ([3, 4, 6, 3], bottleneck),
        101: ([3, 4, 23, 3], bottleneck)
    }
    defs, block_func = cfg[DEPTH]

    with argscope(Conv2D, nl=tf.identity, use_bias=False,
                  W_init=variance_scaling_initializer(mode='FAN_OUT')), \
            argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm],
                     data_format=self.data_format):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 64, 7, stride=2, nl=BNReLU)
                  .MaxPooling('pool0', shape=3, stride=2, padding='SAME')
                  .apply(layer, 'group0', block_func, 64, defs[0], 1, first=True)
                  .apply(layer, 'group1', block_func, 128, defs[1], 2)
                  .apply(layer, 'group2', block_func, 256, defs[2], 2)
                  .apply(layer, 'group3', block_func, 512, defs[3], 2)
                  .BNReLU('bnlast')
                  .GlobalAvgPooling('gap')
                  .FullyConnected('linear', 1000, nl=tf.identity)())

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    loss = tf.reduce_mean(loss, name='xentropy-loss')

    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

    wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

    wd_cost = regularize_cost('.*/W', l2_regularizer(1e-4),
                              name='l2_regularize_loss')
    add_moving_summary(loss, wd_cost)
    self.cost = tf.add_n([loss, wd_cost], name='cost')