def _build_train(self):
  print("Build train graph")
  all_h, self.train_reset = self._model(self.x_train, True, False)
  log_probs = self._get_log_probs(
    all_h, self.y_train, batch_size=self.batch_size, is_training=True)
  self.loss = tf.reduce_sum(log_probs) / tf.to_float(self.batch_size)
  self.train_ppl = tf.exp(tf.reduce_mean(log_probs))

  tf_variables = [
    var for var in tf.trainable_variables() if var.name.startswith(self.name)]
  self.num_vars = count_model_params(tf_variables)
  print("-" * 80)
  print("Model has {} parameters".format(self.num_vars))

  loss = self.loss
  if self.rnn_l2_reg is not None:
    loss += (self.rnn_l2_reg * tf.reduce_sum(all_h ** 2) /
             tf.to_float(self.batch_size))
  if self.rnn_slowness_reg is not None:
    loss += (self.rnn_slowness_reg * self.all_h_diff /
             tf.to_float(self.batch_size))

  self.global_step = tf.Variable(
    0, dtype=tf.int32, trainable=False, name="global_step")
  (self.train_op,
   self.lr,
   self.grad_norm,
   self.optimizer,
   self.grad_norms) = get_train_ops(
     loss,
     tf_variables,
     self.global_step,
     clip_mode=self.clip_mode,
     grad_bound=self.grad_bound,
     l2_reg=self.l2_reg,
     lr_warmup_val=self.lr_warmup_val,
     lr_warmup_steps=self.lr_warmup_steps,
     lr_init=self.lr_init,
     lr_dec_start=self.lr_dec_start,
     lr_dec_every=self.lr_dec_every,
     lr_dec_rate=self.lr_dec_rate,
     lr_dec_min=self.lr_dec_min,
     optim_algo=self.optim_algo,
     moving_average=self.optim_moving_average,
     sync_replicas=self.sync_replicas,
     num_aggregate=self.num_aggregate,
     num_replicas=self.num_replicas,
     get_grad_norms=True,
   )
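# Illustrative sketch (not part of the model code): train_ppl above is simply
# the exponential of the mean per-token cross-entropy. With hypothetical
# per-token losses, the same computation in NumPy:
import numpy as np

_token_losses = np.array([4.2, 3.9, 5.1, 4.4])  # made-up per-token losses, in nats
_perplexity = np.exp(_token_losses.mean())      # mirrors tf.exp(tf.reduce_mean(log_probs))
# _perplexity ~= 81.5, i.e. exp(4.4)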
def _build_train(self):
  print("-" * 80)
  print("Build train graph")
  logits = self._model(self.x_train, is_training=True)
  log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=self.y_train)
  self.loss = tf.reduce_mean(log_probs)

  if self.use_aux_heads:
    # Auxiliary classifier loss, folded in at 0.4x the weight of the main loss.
    log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=self.aux_logits, labels=self.y_train)
    self.aux_loss = tf.reduce_mean(log_probs)
    train_loss = self.loss + 0.4 * self.aux_loss
  else:
    train_loss = self.loss

  # train_acc is the *count* of correct predictions in the mini-batch.
  self.train_preds = tf.argmax(logits, axis=1)
  self.train_preds = tf.to_int32(self.train_preds)
  self.train_acc = tf.equal(self.train_preds, self.y_train)
  self.train_acc = tf.to_int32(self.train_acc)
  self.train_acc = tf.reduce_sum(self.train_acc)

  tf_variables = [
    var for var in tf.trainable_variables() if (
      var.name.startswith(self.name) and "aux_head" not in var.name)]
  self.num_vars = count_model_params(tf_variables)
  print("Model has {0} params".format(self.num_vars))

  self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops(
    train_loss,
    tf_variables,
    self.global_step,
    clip_mode=self.clip_mode,
    grad_bound=self.grad_bound,
    l2_reg=self.l2_reg,
    lr_init=self.lr_init,
    lr_dec_start=self.lr_dec_start,
    lr_dec_every=self.lr_dec_every,
    lr_dec_rate=self.lr_dec_rate,
    lr_cosine=self.lr_cosine,
    lr_max=self.lr_max,
    lr_min=self.lr_min,
    lr_T_0=self.lr_T_0,
    lr_T_mul=self.lr_T_mul,
    num_train_batches=self.num_train_batches,
    optim_algo=self.optim_algo,
    sync_replicas=self.sync_replicas,
    num_aggregate=self.num_aggregate,
    num_replicas=self.num_replicas)
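# Illustrative sketch (hypothetical helper, not in the original code): the
# auxiliary head's loss above contributes at 0.4x the weight of the main loss.
# The same weighting as a tiny standalone function, with a numeric check:
def _combined_train_loss(main_loss, aux_loss=None, aux_weight=0.4):
  """Return the training loss, adding a down-weighted auxiliary loss if any."""
  if aux_loss is None:
    return main_loss
  return main_loss + aux_weight * aux_loss

assert abs(_combined_train_loss(1.0, 0.5) - 1.2) < 1e-9  # 1.0 + 0.4 * 0.5
assert _combined_train_loss(1.0) == 1.0                  # no auxiliary head configured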
def _build_train(self):
  print("Build train graph")
  logits = self._model(self.x_train, True)
  log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=self.y_train)
  self.loss = tf.reduce_mean(log_probs)

  self.train_preds = tf.argmax(logits, axis=1)
  self.train_preds = tf.to_int32(self.train_preds)
  self.train_acc = tf.equal(self.train_preds, self.y_train)
  self.train_acc = tf.to_int32(self.train_acc)
  self.train_acc = tf.reduce_sum(self.train_acc)

  tf_variables = [
    var for var in tf.trainable_variables() if var.name.startswith(self.name)]
  self.num_vars = count_model_params(tf_variables)
  print("-" * 80)
  for var in tf_variables:
    print(var)

  self.global_step = tf.Variable(
    0, dtype=tf.int32, trainable=False, name="global_step")
  self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops(
    self.loss,
    tf_variables,
    self.global_step,
    clip_mode=self.clip_mode,
    grad_bound=self.grad_bound,
    l2_reg=self.l2_reg,
    lr_init=self.lr_init,
    lr_dec_start=self.lr_dec_start,
    lr_dec_every=self.lr_dec_every,
    lr_dec_rate=self.lr_dec_rate,
    optim_algo=self.optim_algo,
    sync_replicas=self.sync_replicas,
    num_aggregate=self.num_aggregate,
    num_replicas=self.num_replicas)
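# Illustrative sketch (hypothetical driver code, not in the original file): in
# TF1 the ops built above are typically run in a session loop. This assumes the
# x_train / y_train pipeline is fed by queue runners, `child` is an instance of
# this model built elsewhere, and `num_steps` is just a made-up step budget.
num_steps = 1000  # hypothetical number of training steps
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  tf.train.start_queue_runners(sess=sess)  # needed if inputs come from TF queues
  for _ in range(num_steps):
    fetches = [child.train_op, child.loss, child.lr,
               child.grad_norm, child.train_acc, child.global_step]
    _, loss, lr, grad_norm, num_correct, step = sess.run(fetches)
    if step % 100 == 0:
      # train_acc is a count of correct predictions, so normalize by batch size.
      print("step={:<6d} loss={:.3f} lr={:.4f} |g|={:.3f} acc={:.3f}".format(
        step, loss, lr, grad_norm, float(num_correct) / child.batch_size))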
def _model(self, images, is_training, reuse=False):
  """Compute the logits given the images."""

  if self.fixed_arc is None:
    is_training = True

  with tf.variable_scope(self.name, reuse=reuse):
    # the first two inputs
    with tf.variable_scope("stem_conv"):
      w = create_weight("w", [3, 3, 3, self.out_filters * 3])
      x = tf.nn.conv2d(
        images, w, [1, 1, 1, 1], "SAME", data_format=self.data_format)
      x = batch_norm(x, is_training, data_format=self.data_format)
    if self.data_format == "NHWC":
      split_axis = 3
    elif self.data_format == "NCHW":
      split_axis = 1
    else:
      raise ValueError("Unknown data_format '{0}'".format(self.data_format))
    layers = [x, x]

    # building layers in the micro space
    out_filters = self.out_filters
    for layer_id in range(self.num_layers + 2):
      with tf.variable_scope("layer_{0}".format(layer_id)):
        if layer_id not in self.pool_layers:
          if self.fixed_arc is None:
            x = self._enas_layer(
              layer_id, layers, self.normal_arc, out_filters)
          else:
            x = self._fixed_layer(
              layer_id, layers, self.normal_arc, out_filters, 1, is_training,
              normal_or_reduction_cell="normal")
        else:
          out_filters *= 2
          if self.fixed_arc is None:
            x = self._factorized_reduction(x, out_filters, 2, is_training)
            layers = [layers[-1], x]
            x = self._enas_layer(
              layer_id, layers, self.reduce_arc, out_filters)
          else:
            x = self._fixed_layer(
              layer_id, layers, self.reduce_arc, out_filters, 2, is_training,
              normal_or_reduction_cell="reduction")
        print("Layer {0:>2d}: {1}".format(layer_id, x))
        layers = [layers[-1], x]

      # auxiliary heads
      self.num_aux_vars = 0
      if (self.use_aux_heads and
          layer_id in self.aux_head_indices and
          is_training):
        print("Using aux_head at layer {0}".format(layer_id))
        with tf.variable_scope("aux_head"):
          aux_logits = tf.nn.relu(x)
          aux_logits = tf.layers.average_pooling2d(
            aux_logits, [5, 5], [3, 3], "VALID",
            data_format=self.actual_data_format)
          with tf.variable_scope("proj"):
            inp_c = self._get_C(aux_logits)
            w = create_weight("w", [1, 1, inp_c, 128])
            aux_logits = tf.nn.conv2d(aux_logits, w, [1, 1, 1, 1], "SAME",
                                      data_format=self.data_format)
            aux_logits = batch_norm(aux_logits, is_training=True,
                                    data_format=self.data_format)
            aux_logits = tf.nn.relu(aux_logits)

          with tf.variable_scope("avg_pool"):
            inp_c = self._get_C(aux_logits)
            hw = self._get_HW(aux_logits)
            w = create_weight("w", [hw, hw, inp_c, 768])
            aux_logits = tf.nn.conv2d(aux_logits, w, [1, 1, 1, 1], "SAME",
                                      data_format=self.data_format)
            aux_logits = batch_norm(aux_logits, is_training=True,
                                    data_format=self.data_format)
            aux_logits = tf.nn.relu(aux_logits)

          with tf.variable_scope("fc"):
            aux_logits = global_avg_pool(aux_logits,
                                         data_format=self.data_format)
            inp_c = aux_logits.get_shape()[1].value
            w = create_weight("w", [inp_c, 10])
            aux_logits = tf.matmul(aux_logits, w)
            self.aux_logits = aux_logits

        aux_head_variables = [
          var for var in tf.trainable_variables() if (
            var.name.startswith(self.name) and "aux_head" in var.name)]
        self.num_aux_vars = count_model_params(aux_head_variables)
        print("Aux head uses {0} params".format(self.num_aux_vars))

    x = tf.nn.relu(x)
    x = global_avg_pool(x, data_format=self.data_format)
    if is_training and self.keep_prob is not None and self.keep_prob < 1.0:
      x = tf.nn.dropout(x, self.keep_prob)
    with tf.variable_scope("fc"):
      inp_c = self._get_C(x)
      w = create_weight("w", [inp_c, 10])
      x = tf.matmul(x, w)
  return x
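# Illustrative sketch (plausible shape helpers, assumed rather than copied from
# the original code): _model above relies on _get_C / _get_HW to read channel
# and spatial dimensions under either data layout. They could look like this:
def _get_C(self, x):
  """Channel count of x, for NHWC ([N, H, W, C]) or NCHW ([N, C, H, W])."""
  if self.data_format == "NHWC":
    return x.get_shape()[3].value
  elif self.data_format == "NCHW":
    return x.get_shape()[1].value
  else:
    raise ValueError("Unknown data_format '{0}'".format(self.data_format))

def _get_HW(self, x):
  """Spatial size of x; index 2 is valid for both layouts when height == width."""
  return x.get_shape()[2].value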