def _create_gumbel_control_variate(self, logQHard, temperature=None):
    """Build the Gumbel control variate from two relaxed (soft) passes.

    The recognition and generator networks are run twice: once with
    `self._random_sample_soft` and once with `self._random_sample_soft_v`
    as the sampler, both at the same temperature.

    Args:
      logQHard: list of tensors; summed with `tf.add_n` and multiplied by the
        gradient-stopped second soft ELBO.
      temperature: value forwarded to both samplers; defaults to
        `self.hparams.temperature` when None.

    Returns:
      `(h, extra)` where `h` is
      `stop_gradient(softELBO_v) * sum(logQHard) - softELBO + softELBO_v` and
      `extra` is the tuple `(softELBO_v, softELBO_v - softELBO)`.
    """
    if temperature is None:
        temperature = self.hparams.temperature

    # First relaxed pass.
    soft_sampler = functools.partial(
        self._random_sample_soft, temperature=temperature)
    log_q, soft_samples = self._recognition_network(sampler=soft_sampler)
    soft_elbo, _ = self._generator_network(soft_samples, log_q)
    log_q = tf.add_n(log_q)

    # Second relaxed pass (original note: should be the same value but
    # different grads).
    soft_sampler_v = functools.partial(
        self._random_sample_soft_v, temperature=temperature)
    log_q_v, soft_samples_v = self._recognition_network(sampler=soft_sampler_v)
    soft_elbo_v, _ = self._generator_network(soft_samples_v, log_q_v)
    log_q_v = tf.add_n(log_q_v)

    # The learning signal must not receive gradients.
    learning_signal = tf.stop_gradient(soft_elbo_v)

    # Control variate.
    scaled_log_q_hard = tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
    h = scaled_log_q_hard - soft_elbo + soft_elbo_v

    extra = (soft_elbo_v, -soft_elbo + soft_elbo_v)
    return h, extra
def __call__(self, flow=None):
    """Constructs the `Parallel` and its inner pieces.

    Every child piece is applied to the same `flow`; the per-child outputs
    are then merged according to `self.mode`. After the first call
    `self.reuse` is set to True, so later calls enter the variable scope
    with `reuse=True`.

    Args:
        flow: Input `Tensor` object. (Default value = None)

    Returns:
        Output of this `Parallel`: the child outputs concatenated along
        `self.along_dim` ('concat'), averaged ('mean') or summed ('sum').

    Raises:
        ValueError: If `self.mode` is not 'concat', 'mean' or 'sum'
            (previously this case silently returned None).
    """
    # build inner pieces.
    with tf.variable_op_scope([], self.name, 'Parallel', reuse=self.reuse):
        if not self.reuse:
            self.reuse = True

        outputs = [piece(flow) for piece in self.child_pieces]

        if self.mode == 'concat':
            return tf.concat(self.along_dim, outputs)
        if self.mode == 'mean':
            return tf.add_n(outputs) / len(self.child_pieces)
        if self.mode == 'sum':
            return tf.add_n(outputs)
        raise ValueError(
            "Unsupported Parallel mode %r; expected 'concat', 'mean' or 'sum'."
            % (self.mode,))
def loss(self, traindata):
    """Build the generator/discriminator graphs and collect their losses.

    Each call appends one term to the 'g_losses' graph collection and two
    terms to 'd_losses'; the returned totals sum everything currently stored
    in those collections.

    Args:
        traindata: 4-D Tensor of shape `[batch, height, width, channels]`.

    Returns:
        dict mapping `self.g` and `self.d` to their total loss tensors.
    """
    fake_images = self.g(self.z, training=True)
    fake_logits = self.d(fake_images, training=True, name='g')
    real_logits = self.d(traindata, training=True, name='t')

    def _mean_xent(label_fill, logits):
        # Constant labels (all ones or all zeros) for the whole batch.
        labels = label_fill([self.batch_size], dtype=tf.int64)
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits))

    # Generator term: generated samples labelled 1.
    tf.add_to_collection('g_losses', _mean_xent(tf.ones, fake_logits))
    # Discriminator terms: training data labelled 1, generated labelled 0.
    tf.add_to_collection('d_losses', _mean_xent(tf.ones, real_logits))
    tf.add_to_collection('d_losses', _mean_xent(tf.zeros, fake_logits))

    total_g = tf.add_n(tf.get_collection('g_losses'), name='total_g_loss')
    total_d = tf.add_n(tf.get_collection('d_losses'), name='total_d_loss')
    return {self.g: total_g, self.d: total_d}
def loss(self, predicts, labels, objects_num):
    """Accumulate the four per-image loss terms over the batch.

    For every image a `tf.while_loop` driven by `self.cond1` / `self.body1`
    is run; element 2 of its result holds the four partial losses (class,
    object, no-object, coordinate) and element 5 is returned alongside the
    total for the last image.

    Args:
      predicts: 4-D tensor indexed as [batch, :, :, :]
        (presumably [batch_size, cell_size, cell_size, channels] - TODO confirm).
      labels: 3-D tensor of [batch_size, max_objects, 5].
      objects_num: 1-D tensor [batch_size].

    Returns:
      `(total_loss, nilboy)`: the sum of everything in the 'losses'
      collection (named 'total_loss') and element 5 of the last while-loop
      result.
    """
    initial_parts = [tf.constant(0, tf.float32) for _ in range(4)]
    totals = [0, 0, 0, 0]

    for i in range(self.batch_size):
        predict = predicts[i, :, :, :]
        label = labels[i, :, :]
        object_num = objects_num[i]
        nilboy = tf.ones([7, 7, 2])
        loop_vars = [tf.constant(0), object_num, list(initial_parts),
                     predict, label, nilboy]
        tuple_results = tf.while_loop(self.cond1, self.body1, loop_vars)
        totals = [total + part
                  for total, part in zip(totals, tuple_results[2])]
        nilboy = tuple_results[5]

    def _batch_mean_total():
        # Deliberately rebuilt on each use, as the original expression was.
        return (totals[0] + totals[1] + totals[2] + totals[3]) / self.batch_size

    tf.add_to_collection('losses', _batch_mean_total())

    summary_tags = ('class_loss', 'object_loss', 'noobject_loss', 'coord_loss')
    for tag, total in zip(summary_tags, totals):
        tf.summary.scalar(tag, total / self.batch_size)

    # Whatever else lives in the 'losses' collection besides this loss.
    tf.summary.scalar(
        'weight_loss',
        tf.add_n(tf.get_collection('losses')) - _batch_mean_total())

    return tf.add_n(tf.get_collection('losses'), name='total_loss'), nilboy
def _full_batch_training_op(self, inputs, cluster_idx_list, cluster_centers):
    """Creates an op for training for the full batch case.

    Per-cluster sums and counts are computed for every input (colocated with
    that input), combined, and the resulting means are assigned to
    `cluster_centers`.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row and specifies the cluster id
        corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.

    Returns:
      An op that assigns the recomputed centers to `cluster_centers`.
    """
    sums_per_input = []
    counts_per_input = []
    # Keeps the division finite for clusters with no members.
    epsilon = tf.constant(1e-6, dtype=inputs[0].dtype)

    for inp, cluster_idx in zip(inputs, cluster_idx_list):
        with ops.colocate_with(inp):
            sums_per_input.append(
                tf.unsorted_segment_sum(inp, cluster_idx, self._num_clusters))
            # One "1" per input row, shaped as a column, so the segment sum
            # yields the member count of each cluster.
            num_rows = tf.reshape(tf.shape(inp)[0], [-1])
            ones_column = tf.reshape(tf.ones(num_rows), [-1, 1])
            counts_per_input.append(
                tf.unsorted_segment_sum(
                    ones_column, cluster_idx, self._num_clusters))

    with ops.colocate_with(cluster_centers):
        total_sums = tf.add_n(sums_per_input)
        total_counts = tf.cast(
            tf.add_n(counts_per_input), sums_per_input[0].dtype)
        updated_centers = total_sums / (total_counts + epsilon)
        if self._clusters_l2_normalized():
            updated_centers = tf.nn.l2_normalize(updated_centers, dim=1)

    return tf.assign(cluster_centers, updated_centers)
def _tower_loss(iterator, num_of_classes, ignore_label, scope, reuse_variable):
    """Calculates the total loss on a single tower running the deeplab model.

    Also emits one scalar summary per collected loss and one for the
    regularization loss.

    Args:
      iterator: An iterator of type tf.data.Iterator for images and labels.
      num_of_classes: Number of classes for the dataset.
      ignore_label: Ignore label for the dataset.
      scope: Unique prefix string identifying the deeplab tower.
      reuse_variable: If the variable should be reused.

    Returns:
      The total loss for a batch of data.
    """
    # `None` (not False) is passed when reuse is not requested.
    reuse = True if reuse_variable else None
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
        _build_deeplab(
            iterator, {common.OUTPUT_TYPE: num_of_classes}, ignore_label)

    tower_losses = tf.losses.get_losses(scope=scope)
    for tower_loss in tower_losses:
        tf.summary.scalar('Losses/%s' % tower_loss.op.name, tower_loss)

    reg_loss = tf.losses.get_regularization_loss(scope=scope)
    tf.summary.scalar('Losses/%s' % reg_loss.op.name, reg_loss)

    return tf.add_n([tf.add_n(tower_losses), reg_loss])
def _make_objectives(self):
    """Define the policy and critic objectives and their scalar summaries.

    Sets `self.policy_params`, `self.critic_params`, `self.policy_objective`
    and `self.critic_objective`.
    """
    # TODO: Hacky, will cause clashes if multiple DPG instances.
    policy_params = self._policy_params()
    # Critic variables are picked by name: anything containing "critic/".
    critic_params = [var for var in tf.all_variables()
                     if "critic/" in var.name]
    self.policy_params = policy_params
    self.critic_params = critic_params

    # Policy objective: maximize on-policy critic activations
    # (expressed as minimizing the negated mean).
    mean_critic_over_time = tf.add_n(self.critic_on) / self.seq_length
    mean_critic = tf.reduce_mean(mean_critic_over_time)
    self.policy_objective = -mean_critic

    # DEV
    tf.scalar_summary("critic(a_pred).mean", mean_critic)

    # Critic objective: per-timestep mean sigmoid cross-entropy between the
    # off-policy critic pre-activations and the Q targets, averaged over the
    # sequence length. (An earlier squared-error variant,
    # tf.square(critic_off_t - q_targets_t), was replaced by this.)
    q_errors = [tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(critic_off_t, q_targets_t))
                for critic_off_t, q_targets_t in zip(self.critic_off_pre, self.q_targets)]
    self.critic_objective = tf.add_n(q_errors) / self.seq_length
    tf.scalar_summary("critic_objective", self.critic_objective)

    # Monitoring-only summaries.
    mean_critic_off = tf.reduce_mean(tf.add_n(self.critic_off)) / self.seq_length
    tf.scalar_summary("critic(a_explore).mean", mean_critic_off)

    tf.scalar_summary("a_pred.mean", tf.reduce_mean(tf.add_n(self.a_pred)) / self.seq_length)
    tf.scalar_summary("a_pred.maxabs", tf.reduce_max(tf.abs(tf.pack(self.a_pred))))
def top_sharded(self,
                sharded_body_output,
                sharded_targets,
                data_parallelism,
                weights_fn=common_layers.weights_nonzero):
    """Transform all shards of targets.

    Classes with cross-shard interaction will override this function.

    Args:
      sharded_body_output: A list of Tensors.
      sharded_targets: A list of Tensors.
      data_parallelism: a expert_utils.Parallelism object.
      weights_fn: function from targets to target weights.

    Returns:
      sharded_logits: A list of Tensors.
      training_loss: a Scalar.
    """
    sharded_logits = data_parallelism(
        self.top, sharded_body_output, sharded_targets)

    numerators, denominators = data_parallelism(
        common_layers.padded_cross_entropy,
        sharded_logits,
        sharded_targets,
        self._model_hparams.label_smoothing,
        weights_fn=weights_fn)

    total_numerator = tf.add_n(numerators)
    # The denominator is floored at 1.0 before dividing.
    total_denominator = tf.maximum(1.0, tf.add_n(denominators))
    training_loss = total_numerator / total_denominator
    return sharded_logits, training_loss
def build_model(self):
    """Build placeholders, encoder/generator graphs and the two train ops.

    The encoder and generator (decoder) variables are optimised separately:
    `self.optim_enc` steps the 'encoder' variables on `kl + lossL2_enc`, and
    `self.optim_dec` steps the 'generator' variables on
    `recons_loss + lossL2_dec`.
    """
    self.x = tf.placeholder(tf.float32, [self.reader.vocab_size], name="input")
    self.x_idx = tf.placeholder(tf.int32, [None], name='x_idx')  # mask paddings

    self.build_encoder()
    self.build_generator()

    self.objective = self.kl + self.recons_loss

    # optimizer for alternative update
    optimizer1 = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    optimizer2 = tf.train.AdamOptimizer(learning_rate=0.1)

    # Debug output. Each print takes a single pre-formatted argument so the
    # text is the same under Python 2 and Python 3.
    fullvars = tf.GraphKeys.TRAINABLE_VARIABLES
    print('fullvars: %s' % (fullvars,))

    enc_vars = tf.get_collection(fullvars, scope='encoder')
    print(enc_vars)

    dec_vars = tf.get_collection(fullvars, scope='generator')
    print(dec_vars)

    # L2 penalties over non-bias variables.
    self.lossL2_enc = tf.add_n(
        [tf.nn.l2_loss(v) for v in enc_vars if 'bias' not in v.name]) * 0.0001
    # NOTE(review): unlike the encoder term, this one has no 0.0001 factor -
    # confirm that is intended.
    self.lossL2_dec = tf.add_n(
        [tf.nn.l2_loss(v) for v in dec_vars if 'bias' not in v.name])

    print('lossL2_enc: %s' % (self.lossL2_enc,))
    print('lossL2_dec: %s' % (self.lossL2_dec,))

    enc_grads = tf.gradients(self.kl + self.lossL2_enc, enc_vars)
    dec_grads = tf.gradients(self.recons_loss + self.lossL2_dec, dec_vars)

    self.optim_enc = optimizer1.apply_gradients(zip(enc_grads, enc_vars))
    self.optim_dec = optimizer2.apply_gradients(zip(dec_grads, dec_vars))
def multilevel_rpn_losses(
        multilevel_anchors, multilevel_label_logits, multilevel_box_logits):
    """
    Sum the per-level RPN losses over all FPN levels.

    Args:
        multilevel_anchors: #lvl RPNAnchors
        multilevel_label_logits: #lvl tensors of shape HxWxA
        multilevel_box_logits: #lvl tensors of shape HxWxAx4

    Returns:
        label_loss, box_loss
    """
    num_lvl = len(cfg.FPN.ANCHOR_STRIDES)
    assert len(multilevel_anchors) == num_lvl
    assert len(multilevel_label_logits) == num_lvl
    assert len(multilevel_box_logits) == num_lvl

    label_losses = []
    box_losses = []
    with tf.name_scope('rpn_losses'):
        for lvl in range(num_lvl):
            anchors = multilevel_anchors[lvl]
            # Per-level scopes are numbered from 2 ('level2', 'level3', ...).
            level_label_loss, level_box_loss = rpn_losses(
                anchors.gt_labels, anchors.encoded_gt_boxes(),
                multilevel_label_logits[lvl], multilevel_box_logits[lvl],
                name_scope='level{}'.format(lvl + 2))
            label_losses.append(level_label_loss)
            box_losses.append(level_box_loss)

        total_label_loss = tf.add_n(label_losses, name='label_loss')
        total_box_loss = tf.add_n(box_losses, name='box_loss')
        add_moving_summary(total_label_loss, total_box_loss)
    return total_label_loss, total_box_loss
def __init__(self, nr_gpu, input, model):
    """Set up a GAN trainer replicated over `nr_gpu` GPUs.

    The model is built once per GPU tower, the per-tower discriminator and
    generator losses are averaged, and a single train op is created that
    runs one generator step followed by one discriminator step.

    Args:
        nr_gpu: number of GPUs; must be greater than 1.
        input: input source; it is wrapped in a `StagingInput`.
        model: object providing `get_inputs_desc`, `build_graph`, `d_loss`,
            `g_loss`, `d_vars`, `g_vars` and `get_optimizer`.
    """
    super(MultiGPUGANTrainer, self).__init__()
    assert nr_gpu > 1
    gpu_names = ['/gpu:{}'.format(k) for k in range(nr_gpu)]

    # Setup input
    staged_input = StagingInput(input)
    callbacks = staged_input.setup(model.get_inputs_desc())
    self.register_callback(callbacks)

    # Build the graph with multi-gpu replication
    def get_cost(*inputs):
        model.build_graph(*inputs)
        return [model.d_loss, model.g_loss]

    self.tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
    devices = [LeastLoadedDeviceSetter(d, gpu_names) for d in gpu_names]
    tower_costs = DataParallelBuilder.build_on_towers(
        list(range(nr_gpu)),
        lambda: self.tower_func(*staged_input.get_input_tensors()),
        devices)

    # Simply average the cost here. It might be faster to average the gradients
    with tf.name_scope('optimize'):
        scale = 1.0 / nr_gpu
        d_loss = tf.add_n([cost[0] for cost in tower_costs]) * scale
        g_loss = tf.add_n([cost[1] for cost in tower_costs]) * scale

        opt = model.get_optimizer()
        # run one d_min after one g_min
        g_min = opt.minimize(
            g_loss, var_list=model.g_vars,
            colocate_gradients_with_ops=True, name='g_op')
        with tf.control_dependencies([g_min]):
            d_min = opt.minimize(
                d_loss, var_list=model.d_vars,
                colocate_gradients_with_ops=True, name='d_op')

    # Define the training iteration
    self.train_op = d_min
def _hourglass(self, inputs, n, numOut, name = 'hourglass'):
    """ Hourglass Module

    An upper residual branch at the input resolution is added to a lower
    branch that is max-pooled by 2, processed (recursively while n > 0) and
    resized back up by a factor of 2.

    Args:
        inputs : Input Tensor
        n      : Number of downsampling step
        numOut : Number of Output Features (channels)
        name   : Name of the block
    """
    with tf.name_scope(name):
        # Upper branch: keeps the input resolution.
        skip = self._residual(inputs, numOut, name='up_1')

        # Lower branch: halve the resolution first.
        pooled = tf.contrib.layers.max_pool2d(
            inputs, [2, 2], [2, 2], padding='VALID')
        lower_in = self._residual(pooled, numOut, name='low_1')

        # Nest another hourglass until the depth budget is used up.
        if n > 0:
            lower_mid = self._hourglass(lower_in, n - 1, numOut, name='low_2')
        else:
            lower_mid = self._residual(lower_in, numOut, name='low_2')

        lower_out = self._residual(lower_mid, numOut, name='low_3')
        target_size = tf.shape(lower_out)[1:3] * 2
        upsampled = tf.image.resize_nearest_neighbor(
            lower_out, target_size, name='upsampling')

        if not self.modif:
            return tf.add_n([upsampled, skip], name='out_hg')
        # Use of RELU
        return tf.nn.relu(tf.add_n([upsampled, skip]), name='out_hg')
def combined_loss_G(self, batch_size_tf):
    """
    Builds the combined generator loss from the dice and FCN terms, plus an
    adversarial term when `self.adversarial` is set. The weighted sum is
    added to the 'losses' graph collection and the returned total sums
    everything in that collection.

    @param batch_size_tf: Forwarded to `loss_dice` and `lossfcn`.

    @return: When `self.adversarial` is truthy, the 4-tuple
             (total loss, dice term, fcn term, adversarial BCE term) with
             the individual terms unweighted. Otherwise the 3-tuple
             (total loss, lam_dice * dice term, lam_fcn * fcn term).
    """
    dice_term = loss_dice(self.G, self.CT_GT, self.num_classes, batch_size_tf)
    fcn_term = lossfcn(self.G, self.CT_GT, self.num_classes, batch_size_tf,
                       self.classweights)

    if not self.adversarial:
        weighted = self.lam_dice * dice_term + self.lam_fcn * fcn_term
        tf.add_to_collection('losses', weighted)
        total = tf.add_n(tf.get_collection('losses'), name='total_loss')
        return total, self.lam_dice * dice_term, self.lam_fcn * fcn_term

    # Adversarial term: discriminator logits are scored against all-ones targets.
    bce_term = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            self.D_logits_, tf.ones_like(self.D_)))
    weighted = (self.lam_dice * dice_term + self.lam_fcn * fcn_term
                + self.lam_adv * bce_term)
    tf.add_to_collection('losses', weighted)
    total = tf.add_n(tf.get_collection('losses'), name='total_loss')
    return total, dice_term, fcn_term, bce_term
def solve(global_step):
    """Add the solver (optimizer and update ops) for the collected losses.

    Sums the LOSSES and REGULARIZATION_LOSSES graph collections, emits
    scalar summaries, and builds the gradient update. When
    `FLAGS.update_bn` is set, the ops in the UPDATE_OPS collection are
    grouped in as well.

    Args:
        global_step: step tensor passed to the learning-rate schedule and
            incremented by the gradient update.

    Returns:
        A single op grouping all update ops.
    """
    # learning rate; 82783 is presumably the training-set size - TODO confirm
    lr = _configure_learning_rate(82783, global_step)
    optimizer = _configure_optimizer(lr)
    tf.summary.scalar('learning_rate', lr)

    # compute and apply gradient
    losses = tf.get_collection(tf.GraphKeys.LOSSES)
    regular_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    regular_loss = tf.add_n(regular_losses)
    out_loss = tf.add_n(losses)
    total_loss = tf.add_n(losses + regular_losses)

    summaries = (('total_loss', total_loss),
                 ('out_loss', out_loss),
                 ('regular_loss', regular_loss))
    for tag, value in summaries:
        tf.summary.scalar(tag, value)

    variables_to_train = _get_variables_to_train()
    gradients = optimizer.compute_gradients(
        total_loss, var_list=variables_to_train)
    grad_updates = optimizer.apply_gradients(
        gradients, global_step=global_step)
    update_ops = [grad_updates]

    # update moving mean and variance
    if FLAGS.update_bn:
        update_bns = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        update_ops.append(tf.group(*update_bns))

    return tf.group(*update_ops)
def _build(self, dataset, feature_transformer):
    """Average `self._build_once` over `self.averages` evaluations.

    When `self.samples_per_class` is set, the dataset is first replaced by a
    per-class subsample, which is cached in `self.dataset_map` keyed by the
    original dataset.

    Args:
      dataset: dataset passed to `_build_once` (after optional subsampling).
      feature_transformer: forwarded unchanged to `_build_once`.

    Returns:
      The mean of the losses returned by `_build_once`. The averaged
      statistics are written as scalar summaries.
    """
    if self.samples_per_class is not None:
        if dataset not in self.dataset_map:
            # datasets are outside of frames from while loops
            with tf.control_dependencies(None):
                self.dataset_map[dataset] = utils.sample_n_per_class(
                    dataset, self.samples_per_class)
        dataset = self.dataset_map[dataset]

    stat_values = collections.defaultdict(list)
    loss_values = []
    # TODO(lmetz) move this to ingraph control flow?
    for _ in xrange(self.averages):
        one_loss, one_stat = self._build_once(dataset, feature_transformer)
        loss_values.append(one_loss)
        for stat_name, stat_value in one_stat.items():
            stat_values[stat_name].append(stat_value)

    mean_stats = {stat_name: tf.add_n(values) / float(len(values))
                  for stat_name, values in stat_values.items()}

    # No update ops are collected, so this stays empty.
    summary_updates = []
    for stat_name, mean_value in mean_stats.items():
        tf.summary.scalar(stat_name, mean_value)

    with tf.control_dependencies(summary_updates):
        return tf.add_n(loss_values) / float(len(loss_values))
def _read(self, keys, redundant_states):
    """Read from the redundant states with the given keys.

    The keys and states are combined with `_comp_mul`. With more than one
    copy, the real and imaginary parts are each split into
    `self._num_copies` pieces along dimension 1 and averaged.

    Returns:
        The `_comp_mul` result when there is a single copy, otherwise a
        `(real, imag)` tuple of the per-copy averages.
    """
    product = _comp_mul(keys, redundant_states)
    if self._num_copies <= 1:
        return product

    real_parts = tf.split(1, self._num_copies, _comp_real(product))
    imag_parts = tf.split(1, self._num_copies, _comp_imag(product))
    mean_real = tf.add_n(real_parts) / self._num_copies
    mean_imag = tf.add_n(imag_parts) / self._num_copies
    return (mean_real, mean_imag)
def after_apply(self):
    """Build the ops that refresh the optimizer statistics after a step.

    Computes per-variable squared gradients and squared norms, tracks the
    squared norms with an exponential moving average, and then builds the
    curvature-range, gradient-variance and distance-to-optimum updates
    under a control dependency on the moving-average update.

    Returns:
        A single op grouping the moving-average update and the ops returned
        by `curvature_range`, `grad_variance` and `dist_to_opt`.
    """
    self._moving_averager = tf.train.ExponentialMovingAverage(
        decay=self._beta, zero_debias=self._zero_debias)
    # Identity check: `!= None` would call an overloaded `__ne__` if the
    # gradient container defines one.
    assert self._grads is not None and len(self._grads) > 0
    after_apply_ops = []

    # get per var g**2 and norm**2
    self._grad_squared = []
    self._grad_norm_squared = []
    for v, g in zip(self._tvars, self._grads):
        with ops.colocate_with(v):
            self._grad_squared.append(tf.square(g))
    self._grad_norm_squared = [
        tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared]

    # the following running average on squared norm of gradient is shared
    # by grad_var and dist_to_opt
    avg_op = self._moving_averager.apply(self._grad_norm_squared)
    with tf.control_dependencies([avg_op]):
        self._grad_norm_squared_avg = [
            self._moving_averager.average(val)
            for val in self._grad_norm_squared]
        # From here on both attributes hold a single summed tensor instead
        # of a per-variable list.
        self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
        self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
    after_apply_ops.append(avg_op)

    with tf.control_dependencies([avg_op]):
        curv_range_ops = self.curvature_range()
        after_apply_ops += curv_range_ops
        grad_var_ops = self.grad_variance()
        after_apply_ops += grad_var_ops
        dist_to_opt_ops = self.dist_to_opt()
        after_apply_ops += dist_to_opt_ops

    return tf.group(*after_apply_ops)
def __init__(self, gan=None, config=None, trainer=None, name="SelfSupervisedTrainHook"):
    """Build rotation-prediction auxiliary losses for the GAN.

    For each rotation index `i` in 0..3 the real and generated batches are
    rotated with `tf.image.rot90(..., i+1)`, stacked, passed through the
    discriminator's 'shared' layer and a component built from
    `config["r"]`, and scored with softmax cross-entropy against the one-hot
    label `i`. Even indices are skipped when the GAN's width and height
    differ.

    Sets `self.g_loss` and `self.d_loss` (scaled by `config.alpha` /
    `config.beta`, defaulting to 1.0) and registers them as the metrics
    'ssgl' and 'ssdl'.
    """
    super().__init__(config=config, gan=gan, trainer=trainer, name=name)
    generator_terms = []
    discriminator_terms = []

    if hasattr(self.gan.inputs, 'frames'):
        x = gan.x0
        g = gan.g0
    else:
        x = gan.inputs.x
        g = gan.generator.sample

    reuse = False
    for i in range(4):
        if gan.width() != gan.height() and i % 2 == 0:
            continue

        rotated_x = tf.image.rot90(x, i + 1)
        rotated_g = tf.image.rot90(g, i + 1)
        stacked = tf.concat([rotated_x, rotated_g], axis=0)
        shared = gan.create_discriminator(stacked, reuse=True).named_layers['shared']

        # The component's variables are created on the first pass only.
        r = gan.create_component(config["r"], input=shared, reuse=reuse)
        reuse = True
        gan.discriminator.add_variables(r)
        gan.generator.add_variables(r)

        labels = tf.one_hot(i, 4)
        # Index 0 of the component sample feeds the discriminator loss,
        # index 1 the generator loss.
        discriminator_terms.append(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=labels, logits=r.sample[0]))
        generator_terms.append(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=labels, logits=r.sample[1]))

    self.g_loss = (self.config.alpha or 1.0) * tf.add_n(generator_terms)
    self.d_loss = (self.config.beta or 1.0) * tf.add_n(discriminator_terms)

    self.gan.add_metric('ssgl', self.g_loss)
    self.gan.add_metric('ssdl', self.d_loss)
def _build_graph(self, inputs):
    """Build the classification graph around the Keras model and set `self.cost`.

    `self.cost` is the sum of the mean cross-entropy and the Keras model's
    regularization losses. Moving summaries are registered for the training
    error and the cost terms.

    Args:
        inputs: pair `(image, label)`.
    """
    image, label = inputs
    image = tf.expand_dims(image, 3)
    image = image * 2 - 1   # center the pixels values at zero

    with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
        keras_model = self._build_keras_model()
        logits = keras_model(image)

    # a Bx10 with probabilities; created only so the named op exists
    tf.nn.softmax(logits, name='prob')

    # a vector of length B with loss of each sample
    per_sample_cost = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    # the average cross-entropy loss
    cost = tf.reduce_mean(per_sample_cost, name='cross_entropy_loss')

    incorrect = symbolic_functions.prediction_incorrect(
        logits, label, name='incorrect')
    train_error = tf.reduce_mean(incorrect, name='train_error')
    summary.add_moving_summary(train_error)

    # this is how Keras manage regularizers
    regularization_cost = tf.add_n(keras_model.losses, name='regularize_loss')

    self.cost = tf.add_n([regularization_cost, cost], name='total_cost')
    summary.add_moving_summary(cost, regularization_cost, self.cost)

    # this is the keras naming
    summary.add_param_summary(('conv2d.*/kernel', ['histogram', 'rms']))
def sequence_loss_by_example(inputs, targets, weights, loss_function,
                             average_across_timesteps=True, name=None):
    """Sampled softmax loss for a sequence of inputs (per example).

    Args:
        inputs: List of 2D Tensors of shape [batch_size x hid_dim].
        targets: List of 1D batch-sized int32 Tensors of the same length as inputs.
        weights: List of 1D batch-sized float-Tensors of the same length as inputs.
        loss_function: Sampled softmax function (inputs, labels) -> loss
        average_across_timesteps: If set, divide the returned cost by the total
            label weight.
        name: Optional name for this operation, default: 'sequence_loss_by_example'.

    Returns:
        1D batch-sized float Tensor: The log-perplexity for each sequence.

    Raises:
        ValueError: If len(inputs) is different from len(targets) or len(weights).
    """
    num_steps = len(inputs)
    if len(targets) != num_steps or len(weights) != num_steps:
        raise ValueError('Lengths of logits, weights, and targets must be the same '
                         '%d, %d, %d.' % (len(inputs), len(weights), len(targets)))

    with tf.op_scope(inputs + targets + weights, name,
                     'sequence_loss_by_example'):
        # Weighted loss of every timestep, summed over time.
        weighted_losses = [
            loss_function(step_input, step_target) * step_weight
            for step_input, step_target, step_weight
            in zip(inputs, targets, weights)
        ]
        log_perps = tf.add_n(weighted_losses)

        if average_across_timesteps:
            # Just to avoid division by 0 for all-0 weights.
            total_weight = tf.add_n(weights) + 1e-12
            log_perps = log_perps / total_weight
    return log_perps
def _shake_shake_block(x, output_filters, stride, is_training):
    """Builds a full shake-shake sub layer.

    Two branches are built, each receiving its own forward and backward
    random scaling factor; the factors are normalized to sum to 1 across the
    branches. The branch outputs are summed and added to the skip connection.
    """
    batch_size = tf.shape(x)[0]

    def _draw_branch_scales():
        # One uniform [0, 1) sample per example for each of the two branches.
        return [
            tf.random_uniform(
                [batch_size, 1, 1, 1], minval=0, maxval=1, dtype=tf.float32)
            for _ in range(2)
        ]

    # Generate random numbers for scaling the branches
    forward_scales = _draw_branch_scales()
    backward_scales = _draw_branch_scales()

    # Normalize so that all sum to 1
    forward_total = tf.add_n(forward_scales)
    backward_total = tf.add_n(backward_scales)
    forward_scales = [scale / forward_total for scale in forward_scales]
    backward_scales = [scale / backward_total for scale in backward_scales]

    branch_outputs = []
    scale_pairs = zip(forward_scales, backward_scales)
    for branch_index, (forward, backward) in enumerate(scale_pairs):
        with tf.variable_scope('branch_{}'.format(branch_index)):
            branch_output = _shake_shake_branch(
                x, output_filters, stride, forward, backward, is_training)
            branch_outputs.append(branch_output)

    shortcut = _shake_shake_skip_connection(x, output_filters, stride)
    return shortcut + tf.add_n(branch_outputs)
def loss_sharded(self, sharded_top_out, sharded_targets, data_parallelism):
    """Compute loss for all shards.

    `self.loss` is applied to every shard through `data_parallelism`; the
    summed numerators are divided by the summed denominators, with the
    denominator floored at 1.0.
    """
    numerators, denominators = data_parallelism(
        self.loss, sharded_top_out, sharded_targets)
    total_numerator = tf.add_n(numerators)
    total_denominator = tf.maximum(1.0, tf.add_n(denominators))
    return total_numerator / total_denominator
def loss(logits, labels, lambs):
    """Per-class loss summed over all classes, plus the 'losses' collection.

    The logits are passed through `framwork.sig_func` and transposed so that
    each row belongs to one class. For every class a boolean row marks which
    batch entries carry that label; `framwork.loss_func` is applied to each
    (logit row, indicator row, lambda row) triple and the results are summed.

    Args:
        logits: tensor given to `framwork.sig_func`
            (presumably [BATCH_SIZE, NUM_CLASSES] - TODO confirm).
        labels: per-example class ids, compared against `range(NUM_CLASSES)`.
        lambs: tensor split into NUM_CLASSES rows, one per class.

    Returns:
        The sum of everything in the 'losses' collection (named
        'total_loss'), which includes the term added here.
    """
    # put a sigfunction on logits and then transpose
    logits = tf.transpose(framwork.sig_func(logits))

    # `range` is lazy on Python 3; hand tf.constant a concrete list.
    labels_unique = tf.constant(list(range(NUM_CLASSES)), dtype=tf.int32)
    labels_num = NUM_CLASSES

    # set the value of each row to True when it occurs in labels
    template = tf.tile(tf.expand_dims(labels_unique, dim=1), [1, BATCH_SIZE])
    labels_expand = tf.tile(tf.expand_dims(labels, dim=0), [labels_num, 1])
    indict_logic = tf.equal(labels_expand, template)

    # split the tensor along rows
    logit_list = tf.split(0, labels_num, logits)
    indict_logic_list = tf.split(0, labels_num, indict_logic)
    lambda_list = tf.split(0, NUM_CLASSES, lambs)

    # Build a real list: tf.add_n needs a list, and on Python 3 `map`
    # returns a lazy iterator.
    loss_list = [
        framwork.loss_func(class_logits, class_mask, class_lambda)
        for class_logits, class_mask, class_lambda
        in zip(logit_list, indict_logic_list, lambda_list)
    ]
    losses = tf.add_n(loss_list)
    tf.add_to_collection('losses', losses)

    # The total loss is the term above plus whatever else has been added to
    # the 'losses' collection (e.g. weight decay terms).
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
def create(self):
    """Build the discriminator and generator losses for a multi-part generator.

    The discriminator output is split into
    `len(children) + len(parents) + 1` batches laid out as
    `x, gp1, ..., gpn, gc1, ..., gcm`. The discriminator loss compares the
    real batch with the mean of the parent batches; the generator loss
    compares it with the mean of the child batches.

    Returns:
        `[d_loss, g_loss]`, also stored as `self.sample`, `self.d_loss` and
        `self.g_loss`.
    """
    gan = self.gan
    config = self.config
    ops = self.gan.ops

    num_children = len(gan.generator.children)
    num_parents = len(gan.generator.parents)
    split = num_children + num_parents + 1

    # Read as in the original; both values are replaced below.
    d_real = self.d_real
    d_fake = self.d_fake

    net = gan.discriminator.sample

    # Discriminator side: real vs. mean of the parent outputs.
    ds = self.split_batch(net, split)
    d_real = ds[0]
    d_fake = tf.add_n(ds[1:num_parents + 1]) / num_parents
    d_loss, _ = self._create(d_real, d_fake)

    # Generator side: real vs. mean of the child outputs.
    ds = self.split_batch(net, split)
    d_real = ds[0]
    d_fake = tf.add_n(ds[1 + num_parents:]) / num_children
    _, g_loss = self._create(d_real, d_fake)
    self.children_losses = self.split_batch(g_loss, num_children)

    reducer = config.reduce or tf.reduce_mean
    d_loss = ops.squash(d_loss, reducer)  # linear doesn't work with this
    g_loss = ops.squash(g_loss, config.reduce or tf.reduce_mean)

    self.sample = [d_loss, g_loss]
    self.d_loss = d_loss
    self.g_loss = g_loss
    return self.sample
def weight_decay(penalty_type, penalty):
    """Add weight decay.

    Only trainable variables of rank 2 are penalized, which is how bias
    variables are excluded.

    Args:
      penalty_type: 'l1' (sum of absolute values) or 'l2' (`tf.nn.l2_loss`).
      penalty: multiplier applied to the summed penalty.

    Returns:
      A scalar tensor containing the weight decay cost.

    Raises:
      NotImplementedError: If an unsupported penalty type is requested.
    """
    # exclude bias variables
    penalized = [var for var in tf.trainable_variables()
                 if var.get_shape().ndims == 2]

    with tf.name_scope('weight_decay'):
        if penalty_type == 'l1':
            terms = [tf.reduce_sum(tf.abs(var)) for var in penalized]
        elif penalty_type == 'l2':
            terms = [tf.nn.l2_loss(var) for var in penalized]
        else:
            raise NotImplementedError('Unsupported penalty_type %s' % penalty_type)
        cost = tf.add_n(terms)
        cost *= penalty
    return cost
def allreduce_grads_hierarchical(all_grads, devices, average=False):
    """
    Hierarchical allreduce for DGX-1 system.

    The 8 devices are treated as two groups of 4. For each variable, the
    gradients of each group are summed on one "main" device of that group,
    the two partial sums are added, and the total is copied to every device.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        devices ([str]): K str for the K devices.
        average (bool): average gradients or not.

    Returns:
        (K x N): same as input, but each grad is replaced by the sum over the
        K lists, or by the average when `average` is True.
    """
    num_gpu = len(devices)
    assert num_gpu == 8, num_gpu
    assert len(all_grads) == num_gpu, len(all_grads)
    group_size = num_gpu // 2

    def _group_start(main_gpu):
        # First device index of the group that `main_gpu` belongs to.
        return 0 if main_gpu < group_size else group_size

    per_var_results = []    # N x K
    for varid, grads in enumerate(zip(*all_grads)):
        # grads: K gradients
        # The main device rotates with the variable index.
        g0_main_gpu = varid % num_gpu
        g1_main_gpu = (g0_main_gpu + group_size) % num_gpu
        g0_start = _group_start(g0_main_gpu)
        g1_start = _group_start(g1_main_gpu)
        assert g0_start != g1_start
        g0_grads = grads[g0_start: g0_start + group_size]
        g1_grads = grads[g1_start: g1_start + group_size]

        with tf.device(devices[g0_main_gpu]):
            g0_agg = tf.add_n(g0_grads, name='group0_agg')

        with tf.device(devices[g1_main_gpu]):
            g1_agg = tf.add_n(g1_grads, name='group1_agg')
            g1_total_agg = tf.add(g0_agg, g1_agg, name='group1_total_agg')

        with tf.device(devices[g0_main_gpu]):
            g0_total_agg = tf.identity(g1_total_agg, name='group0_total_agg')

        per_device = []    # K aggregated grads
        g0_in_first_half = g0_main_gpu < group_size
        for k in range(num_gpu):
            # Every device reads the total from the main device of its own group.
            if (k < group_size) == g0_in_first_half:
                source_total = g0_total_agg
            else:
                source_total = g1_total_agg

            op_name = 'device{}_total_agg'.format(k)
            with tf.device(devices[k]):
                if average:
                    # TODO where to put average?
                    device_total_agg = tf.multiply(
                        source_total, 1.0 / num_gpu, name=op_name)
                else:
                    device_total_agg = tf.identity(source_total, name=op_name)
                per_device.append(device_total_agg)

        per_var_results.append(per_device)

    # transpose: N x K -> K x N
    return list(zip(*per_var_results))
def get_train_collection(self):
    """Return the summed training losses, keyed by name.

    Each value is the `tf.add_n` of one graph collection. The keys match the
    collection names except 'tot_losses', which sums the 'losses' collection.
    """
    key_to_collection = (
        ('rpn_loss_cls', 'rpn_loss_cls'),
        ('rpn_loss_box', 'rpn_loss_box'),
        ('loss_cls', 'loss_cls'),
        ('loss_box', 'loss_box'),
        ('tot_losses', 'losses'),
    )
    return {
        key: tf.add_n(tf.get_collection(collection))
        for key, collection in key_to_collection
    }
def _integral(lower, upper):
    """Integrate every component over [lower, upper] and sum the results.

    Reads `all_integrals` from the enclosing scope. For each entry, every
    component is integrated over the overlap of the requested region with
    that component's bounds; the summed components are multiplied by
    `f_scale / sum(normalisation_1)`.
    """
    terms = []
    for f_scale, nd_bounds, nd_integral, normalisation_1 in all_integrals:
        component_integrals = [
            _integrate_component(
                find_common_bounds([Region(lower, upper)], bounds), integral)
            for bounds, integral in zip(nd_bounds, nd_integral)
        ]
        terms.append(
            f_scale / tf.add_n(normalisation_1) * tf.add_n(component_integrals))
    return tf.add_n(terms)
def sequence_loss_by_example(logits, targets, weights, num_decoder_symbols,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
    """Weighted cross-entropy loss for a sequence of logits (per example).

    Args:
      logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols].
      targets: list of 1D batch-sized int32-Tensors of the same length as logits.
      weights: list of 1D batch-sized float-Tensors of the same length as logits.
      num_decoder_symbols: integer, number of decoder symbols (output classes).
      average_across_timesteps: If set, divide the returned cost by the total
        label weight.
      softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch
        to be used instead of the standard softmax (the default if this is None).
      name: optional name for this operation, default: "sequence_loss_by_example".

    Returns:
      1D batch-sized float Tensor: the log-perplexity (natural log) for each
      sequence.

    Raises:
      ValueError: if len(logits) is different from len(targets) or len(weights).
    """
    if len(targets) != len(logits) or len(weights) != len(logits):
        raise ValueError("Lengths of logits, weights, and targets must be the same "
                         "%d, %d, %d." % (len(logits), len(weights), len(targets)))
    with tf.op_scope(logits + targets + weights, name,
                     "sequence_loss_by_example"):
        batch_size = tf.shape(targets[0])[0]
        log_perp_list = []
        # Number of entries in one flattened [batch_size, num_decoder_symbols]
        # one-hot matrix.
        length = batch_size * num_decoder_symbols
        # The three lists were validated above to have equal length.
        for i, (logit, target_ids, weight) in enumerate(
                zip(logits, targets, weights)):
            if softmax_loss_function is None:
                # TODO(lukaszkaiser): There is no SparseCrossEntropy in TensorFlow, so
                # we need to first cast targets into a dense representation, and as
                # SparseToDense does not accept batched inputs, we need to do this by
                # re-indexing and re-sizing. When TensorFlow adds SparseCrossEntropy,
                # rewrite this method.
                indices = target_ids + num_decoder_symbols * tf.range(batch_size)
                with tf.device("/cpu:0"):  # Sparse-to-dense must happen on CPU for now.
                    dense = tf.sparse_to_dense(
                        indices, tf.expand_dims(length, 0), 1.0, 0.0)
                target = tf.reshape(dense, [-1, num_decoder_symbols])
                crossent = tf.nn.softmax_cross_entropy_with_logits(
                    logit, target,
                    name="SequenceLoss/CrossEntropy{0}".format(i))
            else:
                crossent = softmax_loss_function(logit, target_ids)
            # Per-example loss of this timestep, scaled by its weight.
            log_perp_list.append(crossent * weight)

        # Element-wise sum over timesteps: one value per example.
        log_perps = tf.add_n(log_perp_list)
        if average_across_timesteps:
            # Element-wise sum of the weights: one total per example.
            total_size = tf.add_n(weights)
            # Just to avoid division by 0 for all-0 weights. The epsilon is
            # added once; the original added it twice.
            total_size += 1e-12
            log_perps /= total_size
    return log_perps
def __init__(self, params, network, loss, score, optimizer, image_summary=True):
    """Wire up result directories, summaries and the training op.

    Args:
        params: mapping read for the 'experiment', 'trial' and
            'weight_decay' keys.
        network: object exposing `weights`, `biases`, `x`, `y_` and
            `output()`.
        loss: base loss tensor; it is added to the 'losses' collection.
        score: tensor reported as the 'score/train' scalar summary.
        optimizer: object whose `minimize` builds the training op.
        image_summary: when true, also register the reference/distorted/
            cleaned image summaries.
    """
    self.params = params
    self.network = network
    self.loss = loss
    self.score = score
    self.optimizer = optimizer

    # Output layout: <module dir>/results/<experiment>/<trial>/
    self.root_path = os.path.dirname(os.path.realpath(__file__))
    self.results_path = os.path.join(self.root_path, 'results')
    self.experiment_path = os.path.join(self.results_path,
                                        params['experiment'])
    self.trial_path = os.path.join(self.experiment_path, params['trial'])
    self.checkpoint_path = os.path.join(self.results_path,
                                        self.params['experiment'],
                                        self.params['trial'])
    self.model_path = os.path.join(self.checkpoint_path, 'model.ckpt')

    # Create the directories top-down, one level at a time.
    for directory in (self.results_path, self.experiment_path,
                      self.trial_path):
        if not os.path.exists(directory):
            os.mkdir(directory)

    # One L2 penalty and one pair of histograms per layer.
    for layer, layer_weights in enumerate(self.network.weights):
        penalty = tf.mul(tf.nn.l2_loss(layer_weights),
                         self.params['weight_decay'])
        tf.add_to_collection('losses', penalty)
        tf.histogram_summary('weights/layer #%d' % layer, layer_weights)
        tf.histogram_summary('biases/layer #%d' % layer,
                             self.network.biases[layer])

    # Snapshot the collection before the base loss joins it, so this sum
    # covers whatever is in 'losses' at this point (the weight penalties).
    weight_loss = tf.add_n(tf.get_collection('losses'))
    tf.add_to_collection('losses', self.loss)
    total_loss = tf.add_n(tf.get_collection('losses'))

    tf.scalar_summary('loss/base', self.loss)
    tf.scalar_summary('loss/weights', weight_loss)
    tf.scalar_summary('loss/total', total_loss)

    if image_summary:
        tf.image_summary('images/reference', self.network.y_)
        tf.image_summary('images/distorted', self.network.x)
        tf.image_summary('images/cleaned',
                         tf.minimum(self.network.output(), 1.))

    tf.scalar_summary('score/train', self.score)

    # Merge now: the validation/test summaries created below are kept out of
    # the merged training summary and are run through their own placeholder.
    self.train_summary_step = tf.merge_all_summaries()
    self.score_placeholder = tf.placeholder(tf.float32)
    self.val_summary_step = tf.scalar_summary('score/validation',
                                              self.score_placeholder)
    self.test_summary_step = tf.scalar_summary('score/test',
                                               self.score_placeholder)
    self.summary_writer = tf.train.SummaryWriter(self.trial_path)

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.train_step = self.optimizer.minimize(total_loss,
                                              global_step=self.global_step)
    self.saver = tf.train.Saver()
def train_3d_nn():
    """Train the 3D convolutional classifier and log metrics to TensorBoard.

    Steps, in order:
      1. Load chunk ids and data via `get_ids` / `get_data`, collapse the four
         label classes into two and make a stratified 80/20 split.
      2. Build the graph: five conv/ReLU/max-pool stages, four dense layers,
         a softmax output and a large set of overall and per-class summaries.
      3. Run `FLAGS.max_iterations` iterations. Every
         `FLAGS.iteration_analysis`-th iteration (and the last one) saves the
         model and runs the validation set instead of a training step; all
         other iterations run one Adam step on the softmax cross-entropy.

    The function takes no arguments and returns nothing; results are written
    through `save_model` and the TensorBoard summary writers.
    """
    time0 = time.time()
    chunks_ids = get_ids(DATA_PATH)
    X, Y = get_data(chunks_ids, DATA_PATH)
    print("Total time to load data: " +
          str(timedelta(seconds=int(round(time.time() - time0)))))

    print('Splitting into train, validation sets')
    Y = np.argmax(Y, axis=1)
    # Crunch 4 classes to 2
    Y[Y == 2] = 1
    Y[Y == 3] = 1
    train_x, validation_x, train_y, validation_y = (
        model_selection.train_test_split(
            X, Y, random_state=42, stratify=Y, test_size=0.20))

    # Per-class weights; used both for the per-sample weights fed at run time
    # and for the weighted log loss built into the graph.
    class_weight_values = [69838.0 / 40513.0, 69838.0 / 29325.0]
    klass_weights = np.asarray(class_weight_values)

    # Free up X and Y memory
    del X
    del Y

    print("Total time to split: " +
          str(timedelta(seconds=int(round(time.time() - time0)))))
    print('train_x: {}'.format(train_x.shape))
    print('validation_x: {}'.format(validation_x.shape))
    print('train_y: {}'.format(train_y.shape))
    print('validation_y: {}'.format(validation_y.shape))

    # One-hot encode the integer labels (the "+ 0" turns booleans into ints).
    train_y = (np.arange(FLAGS.num_classes) == train_y[:, None]) + 0
    validation_y = (np.arange(FLAGS.num_classes) == validation_y[:, None]) + 0

    # Seed numpy random to generate identical random numbers every time
    # (used in batching)
    np.random.seed(42)

    def get_validation_batch(validation_x_ids, validation_y, batch_number):
        """Load validation batch `batch_number` from disk.

        The final batch may be smaller than FLAGS.batch_size.
        """
        num_images = len(validation_x_ids)
        start_index = batch_number * FLAGS.batch_size
        end_index = min(start_index + FLAGS.batch_size, num_images)
        real_batch_size = end_index - start_index

        validation_x = np.ndarray([
            real_batch_size, FLAGS.chunk_size, FLAGS.chunk_size,
            FLAGS.chunk_size, 1
        ], dtype=np.float32)

        for count, chunk_id in enumerate(
                validation_x_ids[start_index:end_index]):
            chunk = np.load(DATA_PATH + chunk_id + '_X.npy').astype(
                np.float32, copy=False)
            validation_x[count, :, :, :, :] = img_to_rgb(chunk)

        return validation_x, validation_y[start_index:end_index]

    def feed_dict(is_train, batch_number=0):
        """Build the feed dict for one training or validation run."""
        if is_train:
            x_batch, y_batch = get_batch(train_x, train_y)
            k = FLAGS.dropout
        else:
            x_batch, y_batch = get_validation_batch(
                validation_x, validation_y, batch_number)
            k = 1.0

        # Per-sample weight = weight of the sample's (one-hot) class.
        crss_entrpy_weights = np.amax(y_batch * klass_weights, axis=1)

        return {
            x: x_batch,
            y_labels: y_batch,
            keep_prob: k,
            cross_entropy_weights: crss_entrpy_weights
        }

    def conv_relu(inputs, num_filters, num_channels, suffix):
        """Add one 3x3x3 convolution followed by a ReLU."""
        conv_out, _ = conv3d(inputs=inputs, filter_size=3,
                             num_filters=num_filters,
                             num_channels=num_channels,
                             strides=[1, 3, 3, 3, 1],
                             layer_name='conv' + suffix)
        return relu_3d(inputs=conv_out, layer_name='relu' + suffix)

    def max_pool(inputs, layer_name):
        """Add one 2x2x2 max-pooling layer with stride 2."""
        return max_pool_3d(inputs=inputs, filter_size=[1, 2, 2, 2, 1],
                           strides=[1, 2, 2, 2, 1], layer_name=layer_name)

    # Graph construction
    graph = tf.Graph()
    with graph.as_default():
        x = tf.placeholder(tf.float32, shape=[
            None, FLAGS.chunk_size, FLAGS.chunk_size, FLAGS.chunk_size, 1
        ], name='x')
        # NOTE(review): this placeholder is never fed; `y` is rebound to the
        # softmax output below. It is kept so the graph's node names do not
        # change.
        y = tf.placeholder(tf.float32, shape=[None, FLAGS.num_classes],
                           name='y')
        y_labels = tf.placeholder(tf.float32,
                                  shape=[None, FLAGS.num_classes],
                                  name='y_labels')
        cross_entropy_weights = tf.placeholder(tf.float32, shape=[None],
                                               name='cross_entropy_weights')
        keep_prob = tf.placeholder(tf.float32)

        class_weights_base = tf.ones_like(y_labels)
        class_weights = tf.multiply(class_weights_base, class_weight_values)

        # layer1
        net = conv_relu(x, 16, 1, '1_1')
        net = conv_relu(net, 16, 16, '1_2')
        net = max_pool(net, 'pool1')

        # layer2
        net = conv_relu(net, 32, 16, '2_1')
        net = conv_relu(net, 32, 32, '2_2')
        net = max_pool(net, 'pool2')

        # layer3
        net = conv_relu(net, 64, 32, '3_1')
        net = conv_relu(net, 64, 64, '3_2')
        net = conv_relu(net, 64, 64, '3_3')
        net = max_pool(net, 'pool3')

        # layer4
        net = conv_relu(net, 128, 64, '4_1')
        net = conv_relu(net, 128, 128, '4_2')
        net = conv_relu(net, 128, 128, '4_3')
        net = max_pool(net, 'pool4')

        # layer5
        net = conv_relu(net, 256, 128, '5_1')
        net = conv_relu(net, 256, 256, '5_2')
        net = conv_relu(net, 256, 256, '5_3')
        pool5_out = max_pool(net, 'pool5')

        flatten5_out, _ = flatten_3d(pool5_out, layer_name='flatten5')

        # layer6
        dense6_out = dense_3d(inputs=flatten5_out,
                              num_inputs=int(flatten5_out.shape[1]),
                              num_outputs=4096, layer_name='fc6')
        relu6_out = relu_3d(inputs=dense6_out, layer_name='relu6')
        # Use the fed keep_prob placeholder (FLAGS.dropout while training,
        # 1.0 during validation) so dropout is disabled when validating.
        dropout6_out = dropout_3d(inputs=relu6_out, keep_prob=keep_prob,
                                  layer_name='drop6')

        # layer7
        dense7_out = dense_3d(inputs=dropout6_out,
                              num_inputs=int(dropout6_out.shape[1]),
                              num_outputs=4096, layer_name='fc7')
        relu7_out = relu_3d(inputs=dense7_out, layer_name='relu7')
        dropout7_out = dropout_3d(inputs=relu7_out, keep_prob=keep_prob,
                                  layer_name='drop7')

        # layer8
        dense8_out = dense_3d(inputs=dropout7_out,
                              num_inputs=int(dropout7_out.shape[1]),
                              num_outputs=1000, layer_name='fc8')

        # layer9
        dense9_out = dense_3d(inputs=dense8_out,
                              num_inputs=int(dense8_out.shape[1]),
                              num_outputs=FLAGS.num_classes,
                              layer_name='fc9')

        # Final softmax
        y = tf.nn.softmax(dense9_out)

        # Overall Metrics Calculations
        with tf.name_scope('log_loss'):
            log_loss = tf.losses.log_loss(y_labels, y, epsilon=10e-15)
            tf.summary.scalar('log_loss', log_loss)

        with tf.name_scope('softmax_cross_entropy'):
            softmax_cross_entropy = tf.losses.softmax_cross_entropy(
                y_labels, dense9_out)
            tf.summary.scalar('softmax_cross_entropy', softmax_cross_entropy)

        with tf.name_scope('accuracy'):
            correct_prediction = tf.equal(tf.argmax(y, 1),
                                          tf.argmax(y_labels, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            tf.summary.scalar('accuracy', accuracy)

        with tf.name_scope('weighted_log_loss'):
            # NOTE(review): tf.add_n fails on an empty list, so this assumes
            # the layer helpers register at least one regularization loss.
            weighted_log_loss = tf.losses.log_loss(
                y_labels, y, weights=class_weights,
                epsilon=10e-15) + tf.add_n(
                    tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            tf.summary.scalar('weighted_log_loss', weighted_log_loss)

        with tf.name_scope('weighted_softmax_cross_entropy'):
            weighted_softmax_cross_entropy = (
                tf.losses.softmax_cross_entropy(
                    y_labels, dense9_out, weights=cross_entropy_weights))
            tf.summary.scalar('weighted_softmax_cross_entropy',
                              weighted_softmax_cross_entropy)

        with tf.name_scope('sparse_softmax_cross_entropy'):
            y_labels_argmax_int = tf.to_int32(tf.argmax(y_labels, axis=1))
            sparse_softmax_cross_entropy = (
                tf.losses.sparse_softmax_cross_entropy(
                    labels=y_labels_argmax_int, logits=dense9_out))
            tf.summary.scalar('sparse_softmax_cross_entropy',
                              sparse_softmax_cross_entropy)

        with tf.name_scope('weighted_sparse_softmax_cross_entropy'):
            y_labels_argmax_int = tf.to_int32(tf.argmax(y_labels, axis=1))
            weighted_sparse_softmax_cross_entropy = (
                tf.losses.sparse_softmax_cross_entropy(
                    labels=y_labels_argmax_int, logits=dense9_out,
                    weights=cross_entropy_weights))
            tf.summary.scalar('weighted_sparse_softmax_cross_entropy',
                              weighted_sparse_softmax_cross_entropy)

        # Class Based Metrics calculations (two classes: 0 and 1)
        class_ids = (0, 1)
        y_pred_class = tf.argmax(y, 1)
        y_labels_class = tf.argmax(y_labels, 1)

        # Rows are true labels, columns are predictions.
        confusion_matrix = tf.confusion_matrix(
            y_labels_class, y_pred_class, num_classes=FLAGS.num_classes)

        row_sums = [tf.reduce_sum(confusion_matrix[k, :]) for k in class_ids]
        col_sums = [tf.reduce_sum(confusion_matrix[:, k]) for k in class_ids]
        sum_all = tf.reduce_sum(confusion_matrix[:, :])

        # NOTE(review): the ratios below divide integer tensors and are not
        # guarded against empty rows/columns — confirm that is acceptable.
        with tf.name_scope('precision'):
            precision = [confusion_matrix[k, k] / col_sums[k]
                         for k in class_ids]
            for k in class_ids:
                tf.summary.scalar('precision_%d' % k, precision[k])

        with tf.name_scope('recall'):
            recall = [confusion_matrix[k, k] / row_sums[k]
                      for k in class_ids]
            for k in class_ids:
                tf.summary.scalar('recall_%d' % k, recall[k])

        with tf.name_scope('specificity'):
            true_negatives = []
            false_positives = []
            specificity = []
            for k in class_ids:
                tn = sum_all - (row_sums[k] + col_sums[k] -
                                confusion_matrix[k, k])
                fp = col_sums[k] - confusion_matrix[k, k]
                true_negatives.append(tn)
                false_positives.append(fp)
                specificity.append(tn / (tn + fp))
            for k in class_ids:
                tf.summary.scalar('specificity_%d' % k, specificity[k])

        with tf.name_scope('true_positives'):
            true_positives = [confusion_matrix[k, k] for k in class_ids]
            for k in class_ids:
                tf.summary.scalar('true_positives_%d' % k, true_positives[k])

        with tf.name_scope('true_negatives'):
            for k in class_ids:
                tf.summary.scalar('true_negatives_%d' % k, true_negatives[k])

        with tf.name_scope('false_positives'):
            for k in class_ids:
                tf.summary.scalar('false_positives_%d' % k,
                                  false_positives[k])

        with tf.name_scope('false_negatives'):
            false_negatives = [row_sums[k] - true_positives[k]
                               for k in class_ids]
            for k in class_ids:
                tf.summary.scalar('false_negatives_%d' % k,
                                  false_negatives[k])

        with tf.name_scope('log_loss_by_class'):
            # Slice the class column ([:, k]); indexing with [k] would pick
            # the k-th example of the batch instead of the k-th class.
            log_loss_by_class = [
                tf.losses.log_loss(y_labels[:, k], y[:, k], epsilon=10e-15)
                for k in class_ids
            ]
            # added extra '_' to avoid tensorboard name collision with the
            # main log_loss metric
            for k in class_ids:
                tf.summary.scalar('log_loss__%d' % k, log_loss_by_class[k])

        with tf.name_scope('softmax_cross_entropy_by_class'):
            # Weighting by the one-hot column keeps only the examples whose
            # true label is class k; the loss is averaged over those.
            softmax_cross_entropy_by_class = [
                tf.losses.softmax_cross_entropy(
                    y_labels, dense9_out, weights=y_labels[:, k])
                for k in class_ids
            ]
            for k in class_ids:
                tf.summary.scalar('softmax_cross_entropy_%d' % k,
                                  softmax_cross_entropy_by_class[k])

        with tf.name_scope('accuracy_by_class'):
            accuracy_by_class = [
                (true_positives[k] + true_negatives[k]) /
                (true_positives[k] + false_positives[k] +
                 false_negatives[k] + true_negatives[k])
                for k in class_ids
            ]
            for k in class_ids:
                tf.summary.scalar('accuracy_%d' % k, accuracy_by_class[k])

        with tf.name_scope('weighted_log_loss_by_class'):
            weighted_log_loss_by_class = [
                tf.losses.log_loss(y_labels[:, k], y[:, k],
                                   weights=class_weights[:, k],
                                   epsilon=10e-15)
                for k in class_ids
            ]
            for k in class_ids:
                tf.summary.scalar('weighted_log_loss_%d' % k,
                                  weighted_log_loss_by_class[k])

        with tf.name_scope('f1_score_by_class'):
            f1_score = [
                2 * (precision[k] * recall[k]) / (precision[k] + recall[k])
                for k in class_ids
            ]
            for k in class_ids:
                tf.summary.scalar('f1_score_%d' % k, f1_score[k])

        with tf.name_scope('train'):
            optimizer = tf.train.AdamOptimizer(
                learning_rate=1e-4,
                name='adam_optimizer').minimize(softmax_cross_entropy)

        merged = tf.summary.merge_all()
        saver = tf.train.Saver()

    # Setting up config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = FLAGS.allow_growth
    config.log_device_placement = FLAGS.log_device_placement
    config.allow_soft_placement = FLAGS.allow_soft_placement

    # timestamp used to identify the start of run
    start_timestamp = str(int(time.time()))
    model_id = str(uuid.uuid4())

    # Name used to save all artifacts of run
    run_name = 'runType={0:}_timestamp={1:}_batchSize={2:}_maxIterations={3:}_numTrain={4:}_numValidation={5:}_modelId={6:}'
    train_run_name = run_name.format('train', start_timestamp,
                                     FLAGS.batch_size, FLAGS.max_iterations,
                                     train_x.shape[0], validation_x.shape[0],
                                     model_id)
    test_run_name = run_name.format('test', start_timestamp,
                                    FLAGS.batch_size, FLAGS.max_iterations,
                                    train_x.shape[0], validation_x.shape[0],
                                    model_id)

    print('Run_name: {}'.format(train_run_name))

    # Global step counter for validation summaries across all validation runs.
    k_count = 0
    # The session context manager closes the session on exit.
    with tf.Session(graph=graph, config=config) as sess:
        train_writer = tf.summary.FileWriter(
            TENSORBOARD_SUMMARIES + train_run_name, sess.graph)
        test_writer = tf.summary.FileWriter(
            TENSORBOARD_SUMMARIES + test_run_name, sess.graph)
        try:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])

            for i in tqdm(range(FLAGS.max_iterations)):
                if (i % FLAGS.iteration_analysis == 0) or (
                        i == (FLAGS.max_iterations - 1)):
                    save_model(sess, model_id, saver)
                    # Validation
                    num_batches = int(
                        math.ceil(float(len(validation_x)) /
                                  FLAGS.batch_size))
                    for k in range(num_batches):
                        _, step_summary = sess.run(
                            [y, merged], feed_dict=feed_dict(False, k))
                        test_writer.add_summary(step_summary, k_count)
                        k_count = k_count + 1
                else:
                    # Train
                    _, step_summary = sess.run(
                        [optimizer, merged], feed_dict=feed_dict(True))
                    train_writer.add_summary(step_summary, i)
        finally:
            # Flush and close the event files even if training is interrupted.
            train_writer.close()
            test_writer.close()
def main():
    """Create the model and start the training.

    Builds a multi-scale (1.0, 0.75, 0.5) DeepLab-ResNet graph with shared
    weights, sums the per-scale and fused pixel-wise softmax losses plus L2
    weight decay, and trains with gradient accumulation over
    `args.grad_update_every` batches per step. Summaries and checkpoints are
    written every `args.save_pred_every` steps.
    """
    args = get_arguments()
    # Order GPUs by PCI bus id so CUDA_VISIBLE_DEVICES indices are stable.
    os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = args.GPU

    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    tf.set_random_seed(args.random_seed)

    # Create queue coordinator.
    coord = tf.train.Coordinator()

    # Load reader.
    with tf.name_scope("create_inputs"):
        reader = ImageReader(args.data_dir, args.data_list, input_size,
                             args.random_scale, args.random_mirror,
                             args.ignore_label, IMG_MEAN, coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)
        image_batch075 = tf.image.resize_images(
            image_batch, [int(h * 0.75), int(w * 0.75)])
        image_batch05 = tf.image.resize_images(
            image_batch, [int(h * 0.5), int(w * 0.5)])

    # Create network. The three scales share one set of variables.
    with tf.variable_scope('', reuse=False):
        net = DeepLabResNetModel_34({'data': image_batch},
                                    is_training=args.is_training,
                                    num_classes=args.num_classes)
    with tf.variable_scope('', reuse=True):
        net075 = DeepLabResNetModel_34({'data': image_batch075},
                                       is_training=args.is_training,
                                       num_classes=args.num_classes)
    with tf.variable_scope('', reuse=True):
        net05 = DeepLabResNetModel_34({'data': image_batch05},
                                      is_training=args.is_training,
                                      num_classes=args.num_classes)
    # For a small batch size, it is better to keep
    # the statistics of the BN layers (running means and variances)
    # frozen, and to not update the values provided by the pre-trained model.
    # If is_training=True, the statistics will be updated during the training.
    # Note that is_training=False still updates BN parameters gamma (scale)
    # and beta (offset) if they are presented in var_list of the optimiser
    # definition.

    # Predictions.
    raw_output100 = net.layers['fc1_voc12']
    raw_output075 = net075.layers['fc1_voc12']
    raw_output05 = net05.layers['fc1_voc12']
    # Fuse the scales with an element-wise max at full-scale resolution.
    raw_output = tf.reduce_max(tf.stack([
        raw_output100,
        tf.image.resize_images(raw_output075,
                               tf.shape(raw_output100)[1:3, ]),
        tf.image.resize_images(raw_output05,
                               tf.shape(raw_output100)[1:3, ])
    ]), axis=0)

    # Which variables to load. Running means and variances are not trainable,
    # thus all_variables() should be restored.
    restore_var = [
        v for v in tf.global_variables()
        if 'fc' not in v.name or not args.not_restore_last
    ]
    all_trainable = [
        v for v in tf.trainable_variables()
        if 'beta' not in v.name and 'gamma' not in v.name
    ]
    fc_trainable = [v for v in all_trainable if 'fc' in v.name]
    # NOTE(review): earlier comments labelled these groups "lr * 1.0",
    # "lr * 10.0" and "lr * 20.0", but all three optimizers below are built
    # with the same learning rate — confirm whether that is intended.
    conv_trainable = [v for v in all_trainable if 'fc' not in v.name]
    fc_w_trainable = [v for v in fc_trainable if 'weights' in v.name]
    fc_b_trainable = [v for v in fc_trainable if 'biases' in v.name]
    assert (len(all_trainable) == len(fc_trainable) + len(conv_trainable))
    assert (len(fc_trainable) == len(fc_w_trainable) + len(fc_b_trainable))

    # Predictions: ignoring all predictions with labels greater or equal than
    # n_classes
    raw_prediction = tf.reshape(raw_output, [-1, args.num_classes])
    raw_prediction100 = tf.reshape(raw_output100, [-1, args.num_classes])
    raw_prediction075 = tf.reshape(raw_output075, [-1, args.num_classes])
    raw_prediction05 = tf.reshape(raw_output05, [-1, args.num_classes])

    label_proc = prepare_label(label_batch,
                               tf.stack(raw_output.get_shape()[1:3]),
                               num_classes=args.num_classes,
                               one_hot=False)  # [batch_size, h, w]
    label_proc075 = prepare_label(label_batch,
                                  tf.stack(raw_output075.get_shape()[1:3]),
                                  num_classes=args.num_classes,
                                  one_hot=False)
    label_proc05 = prepare_label(label_batch,
                                 tf.stack(raw_output05.get_shape()[1:3]),
                                 num_classes=args.num_classes,
                                 one_hot=False)

    raw_gt = tf.reshape(label_proc, [-1, ])
    raw_gt075 = tf.reshape(label_proc075, [-1, ])
    raw_gt05 = tf.reshape(label_proc05, [-1, ])

    indices = tf.squeeze(
        tf.where(tf.less_equal(raw_gt, args.num_classes - 1)), 1)
    indices075 = tf.squeeze(
        tf.where(tf.less_equal(raw_gt075, args.num_classes - 1)), 1)
    indices05 = tf.squeeze(
        tf.where(tf.less_equal(raw_gt05, args.num_classes - 1)), 1)

    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    gt075 = tf.cast(tf.gather(raw_gt075, indices075), tf.int32)
    gt05 = tf.cast(tf.gather(raw_gt05, indices05), tf.int32)

    # The fused output and the full-scale output share the same resolution,
    # so both are filtered with the same indices and labels.
    prediction = tf.gather(raw_prediction, indices)
    prediction100 = tf.gather(raw_prediction100, indices)
    prediction075 = tf.gather(raw_prediction075, indices075)
    prediction05 = tf.gather(raw_prediction05, indices05)

    # Pixel-wise softmax loss.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=prediction, labels=gt)
    loss100 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=prediction100, labels=gt)
    loss075 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=prediction075, labels=gt075)
    loss05 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=prediction05, labels=gt05)
    l2_losses = [
        args.weight_decay * tf.nn.l2_loss(v)
        for v in tf.trainable_variables() if 'weights' in v.name
    ]
    reduced_loss = (tf.reduce_mean(loss) + tf.reduce_mean(loss100) +
                    tf.reduce_mean(loss075) + tf.reduce_mean(loss05) +
                    tf.add_n(l2_losses))
    tf.summary.scalar('loss', reduced_loss)

    # Processed predictions: for visualisation.
    raw_output_up = tf.image.resize_bilinear(raw_output,
                                             tf.shape(image_batch)[1:3, ])
    raw_output_up = tf.argmax(raw_output_up, dimension=3)
    pred = tf.expand_dims(raw_output_up, dim=3)

    # Image summary.
    images_summary = tf.py_func(
        inv_preprocess, [image_batch, args.save_num_images, IMG_MEAN],
        tf.uint8)
    labels_summary = tf.py_func(
        decode_labels,
        [label_batch, args.save_num_images, args.num_classes], tf.uint8)
    preds_summary = tf.py_func(
        decode_labels, [pred, args.save_num_images, args.num_classes],
        tf.uint8)

    tf.summary.image(
        'images',
        tf.concat(axis=2,
                  values=[images_summary, labels_summary, preds_summary]),
        max_outputs=args.save_num_images)  # Concatenate row-wise.

    # Define loss and optimisation parameters.
    # Polynomial decay: base_lr * (1 - step / num_steps) ** power.
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(
        base_lr, tf.pow((1 - step_ph / args.num_steps), args.power))
    tf.summary.scalar('learning_rate', learning_rate)

    opt_conv = tf.train.AdamOptimizer(learning_rate)
    opt_fc_w = tf.train.AdamOptimizer(learning_rate)
    opt_fc_b = tf.train.AdamOptimizer(learning_rate)

    # Define a variable to accumulate gradients.
    trainable_in_order = conv_trainable + fc_w_trainable + fc_b_trainable
    accum_grads = [
        tf.Variable(tf.zeros_like(v.initialized_value()), trainable=False)
        for v in trainable_in_order
    ]

    # Define an operation to clear the accumulated gradients for next batch.
    zero_op = [v.assign(tf.zeros_like(v)) for v in accum_grads]

    # Compute gradients.
    grads = tf.gradients(reduced_loss, trainable_in_order)

    # Accumulate and normalise the gradients.
    accum_grads_op = [
        accum_grads[i].assign_add(grad / args.grad_update_every)
        for i, grad in enumerate(grads)
    ]

    # Split the accumulators back into the three variable groups.
    num_conv = len(conv_trainable)
    num_fc_w = len(fc_w_trainable)
    grads_conv = accum_grads[:num_conv]
    grads_fc_w = accum_grads[num_conv:(num_conv + num_fc_w)]
    grads_fc_b = accum_grads[(num_conv + num_fc_w):]

    # Apply the gradients.
    train_op_conv = opt_conv.apply_gradients(zip(grads_conv, conv_trainable))
    train_op_fc_w = opt_fc_w.apply_gradients(zip(grads_fc_w, fc_w_trainable))
    train_op_fc_b = opt_fc_b.apply_gradients(zip(grads_fc_b, fc_b_trainable))

    train_op = tf.group(train_op_conv, train_op_fc_w, train_op_fc_b)

    merged = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(args.snapshot_dir,
                                           graph=tf.get_default_graph())

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()

    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

    # Load variables if the checkpoint is provided.
    if args.restore_from is not None:
        loader = tf.train.Saver(var_list=restore_var)
        load(loader, sess, args.restore_from)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for step in range(args.num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step}
        loss_value = 0

        # Clear the accumulated gradients.
        sess.run(zero_op, feed_dict=feed_dict)

        # Accumulate gradients.
        for _ in range(args.grad_update_every):
            _, l_val = sess.run([accum_grads_op, reduced_loss],
                                feed_dict=feed_dict)
            loss_value += l_val

        # Normalise the loss.
        loss_value /= args.grad_update_every

        # Apply gradients.
        if step % args.save_pred_every == 0:
            summary, _ = sess.run([merged, train_op], feed_dict=feed_dict)
            summary_writer.add_summary(summary, step)
            save(saver, sess, args.snapshot_dir, step)
        else:
            sess.run(train_op, feed_dict=feed_dict)

        duration = time.time() - start_time
        print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(
            step, loss_value, duration))

    coord.request_stop()
    coord.join(threads)
def get_branch_logits(features,
                      num_classes,
                      atrous_rates=None,
                      aspp_with_batch_norm=False,
                      kernel_size=1,
                      weight_decay=0.0001,
                      reuse=None,
                      scope_suffix=''):
    """Computes per-branch logits and sums them into the final logits.

    One convolution is applied to `features` for every entry of
    `atrous_rates`, and the branch outputs are added together. When ASPP with
    batch norm is used (it has then already been applied in
    extract_features), or when no rates are given, a single 1x1 convolution
    is applied instead.

    Args:
      features: A float tensor of shape [batch, height, width, channels].
      num_classes: Number of classes to predict.
      atrous_rates: A list of atrous convolution rates for last layer.
      aspp_with_batch_norm: Use batch normalization layers for ASPP.
      kernel_size: Kernel size for convolution.
      weight_decay: Weight decay for the model variables.
      reuse: Reuse model variables or not.
      scope_suffix: Scope suffix for the model variables.

    Returns:
      Merged logits with shape [batch, height, width, num_classes].

    Raises:
      ValueError: Upon invalid input kernel_size value.
    """
    single_branch = aspp_with_batch_norm or atrous_rates is None
    if single_branch and kernel_size != 1:
        raise ValueError('Kernel size must be 1 when atrous_rates is None or '
                         'using aspp_with_batch_norm. Gets %d.' % kernel_size)
    if single_branch:
        atrous_rates = [1]

    conv_defaults = slim.arg_scope(
        [slim.conv2d],
        weights_regularizer=slim.l2_regularizer(weight_decay),
        weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
        reuse=reuse)
    with conv_defaults:
        with tf.variable_scope(LOGITS_SCOPE_NAME, LOGITS_SCOPE_NAME,
                               [features]):
            # The first branch uses the bare suffix as its scope; later
            # branches append their index.
            branch_logits = [
                slim.conv2d(
                    features,
                    num_classes,
                    kernel_size=kernel_size,
                    rate=rate,
                    activation_fn=None,
                    normalizer_fn=None,
                    scope=(scope_suffix + '_%d' % index
                           if index else scope_suffix))
                for index, rate in enumerate(atrous_rates)
            ]
            return tf.add_n(branch_logits)
def __init__(self, params, word2vec, features, labels, training=False):
    """Build the graph of a two-sentence binary classifier.

    Both sentences are embedded with a frozen embedding table, passed
    through a shared highway layer and a shared "Encoder" residual block,
    cross-attended, projected, matched with a shared "match" residual block
    and aggregated into a single logit.

    Sets `self.prob`, `self.pred`, `self.acc` and `self.loss`; when
    `training` is true also sets `self.global_step` and `self.train_op`.

    Args:
        params: dict read for the keys 'embed_dim' and 'dropout'.
        word2vec: initial value of the (non-trainable) embedding variable.
        features: 4-tuple `(len1, len2, s1, s2)`: the two sequence-length
            tensors followed by the two id tensors fed to
            `tf.nn.embedding_lookup`.
        labels: target labels; cast to float for the sigmoid loss.
        training: whether to build dropout and the optimizer ops.
    """
    len1, len2, s1, s2 = features
    embed_dim = params['embed_dim']
    hidden_size = embed_dim  #params['hidden_size']
    dropout = params['dropout']
    input_keep = 0.8
    learning_rate = 0.001
    max_norm = 10
    l2_coef = 1e-5  #0.0001
    num_heads = 8
    # Dropout is disabled entirely outside training.
    if not training:
        dropout = 0.0
    K.set_learning_phase(training)

    with tf.device('/cpu:0'):
        embedding = tf.get_variable("word2vec", initializer=word2vec, trainable=False)
        s1 = tf.nn.embedding_lookup(embedding, s1)
        s2 = tf.nn.embedding_lookup(embedding, s2)
    # Input dropout on the embedded tokens (second argument is the keep
    # probability).
    if training:
        s1 = tf.nn.dropout(s1, input_keep)
        s2 = tf.nn.dropout(s2, input_keep)

    # The second call reuses the "highway" variables created by the first.
    c = highway(s1, size=embed_dim, scope="highway", dropout=dropout, reuse=None)
    q = highway(s2, size=embed_dim, scope="highway", dropout=dropout, reuse=True)

    c_mask = tf.sequence_mask(len1, dtype=tf.float32)
    q_mask = tf.sequence_mask(len2, dtype=tf.float32)

    # Encoding
    c = residual_block(c,
                       num_blocks=1,
                       num_conv_layers=4,
                       kernel_size=7,
                       mask=c_mask,
                       num_filters=hidden_size,
                       num_heads=num_heads,
                       seq_len=len1,
                       scope="Encoder",
                       bias=False,
                       dropout=dropout)
    q = residual_block(
        q,
        num_blocks=1,
        num_conv_layers=4,
        kernel_size=7,
        mask=q_mask,
        num_filters=hidden_size,
        num_heads=num_heads,
        seq_len=len2,
        scope="Encoder",
        reuse=True,  # Share the weights between passage and question
        bias=False,
        dropout=dropout)

    # Attention: S is the similarity between the two sequences; it is
    # softmax-normalised once with the q mask and once (along axis 1) with
    # the c mask.
    c_maxlen = tf.cast(tf.reduce_max(len1), tf.int32)
    q_maxlen = tf.cast(tf.reduce_max(len2), tf.int32)
    S = optimized_trilinear_for_attention([c, q], c_maxlen, q_maxlen, input_keep_prob=1.0 - dropout)
    mask_q = tf.expand_dims(q_mask, 1)
    S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
    c_att = tf.matmul(S_, q)  # same length as c
    mask_c = tf.expand_dims(c_mask, 2)
    S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), [0, 2, 1])
    q_att = tf.matmul(S_T, c)  # same length as q
    # Disabled variant with second-order attention features:
    # c_att2 = tf.matmul(S_, q_att) # same length as c
    # q_att2 = tf.matmul(S_T, c_att) # same length as q
    # c_comb = tf.concat([c, c_att, c*c_att, c_att2, c*c_att2], axis=-1)
    # q_comb = tf.concat([q, q_att, q*q_att, q_att2, q*q_att2], axis=-1)
    # Combine each sequence with its attended counterpart: raw, attended,
    # element-wise product and absolute difference.
    c_comb = tf.concat([c, c_att, c * c_att, tf.abs(c - c_att)], axis=-1)
    q_comb = tf.concat([q, q_att, q * q_att, tf.abs(q - q_att)], axis=-1)

    # Matching: shared projection back to hidden_size, dropout, then a
    # shared "match" residual block.
    c_proj = conv(c_comb, hidden_size, name="proj")
    q_proj = conv(q_comb, hidden_size, name="proj", reuse=True)
    c_proj = tf.nn.dropout(c_proj, 1.0 - dropout)
    q_proj = tf.nn.dropout(q_proj, 1.0 - dropout)
    c_match = residual_block(c_proj,
                             num_blocks=1,
                             num_conv_layers=2,
                             kernel_size=5,
                             mask=c_mask,
                             num_filters=hidden_size,
                             num_heads=num_heads,
                             seq_len=len1,
                             scope="match",
                             bias=False,
                             reuse=False,
                             dropout=dropout)
    q_match = residual_block(q_proj,
                             num_blocks=1,
                             num_conv_layers=2,
                             kernel_size=5,
                             mask=q_mask,
                             num_filters=hidden_size,
                             num_heads=num_heads,
                             seq_len=len2,
                             scope="match",
                             bias=False,
                             reuse=True,
                             dropout=dropout)

    # Aggregate
    with tf.name_scope('l2_norm'):
        x = aggregate(c_match, q_match)
        logits = tf.squeeze(Dense(1)(x))

    self.prob = tf.sigmoid(logits)
    self.pred = tf.rint(self.prob)
    # tf.metrics.accuracy returns a (value, update_op) pair.
    self.acc = tf.metrics.accuracy(labels=labels, predictions=self.pred)
    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.to_float(labels), logits=logits))

    # L2 penalty over the non-bias trainable variables selected by the
    # "l2_norm" scope filter.
    l2 = tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables("l2_norm")
        if 'bias' not in v.name
    ]) * l2_coef
    # Disabled variant penalising every trainable variable:
    # l2 = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables()
    #                 if 'bias' not in v.name ]) * l2_coef
    self.loss += l2

    # NOTE(review): `regularizer` is not defined in this method; presumably
    # it is a module-level name - confirm. Also note that the tensors passed
    # as the weights list are the entries of REGULARIZATION_LOSSES, not
    # model variables - verify this is intended.
    variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    l2_loss = tf.contrib.layers.apply_regularization(
        regularizer, variables)
    self.loss += l2_loss

    # Exponential moving average of the trainable variables; the identity
    # op makes evaluating the loss also run the EMA update.
    var_ema = tf.train.ExponentialMovingAverage(0.9999)
    ema_op = var_ema.apply(tf.trainable_variables())
    with tf.control_dependencies([ema_op]):
        self.loss = tf.identity(self.loss)

    if training:
        self.global_step = tf.train.get_or_create_global_step()
        # Logarithmic warm-up in the global step, capped at 0.0005.
        learning_rate = tf.minimum(
            0.0005,
            0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # Ensures that we execute the update_ops before performing the train_step
            gradients, variables = zip(
                *optimizer.compute_gradients(self.loss))
            # Clip by global norm before applying the update.
            gradients, _ = tf.clip_by_global_norm(gradients, max_norm)
            self.train_op = optimizer.apply_gradients(
                zip(gradients, variables), global_step=self.global_step)
def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
                    data_format, resnet_version, loss_scale,
                    loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE,
                    fine_tune=False, label_smoothing=0.0, horovod=False):
    """Shared functionality for different resnet model_fns.

    Initializes the ResnetModel representing the model layers
    and uses that model to build the necessary EstimatorSpecs for
    the `mode` in question. For training, this means building losses,
    the optimizer, and the train op that get passed into the EstimatorSpec.
    For evaluation and prediction, the EstimatorSpec is returned without
    a train op, but with the necessary parameters for the given mode.

    Args:
      features: tensor representing input images
      labels: tensor representing class labels for all input images
      mode: current estimator mode; should be one of
        `tf.estimator.ModeKeys.TRAIN`, `EVAL`, `PREDICT`
      model_class: a class representing a TensorFlow model that has a __call__
        function. We assume here that this is a subclass of ResnetModel.
      resnet_size: A single integer for the size of the ResNet model.
      weight_decay: weight decay loss rate used to regularize learned variables.
      learning_rate_fn: function that returns the current learning rate given
        the current global_step
      momentum: momentum term used for optimization
      data_format: Input format ('channels_last', 'channels_first', or None).
        If set to None, the format is dependent on whether a GPU is available.
      resnet_version: Integer representing which version of the ResNet network
        to use. See README for details. Valid values: [1, 2]
      loss_scale: The factor to scale the loss for numerical stability. A
        detailed summary is present in the arg parser help text.
      loss_filter_fn: function that takes a string variable name and returns
        True if the var should be included in loss calculation, and False
        otherwise. If None, batch_normalization variables will be excluded
        from the loss.
      dtype: the TensorFlow dtype to use for calculations.
      fine_tune: If True only train the dense layers(final layers).
      label_smoothing: If greater than 0 then smooth the labels. Note that
        this path one-hot encodes the labels with a fixed depth of 1001.
      horovod: If True, wrap the optimizer in Horovod's
        `DistributedOptimizer` (imports `horovod.tensorflow` lazily).

    Returns:
      EstimatorSpec parameterized according to the input params and the
      current mode.
    """

    # Generate a summary node for the images
    tf.compat.v1.summary.image('images', features, max_outputs=6)
    # Checks that features/images have same data type being used for calculations.
    assert features.dtype == dtype

    model = model_class(resnet_size, data_format, resnet_version=resnet_version,
                        dtype=dtype)

    # The second argument tells the model whether it is in training mode.
    logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)

    # This acts as a no-op if the logits are already in fp32 (provided logits are
    # not a SparseTensor). If dtype is low precision, logits must be cast to
    # fp32 for numerical stability.
    logits = tf.cast(logits, tf.float32)

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Return the predictions and the specification for serving a SavedModel
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'predict': tf.estimator.export.PredictOutput(predictions)
            })

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    if label_smoothing != 0.0:
        # NOTE(review): the one-hot depth is hard-coded to 1001 rather than
        # derived from the logits - confirm it matches the model's class count.
        one_hot_labels = tf.one_hot(labels, 1001)
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=one_hot_labels,
            label_smoothing=label_smoothing)
    else:
        cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
            logits=logits, labels=labels)

    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy')
    tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)

    # If no loss_filter_fn is passed, assume we want the default behavior,
    # which is that batch_normalization variables are excluded from loss.
    def exclude_batch_norm(name):
        return 'batch_normalization' not in name
    loss_filter_fn = loss_filter_fn or exclude_batch_norm

    # Add weight decay to the loss.
    l2_loss = weight_decay * tf.add_n(
        # loss is computed using fp32 for numerical stability.
        [
            tf.nn.l2_loss(tf.cast(v, tf.float32))
            for v in tf.compat.v1.trainable_variables()
            if loss_filter_fn(v.name)
        ])
    tf.compat.v1.summary.scalar('l2_loss', l2_loss)
    loss = cross_entropy + l2_loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.compat.v1.train.get_or_create_global_step()

        learning_rate = learning_rate_fn(global_step)

        # Create a tensor named learning_rate for logging purposes
        tf.identity(learning_rate, name='learning_rate')
        tf.compat.v1.summary.scalar('learning_rate', learning_rate)

        # Optimizer choice is driven by the `enable_lars` flag.
        if flags.FLAGS.enable_lars:
            optimizer = tf.contrib.opt.LARSOptimizer(
                learning_rate,
                momentum=momentum,
                weight_decay=weight_decay,
                skip_list=['batch_normalization', 'bias'])
        else:
            optimizer = tf.compat.v1.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=momentum)

        # The flag may be absent, hence getattr with a None default.
        fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
        if fp16_implementation == 'graph_rewrite':
            optimizer = (tf.compat.v1.train.experimental.
                         enable_mixed_precision_graph_rewrite(
                             optimizer, loss_scale=loss_scale))
        if horovod:
            import horovod.tensorflow as hvd
            optimizer = hvd.DistributedOptimizer(optimizer, num_groups=1)

        def _dense_grad_filter(gvs):
            """Only apply gradient updates to the final layer.

            This function is used for fine tuning.

            Args:
              gvs: list of tuples with gradients and variable info
            Returns:
              filtered gradients so that only the dense layer remains
            """
            return [(g, v) for g, v in gvs if 'dense' in v.name]

        if loss_scale != 1 and fp16_implementation != 'graph_rewrite':
            # When computing fp16 gradients, often intermediate tensor values are
            # so small, they underflow to 0. To avoid this, we multiply the loss by
            # loss_scale to make these tensor values loss_scale times bigger.
            scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)

            if fine_tune:
                scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)

            # Once the gradient computation is complete we can scale the gradients
            # back to the correct scale before passing them to the optimizer.
            unscaled_grad_vars = [(grad / loss_scale, var)
                                  for grad, var in scaled_grad_vars]
            minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
        else:
            grad_vars = optimizer.compute_gradients(loss)
            if fine_tune:
                grad_vars = _dense_grad_filter(grad_vars)
            minimize_op = optimizer.apply_gradients(grad_vars, global_step)

        # Group the ops in the UPDATE_OPS collection with the optimizer step
        # so they run on every training step.
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)
    else:
        train_op = None

    # Streaming metrics: each is a (value, update_op) pair.
    accuracy = tf.compat.v1.metrics.accuracy(labels, predictions['classes'])
    accuracy_top_5 = tf.compat.v1.metrics.mean(
        tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))
    metrics = {'accuracy': accuracy,
               'accuracy_top_5': accuracy_top_5}

    # Create a tensor named train_accuracy for logging purposes
    tf.identity(accuracy[1], name='train_accuracy')
    tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
    tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
    tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=metrics)
def get_train_model(opt, device='/cpu:0'):
    """Build the training graph of a variational auto-encoder.

    Args:
        opt: dict read for the keys 'num_inp', 'num_hid_enc', 'num_hid',
            'num_hid_dec', 'weight_decay', 'non_linear' (a string that is
            evaluated to obtain the activation function) and 'output_dist'
            ('Gaussian' or 'Bernoulli').
        device: device string passed to `tf.device`.

    Returns:
        dict mapping names to the placeholders, variables, intermediate
        tensors and the `train_step` op of the graph. The entries 'w_6',
        'b_6' and 'log_sigma_dec' exist only for the Gaussian output.

    Raises:
        Exception: if `opt['output_dist']` is neither 'Gaussian' nor
            'Bernoulli'.
    """
    num_inp = opt['num_inp']
    num_hid_enc = opt['num_hid_enc']
    num_hid = opt['num_hid']
    num_hid_dec = opt['num_hid_dec']
    wd = opt['weight_decay']
    # NOTE(review): eval() executes whatever expression the config string
    # holds; this is only safe when `opt` comes from a trusted source.
    nl = eval(opt['non_linear'])

    with tf.device(device):
        # Input (N, D)
        x = tf.placeholder('float', [None, num_inp], name='x')

        # Encoder hidden layer (N, H1)
        w_1 = weight_variable([num_inp, num_hid_enc], wd=wd, name='w_1')
        b_1 = weight_variable([num_hid_enc], wd=wd, name='b_1')
        h_enc = nl(tf.matmul(x, w_1) + b_1, name='h_enc')

        # Encoder output: distribution parameters mu, log_sigma.
        # (Original note gives the shape as (N, 1, H); the matmuls here
        # produce 2-D tensors - TODO confirm.)
        w_2 = weight_variable([num_hid_enc, num_hid], wd=wd, name='w_2')
        b_2 = weight_variable([num_hid], wd=wd, name='b_2')
        mu_enc = tf.matmul(h_enc, w_2) + b_2
        w_3 = weight_variable([num_hid_enc, num_hid], wd=wd, name='w_3')
        b_3 = weight_variable([num_hid], wd=wd, name='b_3')
        log_sigma_enc = tf.add(tf.matmul(h_enc, w_3), b_3,
                               name='log_sigma_enc')

        # Noise fed by the caller. (Original note gives (N, M, H); the
        # placeholder is declared as [None, num_hid].)
        t = tf.placeholder('float', [None, num_hid], name='t')

        # Encoder latent variable: z = mu + exp(log_sigma) * t.
        z = tf.add(mu_enc, tf.mul(tf.exp(log_sigma_enc), t), name='z')

        # KL Divergence, summed over all elements:
        # -0.5 * sum(1 + 2*log_sigma - mu^2 - exp(2*log_sigma)).
        kl_qzx_pz = tf.mul(
            -0.5,
            tf.reduce_sum(1 + 2 * log_sigma_enc - mu_enc * mu_enc -
                          tf.exp(2 * log_sigma_enc)),
            name='kl_qzx_pz')

        # Decoder hidden layer
        w_4 = weight_variable([num_hid, num_hid_dec], wd=wd, name='w_4')
        b_4 = weight_variable([num_hid_dec], wd=wd, name='b_4')
        h_dec = nl(tf.matmul(z, w_4) + b_4, name='h_dec')

        # Decoder output: distribution parameters mu, log_sigma
        w_5 = weight_variable([num_hid_dec, num_inp], wd=wd, name='w_5')
        b_5 = weight_variable([num_inp], wd=wd, name='b_5')
        mu_dec = tf.sigmoid(tf.matmul(h_dec, w_5) + b_5)

        # Gaussian output distribution: log p(x | z)
        if opt['output_dist'] == 'Gaussian':
            w_6 = weight_variable([num_hid_dec, num_inp], wd=wd, name='w_6')
            b_6 = weight_variable([num_inp], wd=wd, name='b_6')
            log_sigma_dec = tf.add(tf.matmul(h_dec, w_6), b_6,
                                   name='log_sigma_dec')
            # The 1e-4 offset is added inside the exponent.
            sigma_dec = tf.exp(log_sigma_dec + 1e-4, name='sigma_dec')
            log_pxz = tf.reduce_sum(-0.5 * tf.log(2 * np.pi) - log_sigma_dec -
                                    0.5 * (x - mu_dec) / sigma_dec *
                                    (x - mu_dec) / sigma_dec,
                                    name='log_pxz')
        elif opt['output_dist'] == 'Bernoulli':
            # Bernoulli output distribution: log p(x | z) (same as cross
            # entropy); 1e-7 guards the logarithms against zero.
            log_pxz = tf.reduce_sum(x * tf.log(mu_dec + 1e-7) +
                                    (1 - x) * tf.log((1 - mu_dec + 1e-7)),
                                    name='log_pxz')
        else:
            raise Exception('Unknown output distribution type: {}'.format(
                opt['output_dist']))

        # Normalize by number of examples
        num_ex = tf.shape(x, name='num_ex')

        # Variational lower bound of marginal log-likelihood; with both
        # weights at 1.0 the factor /(w_kl + w_logp) * 2.0 cancels out.
        w_kl = 1.0
        w_logp = 1.0
        log_px_lb = (-w_kl * kl_qzx_pz + w_logp * log_pxz) / \
            (w_kl + w_logp) * 2.0 / tf.to_float(num_ex[0])
        # The negative bound joins whatever else is in the 'losses'
        # collection; everything in that collection is summed.
        tf.add_to_collection('losses', -log_px_lb)
        total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

        lr = 1e-4
        eps = 1e-7
        train_step = tf.train.AdamOptimizer(lr, epsilon=eps).minimize(total_loss)

    m = {
        'x': x,
        't': t,
        'w_1': w_1,
        'b_1': b_1,
        'h_enc': h_enc,
        'w_2': w_2,
        'b_2': b_2,
        'w_3': w_3,
        'b_3': b_3,
        'mu_enc': mu_enc,
        'log_sigma_enc': log_sigma_enc,
        'z': z,
        'kl_qzx_pz': kl_qzx_pz,
        'w_4': w_4,
        'b_4': b_4,
        'h_dec': h_dec,
        'w_5': w_5,
        'b_5': b_5,
        'mu_dec': mu_dec,
        'log_pxz': log_pxz,
        'log_px_lb': log_px_lb,
        'train_step': train_step
    }
    if opt['output_dist'] == 'Gaussian':
        m['w_6'] = w_6
        m['b_6'] = b_6
        m['log_sigma_dec'] = log_sigma_dec

    return m
def __init__(self, is_training=False, hidden_units=128, num_layers=1,
             input_sequence_len=20, output_sequence_len=10,
             num_input_symbols=20, num_output_symbols=20,
             weight_amplitude=0.08, batch_size=32, peep=False):
    """Build a sequence-to-sequence LSTM model.

    Creates one placeholder per encoder step and per decoder step, the
    recurrent cell, a softmax projection, and - when `is_training` is
    true - the cost and the Adam training op.

    Args:
        is_training: build the loss/optimizer; when false the decoder is
            run with `feed_previous=True`.
        hidden_units: number of units per LSTM cell.
        num_layers: number of stacked LSTM cells.
        input_sequence_len: number of encoder input placeholders.
        output_sequence_len: decoder length; one extra decoder placeholder
            is created.
        num_input_symbols: width of each encoder input.
        num_output_symbols: width of each decoder input / output.
        weight_amplitude: bound of the uniform weight initializer.
        batch_size: divisor used to normalise the cost.
        peep: whether the LSTM cells use peephole connections.
    """
    self.encoder_inputs = []
    self.decoder_inputs = []

    for i in range(input_sequence_len):
        self.encoder_inputs.append(tf.placeholder(tf.float32, shape=(None, num_input_symbols),
                                                  name="encoder_{0}".format(i)))

    # One more decoder placeholder than output steps: the targets are the
    # decoder inputs shifted by one (see `self.targets` below).
    for i in range(output_sequence_len + 1):
        self.decoder_inputs.append(tf.placeholder(tf.float32, shape=(None, num_output_symbols),
                                                  name="decoder_{0}".format(i)))

    def random_uniform():
        # Fresh uniform initializer in [-weight_amplitude, weight_amplitude].
        return tf.random_uniform_initializer(-weight_amplitude, weight_amplitude)

    if num_layers > 1:
        # The first layer takes the input symbols; the rest take the
        # previous layer's hidden state.
        cells = [rnn_cell.LSTMCell(hidden_units, use_peepholes=peep, input_size=num_input_symbols,
                                   initializer=random_uniform())]
        cells += [rnn_cell.LSTMCell(hidden_units, use_peepholes=peep, input_size=hidden_units,
                                    initializer=random_uniform()) for _ in range(num_layers - 1)]
        self.cell = rnn_cell.MultiRNNCell(cells)
    else:
        self.cell = rnn_cell.LSTMCell(hidden_units, use_peepholes=peep, initializer=random_uniform())

    # Projection from the hidden state to output-symbol logits.
    self.w_softmax = tf.get_variable('w_softmax', shape=(hidden_units, num_output_symbols),
                                     initializer=random_uniform())
    self.b_softmax = tf.get_variable('b_softmax', shape=(num_output_symbols,),
                                     initializer=random_uniform())

    # decoder_outputs is a list of tensors with output_sequence_len: [(batch_size x hidden_units)]
    decoder_outputs, _ = self._init_seq2seq(self.encoder_inputs, self.decoder_inputs, self.cell,
                                            feed_previous=not is_training)

    output_logits = [tf.matmul(decoder_output, self.w_softmax) + self.b_softmax
                     for decoder_output in decoder_outputs]
    self.output_probs = [tf.nn.softmax(logit) for logit in output_logits]

    # If this is a training model create the training operation and loss function
    if is_training:
        self.targets = self.decoder_inputs[1:]
        losses = [tf.nn.softmax_cross_entropy_with_logits(logit, target)
                  for logit, target in zip(output_logits, self.targets)]

        loss = tf.reduce_sum(tf.add_n(losses))
        # NOTE(review): the cost is divided by the `batch_size` argument,
        # while the placeholders leave the batch dimension open - confirm
        # callers always feed exactly `batch_size` rows.
        self.cost = loss / output_sequence_len / batch_size
        self.learning_rate = tf.Variable(DEFAULT_LEARNING_RATE, trainable=False)

        train_vars = tf.trainable_variables()
        grads = tf.gradients(self.cost, train_vars)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)

        self.train_op = optimizer.apply_gradients(zip(grads, train_vars))
def __init__(self, num_features, **kwargs):
    """Build the graph of a GAN whose discriminator doubles as a classifier.

    Every key of `defaults` becomes an instance attribute; a keyword
    argument with the same name overrides the default, and keyword
    arguments that are not in `defaults` are ignored.

    Args:
        num_features: number of features per step; the input width is
            `num_features * num_steps`.
        **kwargs: overrides for the hyper-parameters listed in `defaults`.
    """
    defaults = {
        'num_epochs': 10,
        'display_step': 1,
        'batch_size': 100,
        'num_steps': 3,
        'debug': False,
        'normalize': True,
        'latent_vector_size': 100,
        'adpt_l': 2.0,
        'res_depth': 1,
        'ns_param': 0.01,
        'batch_param': 0.1,
        'dr_param': 1.,
        'df_param': 1.,
        'learning_rate': .001,
        'reg_param': 0.01
    }

    self.num_features = num_features
    vars(self).update({p: kwargs.get(p, d) for p, d in defaults.items()})

    ########################################
    # TensorFlow Variables                 #
    ########################################

    # X: inputs, Y: integer labels, Z: generator latent vectors.
    self.X = tf.placeholder(
        'float32', [None, num_features * self.num_steps], name='X'
    )
    self.Y = tf.placeholder('int64', [None], name='Y')
    self.T = tf.placeholder('float32', name='T')
    self.Z = tf.placeholder(
        'float32', [None, self.latent_vector_size], name='Z'
    )
    self.keep_prob = tf.placeholder('float32', name='keep_prob')

    # for normalization (zero-initialised; not read in this constructor)
    self.feature_min = tf.Variable(
        np.zeros(num_features * self.num_steps),
        dtype=tf.float32
    )
    self.feature_max = tf.Variable(
        np.zeros(num_features * self.num_steps),
        dtype=tf.float32
    )

    ########################################
    # GAN Model                            #
    ########################################

    # Intermediate activations collected while the networks are built.
    self.embedding_ops = []

    def build_net(x, sizes):
        # Chain one `block` per consecutive pair of layer sizes.
        lrelu = nn.lrelu_gen(0.1)

        def block(x, in_dim, out_dim, i):
            # `res_depth` residual blocks at width in_dim, then a fully
            # connected layer in_dim -> out_dim with dropout around it.
            with tf.variable_scope('block_{}'.format(i)):
                z = x
                for j in range(self.res_depth):
                    with tf.variable_scope('res_block_{}'.format(j)):
                        z = nn.build_residual_block(
                            z, lrelu, in_dim, self.reg_param
                        )
                with tf.variable_scope('residual_block'):
                    self.embedding_ops.append(z)

                z = tf.nn.dropout(z, self.keep_prob)
                z = nn.build_fc_layer(
                    z, lrelu, in_dim, out_dim, self.reg_param
                )
                with tf.variable_scope('fc_block'):
                    self.embedding_ops.append(z)

                # No dropout after the last block of the network.
                if i < len(sizes) - 2:
                    z = tf.nn.dropout(z, self.keep_prob)
                return z

        z = x
        for i in range(1, len(sizes)):
            z = block(z, sizes[i-1], sizes[i], i-1)
        return z

    vec_size = self.num_features * self.num_steps
    rnn_g_sizes = [vec_size, 100, vec_size]

    # NOTE(review): `generator` and `discriminator` below are never called
    # in this constructor. They slice self.X with three indices although
    # the placeholder is declared with two dimensions, and the `x` they
    # compute is unused - presumably unfinished code; confirm.
    def generator(t, x_prev):
        x = tf.squeeze(
            tf.slice(self.X, [0, t, 0], [-1, 1, -1])
        )
        x = tf.reduce_mean(tf.add(x, x_prev), axis=1)
        x_next = tf.nn.sigmoid(build_net(x_prev, rnn_g_sizes))
        t = tf.add(t, 1)
        return t, x_next

    def discriminator(t, out, x_prev):
        x = tf.squeeze(
            tf.slice(self.X, [0, t, 0], [-1, 1, -1])
        )
        x = tf.reduce_mean(tf.add(x, x_prev), axis=1)
        x_next = tf.nn.sigmoid(build_net(x_prev, rnn_g_sizes))
        t = tf.add(t, 1)
        return t, x_next

    g_sizes = [self.latent_vector_size, 100, vec_size]
    d_sizes = [vec_size, 64, 32, 16, 8, 4, 2]

    with tf.variable_scope('generator'):
        G_sample = tf.nn.sigmoid(build_net(self.Z, g_sizes))

    with tf.variable_scope('discriminator'):
        D_logit_real = build_net(self.X, d_sizes)
        # Share the discriminator weights between real and generated input.
        tf.get_variable_scope().reuse_variables()
        D_logit_fake = build_net(G_sample, d_sizes)
        D_fake = tf.nn.sigmoid(D_logit_fake)
        D_real = tf.nn.sigmoid(D_logit_real)

    # Classification scores are the discriminator logits on real input.
    self.scores = D_logit_real

    ########################################
    # Losses & Optimizers                  #
    ########################################

    # D Loss: real inputs are labelled 1, generated inputs 0. self.Y only
    # supplies the shape and dtype of the label tensors here.
    D_loss_real = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=D_logit_real,
            labels=tf.ones_like(self.Y)
        )
    )
    D_loss_fake = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=D_logit_fake,
            labels=tf.zeros_like(self.Y)
        )
    )
    # Ensure difference between nodes: the negated squared gap between the
    # two output nodes, so minimising it pushes the outputs apart.
    node_simiality_loss = -tf.reduce_mean(tf.add(
        tf.square(D_real[:, 1] - D_real[:, 0]),
        tf.square(D_fake[:, 1] - D_fake[:, 0])
    ))
    # Original note: "punish stddev across batch". As written this is the
    # variance taken over axis 1 of the sigmoid outputs, averaged over the
    # remaining axis - NOTE(review): confirm the intended axis.
    batch_loss = tf.reduce_mean(tf.add(
        tf.nn.moments(D_real, [1])[1],
        tf.nn.moments(D_fake, [1])[1]
    ))

    self.D_loss = tf.add_n([
        self.df_param * D_loss_fake,
        self.dr_param * D_loss_real,
        self.ns_param * node_simiality_loss,
        self.batch_param * batch_loss
    ])
    # Discriminator loss without the two auxiliary penalties and without
    # the regularization terms added below.
    self.D_only_loss = tf.add_n([
        self.df_param * D_loss_fake,
        self.dr_param * D_loss_real
    ])
    self.D_loss += tf.add_n(tf.get_collection(
        tf.GraphKeys.REGULARIZATION_LOSSES, scope='discriminator'
    ))

    # G Loss: the generator wants its samples to be labelled 1.
    self.G_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=D_logit_fake,
            labels=tf.ones_like(self.Y)
        )
    )
    self.G_loss += tf.add_n(tf.get_collection(
        tf.GraphKeys.REGULARIZATION_LOSSES, scope='generator'
    ))

    # Optimizers: each solver only updates the variables of its own scope.
    self.D_solver = tf.train.AdamOptimizer(self.learning_rate).minimize(
        self.D_loss,
        var_list=tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator'
        )
    )
    self.G_solver = tf.train.AdamOptimizer(self.learning_rate).minimize(
        self.G_loss,
        var_list=tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator'
        )
    )

    ########################################
    # Evaluation Metrics                   #
    ########################################

    # Disabled threshold-based prediction:
    # negative_labels = tf.cast(tf.fill(tf.shape(self.Y), 0), 'int64')
    # positive_labels = tf.cast(tf.fill(tf.shape(self.Y), 1), 'int64')
    # pred_labels = tf.where(
    #     tf.greater(self.scores, tf.fill(tf.shape(self.Y), 0.5)),
    #     positive_labels,
    #     negative_labels
    # )
    # Predicted class is the index of the larger of the two logits.
    pred_labels = tf.argmax(self.scores, 1)

    self.confusion_matrix = tf.confusion_matrix(
        self.Y, pred_labels, num_classes=2
    )

    self.accuracy = tf.reduce_mean(
        tf.to_float(tf.equal(pred_labels, self.Y))
    )

    # Variable ops
    self.init_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver()

    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
def l1_weights(self):
    """Return the L1 penalty of the network's weights.

    Sums the absolute values of every trainable variable that is also
    listed in `self.vars`.
    """
    penalties = []
    for variable in tf.trainable_variables():
        if variable in self.vars:
            magnitude = tf.abs(variable)
            penalties.append(tf.reduce_sum(magnitude))
    return tf.add_n(penalties)
def l2_weights(self):
    """Return the L2 penalty of the network's weights.

    Adds up `tf.nn.l2_loss` of every trainable variable that is also
    listed in `self.vars`.
    """
    penalties = []
    for variable in tf.trainable_variables():
        if variable in self.vars:
            penalties.append(tf.nn.l2_loss(variable))
    return tf.add_n(penalties)
b_fc2 = bias_variable([fc_size2])
# Disabled alternative head, kept as an inert string literal: an extra sigmoid
# layer with dropout followed by a 2-unit output layer.
''' h_fc2 = tf.nn.sigmoid(tf.matmul(h_fc1_drop, W_fc2) + b_fc2) h_fc2_drop = tf.nn.dropout(h_fc2, keep_prob) W_fc3 = weight_variable([fc_size2, 2]) b_fc3 = bias_variable([2]) '''
# Class probabilities: softmax over the second fully connected layer.
y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

# Train and evaluate
# NOTE(review): the Saver is created before the Adam optimizer below, so it
# presumably does not cover the optimizer's own variables - confirm that the
# restore branch still leaves them initialised.
saver = tf.train.Saver()
# Summed (not averaged) cross entropy between the targets y_ and y_conv.
cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv))
# Weight regularization: the cross entropy joins the other terms already in
# the "losses" collection and everything in it is summed.
tf.add_to_collection("losses",cross_entropy)
loss = tf.add_n(tf.get_collection("losses"))
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# Either start from freshly initialised variables or resume from the saved
# checkpoint.
if emptyTrain:
    sess.run(tf.initialize_all_variables())
else:
    saver.restore(sess, "./mnistnnsave/model.ckpt")

Iteration = 20000
# Bare expression statement: evaluates `data_dic` and discards the result.
data_dic
# Disabled: scale the features to [0, 1].
#train_x = scale_to_01(np.array(data_dic['data']))
train_x = np.array(data_dic['data'])
train_y = np.array(data_dic['label'])
# Shuffle the train data: features and labels are first stacked side by side
# and converted to a list of rows.
train = np.hstack((train_x,train_y))
train_list = train.tolist()
def main(args):
    """Train a face-recognition classifier and save checkpoints per epoch.

    Creates timestamped log and model directories, builds the training
    graph (softmax loss plus optional DeCov and center-loss terms), builds
    an evaluation graph over the LFW image paths, then runs the training
    loop, saving variables and the metagraph after every epoch.

    Args:
        args: parsed command-line arguments (an object exposing, among
            others, `model_def`, `logs_base_dir`, `models_base_dir`,
            `data_dir`, `seed`, `pretrained_model`, `lfw_dir`).

    Returns:
        The path of the directory the model checkpoints are written to.
    """
    network = importlib.import_module(args.model_def, 'inference')

    # Log and model directories share one timestamped sub-directory name.
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Store some git revision info in a text file in the log directory
    src_path,_ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)
    train_set = facenet.get_dataset(args.data_dir)
    nrof_classes = len(train_set)

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    pretrained_model = None
    if args.pretrained_model:
        pretrained_model = os.path.expanduser(args.pretrained_model)
        print('Pre-trained model: %s' % pretrained_model)

    # NOTE(review): `lfw_paths` is only assigned inside this branch but is
    # used unconditionally when the evaluation graph is built below -
    # confirm that `args.lfw_dir` is always provided.
    if args.lfw_dir:
        print('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs, args.lfw_file_ext)

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False)

        # Get a list of image paths and their labels
        image_list, label_list = facenet.get_image_paths_and_labels(train_set)

        # Read data and apply label preserving distortions
        image_batch, label_batch = facenet.read_and_augument_data(image_list, label_list, args.image_size,
            args.batch_size, args.max_nrof_epochs, args.random_crop, args.random_flip, args.random_rotate,
            args.nrof_preprocess_threads)
        print('Total number of classes: %d' % nrof_classes)
        print('Total number of examples: %d' % len(image_list))

        print('Building training graph')

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')

        # Build the inference graph
        prelogits, _ = network.inference(image_batch, args.keep_probability,
            phase_train=True, weight_decay=args.weight_decay)
        # Linear classification layer with one output per training class.
        logits = slim.fully_connected(prelogits, len(train_set), activation_fn=None,
                weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
                weights_regularizer=slim.l2_regularizer(args.weight_decay),
                scope='Logits', reuse=False)

        # Add DeCov regularization loss
        if args.decov_loss_factor>0.0:
            logits_decov_loss = facenet.decov_loss(logits) * args.decov_loss_factor
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, logits_decov_loss)

        # Add center loss
        if args.center_loss_factor>0.0:
            prelogits_center_loss, _ = facenet.center_loss(prelogits, label_batch, args.center_loss_alfa, nrof_classes)
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, prelogits_center_loss * args.center_loss_factor)

        # Staircase exponential decay of the fed learning rate.
        learning_rate = tf.train.exponential_decay(learning_rate_placeholder, global_step,
            args.learning_rate_decay_epochs*args.epoch_size, args.learning_rate_decay_factor, staircase=True)
        tf.scalar_summary('learning_rate', learning_rate)

        # Calculate the average cross entropy loss across the batch
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits, label_batch, name='cross_entropy_per_example')
        cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)

        # Calculate the total losses
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n([cross_entropy_mean] + regularization_losses, name='total_loss')

        # Build a Graph that trains the model with one batch of examples and updates the model parameters
        train_op = facenet.train(total_loss, global_step, args.optimizer,
            learning_rate, args.moving_average_decay, tf.all_variables(), args.log_histograms)

        # Evaluation
        print('Building evaluation graph')
        # Each LFW image gets its own index as a label.
        lfw_label_list = range(0,len(lfw_paths))
        assert (len(lfw_paths) % args.lfw_batch_size == 0), "The number of images in the LFW test set need to be divisible by the lfw_batch_size"
        eval_image_batch, eval_label_batch = facenet.read_and_augument_data(lfw_paths, lfw_label_list, args.image_size,
            args.lfw_batch_size, None, False, False, False, args.nrof_preprocess_threads, shuffle=False)
        # Node for input images
        eval_image_batch.set_shape((None, args.image_size, args.image_size, 3))
        eval_image_batch = tf.identity(eval_image_batch, name='input')
        # Second pass through the network with reuse=True.
        eval_prelogits, _ = network.inference(eval_image_batch, 1.0,
            phase_train=False, weight_decay=0.0, reuse=True)
        eval_embeddings = tf.nn.l2_normalize(eval_prelogits, 1, 1e-10, name='embeddings')

        # Create a saver
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
        sess.run(tf.initialize_all_variables())
        sess.run(tf.initialize_local_variables())
        summary_writer = tf.train.SummaryWriter(log_dir, sess.graph)
        tf.train.start_queue_runners(sess=sess)

        with sess.as_default():

            if pretrained_model:
                print('Restoring pretrained model: %s' % pretrained_model)
                saver.restore(sess, pretrained_model)

            # Training and validation loop
            print('Running training')
            epoch = 0
            while epoch < args.max_nrof_epochs:
                # `step` is read before the epoch is trained, so the value
                # passed to the save call below is the step count at the
                # start of the epoch.
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                # Train for one epoch
                train(args, sess, epoch, learning_rate_placeholder, global_step,
                    total_loss, train_op, summary_op, summary_writer, regularization_losses, args.learning_rate_schedule_file)

                # Save variables and the metagraph if it doesn't exist already
                save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, step)

                # Evaluate on LFW (disabled: the call is kept as an inert
                # string literal)
                ''' if args.lfw_dir: evaluate(sess, eval_embeddings, eval_label_batch, actual_issame, args.lfw_batch_size, args.seed, args.lfw_nrof_folds, log_dir, step, summary_writer) '''
    return model_dir
def l2_loss(self):
    """Return the scaled L2 penalty over all non-bias trainable variables.

    A variable is skipped when its name contains 'bias'; the summed
    `tf.nn.l2_loss` of the rest is multiplied by `self.l2_scale`.
    """
    scale = self.l2_scale
    penalties = []
    for variable in tf.trainable_variables():
        if 'bias' in variable.name:
            continue
        penalties.append(tf.nn.l2_loss(variable))
    return scale * tf.add_n(penalties)
# Mean softmax cross entropy between the logits `pred` and the labels `y`.
y_pred = tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=y)
cost = tf.reduce_mean(y_pred)

# L2 penalty over the weights and then the biases whose variable name
# contains "6", "7" or "8".
norms = []
for weight in list(weights.values()) + list(biases.values()):
    if any(digit in weight.name for digit in ("6", "7", "8")):
        norms.append(tf.nn.l2_loss(weight))
loss_L2 = tf.add_n(norms) * .05

# Adam minimises the data term plus the L2 penalty.
optimizer = tf.train.AdamOptimizer(
    learning_rate=learning_rate).minimize(cost + loss_L2)

# Accuracy: fraction of rows whose arg-max prediction matches the label.
predicted_class = tf.math.argmax(input=pred, axis=1)
true_class = tf.argmax(input=y, axis=1)
correct_pred = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=epochs, save_relative_paths=True)

with tf.Session() as sess:
    sess.run(init)
return var x = tf.placeholder(dtype=tf.float32, shape=(None, 2)) y_ = tf.placeholder(dtype=tf.float32, shape=(None, 1)) batch_size = 8 layer_dimension = [2, 10, 10, 10, 1] n_layers = len(layer_dimension) cur_layer = x in_dimension = layer_dimension[0] for i in range(1, n_layers): out_dimension = layer_dimension[i] weight = get_weight(shape=[in_dimension, out_dimension], lamdba=0.001) bias = tf.Variable(tf.constant(0.1, shape=[out_dimension])) cur_layer = tf.nn.relu(tf.matmul(cur_layer, weight) + bias) in_dimension = out_dimension mess_loss = tf.reduce_mean(tf.square(y_ - cur_layer)) tf.add_to_collection('losses', mess_loss) loss = tf.add_n(tf.get_collection('losses'))
from tensorflow.core.protobuf import saver_pb2

# Local imports from this project's directory.
# driving_data reads the dataset.
import driving_data
# model is the TensorFlow model; its graph is pictured at
# https://imgur.com/IuBJdKe
import model

# Directory for the trained model. If a trained model already exists there it
# can be imported to continue training; training from scratch also works.
LOGDIR = './save'

# TensorFlow session, see https://www.tensorflow.org/api_docs/python/tf/Session
sess = tf.InteractiveSession()

# Coefficient of the L2 regularization term added to the loss below.
L2NormConst = 0.001

train_vars = tf.trainable_variables()

# Mean squared error between model.y_ and model.y, plus the L2 penalty summed
# over every trainable variable.
loss = tf.reduce_mean(tf.square(tf.subtract(model.y_, model.y))) + tf.add_n(
    [tf.nn.l2_loss(v) for v in train_vars]) * L2NormConst
# Not a classification accuracy: simply 100 minus the loss, logged as a
# monitoring value.
accuracy = 100 - loss
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
# tf.initialize_all_variables() is deprecated; global_variables_initializer()
# is its replacement. It runs after minimize() so the optimizer's own
# variables are initialized as well.
sess.run(tf.global_variables_initializer())

# Create summaries to monitor the loss and the derived "accuracy" tensors.
tf.summary.scalar("loss", loss)
tf.summary.scalar("accuracy", accuracy)
# Merge all summaries into a single op.
merged_summary_op = tf.summary.merge_all()

saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V2)

# Writer for the TensorBoard logs.
logs_path = './logs'
summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
use_relu=True, weight_loss=0.04) layer_fc3 = create_fc_layer(input=layer_fc2, num_inputs=fc_layer_size2, num_outputs=num_classes, use_relu=False) y_pred = tf.nn.softmax(layer_fc3, name='y_pred') y_pred_cls = tf.argmax(y_pred, dimension=1) session.run(tf.global_variables_initializer()) cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc3, labels=y_true) cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') # L2 Regularization tf.add_to_collection('losses', cross_entropy_mean) cost = tf.add_n(tf.get_collection('losses'), name='total_loss') optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost) # 1e-4 correct_prediction = tf.equal(y_pred_cls, y_true_cls) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) session.run(tf.global_variables_initializer()) def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss, duration=None): acc = session.run(accuracy, feed_dict=feed_dict_train) val_acc = session.run(accuracy, feed_dict=feed_dict_validate) if duration is not None: examples_per_sec = batch_size / duration msg = "Training Epoch {0}, Iterations: {1} --- Training Accuracy: {2:>6.1%}," \ " Validation Accuracy: {3:>6.1%}, Validation Loss: {4:.3f}," \ " {5:.2f} examples/sec, {6:.2f} sec/iteration"
def build_graph(self, image, label):
    """Build the quantized conv-net graph and return its training cost.

    The three callables returned by ``get_dorefa(BITW, BITA, BITG)`` are used
    as follows: ``fw`` is applied to selected weight variables, ``fa`` to the
    activation after the first pooling layer, and ``fg`` after each of
    ``conv1`` .. ``conv5`` in place of the former BatchNorm + activation.

    Args:
        image: input image tensor; it is divided by 256.0 before use.
        label: label tensor, used by ``in_top_k`` and by the sparse softmax
            cross-entropy.

    Returns:
        The tensor named ``cost``: cross-entropy loss plus the weight-decay
        cost on variables matching ``fc.*/W``.
    """
    is_training = get_current_tower_context().is_training

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fc' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x):
        # Plain ReLU when BITA == 32, otherwise clip activations to [0, 1].
        if BITA == 32:
            return tf.nn.relu(x)
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    image = image / 256.0

    with remap_variables(binarize_weight), \
            argscope(BatchNorm, momentum=0.9, epsilon=1e-4),\
            argscope(Conv2D, use_bias=False):
        logits = (
            LinearWrap(image).Conv2D('conv0', 48, 5, padding='VALID', use_bias=True).MaxPooling(
                'pool0', 2, padding='SAME').apply(activate)
            # 18  (presumably the feature-map size at this point -- TODO confirm)
            .Conv2D('conv1', 64, 3, padding='SAME').apply(
                fg, 'fg1', is_training, kernel_size=18)
            # Core change of this model: fg replaces bn and activate.
            #.BatchNorm('bn1')
            #.apply(activate)
            .Conv2D('conv2', 64, 3, padding='SAME').MaxPooling(
                'pool1', 2, padding='SAME').apply(
                    fg, 'fg2', training=is_training, kernel_size=9)
            # Note: max-pooling has to happen before the quantization here,
            # because in the original model max-pooling came after bn.
            #.BatchNorm('bn2')
            #.MaxPooling('pool1', 2, padding='SAME')
            #.apply(activate)
            # 9
            .Conv2D('conv3', 128, 3, padding='VALID').apply(fg, 'fg3', is_training, kernel_size=7)
            #.BatchNorm('bn3')
            #.apply(activate)
            # 7
            .Conv2D('conv4', 128, 3, padding='SAME').apply(fg, 'fg4', is_training, kernel_size=7)
            #.BatchNorm('bn4')
            #.apply(activate)
            .Conv2D('conv5', 128, 3, padding='VALID').apply(fg, 'fg5', is_training, kernel_size=5)
            #.BatchNorm('bn5').apply(activate)
            # 5
            .Dropout(rate=0.5 if is_training else 0.0).Conv2D(
                'conv6', 512, 5, padding='VALID')
            # The last layer is not quantized.
            .BatchNorm('bn6').apply(nonlin).FullyConnected('fc1', 100)())
    tf.nn.softmax(logits, name='output')

    # compute the number of failed samples
    wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong-top1')
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')
    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

    add_param_summary(('.*/W', ['histogram', 'rms']))
    total_cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, total_cost)
    return total_cost
matrix_name = get_block_name(i, j) matrices[matrix_name] = tf.random_uniform([M, M], name=matrix_name) #intermediate_traces will store sum of trace for all sub-matrices on each machine intermediate_traces = {0:0,1:0,2:0,3:0,4:0} import datetime print "Before matrix creation:",datetime.datetime.now() for i in range(0, d): for j in range(0, d): with tf.device("/job:worker/task:%d" % ( (i+j) % 5 )): A = matrices[get_block_name(i, j)] B = matrices[get_block_name(j, i)] traceForSubMatrix = tf.trace(tf.matmul(A, B)) oldSum = intermediate_traces[(i+j) % 5] intermediate_traces[(i+j) % 5] = oldSum + traceForSubMatrix print "After matrix creation:",datetime.datetime.now() with tf.device("/job:worker/task:0"): #Calculate total trace by summing up all elements from intermediate_traces retval = tf.add_n(intermediate_traces.values()) print "After retval calculation:",datetime.datetime.now() config = tf.ConfigProto(log_device_placement=True) with tf.Session("grpc://vm-23-2:2222", config=config, graph=g) as sess: result = sess.run(retval) sess.close() print "Trace of the big matrix is = ", result print "SUCCESS"
# Softmax (logits) layer.
with tf.name_scope("softmmax"):
    W_soft = weight_variable("W_soft", [192, 10], stddev=1 / 192.0)
    b_soft = bias_variable("b_soft", [10], 0.0)
    h_soft = tf.add(tf.matmul(h_local4, W_soft), b_soft, name="h_soft")
    # The purpose of this helper is not yet clear, so it stays disabled.
    # _activation_summary(h_soft)

with tf.name_scope("loss"):
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=input_y,
                                                           logits=h_soft)
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name="cross_entropy")
    # The total loss is the sum of everything registered under "losses",
    # i.e. this cross-entropy plus whatever other code added to the collection.
    tf.add_to_collection("losses", cross_entropy_mean)
    loss = tf.add_n(tf.get_collection("losses"), name="total_loss")

# Compute the accuracy.
with tf.name_scope("accuracy"):
    # tf.argmax replaces the deprecated tf.arg_max; arguments are unchanged.
    correct_prediction = tf.equal(tf.argmax(input_y_onehot, 1),
                                  tf.argmax(h_soft, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))

with tf.name_scope("train"):
    opt_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
tf.train.start_queue_runners(sess=sess)

# Draw the charts.
def compute_weight_decay(vars):
    """Return the sum of ``tf.nn.l2_loss(v)`` over every ``v`` in ``vars``.

    Args:
        vars: iterable of tensors/variables to penalize.

    Returns:
        The ``tf.add_n`` of the individual L2 losses.
    """
    l2_terms = list(map(tf.nn.l2_loss, vars))
    return tf.add_n(l2_terms)
def regularize_cost(regex, func, name='regularize_cost'):
    """
    Sum a regularizer over the trainable variables whose op name matches a regex.

    The names of the matched variables are handed to ``_log_regularizer``.
    Variables are first filtered with ``ctx.filter_vars_by_vs_name``, so in
    replicated mode only those of the current tower are regularized.

    Args:
        regex (str): pattern searched (``re.search``) in each variable's op
            name, e.g. "conv.*/W"
        func: the regularization function, which takes a tensor and returns a
            scalar tensor. E.g., ``tf.contrib.layers.l2_regularizer``.
        name (str): name of the returned sum; the zero constant returned when
            nothing is regularized is named ``'empty_' + name``.

    Returns:
        tf.Tensor: the total regularization cost, or a float32 constant 0 when
        not training or when no variable matches.

    Example:
        .. code-block:: python

            cost = cost + regularize_cost("fc.*/W", l2_regularizer(1e-5))
    """
    ctx = get_current_tower_context()
    if not ctx.is_training:
        # The weight-decay cost cannot currently be built correctly at
        # inference: the vs_name used there can be '', which would make the
        # variable filter fail.
        return tf.constant(0, dtype=tf.float32, name='empty_' + name)

    # Shared variables are all kept; replicated ones are restricted to the
    # current tower.
    candidates = ctx.filter_vars_by_vs_name(tf.trainable_variables())
    graph = tf.get_default_graph()

    matched_names = []
    with tf.name_scope('regularize_cost'):
        costs = []
        for variable in candidates:
            if not re.search(regex, variable.op.name):
                continue
            with graph.colocate_with(variable):
                costs.append(func(variable))
            matched_names.append(variable.name)
        if not costs:
            return tf.constant(0, dtype=tf.float32, name='empty_' + name)

    # Strip the tower prefix from the names before logging them.
    if len(ctx.vs_name) > 0:
        tower_prefix = ctx.vs_name + '/'
        matched_names = [
            full_name[len(tower_prefix):]
            if full_name.startswith(tower_prefix) else full_name
            for full_name in matched_names
        ]
    _log_regularizer(', '.join(matched_names))
    return tf.add_n(costs, name=name)
def ssd_losses_old(logits, localisations, gclasses, glocalisations, gscores,
                   match_threshold=0.5, negative_ratio=3., alpha=1.,
                   label_smoothing=0., device='/cpu:0', scope=None):
    """Loss functions for training the SSD 300 VGG network.

    For every feature block this builds a cross-entropy loss on the positive
    anchors, a cross-entropy loss on a selected subset of negative anchors,
    and a localization loss. The per-block losses are summed and the totals
    are added to the 'EXTRA_LOSSES' collection. Nothing is returned.

    Arguments:
      logits: (list of) predictions logits Tensors;
      localisations: (list of) localisations Tensors;
      gclasses: (list of) groundtruth labels Tensors;
      glocalisations: (list of) groundtruth localisations Tensors;
      gscores: (list of) groundtruth score Tensors;
      match_threshold: anchors with a groundtruth score above this value are
        treated as positives;
      negative_ratio: multiplier on the number of positives used as the
        starting value for the number of negatives to select;
      alpha: weight applied to the positive mask in the localization loss;
      label_smoothing: not used by this function;
      device: device on which the loss ops are placed;
      scope: optional name scope (default name 'ssd_losses').
    """
    with tf.device(device):
        with tf.name_scope(scope, 'ssd_losses'):
            l_cross_pos = []
            l_cross_neg = []
            l_loc = []
            for i in range(len(logits)):
                dtype = logits[i].dtype
                with tf.name_scope('block_%i' % i):
                    # Sizing weight: product of dimensions 1..3 of the logits
                    # shape (obtained at rank 5).
                    wsize = tfe.get_shape(logits[i], rank=5)
                    wsize = wsize[1] * wsize[2] * wsize[3]

                    # Positive mask.
                    pmask = gscores[i] > match_threshold
                    fpmask = tf.cast(pmask, dtype)
                    n_positives = tf.reduce_sum(fpmask)

                    # Select some random negative entries.
                    # n_entries = np.prod(gclasses[i].get_shape().as_list())
                    # r_positive = n_positives / n_entries
                    # r_negative = negative_ratio * n_positives / (n_entries - n_positives)

                    # Negative mask.
                    # no_classes is 1 on positives and 0 elsewhere, so the
                    # negatives selected below get label 0.
                    no_classes = tf.cast(pmask, tf.int32)
                    predictions = slim.softmax(logits[i])
                    nmask = tf.logical_and(tf.logical_not(pmask),
                                           gscores[i] > -0.5)
                    fnmask = tf.cast(nmask, dtype)
                    # Class-0 probability on negatives; 1 - fnmask elsewhere.
                    nvalues = tf.where(nmask,
                                       predictions[:, :, :, :, 0],
                                       1. - fnmask)
                    nvalues_flat = tf.reshape(nvalues, [-1])
                    # Number of negative entries to select.
                    n_neg = tf.cast(negative_ratio * n_positives, tf.int32)
                    n_neg = tf.maximum(n_neg, tf.size(nvalues_flat) // 8)
                    n_neg = tf.maximum(n_neg, tf.shape(nvalues)[0] * 4)
                    max_neg_entries = 1 + tf.cast(tf.reduce_sum(fnmask), tf.int32)
                    n_neg = tf.minimum(n_neg, max_neg_entries)

                    # top_k on the negated values picks the n_neg smallest
                    # entries of nvalues; max_hard_pred is the largest of them.
                    val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
                    max_hard_pred = -val[-1]
                    # Final negative mask.
                    nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
                    fnmask = tf.cast(nmask, dtype)

                    # Add cross-entropy loss.
                    with tf.name_scope('cross_entropy_pos'):
                        fpmask = wsize * fpmask
                        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=logits[i], labels=gclasses[i])
                        loss = tf.losses.compute_weighted_loss(loss, fpmask)
                        l_cross_pos.append(loss)

                    with tf.name_scope('cross_entropy_neg'):
                        fnmask = wsize * fnmask
                        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=logits[i], labels=no_classes)
                        loss = tf.losses.compute_weighted_loss(loss, fnmask)
                        l_cross_neg.append(loss)

                    # Add localization loss: smooth L1, L2, ...
                    with tf.name_scope('localization'):
                        # Weights Tensor: positive mask + random negative.
                        # fpmask was already multiplied by wsize above.
                        weights = tf.expand_dims(alpha * fpmask, axis=-1)
                        loss = custom_layers.abs_smooth(
                            localisations[i] - glocalisations[i])
                        loss = tf.losses.compute_weighted_loss(loss, weights)
                        l_loc.append(loss)

            # Additional total losses...
            with tf.name_scope('total'):
                total_cross_pos = tf.add_n(l_cross_pos, 'cross_entropy_pos')
                total_cross_neg = tf.add_n(l_cross_neg, 'cross_entropy_neg')
                total_cross = tf.add(total_cross_pos, total_cross_neg, 'cross_entropy')
                total_loc = tf.add_n(l_loc, 'localization')

                # Add to EXTRA LOSSES TF.collection
                tf.add_to_collection('EXTRA_LOSSES', total_cross_pos)
                tf.add_to_collection('EXTRA_LOSSES', total_cross_neg)
                tf.add_to_collection('EXTRA_LOSSES', total_cross)
                tf.add_to_collection('EXTRA_LOSSES', total_loc)
def train():
    """Build the multi-GPU training graph and run the training loop.

    One tower is built per entry of ``cfgs.GPU_GROUP``. Each tower consumes
    one element of the input batch, and its losses are divided by the number
    of GPUs. The tower gradients are combined with ``sum_gradients`` when
    there is more than one tower, then applied together with an exponential
    moving average of the trainable variables. The loop periodically prints
    the losses, writes summaries and saves checkpoints. The process exits via
    ``sys.exit(0)`` when the printed total loss is NaN.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Number of towers = number of comma-separated entries in GPU_GROUP.
        num_gpu = len(cfgs.GPU_GROUP.strip().split(','))
        global_step = slim.get_or_create_global_step()
        lr = warmup_lr(cfgs.LR, global_step, cfgs.WARM_SETP, num_gpu)
        # lr = warmup_and_cosine_lr(cfgs.LR, global_step, cfgs.WARM_SETP, cfgs.MAX_ITERATION, num_gpu)
        tf.summary.scalar('lr', lr)

        optimizer = tf.train.MomentumOptimizer(lr, momentum=cfgs.MOMENTUM)
        retinanet = build_whole_network_r3det_csl.DetectionNetwork(
            base_network_name=cfgs.NET_NAME, is_training=True)

        with tf.name_scope('get_batch'):
            if cfgs.IMAGE_PYRAMID:
                # Pick one of the configured short-side lengths at random.
                shortside_len_list = tf.constant(cfgs.IMG_SHORT_SIDE_LEN)
                shortside_len = tf.random_shuffle(shortside_len_list)[0]

            else:
                shortside_len = cfgs.IMG_SHORT_SIDE_LEN

            img_name_batch, img_batch, gtboxes_and_label_batch, num_objects_batch, img_h_batch, img_w_batch = \
                next_batch(dataset_name=cfgs.DATASET_NAME,
                           batch_size=cfgs.BATCH_SIZE * num_gpu,
                           shortside_len=shortside_len,
                           is_training=True)

        # data processing
        # Only batch elements 0 .. num_gpu - 1 are used, one per tower.
        inputs_list = []
        for i in range(num_gpu):
            img = tf.expand_dims(img_batch[i], axis=0)
            if cfgs.NET_NAME in [
                    'resnet152_v1d', 'resnet101_v1d', 'resnet50_v1d'
            ]:
                img = img / tf.constant([cfgs.PIXEL_STD])

            gtboxes_and_label_r = tf.py_func(backward_convert,
                                             inp=[gtboxes_and_label_batch[i]],
                                             Tout=tf.float32)
            gtboxes_and_label_r = tf.reshape(gtboxes_and_label_r, [-1, 6])

            gtboxes_and_label_h = get_horizen_minAreaRectangle(
                gtboxes_and_label_batch[i])
            gtboxes_and_label_h = tf.reshape(gtboxes_and_label_h, [-1, 5])

            num_objects = num_objects_batch[i]
            num_objects = tf.cast(tf.reshape(num_objects, [
                -1,
            ]), tf.float32)

            img_h = img_h_batch[i]
            img_w = img_w_batch[i]

            inputs_list.append([
                img, gtboxes_and_label_h, gtboxes_and_label_r, num_objects,
                img_h, img_w
            ])

        tower_grads = []
        biases_regularizer = tf.no_regularizer
        weights_regularizer = tf.contrib.layers.l2_regularizer(
            cfgs.WEIGHT_DECAY)

        # Running totals of each named loss, averaged over the towers.
        total_loss_dict = {
            'cls_loss': tf.constant(0., tf.float32),
            'reg_loss': tf.constant(0., tf.float32),
            'refine_cls_loss': tf.constant(0., tf.float32),
            'refine_reg_loss': tf.constant(0., tf.float32),
            'angle_cls_loss': tf.constant(0., tf.float32),
            'total_losses': tf.constant(0., tf.float32),
        }

        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_gpu):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % i):
                        # Variables are placed on the CPU while the tower's
                        # ops are placed on GPU i.
                        with slim.arg_scope(
                                [slim.model_variable, slim.variable],
                                device='/device:CPU:0'):
                            with slim.arg_scope(
                                    [
                                        slim.conv2d, slim.conv2d_in_plane,
                                        slim.conv2d_transpose,
                                        slim.separable_conv2d,
                                        slim.fully_connected
                                    ],
                                    weights_regularizer=weights_regularizer,
                                    biases_regularizer=biases_regularizer,
                                    biases_initializer=tf.constant_initializer(
                                        0.0)):

                                gtboxes_and_label_h, gtboxes_and_label_r = tf.py_func(
                                    get_gtboxes_and_label,
                                    inp=[
                                        inputs_list[i][1], inputs_list[i][2],
                                        inputs_list[i][3]
                                    ],
                                    Tout=[tf.float32, tf.float32])
                                gtboxes_and_label_h = tf.reshape(
                                    gtboxes_and_label_h, [-1, 5])
                                gtboxes_and_label_r = tf.reshape(
                                    gtboxes_and_label_r, [-1, 6])

                                # With a 180-degree angle range the rotated
                                # boxes are converted first, and the smooth
                                # label is built from the converted boxes.
                                if cfgs.ANGLE_RANGE == 180:
                                    gtboxes_and_label_r_ = tf.py_func(
                                        coordinate_present_convert,
                                        inp=[gtboxes_and_label_r, -1],
                                        Tout=tf.float32)
                                    gtboxes_and_label_r_ = tf.reshape(
                                        gtboxes_and_label_r_, [-1, 6])

                                    gt_smooth_label = tf.py_func(
                                        angle_smooth_label,
                                        inp=[
                                            gtboxes_and_label_r_[:, -2],
                                            cfgs.ANGLE_RANGE, cfgs.LABEL_TYPE,
                                            cfgs.RADUIUS, cfgs.OMEGA
                                        ],
                                        Tout=tf.float32)
                                else:
                                    gt_smooth_label = tf.py_func(
                                        angle_smooth_label,
                                        inp=[
                                            gtboxes_and_label_r[:, -2],
                                            cfgs.ANGLE_RANGE, cfgs.LABEL_TYPE,
                                            cfgs.RADUIUS, cfgs.OMEGA
                                        ],
                                        Tout=tf.float32)

                                gt_smooth_label = tf.reshape(
                                    gt_smooth_label,
                                    [-1, cfgs.ANGLE_RANGE // cfgs.OMEGA])

                                img = inputs_list[i][0]
                                # The last two entries are img_h and img_w.
                                img_shape = inputs_list[i][-2:]
                                img = tf.image.crop_to_bounding_box(
                                    image=img,
                                    offset_height=0,
                                    offset_width=0,
                                    target_height=tf.cast(
                                        img_shape[0], tf.int32),
                                    target_width=tf.cast(
                                        img_shape[1], tf.int32))

                                outputs = retinanet.build_whole_detection_network(
                                    input_img_batch=img,
                                    gtboxes_batch_h=gtboxes_and_label_h,
                                    gtboxes_batch_r=gtboxes_and_label_r,
                                    gt_smooth_label=gt_smooth_label,
                                    gpu_id=i)
                                gtboxes_in_img_h = draw_boxes_with_categories(
                                    img_batch=img,
                                    boxes=gtboxes_and_label_h[:, :-1],
                                    labels=gtboxes_and_label_h[:, -1],
                                    method=0,
                                    is_csl=True)
                                gtboxes_in_img_r = draw_boxes_with_categories(
                                    img_batch=img,
                                    boxes=gtboxes_and_label_r[:, :-1],
                                    labels=gtboxes_and_label_r[:, -1],
                                    method=1,
                                    is_csl=True)
                                tf.summary.image(
                                    'Compare/gtboxes_h_gpu:%d' % i,
                                    gtboxes_in_img_h)
                                tf.summary.image(
                                    'Compare/gtboxes_r_gpu:%d' % i,
                                    gtboxes_in_img_r)

                                if cfgs.ADD_BOX_IN_TENSORBOARD:
                                    detections_in_img = draw_boxes_with_categories_and_scores(
                                        img_batch=img,
                                        boxes=outputs[0],
                                        scores=outputs[1],
                                        labels=outputs[2],
                                        method=1,
                                        is_csl=True)
                                    tf.summary.image(
                                        'Compare/final_detection_gpu:%d' % i,
                                        detections_in_img)

                                    detections_angle_in_img = draw_boxes_with_categories_and_scores(
                                        img_batch=img,
                                        boxes=outputs[3],
                                        scores=outputs[1],
                                        labels=outputs[2],
                                        method=1,
                                        is_csl=True)
                                    tf.summary.image(
                                        'Compare/final_detection_angle_gpu:%d'
                                        % i, detections_angle_in_img)

                                # The last output is the dict of named losses.
                                loss_dict = outputs[-1]

                                total_losses = 0.0
                                for k in loss_dict.keys():
                                    total_losses += loss_dict[k]
                                    total_loss_dict[
                                        k] += loss_dict[k] / num_gpu

                                total_losses = total_losses / num_gpu
                                total_loss_dict['total_losses'] += total_losses

                                # The regularization losses are added to the
                                # last tower's loss only.
                                if i == num_gpu - 1:
                                    regularization_losses = tf.get_collection(
                                        tf.GraphKeys.REGULARIZATION_LOSSES)
                                    # weight_decay_loss = tf.add_n(slim.losses.get_regularization_losses())
                                    total_losses = total_losses + tf.add_n(
                                        regularization_losses)

                        # Later towers reuse the variables created by the
                        # first one.
                        tf.get_variable_scope().reuse_variables()
                        grads = optimizer.compute_gradients(total_losses)
                        if cfgs.GRADIENT_CLIPPING_BY_NORM is not None:
                            grads = slim.learning.clip_gradient_norms(
                                grads, cfgs.GRADIENT_CLIPPING_BY_NORM)
                        tower_grads.append(grads)

        # Summary tag is '<first word of the key>/<key>'.
        for k in total_loss_dict.keys():
            tf.summary.scalar('{}/{}'.format(k.split('_')[0], k),
                              total_loss_dict[k])

        if len(tower_grads) > 1:
            grads = sum_gradients(tower_grads)
        else:
            grads = tower_grads[0]

        if cfgs.MUTILPY_BIAS_GRADIENT is not None:
            # Scale bias gradients by the configured factor and gradients of
            # variables named 'conv_new' by 3.
            final_gvs = []
            with tf.variable_scope('Gradient_Mult'):
                for grad, var in grads:
                    scale = 1.
                    if '/biases:' in var.name:
                        scale *= cfgs.MUTILPY_BIAS_GRADIENT
                    if 'conv_new' in var.name:
                        scale *= 3.
                    if not np.allclose(scale, 1.0):
                        grad = tf.multiply(grad, scale)

                    final_gvs.append((grad, var))
            apply_gradient_op = optimizer.apply_gradients(
                final_gvs, global_step=global_step)
        else:
            apply_gradient_op = optimizer.apply_gradients(
                grads, global_step=global_step)

        variable_averages = tf.train.ExponentialMovingAverage(
            0.9999, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        train_op = tf.group(apply_gradient_op, variables_averages_op)
        # train_op = optimizer.apply_gradients(final_gvs, global_step=global_step)
        summary_op = tf.summary.merge_all()

        restorer, restore_ckpt = retinanet.get_restorer()
        saver = tf.train.Saver(max_to_keep=5)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        tfconfig = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
        tfconfig.gpu_options.allow_growth = True
        with tf.Session(config=tfconfig) as sess:
            sess.run(init_op)

            # sess.run(tf.initialize_all_variables())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)

            summary_path = os.path.join(cfgs.SUMMARY_PATH, cfgs.VERSION)
            tools.mkdir(summary_path)
            summary_writer = tf.summary.FileWriter(summary_path,
                                                   graph=sess.graph)

            if not restorer is None:
                restorer.restore(sess, restore_ckpt)
                print('restore model')

            # Each loop iteration is reported as num_gpu steps.
            for step in range(cfgs.MAX_ITERATION // num_gpu):
                training_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))

                if step % cfgs.SHOW_TRAIN_INFO_INTE != 0 and step % cfgs.SMRY_ITER != 0:
                    # Plain training step: nothing to print or summarize.
                    _, global_stepnp = sess.run([train_op, global_step])

                else:
                    if step % cfgs.SHOW_TRAIN_INFO_INTE == 0 and step % cfgs.SMRY_ITER != 0:
                        # Timed step whose losses are printed.
                        start = time.time()

                        _, global_stepnp, total_loss_dict_ = \
                            sess.run([train_op, global_step, total_loss_dict])

                        end = time.time()

                        print('***' * 20)
                        print("""%s: global_step:%d current_step:%d""" %
                              (training_time, (global_stepnp - 1) * num_gpu,
                               step * num_gpu))
                        print("""per_cost_time:%.3fs""" %
                              ((end - start) / num_gpu))
                        loss_str = ''
                        for k in total_loss_dict_.keys():
                            loss_str += '%s:%.3f\n' % (k, total_loss_dict_[k])
                        print(loss_str)

                        # NOTE(review): exits with status 0 on a NaN loss --
                        # confirm this is intended.
                        if np.isnan(total_loss_dict_['total_losses']):
                            sys.exit(0)

                    else:
                        if step % cfgs.SMRY_ITER == 0:
                            # Step that also writes the merged summaries.
                            _, global_stepnp, summary_str = sess.run(
                                [train_op, global_step, summary_op])
                            summary_writer.add_summary(
                                summary_str, (global_stepnp - 1) * num_gpu)
                            summary_writer.flush()

                # Save periodically and on the last iteration.
                if (step > 0 and step %
                    (cfgs.SAVE_WEIGHTS_INTE // num_gpu)
                        == 0) or (step >= cfgs.MAX_ITERATION // num_gpu - 1):

                    save_dir = os.path.join(cfgs.TRAINED_CKPT, cfgs.VERSION)
                    if not os.path.exists(save_dir):
                        os.mkdir(save_dir)

                    save_ckpt = os.path.join(
                        save_dir, '{}_'.format(cfgs.DATASET_NAME) + str(
                            (global_stepnp - 1) * num_gpu) + 'model.ckpt')
                    saver.save(sess, save_ckpt)
                    print(' weights had been saved')

            coord.request_stop()
            coord.join(threads)
def add_loss(self):
    '''Build the training loss and store its components on the model.

    Sets ``before_loss``, ``after_loss``, ``stop_token_loss``,
    ``regularization_loss``, ``linear_loss`` and their sum ``loss``.
    The model's initialize step must already have been called.
    '''
    with tf.variable_scope('loss'):
        hp = self.hparams

        if not hp.mask_decoder:
            # Unmasked losses: plain MSE before and after the postnet, and
            # mean sigmoid cross-entropy for the <stop_token> prediction.
            before = tf.losses.mean_squared_error(self.mel_targets,
                                                  self.decoder_output)
            after = tf.losses.mean_squared_error(self.mel_targets,
                                                 self.mel_outputs)
            stop_token_xent = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.stop_token_targets,
                logits=self.stop_token_prediction)
            stop_token_loss = tf.reduce_mean(stop_token_xent)
        else:
            # Masked variants of the same three losses, given the target
            # lengths.
            before = MaskedMSE(self.mel_targets, self.decoder_output,
                               self.targets_lengths, hparams=self.hparams)
            after = MaskedMSE(self.mel_targets, self.mel_outputs,
                              self.targets_lengths, hparams=self.hparams)
            stop_token_loss = MaskedSigmoidCrossEntropy(
                self.stop_token_targets, self.stop_token_prediction,
                self.targets_lengths, hparams=self.hparams)

        if hp.predict_linear:
            # Linear loss, from
            # https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            # The loss for frequencies under 2000 Hz is prioritized.
            # NOTE(review): the bin count is derived from hp.num_mels while
            # the slice is taken on the linear outputs -- confirm.
            l1 = tf.abs(self.linear_targets - self.linear_outputs)
            n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_mels)
            mean_all_bins = tf.reduce_mean(l1)
            mean_priority_bins = tf.reduce_mean(l1[:, :, 0:n_priority_freq])
            linear_loss = 0.5 * mean_all_bins + 0.5 * mean_priority_bins
        else:
            linear_loss = 0.

        # Regularization weight, optionally rescaled by the mel value range.
        if hp.tacotron_scale_regularization:
            if hp.symmetric_mels:
                reg_weight_scaler = 1. / (2 * hp.max_abs_value)
            else:
                reg_weight_scaler = 1. / (hp.max_abs_value)
            reg_weight = hp.tacotron_reg_weight * reg_weight_scaler
        else:
            reg_weight = hp.tacotron_reg_weight

        # L2 penalty over every trainable variable whose name contains
        # neither 'bias' nor 'Bias'.
        l2_terms = [
            tf.nn.l2_loss(variable)
            for variable in tf.trainable_variables()
            if 'bias' not in variable.name and 'Bias' not in variable.name
        ]
        regularization = tf.add_n(l2_terms) * reg_weight

        # Store the individual terms and their sum.
        self.before_loss = before
        self.after_loss = after
        self.stop_token_loss = stop_token_loss
        self.regularization_loss = regularization
        self.linear_loss = linear_loss

        self.loss = (self.before_loss + self.after_loss
                     + self.stop_token_loss + self.regularization_loss
                     + self.linear_loss)
def position_sensitive_crop_regions(image, boxes, crop_size, num_spatial_bins,
                                    global_pool):
    """Position-sensitive crop and pool of rectangular regions from a feature grid.

    Every box is divided into `spatial_bins_y` x `spatial_bins_x` bins. Each
    bin is cropped and bilinearly resized from its own subset of the image
    channels, which reduces `depth` by a factor of
    `(spatial_bins_y * spatial_bins_x)`.

    With global_pool=True this is a differentiable version of the
    position-sensitive RoI pooling used in the
    [R-FCN detection system](https://arxiv.org/abs/1605.06409). With
    global_pool=False it is a differentiable version of the
    position-sensitive assembling operation used in
    [instance FCN](https://arxiv.org/abs/1603.08678).

    Args:
      image: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
        `int16`, `int32`, `int64`, `half`, `float32`, `float64`. A 3-D tensor
        of shape `[image_height, image_width, depth]`. Both `image_height`
        and `image_width` need to be positive.
      boxes: A `Tensor` of type `float32`. A 2-D tensor of shape
        `[num_boxes, 4]`. Each box is given in normalized coordinates
        `[y1, x1, y2, x2]`; a normalized `y` maps to the image coordinate
        `y * (image_height - 1)`, so `[0, 1]` maps to
        `[0, image_height - 1]`. `y1 > y2` is allowed and yields an up-down
        flipped crop. The width dimension is treated the same way.
      crop_size: A list of two integers `[crop_height, crop_width]`. All
        cropped patches are resized to this size; the aspect ratio of the
        image content is not preserved. Both values need to be positive.
      num_spatial_bins: A list of two integers
        `[spatial_bins_y, spatial_bins_x]`, the number of position-sensitive
        bins in y and x. Both should be >= 1, each must divide the matching
        `crop_size` entry, and their product should divide the number of
        image channels. Suggested value from the R-FCN paper: [3, 3].
      global_pool: A boolean. If True, the features assembled from the
        position-sensitive score maps are average-pooled globally. If False,
        the position-pooled features are kept without global pooling over the
        spatial coordinates. global_pool=True is equivalent to, but more
        efficient than, global_pool=False followed by global average pooling.

    Returns:
      position_sensitive_features: A 4-D tensor of shape
        `[num_boxes, K, K, crop_channels]`, where
        `crop_channels = depth / (spatial_bins_y * spatial_bins_x)`, K = 1
        when global_pool is True (average-pooled cropped regions) and
        K = crop_size when global_pool is False.

    Raises:
      ValueError: Raised in four situations:
        `num_spatial_bins` is not >= 1;
        `num_spatial_bins` does not divide `crop_size`;
        `(spatial_bins_y*spatial_bins_x)` does not divide `depth`;
        `bin_crop_size` is not square when global_pool=False due to the
          constraint in function space_to_depth.
    """
    # Validate the bin layout and derive the per-bin crop size.
    total_bins = 1
    bin_crop_size = []
    for bins_along_axis, crop_extent in zip(num_spatial_bins, crop_size):
        if bins_along_axis < 1:
            raise ValueError('num_spatial_bins should be >= 1')
        if crop_extent % bins_along_axis != 0:
            raise ValueError(
                'crop_size should be divisible by num_spatial_bins')
        total_bins *= bins_along_axis
        bin_crop_size.append(crop_extent // bins_along_axis)

    if not (global_pool or bin_crop_size[0] == bin_crop_size[1]):
        raise ValueError('Only support square bin crop size for now.')

    ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=1)
    spatial_bins_y, spatial_bins_x = num_spatial_bins

    # Split each box into spatial_bins_y * spatial_bins_x bins, row by row.
    position_sensitive_boxes = []
    for row in range(spatial_bins_y):
        bin_height = (ymax - ymin) / spatial_bins_y
        for col in range(spatial_bins_x):
            bin_width = (xmax - xmin) / spatial_bins_x
            top = ymin + row * bin_height
            left = xmin + col * bin_width
            bottom = ymin + (row + 1) * bin_height
            right = xmin + (col + 1) * bin_width
            position_sensitive_boxes.append(
                tf.stack([top, left, bottom, right], axis=1))

    # One channel group per bin.
    image_splits = tf.split(
        value=image, num_or_size_splits=total_bins, axis=2)

    image_crops = []
    for channel_group, bin_boxes in zip(image_splits,
                                        position_sensitive_boxes):
        shapes_known = (channel_group.shape.is_fully_defined()
                        and bin_boxes.shape.is_fully_defined())
        if not shapes_known:
            crop = tf.image.crop_and_resize(
                tf.expand_dims(channel_group, 0), bin_boxes,
                tf.zeros(tf.shape(boxes)[0], dtype=tf.int32), bin_crop_size)
        else:
            # Fully static shapes take the matmul-based crop path.
            batched_crop = matmul_crop_and_resize(
                tf.expand_dims(channel_group, axis=0),
                tf.expand_dims(bin_boxes, axis=0),
                bin_crop_size)
            crop = tf.squeeze(batched_crop, axis=0)
        image_crops.append(crop)

    if global_pool:
        # Average over all bins, then over the spatial positions in a bin.
        bin_average = tf.add_n(image_crops) / len(image_crops)
        return tf.reduce_mean(bin_average, [1, 2], keep_dims=True)

    # Reorder height/width to the depth channel.
    block_size = bin_crop_size[0]
    if block_size >= 2:
        image_crops = [
            tf.space_to_depth(crop, block_size=block_size)
            for crop in image_crops
        ]

    # Pack the crops so the first dimension indexes the position-sensitive
    # boxes.
    position_sensitive_features = tf.stack(image_crops, axis=0)

    # Unroll the position-sensitive boxes to spatial positions.
    unrolled = tf.batch_to_space_nd(
        position_sensitive_features,
        block_shape=[1] + num_spatial_bins,
        crops=tf.zeros((3, 2), dtype=tf.int32))
    position_sensitive_features = tf.squeeze(unrolled, squeeze_dims=[0])

    # Reorder the depth channel back.
    if block_size >= 2:
        position_sensitive_features = tf.depth_to_space(
            position_sensitive_features, block_size=block_size)

    return position_sensitive_features
def resnet_model_fn(features, labels, mode, params):
    """The model_fn for ResNet to be used with TPUEstimator.

    Args:
        features: `Tensor` of batched images, or a dict holding that tensor
            under the key `'feature'`. If transpose_input is enabled, it is
            transposed to device layout and reshaped to 1D tensor.
        labels: `Tensor` of labels for the data samples.
        mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`.
        params: `dict` of parameters passed to the model from the
            TPUEstimator, `params['batch_size']` is always provided and should
            be used as the effective batch size. Keys read here:
            `data_format`, `transpose_input`, `dropblock_groups`,
            `train_steps`, `dropblock_keep_prob`, `dropblock_size`,
            `resnet_depth`, `num_label_classes`, `precision`, `batch_size`,
            `label_smoothing`, `weight_decay`, `num_train_images`,
            `train_batch_size`, `enable_lars`, `momentum`, `use_tpu`,
            `skip_host_call` and `iterations_per_loop`.

    Returns:
        A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
        `TPUEstimatorSpec` for the model.

    Raises:
        ValueError: if `params['dropblock_groups']` names a block group
            outside 1..4, or if `params['precision']` is neither
            `'bfloat16'` nor `'float32'`.
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    if params['data_format'] == 'channels_first':
        assert not params['transpose_input']  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])

    if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
        image_size = tf.sqrt(
            tf.shape(features)[0] / (3 * tf.shape(labels)[0]))
        features = tf.reshape(features, [image_size, image_size, 3, -1])
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    # DropBlock keep_prob for the 4 block groups of ResNet architecture.
    # None means applying no DropBlock at the corresponding block group.
    dropblock_keep_probs = [None] * 4
    if params['dropblock_groups']:
        # Scheduled keep_prob for DropBlock: moves linearly from 1 towards
        # params['dropblock_keep_prob'] as the global step approaches
        # params['train_steps'].
        train_steps = tf.cast(params['train_steps'], tf.float32)
        current_step = tf.cast(tf.train.get_global_step(), tf.float32)
        current_ratio = current_step / train_steps
        dropblock_keep_prob = (
            1 - current_ratio * (1 - params['dropblock_keep_prob']))

        # Computes DropBlock keep_prob for different block groups of ResNet.
        dropblock_groups = [
            int(x) for x in params['dropblock_groups'].split(',')
        ]
        for block_group in dropblock_groups:
            if block_group < 1 or block_group > 4:
                raise ValueError(
                    'dropblock_groups should be a comma separated list of '
                    'integers between 1 and 4 (dropblock_groups: {}).'.format(
                        params['dropblock_groups']))
            # The drop rate is divided by 4 for every group below the last.
            dropblock_keep_probs[block_group - 1] = 1 - (
                (1 - dropblock_keep_prob) / 4.0**(4 - block_group))

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        network = resnet_model.resnet_v1(
            resnet_depth=params['resnet_depth'],
            num_classes=params['num_label_classes'],
            dropblock_size=params['dropblock_size'],
            dropblock_keep_probs=dropblock_keep_probs,
            data_format=params['data_format'])
        return network(
            inputs=features,
            is_training=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['precision'] == 'bfloat16':
        with tf.contrib.tpu.bfloat16_scope():
            logits = build_network()
        logits = tf.cast(logits, tf.float32)
    elif params['precision'] == 'float32':
        logits = build_network()
    else:
        # Fail here with a clear message instead of hitting an unbound
        # `logits` further down.
        raise ValueError(
            "precision should be either 'bfloat16' or 'float32' "
            '(precision: {}).'.format(params['precision']))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2
    # regularization.
    one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params['label_smoothing'])

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Compute the current epoch and associated learning rate from
        # global_step.
        global_step = tf.train.get_global_step()
        steps_per_epoch = (
            params['num_train_images'] / params['train_batch_size'])
        current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch)

        # LARS is a large batch optimizer. LARS enables higher accuracy at
        # batch 16K and larger batch sizes.
        if params['enable_lars']:
            # Only used for the summary below; the LARS optimizer is built
            # from current_epoch and params.
            learning_rate = 0.0
            optimizer = lars_util.init_lars_optimizer(current_epoch, params)
        else:
            learning_rate = lottery.get_lr_tensor(params)
            if learning_rate is None:
                learning_rate = learning_rate_schedule(params, current_epoch)
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=params['momentum'],
                use_nesterov=True)

        if params['use_tpu']:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To
            # the user, this should look like regular synchronous training.
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency
        # to the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if not params['skip_host_call']:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `host_call_fn`, provide as part
                of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                    gs: `Tensor` with shape `[batch]` for the global_step.
                    loss: `Tensor` with shape `[batch]` for the training loss.
                    lr: `Tensor` with shape `[batch]` for the learning_rate.
                    ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                    List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop']
                # times after one TPU loop is finished, setting max_queue
                # value to the same as number of iterations will make the
                # summary writer only flush the data to storage once per loop.
                with summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=params['iterations_per_loop']).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', loss[0], step=gs)
                        summary.scalar('learning_rate', lr[0], step=gs)
                        summary.scalar('current_epoch', ce[0], step=gs)

                        return summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for
            # Tensorboard, the summary op needs to be run on the host CPU via
            # host_call. host_call expects [batch_size, ...] Tensors, thus
            # reshape to introduce a batch dimension. These Tensors are
            # implicitly concatenated to [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])
    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
                labels: `Tensor` with shape `[batch]`.
                logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
                A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics)
def build(self):
    """Build the multi-GPU training graph and initialise its variables.

    Inside `self._graph`, this creates the optimizer (via `self.get_opt()`),
    the shared placeholders, and one loss tower per GPU
    (`cfg.TRAIN.num_gpu` towers). Per-tower gradients are combined with
    `self.average_gradients` and applied in a single train op.

    Side effects:
        * `self.summaries` is set to the SUMMARIES collection of the most
          recently built tower, and further summaries are registered through
          `self.add_summary`.
        * `self.inputs`, `self.outputs` and `self.val_outputs` are set to the
          lists of placeholders / tensors built here.
        * `tf.global_variables_initializer()` is run in `self.sess`.
    """
    with self._graph.as_default(), tf.device('/cpu:0'):
        # Create an optimizer that performs gradient descent.
        opt, lr, global_step = self.get_opt()

        # Placeholders shared by every tower. `keep_prob` is only exposed
        # through self.inputs; nothing else in this method consumes it.
        keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        L2_reg = tf.placeholder(tf.float32, name="L2_reg")
        training = tf.placeholder(tf.bool, name="training_flag")

        # Running sum of the per-tower losses, used for reporting.
        total_loss_to_show = 0.
        # One placeholder per tower is appended to each of these lists.
        images_place_holder_list = []
        labels_place_holder_list = []
        boxes_place_holder_list = []

        # Defaults handed to the slim layers through the arg_scope below.
        weights_initializer = slim.xavier_initializer()
        biases_initializer = tf.constant_initializer(0.)
        biases_regularizer = tf.no_regularizer
        weights_regularizer = tf.contrib.layers.l2_regularizer(L2_reg)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(cfg.TRAIN.num_gpu):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % (i)) as scope:
                        # Variables created through slim are pinned to the
                        # CPU even though the tower's ops target the GPU.
                        with slim.arg_scope(
                                [slim.model_variable, slim.variable],
                                device='/cpu:0'):
                            # Per-tower input placeholders.
                            images_ = tf.placeholder(tf.float32,
                                                     [None, None, None, 3],
                                                     name="images")
                            boxes_ = tf.placeholder(
                                tf.float32,
                                [cfg.TRAIN.batch_size, None, 4],
                                name="input_boxes")
                            labels_ = tf.placeholder(
                                tf.int64, [cfg.TRAIN.batch_size, None],
                                name="input_labels")

                            images_place_holder_list.append(images_)
                            labels_place_holder_list.append(labels_)
                            boxes_place_holder_list.append(boxes_)

                            with slim.arg_scope([slim.conv2d, slim.conv2d_in_plane, \
                                                 slim.conv2d_transpose, slim.separable_conv2d,
                                                 slim.fully_connected],
                                                weights_regularizer=weights_regularizer,
                                                biases_regularizer=biases_regularizer,
                                                weights_initializer=weights_initializer,
                                                biases_initializer=biases_initializer):
                                reg_loss, cla_loss, l2_loss = self.tower_loss(
                                    scope, images_, labels_, boxes_, L2_reg,
                                    training)

                                # With several GPUs the L2 term is added to
                                # the loss of the last tower only; the other
                                # towers use regression + classification.
                                if i == cfg.TRAIN.num_gpu - 1:
                                    total_loss = tf.add_n(
                                        [reg_loss, cla_loss, l2_loss])
                                else:
                                    total_loss = tf.add_n(
                                        [reg_loss, cla_loss])
                            total_loss_to_show += total_loss
                            # Reuse variables for the next tower.
                            tf.get_variable_scope().reuse_variables()

                            # When batch norm is used, only the update ops of
                            # the final tower are kept (this is overwritten on
                            # every iteration). Ideally we should grab the
                            # updates from all towers, but these stats
                            # accumulate extremely fast so we can ignore the
                            # stats from the other towers without significant
                            # detriment.
                            bn_update_ops = tf.get_collection(
                                tf.GraphKeys.UPDATE_OPS, scope=scope)

                            # Retain the summaries from the final tower.
                            self.summaries = tf.get_collection(
                                tf.GraphKeys.SUMMARIES, scope)

                            # Calculate the gradients for the batch of data on
                            # this tower.
                            grads = opt.compute_gradients(total_loss)

                            # Keep track of the gradients across all towers.
                            tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = self.average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        self.add_summary(tf.summary.scalar('learning_rate', lr))
        self.add_summary(
            tf.summary.scalar('total_loss', total_loss_to_show))
        # reg_loss / cla_loss / l2_loss still hold the values from the last
        # loop iteration, i.e. the last tower.
        self.add_summary(tf.summary.scalar('loc_loss', reg_loss))
        self.add_summary(tf.summary.scalar('cla_loss', cla_loss))
        self.add_summary(tf.summary.scalar('l2_loss', l2_loss))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                self.add_summary(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads,
                                                global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            self.add_summary(tf.summary.histogram(var.op.name, var))

        if self.ema_weights:
            # Track the moving averages of all trainable variables.
            variable_averages = tf.train.ExponentialMovingAverage(
                0.9, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())
            # Group all updates to into a single train op.
            train_op = tf.group(apply_gradient_op, variables_averages_op,
                                *bn_update_ops)
        else:
            train_op = tf.group(apply_gradient_op, *bn_update_ops)

        # Feed and fetch lists stored on the instance.
        self.inputs = [
            images_place_holder_list, boxes_place_holder_list,
            labels_place_holder_list, keep_prob, L2_reg, training
        ]
        self.outputs = [
            train_op, total_loss_to_show, reg_loss, cla_loss, l2_loss, lr
        ]
        # Same as self.outputs but without the train op.
        self.val_outputs = [
            total_loss_to_show, reg_loss, cla_loss, l2_loss, lr
        ]

        # Initialise all variables.
        init = tf.global_variables_initializer()
        self.sess.run(init)
def train():
    """Build the training graph, then train and evaluate for MAX_EPOCH epochs.

    The model, loss, summaries and optimizer are created on the GPU selected
    by GPU_INDEX. After every epoch (one `train_one_epoch` followed by one
    `eval_one_epoch`) the variables are saved to `LOG_DIR/model.ckpt`.

    The session and both summary writers are closed when the function exits,
    whether training finished or an exception was raised.

    Raises:
        ValueError: if OPTIMIZER is neither 'momentum' nor 'adam'.
    """
    with tf.Graph().as_default():
        with tf.device('/gpu:' + str(GPU_INDEX)):
            pointclouds_pl, labels_pl = MODEL.placeholder_inputs(
                BATCH_SIZE, NUM_POINT)
            is_training_pl = tf.placeholder(tf.bool, shape=())

            # Note the global_step=batch parameter to minimize.
            # That tells the optimizer to helpfully increment the 'batch'
            # parameter for you every time it trains.
            batch = tf.get_variable('batch', [],
                                    initializer=tf.constant_initializer(0),
                                    trainable=False)
            bn_decay = get_bn_decay(batch)
            tf.summary.scalar('bn_decay', bn_decay)

            # Get model and loss. MODEL.get_loss is called for its side
            # effect: the individual terms are read back from the 'losses'
            # collection.
            pred, end_points = MODEL.get_model(pointclouds_pl,
                                               is_training_pl,
                                               bn_decay=bn_decay,
                                               num_class=NUM_CLASSES)
            MODEL.get_loss(pred, labels_pl, end_points)
            losses = tf.get_collection('losses')
            total_loss = tf.add_n(losses, name='total_loss')
            tf.summary.scalar('total_loss', total_loss)
            # NOTE(review): total_loss is summarised a second time by this
            # loop; kept as-is so existing summary tags do not change.
            for l in losses + [total_loss]:
                tf.summary.scalar(l.op.name, l)

            correct = tf.equal(tf.argmax(pred, 1), tf.to_int64(labels_pl))
            accuracy = (tf.reduce_sum(tf.cast(correct, tf.float32))
                        / float(BATCH_SIZE))
            tf.summary.scalar('accuracy', accuracy)

            print("--- Get training operator")
            # Get training operator
            learning_rate = get_learning_rate(batch)
            tf.summary.scalar('learning_rate', learning_rate)
            if OPTIMIZER == 'momentum':
                optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                       momentum=MOMENTUM)
            elif OPTIMIZER == 'adam':
                optimizer = tf.train.AdamOptimizer(learning_rate)
            else:
                # Fail with a clear message instead of an unbound
                # `optimizer` on the next line.
                raise ValueError(
                    "OPTIMIZER should be 'momentum' or 'adam', got %r"
                    % (OPTIMIZER,))
            train_op = optimizer.minimize(total_loss, global_step=batch)

            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()

        # Create a session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.log_device_placement = False
        sess = tf.Session(config=config)

        # Writers are collected as they are created so that the `finally`
        # below closes exactly the ones that exist.
        writers = []
        try:
            # Add summary writers
            merged = tf.summary.merge_all()
            train_writer = tf.summary.FileWriter(
                os.path.join(LOG_DIR, 'train'), sess.graph)
            writers.append(train_writer)
            test_writer = tf.summary.FileWriter(
                os.path.join(LOG_DIR, 'test'), sess.graph)
            writers.append(test_writer)

            # Init variables
            init = tf.global_variables_initializer()
            sess.run(init)
            # To resume from a checkpoint instead:
            # saver.restore(sess, os.path.join(LOG_DIR, 'model.ckpt'))
            # log_string("Model restored.")

            ops = {
                'pointclouds_pl': pointclouds_pl,
                'labels_pl': labels_pl,
                'is_training_pl': is_training_pl,
                'pred': pred,
                'loss': total_loss,
                'train_op': train_op,
                'merged': merged,
                'step': batch,
                'end_points': end_points
            }

            for epoch in range(MAX_EPOCH):
                log_string('**** EPOCH %03d ****' % (epoch))
                sys.stdout.flush()

                train_one_epoch(sess, ops, train_writer)
                eval_one_epoch(sess, ops, test_writer)

                # Save the variables to disk after every epoch.
                save_path = saver.save(sess,
                                       os.path.join(LOG_DIR, "model.ckpt"))
                log_string("Model saved in file: %s" % save_path)
        finally:
            for writer in writers:
                writer.close()
            sess.close()
# hidden layers h, E0, E1 = layer(args.layer_type, (h, E0, E1, alpha_val), 64, training, args, activation=tf.nn.elu) # classification layer logits,_,_ = layer(args.layer_type, (h, E0, E1, alpha_val), nC, training, args, multi_edge_aggregation='mean') Yhat = tf.one_hot(tf.argmax(logits, axis=-1), nC) loss_train = utils.calc_loss(Y, logits, idx_train, W=W) loss_val = utils.calc_loss(Y, logits, idx_val) loss_test = utils.calc_loss(Y, logits, idx_test) vars = tf.trainable_variables() lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars if 'bias' not in v.name and 'gamma' not in v.name]) * args.weight_decay optimizer = tf.train.AdamOptimizer(learning_rate=args.lr) train_op = optimizer.minimize(loss_train + lossL2) init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) # ************************************************************ # training # ************************************************************ # ckpt_dir = Path('./ckpt') # ckpt_dir.mkdir(parents=True, exist_ok=True) # ckpt_path = ckpt_dir/'checkpoint.ckpt' # print('ckpt_path=', ckpt_path)