def add_training_costs(self, graph, name_scopes, output, labels, weights):
  """Build the training loss: summed per-task weighted costs plus weight decay.

  Parameters
  ----------
  graph: tf.Graph
    Graph that the cost ops are added to.
  name_scopes: dict
    Shared name-scope registry (see TensorflowGraph.shared_name_scope).
  output: sequence of Tensors
    Per-task model outputs, indexed by task.
  labels: sequence of Tensors
    Per-task label tensors, indexed by task.
  weights: sequence of Tensors
    Per-task example-weight tensors, indexed by task.

  Returns
  -------
  loss: scalar Tensor
    Sum over tasks of (sum of weighted costs / batch_size), plus an optional
    weight-decay penalty when self.penalty != 0.0.
  """
  with graph.as_default():
    weighted_costs = []  # weighted costs for each example
    gradient_costs = []  # costs used for gradient calculation
    with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
      for task in range(self.n_tasks):
        # Zero-pad the task index so scope names sort lexically.
        task_str = str(task).zfill(len(str(self.n_tasks)))
        with TensorflowGraph.shared_name_scope(
            'cost_{}'.format(task_str), graph, name_scopes):
          with tf.name_scope('weighted'):
            weighted_cost = self.cost(output[task], labels[task],
                                      weights[task])
            weighted_costs.append(weighted_cost)
          with tf.name_scope('gradient'):
            # Note that we divide by the batch size and not the number of
            # non-zero weight examples in the batch. Also, instead of using
            # tf.reduce_mean (which can put ops on the CPU) we explicitly
            # calculate with div/sum so it stays on the GPU.
            gradient_cost = tf.div(
                tf.reduce_sum(weighted_cost), self.batch_size)
            gradient_costs.append(gradient_cost)

      # aggregated costs
      with TensorflowGraph.shared_name_scope('aggregated', graph, name_scopes):
        with tf.name_scope('gradient'):
          loss = tf.add_n(gradient_costs)

        # weight decay
        if self.penalty != 0.0:
          penalty = model_ops.weight_decay(self.penalty_type, self.penalty)
          loss += penalty

      return loss
def add_training_cost(self, graph, name_scopes, output, labels, weights):
  """Build the training loss: summed per-task weighted costs plus weight decay.

  NOTE(review): this appears to be a duplicate of add_training_costs (plural);
  consider consolidating the two — confirm which one callers use.

  Parameters
  ----------
  graph: tf.Graph
    Graph that the cost ops are added to.
  name_scopes: dict
    Shared name-scope registry (see TensorflowGraph.shared_name_scope).
  output: sequence of Tensors
    Per-task model outputs, indexed by task.
  labels: sequence of Tensors
    Per-task label tensors, indexed by task.
  weights: sequence of Tensors
    Per-task example-weight tensors, indexed by task.

  Returns
  -------
  loss: scalar Tensor
    Sum over tasks of (sum of weighted costs / batch_size), plus an optional
    weight-decay penalty when self.penalty != 0.0.
  """
  with graph.as_default():
    weighted_costs = []  # weighted costs for each example
    gradient_costs = []  # costs used for gradient calculation
    with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
      for task in range(self.n_tasks):
        # Zero-pad the task index so scope names sort lexically.
        task_str = str(task).zfill(len(str(self.n_tasks)))
        with TensorflowGraph.shared_name_scope(
            'cost_{}'.format(task_str), graph, name_scopes):
          with tf.name_scope('weighted'):
            weighted_cost = self.cost(output[task], labels[task],
                                      weights[task])
            weighted_costs.append(weighted_cost)
          with tf.name_scope('gradient'):
            # Note that we divide by the batch size and not the number of
            # non-zero weight examples in the batch. Also, instead of using
            # tf.reduce_mean (which can put ops on the CPU) we explicitly
            # calculate with div/sum so it stays on the GPU.
            gradient_cost = tf.div(
                tf.reduce_sum(weighted_cost), self.batch_size)
            gradient_costs.append(gradient_cost)

      # aggregated costs
      with TensorflowGraph.shared_name_scope('aggregated', graph, name_scopes):
        with tf.name_scope('gradient'):
          loss = tf.add_n(gradient_costs)

        # weight decay
        if self.penalty != 0.0:
          penalty = model_ops.weight_decay(self.penalty_type, self.penalty)
          loss += penalty

      return loss