def merge_gradients(tower_grad):
    """Average per-tower gradients into one list of (gradient, variable) pairs.

    Args:
        tower_grad: non-empty list with one entry per tower. Each entry is a
            sequence of ``(gradient, variable)`` pairs, with the variables in
            the same order for every tower. Gradients may be ``tf.Tensor`` or
            ``tf.IndexedSlices``; gradients of any other type are passed
            through from the last tower untouched.

    Returns:
        With a single tower, that tower's pairs exactly as given. Otherwise a
        list of ``(gradient, variable)`` pairs where each gradient is the mean
        over all towers.

    Raises:
        IndexError: if ``tower_grad`` is empty.
        AssertionError: if the towers disagree on the variable order.
    """
    # Read the towers without pop()-ing so the caller's list stays intact.
    grads_and_vars = tower_grad[-1]
    other_towers = tower_grad[:-1]
    if not other_towers:
        # Only one tower: nothing to average.
        return grads_and_vars

    num_towers = float(len(other_towers)) + 1.0

    gs = []
    vs = []
    for g, v in grads_and_vars:
        gs.append(g)
        vs.append(v)

    for grad in other_towers:
        for i, (g, v) in enumerate(grad):
            assert v == vs[i]
            if isinstance(g, tf.IndexedSlices):
                # Sparse gradients are merged by concatenation: summing the
                # index vectors would point at the wrong rows. Duplicate
                # indices are legal in IndexedSlices and are summed when the
                # slices are applied or densified.
                gs[i] = tf.IndexedSlices(
                    tf.concat([gs[i].values, g.values], axis=0),
                    tf.concat([gs[i].indices, g.indices], axis=0),
                    g.dense_shape)
            elif isinstance(g, tf.Tensor):
                gs[i] += g

    for i, g in enumerate(gs):
        if isinstance(g, tf.IndexedSlices):
            # Scale this gradient's own values; the indices are unchanged.
            gs[i] = tf.IndexedSlices(g.values / num_towers, g.indices,
                                     g.dense_shape)
        elif isinstance(g, tf.Tensor):
            gs[i] = g / num_towers

    return list(zip(gs, vs))
def build_J_logistic_w_scaled_reg(self, lambda_val=1.0, y=None):
    """
    @fn :: build_J_logistic_w_scaled_reg
    @brief :: build the logistic cost functional plus an L2 regularization
              term scaled by 1/m:

                  J = loss + lambda_val * (1/m) * sum_l l2_loss(Theta_l)

              ``loss`` is ``self.build_J_logistic(y)``, the Theta_l are the
              entries of ``self._CNN_model.__get_state__()['Thetas']`` and m
              is the size of the first dimension of ``self.X``. The result is
              also stored in ``self.J_Theta``.

    @type lambda_val : float
    @param lambda_val : regularization constant
    @param y : forwarded unchanged to ``build_J_logistic``
    @return : the regularized cost J
    """
    try:
        m = tf.cast(self.X.get_shape()[0], tf.float32)
    except ValueError as valerr:
        print("ValueError in obtaining batch size: ", valerr)
        # The static size is unknown, so read it from the tensor at run time;
        # the raw unknown dimension cannot be used in the 1.0 / m below.
        m = tf.cast(tf.shape(self.X)[0], tf.float32)

    loss = self.build_J_logistic(y)

    Thetas_only = self._CNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * (1.0 / m) * reg_term
    self.J_Theta = J
    return J
def build_J_logistic_w_reg(self, lambda_val=1.0, y=None):
    """
    @fn :: build_J_logistic_w_reg
    @brief :: build the logistic cost functional plus an (unscaled) L2
              regularization term:

                  J = loss + lambda_val * sum_l l2_loss(Theta_l)

              ``loss`` is ``self.build_J_logistic(y)`` and the Theta_l are the
              entries of ``self._CNN_model.__get_state__()['Thetas']``. The
              result is also stored in ``self.J_Theta``.

    @type lambda_val : float
    @param lambda_val : regularization constant
    @param y : forwarded unchanged to ``build_J_logistic``
    @return : the regularized cost J
    """
    loss = self.build_J_logistic(y)

    Thetas_only = self._CNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * reg_term
    self.J_Theta = J
    return J
def build_J_xent_w_scaled_reg(self, y=None, lambda_val=1.0):
    """
    build_J_xent_w_scaled_reg

    with J_loss = self.build_J_xent(y), this adds the term

        J_reg = (lambda_val / m) * sum_{l=0}^{L-1} l2_loss(Theta^{(l)})

    and computes J = J_loss + J_reg, which is also stored in self.J_Theta.

    The Theta^{(l)} are the entries of
    self.DNN_model.__get_state__()['Thetas']; l2_loss is tf.nn.l2_loss
    (half the sum of squared entries); m is the number of samples, i.e. the
    run-time size of the first dimension of self.X.

    @param y : forwarded unchanged to build_J_xent

    @type lambda_val : (single) float number
    @param lambda_val : regularization parameter

    @return : the regularized cost J
    """
    # m = number of samples, i.e. total "batch" size
    m = tf.cast(tf.shape(self.X)[0], tf.float32)

    loss = self.build_J_xent(y)

    Thetas_only = self.DNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * (1.0 / m) * reg_term
    self.J_Theta = J
    return J
def connectivity_penalty(adj: tf.Tensor,
                         features: tf.Tensor,
                         batch_size: int,
                         penalty_weight: float = 1.0,
                         add_summaries: bool = False,
                         scope: Optional[str] = None) -> tf.Tensor:
    """Build a penalty from edge/node probabilities and register it as a loss.

    The edge probability is taken as ``1 - adj[:, 0, :, :]`` and the node
    probability as ``1 - features[:, :, 0]``. Repeated products of the edge
    probabilities, each squashed by a steep sigmoid, are summed into an
    ``indicator`` matrix which is then compared against the pairwise node
    probabilities.
    NOTE(review): presumably channel 0 of ``adj`` / ``features`` encodes
    "no edge" / "no node" -- confirm against the caller.

    Args:
        adj: tensor indexed as ``adj[:, 0, :, :]``; its last static dimension
            is used as the number of nodes.
        features: tensor indexed as ``features[:, :, 0]``.
        batch_size: batch shape of the identity matrix that seeds the powers.
        penalty_weight: multiplier applied to the penalty.
        add_summaries: when true, also emit a ``'penalty'`` scalar summary.
        scope: optional name scope (default name ``'ConnectivityPenalty'``).

    Returns:
        The scalar penalty tensor, which has also been passed to
        ``tf.losses.add_loss``.
    """

    def _steep_sigmoid(value, steepness=100):
        # {1 + exp[-a(x - 1/2)]}^-1
        return tf.sigmoid(steepness * (value - 0.5))

    with tf.name_scope(scope, 'ConnectivityPenalty', [adj, features]):
        n_nodes = adj.shape[-1].value

        with tf.name_scope('adj_power', values=[adj]):
            prob_edge = 1.0 - adj[:, 0, :, :]

            # Start from the identity and the edge probabilities, then keep
            # multiplying the most recent entry by the edge probabilities.
            powers = [tf.eye(n_nodes, batch_shape=[batch_size]), prob_edge]
            for _ in range(2, n_nodes - 1):
                next_power = tf.matmul(powers[-1], prob_edge)
                powers.append(_steep_sigmoid(next_power))
            indicator = _steep_sigmoid(tf.accumulate_n(powers))

        prob_node = tf.expand_dims(1.0 - features[:, :, 0], axis=-1)
        # compute all paired probabilities
        q = tf.matmul(prob_node, tf.matrix_transpose(prob_node))

        mismatch = tf.add(q * (1.0 - indicator), (1.0 - q) * indicator)
        scale = penalty_weight / (n_nodes * n_nodes)
        penalty = scale * tf.reduce_sum(mismatch)

        tf.losses.add_loss(penalty)
        if add_summaries:
            tf.summary.scalar('penalty', penalty)

        return penalty
def build_J_xent_w_scaled_reg(self, y=None, lambda_val=1.0):
    """
    build_J_xent_w_scaled_reg

    with J_loss = self.build_J_xent(y), this adds the term

        J_reg = (lambda_val / m) * sum_{l=0}^{L-1} l2_loss(Theta^{(l)})

    and computes J = J_loss + J_reg, which is also stored in self.J_Theta.

    The Theta^{(l)} are the entries of
    self.DNN_model.__get_state__()['Thetas']; l2_loss is tf.nn.l2_loss
    (half the sum of squared entries); m is the number of samples, i.e. the
    run-time size of the first dimension of self.X.

    @param y : forwarded unchanged to build_J_xent

    @type lambda_val : (single) float number
    @param lambda_val : regularization parameter

    @return : the regularized cost J
    """
    # m = number of samples, i.e. total "batch" size
    m = tf.cast(tf.shape(self.X)[0], tf.float32)

    loss = self.build_J_xent(y)

    Thetas_only = self.DNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * (1.0 / m) * reg_term
    self.J_Theta = J
    return J
def build_J_xent_w_reg(self, y=None, lambda_val=1.0):
    """
    build_J_xent_w_reg

    with J_loss = self.build_J_xent(y), this adds the term

        J_reg = lambda_val * sum_{l=0}^{L-1} l2_loss(Theta^{(l)})

    and computes J = J_loss + J_reg, which is also stored in self.J_Theta.

    The Theta^{(l)} are the entries of
    self.DNN_model.__get_state__()['Thetas'] and l2_loss is tf.nn.l2_loss
    (half the sum of squared entries).

    Notice that in J_reg there is no 1/m factor (m = number of input
    examples/samples); obtaining it would require reading m from the first
    dimension of the input X. That variant is build_J_xent_w_scaled_reg.

    @param y : forwarded unchanged to build_J_xent

    @type lambda_val : (single) float number
    @param lambda_val : regularization parameter

    @return : the regularized cost J
    """
    loss = self.build_J_xent(y)

    Thetas_only = self.DNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * reg_term
    self.J_Theta = J
    return J
def build_J_xent_w_reg(self, y=None, lambda_val=1.0):
    """
    build_J_xent_w_reg

    with J_loss = self.build_J_xent(y), this adds the term

        J_reg = lambda_val * sum_{l=0}^{L-1} l2_loss(Theta^{(l)})

    and computes J = J_loss + J_reg, which is also stored in self.J_Theta.

    The Theta^{(l)} are the entries of
    self.DNN_model.__get_state__()['Thetas'] and l2_loss is tf.nn.l2_loss
    (half the sum of squared entries).

    Notice that in J_reg there is no 1/m factor (m = number of input
    examples/samples); obtaining it would require reading m from the first
    dimension of the input X. That variant is build_J_xent_w_scaled_reg.

    @param y : forwarded unchanged to build_J_xent

    @type lambda_val : (single) float number
    @param lambda_val : regularization parameter

    @return : the regularized cost J
    """
    loss = self.build_J_xent(y)

    Thetas_only = self.DNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * reg_term
    self.J_Theta = J
    return J
def build_J_logistic_w_scaled_reg(self, lambda_val=1.0, y=None):
    """
    @fn :: build_J_logistic_w_scaled_reg
    @brief :: build the logistic cost functional plus an L2 regularization
              term scaled by 1/m:

                  J = loss + lambda_val * (1/m) * sum_l l2_loss(Theta_l)

              ``loss`` is ``self.build_J_logistic(y)``, the Theta_l are the
              entries of ``self._CNN_model.__get_state__()['Thetas']`` and m
              is the size of the first dimension of ``self.X``. The result is
              also stored in ``self.J_Theta``.

    @type lambda_val : float
    @param lambda_val : regularization constant
    @param y : forwarded unchanged to ``build_J_logistic``
    @return : the regularized cost J
    """
    try:
        m = tf.cast(self.X.get_shape()[0], tf.float32)
    except ValueError as valerr:
        print("ValueError in obtaining batch size: ", valerr)
        # The static size is unknown, so read it from the tensor at run time;
        # the raw unknown dimension cannot be used in the 1.0 / m below.
        m = tf.cast(tf.shape(self.X)[0], tf.float32)

    loss = self.build_J_logistic(y)

    Thetas_only = self._CNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * (1.0 / m) * reg_term
    self.J_Theta = J
    return J
def scale_invariant_gradient_loss(prediction, gt):
    """Sum the loss between scale-invariant gradients at several spacings.

    For every spacing ``h`` in (1, 2, 4, 8, 16) the discrete scale-invariant
    gradient of ``prediction`` and of ``gt`` is computed and compared with
    the ``l2`` helper (``normalize=False``). The per-spacing results are
    added together and reduced to a scalar.

    Args:
        prediction: rank-4 tensor with static shape (unpacked as
            ``_, height, width, _``).
        gt: tensor laid out like ``prediction``.

    Returns:
        Scalar tensor: the summed loss over all spacings.
    """

    def discrete_scale_invariant_gradient(f, h):
        """
        Calculates the discrete scale invariant gradient of f with spacing h
        """
        _, height, width, _ = f.shape.as_list()

        # Zero-pad the end of axes 1 and 2 so the shifted slices below keep
        # the original height and width.
        padded_f = tf.pad(f, [[0, 0], [0, h], [0, h], [0, 0]])

        # f shifted by h along axis 2
        f_ih_j = padded_f[:, 0:height, h:width + h, :]
        # (shifted - f) / (|shifted| + |f|)
        # NOTE(review): the denominator is zero wherever both values are
        # zero -- confirm the inputs are strictly non-zero or add an epsilon.
        i = (f_ih_j - f) / (tf.abs(f_ih_j) + tf.abs(f))

        # f shifted by h along axis 1
        f_i_jh = padded_f[:, h:height + h, 0:width, :]
        j = (f_i_jh - f) / (tf.abs(f_i_jh) + tf.abs(f))

        return tf.stack([i, j])

    all_losses = []
    for h in (1, 2, 4, 8, 16):
        # The spacing must be forwarded to the helper, and the loss must
        # compare against the gradient computed for this same spacing.
        pred_grad = discrete_scale_invariant_gradient(prediction, h)
        gt_grad = discrete_scale_invariant_gradient(gt, h)
        all_losses.append(l2(pred_grad, gt_grad, normalize=False))

    return tf.reduce_sum(tf.accumulate_n(all_losses))
def build_J_logistic_w_reg(self, lambda_val=1.0, y=None):
    """
    @fn :: build_J_logistic_w_reg
    @brief :: build the logistic cost functional plus an (unscaled) L2
              regularization term:

                  J = loss + lambda_val * sum_l l2_loss(Theta_l)

              ``loss`` is ``self.build_J_logistic(y)`` and the Theta_l are the
              entries of ``self._CNN_model.__get_state__()['Thetas']``. The
              result is also stored in ``self.J_Theta``.

    @type lambda_val : float
    @param lambda_val : regularization constant
    @param y : forwarded unchanged to ``build_J_logistic``
    @return : the regularized cost J
    """
    loss = self.build_J_logistic(y)

    Thetas_only = self._CNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * reg_term
    self.J_Theta = J
    return J
def classify(prob):
    """Decode the most likely digit sequence from per-position probabilities.

    Args:
        prob: sequence of ``n_digits + 1`` tensors. ``prob[i]`` for
            ``i < n_digits`` holds the class probabilities of digit position
            ``i``; ``prob[n_digits]`` holds the probabilities of the sequence
            length (0 .. ``n_digits``).
            NOTE(review): every tensor is assumed to be ``[batch, classes]``
            with a statically known batch size -- confirm against the caller.

    Returns:
        int64 tensor of shape ``[batch, n_digits]`` holding the predicted
        digit for every position below the predicted length and ``-1``
        elsewhere.
    """
    max_pred_digits = []
    cum_max_pred = []
    for i in range(n_digits):
        log_prob = tf.log(prob[i])
        max_pred_digits.append(tf.argmax(log_prob, 1))
        max_pred = tf.reduce_max(log_prob, 1)
        # Running sum of the best log-probabilities of digits 0..i.
        if i == 0:
            cum_max_pred.append(max_pred)
        else:
            cum_max_pred.append(
                tf.accumulate_n([cum_max_pred[i - 1], max_pred]))

    # Put the per-digit predictions side by side as columns. Concatenating on
    # axis 0 and reshaping to [-1, n_digits] would interleave samples and
    # digit positions whenever the batch holds more than one sample.
    max_pred_digits = tf.concat(
        1, [tf.expand_dims(digit, 1) for digit in max_pred_digits])

    log_prob_len = tf.log(prob[n_digits])
    log_prob_len = tf.split(1, n_digits + 1, log_prob_len)

    # Column k scores "the sequence has length k": log P(len = k) plus the
    # best log-probabilities of the first k digits.
    total_max_pred = [log_prob_len[0]]
    for i in range(n_digits):
        total_max_pred.append(
            tf.accumulate_n(
                [log_prob_len[i + 1], tf.reshape(cum_max_pred[i], [-1, 1])]))
    # Same layout concern as above: join the [batch, 1] columns on axis 1.
    total_max_pred = tf.concat(1, total_max_pred)

    total_len = tf.cast(tf.argmax(total_max_pred, 1), tf.int32)
    batch_size = total_len.get_shape().as_list()[0]

    # mask[b, p] is true while position p lies inside the predicted length.
    lengths_transposed = tf.expand_dims(total_len, 1)
    lengths_tiled = tf.tile(lengths_transposed, [1, n_digits])
    range_all = tf.range(0, n_digits, 1)
    range_row = tf.expand_dims(range_all, 0)
    range_tiled = tf.tile(range_row, [batch_size, 1])
    mask = tf.less(range_tiled, lengths_tiled)

    all_neg_ones = tf.cast(tf.fill(tf.shape(mask), -1), tf.int64)
    result = tf.select(mask, max_pred_digits, all_neg_ones)
    return result
def _build(self, logits):
    """Combine the softmax outputs of several classifiers into one tensor.

    Each element of ``logits`` is passed through a softmax and multiplied by
    ``1 / (s + 1e-5) * a``, where ``s`` and ``a`` are the matching entries of
    ``self._weak_running_sums`` and ``self._running_accs``. The weighted
    outputs are summed element-wise.

    Args:
        logits: iterable of logit tensors, one per classifier.

    Returns:
        The element-wise sum of the weighted softmax outputs.
    """
    # Build every softmax first, then the weighting ops.
    weak_classifications = []
    for member_logits in logits:
        weak_classifications.append(tf.nn.softmax(member_logits))

    weighted_classifications = []
    triples = zip(weak_classifications,
                  self._weak_running_sums,
                  self._running_accs)
    for classification, running_sum, running_acc in triples:
        weight = 1. / (running_sum + 1e-5) * running_acc
        weighted_classifications.append(classification * weight)

    return tf.accumulate_n(weighted_classifications)
def _build(self, logits):
    """Average the logits after weighting each by a slice of ``self._weights``.

    Gradients are stopped on every input, so only ``self._weights`` can
    receive gradient through the result.

    Args:
        logits: list of logit tensors; its length must equal the first
            static dimension of ``self._weights``.

    Returns:
        Sum of the weighted, gradient-stopped logits divided by their count.
    """
    count = len(logits)
    assert count == self._weights.get_shape().as_list()[0]

    stopped_logits = [tf.stop_gradient(member) for member in logits]
    weight_slices = tf.split(self._weights, count)

    weighted_logits = []
    for stopped, weight in zip(stopped_logits, weight_slices):
        weighted_logits.append(stopped * weight)

    total = tf.accumulate_n(weighted_logits)
    return total / float(count)
def testSimple(self):
    """accumulate_n over 20 random float32 tensors matches a NumPy running sum."""
    with self.test_session():
        random_arrays = []
        for _ in range(20):
            random_arrays.append(
                np.random.rand(16, 16, 16, 16).astype(np.float32))

        # Convert before summing: the NumPy sum below accumulates in place
        # into the first array.
        random_tensors = []
        for array in random_arrays:
            random_tensors.append(
                tf.convert_to_tensor(array, dtype=tf.float32))
        tf_val = tf.accumulate_n(random_tensors)

        remaining = iter(random_arrays)
        np_val = next(remaining)
        for array in remaining:
            np_val += array

        self.assertAllClose(np_val, tf_val.eval())
def classify(prob):
    """Decode the most likely digit sequence from per-position probabilities.

    Args:
        prob: sequence of ``n_digits + 1`` tensors. ``prob[i]`` for
            ``i < n_digits`` holds the class probabilities of digit position
            ``i``; ``prob[n_digits]`` holds the probabilities of the sequence
            length (0 .. ``n_digits``).
            NOTE(review): every tensor is assumed to be ``[batch, classes]``
            with a statically known batch size -- confirm against the caller.

    Returns:
        int64 tensor of shape ``[batch, n_digits]`` holding the predicted
        digit for every position below the predicted length and ``-1``
        elsewhere.
    """
    max_pred_digits = []
    cum_max_pred = []
    for i in range(n_digits):
        log_prob = tf.log(prob[i])
        max_pred_digits.append(tf.argmax(log_prob, 1))
        max_pred = tf.reduce_max(log_prob, 1)
        # Running sum of the best log-probabilities of digits 0..i.
        if i == 0:
            cum_max_pred.append(max_pred)
        else:
            cum_max_pred.append(
                tf.accumulate_n([cum_max_pred[i - 1], max_pred]))

    # Put the per-digit predictions side by side as columns. Concatenating on
    # axis 0 and reshaping to [-1, n_digits] would interleave samples and
    # digit positions whenever the batch holds more than one sample.
    max_pred_digits = tf.concat(
        1, [tf.expand_dims(digit, 1) for digit in max_pred_digits])

    log_prob_len = tf.log(prob[n_digits])
    log_prob_len = tf.split(1, n_digits + 1, log_prob_len)

    # Column k scores "the sequence has length k": log P(len = k) plus the
    # best log-probabilities of the first k digits.
    total_max_pred = [log_prob_len[0]]
    for i in range(n_digits):
        total_max_pred.append(
            tf.accumulate_n(
                [log_prob_len[i + 1], tf.reshape(cum_max_pred[i], [-1, 1])]))
    # Same layout concern as above: join the [batch, 1] columns on axis 1.
    total_max_pred = tf.concat(1, total_max_pred)

    total_len = tf.cast(tf.argmax(total_max_pred, 1), tf.int32)
    batch_size = total_len.get_shape().as_list()[0]

    # mask[b, p] is true while position p lies inside the predicted length.
    lengths_transposed = tf.expand_dims(total_len, 1)
    lengths_tiled = tf.tile(lengths_transposed, [1, n_digits])
    range_all = tf.range(0, n_digits, 1)
    range_row = tf.expand_dims(range_all, 0)
    range_tiled = tf.tile(range_row, [batch_size, 1])
    mask = tf.less(range_tiled, lengths_tiled)

    all_neg_ones = tf.cast(tf.fill(tf.shape(mask), -1), tf.int64)
    result = tf.select(mask, max_pred_digits, all_neg_ones)
    return result
def call(self, inputs):
    """Predict op.

    Evaluates every tree on ``inputs`` and combines the per-tree logits:
    a single tree's output is returned directly; otherwise the outputs are
    summed when ``self.sum_outputs`` is set and concatenated along axis 1
    when it is not.
    """
    # One output per tree, in tree order.
    tree_logits = [
        nt_compute_output_op(inputs,
                             self.node_weights[tree_index],
                             self.leaf_weights[tree_index],
                             self.output_logits_dim,
                             self.depth,
                             self.smooth_step_param,
                             self.parallelize_over_samples)
        for tree_index in range(self.trees_num)
    ]

    if self.trees_num == 1:
        return tree_logits[0]
    if self.sum_outputs:
        return tf.accumulate_n(tree_logits)
    return tf.concat(tree_logits, axis=1)
def ff_bp(data, w, grads, ff_deps, bp_deps):
    """Build one forward + backward pass of a multi-GPU fully-connected stack.

    For each of ``FLAGS.num_layers`` layers the forward pass multiplies every
    GPU's input by that GPU's weight, splits the product along axis 1 into
    ``FLAGS.num_gpus`` pieces and sums the matching pieces across GPUs. The
    backward pass walks the layers in reverse, concatenating the per-GPU
    tensors along axis 1 and producing an input gradient and a weight
    gradient per GPU.

    Args:
        data: per-GPU list of input tensors (indexed ``data[j]``).
        w: weights indexed ``w[layer][gpu]``.
        grads: accumulators indexed ``grads[layer][gpu]``; updated in place
            with ``+=``.
        ff_deps: ops indexed ``ff_deps[layer][gpu]`` used as control
            dependencies for the forward matmuls.
        bp_deps: not read by this function.

    Returns:
        ``(new_ff_deps, new_bp_deps)``: ``new_ff_deps[layer][gpu]`` is the
        forward matmul output; ``new_bp_deps`` is returned as an empty list
        because nothing is ever appended to it.
    """
    new_ff_deps = []
    new_bp_deps = []
    # ff
    fwd = []
    last = data
    for i in xrange(FLAGS.num_layers):
        with tf.name_scope('fc_ff%d' % i):
            # Remember this layer's input; the backward pass needs it for dw.
            fwd.append(last)
            tmp = []
            new_ff_deps.append([])
            for j in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % j), tf.control_dependencies([ff_deps[i][j]]):
                    # matmult
                    y = tf.matmul(last[j], w[i][j])
                    # split
                    y_split = tf.split(split_dim=1, num_split=FLAGS.num_gpus, value=y)
                    tmp.append(y_split)
                    new_ff_deps[i].append(y)
            # reduce: GPU j sums the j-th piece produced by every GPU
            red = []
            for j in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % j):
                    red.append(tf.accumulate_n([s[j] for s in tmp]))
            last = red
    # bp
    for i in reversed(xrange(FLAGS.num_layers)):
        with tf.name_scope('fc_bp%d' % i):
            # convert col -> rep: every GPU gets the full concatenation
            tmp = []
            for j in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % j):
                    tmp.append(tf.concat(concat_dim=1, values=last))
            last = []
            for j in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % j):
                    with tf.name_scope('bp'):
                        # matmult: bp
                        dy = tf.matmul(tmp[j], w[i][j], transpose_b=True)
                        last.append(dy)
                        # matmult: grad
                        dw = tf.matmul(fwd[i][j], tmp[j], transpose_a=True)
                        # update
                        grads[i][j] += dw
    return new_ff_deps, new_bp_deps
def build_J_xent_w_scaled_reg(self, lambda_val=1.0, y=None):
    """
    build_J_xent_w_scaled_reg

    with J_loss = self.build_J_xent(y), this adds the term

        J_reg = (lambda_val / m) * sum_{l=0}^{L-1} l2_loss(Theta^{(l)})

    and computes J = J_loss + J_reg, which is also stored in self.J_Theta.

    The Theta^{(l)} are the entries of
    self._CNN_model.__get_state__()['Thetas']; l2_loss is tf.nn.l2_loss
    (half the sum of squared entries); m is the number of samples, i.e. the
    size of the first dimension of self.X.

    @type lambda_val : (single) float number
    @param lambda_val : regularization parameter

    @param y : forwarded unchanged to build_J_xent

    @return : the regularized cost J
    """
    # m = number of samples, i.e. total "batch" size
    try:
        m = tf.cast(self.X.get_shape()[0], tf.float32)
    except ValueError as valerr:
        print("ValueError in obtaining batch size: ", valerr)
        # The static size is unknown, so read it from the tensor at run time;
        # the raw unknown dimension cannot be used in the 1.0 / m below.
        m = tf.cast(tf.shape(self.X)[0], tf.float32)

    loss = self.build_J_xent(y)

    Thetas_only = self._CNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * (1.0 / m) * reg_term
    self.J_Theta = J
    return J
def build_J_xent_w_scaled_reg(self, lambda_val=1.0, y=None):
    """
    build_J_xent_w_scaled_reg

    with J_loss = self.build_J_xent(y), this adds the term

        J_reg = (lambda_val / m) * sum_{l=0}^{L-1} l2_loss(Theta^{(l)})

    and computes J = J_loss + J_reg, which is also stored in self.J_Theta.

    The Theta^{(l)} are the entries of
    self._CNN_model.__get_state__()['Thetas']; l2_loss is tf.nn.l2_loss
    (half the sum of squared entries); m is the number of samples, i.e. the
    size of the first dimension of self.X.

    @type lambda_val : (single) float number
    @param lambda_val : regularization parameter

    @param y : forwarded unchanged to build_J_xent

    @return : the regularized cost J
    """
    # m = number of samples, i.e. total "batch" size
    try:
        m = tf.cast(self.X.get_shape()[0], tf.float32)
    except ValueError as valerr:
        print("ValueError in obtaining batch size: ", valerr)
        # The static size is unknown, so read it from the tensor at run time;
        # the raw unknown dimension cannot be used in the 1.0 / m below.
        m = tf.cast(tf.shape(self.X)[0], tf.float32)

    loss = self.build_J_xent(y)

    Thetas_only = self._CNN_model.__get_state__()['Thetas']
    # tf.accumulate_n needs a real list; map() is a lazy iterator on Python 3.
    ThetaL2norms = [tf.nn.l2_loss(Theta) for Theta in Thetas_only]
    reg_term = tf.accumulate_n(ThetaL2norms)

    J = loss + lambda_val * (1.0 / m) * reg_term
    self.J_Theta = J
    return J
def SAMME_R_voting_strategy(logits):
    """
    Algorithm 4 of "Multi-class AdaBoost" by Zhu et al. 2006

    PDF: Can be found at the bottom of page 9
    (https://web.stanford.edu/~hastie/Papers/samme.pdf)

    Each classifier's log-probabilities are multiplied by a
    ``class_num x class_num`` matrix (1 on the diagonal, ``-1 / class_num``
    elsewhere), scaled by ``class_num - 1``, and the per-classifier results
    are summed.

    Args:
        logits: non-empty list of logit tensors of identical shape whose last
            dimension is the number of classes. See `voting strategy`.

    Returns:
        The element-wise sum of the per-classifier scores.

    Raises:
        AssertionError: if the logit tensors differ in shape.
    """
    class_num = logits[0].get_shape().as_list()[-1]
    for x in logits:
        assert x.shape == logits[0].shape

    # log_softmax is the numerically stable form of log(softmax(.)): it does
    # not produce -inf when a probability underflows to zero.
    log_probs = [tf.nn.log_softmax(l) for l in logits]

    # Two steps to get a matrix of -1/class_num with 1 on the diagonal.
    # The float literal keeps this a true division on Python 2 as well.
    hk_inner_prod = tf.constant(
        -1.0 / class_num, dtype=tf.float32, shape=(class_num, class_num))
    hk_inner_prod = tf.matrix_set_diag(hk_inner_prod, tf.ones([class_num]))

    h_ks = [(class_num - 1) * tf.matmul(lp, hk_inner_prod)
            for lp in log_probs]
    return tf.accumulate_n(h_ks)
def testZeroArgs(self):
    """accumulate_n with an empty input list is expected to raise ValueError."""
    with self.test_session(), self.assertRaises(ValueError):
        tf.accumulate_n([]).eval()
add_op_2 = add_op_1 + one_matrix

# Sum of all elements of resultant matrix
matrix_sum_1 = tf.reduce_sum(add_op_1, name="matrix_sum_1")
matrix_sum_2 = tf.reduce_sum(add_op_2, name="matrix_sum_2")

# Product of all elements
prod_1 = tf.reduce_prod(add_op_1)

# Other reductions: reduce_min, reduce_max, reduce_mean
# reduce_all - item wise AND operator applied
# reduce_any - item wise OR operator applied

# Element wise sum of matrices of same shape
# shape parameter is inferred
element_sum = tf.accumulate_n([add_op_1, add_op_2])

# Initialize variables
init = tf.initialize_variables([one_matrix, a_matrix])

# Session
session = tf.Session()
try:
    session.run(init)
    try:
        # NOTE(review): assert_op is built but never passed to session.run,
        # so it does not actually check anything -- confirm the intent.
        assert_op = tf.assert_variables_initialized([one_matrix, a_matrix])
        result = session.run([element_sum, prod_1, matrix_sum_2, matrix_sum_1])
    except tf.errors.FailedPreconditionError:
        # Function-call form prints the same text on Python 2 and Python 3.
        print('Intialize variables before using them, exiting session')
finally:
    # Release the session even when an unexpected error escapes.
    session.close()
def define_graph(self):
    """
    Set up the model graph

    Creates the input placeholders (``self.ref_image``, ``self.multi_plane``,
    ``self.gt``), the prediction network, the blended prediction
    ``self.preds``, the training op and the merged summaries.

    The network emits ``2 * c.NUM_PLANE`` channels that are split into
    ``c.NUM_PLANE`` blending weights and ``c.NUM_PLANE`` alpha maps. Plane
    ``i`` is coloured by mixing ``self.ref_image`` with channels
    ``3*i .. 3*i+2`` of ``self.multi_plane``, and the coloured planes are
    summed using the softmaxed alpha maps as weights.
    """
    with tf.name_scope('data'):
        self.ref_image = tf.placeholder(
            tf.float32, shape=[None, 128, 128, 3], name='ref_image')
        self.multi_plane = tf.placeholder(
            tf.float32, shape=[None, 128, 128, 3 * c.NUM_PLANE])
        self.gt = tf.placeholder(
            tf.float32, shape=[None, 128, 128, 3], name='gt')

    self.summaries = []

    with tf.name_scope('predection'):
        def prediction(ref_image, multi_plane):
            """Encoder / dilated bottleneck / decoder with skip connections."""
            net_in = tf.concat([ref_image, multi_plane], axis=-1)

            # Encoder.
            conv1_1 = conv_block(net_in, 64, 3, 1)
            conv1_2 = conv_block(conv1_1, 128, 3, 2)
            conv2_1 = conv_block(conv1_2, 128, 3, 1)
            conv2_2 = conv_block(conv2_1, 256, 3, 2)
            conv3_1 = conv_block(conv2_2, 256, 3, 1)
            conv3_2 = conv_block(conv3_1, 256, 3, 1)
            conv3_3 = conv_block(conv3_2, 512, 3, 2)

            # Dilated bottleneck.
            conv4_1 = tf.layers.conv2d(conv3_3, 512, (3, 3), (1, 1), 'SAME',
                                       dilation_rate=(2, 2))
            conv4_2 = tf.layers.conv2d(conv4_1, 512, (3, 3), (1, 1), 'SAME',
                                       dilation_rate=(2, 2))
            conv4_3 = tf.layers.conv2d(conv4_2, 512, (3, 3), (1, 1), 'SAME',
                                       dilation_rate=(2, 2))

            # Decoder; each deconv input is concatenated with an encoder
            # feature map.
            conv5_1 = deconv_block(
                tf.concat([conv4_3, conv3_3], axis=-1), 256, 4, 2)
            conv5_2 = conv_block(conv5_1, 256, 3, 1)
            conv5_3 = conv_block(conv5_2, 256, 3, 1)
            conv6_1 = deconv_block(
                tf.concat([conv5_3, conv2_2], axis=-1), 128, 4, 2)
            conv6_2 = conv_block(conv6_1, 128, 3, 1)
            conv7_1 = deconv_block(
                tf.concat([conv6_2, conv1_2], axis=-1), 64, 4, 2)
            conv7_2 = conv_block(conv7_1, 64, 3, 1)

            # The output width is derived from c.NUM_PLANE (previously the
            # literal 62) so the split below is valid for any plane count.
            conv7_3 = tf.layers.conv2d(conv7_2, 2 * c.NUM_PLANE, (1, 1),
                                       (1, 1), 'SAME')
            conv7_3 = tf.nn.tanh(conv7_3)

            blending_weights, alpha_images = tf.split(
                conv7_3, [c.NUM_PLANE, c.NUM_PLANE], axis=-1)
            blending_weights = tensor_norm(blending_weights)
            alpha_images = tf.nn.softmax(alpha_images, axis=-1)

            feature_maps = {
                'conv1_1': conv1_1,
                'conv1_2': conv1_2,
                'conv2_1': conv2_1,
                'conv2_2': conv2_2,
                'conv3_1': conv3_1,
                'conv3_2': conv3_2,
                'conv3_3': conv3_3,
                'conv4_1': conv4_1,
                'conv4_2': conv4_2,
                'conv4_3': conv4_3,
                'conv5_1': conv5_1,
                'conv6_1': conv6_1,
                'conv6_2': conv6_2,
                'conv7_1': conv7_1,
                'conv7_2': conv7_2,
                'conv7_3': conv7_3
            }
            return blending_weights, alpha_images, feature_maps

        self.blending_weights, self.alpha_images, self.feature_maps = \
            prediction(self.ref_image, self.multi_plane)

        # Per-plane colour image: mix of the reference image and that
        # plane's three channels.
        self.color_images = []
        for i in range(c.NUM_PLANE):
            tmp_weights = tf.expand_dims(
                self.blending_weights[:, :, :, i], axis=-1)
            self.color_images.append(
                tf.multiply(tmp_weights, self.ref_image)
                + tf.multiply(1 - tmp_weights,
                              self.multi_plane[:, :, :, 3 * i:3 * (i + 1)]))

        # Alpha-weighted sum of the per-plane colour images.
        self.preds = []
        for i in range(c.NUM_PLANE):
            tmp_alpha = tf.expand_dims(self.alpha_images[:, :, :, i], axis=-1)
            self.preds.append(tf.multiply(tmp_alpha, self.color_images[i]))
        self.preds = tf.accumulate_n(self.preds)

    with tf.name_scope('train'):
        self.loss = VGG_loss(self.preds, self.gt)
        self.global_step = tf.Variable(0, trainable=False)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=c.LRATE, name='optimizer')
        self.train_op = self.optimizer.minimize(
            self.loss, global_step=self.global_step, name='train_op')
        loss_summary = tf.summary.scalar('train_loss', self.loss)
        self.summaries.append(loss_summary)

    with tf.name_scope('error'):
        self.psnr = psnr(self.preds, self.gt)
        self.sharpdiff = sharp_diff(self.preds, self.gt)
        self.ssim = ssim(self.preds, self.gt)
        summary_psnr = tf.summary.scalar('train_PSNR', self.psnr)
        summary_sharpdiff = tf.summary.scalar('train_SharpDiff',
                                              self.sharpdiff)
        summary_ssim = tf.summary.scalar('trian_ssim', self.ssim)
        self.summaries += [summary_psnr, summary_sharpdiff, summary_ssim]

    self.summaries = tf.summary.merge(self.summaries)
def model_fn(self, features, labels, mode, params):
    """Build the model based on features, labels, and mode.

    The network is built once per micro batch. Gradients are computed stage
    by stage, from the last pipeline device back to the first, feeding each
    stage's input gradients into the previous stage as ``grad_ys``. The
    per-micro-batch gradients of every variable are then summed.

    Args:
      features: The features dictionary containing the data Tensor
        and the number of examples.
      labels: The labels Tensor resulting from calling the model.
      mode: A string indicating the training mode.
      params: A dictionary of hyperparameters.

    Returns:
      A tf.estimator.EstimatorSpec.
    """
    del params
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    if is_training:
      features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC
    total_loss, outputs = self._build_network(features, labels, mode)

    devices = cluster_utils.get_pipeline_devices(FLAGS.pipeline_device_num)
    slice_num = len(devices)
    micro_batch_num = FLAGS.micro_batch_num
    losses = []
    all_outputs = []
    losses.append(total_loss)
    all_outputs.append(outputs)
    # layer_grads[micro_batch][stage] -> gradients; layer_vars[stage] -> variables
    layer_grads = [[[] for i in xrange(slice_num)] for j in xrange(micro_batch_num)]
    layer_vars = [[] for i in xrange(slice_num)]
    remained_vars = tf.trainable_variables()
    ys = losses[0]
    prev_grads=None
    # layers-1 ~ 1 compute grads
    for i in xrange(slice_num - 1, 0, -1):
      # Variables placed on this stage's device are removed from the
      # remaining pool.
      vars_i = [v for v in remained_vars if v.device==devices[i]]
      remained_vars = [v for v in remained_vars if v not in vars_i]
      prev_y = all_outputs[0][i-1]
      prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
      num_tensors = len(prev_y)
      y_grads = tf.gradients(ys=ys, xs=prev_y+vars_i, grad_ys=prev_grads, colocate_gradients_with_ops=True)
      ys = prev_y
      # The first num_tensors results are gradients w.r.t. the previous
      # stage's outputs; the rest belong to this stage's variables.
      prev_grads = y_grads[0:num_tensors]
      grads_i = y_grads[num_tensors:]
      layer_grads[0][i] = [g for g in grads_i if g is not None]
      layer_vars[i] = [v for (g, v) in zip(grads_i, vars_i) if g is not None]
    # layer 0 compute grads
    grads_0 = tf.gradients(ys=ys, xs=remained_vars, grad_ys=prev_grads, colocate_gradients_with_ops=True)
    layer_grads[0][0] = [g for g in grads_0 if g is not None]
    layer_vars[0] = [v for (g, v) in zip(grads_0, remained_vars) if g is not None]

    # other micro_batch_num
    for j in xrange(1, micro_batch_num):
      # Each stage of micro batch j depends either on the previous micro
      # batch's output for that stage or on an earlier micro batch's
      # gradients, selected by the index test below.
      dep_outputs = []
      for i in xrange(slice_num):
        dep_outputs.append(all_outputs[j-1][i] if i+j < 2*slice_num-1 else layer_grads[i+j-2*slice_num+1][i])
      loss, outputs = self._build_network(features, labels, mode, dep_outputs=dep_outputs)
      losses.append(loss)
      all_outputs.append(outputs)
      ys = losses[j]
      prev_grads=None
      for i in xrange(slice_num - 1, 0, -1):
        prev_y = all_outputs[j][i-1]
        prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
        num_tensors = len(prev_y)
        y_grads = tf.gradients(ys=ys, xs=prev_y+layer_vars[i], grad_ys=prev_grads, colocate_gradients_with_ops=True)
        ys = prev_y
        prev_grads = y_grads[0:num_tensors]
        grads_i = y_grads[num_tensors:]
        layer_grads[j][i] = [g for g in grads_i if g is not None]
      grads_0 = tf.gradients(ys=ys, xs=layer_vars[0], grad_ys=prev_grads, colocate_gradients_with_ops=True)
      layer_grads[j][0] = [g for g in grads_0 if g is not None]

    # Sum each variable's gradient over all micro batches, on the device of
    # the first micro batch's gradient.
    grads_set = []
    vars_set = []
    for i in xrange(slice_num):
      for j in xrange(len(layer_grads[0][i])):
        grad_i_set = [layer_grads[m][i][j] for m in range(micro_batch_num)]
        #print (grad_i_set)
        if micro_batch_num == 1:
          with tf.device(grad_i_set[0].device):
            acc_grads = grad_i_set[0]
        else:
          with tf.control_dependencies(grad_i_set), tf.device(grad_i_set[0].device): # replica
            if isinstance(grad_i_set[0], tf.IndexedSlices):
              acc_grads = tf.add_n(grad_i_set)
            else:
              acc_grads = tf.accumulate_n(grad_i_set)
        grads_set.append(acc_grads)
        vars_set.append(layer_vars[i][j])
    grads_and_vars = zip(grads_set, vars_set)
    #######################

    train_op = None

    if is_training:
      global_step = tf.train.get_or_create_global_step()
      gs_t = tf.reshape(tf.cast(global_step, tf.int32), [1])

      # Setup learning rate schedule
      learning_rate = self._build_learning_rate_schedule(global_step)

      # Setup optimizer.
      optimizer = self._build_optimizer(learning_rate)

      # NOTE(review): update_ops is collected but not used; the train op is
      # built under control_dependencies(None).
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(None):  # original is update_ops
        train_op = self._build_train_op(optimizer, grads_and_vars, global_step=global_step)
      if self.hparams.moving_average_decay > 0:
        ema = tf.train.ExponentialMovingAverage(
            decay=self.hparams.moving_average_decay, num_updates=global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        # The returned train_op becomes the moving-average update, which
        # runs after the optimizer step.
        with tf.control_dependencies([train_op]):
          with tf.name_scope('moving_average'):
            train_op = ema.apply(variables_to_average)

      lr_t = tf.reshape(learning_rate, [1])
      # NOTE(review): host_call is assembled here but is not passed to the
      # EstimatorSpec returned below.
      host_call = None
      if self.hparams.enable_hostcall:
        def host_call_fn(gs, lr):
          # Outfeed supports int32 but global_step is expected to be int64.
          gs = tf.cast(tf.reduce_mean(gs), tf.int64)
          with tf.contrib.summary.create_file_writer(
              self.model_dir).as_default():
            with tf.contrib.summary.always_record_summaries():
              tf.contrib.summary.scalar('learning_rate', tf.reduce_mean(lr), step=gs)
              return tf.contrib.summary.all_summary_ops()
        host_call = (host_call_fn, [gs_t, lr_t])

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)
def naive_voting_strategy(logits):
    """Return the element-wise mean of equally shaped logit tensors.

    Args:
        logits: list of tensors that must all share the shape of the first.

    Returns:
        The sum of all tensors divided by their count.

    Raises:
        AssertionError: if any tensor's shape differs from the first one's.
    """
    assert all(member.shape == logits[0].shape for member in logits)
    summed = tf.accumulate_n(logits)
    return summed / float(len(logits))
def feature_select(*args): xs = args[1:] #Define the forward operation #First, reconstruct the actual arguments feature_selector_mat = args[0] #recall, feature_selector_mat is num_features x num_prev_layers transpose_feature_selector_mat = tf.transpose(feature_selector_mat) #this is num_prev_layers * num_features #Get a bool vector of layers with nonzero feature selection vectors #condition = tf.logical_not(tf.equal(transpose_feature_selector_mat, 0.0)) #transpose_feature_selector_nonzero_mat = tf.reduce_any(condition, axis=-1) feature_selectors = [] #feature_selectors_nonzero = [] for i in range(len(xs)): feature_selectors.append(transpose_feature_selector_mat[i]) #feature_selectors_nonzero.append(transpose_feature_selector_nonzero_mat[i]) summands = [] for i in range(len(xs)): x = xs[i] feature_selector = feature_selectors[i] summand = tf.reshape(feature_selector, [1, 1, 1, -1]) * x summands.append(summand) #Define the gradients def grad(dy): #Note: dy is intuitively the direction that we __want__ the summed output #feature maps to go in. x_grads = [] feature_selector_grads = [] dy_flat = tf.reshape(dy, [-1, tf.shape(dy)[-1]]) #Compute gradients with respect to the previous feature maps for i in range(len(xs)): #For the gradient here, gate the dy by #the weights that this x actually effects #and collapse across features with a sum feature_selector = feature_selectors[i] x = xs[i] #is_nonzero = feature_selectors_nonzero[i] def nonzero_branch(): expanded_feature_selector = tf.reshape(feature_selector, [-1, 1]) x_grad = tf.matmul(dy_flat, expanded_feature_selector, b_is_sparse=False) #The above should be the same thing as: #x_grad = tf.einsum('ijkl,l->ijk', dy, feature_selector) x_grad = tf.reshape(x_grad, tf.shape(x)) return x_grad def zero_branch(): return tf.zeros_like(x, dtype=tf.float32) x_grad = nonzero_branch() #There's another case -- if the feature is not used, then the gradient is zero! 
#x_grad = tf.cond(is_nonzero, nonzero_branch, zero_branch) x_grads.append(x_grad) #Compute gradients with respect to the weights on feature maps for i in range(len(feature_selectors)): #Find how much x goes in the direction of dy x = xs[i] #is_nonzero = feature_selectors_nonzero[i] def nonzero_branch_two(): x_flat = tf.reshape(x, [1, -1]) x_ys_similarities = tf.matmul(x_flat, dy_flat) x_ys_similarities = tf.reshape(x_ys_similarities, tf.shape(feature_selectors[i])) #The above should be the same as... #x_ys_similarities = tf.einsum('ijkl,ijk->l', dy, x) return x_ys_similarities def zero_branch_two(): return tf.zeros_like(feature_selectors[i], dtype=tf.float32) #There's another case: If the feature is not used, it's zero #x_ys_similarities = tf.cond(is_nonzero, nonzero_branch_two, zero_branch_two) x_ys_similarities = nonzero_branch_two() feature_selector_grads.append(x_ys_similarities) #Okay, great. Now return grads in the same order as the arguments. #recall, feature_selector_mat was originally num_features x num_prev_layers #we need to take the feature_selector_grads and concat along second axis f_grad = tf.stack(feature_selector_grads, axis=-1) result_grads = [] result_grads.append(f_grad) for i in range(len(feature_selectors)): result_grads.append(x_grads[i]) return result_grads return tf.accumulate_n(summands), grad
def tower_fn(self, inputs):
    """Build generator/discriminator outputs, losses and metrics for one tower.

    `inputs` and the returned outputs/metrics are batch-major; everything in
    between is computed on time-major tensors.

    Returns:
        ``(outputs_tuple, losses_tuple, loss_tuple, metrics_tuple)`` where
        ``outputs_tuple = (outputs, eval_outputs)``,
        ``losses_tuple = (d_losses, g_losses, g_losses_post)``,
        ``loss_tuple`` holds one weighted sum per entry of ``losses_tuple``
        (a scalar zero when that entry is empty) and
        ``metrics_tuple = (metrics, eval_metrics)``.
    """
    # batch-major to time-major
    inputs = nest.map_structure(transpose_batch_time, inputs)

    with tf.variable_scope(self.generator_scope):
        gen_outputs = self.generator_fn(inputs)

    discrim_outputs = {}
    discrim_outputs_post = {}
    if self.discriminator_fn:
        with tf.variable_scope(self.discriminator_scope) as discrim_scope:
            discrim_outputs = self.discriminator_fn(inputs, gen_outputs)
        # Same discriminator built again with reused variables: these are the
        # tensors evaluated after the discriminator weights have been updated.
        with tf.variable_scope(discrim_scope, reuse=True):
            discrim_outputs_post = self.discriminator_fn(inputs, gen_outputs)

    expected_num_outputs = len(gen_outputs) + len(discrim_outputs)
    outputs = OrderedDict(
        itertools.chain(gen_outputs.items(), discrim_outputs.items()))
    # A smaller merged dict means two outputs shared a key and one was lost.
    assert len(outputs) == expected_num_outputs

    for attr_name in ('learning_rate', 'kl_weight'):
        attr_value = getattr(self, attr_name)
        if isinstance(attr_value, tf.Tensor):
            outputs[attr_name] = attr_value

    d_losses = {}
    g_losses = {}
    g_losses_post = {}
    if self.mode == 'train':
        with tf.name_scope("discriminator_loss"):
            d_losses = self.discriminator_loss_fn(inputs, outputs)
            print_loss_info(d_losses, inputs, outputs)
        with tf.name_scope("generator_loss"):
            g_losses = self.generator_loss_fn(inputs, outputs)
            print_loss_info(g_losses, inputs, outputs)
            if discrim_outputs_post:
                outputs_post = OrderedDict(
                    itertools.chain(gen_outputs.items(),
                                    discrim_outputs_post.items()))
                # generator losses after the discriminator update
                g_losses_post = self.generator_loss_fn(inputs, outputs_post)
            else:
                g_losses_post = g_losses

    with tf.name_scope("metrics"):
        metrics = self.metrics_fn(inputs, outputs)
    with tf.name_scope("eval_outputs_and_metrics"):
        eval_outputs, eval_metrics = self.eval_outputs_and_metrics_fn(
            inputs, outputs)

    def _weighted_total(losses):
        # Each value is a (loss, weight) pair; an empty dict contributes 0.
        if not losses:
            return tf.zeros(())
        return tf.accumulate_n(
            [loss * weight for loss, weight in losses.values()])

    # time-major to batch-major
    outputs_tuple = nest.map_structure(transpose_batch_time,
                                       (outputs, eval_outputs))
    losses_tuple = nest.map_structure(tf.convert_to_tensor,
                                      (d_losses, g_losses, g_losses_post))
    loss_tuple = tuple(_weighted_total(losses) for losses in losses_tuple)
    metrics_tuple = nest.map_structure(transpose_batch_time,
                                       (metrics, eval_metrics))
    return outputs_tuple, losses_tuple, loss_tuple, metrics_tuple
import tensorflow as tf

"""tf.accumulate_n(inputs, shape=None, tensor_dtype=None, name=None)
Purpose: adds the input tensors element-wise. Do not use it when the inputs
are trainable variables; use tf.add_n instead.
Arguments: shape, tensor_dtype: used for type checking."""

a = tf.constant([[1, 2], [3, 4]])
b = tf.constant([[5, 6], [7, 8]])
z = tf.accumulate_n([a, b])

# The context manager closes the session even when run() raises, which the
# manual Session()/close() pair did not guarantee.
with tf.Session() as sess:
    print(sess.run(z))
# z==>[[6 8]
#      [10 12]]
def accumulate_n(sess):
    """Print a constant matrix and its element-wise sum with itself.

    Args:
        sess: session used to evaluate both tensors.
    """
    x = tf.constant([[0, 1, 2], [1, 0, 2]])
    y = tf.accumulate_n([x] * 2)
    # Evaluate x first, then y, matching the printed order.
    x_value = sess.run(x)
    y_value = sess.run(y)
    print('x', x_value, 'y', y_value)
def get_magnitude(self):
    """Return the sum of ``get_magnitude()`` over every value in ``self.parameters``."""
    magnitudes = [
        parameter.get_magnitude()
        for _, parameter in self.parameters.items()
    ]
    return tf.accumulate_n(magnitudes)
def __init__(self,
             iterator,
             session,
             model,
             num_classes,
             optimizer,
             dataset,
             p_norm=2.,
             alpha=None,
             decomp_type='bior2.2',
             NUMPY_images=None,
             NUMPY_labels=None,
             learning_rate=.001,
             weight_decay_p=.0001,
             lp_wavelet_p=.0001,
             batch_size=32,
             bn_momentum=.99,
             robust_regularization=True,
             use_wavelet_decomposition=True,
             wavelet_weights=[0, 1],
             sensitivity_mode='logits',
             graph=tf.get_default_graph()):
    """Build the multi-GPU training graph, diagnostics and summaries.

    For every local GPU reported by ``device_lib.list_local_devices()`` this
    builds a forward pass of ``model`` on ``iterator.get_next()``, the
    classification loss, input-sensitivity diagnostics, an optional penalty on
    the sensitivity, and the optimizer gradients. Per-GPU tensors are then
    concatenated along axis 0 and per-GPU scalars averaged into
    ``self.tensors`` / ``self.scalars``, and ``self.train_op``,
    ``self.summary_op`` and ``self.img_summary_op`` are created.

    Args:
        iterator: ``get_next()`` must yield ``(images, targets)``.
        session: entered with ``as_default()`` while the graph is built.
        model: callable on an image batch; may return a list whose first
            element is the logits and whose remaining elements are passed to
            ``utils.add_bn_ops``.
        num_classes: depth used for the one-hot encodings.
        optimizer: provides ``compute_gradients`` and ``apply_gradients``.
        dataset: provides ``bounds``, ``image_range()`` and
            ``interpret_as_image()``.
        p_norm: passed to ``utils.lp_norm`` / ``utils.lp_norm_weighted``.
        alpha: when not None, 'taus' = logits - alpha * x_terms is added.
        decomp_type: passed to ``create_filter_bank``.
        NUMPY_images, NUMPY_labels: when both are given, an additional set of
            'NUMPY_*' tensors is built by calling ``model`` on them.
        learning_rate: only recorded in the summaries here.
        weight_decay_p: factor on the summed squared trainable weights.
        lp_wavelet_p: factor on the sensitivity penalty.
        batch_size: only used for the 'total_batch_size' summary.
        bn_momentum: passed to ``utils.add_bn_ops``.
        robust_regularization: adds the sensitivity penalty to the loss.
        use_wavelet_decomposition: penalise the wavelet decomposition of the
            sensitivity instead of the sensitivity itself.
        wavelet_weights: weights handed to ``utils.nested_weight_list``; its
            length minus one is the decomposition depth.
        sensitivity_mode: 'logits' or 'NLL'.
        graph: stored on ``self.graph`` but immediately overwritten below.

    NOTE(review): ``wavelet_weights=[0, 1]`` is a mutable default that is
    stored on the instance, and the ``graph`` default is evaluated once at
    definition time.
    NOTE(review): if no GPU is found the device loop never runs and
    ``tensors[0]`` below raises IndexError.
    """
    self.iterator = iterator
    self.session = session
    self.model = model
    self.num_classes = num_classes
    self.optimizer = optimizer
    self.dataset = dataset
    self.robust_regularization = robust_regularization
    self.wavelet_weights = wavelet_weights
    self.nested_wavelet_weights = utils.nested_weight_list(wavelet_weights)
    self.sensitivity_mode = sensitivity_mode
    self.graph = graph
    self.decomp_type = decomp_type
    self.decomp_depth = len(wavelet_weights) - 1
    self.learning_rate = learning_rate
    self.weight_decay_p = weight_decay_p
    self.lp_wavelet_p = lp_wavelet_p
    self.batch_size = batch_size
    self.bn_momentum = bn_momentum
    # Overwrites the `graph` argument assigned a few lines above.
    self.graph = tf.get_default_graph()
    self.p_norm = p_norm
    self.alpha = alpha
    self.NUMPY_images = NUMPY_images
    self.NUMPY_labels = NUMPY_labels

    if use_wavelet_decomposition:
        # Imported lazily so `fwt` is only required when it is used.
        from fwt import multi_channel_fwt, create_filter_bank
        self.decomp_filters, self.reconst_filters = create_filter_bank(
            decomp_type)

    devices = device_lib.list_local_devices()
    GPU_devices = [dev.name for dev in devices if dev.device_type == 'GPU']
    self.num_GPUs = len(GPU_devices)

    # One entry per GPU; aggregated after the loop.
    tensors = []
    scalars = []
    gradients = []
    summaries = []

    with tf.variable_scope(tf.get_variable_scope()):
        with session.as_default():
            for dev in range(self.num_GPUs):
                with tf.device('/device:GPU:%d' % dev):
                    with tf.name_scope('GPU_%d' % dev) as scope:
                        print("Compiling on GPU %d ..." % dev)
                        tensors.append(dict())
                        scalars.append(dict())
                        # scalars finished converting to dict:
                        # mean_NLL, sum_of_true_logits, mean_correlations

                        # Get the inputs from the iterators
                        next_element = iterator.get_next()
                        tensors[-1]['images'] = next_element[0]
                        tensors[-1]['targets'] = next_element[1]
                        tensors[-1]['one_hot_targets'] = tf.one_hot(
                            tensors[-1]['targets'], self.num_classes)

                        # Get the forward propagated output
                        # for the current batch of this GPU.
                        network_output = model(tensors[-1]['images'])
                        tensors[-1]['logits'] = network_output

                        # For neural networks that use batch
                        # normalization, network_output is actually
                        # a list of tensors, where logits[1:]
                        # represent the inputs to the BatchNorm
                        # layers. Here, we handle this situation
                        # if it arises.
                        if type(network_output) == list:
                            tensors[-1]['logits'] = network_output[0]
                            bn_inputs = network_output[1:]
                            utils.add_bn_ops(model,
                                             bn_inputs,
                                             bn_momentum=bn_momentum)

                        tensors[-1]['predictions'] = tf.argmax(
                            tensors[-1]['logits'], axis=1)
                        tensors[-1][
                            'predicted_one_hot_targets'] = tf.one_hot(
                                tensors[-1]['predictions'],
                                self.num_classes)
                        tensors[-1]['predicted_logits'] = tf.reduce_max(
                            tensors[-1]['logits'], axis=1)
                        tensors[-1]['probabilities'] = tf.nn.softmax(
                            tensors[-1]['logits'])

                        #### x-terms, b-terms ####################
                        # NOTE(review): Rop is defined elsewhere; presumably
                        # an R-operator (Jacobian-vector product) -- confirm.
                        tensors[-1]['x_terms'] = Rop(
                            tensors[-1]['logits'], tensors[-1]['images'],
                            tensors[-1]['images'])
                        tensors[-1]['b_terms'] = tensors[-1][
                            'logits'] - tensors[-1]['x_terms']
                        tensors[-1]['predicted_b_terms'] = utils.select(
                            tensors[-1]['b_terms'],
                            tensors[-1]['predictions'], self.num_classes)

                        if self.alpha is not None:
                            tensors[-1]['taus'] = tensors[-1][
                                'logits'] - self.alpha * tensors[-1][
                                    'x_terms']

                        #NUMPY SECTION
                        # Same quantities as above, computed on the fixed
                        # NUMPY_images / NUMPY_labels instead of the iterator.
                        if NUMPY_images is not None and NUMPY_labels is not None:
                            NUMPY_network_output = model(NUMPY_images)
                            tensors[-1][
                                'NUMPY_logits'] = NUMPY_network_output
                            if type(NUMPY_network_output) == list:
                                tensors[-1][
                                    'NUMPY_logits'] = NUMPY_network_output[
                                        0]
                            tensors[-1]['NUMPY_predictions'] = tf.argmax(
                                tensors[-1]['NUMPY_logits'], axis=1)

                            tensors[-1]['NUMPY_x_terms'] = Rop(
                                tensors[-1]['NUMPY_logits'], NUMPY_images,
                                NUMPY_images)
                            tensors[-1]['NUMPY_b_terms'] = tensors[-1][
                                'NUMPY_logits'] - tensors[-1][
                                    'NUMPY_x_terms']

                            tensors[-1][
                                'NUMPY_selected_x_terms'] = utils.select(
                                    tensors[-1]['NUMPY_x_terms'],
                                    NUMPY_labels, self.num_classes)
                            tensors[-1][
                                'NUMPY_selected_b_terms'] = utils.select(
                                    tensors[-1]['NUMPY_b_terms'],
                                    NUMPY_labels, self.num_classes)

                            if self.alpha is not None:
                                # Computed but not stored or used afterwards.
                                NUMPY_taus = tensors[-1][
                                    'NUMPY_logits'] - self.alpha * tensors[
                                        -1]['NUMPY_x_terms']

                            tensors[-1][
                                'NUMPY_selected_logits'] = utils.select(
                                    tensors[-1]['NUMPY_logits'],
                                    NUMPY_labels, self.num_classes)
                            tensors[-1][
                                'NUMPY_logit_sensitivities'] = tf.gradients(
                                    tf.reduce_sum(
                                        tensors[-1]
                                        ['NUMPY_selected_logits']),
                                    NUMPY_images)[0]
                            tensors[-1][
                                'NUMPY_bias_shifted_images'] = bias_shifted_input(
                                    NUMPY_images,
                                    tensors[-1]['NUMPY_selected_b_terms'],
                                    tensors[-1]
                                    ['NUMPY_logit_sensitivities'])
                        ##########################################

                        # Classification loss
                        tensors[-1][
                            'NLLs'] = tf.nn.softmax_cross_entropy_with_logits_v2(
                                labels=tensors[-1]['one_hot_targets'],
                                logits=tensors[-1]['logits'])
                        scalars[-1]['mean_NLL'] = tf.reduce_mean(
                            tensors[-1]['NLLs'])

                        # Setting up the sensitivity penalty.
                        # NOTE(review): there is no else branch, so any other
                        # sensitivity_mode leaves 'sensitivities' unset and
                        # the lookups below raise KeyError.
                        if sensitivity_mode == 'logits':
                            scalars[-1][
                                'sum_of_true_logits'] = tf.reduce_sum(
                                    tensors[-1]['logits'] *
                                    tensors[-1]['one_hot_targets'])
                            tensors[-1]['sensitivities'] = tf.gradients(
                                scalars[-1]['sum_of_true_logits'],
                                tensors[-1]['images'],
                                name='input_gradients')[0]
                        elif sensitivity_mode == 'NLL':
                            tensors[-1]['sensitivities'] = tf.gradients(
                                scalars[-1]['mean_NLL'],
                                tensors[-1]['images'],
                                name='input_gradients')[0]

                        if use_wavelet_decomposition:
                            sensitivity_w_decomp = multi_channel_fwt(
                                tensors[-1]['sensitivities'],
                                self.decomp_filters,
                                self.decomp_depth,
                                output_type='list')

                        # Per-example inner product and norms of image and
                        # sensitivity, reduced over axes 1, 2 and 3.
                        tensors[-1]['inner_products'] = tf.reduce_sum(
                            tensors[-1]['images'] *
                            tensors[-1]['sensitivities'],
                            axis=[1, 2, 3])
                        tensors[-1]['sensitivity_norms'] = tf.sqrt(
                            tf.reduce_sum(tensors[-1]['sensitivities']**2,
                                          axis=[1, 2, 3],
                                          name='sens_norm'))
                        tensors[-1]['image_norms'] = tf.sqrt(
                            tf.reduce_sum(tensors[-1]['images']**2,
                                          axis=[1, 2, 3],
                                          name='im_norm'))
                        tensors[-1]['norm_products'] = tensors[-1][
                            'sensitivity_norms'] * tensors[-1][
                                'image_norms']
                        # With epsilon at 0.0 the divisions below are not
                        # protected against a zero norm product.
                        epsilon = 0.0
                        tensors[-1]['correlations'] = tensors[-1][
                            'inner_products'] / (
                                tensors[-1]['norm_products'] + epsilon)
                        scalars[-1]['mean_correlation'] = tf.reduce_mean(
                            tensors[-1]['correlations'])
                        scalars[-1]['mean_inner_product'] = tf.reduce_mean(
                            tensors[-1]['inner_products'])
                        scalars[-1]['mean_norm_product'] = tf.reduce_mean(
                            tensors[-1]['norm_products'])

                        # Same diagnostics for the true-class logit. This
                        # reassigns 'sum_of_true_logits' in 'logits' mode.
                        tensors[-1]['true_logits'] = tf.reduce_sum(
                            tensors[-1]['logits'] *
                            tensors[-1]['one_hot_targets'],
                            axis=1)
                        scalars[-1]['sum_of_true_logits'] = tf.reduce_sum(
                            tensors[-1]['true_logits'])
                        tensors[-1]['logit_sensitivities'] = tf.gradients(
                            scalars[-1]['sum_of_true_logits'],
                            tensors[-1]['images'],
                            name='logit_input_gradients')[0]
                        tensors[-1][
                            'logit_inner_products'] = tf.reduce_sum(
                                tensors[-1]['images'] *
                                tensors[-1]['logit_sensitivities'],
                                axis=[1, 2, 3])
                        tensors[-1]['logit_sensitivity_norms'] = tf.sqrt(
                            tf.reduce_sum(
                                tensors[-1]['logit_sensitivities']**2,
                                axis=[1, 2, 3],
                                name='sens_norm'))
                        tensors[-1]['logit_norm_products'] = tensors[-1][
                            'logit_sensitivity_norms'] * tensors[-1][
                                'image_norms']
                        tensors[-1]['logit_correlations'] = tensors[-1]['logit_inner_products'] / \
                            (tensors[-1]['logit_norm_products'] + epsilon)
                        scalars[-1][
                            'mean_logit_correlation'] = tf.reduce_mean(
                                tensors[-1]['logit_correlations'])
                        scalars[-1][
                            'mean_logit_inner_product'] = tf.reduce_mean(
                                tensors[-1]['logit_inner_products'])
                        scalars[-1][
                            'mean_logit_norm_product'] = tf.reduce_mean(
                                tensors[-1]['logit_norm_products'])

                        # Again as a tiled image, for visualization.
                        # Only do this if the dimensions work out.
                        # NOTE(review): if this attempt fails while
                        # use_wavelet_decomposition is True, the key
                        # 'sensitivity_w_decomp_imgs' is never set although it
                        # is read after the loop.
                        tiled_image_works = False
                        if use_wavelet_decomposition:
                            try:
                                tensors[-1][
                                    'sensitivity_w_decomp_imgs'] = multi_channel_fwt(
                                        tensors[-1]['sensitivities'],
                                        self.decomp_filters,
                                        self.decomp_depth,
                                        output_type='image')
                                tiled_image_works = True
                            except tf.errors.OpError:
                                print(
                                    "Creating a tiled wavelet image failed."
                                )

                        # sum up all the p-norms of the FWTs of
                        # all channels.
                        if use_wavelet_decomposition:
                            sensitivity_w_mean_lp = 0
                            for decomp in sensitivity_w_decomp:
                                sensitivity_w_mean_lp += utils.lp_norm_weighted(
                                    decomp,
                                    self.nested_wavelet_weights,
                                    p_norm=self.p_norm)
                        else:
                            # Otherwise, just calculate the p-norm of the
                            # sensitivity.
                            sensitivity_w_mean_lp = utils.lp_norm(
                                tensors[-1]['sensitivities'],
                                p_norm=self.p_norm)
                        scalars[-1][
                            'sensitivity_w_mean_lp'] = sensitivity_w_mean_lp

                        ############ ONLY FOR LOGGING PURPOSES ###################
                        # NOTE(review): for integer dtypes tf.random_uniform
                        # excludes maxval, so the targets drawn here look like
                        # they never reach the last class -- confirm intent.
                        tensors[-1]['random_targets'] = tf.random_uniform(
                            tf.shape(tensors[-1]['targets']),
                            maxval=self.num_classes - 1,
                            dtype=tf.int32)
                        tensors[-1]['random_one_hot_targets'] = tf.one_hot(
                            tensors[-1]['random_targets'],
                            self.num_classes)
                        tensors[-1]['random_logits'] = tf.reduce_sum(
                            tensors[-1]['logits'] *
                            tensors[-1]['random_one_hot_targets'],
                            axis=1)
                        scalars[-1][
                            'sum_of_random_logits'] = tf.reduce_sum(
                                tensors[-1]['random_logits'])
                        tensors[-1][
                            'random_logit_sensitivities'] = tf.gradients(
                                scalars[-1]['sum_of_random_logits'],
                                tensors[-1]['images'],
                                name='random_logit_sensitivities')[0]
                        tensors[-1][
                            'random_logit_inner_products'] = tf.reduce_sum(
                                tensors[-1]['images'] *
                                tensors[-1]['random_logit_sensitivities'],
                                axis=[1, 2, 3])
                        tensors[-1][
                            'random_logit_sensitivity_norms'] = tf.sqrt(
                                tf.reduce_sum(
                                    tensors[-1]
                                    ['random_logit_sensitivities']**2,
                                    axis=[1, 2, 3]))

                        scalars[-1][
                            'sum_of_predicted_logits'] = tf.reduce_sum(
                                tensors[-1]['predicted_logits'])
                        tensors[-1][
                            'predicted_logit_sensitivities'] = tf.gradients(
                                scalars[-1]['sum_of_predicted_logits'],
                                tensors[-1]['images'],
                                name='predicted_logit_sensitivities')[0]
                        tensors[-1][
                            'predicted_logit_inner_products'] = tf.reduce_sum(
                                tensors[-1]['images'] * tensors[-1]
                                ['predicted_logit_sensitivities'],
                                axis=[1, 2, 3])
                        tensors[-1][
                            'predicted_logit_sensitivity_norms'] = tf.sqrt(
                                tf.reduce_sum(
                                    tensors[-1]
                                    ['predicted_logit_sensitivities']**2,
                                    axis=[1, 2, 3]))

                        tensors[-1]['true_logit_sensitivities'] = tensors[
                            -1]['logit_sensitivities']
                        tensors[-1][
                            'true_logit_inner_products'] = tf.reduce_sum(
                                tensors[-1]['images'] *
                                tensors[-1]['true_logit_sensitivities'],
                                axis=[1, 2, 3])
                        tensors[-1][
                            'true_logit_sensitivity_norms'] = tf.sqrt(
                                tf.reduce_sum(
                                    tensors[-1]['true_logit_sensitivities']
                                    **2,
                                    axis=[1, 2, 3]))

                        # Calculate the bias gradients
                        flatten = lambda a: tf.reshape(a, (-1, ))
                        IP = lambda a, b: tf.reduce_sum(a * b)

                        # "Biases" are the trainable weights whose name
                        # contains 'bias' plus the 'bn_betas' and 'bn_means'
                        # collections.
                        biases = [
                            b for b in model.trainable_weights
                            if 'bias' in b.name
                        ]
                        biases += tf.get_collection('bn_betas')
                        biases += tf.get_collection('bn_means')

                        # For each kind of logit: inner product of every bias
                        # with the gradient of the summed logit w.r.t. it,
                        # summed over all biases.
                        random_bias_gradients = tf.gradients(
                            scalars[-1]['sum_of_random_logits'],
                            biases,
                            name='random_bias_gradients')
                        random_bg = [
                            IP(flatten(b), flatten(g))
                            for (b, g) in zip(biases, random_bias_gradients)
                        ]
                        random_bias_inner_products = tf.accumulate_n(
                            random_bg)

                        predicted_bias_gradients = tf.gradients(
                            scalars[-1]['sum_of_predicted_logits'],
                            biases,
                            name='predicted_bias_gradients')
                        predicted_bg = [
                            IP(flatten(b), flatten(g)) for (
                                b,
                                g) in zip(biases, predicted_bias_gradients)
                        ]
                        predicted_bias_inner_products = tf.accumulate_n(
                            predicted_bg)

                        true_bias_gradients = tf.gradients(
                            scalars[-1]['sum_of_true_logits'],
                            biases,
                            name='true_bias_gradients')
                        true_bg = [
                            IP(flatten(b), flatten(g))
                            for (b, g) in zip(biases, true_bias_gradients)
                        ]
                        # Computed but not referenced again in this method.
                        true_bias_inner_products = tf.add_n(true_bg)

                        # Network output on an all-zero image.
                        # NOTE(review): `[0]` assumes the model returns a
                        # list; for a plain tensor it would take the first
                        # row instead -- confirm.
                        zero_image = tf.zeros_like(tensors[-1]['images'])
                        tensors[-1]['zero_output'] = model(zero_image)[0]

                        tensors[-1]['random_zero_logits'] = tf.reduce_sum(
                            tensors[-1]['zero_output'] *
                            tensors[-1]['random_one_hot_targets'],
                            axis=1)
                        tensors[-1][
                            'predicted_zero_logits'] = tf.reduce_sum(
                                tensors[-1]['zero_output'] *
                                tensors[-1]['predicted_one_hot_targets'],
                                axis=1)
                        tensors[-1]['true_zero_logits'] = tf.reduce_sum(
                            tensors[-1]['zero_output'] *
                            tensors[-1]['one_hot_targets'],
                            axis=1)

                        # Calculate the approximate random robustness
                        tensors[-1]['inner_product_differences'] = (
                            tensors[-1]['predicted_logit_inner_products'] -
                            tensors[-1]['random_logit_inner_products'])
                        tensors[-1][
                            'bias_differences'] = predicted_bias_inner_products - random_bias_inner_products
                        numerator = tensors[-1][
                            'inner_product_differences'] - tensors[-1][
                                'bias_differences']

                        tensors[-1]['logit_sensitivity_differences'] = (
                            tensors[-1]['predicted_logit_sensitivities'] -
                            tensors[-1]['random_logit_sensitivities'])
                        # No axis is given, so this is a single norm over the
                        # whole batch rather than one per example.
                        denominator = tf.sqrt(
                            tf.reduce_sum(
                                tensors[-1]
                                ['logit_sensitivity_differences']**2))

                        tensors[-1][
                            'approximate_random_robustness'] = numerator / denominator
                        tensors[-1][
                            'inner_product_differences_normalized'] = (
                                tensors[-1]['inner_product_differences'] /
                                denominator)
                        tensors[-1][
                            'bias_differences_normalized'] = tensors[-1][
                                'bias_differences'] / denominator

                        tensors[-1][
                            'bias_difference_shifted_images'] = bias_shifted_input(
                                tensors[-1]['images'],
                                tensors[-1]['bias_differences'],
                                tensors[-1]
                                ['logit_sensitivity_differences'])

                        #print(tensors[-1]['bias_differences_normalized'])
                        #crash()
                        #######################################################

                        # Collect the network's weights and set up
                        # the weight decay penalty
                        trainable_weights = model.trainable_weights
                        scalars[-1]['weight_norm'] = tf.add_n([
                            tf.reduce_sum(w**2) for w in trainable_weights
                        ])

                        # Assemble the total loss for this GPU
                        scalars[-1]['total_loss'] = scalars[-1]['mean_NLL']
                        scalars[-1][
                            'total_loss'] += weight_decay_p * scalars[-1][
                                'weight_norm']
                        if robust_regularization:
                            scalars[-1][
                                'sensitivity_penalty'] = lp_wavelet_p * scalars[
                                    -1]['sensitivity_w_mean_lp']
                            scalars[-1]['total_loss'] += scalars[-1][
                                'sensitivity_penalty']

                        # Everything that is tracked during training
                        # goes here. Top-5 and top-1 accuracies are
                        # automatically added.
                        summary_dict = {
                            'total_loss':
                            scalars[-1]['total_loss'],
                            'mean_NLL':
                            scalars[-1]['mean_NLL'],
                            'weight_2_norm_squared':
                            scalars[-1]['weight_norm'],
                            'mean_sensitivity_wavelet_coeffs_lp':
                            scalars[-1]['sensitivity_w_mean_lp']
                        }

                        # Add some hyperparameters, too.
                        # Some redundant calculations through averaging
                        # later, but the computational overhead is negligible.
                        summary_dict['learning_rate_'] = learning_rate
                        summary_dict['correlation_'] = scalars[-1][
                            'mean_correlation']
                        summary_dict['inner_product_'] = scalars[-1][
                            'mean_inner_product']
                        summary_dict['norm_product_'] = scalars[-1][
                            'mean_norm_product']
                        summary_dict['logit_correlation_'] = scalars[-1][
                            'mean_logit_correlation']
                        summary_dict['logit_inner_product_'] = scalars[-1][
                            'mean_logit_inner_product']
                        summary_dict['logit_norm_product_'] = scalars[-1][
                            'mean_logit_norm_product']
                        summary_dict[
                            'weight_decay_parameter_'] = weight_decay_p
                        summary_dict[
                            'lp_Wavelet_parameter_'] = lp_wavelet_p
                        summary_dict[
                            'total_batch_size'] = batch_size * self.num_GPUs
                        summary_dict['bn_momentum_'] = bn_momentum
                        summary_dict['p_norm'] = p_norm

                        if robust_regularization:
                            summary_dict['sensitivity_penalty'] = scalars[
                                -1]['sensitivity_penalty']

                        summary_dict = summary_utils.prepare_summaries(
                            summary_dict=summary_dict,
                            predictions=tensors[-1]['probabilities'],
                            labels=tensors[-1]['targets'])
                        summaries.append(summary_dict)

                        # Collect the gradients for every GPU
                        gradients.append(
                            optimizer.compute_gradients(
                                scalars[-1]['total_loss'],
                                var_list=trainable_weights,
                                colocate_gradients_with_ops=True))

                        # So far, the adversarial attack model is only
                        # created on one GPU. Different parallelized versions
                        # always lead to errors.
                        if dev == 0:
                            self.adversarial_model = TensorFlowModel(
                                tensors[-1]['images'],
                                tensors[-1]['logits'],
                                bounds=self.dataset.bounds)

                        print("Done.")

    # Copy the lists 'tensors' and 'scalars' and replace these with an aggregated version:
    # Concatenate the tensors and average the scalars.
    self.tensors = dict()
    self.scalars = dict()
    for key in tensors[0].keys():
        print(key)
        self.tensors[key] = tf.concat(
            [tensors_item[key] for tensors_item in tensors], axis=0)
    for key in scalars[0].keys():
        self.scalars[key] = tf.reduce_mean(
            [scalars_item[key] for scalars_item in scalars])

    # Create self.GPU_collections for backwards compatibility
    self.GPU_collections = {**self.tensors, **self.scalars}
    self.GPU_collections['top_1'] = tf.concat(tf.get_collection('top_1'), 0)
    self.GPU_collections['top_5'] = tf.concat(tf.get_collection('top_5'), 0)

    # Collection and apply the gradients over all used
    # GPUs for synchronous parallel training.
    avg_grads = utils.average_gradients(gradients)
    gradient_application = optimizer.apply_gradients(avg_grads)
    # We combine the gradient update and possibly the
    # batch normalization update operators into one.
    self.train_op = tf.group(gradient_application,
                             *(tf.get_collection('bn_update_ops')))

    summary_dict = summary_utils.collect_summaries(summaries)
    self.summary_op = summary_utils.create_summary_op(summary_dict)
    if use_wavelet_decomposition:
        wavelet_summary = tf.summary.tensor_summary(
            'wavelet_weights', self.wavelet_weights)
        self.summary_op = tf.summary.merge(
            [self.summary_op, wavelet_summary])

    # Here, we create a tiled image summary for Tensorboard.
    # We hereby shift the range of the sensitivity and
    # possibly its decomposition to the range of the image.
    image_range = self.dataset.image_range()
    image_max = image_range[1]
    image_min = image_range[0]
    image_span = image_max - image_min
    # NOTE(review): this is half the span, not image_min + span / 2; the two
    # only agree when image_min is 0 -- confirm.
    image_mid = image_span / 2.

    self.images = self.dataset.interpret_as_image(
        self.GPU_collections['images'])
    self.saliencies = self.GPU_collections['sensitivities']
    saliencies_max = tf.reduce_max(tf.abs(self.saliencies), [1, 2],
                                   keepdims=True)
    normalized_saliencies = image_span * self.saliencies / \
        (2*saliencies_max + 1e-9) + image_mid

    if use_wavelet_decomposition:
        self.saliency_decomps = self.GPU_collections[
            'sensitivity_w_decomp_imgs']
        saliency_decomps_max = tf.reduce_max(tf.abs(self.saliency_decomps),
                                             [1, 2],
                                             keepdims=True)
        normalized_decomps = image_span * self.saliency_decomps / \
            (2*saliency_decomps_max + 1e-9) + image_mid

    # Image, saliency and (when available) the decomposition are
    # concatenated along axis 2.
    composite_image = [self.images, normalized_saliencies]
    if tiled_image_works:
        composite_image.append(normalized_decomps)

    img_saliency_decomp = tf.concat(composite_image, 2)

    self.img_summary_op = tf.summary.image('img_saliency_decomp',
                                           img_saliency_decomp,
                                           max_outputs=10)
def _set_train_or_infer(self,
                        res,
                        reverse_target_vocab_table,
                        hparams,
                        scope=None):
    """Set up training and inference.

    Args:
        res: result of building the graph. In TRAIN and EVAL mode ``res[1]``
            is the loss; in INFER mode ``res`` is unpacked as
            ``(logits, _, final_context_state, sample_id)``. With
            ``hparams.dapple_test`` set, ``res[-1]`` holds the outputs of the
            pipeline stages.
        reverse_target_vocab_table: lookup table used in INFER mode to map
            sample ids to words.
        hparams: hyper-parameters (learning rate, optimizer, pipeline and
            micro-batch settings, gradient clipping norm).
        scope: passed through to ``self.build_graph`` for the additional
            micro-batches.

    Raises:
        ValueError: if ``hparams.optimizer`` is neither "sgd" nor "adam".
    """
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.train_loss = res[1]
        self.word_count = tf.reduce_sum(
            self.iterator.source_sequence_length) + tf.reduce_sum(
                self.iterator.target_sequence_length)
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
        self.eval_loss = res[1]
    elif self.mode == tf.contrib.learn.ModeKeys.INFER:
        self.infer_logits, _, self.final_context_state, self.sample_id = res
        self.sample_words = reverse_target_vocab_table.lookup(
            tf.to_int64(self.sample_id))

    if self.mode != tf.contrib.learn.ModeKeys.INFER:
        ## Count the number of predicted words for compute ppl.
        self.predict_count = tf.reduce_sum(
            self.iterator.target_sequence_length)

    params = tf.trainable_variables()

    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.learning_rate = tf.constant(hparams.learning_rate)
        # warm-up
        self.learning_rate = self._get_learning_rate_warmup(hparams)
        # decay
        self.learning_rate = self._get_learning_rate_decay(hparams)

        # Optimizer
        if hparams.optimizer == "sgd":
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif hparams.optimizer == "adam":
            opt = tf.train.AdamOptimizer(self.learning_rate)
        else:
            raise ValueError("Unknown optimizer type %s" %
                             hparams.optimizer)

        DAPPLE_TEST = hparams.dapple_test
        if DAPPLE_TEST:
            devices = cluster_utils.get_pipeline_devices(
                hparams.pipeline_device_num)
            slice_num = len(devices)
            micro_batch_num = hparams.micro_batch_num
            losses = []
            all_outputs = []
            losses.append(self.train_loss)
            stage_outputs = res[-1]
            all_outputs.append(stage_outputs)
            # layer_grads[micro_batch][stage] -> gradients of that stage's
            # variables; layer_vars[stage] -> the matching variables.
            layer_grads = [[[] for i in xrange(slice_num)]
                           for j in xrange(micro_batch_num)]
            layer_vars = [[] for i in xrange(slice_num)]
            remained_vars = tf.trainable_variables()
            ys = losses[0]
            prev_grads = None
            # layers-1 ~ 1 compute grads
            # Walk the stages backwards, chaining the gradient of each
            # stage's input into the next tf.gradients call through grad_ys.
            for i in xrange(slice_num - 1, 0, -1):
                vars_i = [
                    v for v in remained_vars if v.device == devices[i]
                ]
                remained_vars = [
                    v for v in remained_vars if v not in vars_i
                ]
                prev_y = all_outputs[0][i - 1]
                prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
                num_tensors = len(prev_y)
                with tf.device(devices[i]):
                    y_grads = tf.gradients(
                        ys=ys,
                        xs=prev_y + vars_i,
                        grad_ys=prev_grads,
                        colocate_gradients_with_ops=True)
                ys = prev_y
                prev_grads = y_grads[0:num_tensors]
                grads_i = y_grads[num_tensors:]
                # Variables without a gradient are dropped from both lists
                # so that grads and vars stay index-aligned.
                layer_grads[0][i] = [g for g in grads_i if g is not None]
                layer_vars[i] = [
                    v for (g, v) in zip(grads_i, vars_i) if g is not None
                ]
            # layer 0 compute grads
            grads_0 = tf.gradients(ys=ys,
                                   xs=remained_vars,
                                   grad_ys=prev_grads,
                                   colocate_gradients_with_ops=True)
            layer_grads[0][0] = [g for g in grads_0 if g is not None]
            layer_vars[0] = [
                v for (g, v) in zip(grads_0, remained_vars)
                if g is not None
            ]

            # other micro_batch_num
            for j in xrange(1, micro_batch_num):
                # Each stage of micro-batch j depends either on the previous
                # micro-batch's output for that stage or, once i + j reaches
                # slice_num, on gradients of an earlier micro-batch.
                dep_outputs = []
                for i in xrange(slice_num):
                    dep_outputs.append(
                        all_outputs[j - 1][i] if i + j < slice_num else
                        layer_grads[i + j - slice_num][i])
                res = self.build_graph(hparams,
                                       scope,
                                       dep_outputs=dep_outputs)
                losses.append(res[1])
                # push this micro_batch's outputs of all stage
                all_outputs.append(res[-1])
                ys = losses[j]
                prev_grads = None
                for i in xrange(slice_num - 1, 0, -1):
                    prev_y = all_outputs[j][i - 1]
                    prev_y = prev_y if isinstance(prev_y,
                                                  list) else [prev_y]
                    num_tensors = len(prev_y)
                    y_grads = tf.gradients(
                        ys=ys,
                        xs=prev_y + layer_vars[i],
                        grad_ys=prev_grads,
                        colocate_gradients_with_ops=True)
                    ys = prev_y
                    prev_grads = y_grads[0:num_tensors]
                    grads_i = y_grads[num_tensors:]
                    layer_grads[j][i] = [
                        g for g in grads_i if g is not None
                    ]
                grads_0 = tf.gradients(ys=ys,
                                       xs=layer_vars[0],
                                       grad_ys=prev_grads,
                                       colocate_gradients_with_ops=True)
                layer_grads[j][0] = [g for g in grads_0 if g is not None]

            # Sum each variable's gradient over all micro-batches.
            grads_set = []
            vars_set = []
            for i in xrange(slice_num):
                for j in xrange(len(layer_grads[0][i])):
                    grad_i_set = [
                        layer_grads[m][i][j]
                        for m in range(micro_batch_num)
                    ]
                    if micro_batch_num == 1:
                        with tf.device(grad_i_set[0].device):
                            acc_grads = grad_i_set[0]
                    else:
                        with tf.control_dependencies(
                                grad_i_set), tf.device(
                                    grad_i_set[0].device):  # replica
                            if isinstance(grad_i_set[0], tf.IndexedSlices):
                                acc_grads = tf.add_n(grad_i_set)
                            else:
                                acc_grads = tf.accumulate_n(grad_i_set)
                    grads_set.append(acc_grads)
                    vars_set.append(layer_vars[i][j])
            # Materialise the pairs: they are iterated below to build
            # `gradients` and then handed to apply_gradients, and a bare zip
            # iterator (Python 3) would already be exhausted by then.
            grads_and_vars = list(zip(grads_set, vars_set))

            if hparams.cross_pipeline and hvd.size() >= 1:
                devices = cluster_utils.get_pipeline_devices(
                    hparams.pipeline_device_num)
                # Group the pairs by the pipeline device of the variable.
                gradients_list = [[] for i in xrange(len(devices))]
                for grad, var in grads_and_vars:
                    for i in xrange(len(devices)):
                        if var.device == devices[i]:
                            gradients_list[i].append((grad, var))
                            break
                avg_grads_and_vars = []
                for i in xrange(len(devices)):
                    with tf.device(devices[i]):
                        for grad, var in gradients_list[i]:
                            if isinstance(grad, tf.IndexedSlices):
                                grad = tf.convert_to_tensor(grad)
                            avg_grad = hvd.allreduce(grad)
                            avg_grads_and_vars.append((avg_grad, var))
                grads_and_vars = avg_grads_and_vars
            gradients = [grad for grad, _ in grads_and_vars]
        #####
        else:
            # Gradients
            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=True)

        clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
            gradients, max_gradient_norm=hparams.max_gradient_norm)

        if DAPPLE_TEST:
            # NOTE(review): this applies grads_and_vars as built above;
            # clipped_grads is not used in this branch.
            self.update = opt.apply_gradients(grads_and_vars,
                                              global_step=self.global_step)
        else:
            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

        # Summary
        self.train_summary = self._get_train_summary()
    elif self.mode == tf.contrib.learn.ModeKeys.INFER:
        self.infer_summary = self._get_infer_summary(hparams)

    # Print trainable variables
    utils.print_out("# Trainable variables")
    utils.print_out(
        "Format: <name>, <shape>, <size(MB)>, <(soft) device placement>")
    param_total_size = 0.0
    for param in params:
        # Element count * 4.0, scaled to MB (4 bytes per element assumed by
        # the original code).
        param_size = np.asarray(
            param.shape.as_list()[0:]).prod() * 4.0 / 1000 / 1000
        param_total_size += param_size
        utils.print_out("  %s, %s, %.2f, %s" %
                        (param.name, str(param.get_shape()), param_size,
                         param.op.device))
    # The total was previously passed as a second print argument, so the
    # %0.2f placeholder was printed literally instead of being filled in.
    print("# Total size of trainable variables: %0.2f" % param_total_size)
tf.reduce_sum(x, [0, 1]) ==> 6 """ # 计算输入 tensor 所有元素的均值/最大值/最小值/积/逻辑与/或 # 或者计算指定的轴所有元素的均值/最大值/最小值/积/逻辑与/或(just like reduce_sum) tf.reduce_mean(input_tensor, axis=None, keep_dims=False, name=None) tf.reduce_max(input_tensor, axis=None, keep_dims=False, name=None) tf.reduce_min(input_tensor, axis=None, keep_dims=False, name=None) tf.reduce_prod(input_tensor, axis=None, keep_dims=False, name=None) tf.reduce_all(input_tensor, axis=None, keep_dims=False, name=None) # 全部满足条件 tf.reduce_any(input_tensor, axis=None, keep_dims=False, name=None) #至少有一个满足条件 ------------------------------------------- # 分界线以上和 Numpy 中相应的用法完全一致 ------------------------------------------- # inputs 为一 list, 计算 list 中所有元素的累计和, # tf.add(x, y, name=None)只能计算两个元素的和,此函数相当于扩展了其功能 tf.accumulate_n(inputs, shape=None, tensor_dtype=None, name=None) # Computes log(sum(exp(elements across dimensions of a tensor))) tf.reduce_logsumexp(input_tensor, axis=None, keep_dims=False, name=None) # Computes number of nonzero elements across dimensions of a tensor tf.count_nonzero(input_tensor, axis=None, keep_dims=False, name=None) # Compute the cumulative sum of the tensor x along axis tf.cumsum(x, axis=0, exclusive=False, reverse=False, name=None) # Eg: tf.cumsum([a, b, c]) # => [a, a + b, a + b + c] tf.cumsum([a, b, c], exclusive=True) # => [0, a, a + b]
z_reduce_all = tf.reduce_all(z, reduction_indices=[0, 2])

# tf.reduce_any
x = np.random.randint(0, 2, 10 * 5 * 4)
# Boolean mask of the non-zero draws, built in one vectorised step.
z = (x > 0).reshape((10, 5, 4))
z_reduce_any = tf.reduce_any(z, reduction_indices=[0, 2])

# tf.accumulate_n
inputs = [np.random.rand(5, 4, 3)] * 3
# `shape` must be compatible with the inputs; the previous (10, 5, 4) did not
# match the (5, 4, 3) arrays and failed the shape check.
z_accumulate_n = tf.accumulate_n(inputs, shape=(5, 4, 3))

with tf.Session() as sess:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("tf.reduce_sum")
    print(sess.run(z_reduce_sum))
    print("tf.reduce_prod")
    print(sess.run(z_reduce_prod))
    print("tf.reduce_min")
    print(sess.run(z_reduce_min))
    print("tf.reduce_max")
    print(sess.run(z_reduce_max))
def build_model(self, batch_queue, tower, opt, scope):
    """Build one tower and combine per-split gradients with bilevel weights.

    The training batch is cut into ``self.data_generator.batch_splits``
    splits. The gradient of the first split acts as the reference: every
    other split gets a scalar weight from the inner product of its flattened
    gradient with the reference, divided by its own squared norm plus
    ``self.mu``. The weights are L1-normalised and used to sum the gradients
    of the non-reference splits.

    Args:
        batch_queue: ``get_next()`` must yield ``(images, labels)``.
        tower: tower index; variables are reused for every tower after the
            first.
        opt: optimizer providing ``compute_gradients``.
        scope: passed to the model loss and used to filter the summaries.

    Returns:
        ``(mean_loss, grads_and_vars, layers)`` where ``layers`` comes from
        the last split that was built.
    """
    imgs_train, labels_train = batch_queue.get_next()
    tf.summary.histogram('labels', labels_train)

    # Pre-defined splits, each containing the same label distribution.
    num_split = self.data_generator.batch_splits
    img_splits = tf.split(imgs_train, num_split)
    label_splits = tf.split(labels_train, num_split)

    preds_list = []
    loss_list = []
    for split_idx, (imgs, labels) in enumerate(zip(img_splits,
                                                   label_splits)):
        tf.summary.image('imgs/train', montage_tf(imgs, 1, 8), max_outputs=1)

        # Only the very first network (tower 0, split 0) creates variables.
        if tower > 0 or split_idx > 0:
            reuse = True
        else:
            reuse = None
        preds, layers = self.model.net(imgs,
                                       self.data_generator.num_classes,
                                       reuse=reuse)
        preds_list.append(preds)

        split_loss = self.model.loss(
            scope, preds, self.data_generator.format_labels(labels), tower)
        tf.get_variable_scope().reuse_variables()

        # Make the loss depend on the update ops (batch-norm).
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if update_ops:
            updates = tf.group(*update_ops)
            split_loss = control_flow_ops.with_dependencies([updates],
                                                            split_loss)

        loss_list.append(split_loss)

    # Gradients of every split w.r.t. the variables being trained.
    weights = get_variables_to_train(self.train_scopes)
    grads_list = []
    for split_loss in loss_list:
        grads_list.append(opt.compute_gradients(split_loss, weights))

    # One (initially empty) list of weighted gradients per variable.
    grads_accum = {}
    for _, var in grads_list[0]:
        grads_accum[var] = []

    # Each split's gradients flattened into a single column vector.
    grads_flat = []
    for split_grads in grads_list:
        columns = [tf.reshape(g, (-1, 1)) for (g, v) in split_grads]
        grads_flat.append(tf.concat(columns, axis=0))

    # Mini-batch weights relative to the reference (first) split.
    val_grad = grads_flat[0]
    w = []
    for train_grad in grads_flat[1:]:
        alignment = tf.reduce_sum(tf.multiply(val_grad, train_grad))
        squared_norm = tf.reduce_sum(tf.multiply(train_grad, train_grad))
        w.append(tf.divide(alignment, squared_norm + self.mu))

    # Scale each split's gradients by its L1-normalised weight.
    w_l1norm = tf.reduce_sum(tf.abs(w))
    for split_weight, split_grads in zip(w, grads_list[1:]):
        for g, var in split_grads:
            grads_accum[var].append(tf.multiply(g, split_weight / w_l1norm))
    tf.summary.histogram('w', tf.stack(w))

    # Weight decay only for variables whose op name ends with 'weights'.
    grads_wd = {}
    for _, var in grads_list[0]:
        if var.op.name.endswith('weights'):
            grads_wd[var] = self.model.weight_decay * var
        else:
            grads_wd[var] = 0.0

    # Accumulate all the gradients per variable.
    grads = []
    for _, var in grads_list[0]:
        grads.append((tf.accumulate_n(grads_accum[var]) + grads_wd[var], var))

    self.summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
    return tf.reduce_mean(loss_list), grads, layers
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` building a pipelined, micro-batched XLNet graph.

    The model is built once per micro-batch with ``XlnetSlice.build``.  For
    every micro-batch the gradients are computed slice by slice, walking from
    the last pipeline slice back to slice 0 and feeding the gradient of each
    slice's input as ``grad_ys`` of the next ``tf.gradients`` call.  The
    per-variable gradients of all micro-batches are then summed on the
    device of the owning slice and handed to ``get_train_op``.

    Variables are assigned to slice ``i`` (for ``i >= 1``) when their
    ``device`` equals ``devices[i]``; everything left over belongs to
    slice 0.

    Args:
        features: Mapping providing the keys ``input_ids``, ``input_mask``,
            ``segment_ids``, ``cls_index``, ``p_mask``, ``start_positions``,
            ``end_positions`` and ``is_impossible``.
        labels: Unused; present to satisfy the Estimator signature.
        mode: Forwarded unchanged to the returned ``EstimatorSpec``.
        params: Unused; present to satisfy the Estimator signature.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying the loss of the first
        micro-batch and the training op.

    Note:
        The graph is always built with ``is_training=True`` and a
        ``train_op``, regardless of ``mode``.
    """
    feature_keys = (
        'input_ids', 'input_mask', 'segment_ids', 'cls_index',
        'p_mask', 'start_positions', 'end_positions', 'is_impossible',
    )
    inputs = [features[key] for key in feature_keys]

    slice_xlnet = XlnetSlice(
        embedding_dim=FLAGS.embedding_dim,
        num_token=FLAGS.num_token,
        num_layer=FLAGS.num_layer,
        num_head=FLAGS.num_head,
        feed_forward_dim=FLAGS.feed_forward_dim,
        attention_head_dim=FLAGS.attention_head_dim,
        target_len=FLAGS.target_len,
        dropout=FLAGS.dropout,
        is_training=True,
        attention_dropout=FLAGS.dropatt)
    total_loss, outputs = slice_xlnet.build(inputs)

    devices = cluster_utils.get_pipeline_devices(FLAGS.pipeline_device_num)
    slice_num = len(devices)
    micro_batch_num = FLAGS.micro_batch_num

    losses = [total_loss]
    all_outputs = [outputs]

    # layer_grads[micro_batch][slice] -> list of gradients for that slice.
    # `range` (not the Python-2-only `xrange`) keeps this runnable on both
    # major Python versions.
    layer_grads = [[[] for _ in range(slice_num)]
                   for _ in range(micro_batch_num)]
    # layer_vars[slice] -> variables matching layer_grads[0][slice].
    layer_vars = [[] for _ in range(slice_num)]

    remained_vars = tf.trainable_variables()
    ys = losses[0]
    prev_grads = None

    # First micro-batch, slices (slice_num - 1) .. 1: compute gradients and
    # discover which variables belong to which slice.
    for i in range(slice_num - 1, 0, -1):
        vars_i = [v for v in remained_vars if v.device == devices[i]]
        # Complement of vars_i; a device comparison avoids the quadratic
        # `v not in vars_i` membership scan.
        remained_vars = [v for v in remained_vars if v.device != devices[i]]
        prev_y = all_outputs[0][i - 1]
        y_grads = tf.gradients(ys=ys,
                               xs=[prev_y] + vars_i,
                               grad_ys=prev_grads,
                               colocate_gradients_with_ops=True)
        ys = prev_y
        prev_grads = y_grads[0]
        grads_i = y_grads[1:]
        # Variables that do not influence `ys` yield None and are dropped.
        layer_grads[0][i] = [g for g in grads_i if g is not None]
        layer_vars[i] = [v for (g, v) in zip(grads_i, vars_i)
                         if g is not None]

    # First micro-batch, slice 0: everything not claimed by another slice.
    grads_0 = tf.gradients(ys=ys,
                           xs=remained_vars,
                           grad_ys=prev_grads,
                           colocate_gradients_with_ops=True)
    layer_grads[0][0] = [g for g in grads_0 if g is not None]
    layer_vars[0] = [v for (g, v) in zip(grads_0, remained_vars)
                     if g is not None]

    # Remaining micro-batches reuse the variable partition found above.
    for j in range(1, micro_batch_num):
        # For slice i: the previous micro-batch's output of that slice while
        # i + j < slice_num, otherwise the gradients an earlier micro-batch
        # produced for that slice.
        # NOTE(review): presumably `build` uses these to order the pipeline
        # stages - confirm against XlnetSlice.build.
        dep_outputs = []
        for i in range(slice_num):
            if i + j < slice_num:
                dep_outputs.append(all_outputs[j - 1][i])
            else:
                dep_outputs.append(layer_grads[i + j - slice_num][i])

        loss, outputs = slice_xlnet.build(inputs, dep_outputs=dep_outputs)
        losses.append(loss)
        all_outputs.append(outputs)

        ys = losses[j]
        prev_grads = None
        for i in range(slice_num - 1, 0, -1):
            prev_y = all_outputs[j][i - 1]
            y_grads = tf.gradients(ys=ys,
                                   xs=[prev_y] + layer_vars[i],
                                   grad_ys=prev_grads,
                                   colocate_gradients_with_ops=True)
            ys = prev_y
            prev_grads = y_grads[0]
            grads_i = y_grads[1:]
            layer_grads[j][i] = [g for g in grads_i if g is not None]

        grads_0 = tf.gradients(ys=ys,
                               xs=layer_vars[0],
                               grad_ys=prev_grads,
                               colocate_gradients_with_ops=True)
        layer_grads[j][0] = [g for g in grads_0 if g is not None]

    # Sum each variable's gradient over all micro-batches on its own device.
    grads_set = []
    vars_set = []
    for i in range(slice_num):
        for j, var in enumerate(layer_vars[i]):
            grad_i_set = [layer_grads[m][i][j]
                          for m in range(micro_batch_num)]
            if micro_batch_num == 1:
                with tf.device(devices[i]):
                    acc_grads = grad_i_set[0]
            else:
                with tf.control_dependencies(grad_i_set), \
                        tf.device(devices[i]):
                    if isinstance(grad_i_set[0], tf.IndexedSlices):
                        acc_grads = tf.add_n(grad_i_set)
                    else:
                        acc_grads = tf.accumulate_n(grad_i_set)
            grads_set.append(acc_grads)
            vars_set.append(var)

    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator
    # and would be empty on a second pass inside get_train_op.
    grads_and_vars = list(zip(grads_set, vars_set))

    # init_from_checkpoint(FLAGS)
    train_op = get_train_op(FLAGS, grads_and_vars)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
    )
import numpy as np

# Demo of TensorFlow reduction / accumulation ops on small constant inputs.
input_a = np.array([[1, 1, 2], [2, 3, 4]], dtype=np.float32)
input_b = np.array([[True, False], [True, True]])
input_c = np.array([[1.3, 1.2, 2.3], [2., 3., 2.3]], dtype=np.float32)

# `axis` replaces the deprecated `reduction_indices` alias.  `keep_dims` is
# left untouched so the script keeps working on older TF 1.x releases; newer
# releases spell it `keepdims`.
input_a_sum_column = tf.reduce_sum(input_a, axis=0)
input_a_sum_row = tf.reduce_sum(input_a, axis=1, keep_dims=True)
input_a_prod_column = tf.reduce_prod(input_a, axis=0)
input_a_prod_row = tf.reduce_prod(input_a, axis=1, keep_dims=True)
input_a_min = tf.reduce_min(input_a, axis=1)
input_a_max = tf.reduce_max(input_a, axis=1)
input_a_mean = tf.reduce_mean(input_a, axis=1, keep_dims=True)

# Row-wise logical AND / OR of the boolean matrix.
input_b_and = tf.reduce_all(input_b, axis=1)
input_b_or = tf.reduce_any(input_b, axis=1)

# Element-wise sum of same-shaped inputs, and a cumulative sum over the
# stacked column sums / column products.
input_accum = tf.accumulate_n(inputs=[input_a, input_c])
input_cum = tf.cumsum(x=[input_a_sum_column, input_a_prod_column])

with tf.Session() as sess:
    # No variables are created above, so this initializer has nothing to do;
    # it is kept so the script's structure stays unchanged.
    init = tf.global_variables_initializer()
    sess.run(init)
    print(sess.run(input_a_sum_column), '\n', sess.run(input_a_sum_row))
    print(sess.run(input_a_prod_column), '\n', sess.run(input_a_prod_row))
    print(sess.run(input_a_min), '\n', sess.run(input_a_max), '\n',
          sess.run(input_a_mean))
    # The boolean reductions used to be built but never evaluated or shown.
    print(sess.run(input_b_and), '\n', sess.run(input_b_or))
    print(sess.run(input_accum), '\n', sess.run(input_cum))
def get_run_op():
    """Build a model-parallel stack of fully connected layers across GPUs.

    Each of ``FLAGS.num_gpus`` devices holds one ``[batch_size, slice_size]``
    data variable and, per layer, one ``[slice_size, feature_size]`` weight
    variable.  The forward pass multiplies each device's slice with its
    weight, splits the result column-wise and sums matching pieces across
    devices.  The backward pass concatenates the slices again, computes the
    input gradient (``bp``) and the weight gradient (``grad``) on every
    device, and groups all weight gradients behind a single no-op.

    Side effects:
        Sets the module-level globals ``batch_size``, ``slice_size`` and
        ``feature_size`` and prints the slice size.

    Returns:
        ``(init_op, train_op)``: the variable initializer and a no-op that
        depends on every weight-gradient matmul.
    """
    global batch_size
    global slice_size
    global feature_size
    batch_size = FLAGS.batch_size
    # Floor division: a plain `/` yields a float on Python 3, which is not a
    # valid tensor dimension.
    slice_size = FLAGS.hidden_size // FLAGS.num_gpus
    feature_size = slice_size * FLAGS.num_gpus
    print("Slice size: {}".format(slice_size))

    # One input slice per device.
    data = []
    for i in range(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
            data.append(tf.get_variable(
                name='data%d' % i,
                shape=[batch_size, slice_size],
                trainable=False))

    # weights: w[layer][gpu]
    w = []
    for i in range(FLAGS.num_layers):
        w.append([])
        for j in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % j):
                with tf.variable_scope('fc%d' % i):
                    w[i].append(tf.get_variable(
                        name='w%d' % j,
                        shape=[slice_size, feature_size],
                        trainable=True))

    # ff: forward pass; fwd[layer] remembers that layer's inputs for the
    # weight-gradient computation below.
    fwd = []
    last = data
    for i in range(FLAGS.num_layers):
        with tf.name_scope('fc_ff%d' % i):
            fwd.append(last)
            tmp = []
            for j in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % j):
                    # matmult
                    y = tf.matmul(last[j], w[i][j])
                    if FLAGS.num_gpus > 1:
                        # split column-wise, one piece per device
                        # (TF >= 1.0 keyword names)
                        tmp.append(tf.split(
                            value=y,
                            num_or_size_splits=FLAGS.num_gpus,
                            axis=1))
                    else:
                        tmp.append(y)
            if FLAGS.num_gpus > 1:
                # reduce: device j sums the j-th piece from every device
                red = []
                for j in range(FLAGS.num_gpus):
                    with tf.device('/gpu:%d' % j):
                        red.append(tf.accumulate_n([s[j] for s in tmp]))
                last = red
            else:
                last = tmp

    # bp: backward pass, last layer first
    targets = []
    for i in reversed(range(FLAGS.num_layers)):
        with tf.name_scope('fc_bp%d' % i):
            # convert col -> rep: every device gets the full concatenation
            tmp = []
            if FLAGS.num_gpus > 1:
                for j in range(FLAGS.num_gpus):
                    with tf.device('/gpu:%d' % j):
                        tmp.append(tf.concat(values=last, axis=1))
            else:
                tmp = last
            last = []
            for j in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % j):
                    with tf.name_scope('bp'):
                        # matmult: bp
                        dy = tf.matmul(tmp[j], w[i][j], transpose_b=True)
                        last.append(dy)
                    if i == 0:
                        # no manual scheduling dep since the last bp is
                        # not needed
                        dep = []
                    else:
                        # add manual dep for better scheduling decision
                        dep = [dy]
                    with tf.control_dependencies(dep), tf.name_scope('grad'):
                        # matmult: grad
                        dw = tf.matmul(fwd[i][j], tmp[j], transpose_a=True)
                    # update
                    targets.append(dw)

    with tf.control_dependencies(targets):
        train_op = tf.no_op()
    # Replaces the deprecated tf.initialize_all_variables().
    init_op = tf.global_variables_initializer()
    return init_op, train_op