Example #1
def merge_gradients(tower_grad):
    grads_and_vars = tower_grad.pop()
    grad_len = float(len(tower_grad))
    # If any towers remain after the pop, more than one
    # gradient needs to be averaged.
    if grad_len > 0:
        gs = []
        vs = []
        for g, v in grads_and_vars:
            gs.append(g)
            vs.append(v)

        for grad in tower_grad:
            for i, (g, v) in enumerate(grad):
                assert v == vs[i]
                if isinstance(g, tf.Tensor):
                    gs[i] += g
                elif isinstance(g, tf.IndexedSlices):
                    # Sum IndexedSlices by concatenating their values and
                    # indices; duplicate indices are merged when the
                    # gradient is applied to the variable.
                    sum_values = tf.concat([gs[i].values, g.values], axis=0)
                    sum_indices = tf.concat([gs[i].indices, g.indices], axis=0)
                    gs[i] = tf.IndexedSlices(sum_values, sum_indices,
                                             g.dense_shape)

        for i in range(len(gs)):
            if isinstance(gs[i], tf.Tensor):
                gs[i] /= grad_len + 1.0
            elif isinstance(gs[i], tf.IndexedSlices):
                # Divide by the total tower count: the popped tower
                # plus the grad_len remaining towers.
                gs[i] = tf.IndexedSlices(gs[i].values / (grad_len + 1.0),
                                         gs[i].indices, gs[i].dense_shape)
        grads_and_vars = zip(gs, vs)
    return grads_and_vars
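
A minimal usage sketch (an assumption about the calling convention, not taken from the original project): tower_grad holds one [(grad, var), ...] list per tower, with every tower referencing the same variables.

import tensorflow as tf

v = tf.Variable([1.0, 2.0])
tower_grad = [
    [(tf.constant([0.2, 0.4]), v)],  # gradients from tower 0
    [(tf.constant([0.4, 0.8]), v)],  # gradients from tower 1
]
averaged = list(merge_gradients(tower_grad))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(averaged[0][0]))  # -> [0.3 0.6]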
Example #2
	def build_J_logistic_w_scaled_reg(self, lambda_val=1.0, y=None):
		""" @fn :: build_J_logistic_w_scaled_reg - 
		@brief :: build or make the cost functional: logistic loss plus a scaled L2-norm (i.e. Euclidean distance norm) regularization term

		# regularization, "learning", "momentum" parameters/constants/rates
		@type lambda_val : float
		@param lambda_val : regularization constant
		"""

		try:
			m = tf.cast( self.X.get_shape()[0], tf.float32) # ValueError
		except ValueError as valerr:
			print("ValueError in obtaining batch size: ", valerr)
			m = self.X.get_shape()[0]

		loss = self.build_J_logistic(y)

		Thetas_only = self._CNN_model.__get_state__()['Thetas']

		ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() so tf.accumulate_n gets a sequence under Python 3
		
		reg_term = tf.accumulate_n(ThetaL2norms)
		
		J = loss + lambda_val*(1.0/m)*reg_term

		self.J_Theta = J
		return J
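
For reference, the scaled regularizer built here in isolation (a sketch with hypothetical names, assuming TF 1.x); note that tf.nn.l2_loss(t) computes sum(t ** 2) / 2, i.e. half the squared L2 norm of each weight tensor:

def l2_regularized_cost(loss, thetas, m, lambda_val=1.0):
    # Sum the per-tensor (half) squared L2 norms, then scale by lambda / m.
    reg_term = tf.accumulate_n([tf.nn.l2_loss(t) for t in thetas])
    return loss + lambda_val * (1.0 / m) * reg_term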
Example #3
	def build_J_logistic_w_reg(self, lambda_val=1.0, y=None):
		""" @fn :: build_J_logistic_w_reg - 
		@brief :: build or make the cost functional: logistic loss plus an L2-norm (i.e. Euclidean distance norm) regularization term

		# regularization, "learning", "momentum" parameters/constants/rates
		@type lambda_val : float
		@param lambda_val : regularization constant
		"""
		"""		
		if y is not None:
			self.y = y
		else:
			y = self.y
		"""		
		loss = self.build_J_logistic(y)

		Thetas_only = self._CNN_model.__get_state__()['Thetas']

		ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() so tf.accumulate_n gets a sequence under Python 3
		
		reg_term = tf.accumulate_n(ThetaL2norms)
		
		J = loss + lambda_val*reg_term

		self.J_Theta = J
		return J
Example #4
    def build_J_xent_w_scaled_reg(self, y=None, lambda_val=1.0):
        """ build_J_L2norm_w_scaled_reg
		
		with 
		J_loss = \frac{1}{m} \sum_{i=0}^{m-1} (\widehat{\mathbf{y}}^{(i)}-\mathbf{y}^{(i)})^2
		(see build_J_L2norm)
		this adds the following term:  
		J_reg = \frac{1}{m} \sum_{l=0}^{L-1} \| \Theta^{(l)} \|_2^2 
				= \frac{1}{m} \sum_{l=0}^{L-1} \sum_{I \in \mathcal{I}} (\Theta_I^{(l)})^2 
		and computes 
		J = J_loss + J_reg
				
		@type lambda_val  : (single) float number
		@param lambda_val : regularization parameter
		
		"""
        # m=number of samples, i.e. total "batch" size
        X = self.X
        m = tf.cast(tf.shape(X)[0], tf.float32)

        loss = self.build_J_xent(y)

        Thetas_only = self.DNN_model.__get_state__()['Thetas']

        ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3

        reg_term = tf.accumulate_n(ThetaL2norms)

        J = loss + lambda_val * (1.0 / m) * reg_term

        self.J_Theta = J

        return J
Example #5
def connectivity_penalty(adj: tf.Tensor,
                         features: tf.Tensor,
                         batch_size: int,
                         penalty_weight: float = 1.0,
                         add_summaries: bool = False,
                         scope: Optional[str] = None) -> tf.Tensor:
    def _sigmoid(x, a=100):
        # 1 / (1 + exp(-a * (x - 1/2))): a steep sigmoid, i.e. a soft step.
        return tf.sigmoid(a * (x - 0.5))

    with tf.name_scope(scope, 'ConnectivityPenalty', [adj, features]):
        n_nodes = adj.shape[-1].value
        with tf.name_scope('adj_power', values=[adj]):
            prob_edge = 1.0 - adj[:, 0, :, :]
            As = [tf.eye(n_nodes, batch_shape=[batch_size]), prob_edge]
            for i in range(2, n_nodes - 1):
                As.append(_sigmoid(tf.matmul(As[i - 1], prob_edge)))
            indicator = _sigmoid(tf.accumulate_n(As))

        prob_node = tf.expand_dims(1.0 - features[:, :, 0], axis=-1)
        # compute all paired probabilities
        q = tf.matmul(prob_node, tf.matrix_transpose(prob_node))
        g = tf.add(q * (1.0 - indicator), (1.0 - q) * indicator)
        penalty = penalty_weight / (n_nodes * n_nodes) * tf.reduce_sum(g)

        tf.losses.add_loss(penalty)
        if add_summaries:
            tf.summary.scalar('penalty', penalty)

    return penalty
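
A smoke-test sketch (assumptions: TF 1.x, and that channel 0 of adj encodes "no edge" while channel 0 of features encodes "no node", as the 1.0 - adj[:, 0, :, :] and 1.0 - features[:, :, 0] expressions suggest):

adj = tf.random_uniform([2, 3, 4, 4])    # [batch, edge_types, N, N]
features = tf.random_uniform([2, 4, 5])  # [batch, N, node_types]
penalty = connectivity_penalty(adj, features, batch_size=2)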
Example #6
	def build_J_xent_w_scaled_reg(self, y=None, lambda_val=1.0):
		""" build_J_xent_w_scaled_reg
		
		with 
		J_loss = \frac{1}{m} \sum_{i=0}^{m-1} (\widehat{\mathbf{y}}^{(i)}-\mathbf{y}^{(i)})^2
		(see build_J_L2norm)
		this adds the following term:  
		J_reg = \frac{1}{m} \sum_{l=0}^{L-1} \| \Theta^{(l)} \|_2^2 
				= \frac{1}{m} \sum_{l=0}^{L-1} \sum_{I \in \mathcal{I}} (\Theta_I^{(l)})^2 
		and computes 
		J = J_loss + J_reg
				
		@type lambda_val  : (single) float number
		@param lambda_val : regularization parameter
		
		"""
		# m=number of samples, i.e. total "batch" size
		X=self.X
		m = tf.cast( tf.shape( X )[0], tf.float32)
		
		loss = self.build_J_xent(y)
		
		Thetas_only = self.DNN_model.__get_state__()['Thetas']
		
		ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3
		
		reg_term = tf.accumulate_n( ThetaL2norms )
		
		J = loss + lambda_val*(1.0/m)*reg_term  

		self.J_Theta = J

		return J 		
Example #7
    def build_J_xent_w_reg(self, y=None, lambda_val=1.0):
        """ build_J_xent_w_reg
		
		with 
		J_loss 
		(see build_J_xent)
		this adds the following term:  
		J_reg = \sum_{l=0}^{L-1} \| \Theta^{(l)} \|_2^2 
				= \sum_{l=0}^{L-1} \sum_{I \in \mathcal{I}} (\Theta_I^{(l)})^2 
		and computes 
		J = J_loss + J_reg
		
		Notice that J_reg carries no $\frac{1}{m}$ factor, where m = number of (input) examples/samples; there is no way to 
		obtain that factor without invoking m haphazardly from the matrix size dimension of the input X, since 
		X \in \text{Mat}_{\mathbb{K}}(m,d).  That scaling is done in build_J_L2norm_w_scaled_reg.
		
		@type lambda_val  : (single) float number
		@param lambda_val : regularization parameter
		
		"""
        loss = self.build_J_xent(y)

        Thetas_only = self.DNN_model.__get_state__()['Thetas']

        ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3

        reg_term = tf.accumulate_n(ThetaL2norms)

        J = loss + lambda_val * reg_term

        self.J_Theta = J

        return J
Example #8
	def build_J_xent_w_reg(self, y=None, lambda_val=1.0):
		""" build_J_xent_w_reg
		
		with 
		J_loss 
		(see build_J_xent)
		this adds the following term:  
		J_reg = \sum_{l=0}^{L-1} \| \Theta^{(l)} \|_2^2 
				= \sum_{l=0}^{L-1} \sum_{I \in \mathcal{I}} (\Theta_I^{(l)})^2 
		and computes 
		J = J_loss + J_reg
		
		Notice that J_reg carries no $\frac{1}{m}$ factor, where m = number of (input) examples/samples; there is no way to 
		obtain that factor without invoking m haphazardly from the matrix size dimension of the input X, since 
		X \in \text{Mat}_{\mathbb{K}}(m,d).  That scaling is done in build_J_L2norm_w_scaled_reg.
		
		@type lambda_val  : (single) float number
		@param lambda_val : regularization parameter
		
		"""
		loss = self.build_J_xent(y)
		
		Thetas_only = self.DNN_model.__get_state__()['Thetas']
		
		ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3
		
		reg_term = tf.accumulate_n( ThetaL2norms )
		
		J = loss + lambda_val*reg_term  

		self.J_Theta = J

		return J 
Example #9
    def build_J_logistic_w_scaled_reg(self, lambda_val=1.0, y=None):
        """ @fn :: build_J_L2norm_w_reg - 
		@brief :: build or make cost functional, of the form of the L2 norm (i.e. Euclidean distance norm)

		# regularization, "learning", "momentum" parameters/constants/rates
		@type lambda_val : float
		@param lambda_val : regularization constant
		"""

        try:
            m = tf.cast(self.X.get_shape()[0], tf.float32)  # ValueError
        except ValueError as valerr:
            print("ValueError in obtaining batch size: ", valerr)
            m = self.X.get_shape()[0]

        loss = self.build_J_logistic(y)

        Thetas_only = self._CNN_model.__get_state__()['Thetas']

        ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3

        reg_term = tf.accumulate_n(ThetaL2norms)

        J = loss + lambda_val * (1.0 / m) * reg_term

        self.J_Theta = J
        return J
Example #10
def scale_invariant_gradient_loss(prediction, gt):
    def discrete_scale_invariant_gradient(f, h):
        """
        Calculates the discrete scale invariant gradient of f with spacing h
        """
        _, height, width, _ = f.shape.as_list()

        # Pad the input width and height to allow for the spacing
        padded_f = tf.pad(f, [[0, 0], [0, h], [0, h], [0, 0]])

        # f(i + h, j)
        f_ih_j = padded_f[:, 0:height, h:width + h, :]

        # (f(i + h, j) - f(i, j)) / (|f(i + h, j)| + |f(i, j)|)
        i = (f_ih_j - f) / (tf.abs(f_ih_j) + tf.abs(f))

        # f(i, j + h)
        f_i_jh = padded_f[:, h:height + h, 0:width, :]

        # (f(i, j + h) - f(i, j)) / (|f(i, j + h)| + |f(i, j)|)
        j = (f_i_jh - f) / (tf.abs(f_i_jh) + tf.abs(f))

        return tf.stack([i, j])

    all_losses = []
    hs = [1, 2, 4, 8, 16]
    for h in hs:
        pred_grad = discrete_scale_invariant_gradient(prediction, h)
        gt_grad = discrete_scale_invariant_gradient(gt, h)
        all_losses.append(l2(pred_grad, gt_grad, normalize=False))
    return tf.reduce_sum(tf.accumulate_n(all_losses))
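
The loop above calls an l2 helper that the snippet does not define; a plausible stand-in (an assumption, not the original implementation):

def l2(a, b, normalize=True):
    # Sum of squared differences, optionally averaged over all elements.
    diff = tf.reduce_sum(tf.square(a - b))
    if normalize:
        diff = diff / tf.cast(tf.size(a), tf.float32)
    return diff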
Example #11
    def build_J_logistic_w_reg(self, lambda_val=1.0, y=None):
        """ @fn :: build_J_L2norm_w_reg - 
		@brief :: build or make cost functional, of the form of the L2 norm (i.e. Euclidean distance norm)

		# regularization, "learning", "momentum" parameters/constants/rates
		@type lambda_val : float
		@param lambda_val : regularization constant
		"""
        """		
		if y is not None:
			self.y = y
		else:
			y = self.y
		"""
        loss = self.build_J_logistic(y)

        Thetas_only = self._CNN_model.__get_state__()['Thetas']

        ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3

        reg_term = tf.accumulate_n(ThetaL2norms)

        J = loss + lambda_val * reg_term

        self.J_Theta = J
        return J
Example #12
def classify(prob):
    max_pred_digits = []
    cum_max_pred = []

    for i in range(n_digits):
        log_prob = tf.log(prob[i])
        max_pred_digits.append(tf.argmax(log_prob, 1))
        max_pred = tf.reduce_max(log_prob, 1)
        if i == 0:
            cum_max_pred.append(max_pred)
        else:
            cum_max_pred.append(
                tf.accumulate_n([cum_max_pred[i - 1], max_pred]))

    max_pred_digits = tf.reshape(tf.concat(0, max_pred_digits), [-1, n_digits])

    log_prob_len = tf.log(prob[n_digits])
    log_prob_len = tf.split(1, n_digits + 1, log_prob_len)

    total_max_pred = []
    total_max_pred.append(log_prob_len[0])

    for i in range(n_digits):
        total_max_pred.append(
            tf.accumulate_n(
                [log_prob_len[i + 1],
                 tf.reshape(cum_max_pred[i], [-1, 1])]))

    total_max_pred = tf.reshape(tf.concat(0, total_max_pred),
                                [-1, len(total_max_pred)])
    total_len = tf.cast(tf.argmax(total_max_pred, 1), tf.int32)

    batch_size = total_len.get_shape().as_list()[0]

    lengths_transposed = tf.expand_dims(total_len, 1)
    lengths_tiled = tf.tile(lengths_transposed, [1, n_digits])

    range_all = tf.range(0, n_digits, 1)
    range_row = tf.expand_dims(range_all, 0)
    range_tiled = tf.tile(range_row, [batch_size, 1])

    mask = tf.less(range_tiled, lengths_tiled)
    all_neg_ones = tf.cast(tf.fill(tf.shape(mask), -1), tf.int64)

    result = tf.select(mask, max_pred_digits, all_neg_ones)

    return result
Example #13
 def _build(self, logits):
     weak_classifications = [tf.nn.softmax(l) for l in logits]
     weighted_classifications = [
         c * (1. / (s + 1e-5) * a)
         for c, s, a in zip(weak_classifications, self._weak_running_sums,
                            self._running_accs)
     ]
     return tf.accumulate_n(weighted_classifications)
Example #14
 def _build(self, logits):
     assert len(logits) == self._weights.get_shape().as_list()[0]
     stopped_logits = [tf.stop_gradient(l) for l in logits]
     weighted_logits = [
         a * b for a, b in zip(stopped_logits,
                               tf.split(self._weights, len(logits)))
     ]
     return tf.accumulate_n(weighted_logits) / float(len(logits))
Example #15
 def testSimple(self):
     with self.test_session():
         random_arrays = [np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20)]
         random_tensors = [tf.convert_to_tensor(x, dtype=tf.float32) for x in random_arrays]
         tf_val = tf.accumulate_n(random_tensors)
         np_val = random_arrays[0]
         for random_array in random_arrays[1:]:
             np_val += random_array
         self.assertAllClose(np_val, tf_val.eval())
Example #16
def classify(prob):
    max_pred_digits = []
    cum_max_pred = []

    for i in range(n_digits):
        log_prob = tf.log(prob[i])
        max_pred_digits.append(tf.argmax(log_prob,1))
        max_pred = tf.reduce_max(log_prob,1)
        if i == 0:
            cum_max_pred.append(max_pred)
        else:
            cum_max_pred.append(tf.accumulate_n([cum_max_pred[i-1], max_pred]))
    
    max_pred_digits = tf.reshape(tf.concat(0, max_pred_digits), [-1, n_digits])
    
    log_prob_len = tf.log(prob[n_digits])
    log_prob_len = tf.split(1,n_digits+1,log_prob_len)
    
    total_max_pred = []
    total_max_pred.append(log_prob_len[0])

    for i in range(n_digits):
        total_max_pred.append(tf.accumulate_n([log_prob_len[i+1], tf.reshape(cum_max_pred[i], [-1,1])]))
    
    total_max_pred = tf.reshape(tf.concat(0, total_max_pred), [-1, len(total_max_pred)])
    total_len = tf.cast(tf.argmax(total_max_pred,1), tf.int32)
   
    batch_size = total_len.get_shape().as_list()[0]
 
    lengths_transposed = tf.expand_dims(total_len, 1)
    lengths_tiled = tf.tile(lengths_transposed, [1, n_digits])
    
    range_all = tf.range(0, n_digits, 1)
    range_row = tf.expand_dims(range_all, 0)
    range_tiled = tf.tile(range_row, [batch_size, 1])

    mask = tf.less(range_tiled, lengths_tiled)
    all_neg_ones = tf.cast(tf.fill(tf.shape(mask), -1), tf.int64)
	
    result = tf.select(mask, max_pred_digits, all_neg_ones)
    
    return result
Example #17
 def testSimple(self):
   with self.test_session():
     random_arrays = [np.random.rand(16, 16, 16, 16).astype(np.float32)
                      for _ in range(20)]
     random_tensors = [tf.convert_to_tensor(x, dtype=tf.float32)
                       for x in random_arrays]
     tf_val = tf.accumulate_n(random_tensors)
     np_val = random_arrays[0]
     for random_array in random_arrays[1:]:
       np_val += random_array
     self.assertAllClose(np_val, tf_val.eval())
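
An aside on the op under test: tf.add_n computes the same element-wise sum. tf.accumulate_n differed only in that it could consume inputs as they became ready rather than holding all of them in memory at once, and later TensorFlow releases deprecated it in favor of tf.add_n.

a = tf.constant([1.0, 2.0])
b = tf.constant([3.0, 4.0])
tf.add_n([a, b])  # same result as tf.accumulate_n([a, b])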
Example #18
 def call(self, inputs):
     """Predict op."""
     # A list of tree outputs. Each element corresponds to one tree.
     tree_logits = []
     for tree_index in range(self.trees_num):
         tree_logits.append(
             nt_compute_output_op(inputs, self.node_weights[tree_index],
                                  self.leaf_weights[tree_index],
                                  self.output_logits_dim, self.depth,
                                  self.smooth_step_param,
                                  self.parallelize_over_samples))
     if self.trees_num == 1:
         return tree_logits[0]
     elif self.sum_outputs:
         return tf.accumulate_n(tree_logits)
     else:
         return tf.concat(tree_logits, axis=1)
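
A toy version of the sum/concat choice above (assumed shapes: three trees, each emitting [batch, output_logits_dim] logits):

tree_logits = [tf.random_normal([8, 2]) for _ in range(3)]
summed = tf.accumulate_n(tree_logits)     # [8, 2], additive ensemble
stacked = tf.concat(tree_logits, axis=1)  # [8, 6], one block per tree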
Example #19
def ff_bp(data, w, grads, ff_deps, bp_deps):
  new_ff_deps = []
  new_bp_deps = []
  # ff
  fwd = []
  last = data
  for i in xrange(FLAGS.num_layers):
    with tf.name_scope('fc_ff%d' % i):
      fwd.append(last)
      tmp = []
      new_ff_deps.append([])
      for j in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % j), tf.control_dependencies([ff_deps[i][j]]):
          # matmult
          y = tf.matmul(last[j], w[i][j])
          # split
          y_split = tf.split(split_dim=1, num_split=FLAGS.num_gpus, value=y)
          tmp.append(y_split)
          new_ff_deps[i].append(y)
      # reduce
      red = []
      for j in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % j):
          red.append(tf.accumulate_n([s[j] for s in tmp]))
      last = red
  # bp
  for i in reversed(xrange(FLAGS.num_layers)):
    with tf.name_scope('fc_bp%d' % i):
      # convert col -> rep
      tmp = []
      for j in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % j):
          tmp.append(tf.concat(concat_dim=1, values=last))
      last = []
      for j in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % j):
          with tf.name_scope('bp'):
            # matmult: bp
            dy = tf.matmul(tmp[j], w[i][j], transpose_b=True)
            last.append(dy)
            # matmult: grad
            dw = tf.matmul(fwd[i][j], tmp[j], transpose_a=True)
          # update
          grads[i][j] += dw
  return new_ff_deps, new_bp_deps
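
The reduce step in isolation (a sketch with made-up shapes): every GPU contributes a partial product for column block j, and device j sums those partials, so tf.accumulate_n implements the reduction half of the all-to-all exchange.

partials = [tf.random_normal([4, 4]) for _ in range(2)]  # one per GPU
reduced = tf.accumulate_n(partials)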
Example #20
	def build_J_xent_w_scaled_reg(self, lambda_val=1.0, y=None):
		""" build_J_L2norm_w_scaled_reg
		
		with 
		J_loss = \frac{1}{m} \sum_{i=0}^{m-1} (\widehat{\mathbf{y}}^{(i)}-\mathbf{y}^{(i)})^2
		(see build_J_L2norm)
		this adds the following term:  
		J_reg = \frac{1}{m} \sum_{l=0}^{L-1} \| \Theta^{(l)} \|_2^2 
				= \frac{1}{m} \sum_{l=0}^{L-1} \sum_{I \in \mathcal{I}} (\Theta_I^{(l)})^2 
		and computes 
		J = J_loss + J_reg
				
		@type lambda_val  : (single) float number
		@param lambda_val : regularization parameter
		
		"""
		"""		
		if y is not None:
			self.y = y
		else:
			y = self.y
		"""
		# m=number of samples, i.e. total "batch" size
		X=self.X

		try:
			m = tf.cast( self.X.get_shape()[0], tf.float32) # ValueError
		except ValueError as valerr:
			print("ValueError in obtaining batch size: ", valerr)
			m = self.X.get_shape()[0]

		
		loss = self.build_J_xent(y)
		
		Thetas_only = self._CNN_model.__get_state__()['Thetas']
		
		ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3
		
		reg_term = tf.accumulate_n( ThetaL2norms )
		
		J = loss + lambda_val*(1.0/m)*reg_term  

		self.J_Theta = J

		return J 		
Example #21
    def build_J_xent_w_scaled_reg(self, lambda_val=1.0, y=None):
        """ build_J_L2norm_w_scaled_reg
		
		with 
		J_loss = \frac{1}{m} \sum_{i=0}^{m-1} (\widehat{\mathbf{y}}^{(i)}-\mathbf{y}^{(i)})^2
		(see build_J_L2norm)
		this adds the following term:  
		J_reg = \frac{1}{m} \sum_{l=0}^{L-1} \| \Theta^{(l)} \|_2^2 
				= \frac{1}{m} \sum_{l=0}^{L-1} \sum_{I \in \mathcal{I}} (\Theta_I^{(l)})^2 
		and computes 
		J = J_loss + J_reg
				
		@type lambda_val  : (single) float number
		@param lambda_val : regularization parameter
		
		"""
        """		
		if y is not None:
			self.y = y
		else:
			y = self.y
		"""
        # m=number of samples, i.e. total "batch" size
        X = self.X

        try:
            m = tf.cast(self.X.get_shape()[0], tf.float32)  # ValueError
        except ValueError as valerr:
            print("ValueError in obtaining batch size: ", valerr)
            m = self.X.get_shape()[0]

        loss = self.build_J_xent(y)

        Thetas_only = self._CNN_model.__get_state__()['Thetas']

        ThetaL2norms = list(map(tf.nn.l2_loss, Thetas_only))  # list() for Python 3

        reg_term = tf.accumulate_n(ThetaL2norms)

        J = loss + lambda_val * (1.0 / m) * reg_term

        self.J_Theta = J

        return J
Example #22
def SAMME_R_voting_strategy(logits):
    """
    Algorithm 4 of "Multi-class AdaBoost" by Zhu et al. 2006

    PDF: Can be found at the bottom of page 9
    (https://web.stanford.edu/~hastie/Papers/samme.pdf)

    Args:
      See `voting strategy`
    """
    class_num = logits[0].get_shape().as_list()[-1]
    for x in logits:
        assert x.shape == logits[0].shape

    log_probs = [tf.log(tf.nn.softmax(l)) for l in logits]
    # two steps to get a matrix of -1 except for the diagonal which is 1
    hk_inner_prod = tf.constant(
        (-1 / class_num), dtype=tf.float32, shape=(class_num, class_num))
    hk_inner_prod = tf.matrix_set_diag(hk_inner_prod, tf.ones([class_num]))
    h_ks = [(class_num - 1) * tf.matmul(lp, hk_inner_prod) for lp in log_probs]

    return tf.accumulate_n(h_ks)
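
A usage sketch (assumptions: three weak learners, 4 classes, TF 1.x):

logits = [tf.random_normal([8, 4]) for _ in range(3)]
votes = SAMME_R_voting_strategy(logits)  # [8, 4] accumulated votes
pred = tf.argmax(votes, axis=1)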
Example #23
 def testZeroArgs(self):
   with self.test_session():
     with self.assertRaises(ValueError):
       tf_val = tf.accumulate_n([])
       tf_val.eval()
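
Beyond the empty-list case tested above, tf.accumulate_n also raises ValueError when the inputs do not all share the same shape and dtype; for example:

try:
    tf.accumulate_n([])
except ValueError as e:
    print("rejected:", e)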
Example #24
# (Assumed setup, not shown in the original snippet: an import and two
# 2x2 variables so the snippet runs on its own.)
import tensorflow as tf

one_matrix = tf.Variable(tf.ones([2, 2]), name="one_matrix")
a_matrix = tf.Variable([[1., 2.], [3., 4.]], name="a_matrix")
add_op_1 = a_matrix + one_matrix

add_op_2 = add_op_1 + one_matrix

# Sum of all elements of resultant matrix
matrix_sum_1 = tf.reduce_sum(add_op_1, name="matrix_sum_1")
matrix_sum_2 = tf.reduce_sum(add_op_2, name="matrix_sum_2")

# Product of all elements
prod_1 = tf.reduce_prod(add_op_1)

# reduce_min, reduce_max, reduce_mean
# reduce_all - item wise AND operator applied
# reduce_any - item wise OR operator applied

# Element-wise sum of matrices of the same shape
# (the shape parameter is inferred)
element_sum = tf.accumulate_n([add_op_1, add_op_2])

# Initialize variables
init = tf.variables_initializer([one_matrix, a_matrix])

# Session
session = tf.Session()
session.run(init)

try:
    assert_op = tf.assert_variables_initialized([one_matrix, a_matrix])
    result = session.run([element_sum, prod_1, matrix_sum_2, matrix_sum_1])
except tf.errors.FailedPreconditionError:
    print('Initialize variables before using them, exiting session')

session.close()
Example #25
    def define_graph(self):
        """
        Set up the model graph
        """
        with tf.name_scope('data'):
            self.ref_image = tf.placeholder(tf.float32, shape=[None,128,128,3], name='ref_image')
            self.multi_plane = tf.placeholder(tf.float32, shape=[None,128,128,3*c.NUM_PLANE])
            self.gt = tf.placeholder(tf.float32,shape=[None,128,128,3], name='gt')

        self.summaries=[]
        
        with tf.name_scope('prediction'):
            def prediction(ref_image,multi_plane):
                net_in = tf.concat([ref_image,multi_plane],axis=-1)

                conv1_1 = conv_block(net_in,64,3,1)
                conv1_2 = conv_block(conv1_1,128,3,2)

                conv2_1 = conv_block(conv1_2,128,3,1)
                conv2_2 = conv_block(conv2_1,256,3,2)

                conv3_1 = conv_block(conv2_2,256,3,1)
                conv3_2 = conv_block(conv3_1,256,3,1)
                conv3_3 = conv_block(conv3_2,512,3,2)

                # weight3_1 = tf.Variable(tf.random_normal([3, 3, 512]))
                # weight3_2 = tf.Variable(tf.random_normal([3, 3, 512]))
                # weight3_3 = tf.Variable(tf.random_normal([3, 3, 512]))

                # conv4_1 = tf.nn.dilation2d(conv3_3,weight3_1,[1,1,1,1],[1,2,2,1],'SAME')
                # conv4_2 = tf.nn.dilation2d(conv4_1,weight3_2,[1,1,1,1],[1,2,2,1],'SAME')
                # conv4_3 = tf.nn.dilation2d(conv4_2,weight3_3,[1,1,1,1],[1,2,2,1],'SAME')

                conv4_1 = tf.layers.conv2d(conv3_3,512,(3,3),(1,1),'SAME',dilation_rate=(2,2))
                conv4_2 = tf.layers.conv2d(conv4_1,512,(3,3),(1,1),'SAME',dilation_rate=(2,2))
                conv4_3 = tf.layers.conv2d(conv4_2,512,(3,3),(1,1),'SAME',dilation_rate=(2,2))

                conv5_1 = deconv_block(tf.concat([conv4_3,conv3_3],axis=-1),256,4,2)
                conv5_2 = conv_block(conv5_1,256,3,1)
                conv5_3 = conv_block(conv5_2,256,3,1)

                conv6_1 = deconv_block(tf.concat([conv5_3,conv2_2],axis=-1),128,4,2)
                conv6_2 = conv_block(conv6_1,128,3,1)
                
                conv7_1 = deconv_block(tf.concat([conv6_2,conv1_2],axis=-1),64,4,2)
                conv7_2 = conv_block(conv7_1,64,3,1)
                conv7_3 = tf.layers.conv2d(conv7_2,62,(1,1),(1,1),'SAME')
                conv7_3 = tf.nn.tanh(conv7_3)

                blending_weights, alpha_images = tf.split(conv7_3,[c.NUM_PLANE,c.NUM_PLANE],axis=-1)
                blending_weights = tensor_norm(blending_weights)
                #alpha_images = tensor_norm(alpha_images)
                alpha_images = tf.nn.softmax(alpha_images,axis=-1)
               
                feature_maps = {
                    'conv1_1':conv1_1,
                    'conv1_2':conv1_2,
                    'conv2_1':conv2_1,
                    'conv2_2':conv2_2,
                    'conv3_1':conv3_1,
                    'conv3_2':conv3_2,
                    'conv3_3':conv3_3,
                    'conv4_1':conv4_1,
                    'conv4_2':conv4_2,
                    'conv4_3':conv4_3,
                    'conv5_1':conv5_1,
                    'conv6_1':conv6_1,
                    'conv6_2':conv6_2,
                    'conv7_1':conv7_1,
                    'conv7_2':conv7_2,
                    'conv7_3':conv7_3
                }

                return blending_weights, alpha_images, feature_maps
            
            
            self.blending_weights, self.alpha_images, self.feature_maps = prediction(self.ref_image,self.multi_plane)
            self.color_images = []
            for i in range(c.NUM_PLANE):
                tmp_weights = tf.expand_dims(self.blending_weights[:,:,:,i],axis=-1)
                #tmp_weights = self.blending_weights[:,:,:,i]
                self.color_images.append(
                    tf.multiply(tmp_weights,self.ref_image) + 
                    tf.multiply(1-tmp_weights,self.multi_plane[:,:,:,3*i:3*(i+1)]))
            
            self.preds = []
            for i in range(c.NUM_PLANE):
                tmp_alpha = tf.expand_dims(self.alpha_images[:,:,:,i],axis=-1)
                self.preds.append(tf.multiply(tmp_alpha, self.color_images[i]))
            self.preds = tf.accumulate_n(self.preds)
            #self.preds = inception_model(self.preds,6)

        with tf.name_scope('train'):
            self.loss = VGG_loss(self.preds,self.gt)
            self.global_step = tf.Variable(0, trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=c.LRATE, name='optimizer')
            self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step, name='train_op')
            loss_summary = tf.summary.scalar('train_loss', self.loss)
            self.summaries.append(loss_summary)

        with tf.name_scope('error'):
            self.psnr = psnr(self.preds,self.gt)
            self.sharpdiff = sharp_diff(self.preds,self.gt)
            self.ssim = ssim(self.preds, self.gt)
            summary_psnr = tf.summary.scalar('train_PSNR',self.psnr)
            summary_sharpdiff = tf.summary.scalar('train_SharpDiff',self.sharpdiff)
            summary_ssim = tf.summary.scalar('train_ssim',self.ssim)
            self.summaries += [summary_psnr, summary_sharpdiff, summary_ssim]
        self.summaries = tf.summary.merge(self.summaries)
Example #26
  def model_fn(self, features, labels, mode, params):
    """Build the model based on features, labels, and mode.

    Args:
      features: The features dictionary containing the data Tensor
        and the number of examples.
      labels: The labels Tensor resulting from calling the model.
      mode: A string indicating the training mode.
      params: A dictionary of hyperparameters.

    Returns:
      A tf.estimator.EstimatorSpec.
    """
    del params
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    if is_training:
      features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC
    total_loss, outputs = self._build_network(features, labels, mode)

    devices = cluster_utils.get_pipeline_devices(FLAGS.pipeline_device_num)
    slice_num = len(devices)
    micro_batch_num = FLAGS.micro_batch_num
    losses = []
    all_outputs = []
    losses.append(total_loss)
    all_outputs.append(outputs)
    layer_grads = [[[] for i in xrange(slice_num)] for j in xrange(micro_batch_num)]
    layer_vars = [[] for i in xrange(slice_num)]
    remained_vars = tf.trainable_variables()
    ys = losses[0]
    prev_grads=None
    # layers-1 ~ 1 compute grads
    for i in xrange(slice_num - 1, 0, -1):
      vars_i = [v for v in remained_vars if v.device==devices[i]]
      remained_vars = [v for v in remained_vars if v not in vars_i]
      prev_y = all_outputs[0][i-1]
      prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
      num_tensors = len(prev_y)
      y_grads = tf.gradients(ys=ys, xs=prev_y+vars_i, grad_ys=prev_grads, colocate_gradients_with_ops=True)
      ys = prev_y
      prev_grads = y_grads[0:num_tensors]
      grads_i = y_grads[num_tensors:]
      layer_grads[0][i] = [g for g in grads_i if g is not None]
      layer_vars[i] = [v for (g, v) in zip(grads_i, vars_i) if g is not None]
    # layer 0 compute grads
    grads_0 = tf.gradients(ys=ys, xs=remained_vars, grad_ys=prev_grads, colocate_gradients_with_ops=True)
    layer_grads[0][0] = [g for g in grads_0 if g is not None]
    layer_vars[0] = [v for (g, v) in zip(grads_0, remained_vars) if g is not None]

    # other micro_batch_num
    for j in xrange(1, micro_batch_num):
      dep_outputs = []
      for i in xrange(slice_num):
        dep_outputs.append(all_outputs[j-1][i] if i+j < 2*slice_num-1 else layer_grads[i+j-2*slice_num+1][i])
      loss, outputs = self._build_network(features, labels, mode, dep_outputs=dep_outputs)
      losses.append(loss)
      all_outputs.append(outputs)
      ys = losses[j]
      prev_grads=None
      for i in xrange(slice_num - 1, 0, -1):
        prev_y = all_outputs[j][i-1]
        prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
        num_tensors = len(prev_y)
        y_grads = tf.gradients(ys=ys, xs=prev_y+layer_vars[i], grad_ys=prev_grads, colocate_gradients_with_ops=True)
        ys = prev_y
        prev_grads = y_grads[0:num_tensors]
        grads_i = y_grads[num_tensors:]
        layer_grads[j][i] = [g for g in grads_i if g is not None]
      grads_0 = tf.gradients(ys=ys, xs=layer_vars[0], grad_ys=prev_grads, colocate_gradients_with_ops=True)
      layer_grads[j][0] = [g for g in grads_0 if g is not None]

    grads_set = []
    vars_set = []
    for i in xrange(slice_num):
      for j in xrange(len(layer_grads[0][i])):
        grad_i_set = [layer_grads[m][i][j] for m in range(micro_batch_num)]
        #print (grad_i_set)
        if micro_batch_num == 1:
          with tf.device(grad_i_set[0].device):
            acc_grads = grad_i_set[0]
        else:
          with tf.control_dependencies(grad_i_set), tf.device(grad_i_set[0].device): # replica
            if isinstance(grad_i_set[0], tf.IndexedSlices):
              acc_grads = tf.add_n(grad_i_set)
            else:
              acc_grads = tf.accumulate_n(grad_i_set)
        grads_set.append(acc_grads)
        vars_set.append(layer_vars[i][j])
    grads_and_vars = zip(grads_set, vars_set)
#######################

    train_op = None

    if is_training:
      global_step = tf.train.get_or_create_global_step()
      gs_t = tf.reshape(tf.cast(global_step, tf.int32), [1])

      # Setup learning rate schedule
      learning_rate = self._build_learning_rate_schedule(global_step)

      # Setup optimizer.
      optimizer = self._build_optimizer(learning_rate)

      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(None): # original is update_ops
        train_op = self._build_train_op(optimizer, grads_and_vars,
                                        global_step=global_step)

      if self.hparams.moving_average_decay > 0:
        ema = tf.train.ExponentialMovingAverage(
            decay=self.hparams.moving_average_decay, num_updates=global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        with tf.control_dependencies([train_op]):
          with tf.name_scope('moving_average'):
            train_op = ema.apply(variables_to_average)

      lr_t = tf.reshape(learning_rate, [1])
      host_call = None
      if self.hparams.enable_hostcall:
        def host_call_fn(gs, lr):
          # Outfeed supports int32 but global_step is expected to be int64.
          gs = tf.cast(tf.reduce_mean(gs), tf.int64)
          with tf.contrib.summary.create_file_writer(
              self.model_dir).as_default():
            with tf.contrib.summary.always_record_summaries():
              tf.contrib.summary.scalar('learning_rate', tf.reduce_mean(lr),
                                        step=gs)
              return tf.contrib.summary.all_summary_ops()
        host_call = (host_call_fn, [gs_t, lr_t])

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)
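
The accumulation step above in isolation (a sketch, assuming three micro-batches of dense gradients for one variable): the control dependency forces every micro-batch gradient to be computed before the sum runs.

grad_i_set = [tf.random_normal([3, 3]) for _ in range(3)]
with tf.control_dependencies(grad_i_set), tf.device(grad_i_set[0].device):
    acc_grads = tf.accumulate_n(grad_i_set)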
Example #27
def naive_voting_strategy(logits):
    for x in logits:
        assert x.shape == logits[0].shape
    return tf.accumulate_n(logits) / float(len(logits))
Example #28
def feature_select(*args):

    xs = args[1:]
    #Define the forward operation

    #First, reconstruct the actual arguments
    feature_selector_mat = args[0]
    #recall, feature_selector_mat is num_features x num_prev_layers

    transpose_feature_selector_mat = tf.transpose(feature_selector_mat)
    #this is num_prev_layers * num_features

    #Get a bool vector of layers with nonzero feature selection vectors
    #condition = tf.logical_not(tf.equal(transpose_feature_selector_mat, 0.0))
    #transpose_feature_selector_nonzero_mat = tf.reduce_any(condition, axis=-1)

    feature_selectors = []
    #feature_selectors_nonzero = []
    for i in range(len(xs)):
        feature_selectors.append(transpose_feature_selector_mat[i])
        #feature_selectors_nonzero.append(transpose_feature_selector_nonzero_mat[i])

    summands = []
    for i in range(len(xs)):
        x = xs[i]
        feature_selector = feature_selectors[i]
        summand = tf.reshape(feature_selector, [1, 1, 1, -1]) * x
        summands.append(summand)

    #Define the gradients
    def grad(dy):
        #Note: dy is intuitively the direction that we __want__ the summed output
        #feature maps to go in.
        x_grads = []
        feature_selector_grads = []

        dy_flat = tf.reshape(dy, [-1, tf.shape(dy)[-1]])

        #Compute gradients with respect to the previous feature maps
        for i in range(len(xs)):
            #For the gradient here, gate the dy by
            #the weights that this x actually affects
            #and collapse across features with a sum
            feature_selector = feature_selectors[i]
            x = xs[i]

            #is_nonzero = feature_selectors_nonzero[i]

            def nonzero_branch():

                expanded_feature_selector = tf.reshape(feature_selector,
                                                       [-1, 1])

                x_grad = tf.matmul(dy_flat,
                                   expanded_feature_selector,
                                   b_is_sparse=False)

                #The above should be the same thing as:
                #x_grad = tf.einsum('ijkl,l->ijk', dy, feature_selector)

                x_grad = tf.reshape(x_grad, tf.shape(x))
                return x_grad

            def zero_branch():
                return tf.zeros_like(x, dtype=tf.float32)

            x_grad = nonzero_branch()

            #There's another case -- if the feature is not used, then the gradient is zero!
            #x_grad = tf.cond(is_nonzero, nonzero_branch, zero_branch)

            x_grads.append(x_grad)

        #Compute gradients with respect to the weights on feature maps
        for i in range(len(feature_selectors)):
            #Find how much x goes in the direction of dy
            x = xs[i]

            #is_nonzero = feature_selectors_nonzero[i]

            def nonzero_branch_two():
                x_flat = tf.reshape(x, [1, -1])
                x_ys_similarities = tf.matmul(x_flat, dy_flat)

                x_ys_similarities = tf.reshape(x_ys_similarities,
                                               tf.shape(feature_selectors[i]))

                #The above should be the same as...
                #x_ys_similarities = tf.einsum('ijkl,ijk->l', dy, x)
                return x_ys_similarities

            def zero_branch_two():
                return tf.zeros_like(feature_selectors[i], dtype=tf.float32)

            #There's another case: If the feature is not used, it's zero
            #x_ys_similarities = tf.cond(is_nonzero, nonzero_branch_two, zero_branch_two)
            x_ys_similarities = nonzero_branch_two()
            feature_selector_grads.append(x_ys_similarities)
        #Okay, great. Now return grads in the same order as the arguments.

        #recall, feature_selector_mat was originally num_features x num_prev_layers
        #we need to take the feature_selector_grads and concat along second axis
        f_grad = tf.stack(feature_selector_grads, axis=-1)

        result_grads = []
        result_grads.append(f_grad)
        for i in range(len(feature_selectors)):
            result_grads.append(x_grads[i])
        return result_grads

    return tf.accumulate_n(summands), grad
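
Returning an (output, grad) pair is the contract of tf.custom_gradient, so presumably (an assumption; the decoration is not shown in the snippet) the function is wrapped along these lines:

feature_select_op = tf.custom_gradient(feature_select)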
Example #29
    def tower_fn(self, inputs):
        """
        This method doesn't have side-effects. `inputs`, `targets`, and
        `outputs` are batch-major but internal calculations use time-major
        tensors.
        """
        # batch-major to time-major
        inputs = nest.map_structure(transpose_batch_time, inputs)

        with tf.variable_scope(self.generator_scope):
            gen_outputs = self.generator_fn(inputs)

        if self.discriminator_fn:
            with tf.variable_scope(self.discriminator_scope) as discrim_scope:
                discrim_outputs = self.discriminator_fn(inputs, gen_outputs)
            # post-update discriminator tensors (i.e. after the discriminator weights have been updated)
            with tf.variable_scope(discrim_scope, reuse=True):
                discrim_outputs_post = self.discriminator_fn(inputs, gen_outputs)
        else:
            discrim_outputs = {}
            discrim_outputs_post = {}

        outputs = [gen_outputs, discrim_outputs]
        total_num_outputs = sum([len(output) for output in outputs])
        outputs = OrderedDict(itertools.chain(*[output.items() for output in outputs]))
        assert len(outputs) == total_num_outputs  # ensure no output is lost because of repeated keys

        if isinstance(self.learning_rate, tf.Tensor):
            outputs['learning_rate'] = self.learning_rate
        if isinstance(self.kl_weight, tf.Tensor):
            outputs['kl_weight'] = self.kl_weight

        if self.mode == 'train':
            with tf.name_scope("discriminator_loss"):
                d_losses = self.discriminator_loss_fn(inputs, outputs)
                print_loss_info(d_losses, inputs, outputs)
            with tf.name_scope("generator_loss"):
                g_losses = self.generator_loss_fn(inputs, outputs)
                print_loss_info(g_losses, inputs, outputs)
                if discrim_outputs_post:
                    outputs_post = OrderedDict(itertools.chain(gen_outputs.items(), discrim_outputs_post.items()))
                    # generator losses after the discriminator weights have been updated
                    g_losses_post = self.generator_loss_fn(inputs, outputs_post)
                else:
                    g_losses_post = g_losses
        else:
            d_losses = {}
            g_losses = {}
            g_losses_post = {}
        with tf.name_scope("metrics"):
            metrics = self.metrics_fn(inputs, outputs)
        with tf.name_scope("eval_outputs_and_metrics"):
            eval_outputs, eval_metrics = self.eval_outputs_and_metrics_fn(inputs, outputs)

        # time-major to batch-major
        outputs_tuple = (outputs, eval_outputs)
        outputs_tuple = nest.map_structure(transpose_batch_time, outputs_tuple)
        losses_tuple = (d_losses, g_losses, g_losses_post)
        losses_tuple = nest.map_structure(tf.convert_to_tensor, losses_tuple)
        loss_tuple = tuple(tf.accumulate_n([loss * weight for loss, weight in losses.values()])
                           if losses else tf.zeros(()) for losses in losses_tuple)
        metrics_tuple = (metrics, eval_metrics)
        metrics_tuple = nest.map_structure(transpose_batch_time, metrics_tuple)
        return outputs_tuple, losses_tuple, loss_tuple, metrics_tuple
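
The loss reduction above in isolation (a sketch with made-up values): each losses dict maps a name to a (loss, weight) pair, and the weighted terms are summed into one scalar.

losses = {'gan': (tf.constant(0.5), 1.0), 'l1': (tf.constant(0.2), 10.0)}
total = tf.accumulate_n([loss * weight for loss, weight in losses.values()])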
Example #30
import tensorflow as tf

"""tf.accumulate_n(inputs, shape=None, tensor_dtype=None, name=None)
功能:对应位置元素相加。如果输入是训练变量,不要使用,应使用tf.add_n。
输入:shape,tensor_dtype:类型检查"""
a = tf.constant([[1, 2], [3, 4]])
b = tf.constant([[5, 6], [7, 8]])
z = tf.accumulate_n([a, b])

sess = tf.Session()
print(sess.run(z))
sess.close()

# z==>[[6 8]
#      [10 12]]
Example #31
def accumulate_n(sess):
    x = tf.constant([[0, 1, 2], [1, 0, 2]])
    y = tf.accumulate_n([x, x])
    print('x', sess.run(x), 'y', sess.run(y))
Example #32
 def get_magnitude(self):
     f = lambda x: x.get_magnitude()
     return tf.accumulate_n([f(v) for v in self.parameters.values()])
Example #33
    def __init__(self,
                 iterator,
                 session,
                 model,
                 num_classes,
                 optimizer,
                 dataset,
                 p_norm=2.,
                 alpha=None,
                 decomp_type='bior2.2',
                 NUMPY_images=None,
                 NUMPY_labels=None,
                 learning_rate=.001,
                 weight_decay_p=.0001,
                 lp_wavelet_p=.0001,
                 batch_size=32,
                 bn_momentum=.99,
                 robust_regularization=True,
                 use_wavelet_decomposition=True,
                 wavelet_weights=[0, 1],
                 sensitivity_mode='logits',
                 graph=tf.get_default_graph()):

        self.iterator = iterator
        self.session = session
        self.model = model
        self.num_classes = num_classes
        self.optimizer = optimizer
        self.dataset = dataset
        self.robust_regularization = robust_regularization
        self.wavelet_weights = wavelet_weights
        self.nested_wavelet_weights = utils.nested_weight_list(wavelet_weights)
        self.sensitivity_mode = sensitivity_mode
        self.graph = graph
        self.decomp_type = decomp_type

        self.decomp_depth = len(wavelet_weights) - 1
        self.learning_rate = learning_rate
        self.weight_decay_p = weight_decay_p
        self.lp_wavelet_p = lp_wavelet_p
        self.batch_size = batch_size
        self.bn_momentum = bn_momentum
        self.graph = tf.get_default_graph()
        self.p_norm = p_norm

        self.alpha = alpha
        self.NUMPY_images = NUMPY_images
        self.NUMPY_labels = NUMPY_labels

        if use_wavelet_decomposition:
            from fwt import multi_channel_fwt, create_filter_bank
            self.decomp_filters, self.reconst_filters = create_filter_bank(
                decomp_type)

        devices = device_lib.list_local_devices()
        GPU_devices = [dev.name for dev in devices if dev.device_type == 'GPU']
        self.num_GPUs = len(GPU_devices)

        tensors = []
        scalars = []
        gradients = []
        summaries = []
        with tf.variable_scope(tf.get_variable_scope()):
            with session.as_default():
                for dev in range(self.num_GPUs):
                    with tf.device('/device:GPU:%d' % dev):
                        with tf.name_scope('GPU_%d' % dev) as scope:
                            print("Compiling on GPU %d ..." % dev)

                            tensors.append(dict())
                            scalars.append(dict())

                            # scalars finished converting to dict:
                            # mean_NLL, sum_of_true_logits, mean_correlations

                            # Get the inputs from the iterators
                            next_element = iterator.get_next()
                            tensors[-1]['images'] = next_element[0]
                            tensors[-1]['targets'] = next_element[1]
                            tensors[-1]['one_hot_targets'] = tf.one_hot(
                                tensors[-1]['targets'], self.num_classes)

                            # Get the forward propagated output
                            # for the current batch of this GPU.
                            network_output = model(tensors[-1]['images'])
                            tensors[-1]['logits'] = network_output

                            # For neural networks that use batch
                            # normalization, network_output is actually
                            # a list of tensors, where logits[1:]
                            # represent the inputs to the BatchNorm
                            # layers. Here, we handle this situation
                            # if it arises.
                            if type(network_output) == list:
                                tensors[-1]['logits'] = network_output[0]
                                bn_inputs = network_output[1:]
                                utils.add_bn_ops(model,
                                                 bn_inputs,
                                                 bn_momentum=bn_momentum)

                            tensors[-1]['predictions'] = tf.argmax(
                                tensors[-1]['logits'], axis=1)
                            tensors[-1][
                                'predicted_one_hot_targets'] = tf.one_hot(
                                    tensors[-1]['predictions'],
                                    self.num_classes)
                            tensors[-1]['predicted_logits'] = tf.reduce_max(
                                tensors[-1]['logits'], axis=1)
                            tensors[-1]['probabilities'] = tf.nn.softmax(
                                tensors[-1]['logits'])

                            #### x-terms, b-terms ####################

                            tensors[-1]['x_terms'] = Rop(
                                tensors[-1]['logits'], tensors[-1]['images'],
                                tensors[-1]['images'])
                            tensors[-1]['b_terms'] = tensors[-1][
                                'logits'] - tensors[-1]['x_terms']
                            tensors[-1]['predicted_b_terms'] = utils.select(
                                tensors[-1]['b_terms'],
                                tensors[-1]['predictions'], self.num_classes)

                            if self.alpha is not None:
                                tensors[-1]['taus'] = tensors[-1][
                                    'logits'] - self.alpha * tensors[-1][
                                        'x_terms']

                            #NUMPY SECTION
                            if NUMPY_images is not None and NUMPY_labels is not None:
                                NUMPY_network_output = model(NUMPY_images)
                                tensors[-1][
                                    'NUMPY_logits'] = NUMPY_network_output
                                if type(NUMPY_network_output) == list:
                                    tensors[-1][
                                        'NUMPY_logits'] = NUMPY_network_output[
                                            0]
                                tensors[-1]['NUMPY_predictions'] = tf.argmax(
                                    tensors[-1]['NUMPY_logits'], axis=1)

                                tensors[-1]['NUMPY_x_terms'] = Rop(
                                    tensors[-1]['NUMPY_logits'], NUMPY_images,
                                    NUMPY_images)
                                tensors[-1]['NUMPY_b_terms'] = tensors[-1][
                                    'NUMPY_logits'] - tensors[-1][
                                        'NUMPY_x_terms']

                                tensors[-1][
                                    'NUMPY_selected_x_terms'] = utils.select(
                                        tensors[-1]['NUMPY_x_terms'],
                                        NUMPY_labels, self.num_classes)
                                tensors[-1][
                                    'NUMPY_selected_b_terms'] = utils.select(
                                        tensors[-1]['NUMPY_b_terms'],
                                        NUMPY_labels, self.num_classes)

                                if self.alpha is not None:
                                    NUMPY_taus = tensors[-1][
                                        'NUMPY_logits'] - self.alpha * tensors[
                                            -1]['NUMPY_x_terms']

                                tensors[-1][
                                    'NUMPY_selected_logits'] = utils.select(
                                        tensors[-1]['NUMPY_logits'],
                                        NUMPY_labels, self.num_classes)

                                tensors[-1][
                                    'NUMPY_logit_sensitivities'] = tf.gradients(
                                        tf.reduce_sum(
                                            tensors[-1]
                                            ['NUMPY_selected_logits']),
                                        NUMPY_images)[0]
                                tensors[-1][
                                    'NUMPY_bias_shifted_images'] = bias_shifted_input(
                                        NUMPY_images,
                                        tensors[-1]['NUMPY_selected_b_terms'],
                                        tensors[-1]
                                        ['NUMPY_logit_sensitivities'])

                            ##########################################

                            # Classification loss
                            tensors[-1][
                                'NLLs'] = tf.nn.softmax_cross_entropy_with_logits_v2(
                                    labels=tensors[-1]['one_hot_targets'],
                                    logits=tensors[-1]['logits'])
                            scalars[-1]['mean_NLL'] = tf.reduce_mean(
                                tensors[-1]['NLLs'])

                            # Setting up the sensitivity penalty.
                            if sensitivity_mode == 'logits':
                                scalars[-1][
                                    'sum_of_true_logits'] = tf.reduce_sum(
                                        tensors[-1]['logits'] *
                                        tensors[-1]['one_hot_targets'])
                                tensors[-1]['sensitivities'] = tf.gradients(
                                    scalars[-1]['sum_of_true_logits'],
                                    tensors[-1]['images'],
                                    name='input_gradients')[0]
                            elif sensitivity_mode == 'NLL':
                                tensors[-1]['sensitivities'] = tf.gradients(
                                    scalars[-1]['mean_NLL'],
                                    tensors[-1]['images'],
                                    name='input_gradients')[0]

                            if use_wavelet_decomposition:
                                sensitivity_w_decomp = multi_channel_fwt(
                                    tensors[-1]['sensitivities'],
                                    self.decomp_filters,
                                    self.decomp_depth,
                                    output_type='list')

                            tensors[-1]['inner_products'] = tf.reduce_sum(
                                tensors[-1]['images'] *
                                tensors[-1]['sensitivities'],
                                axis=[1, 2, 3])

                            tensors[-1]['sensitivity_norms'] = tf.sqrt(
                                tf.reduce_sum(tensors[-1]['sensitivities']**2,
                                              axis=[1, 2, 3],
                                              name='sens_norm'))
                            tensors[-1]['image_norms'] = tf.sqrt(
                                tf.reduce_sum(tensors[-1]['images']**2,
                                              axis=[1, 2, 3],
                                              name='im_norm'))

                            tensors[-1]['norm_products'] = tensors[-1][
                                'sensitivity_norms'] * tensors[-1][
                                    'image_norms']

                            epsilon = 0.0
                            tensors[-1]['correlations'] = tensors[-1][
                                'inner_products'] / (
                                    tensors[-1]['norm_products'] + epsilon)

                            scalars[-1]['mean_correlation'] = tf.reduce_mean(
                                tensors[-1]['correlations'])
                            scalars[-1]['mean_inner_product'] = tf.reduce_mean(
                                tensors[-1]['inner_products'])
                            scalars[-1]['mean_norm_product'] = tf.reduce_mean(
                                tensors[-1]['norm_products'])

                            tensors[-1]['true_logits'] = tf.reduce_sum(
                                tensors[-1]['logits'] *
                                tensors[-1]['one_hot_targets'],
                                axis=1)
                            scalars[-1]['sum_of_true_logits'] = tf.reduce_sum(
                                tensors[-1]['true_logits'])
                            tensors[-1]['logit_sensitivities'] = tf.gradients(
                                scalars[-1]['sum_of_true_logits'],
                                tensors[-1]['images'],
                                name='logit_input_gradients')[0]

                            tensors[-1][
                                'logit_inner_products'] = tf.reduce_sum(
                                    tensors[-1]['images'] *
                                    tensors[-1]['logit_sensitivities'],
                                    axis=[1, 2, 3])

                            tensors[-1]['logit_sensitivity_norms'] = tf.sqrt(
                                tf.reduce_sum(
                                    tensors[-1]['logit_sensitivities']**2,
                                    axis=[1, 2, 3],
                                    name='sens_norm'))

                            tensors[-1]['logit_norm_products'] = tensors[-1][
                                'logit_sensitivity_norms'] * tensors[-1][
                                    'image_norms']

                            tensors[-1]['logit_correlations'] = tensors[-1]['logit_inner_products'] / \
                                (tensors[-1]['logit_norm_products'] + epsilon)

                            scalars[-1][
                                'mean_logit_correlation'] = tf.reduce_mean(
                                    tensors[-1]['logit_correlations'])
                            scalars[-1][
                                'mean_logit_inner_product'] = tf.reduce_mean(
                                    tensors[-1]['logit_inner_products'])
                            scalars[-1][
                                'mean_logit_norm_product'] = tf.reduce_mean(
                                    tensors[-1]['logit_norm_products'])

                            # Again as a tiled image, for visualization.
                            # Only do this if the dimensions work out.
                            tiled_image_works = False
                            if use_wavelet_decomposition:
                                try:
                                    tensors[-1][
                                        'sensitivity_w_decomp_imgs'] = multi_channel_fwt(
                                            tensors[-1]['sensitivities'],
                                            self.decomp_filters,
                                            self.decomp_depth,
                                            output_type='image')
                                    tiled_image_works = True
                                except tf.errors.OpError:
                                    print(
                                        "Creating a tiled wavelet image failed."
                                    )

                            # sum up all the p-norms of the FWTs of
                            # all channels.
                            if use_wavelet_decomposition:
                                sensitivity_w_mean_lp = 0
                                for decomp in sensitivity_w_decomp:
                                    sensitivity_w_mean_lp += utils.lp_norm_weighted(
                                        decomp,
                                        self.nested_wavelet_weights,
                                        p_norm=self.p_norm)
                            else:
                                # Otherwise, just calculate the p-norm of the
                                # sensitivity.
                                sensitivity_w_mean_lp = utils.lp_norm(
                                    tensors[-1]['sensitivities'],
                                    p_norm=self.p_norm)

                            scalars[-1][
                                'sensitivity_w_mean_lp'] = sensitivity_w_mean_lp

                            ############ ONLY FOR LOGGING PURPOSES ###################
                            tensors[-1]['random_targets'] = tf.random_uniform(
                                tf.shape(tensors[-1]['targets']),
                                maxval=self.num_classes - 1,
                                dtype=tf.int32)
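                            # Note: maxval is exclusive here, so class
                            # num_classes - 1 is never drawn as a random target.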

                            tensors[-1]['random_one_hot_targets'] = tf.one_hot(
                                tensors[-1]['random_targets'],
                                self.num_classes)
                            tensors[-1]['random_logits'] = tf.reduce_sum(
                                tensors[-1]['logits'] *
                                tensors[-1]['random_one_hot_targets'],
                                axis=1)
                            scalars[-1][
                                'sum_of_random_logits'] = tf.reduce_sum(
                                    tensors[-1]['random_logits'])

                            tensors[-1][
                                'random_logit_sensitivities'] = tf.gradients(
                                    scalars[-1]['sum_of_random_logits'],
                                    tensors[-1]['images'],
                                    name='random_logit_sensitivities')[0]
                            tensors[-1][
                                'random_logit_inner_products'] = tf.reduce_sum(
                                    tensors[-1]['images'] *
                                    tensors[-1]['random_logit_sensitivities'],
                                    axis=[1, 2, 3])
                            tensors[-1][
                                'random_logit_sensitivity_norms'] = tf.sqrt(
                                    tf.reduce_sum(
                                        tensors[-1]
                                        ['random_logit_sensitivities']**2,
                                        axis=[1, 2, 3]))

                            scalars[-1][
                                'sum_of_predicted_logits'] = tf.reduce_sum(
                                    tensors[-1]['predicted_logits'])
                            tensors[-1][
                                'predicted_logit_sensitivities'] = tf.gradients(
                                    scalars[-1]['sum_of_predicted_logits'],
                                    tensors[-1]['images'],
                                    name='predicted_logit_sensitivities')[0]
                            tensors[-1][
                                'predicted_logit_inner_products'] = tf.reduce_sum(
                                    tensors[-1]['images'] * tensors[-1]
                                    ['predicted_logit_sensitivities'],
                                    axis=[1, 2, 3])
                            tensors[-1][
                                'predicted_logit_sensitivity_norms'] = tf.sqrt(
                                    tf.reduce_sum(
                                        tensors[-1]
                                        ['predicted_logit_sensitivities']**2,
                                        axis=[1, 2, 3]))

                            tensors[-1]['true_logit_sensitivities'] = tensors[
                                -1]['logit_sensitivities']
                            tensors[-1][
                                'true_logit_inner_products'] = tf.reduce_sum(
                                    tensors[-1]['images'] *
                                    tensors[-1]['true_logit_sensitivities'],
                                    axis=[1, 2, 3])
                            tensors[-1][
                                'true_logit_sensitivity_norms'] = tf.sqrt(
                                    tf.reduce_sum(
                                        tensors[-1]['true_logit_sensitivities']
                                        **2,
                                        axis=[1, 2, 3]))

                            # Calculate the bias gradients
                            flatten = lambda a: tf.reshape(a, (-1, ))
                            IP = lambda a, b: tf.reduce_sum(a * b)

                            biases = [
                                b for b in model.trainable_weights
                                if 'bias' in b.name
                            ]
                            biases += tf.get_collection('bn_betas')
                            biases += tf.get_collection('bn_means')

                            random_bias_gradients = tf.gradients(
                                scalars[-1]['sum_of_random_logits'],
                                biases,
                                name='random_bias_gradients')

                            random_bg = [
                                IP(flatten(b), flatten(g))
                                for (b,
                                     g) in zip(biases, random_bias_gradients)
                            ]
                            random_bias_inner_products = tf.accumulate_n(
                                random_bg)

                            predicted_bias_gradients = tf.gradients(
                                scalars[-1]['sum_of_predicted_logits'],
                                biases,
                                name='predicted_bias_gradients')
                            predicted_bg = [
                                IP(flatten(b), flatten(g)) for (
                                    b,
                                    g) in zip(biases, predicted_bias_gradients)
                            ]
                            predicted_bias_inner_products = tf.accumulate_n(
                                predicted_bg)

                            true_bias_gradients = tf.gradients(
                                scalars[-1]['sum_of_true_logits'],
                                biases,
                                name='true_bias_gradients')

                            true_bg = [
                                IP(flatten(b), flatten(g))
                                for (b, g) in zip(biases, true_bias_gradients)
                            ]
                            true_bias_inner_products = tf.add_n(true_bg)

                            zero_image = tf.zeros_like(tensors[-1]['images'])
                            tensors[-1]['zero_output'] = model(zero_image)[0]

                            tensors[-1]['random_zero_logits'] = tf.reduce_sum(
                                tensors[-1]['zero_output'] *
                                tensors[-1]['random_one_hot_targets'],
                                axis=1)
                            tensors[-1][
                                'predicted_zero_logits'] = tf.reduce_sum(
                                    tensors[-1]['zero_output'] *
                                    tensors[-1]['predicted_one_hot_targets'],
                                    axis=1)
                            tensors[-1]['true_zero_logits'] = tf.reduce_sum(
                                tensors[-1]['zero_output'] *
                                tensors[-1]['one_hot_targets'],
                                axis=1)

                            # Calculate the approximate random robustness
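                            # Quantity assembled below:
                            #   numerator   = <x, s_pred - s_rand> - (b_pred - b_rand)
                            #   denominator = || s_pred - s_rand ||_2
                            # where s_* are the logit input-gradients and b_* the
                            # bias inner products computed above. The denominator
                            # is a single norm over the whole batch tensor.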

                            tensors[-1]['inner_product_differences'] = (
                                tensors[-1]['predicted_logit_inner_products'] -
                                tensors[-1]['random_logit_inner_products'])

                            tensors[-1][
                                'bias_differences'] = predicted_bias_inner_products - random_bias_inner_products

                            numerator = tensors[-1][
                                'inner_product_differences'] - tensors[-1][
                                    'bias_differences']

                            tensors[-1]['logit_sensitivity_differences'] = (
                                tensors[-1]['predicted_logit_sensitivities'] -
                                tensors[-1]['random_logit_sensitivities'])
                            denominator = tf.sqrt(
                                tf.reduce_sum(
                                    tensors[-1]
                                    ['logit_sensitivity_differences']**2))

                            tensors[-1][
                                'approximate_random_robustness'] = numerator / denominator
                            tensors[-1][
                                'inner_product_differences_normalized'] = (
                                    tensors[-1]['inner_product_differences'] /
                                    denominator)
                            tensors[-1][
                                'bias_differences_normalized'] = tensors[-1][
                                    'bias_differences'] / denominator

                            tensors[-1][
                                'bias_difference_shifted_images'] = bias_shifted_input(
                                    tensors[-1]['images'],
                                    tensors[-1]['bias_differences'],
                                    tensors[-1]
                                    ['logit_sensitivity_differences'])

                            #######################################################

                            # Collect the network's weights and set up
                            # the weight decay penalty
                            trainable_weights = model.trainable_weights
                            scalars[-1]['weight_norm'] = tf.add_n([
                                tf.reduce_sum(w**2) for w in trainable_weights
                            ])

                            # Assemble the total loss for this GPU
                            scalars[-1]['total_loss'] = scalars[-1]['mean_NLL']
                            scalars[-1][
                                'total_loss'] += weight_decay_p * scalars[-1][
                                    'weight_norm']
                            if robust_regularization:
                                scalars[-1][
                                    'sensitivity_penalty'] = lp_wavelet_p * scalars[
                                        -1]['sensitivity_w_mean_lp']
                                scalars[-1]['total_loss'] += scalars[-1][
                                    'sensitivity_penalty']

                            # Everything that is tracked during training
                            # goes here. Top-5 and top-1 accuracies are
                            # automatically added.
                            summary_dict = {
                                'total_loss':
                                scalars[-1]['total_loss'],
                                'mean_NLL':
                                scalars[-1]['mean_NLL'],
                                'weight_2_norm_squared':
                                scalars[-1]['weight_norm'],
                                'mean_sensitivity_wavelet_coeffs_lp':
                                scalars[-1]['sensitivity_w_mean_lp']
                            }

                            # Add some hyperparameters, too. This causes some
                            # redundant calculation when averaging later, but
                            # the computational overhead is negligible.
                            summary_dict['learning_rate_'] = learning_rate
                            summary_dict['correlation_'] = scalars[-1][
                                'mean_correlation']
                            summary_dict['inner_product_'] = scalars[-1][
                                'mean_inner_product']
                            summary_dict['norm_product_'] = scalars[-1][
                                'mean_norm_product']
                            summary_dict['logit_correlation_'] = scalars[-1][
                                'mean_logit_correlation']
                            summary_dict['logit_inner_product_'] = scalars[-1][
                                'mean_logit_inner_product']
                            summary_dict['logit_norm_product_'] = scalars[-1][
                                'mean_logit_norm_product']
                            summary_dict[
                                'weight_decay_parameter_'] = weight_decay_p
                            summary_dict[
                                'lp_Wavelet_parameter_'] = lp_wavelet_p
                            summary_dict[
                                'total_batch_size'] = batch_size * self.num_GPUs
                            summary_dict['bn_momentum_'] = bn_momentum
                            summary_dict['p_norm'] = p_norm

                            if robust_regularization:
                                summary_dict['sensitivity_penalty'] = scalars[
                                    -1]['sensitivity_penalty']

                            summary_dict = summary_utils.prepare_summaries(
                                summary_dict=summary_dict,
                                predictions=tensors[-1]['probabilities'],
                                labels=tensors[-1]['targets'])
                            summaries.append(summary_dict)

                            # Collect the gradients for every GPU
                            gradients.append(
                                optimizer.compute_gradients(
                                    scalars[-1]['total_loss'],
                                    var_list=trainable_weights,
                                    colocate_gradients_with_ops=True))

                            # So far, the adversarial attack model is only
                            # created on one GPU; parallelized versions have
                            # consistently led to errors.
                            if dev == 0:
                                self.adversarial_model = TensorFlowModel(
                                    tensors[-1]['images'],
                                    tensors[-1]['logits'],
                                    bounds=self.dataset.bounds)

        print("Done.")

        # Replace the per-GPU lists 'tensors' and 'scalars' with aggregated
        # versions: concatenate the tensors and average the scalars.
        self.tensors = dict()
        self.scalars = dict()
        for key in tensors[0].keys():
            self.tensors[key] = tf.concat(
                [tensors_item[key] for tensors_item in tensors], axis=0)
        for key in scalars[0].keys():
            self.scalars[key] = tf.reduce_mean(
                [scalars_item[key] for scalars_item in scalars])

        # Create self.GPU_collections for backwards compatibility
        self.GPU_collections = {**self.tensors, **self.scalars}
        self.GPU_collections['top_1'] = tf.concat(tf.get_collection('top_1'),
                                                  0)
        self.GPU_collections['top_5'] = tf.concat(tf.get_collection('top_5'),
                                                  0)

        # Collect and apply the gradients over all used
        # GPUs for synchronous parallel training.
        avg_grads = utils.average_gradients(gradients)
        gradient_application = optimizer.apply_gradients(avg_grads)
        # We combine the gradient update and possibly the
        # batch normalization update operators into one.
        self.train_op = tf.group(gradient_application,
                                 *(tf.get_collection('bn_update_ops')))

        summary_dict = summary_utils.collect_summaries(summaries)
        self.summary_op = summary_utils.create_summary_op(summary_dict)

        if use_wavelet_decomposition:
            wavelet_summary = tf.summary.tensor_summary(
                'wavelet_weights', self.wavelet_weights)
            self.summary_op = tf.summary.merge(
                [self.summary_op, wavelet_summary])

        # Here, we create a tiled image summary for TensorBoard,
        # shifting the range of the sensitivity (and, if present,
        # its decomposition) to the range of the image.
        image_range = self.dataset.image_range()
        image_max = image_range[1]
        image_min = image_range[0]
        image_span = image_max - image_min
        image_mid = image_span / 2.

        self.images = self.dataset.interpret_as_image(
            self.GPU_collections['images'])
        self.saliencies = self.GPU_collections['sensitivities']
        saliencies_max = tf.reduce_max(tf.abs(self.saliencies), [1, 2],
                                       keepdims=True)
        normalized_saliencies = image_span * self.saliencies / \
            (2*saliencies_max + 1e-9) + image_mid

        if use_wavelet_decomposition:
            self.saliency_decomps = self.GPU_collections[
                'sensitivity_w_decomp_imgs']
            saliency_decomps_max = tf.reduce_max(tf.abs(self.saliency_decomps),
                                                 [1, 2],
                                                 keepdims=True)
            normalized_decomps = image_span * self.saliency_decomps / \
                (2*saliency_decomps_max + 1e-9) + image_mid

        composite_image = [self.images, normalized_saliencies]

        if tiled_image_works:
            composite_image.append(normalized_decomps)

        img_saliency_decomp = tf.concat(composite_image, 2)

        self.img_summary_op = tf.summary.image('img_saliency_decomp',
                                               img_saliency_decomp,
                                               max_outputs=10)
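
A minimal standalone sketch of the per-example saliency trick used throughout
this example: because the examples in a batch are independent, differentiating
the sum of the selected logits with respect to the input batch yields each
example's own gradient in a single tf.gradients call. All names and shapes
below are illustrative, not taken from the original model.

import tensorflow as tf

tf.reset_default_graph()
images = tf.placeholder(tf.float32, [None, 8, 8, 3])     # toy input batch
labels = tf.placeholder(tf.int32, [None])
logits = tf.layers.dense(tf.layers.flatten(images), 10)  # toy classifier

one_hot = tf.one_hot(labels, 10)
true_logits = tf.reduce_sum(logits * one_hot, axis=1)    # per-example logit
# Summing before differentiating: row i of the result depends only on image i.
sensitivities = tf.gradients(tf.reduce_sum(true_logits), images)[0]

# Cosine-style correlation between each image and its sensitivity, mirroring
# the inner_products / norm_products computation above.
inner = tf.reduce_sum(images * sensitivities, axis=[1, 2, 3])
norms = tf.sqrt(tf.reduce_sum(images**2, axis=[1, 2, 3])) * \
        tf.sqrt(tf.reduce_sum(sensitivities**2, axis=[1, 2, 3]))
correlation = inner / (norms + 1e-9)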
Example #34
0
    def _set_train_or_infer(self,
                            res,
                            reverse_target_vocab_table,
                            hparams,
                            scope=None):
        """Set up training and inference."""
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            # Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)
            else:
                raise ValueError("Unknown optimizer type %s" %
                                 hparams.optimizer)

            DAPPLE_TEST = hparams.dapple_test
            if DAPPLE_TEST:
                devices = cluster_utils.get_pipeline_devices(
                    hparams.pipeline_device_num)
                slice_num = len(devices)
                micro_batch_num = hparams.micro_batch_num
                losses = []
                all_outputs = []
                losses.append(self.train_loss)
                stage_outputs = res[-1]
                all_outputs.append(stage_outputs)
                layer_grads = [[[] for i in xrange(slice_num)]
                               for j in xrange(micro_batch_num)]
                layer_vars = [[] for i in xrange(slice_num)]
                remained_vars = tf.trainable_variables()
                ys = losses[0]
                prev_grads = None
                # Compute grads for stages slice_num-1 down to 1.
                for i in xrange(slice_num - 1, 0, -1):
                    vars_i = [
                        v for v in remained_vars if v.device == devices[i]
                    ]
                    remained_vars = [
                        v for v in remained_vars if v not in vars_i
                    ]
                    prev_y = all_outputs[0][i - 1]
                    prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
                    num_tensors = len(prev_y)
                    with tf.device(devices[i]):
                        y_grads = tf.gradients(
                            ys=ys,
                            xs=prev_y + vars_i,
                            grad_ys=prev_grads,
                            colocate_gradients_with_ops=True)
                    ys = prev_y
                    prev_grads = y_grads[0:num_tensors]
                    grads_i = y_grads[num_tensors:]
                    layer_grads[0][i] = [g for g in grads_i if g is not None]
                    layer_vars[i] = [
                        v for (g, v) in zip(grads_i, vars_i) if g is not None
                    ]
                # Compute grads for stage 0.
                #with tf.device(devices[0]):
                grads_0 = tf.gradients(ys=ys,
                                       xs=remained_vars,
                                       grad_ys=prev_grads,
                                       colocate_gradients_with_ops=True)
                layer_grads[0][0] = [g for g in grads_0 if g is not None]
                layer_vars[0] = [
                    v for (g, v) in zip(grads_0, remained_vars)
                    if g is not None
                ]

                # Remaining micro batches.
                for j in xrange(1, micro_batch_num):
                    dep_outputs = []
                    for i in xrange(slice_num):
                        dep_outputs.append(
                            all_outputs[j - 1][i] if i +
                            j < slice_num else layer_grads[i + j -
                                                           slice_num][i])
                        #dep_outputs.append(all_outputs[j-1][i] if i+j < 2*slice_num-1 else layer_grads[i+j-2*slice_num+1][i])
                    res = self.build_graph(hparams,
                                           scope,
                                           dep_outputs=dep_outputs)
                    losses.append(res[1])
                    all_outputs.append(
                        res[-1]
                    )  # push this micro batch's outputs for all stages
                    ys = losses[j]
                    prev_grads = None
                    for i in xrange(slice_num - 1, 0, -1):
                        prev_y = all_outputs[j][i - 1]
                        prev_y = prev_y if isinstance(prev_y,
                                                      list) else [prev_y]
                        num_tensors = len(prev_y)
                        y_grads = tf.gradients(
                            ys=ys,
                            xs=prev_y + layer_vars[i],
                            grad_ys=prev_grads,
                            colocate_gradients_with_ops=True)
                        ys = prev_y
                        prev_grads = y_grads[0:num_tensors]
                        grads_i = y_grads[num_tensors:]
                        layer_grads[j][i] = [
                            g for g in grads_i if g is not None
                        ]
                    grads_0 = tf.gradients(ys=ys,
                                           xs=layer_vars[0],
                                           grad_ys=prev_grads,
                                           colocate_gradients_with_ops=True)
                    layer_grads[j][0] = [g for g in grads_0 if g is not None]

                grads_set = []
                vars_set = []
                for i in xrange(slice_num):
                    for j in xrange(len(layer_grads[0][i])):
                        grad_i_set = [
                            layer_grads[m][i][j]
                            for m in range(micro_batch_num)
                        ]
                        if micro_batch_num == 1:
                            with tf.device(grad_i_set[0].device):
                                acc_grads = grad_i_set[0]
                        else:
                            with tf.control_dependencies(
                                    grad_i_set), tf.device(
                                        grad_i_set[0].device):  # replica
                                if isinstance(grad_i_set[0], tf.IndexedSlices):
                                    acc_grads = tf.add_n(grad_i_set)
                                else:
                                    acc_grads = tf.accumulate_n(grad_i_set)
                        grads_set.append(acc_grads)
                        vars_set.append(layer_vars[i][j])
                grads_and_vars = zip(grads_set, vars_set)

                if hparams.cross_pipeline and hvd.size() >= 1:
                    devices = cluster_utils.get_pipeline_devices(
                        hparams.pipeline_device_num)
                    gradients_list = [[] for i in xrange(len(devices))]
                    for grad, var in grads_and_vars:
                        for i in xrange(len(devices)):
                            if var.device == devices[i]:
                                gradients_list[i].append((grad, var))
                                break
                    avg_grads_and_vars = []
                    for i in xrange(len(devices)):
                        with tf.device(devices[i]):
                            for grad, var in gradients_list[i]:
                                if isinstance(grad, tf.IndexedSlices):
                                    grad = tf.convert_to_tensor(grad)
                                avg_grad = hvd.allreduce(grad)
                                avg_grads_and_vars.append((avg_grad, var))
                    grads_and_vars = avg_grads_and_vars
                gradients = [grad for grad, _ in grads_and_vars]


            else:
                # Gradients
                gradients = tf.gradients(self.train_loss,
                                         params,
                                         colocate_gradients_with_ops=True)
            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            #self.grad_norm_summary = grad_norm_summary
            #self.grad_norm = grad_norm

            if DAPPLE_TEST:
                self.update = opt.apply_gradients(grads_and_vars,
                                                  global_step=self.global_step)
            else:
                self.update = opt.apply_gradients(zip(clipped_grads, params),
                                                  global_step=self.global_step)

            # Summary
            self.train_summary = self._get_train_summary()
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        utils.print_out(
            "Format: <name>, <shape>, <size(MB)>, <(soft) device placement>")
        param_total_size = 0.0
        for param in params:
            param_total_size += np.asarray(
                param.shape.as_list()[0:]).prod() * 4.0 / 1000 / 1000
            utils.print_out("  %s, %s, %.2f, %s" %
                            (param.name, str(param.get_shape()),
                             np.asarray(param.shape.as_list()[0:]).prod() *
                             4.0 / 1000 / 1000, param.op.device))
        print("# Total size of trainable variables: %0.2f", param_total_size)
Example #35
0
tf.reduce_sum(x, [0, 1]) ==> 6
"""

# Compute the mean/max/min/product/logical-AND/logical-OR over all elements
# of the input tensor, or over the given axes (just like reduce_sum).
tf.reduce_mean(input_tensor, axis=None, keep_dims=False, name=None)
tf.reduce_max(input_tensor, axis=None, keep_dims=False, name=None)
tf.reduce_min(input_tensor, axis=None, keep_dims=False, name=None)
tf.reduce_prod(input_tensor, axis=None, keep_dims=False, name=None)
tf.reduce_all(input_tensor, axis=None, keep_dims=False, name=None)  # True iff all elements are True
tf.reduce_any(input_tensor, axis=None, keep_dims=False, name=None)  # True iff at least one element is True

-------------------------------------------
# Everything above this line matches the corresponding NumPy usage exactly.
-------------------------------------------

# inputs is a list; computes the element-wise sum of all tensors in the list.
# tf.add(x, y, name=None) can only add two tensors; this function extends that.
tf.accumulate_n(inputs, shape=None, tensor_dtype=None, name=None)
 
# Computes log(sum(exp(elements across dimensions of a tensor)))
tf.reduce_logsumexp(input_tensor, axis=None, keep_dims=False, name=None)
 
# Computes number of nonzero elements across dimensions of a tensor
tf.count_nonzero(input_tensor, axis=None, keep_dims=False, name=None)

# Compute the cumulative sum of the tensor x along axis
tf.cumsum(x, axis=0, exclusive=False, reverse=False, name=None)
# Eg:
tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
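
As a quick, runnable illustration of the reduce_logsumexp entry above (a
sketch, not part of the original snippet): the op applies the usual max-shift,
so it agrees with the naive formula wherever that formula is finite, without
overflowing for large inputs.

import tensorflow as tf

x = tf.constant([[1000.0, 1000.0], [0.0, 0.0]])
stable = tf.reduce_logsumexp(x, axis=1)           # ~[1000.693, 0.693]
naive = tf.log(tf.reduce_sum(tf.exp(x), axis=1))  # [inf, 0.693] -- overflows

with tf.Session() as sess:
    print(sess.run([stable, naive]))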
Example #36
0
import numpy as np
import tensorflow as tf

# (definitions of x, z and the reduce_sum/prod/min/max ops are elided
# in the original snippet)
z_reduce_all = tf.reduce_all(z, reduction_indices=[0, 2])

# tf.reduce_any
x = np.random.randint(0, 2, 10 * 5 * 4)
z = np.empty(10 * 5 * 4, dtype=bool)  # np.bool is deprecated; use plain bool
for i, x_ in enumerate(x):
    if x_ > 0:
        z[i] = True
    else:
        z[i] = False
z = z.reshape((10, 5, 4))
z_reduce_any = tf.reduce_any(z, reduction_indices=[0, 2])

# tf.accumulate_n
inputs = [np.random.rand(5, 4, 3)] * 3
z_accumulate_n = tf.accumulate_n(inputs, shape=(5, 4, 3))  # shape must match the inputs

with tf.Session() as sess:

    print("tf.reduce_sum")
    print(sess.run(z_reduce_sum))

    print("tf.reduce_prod")
    print(sess.run(z_reduce_prod))

    print("tf.reduce_min")
    print(sess.run(z_reduce_min))

    print("tf.reduce_max")
    print(sess.run(z_reduce_max))
Example #37
0
    def build_model(self, batch_queue, tower, opt, scope):
        """
            The main function where the bilevel approach is used
        """
        imgs_train, labels_train = batch_queue.get_next()

        tf.summary.histogram('labels', labels_train)

        # We split the training batches in the pre-defined splits (each containing the same label distribution)
        num_split = self.data_generator.batch_splits
        imgs_train_list = tf.split(imgs_train, num_split)
        labels_train_list = tf.split(labels_train, num_split)

        preds_list = []
        loss_list = []
        # Iterate over all the batch splits
        for i, (imgs,
                labels) in enumerate(zip(imgs_train_list, labels_train_list)):
            tf.summary.image('imgs/train',
                             montage_tf(imgs, 1, 8),
                             max_outputs=1)

            # Create the model
            reuse = True if (tower > 0 or i > 0) else None
            preds, layers = self.model.net(imgs,
                                           self.data_generator.num_classes,
                                           reuse=reuse)
            preds_list.append(preds)

            # Compute losses
            loss = self.model.loss(scope, preds,
                                   self.data_generator.format_labels(labels),
                                   tower)
            tf.get_variable_scope().reuse_variables()

            # Handle dependencies with update_ops (batch-norm)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            if update_ops:
                updates = tf.group(*update_ops)
                loss = control_flow_ops.with_dependencies([updates], loss)

            # Store the loss on this split in the list
            loss_list.append(loss)

        # Calculate the gradients on all the batch splits.
        weights = get_variables_to_train(self.train_scopes)
        grads_list = [opt.compute_gradients(l, weights) for l in loss_list]

        # A dictionary with a list of gradients corresponding to the model variables
        grads_accum = {v: [] for (_, v) in grads_list[0]}

        # Flatten the gradients of each split
        grads_flat = [
            tf.concat([tf.reshape(g, (-1, 1)) for (g, v) in grad], axis=0)
            for grad in grads_list
        ]

        # Compute the mini-batch weights
        val_grad = grads_flat[0]
        w = [
            tf.divide(
                tf.reduce_sum(tf.multiply(val_grad, train_grad)),
                tf.reduce_sum(tf.multiply(train_grad, train_grad)) + self.mu)
            for train_grad in grads_flat[1:]
        ]

        # Multiply mini-batch gradients by l1 normalized weights
        w_l1norm = tf.reduce_sum(tf.abs(w))
        for i, grads in enumerate(grads_list[1:]):
            for g, v in grads:
                grads_accum[v].append(tf.multiply(g, w[i] / w_l1norm))
        tf.summary.histogram('w', tf.stack(w))

        # Apply weight-decay
        grads_wd = {
            v: self.model.weight_decay *
            v if v.op.name.endswith('weights') else 0.0
            for (_, v) in grads_list[0]
        }

        # Accumulate all the gradients per variable
        grads = [(tf.accumulate_n(grads_accum[v]) + grads_wd[v], v)
                 for (_, v) in grads_list[0]]

        self.summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
        return tf.reduce_mean(loss_list), grads, layers
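
The weighting rule in this example can be stated compactly: each training
split's flattened gradient g_i receives weight w_i = <g_val, g_i> /
(<g_i, g_i> + mu), i.e. its alignment with the validation-split gradient. A
minimal sketch under illustrative names (val_grad, train_grads, mu are not
from the original code):

import tensorflow as tf

def minibatch_weights(val_grad, train_grads, mu=1e-3):
    """val_grad: flat 1-D tensor; train_grads: list of flat 1-D tensors."""
    return [
        tf.reduce_sum(val_grad * g) / (tf.reduce_sum(g * g) + mu)
        for g in train_grads
    ]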
Example #38
0
    def model_fn(features, labels, mode, params):
        inputs = [features['input_ids'], features['input_mask'], features['segment_ids'],
                  features['cls_index'], features['p_mask'], features['start_positions'],
                  features['end_positions'], features['is_impossible']]

        slice_xlnet = XlnetSlice(
            embedding_dim=FLAGS.embedding_dim,
            num_token=FLAGS.num_token,
            num_layer=FLAGS.num_layer,
            num_head=FLAGS.num_head,
            feed_forward_dim=FLAGS.feed_forward_dim,
            attention_head_dim=FLAGS.attention_head_dim,
            target_len=FLAGS.target_len,
            dropout=FLAGS.dropout,
            is_training=True,
            attention_dropout=FLAGS.dropatt)

        total_loss, outputs = slice_xlnet.build(inputs)

        devices = cluster_utils.get_pipeline_devices(FLAGS.pipeline_device_num)
        slice_num = len(devices)
        micro_batch_num = FLAGS.micro_batch_num
        losses = []
        all_outputs = []
        losses.append(total_loss)
        all_outputs.append(outputs)
        layer_grads = [[[] for i in xrange(slice_num)] for j in xrange(micro_batch_num)]
        layer_vars = [[] for i in xrange(slice_num)]
        remained_vars = tf.trainable_variables()
        ys = losses[0]
        prev_grads=None
        # Compute grads for stages slice_num-1 down to 1.
        for i in xrange(slice_num - 1, 0, -1):
          vars_i = [v for v in remained_vars if v.device==devices[i]]
          remained_vars = [v for v in remained_vars if v not in vars_i]
          prev_y = all_outputs[0][i-1]
          y_grads = tf.gradients(ys=ys, xs=[prev_y]+vars_i, grad_ys=prev_grads, colocate_gradients_with_ops=True)
          ys = prev_y
          prev_grads = y_grads[0]
          grads_i = y_grads[1:]
          layer_grads[0][i] = [g for g in grads_i if g is not None]
          layer_vars[i] = [v for (g, v) in zip(grads_i, vars_i) if g is not None]
        # Compute grads for stage 0.
        grads_0 = tf.gradients(ys=ys, xs=remained_vars, grad_ys=prev_grads, colocate_gradients_with_ops=True)
        layer_grads[0][0] = [g for g in grads_0 if g is not None]
        layer_vars[0] = [v for (g, v) in zip(grads_0, remained_vars) if g is not None]

        # Remaining micro batches.
        for j in xrange(1, micro_batch_num):
          dep_outputs = []
          for i in xrange(slice_num):
            dep_outputs.append(all_outputs[j-1][i] if i+j < slice_num else layer_grads[i+j-slice_num][i])
          loss, outputs = slice_xlnet.build(inputs, dep_outputs=dep_outputs)
          losses.append(loss)
          all_outputs.append(outputs)
          ys = losses[j]
          prev_grads=None
          for i in xrange(slice_num - 1, 0, -1):
            prev_y = all_outputs[j][i-1]
            y_grads = tf.gradients(ys=ys, xs=[prev_y]+layer_vars[i], grad_ys=prev_grads, colocate_gradients_with_ops=True)
            ys = prev_y
            prev_grads = y_grads[0]
            grads_i = y_grads[1:]
            layer_grads[j][i] = [g for g in grads_i if g is not None]
          grads_0 = tf.gradients(ys=ys, xs=layer_vars[0], grad_ys=prev_grads, colocate_gradients_with_ops=True)
          layer_grads[j][0] = [g for g in grads_0 if g is not None]

        grads_set = []
        vars_set = []
        for i in xrange(slice_num):
          for j in xrange(len(layer_grads[0][i])):
            grad_i_set = [layer_grads[m][i][j] for m in range(micro_batch_num)]
            if micro_batch_num == 1:
              with tf.device(devices[i]):
                acc_grads = grad_i_set[0]
            else:
              with tf.control_dependencies(grad_i_set), tf.device(devices[i]):
                if isinstance(grad_i_set[0], tf.IndexedSlices):
                  acc_grads = tf.add_n(grad_i_set)
                else:
                  acc_grads = tf.accumulate_n(grad_i_set)
            grads_set.append(acc_grads)
            vars_set.append(layer_vars[i][j])
        grads_and_vars = zip(grads_set, vars_set)

#        init_from_checkpoint(FLAGS)
        train_op = get_train_op(FLAGS, grads_and_vars)

        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op, )
Example #39
0
    def testZeroArgs(self):
        with self.test_session():
            with self.assertRaises(ValueError):
                tf_val = tf.accumulate_n([])
                tf_val.eval()
Example #40
0
import numpy as np
import tensorflow as tf

input_a = np.array([[1, 1, 2], [2, 3, 4]], dtype=np.float32)
input_b = np.array([[True, False], [True, True]])
input_c = np.array([[1.3, 1.2, 2.3], [2., 3., 2.3]], dtype=np.float32)

input_a_sum_column = tf.reduce_sum(input_a, reduction_indices=0)
input_a_sum_row = tf.reduce_sum(input_a, reduction_indices=1, keep_dims=True)

input_a_prod_column = tf.reduce_prod(input_a, reduction_indices=0)
input_a_prod_row = tf.reduce_prod(input_a, reduction_indices=1, keep_dims=True)

input_a_min = tf.reduce_min(input_a, reduction_indices=1)
input_a_max = tf.reduce_max(input_a, reduction_indices=1)
input_a_mean = tf.reduce_mean(input_a, reduction_indices=1, keep_dims=True)

input_b_and = tf.reduce_all(input_b, reduction_indices=1)
input_b_or = tf.reduce_any(input_b, reduction_indices=1)

input_accum = tf.accumulate_n(inputs=[input_a, input_c])
input_cum = tf.cumsum(x=[input_a_sum_column, input_a_prod_column])

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    print(sess.run(input_a_sum_column), '\n', sess.run(input_a_sum_row))
    print(sess.run(input_a_prod_column), '\n', sess.run(input_a_prod_row))
    print(sess.run(input_a_min), '\n', sess.run(input_a_max), '\n',
          sess.run(input_a_mean))
    print(sess.run(input_accum), '\n', sess.run(input_cum))
Example #41
0
def get_run_op():
  global batch_size
  global slice_size
  global feature_size
  batch_size = FLAGS.batch_size
  slice_size = FLAGS.hidden_size // FLAGS.num_gpus  # integer division for a valid shape
  feature_size = slice_size * FLAGS.num_gpus
  print("Slice size: {}".format(slice_size))
  data = []
  for i in xrange(FLAGS.num_gpus):
    with tf.device('/gpu:%d' % i):
      data.append(tf.get_variable(
          name='data%d' % i,
          shape=[batch_size, slice_size],
          trainable=False))
  # weights
  w = []
  for i in xrange(FLAGS.num_layers):
    w.append([])
    for j in xrange(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % j):
        with tf.variable_scope('fc%d' % i):
          w[i].append(tf.get_variable(
              name='w%d' % j,
              shape=[slice_size,feature_size],
              trainable=True))
  # ff
  fwd = []
  last = data
  for i in xrange(FLAGS.num_layers):
    with tf.name_scope('fc_ff%d' % i):
      fwd.append(last)
      tmp = []
      for j in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % j):
          # matmult
          y = tf.matmul(last[j], w[i][j])
          if FLAGS.num_gpus > 1:
            # split
            tmp.append(tf.split(split_dim=1, num_split=FLAGS.num_gpus, value=y))
          else:
            tmp.append(y)
      if FLAGS.num_gpus > 1:
        # reduce
        red = []
        for j in xrange(FLAGS.num_gpus):
          with tf.device('/gpu:%d' % j):
            red.append(tf.accumulate_n([s[j] for s in tmp]))
        last = red
      else:
        last = tmp
  # bp
  targets = []
  for i in reversed(xrange(FLAGS.num_layers)):
    with tf.name_scope('fc_bp%d' % i):
      # convert col -> rep
      tmp = []
      if FLAGS.num_gpus > 1:
        for j in xrange(FLAGS.num_gpus):
          with tf.device('/gpu:%d' % j):
            tmp.append(tf.concat(concat_dim=1, values=last))
      else:
        tmp = last
      last = []
      for j in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % j):
          with tf.name_scope('bp'):
            # matmult: bp
            dy = tf.matmul(tmp[j], w[i][j], transpose_b=True)
            last.append(dy)
          if i == 0:
            dep = [] # no manual scheduling dep since the last bp is not needed
          else:
            dep = [dy] # add manual dep for better scheduling decision
          with tf.control_dependencies(dep), tf.name_scope('grad'):
            # matmult: grad
            dw = tf.matmul(fwd[i][j], tmp[j], transpose_a=True)
          # update
          targets.append(dw)
  with tf.control_dependencies(targets):
    train_op = tf.no_op()
  init_op = tf.initialize_all_variables()
  return init_op, train_op
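
The forward pass above implements a model-parallel matmul: each GPU multiplies
its activation slice by its weight slice, and the partial products are summed
with tf.accumulate_n. A small self-contained check of that identity (sizes and
names are illustrative):

import numpy as np
import tensorflow as tf

num_gpus, batch, slice_size = 2, 4, 3
feature_size = slice_size * num_gpus

x_slices = [tf.constant(np.random.rand(batch, slice_size), tf.float32)
            for _ in range(num_gpus)]
w_slices = [tf.constant(np.random.rand(slice_size, feature_size), tf.float32)
            for _ in range(num_gpus)]

partials = [tf.matmul(x, w) for x, w in zip(x_slices, w_slices)]
y = tf.accumulate_n(partials)  # == matmul over the unsliced tensors

full = tf.matmul(tf.concat(x_slices, 1), tf.concat(w_slices, 0))
with tf.Session() as sess:
    print(sess.run(tf.reduce_max(tf.abs(y - full))))  # ~0 up to float error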