def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
    """Apply one shared softmax layer to every time step of a sequence.

    A single weight matrix and bias vector are created once and reused for
    each step of the unstacked input.

    Args:
      inputs: (length, batch_size, depth) tensor.
      noutput: output depth.
      scope: optional scope name.
      name: optional name for the stacked output tensor.
      linear_name: name for the linear (pre-softmax) output of each step.

    Returns:
      A tensor of size (length, batch_size, noutput).
    """
    length, _, ninputs = _shape(inputs)
    steps = array_ops.unstack(inputs)
    with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
        weights_init = random_ops.truncated_normal(
            [0 + ninputs, noutput], stddev=0.1)
        biases_init = constant_op.constant(0.1, shape=[noutput])
        weights = variables.model_variable("weights", initializer=weights_init)
        biases = variables.model_variable("biases", initializer=biases_init)
        per_step = []
        for t in xrange(length):
            step_input = steps[t]
            with variable_scope.variable_scope(
                    scope, "SequenceSoftmaxStep", [step_input]):
                # TODO(tmb) consider using slim.fully_connected(...,
                # activation_fn=tf.nn.softmax)
                logits = nn_ops.xw_plus_b(
                    step_input, weights, biases, name=linear_name)
                per_step.append(nn_ops.softmax(logits))
        # Re-assemble the per-step results along the leading (time) axis.
        outputs = array_ops.stack(per_step, name=name)
    return outputs
def testGetLocalVariables(self):
    """Model variables created in scopes 'A' and 'B' are not local variables."""
    with self.test_session():
        with variable_scope.variable_scope('A'):
            _ = variables_lib2.model_variable('a', [5])
        with variable_scope.variable_scope('B'):
            _ = variables_lib2.model_variable('a', [5])
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual([], variables_lib2.get_local_variables('A'))
        self.assertEqual([], variables_lib2.get_local_variables('B'))
def testGetModelVariables(self):
    """get_model_variables(scope) returns exactly the variable made in that scope."""
    with self.test_session():
        with variable_scope.variable_scope('A'):
            a = variables_lib2.model_variable('a', [5])
        with variable_scope.variable_scope('B'):
            b = variables_lib2.model_variable('a', [5])
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual([a], variables_lib2.get_model_variables('A'))
        self.assertEqual([b], variables_lib2.get_model_variables('B'))
def testVariableWithVariableDeviceChooser(self):
    """Variables placed via VariableDeviceChooser sit on cpu:0 with their init ops.

    For each of the two variables the test checks the assigned device and
    that the initial-value op shares the variable op's colocation groups.
    """
    with ops.Graph().as_default():
        device_fn = variables_lib2.VariableDeviceChooser()
        with arg_scope([variables_lib2.model_variable], device=device_fn):
            a = variables_lib2.model_variable('a', [5])
            b = variables_lib2.model_variable('b', [20])
            self.assertDeviceEqual(a.device, 'cpu:0')
            self.assertEqual(a.initial_value.op.colocation_groups(),
                             a.op.colocation_groups())
            self.assertDeviceEqual(b.device, 'cpu:0')
            # Previously this re-checked `a`; the colocation of `b` was
            # never actually verified.
            self.assertEqual(b.initial_value.op.colocation_groups(),
                             b.op.colocation_groups())
def _model_variable_getter(getter, name, shape=None, dtype=None,
                           initializer=None, regularizer=None, trainable=True,
                           collections=None, caching_device=None,
                           partitioner=None, rename=None, use_resource=None,
                           **_):
    """Getter that uses model_variable for compatibility with core layers.

    If `rename` maps the last '/'-separated component of `name` to another
    string, that component is replaced before the variable is requested.
    Extra keyword arguments are accepted and ignored.
    """
    prefix, separator, leaf = name.rpartition('/')
    if rename and leaf in rename:
        # Only the final path component is rewritten; the scope prefix stays.
        name = prefix + separator + rename[leaf]
    return variables.model_variable(
        name,
        shape=shape,
        dtype=dtype,
        initializer=initializer,
        regularizer=regularizer,
        collections=collections,
        trainable=trainable,
        caching_device=caching_device,
        partitioner=partitioner,
        custom_getter=getter,
        use_resource=use_resource)
def testNotInLocalVariables(self):
    """A model variable is global and in MODEL_VARIABLES, but not local."""
    with self.test_session():
        with variable_scope.variable_scope('A'):
            a = variables_lib2.model_variable('a', [5])
            # assertIn/assertNotIn report the offending member and container
            # on failure, unlike assertTrue(x in y).
            self.assertIn(a, variables_lib.global_variables())
            self.assertIn(
                a, ops.get_collection(ops.GraphKeys.MODEL_VARIABLES))
            self.assertNotIn(a, variables_lib.local_variables())
def testNameAndShape(self):
    """A model variable gets the scoped op name, requested shape and is listed."""
    with self.test_session():
        with variable_scope.variable_scope('A'):
            a = variables_lib2.model_variable('a', [5])
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(a.op.name, 'A/a')
            self.assertListEqual(a.get_shape().as_list(), [5])
            self.assertListEqual([a], variables_lib2.get_model_variables('A'))
def bow_encoder(ids, vocab_size, embed_dim, sparse_lookup=True,
                initializer=None, regularizer=None, trainable=True,
                scope=None, reuse=None):
    """Maps a sequence of symbols to a vector per example by averaging embeddings.

    Args:
      ids: `[batch_size, doc_length]` `Tensor` or `SparseTensor` of type
        `int32` or `int64` with symbol ids.
      vocab_size: Integer number of symbols in vocabulary.
      embed_dim: Integer number of dimensions for embedding matrix.
      sparse_lookup: `bool`, if `True`, converts ids to a `SparseTensor`
        and performs a sparse embedding lookup. This is usually faster,
        but not desirable if padding tokens should have an embedding. Empty
        rows are assigned a special embedding.
      initializer: An initializer for the embeddings, if `None` default for
        current scope is used.
      regularizer: Optional regularizer for the embeddings.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional string specifying the variable scope for the op,
        required if `reuse=True`.
      reuse: If `True`, variables inside the op will be reused.

    Returns:
      Encoding `Tensor` `[batch_size, embed_dim]` produced by
      averaging embeddings.

    Raises:
      ValueError: If `embed_dim` or `vocab_size` are not specified.
      TypeError: If `sparse_lookup` is false and `ids` is a `SparseTensor`.
    """
    if not vocab_size or not embed_dim:
        raise ValueError('Must specify vocab size and embedding dimension')
    with variable_scope.variable_scope(
            scope, 'bow_encoder', [ids], reuse=reuse):
        embeddings = variables.model_variable(
            'embeddings',
            shape=[vocab_size, embed_dim],
            initializer=initializer,
            regularizer=regularizer,
            trainable=trainable)
        if sparse_lookup:
            if isinstance(ids, sparse_tensor.SparseTensor):
                sparse_ids = ids
            else:
                sparse_ids = sparse_ops.dense_to_sparse_tensor(ids)
            return contrib_embedding_ops.safe_embedding_lookup_sparse(
                [embeddings], sparse_ids, combiner='mean', default_id=0)
        if isinstance(ids, sparse_tensor.SparseTensor):
            # The value is interpolated into the message; it used to be
            # passed as a stray second exception argument, leaving a
            # literal '%s' in the text.
            raise TypeError(
                'ids are expected to be dense Tensor, got: %s' % (ids,))
        # Dense path: average the looked-up embeddings over axis 1.
        return math_ops.reduce_mean(
            embedding_ops.embedding_lookup(embeddings, ids),
            reduction_indices=1)
def l2_normalization(
        inputs,
        scaling=False,
        scale_initializer=init_ops.ones_initializer(),
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None):
    """L2-normalize `inputs` over axes 1 .. rank-2, with optional scaling.

    For a 4-D `[batch_size, height, width, channels]` input these axes are
    the spatial ones (height and width).

    Args:
      inputs: a 4-D tensor with dimensions
        [batch_size, height, width, channels].
      scaling: whether or not to multiply the normalized result by a
        `gamma` variable whose shape is the last dimension of `inputs`.
      scale_initializer: An initializer for `gamma`.
      reuse: whether or not the layer and its variables should be reused.
        To be able to reuse the layer scope must be given.
      variables_collections: optional list of collections for all the
        variables or a dictionary containing a different list of
        collection per variable.
      outputs_collections: collection to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.

    Returns:
      A `Tensor` representing the output of the operation.
    """
    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        gamma_shape = static_shape[-1:]
        base_dtype = inputs.dtype.base_dtype
        # Every axis except the first (batch) and the last (channels).
        axes = tf.range(1, rank - 1)
        outputs = nn.l2_normalize(inputs, axes, epsilon=1e-12)
        if scaling:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            gamma = variables.model_variable(
                'gamma',
                shape=gamma_shape,
                dtype=base_dtype,
                initializer=scale_initializer,
                collections=gamma_collections,
                trainable=trainable)
            outputs = tf.multiply(outputs, gamma)
        return utils.collect_named_outputs(
            outputs_collections, sc.original_name_scope, outputs)
def testDeviceFn(self):
    """A callable device function is honoured for each created variable.

    The device function hands out '/cpu:0', '/cpu:1', ... on successive
    calls; the test checks the device of each variable and that its
    initial-value op shares the variable op's colocation groups.
    """

    class CountingDeviceFn(object):
        """Returns '/cpu:<k>' where k counts previous calls."""

        def __init__(self):
            self.calls = 0

        def __call__(self, op):
            device = '/cpu:%d' % self.calls
            self.calls += 1
            return device

    with ops.Graph().as_default():
        device_fn = CountingDeviceFn()
        with arg_scope([variables_lib2.model_variable], device=device_fn):
            a = variables_lib2.model_variable('a', [5])
            b = variables_lib2.model_variable('b', [20])
            self.assertDeviceEqual(a.device, '/cpu:0')
            self.assertEqual(a.initial_value.op.colocation_groups(),
                             a.op.colocation_groups())
            self.assertDeviceEqual(b.device, '/cpu:1')
            self.assertEqual(b.initial_value.op.colocation_groups(),
                             b.op.colocation_groups())
def embed_sequence(ids, vocab_size=None, embed_dim=None, unique=False,
                   initializer=None, regularizer=None, trainable=True,
                   scope=None, reuse=None):
    """Maps a sequence of symbols to a sequence of embeddings.

    Typical use case would be reusing embeddings between an encoder and
    decoder.

    Args:
      ids: `[batch_size, doc_length]` `Tensor` of type `int32` or `int64`
        with symbol ids.
      vocab_size: Integer number of symbols in vocabulary.
      embed_dim: Integer number of dimensions for embedding matrix.
      unique: If `True`, will first compute the unique set of indices, and
        then lookup each embedding once, repeating them in the output as
        needed.
      initializer: An initializer for the embeddings, if `None` default for
        current scope is used.
      regularizer: Optional regularizer for the embeddings.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      scope: Optional string specifying the variable scope for the op,
        required if `reuse=True`.
      reuse: If `True`, variables inside the op will be reused.

    Returns:
      `Tensor` of `[batch_size, doc_length, embed_dim]` with embedded
      sequences.

    Raises:
      ValueError: if `embed_dim` or `vocab_size` are not specified when
        `reuse` is `None` or `False`.
    """
    if not (reuse or (vocab_size and embed_dim)):
        # A space was missing between the concatenated literals, producing
        # "...when notreusing...".
        raise ValueError(
            'Must specify vocab size and embedding dimension when not '
            'reusing. Got vocab_size=%s and embed_dim=%s' % (
                vocab_size, embed_dim))
    with variable_scope.variable_scope(
            scope, 'EmbedSequence', [ids], reuse=reuse):
        shape = [vocab_size, embed_dim]
        # Explicit parentheses: the shape is only left unspecified when
        # reusing. The guard above guarantees both values are set when not
        # reusing, so results are unchanged from the unparenthesized form.
        if reuse and (vocab_size is None or embed_dim is None):
            shape = None
        embeddings = variables.model_variable(
            'embeddings',
            shape=shape,
            initializer=initializer,
            regularizer=regularizer,
            trainable=trainable)
        if unique:
            return contrib_embedding_ops.embedding_lookup_unique(
                embeddings, ids)
        return embedding_ops.embedding_lookup(embeddings, ids)
def bias_add(inputs, activation_fn=None,
             initializer=init_ops.zeros_initializer, regularizer=None,
             reuse=None, variables_collections=None,
             outputs_collections=None, trainable=True, scope=None):
    """Adds a bias to the inputs.

    Can be used as a normalizer function for conv2d and fully_connected.

    Args:
      inputs: a tensor of with at least rank 2 and value for the last
        dimension, e.g. `[batch_size, depth]`, `[None, None, None, depth]`.
      activation_fn: Optional activation function, applied after the bias.
      initializer: An initializer for the bias, defaults to 0.
      regularizer: A regularizer like the result of `l1_regularizer` or
        `l2_regularizer`.
      reuse: whether or not the layer and its variables should be reused.
        To be able to reuse the layer scope must be given.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_op_scope.

    Returns:
      a tensor representing the result of adding biases to the inputs.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'BiasAdd', reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        base_dtype = inputs.dtype.base_dtype
        # One bias value per entry of the last input dimension.
        depth = utils.last_dimension(inputs.get_shape(), min_rank=2)
        collections = utils.get_variable_collections(
            variables_collections, 'biases')
        biases = variables.model_variable(
            'biases',
            shape=[depth],
            dtype=base_dtype,
            initializer=initializer,
            regularizer=regularizer,
            collections=collections,
            trainable=trainable)
        result = nn.bias_add(inputs, biases)
        if activation_fn:
            result = activation_fn(result)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, result)
def _create_joint_embedding_lookup(columns_to_tensors,
                                   embedding_lookup_arguments,
                                   num_outputs,
                                   trainable,
                                   weight_collections):
    """Creates an embedding lookup for all columns sharing a single weight.

    The ids of each column are shifted by the summed vocabulary size of the
    columns before it, the shifted sparse tensors are concatenated along
    axis 1, and one `[total_vocab_size, num_outputs]` variable is looked up
    with the 'sum' combiner.

    Returns:
      A `(variable_list, predictions)` tuple.
    """
    for lookup_args in embedding_lookup_arguments:
        assert lookup_args.weight_tensor is None, (
            'Joint sums for weighted sparse columns are not supported. '
            'Please use weighted_sum_from_feature_columns instead.')
        assert lookup_args.combiner == 'sum', (
            'Combiners other than sum are not supported for joint sums. '
            'Please use weighted_sum_from_feature_columns instead.')
    assert len(embedding_lookup_arguments) >= 1, (
        'At least one column must be in the model.')

    id_offset = 0
    shifted_columns = []
    for lookup_args in embedding_lookup_arguments:
        column_ids = lookup_args.input_tensor
        # Move this column's ids into its own slice of the joint vocabulary.
        shifted_values = column_ids.values + id_offset
        shifted_columns.append(
            ops.SparseTensor(column_ids.indices, shifted_values,
                             column_ids.shape))
        id_offset += lookup_args.vocab_size
    joint_ids = sparse_ops.sparse_concat(1, shifted_columns)

    with variable_scope.variable_scope(
            None,
            default_name='linear_weights',
            values=columns_to_tensors.values()):
        weights = contrib_variables.model_variable(
            name='weights',
            shape=[id_offset, num_outputs],
            dtype=dtypes.float32,
            initializer=init_ops.zeros_initializer,
            trainable=trainable,
            collections=weight_collections)
        # The lookup takes a list; anything that is not a plain Variable is
        # expanded through its variable list.
        if not isinstance(weights, variables.Variable):
            weight_list = weights._get_variable_list()  # pylint: disable=protected-access
        else:
            weight_list = [weights]
        predictions = embedding_ops.safe_embedding_lookup_sparse(
            weight_list,
            joint_ids,
            sparse_weights=None,
            default_id=0,
            combiner='sum',
            name='_weights')
        return weight_list, predictions
def _create_embedding_lookup(column, columns_to_tensors,
                             embedding_lookup_arguments, num_outputs,
                             trainable, weight_collections):
    """Creates variables and returns predictions for linear weights in a model.

    Args:
      column: the column we're working on; its name is used for the scope
        and for the lookup op name.
      columns_to_tensors: a map from column name to tensors.
      embedding_lookup_arguments: arguments for embedding lookup
        (vocab_size, initializer, input_tensor, weight_tensor, combiner).
      num_outputs: how many outputs.
      trainable: whether the variable we create is trainable.
      weight_collections: weights will be placed here.

    Returns:
      variables: the created embeddings, as a list.
      predictions: the computed predictions.
    """
    lookup_args = embedding_lookup_arguments
    with variable_scope.variable_scope(
            None,
            default_name=column.name,
            values=columns_to_tensors.values()):
        weights = contrib_variables.model_variable(
            name='weights',
            shape=[lookup_args.vocab_size, num_outputs],
            dtype=dtypes.float32,
            initializer=lookup_args.initializer,
            trainable=trainable,
            collections=weight_collections)
        # The lookup takes a list; anything that is not a plain Variable is
        # expanded through its variable list.
        if not isinstance(weights, variables.Variable):
            weight_list = weights._get_variable_list()  # pylint: disable=protected-access
        else:
            weight_list = [weights]
        predictions = embedding_ops.safe_embedding_lookup_sparse(
            weight_list,
            lookup_args.input_tensor,
            sparse_weights=lookup_args.weight_tensor,
            default_id=0,
            combiner=lookup_args.combiner,
            name=column.name + '_weights')
        return weight_list, predictions
def part_fn(self, feature_part, n, reuse):
    """Run a separable convolution, optional bias and activation on one part.

    Args:
      feature_part: input tensor; its last dimension is taken as the channel
        count.
      n: string suffix identifying this part; used in the scope and
        variable names.
      reuse: passed through to `tf.variable_scope`.

    Returns:
      The activated output tensor.
    """
    # NOTE(review): the pointwise variable name is concatenated without a '/'
    # separator (unlike the depthwise one), and `kernel_initializer` is
    # called while `depthwise_initializer` is passed as-is. Both are kept
    # unchanged to preserve variable names and behavior - confirm intent.
    scope_name = self.name + '/separable_split/' + n
    with tf.variable_scope(scope_name, reuse=reuse):
        channels = int(feature_part.shape[-1])
        ksize = self.kernel_size

        # Depthwise filter layout:
        # [filter_height, filter_width, in_channels, channel_multiplier],
        # here with a channel multiplier of 1.
        depthwise_filter = tf.get_variable(
            scope_name + '/depthwise_filter',
            [ksize, ksize, channels, 1],
            tf.float32,
            self.depthwise_initializer)

        # Pointwise filter layout:
        # [1, 1, channel_multiplier * in_channels, out_channels]; the output
        # channel count equals the input channel count.
        pointwise_filter = tf.get_variable(
            scope_name + 'pointwise_filter',
            [1, 1, channels, channels],
            tf.float32,
            self.kernel_initializer())

        result = tf.nn.separable_conv2d(
            feature_part,
            depthwise_filter=depthwise_filter,
            pointwise_filter=pointwise_filter,
            strides=self.stride,
            padding=self.padding,
        )
        if self.biases_initializer is not None:
            bias = variables.model_variable(
                'biases' + n,
                shape=[channels],
                dtype=feature_part.dtype,
                initializer=self.biases_initializer,
            )
            result = nn.bias_add(result, bias)
        result = self.act_fn(result)
    return result
def spatial_normalization(self, inputs):
    """L2-normalize `inputs` along its last axis and scale by a trainable gamma.

    Despite the method name, the normalization axis computed here is the
    final one (`rank - 1`). A `gamma` variable with the shape of that last
    dimension, initialized to ones, multiplies the normalized result.

    Args:
      inputs: tensor with a statically known rank.

    Returns:
      The normalized and scaled tensor.
    """
    with variable_scope.variable_scope(
            None, 'L2Normalization', [inputs], reuse=None) as sc:
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        # A one-element range: only the last axis is normalized.
        last_axis = tf.range(rank - 1, rank)
        normalized = nn.l2_normalize(inputs, last_axis, epsilon=1e-12)

        gamma_collections = utils.get_variable_collections(None, 'scale')
        gamma = variables.model_variable(
            'gamma',
            shape=static_shape[-1:],
            dtype=inputs.dtype.base_dtype,
            initializer=init_ops.ones_initializer(),
            collections=gamma_collections,
            trainable=True)
        scaled = tf.multiply(normalized, gamma)
        return utils.collect_named_outputs(
            None, sc.original_name_scope, scaled)
def _create_embedding_lookup(column, columns_to_tensors,
                             embedding_lookup_arguments, num_outputs,
                             trainable, weight_collections):
    """Creates variables and returns predictions for linear weights in a model.

    Args:
      column: the column we're working on; its name is used for the scope
        and for the lookup op name.
      columns_to_tensors: a map from column name to tensors.
      embedding_lookup_arguments: arguments for embedding lookup
        (vocab_size, initializer, input_tensor, weight_tensor, combiner).
      num_outputs: how many outputs.
      trainable: whether the variable we create is trainable.
      weight_collections: weights will be placed here.

    Returns:
      variables: the created embeddings, as a list.
      predictions: the computed predictions.
    """
    lookup_args = embedding_lookup_arguments
    with variable_scope.variable_scope(
            None,
            default_name=column.name,
            values=columns_to_tensors.values()):
        weights = contrib_variables.model_variable(
            name='weights',
            shape=[lookup_args.vocab_size, num_outputs],
            dtype=dtypes.float32,
            initializer=lookup_args.initializer,
            trainable=trainable,
            collections=weight_collections)
        # The lookup takes a list; a single variable is wrapped, anything
        # else is expanded through its variable list.
        if not fc._is_variable(weights):  # pylint: disable=protected-access
            weight_list = weights._get_variable_list()  # pylint: disable=protected-access
        else:
            weight_list = [weights]
        predictions = embedding_ops.safe_embedding_lookup_sparse(
            weight_list,
            lookup_args.input_tensor,
            sparse_weights=lookup_args.weight_tensor,
            combiner=lookup_args.combiner,
            name=column.name + '_weights')
        return weight_list, predictions
def init_state(self, state_name, batch_size, dtype, learned_state=False):
    """Creates an initial state compatible with this cell.

    Args:
      state_name: name of the state tensor.
      batch_size: model batch size.
      dtype: dtype for the tensor values i.e. tf.float32.
      learned_state: whether the initial state should be learnable. If
        false, the initial state is set to all 0's.

    Returns:
      ret: the created initial state, packed as a two-element structure.
    """
    state_size = (self.state_size_flat
                  if self._flatten_state else self.state_size)
    if learned_state:
        # One learnable variable per state component ...
        learned = [
            contrib_variables.model_variable(
                state_name + str(i),
                shape=s,
                dtype=dtype,
                initializer=tf.truncated_normal_initializer(stddev=0.03))
            for i, s in enumerate(state_size)
        ]
        # ... duplicated across the batch axis.
        ret_flat = [
            tf.stack([tensor for _ in range(int(batch_size))])
            for tensor in learned
        ]
    else:
        ret_flat = [
            tf.zeros([batch_size] + s, dtype=dtype, name=state_name)
            for s in state_size
        ]
    # The reshaped tensors are now kept. Previously the result was assigned
    # to the loop variable and discarded, so the reshape had no effect.
    ret_flat = [
        tf.reshape(r, [-1] + s) for s, r in zip(state_size, ret_flat)
    ]
    ret = tf.nest.pack_sequence_as(structure=[1, 1], flat_sequence=ret_flat)
    return ret
def _conv(x,shape,stride,padding,dilation_rate=None,w_name='w',b_name='b',
          std=0.01,wd=None,dtype=tf.float32,add_bias=True,device=None):
    """
    Define a convolutional layer with an (optional) bias term.

    For documentation, see `conv_block`.

    If DILATION_RATE is specified, an atrous convolution is used. In that
    case the STRIDE parameter is ignored: it is not passed to the atrous op.
    """
    kernel = _variable_with_weight_decay(w_name, shape=shape, stddev=std,
                                         wd=wd, dtype=dtype, device=device)
    if dilation_rate is not None:
        result = tf.nn.atrous_conv2d(x, kernel, dilation_rate,
                                     padding=padding)
    else:
        result = tf.nn.conv2d(x, kernel, strides=stride, padding=padding)

    if not add_bias:
        return result

    # One zero-initialized bias per output channel (last entry of `shape`).
    bias = variables.model_variable(
        b_name,
        shape=shape[-1:],
        dtype=dtype,
        initializer=tf.constant_initializer(0.0),
        device=device)
    return tf.nn.bias_add(result, bias)
def add_bias(x, n_units, biases_initializer, dtype, trainable):
    """Create a `biases` model variable and add it to `x`.

    Args:
      x: tensor the bias is added to.
      n_units: number of bias entries.
      biases_initializer: `None` (zeros are used), a 1-D numpy array of
        length `n_units` used as a constant initial value, or any other
        initializer, which is passed through unchanged.
      dtype: dtype of the bias variable.
      trainable: whether the bias variable is trainable.

    Returns:
      `x` with the bias added.

    Raises:
      ValueError: if a numpy initializer is not 1-D with `n_units` entries.
    """
    expected_shape = [n_units]
    variable_shape = expected_shape
    if biases_initializer is None:
        biases_initializer = tf.constant_initializer(0.0, dtype=tf.float32)
    elif isinstance(biases_initializer, np.ndarray):
        shape_matches = (biases_initializer.ndim == 1 and
                         biases_initializer.shape[0] == expected_shape[0])
        if not shape_matches:
            raise ValueError(''.join([
                'Shape of constant initializer (',
                str(biases_initializer.shape),
                ') does not match expected shape (',
                str(expected_shape),
                '). ',
            ]))
        # With a concrete array the shape is inferred from the initializer.
        variable_shape = None

    biases = variables.model_variable(
        'biases',
        shape=variable_shape,
        dtype=dtype,
        initializer=biases_initializer,
        trainable=trainable)
    return tf.nn.bias_add(x, biases)
def l2_normalization(inputs, scaling=False,
                     scale_initializer=init_ops.ones_initializer(),
                     reuse=None, variables_collections=None,
                     outputs_collections=None, trainable=True, scope=None):
    """L2-normalize `inputs` along its last axis, with optional scaling.

    Original author's note (translated): conv4_3 needs to be L2-normalized
    first, to reduce the discrepancy between that layer and the later ones.

    Args:
      inputs: tensor with a statically known rank.
      scaling: if true, multiply the normalized result by a `gamma`
        variable shaped like the last dimension of `inputs`.
      scale_initializer: initializer for `gamma`.
      reuse: passed to `variable_scope`.
      variables_collections: collections for the created variable.
      outputs_collections: collection to add the outputs to.
      trainable: whether `gamma` is trainable.
      scope: optional scope for `variable_scope`.

    Returns:
      The normalized (and optionally scaled) tensor.
    """
    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        base_dtype = inputs.dtype.base_dtype
        # A one-element range: only the last axis is normalized.
        last_axis = tf.range(rank - 1, rank)
        gamma_shape = static_shape[-1:]
        outputs = nn.l2_normalize(inputs, last_axis, epsilon=1e-12)
        if scaling:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            gamma = variables.model_variable(
                'gamma',
                shape=gamma_shape,
                dtype=base_dtype,
                initializer=scale_initializer,
                collections=gamma_collections,
                trainable=trainable)
            outputs = tf.multiply(outputs, gamma)
        return utils.collect_named_outputs(
            outputs_collections, sc.original_name_scope, outputs)
def init_state(self, state_name, batch_size, dtype, learned_state=False):
    """Creates an initial state compatible with this cell.

    Args:
      state_name: name of the state tensor.
      batch_size: model batch size.
      dtype: dtype for the tensor values i.e. tf.float32.
      learned_state: whether the initial state should be learnable. If
        false, the initial state is set to all 0's.

    Returns:
      The created initial state, packed as a two-element structure.
    """
    if self._flattened_state:
        state_size = self.state_size_flat
    else:
        state_size = self.state_size

    if learned_state:
        # One learnable variable per state component ...
        learned = [
            variables.model_variable(
                state_name + str(index),
                shape=size,
                dtype=dtype,
                initializer=tf.truncated_normal_initializer(stddev=0.03))
            for index, size in enumerate(state_size)
        ]
        # ... duplicated across the batch axis.
        copies = int(batch_size)
        flat_states = [
            tf.stack([var for _ in range(copies)]) for var in learned
        ]
    else:
        flat_states = [
            tf.zeros([batch_size] + size, dtype=dtype, name=state_name)
            for size in state_size
        ]

    # Record the static shape in place, leaving the batch dimension unknown.
    for size, state in zip(state_size, flat_states):
        state.set_shape([None] + size)
    return tf.contrib.framework.nest.pack_sequence_as(
        structure=[1, 1], flat_sequence=flat_states)
def testSeparableConvWithResourceVar(self):
    """Quantize() adds weight and activation quant ops for both depthwise convs.

    Two depthwise convolutions share one resource-variable filter; after
    quantization each conv scope must contain a weights and an activation
    FakeQuantWithMinMaxVars op.
    """
    graph = ops.Graph()
    with graph.as_default():
        with variable_scope.variable_scope('', use_resource=True):
            batch_size, height, width, depth = 5, 128, 128, 3
            input1 = array_ops.zeros((batch_size, height, width, depth))
            kernel_size, depth_multiplier = 3, 1
            depthwise_weights = variables.model_variable(
                'depthwise_weights',
                shape=[kernel_size, kernel_size, depth, depth_multiplier])
            strides = [1, 1, 1, 1]
            with variable_scope.variable_scope('depthwise_conv_1'):
                conv1 = nn.depthwise_conv2d(
                    input1, depthwise_weights, strides, padding='SAME')
            with variable_scope.variable_scope('depthwise_conv_2'):
                conv2 = nn.depthwise_conv2d(
                    conv1, depthwise_weights, strides, padding='SAME')
                math_ops.add(conv2, input1, name='add')

    quantize.Quantize(graph, True)

    # Test that the weights and activations of all convs have been quantized.
    quant_node_name = 'FakeQuantWithMinMaxVars'
    for conv_scope in ('depthwise_conv_1', 'depthwise_conv_2'):
        weights_quant = graph.get_operation_by_name(
            conv_scope + '/weights_quant/' + quant_node_name)
        self.assertEqual(weights_quant.type, quant_node_name)
        act_quant = graph.get_operation_by_name(
            conv_scope + '/act_quant/' + quant_node_name)
        self.assertEqual(act_quant.type, quant_node_name)
def embedding(x, vocab_dim, emb_dim, trainable=True, dtype=tf.float32,
              initializer=None,
              activation_collection=tf.GraphKeys.ACTIVATIONS,
              variable_collection=tf.GraphKeys.MODEL_VARIABLES,
              scope='lookup'):
    """Look up rows of an `embedding` model variable for the ids in `x`.

    Args:
      x: ids to look up.
      vocab_dim: number of rows of the embedding matrix.
      emb_dim: number of columns of the embedding matrix.
      trainable: whether the embedding variable is trainable.
      dtype: dtype of the embedding variable.
      initializer: initializer for the embedding; when `None`, a uniform
        initializer over [-0.5 / emb_dim, 0.5 / emb_dim] is used.
      activation_collection: collection the lookup result is added to, or
        `None` to skip.
      variable_collection: collection the embedding variable is added to,
        or `None` to skip.
      scope: used only as the name of the lookup op.

    Returns:
      The looked-up embeddings.
    """
    if initializer is None:
        half_width = 0.5 / emb_dim
        initializer = tf.random_uniform_initializer(-half_width, half_width)

    table = variables.model_variable(
        'embedding',
        shape=[vocab_dim, emb_dim],
        dtype=dtype,
        initializer=initializer,
        trainable=trainable)
    looked_up = tf.nn.embedding_lookup(table, x, name=scope)

    if activation_collection is not None:
        tf.add_to_collection(activation_collection, looked_up)
    # NOTE(review): model_variable presumably already registers the variable
    # in MODEL_VARIABLES, so the default `variable_collection` may list it
    # twice - confirm whether that is intended.
    if variable_collection is not None:
        tf.add_to_collection(variable_collection, table)
    return looked_up
def joint_weighted_sum_from_feature_columns(columns_to_tensors,
                                            feature_columns,
                                            num_outputs,
                                            weight_collections=None,
                                            trainable=True,
                                            scope=None):
    """A restricted linear prediction builder based on FeatureColumns.

    As long as all feature columns are unweighted sparse columns this
    computes the prediction of a linear model which stores all weights in a
    single variable.

    Args:
      columns_to_tensors: A mapping from feature column to tensors. 'string'
        key means a base feature (not-transformed). It can have
        FeatureColumn as a key too. That means that FeatureColumn is
        already transformed by input pipeline. For example, `inflow` may
        have handled transformations. The mapping is copied before use.
      feature_columns: A set containing all the feature columns. All items
        in the set should be instances of classes derived from
        FeatureColumn.
      num_outputs: An integer specifying number of outputs.
      weight_collections: List of graph collections to which weights are
        added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      A tuple containing:

        * A Tensor which represents predictions of a linear model.
        * A list of Variables storing the weights.
        * A Variable which is used for bias.

    Raises:
      ValueError: if FeatureColumn cannot be used for linear predictions.
      NotImplementedError: if a column does not provide wide embedding
        lookup arguments.
    """
    columns_to_tensors = columns_to_tensors.copy()
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='joint_weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        transformer = _Transformer(columns_to_tensors)
        lookup_arguments = []
        # Deduplicate and visit the columns in key order.
        unique_columns = sorted(set(feature_columns), key=lambda c: c.key)
        for column in unique_columns:
            transformed = transformer.transform(column)
            try:
                # pylint: disable=protected-access
                column_args = column._wide_embedding_lookup_arguments(
                    transformed)
                # pylint: enable=protected-access
            except NotImplementedError:
                raise NotImplementedError(
                    'Real-valued columns are not supported. '
                    'Use weighted_sum_from_feature_columns '
                    'instead, or bucketize these columns.')
            lookup_arguments.append(column_args)

        variable, predictions_no_bias = _create_joint_embedding_lookup(
            columns_to_tensors,
            lookup_arguments,
            num_outputs,
            trainable,
            weight_collections)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=_add_variable_collection(weight_collections))
        _log_variable(bias)
        predictions = nn_ops.bias_add(predictions_no_bias, bias)
        return predictions, variable, bias
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001,
               activation_fn=None,
               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True, reuse=None, variables_collections=None,
               outputs_collections=None, trainable=True, scope=None):
    """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

      "Batch Normalization: Accelerating Deep Network Training by Reducing
      Internal Covariate Shift"

      Sergey Ioffe, Christian Szegedy

    Can be used as a normalizer function for conv2d and fully_connected.

    Args:
      inputs: a tensor of size `[batch_size, height, width, channels]`
        or `[batch_size, channels]`.
      decay: decay for the moving average.
      center: If True, subtract `beta`. If False, `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is
        not used. When the next layer is linear (also e.g. `nn.relu`), this
        can be disabled since the scaling can be done by the next layer.
      epsilon: small float added to variance to avoid dividing by zero.
      activation_fn: Optional activation function.
      updates_collections: collections to collect the update ops for
        computation. If None, a control dependency would be added to make
        sure the updates are computed.
      is_training: whether or not the layer is in training mode. In training
        mode it would accumulate the statistics of the moments into
        `moving_mean` and `moving_variance` using an exponential moving
        average with the given `decay`. When it is not in training mode then
        it would use the values of the `moving_mean` and the
        `moving_variance`.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_op_scope`.

    Returns:
      a tensor representing the output of the operation.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'BatchNorm', reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        dtype = inputs.dtype.base_dtype
        # Moments are taken over every axis except the last one.
        reduce_axes = list(range(len(inputs_shape) - 1))
        params_shape = inputs_shape[-1:]

        def make_param(param_name, param_initializer, param_trainable):
            """Creates one per-channel variable in its own collections."""
            param_collections = utils.get_variable_collections(
                variables_collections, param_name)
            return variables.model_variable(
                param_name,
                shape=params_shape,
                dtype=dtype,
                initializer=param_initializer,
                collections=param_collections,
                trainable=param_trainable)

        # Optional learnable offset and scale.
        beta = None
        gamma = None
        if center:
            beta = make_param('beta', init_ops.zeros_initializer, trainable)
        if scale:
            gamma = make_param('gamma', init_ops.ones_initializer, trainable)

        # Running statistics are never trainable.
        moving_mean = make_param(
            'moving_mean', init_ops.zeros_initializer, False)
        moving_variance = make_param(
            'moving_variance', init_ops.ones_initializer, False)

        if not is_training:
            outputs = nn.batch_normalization(
                inputs, moving_mean, moving_variance, beta, gamma, epsilon)
        else:
            # Calculate the moments based on the individual batch.
            mean, variance = nn.moments(
                inputs, reduce_axes, shift=moving_mean)
            # Update the moving_mean and moving_variance moments.
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
            if updates_collections is not None:
                # Collect the updates to be computed later.
                ops.add_to_collections(
                    updates_collections, update_moving_mean)
                ops.add_to_collections(
                    updates_collections, update_moving_variance)
                outputs = nn.batch_normalization(
                    inputs, mean, variance, beta, gamma, epsilon)
            else:
                # Make sure the updates are computed here.
                with ops.control_dependencies(
                        [update_moving_mean, update_moving_variance]):
                    outputs = nn.batch_normalization(
                        inputs, mean, variance, beta, gamma, epsilon)

        outputs.set_shape(inputs.get_shape())
        if activation_fn:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, outputs)
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=nn.relu,
                    normalizer_fn=None,
                    normalizer_params=None,
                    weights_normalizer_fn=None,
                    weights_normalizer_params=None,
                    weights_initializer=initializers.xavier_initializer(),
                    weights_regularizer=None,
                    biases_initializer=init_ops.zeros_initializer(),
                    biases_regularizer=None,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None):
    """Adds a fully connected layer with optional weight normalization.

    Copied and modified from tensorflow-0.12.0 contrib.layers.fully_connected;
    the `weights_normalizer_*` options were added.

    `fully_connected` creates a variable called `weights`, representing a
    fully connected weight matrix, which is multiplied by the `inputs` to
    produce a `Tensor` of hidden units. If a `weights_normalizer_fn` is
    provided, it is applied to `weights` before the multiplication. If a
    `normalizer_fn` is provided (such as `batch_norm`), it is then applied to
    the product. Otherwise, if `normalizer_fn` is None and a
    `biases_initializer` is provided then a `biases` variable is created and
    added to the hidden units. Finally, if `activation_fn` is not `None`, it
    is applied to the hidden units as well.

    Note: if `inputs` has a rank greater than 2, then `inputs` is flattened
    prior to the initial matrix multiply by `weights` and the result is
    reshaped back afterwards.

    Args:
      inputs: A tensor of at least rank 2 with a known last dimension, e.g.
        `[batch_size, depth]`, `[None, None, None, channels]`.
      num_outputs: Integer or long, the number of output units in the layer.
      activation_fn: Activation function; set to None to skip it and keep a
        linear activation.
      normalizer_fn: Normalization function to use instead of `biases`. If
        provided, `biases_initializer` and `biases_regularizer` are ignored
        and `biases` are neither created nor added.
      normalizer_params: Keyword arguments for `normalizer_fn`.
      weights_normalizer_fn: Function applied to the `weights` variable
        before it is used in the matrix multiply.
      weights_normalizer_params: Keyword arguments for
        `weights_normalizer_fn`.
      weights_initializer: An initializer for the weights.
      weights_regularizer: Optional regularizer for the weights.
      biases_initializer: An initializer for the biases. If None skip biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer, `scope` must be given.
      variables_collections: Optional list of collections for all the
        variables, or a dictionary containing a different list of
        collections per variable.
      outputs_collections: Collection to add the outputs to.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      The tensor representing the result of the series of operations.

    Raises:
      ValueError: If `num_outputs` is not an integer, or if `inputs` has rank
        less than 2 or its last dimension is not set.
    """
    if not isinstance(num_outputs, six.integer_types):
        # Interpolate the offending value; passing it as a second positional
        # argument to ValueError would leave the '%s' unformatted.
        raise ValueError(
            'num_outputs should be int or long, got %s.' % (num_outputs,))

    with variable_scope.variable_scope(
            scope, 'fully_connected', [inputs], reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        dtype = inputs.dtype.base_dtype
        inputs_shape = inputs.get_shape()
        num_input_units = utils.last_dimension(inputs_shape, min_rank=2)

        # Static (graph-construction time) shape of the result.
        static_shape = inputs_shape.as_list()
        static_shape[-1] = num_outputs

        # Dynamic (run time) shape of the result, one scalar per dimension.
        out_shape = array_ops.unpack(
            array_ops.shape(inputs), len(static_shape))
        out_shape[-1] = num_outputs

        weights_shape = [num_input_units, num_outputs]
        weights_collections = utils.get_variable_collections(
            variables_collections, 'weights')
        weights = variables.model_variable(
            'weights',
            shape=weights_shape,
            dtype=dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            collections=weights_collections,
            trainable=trainable)
        if weights_normalizer_fn is not None:
            weights_normalizer_params = weights_normalizer_params or {}
            weights = weights_normalizer_fn(
                weights, **weights_normalizer_params)

        if len(static_shape) > 2:
            # Flatten all leading dimensions so a plain matmul can be used.
            inputs = array_ops.reshape(inputs, [-1, num_input_units])
        outputs = standard_ops.matmul(inputs, weights)

        if normalizer_fn is not None:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)
        else:
            if biases_initializer is not None:
                biases_collections = utils.get_variable_collections(
                    variables_collections, 'biases')
                biases = variables.model_variable(
                    'biases',
                    shape=[num_outputs],
                    dtype=dtype,
                    initializer=biases_initializer,
                    regularizer=biases_regularizer,
                    collections=biases_collections,
                    trainable=trainable)
                outputs = nn.bias_add(outputs, biases)

        if activation_fn is not None:
            outputs = activation_fn(outputs)

        if len(static_shape) > 2:
            # Restore the leading dimensions that were flattened above.
            outputs = array_ops.reshape(outputs, array_ops.pack(out_shape))
            outputs.set_shape(static_shape)
        return utils.collect_named_outputs(
            outputs_collections, sc.original_name_scope, outputs)
def _embeddings_from_arguments(column,
                               args,
                               weight_collections,
                               trainable,
                               output_rank=2):
    """Builds the embedding lookup for `column` from precomputed arguments.

    Three kinds of embedding variables are handled:
      * hashed ("scattered") embeddings when `args.hash_key` is set,
      * a variable shared between columns when `args.shared_embedding_name`
        is set (tracked through a per-name graph collection),
      * a plain per-column `weights` variable otherwise.

    Args:
      column: the column; its `name` and `_checkpoint_path()` are used.
      args: the _DeepEmbeddingLookupArguments for this column.
      weight_collections: collections to store weights in.
      trainable: whether these embeddings should be trainable.
      output_rank: the desired rank of the returned `Tensor`. Inner
        dimensions will be combined to produce the desired rank.

    Returns:
      the embeddings.

    Raises:
      ValueError: if the shared-embedding collection holds more than one
        entry, or the existing shared variable has a different shape.
    """
    # pylint: disable=protected-access
    input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
    weight_tensor = (
        None if args.weight_tensor is None
        else layers._inner_flatten(args.weight_tensor, output_rank))
    # pylint: enable=protected-access

    # This option is only enabled for scattered_embedding_column.
    if args.hash_key:
        hashed_weights = contrib_variables.model_variable(
            name='weights',
            shape=[args.vocab_size],
            dtype=dtypes.float32,
            initializer=args.initializer,
            trainable=trainable,
            collections=weight_collections)
        return embedding_ops.scattered_embedding_lookup_sparse(
            hashed_weights,
            input_tensor,
            args.dimension,
            hash_key=args.hash_key,
            combiner=args.combiner,
            name='lookup')

    if args.shared_embedding_name is None:
        # Column-private embedding matrix.
        embeddings = contrib_variables.model_variable(
            name='weights',
            shape=[args.vocab_size, args.dimension],
            dtype=dtypes.float32,
            initializer=args.initializer,
            trainable=trainable,
            collections=weight_collections)
    else:
        collection_name = ('SHARED_EMBEDDING_COLLECTION_' +
                           args.shared_embedding_name.upper())
        graph = ops.get_default_graph()
        existing = graph.get_collection_ref(collection_name)
        shape = [args.vocab_size, args.dimension]
        if not existing:
            # First column using this shared name: create and register it.
            embeddings = contrib_variables.model_variable(
                name=args.shared_embedding_name,
                shape=shape,
                dtype=dtypes.float32,
                initializer=args.initializer,
                trainable=trainable,
                collections=weight_collections)
            graph.add_to_collection(collection_name, embeddings)
        else:
            if len(existing) > 1:
                raise ValueError('Collection %s can only contain one '
                                 '(partitioned) variable.' % collection_name)
            embeddings = existing[0]
            if embeddings.get_shape() != shape:
                raise ValueError(
                    'The embedding variable with name {} already '
                    'exists, but its shape does not match required '
                    'embedding shape here. Please make sure to use '
                    'different shared_embedding_name for different '
                    'shared embeddings.'.format(args.shared_embedding_name))

    # Normalize to a list of variables, whether or not it is partitioned.
    # pylint: disable=protected-access
    embeddings = (
        [embeddings] if isinstance(embeddings, variables.Variable)
        else embeddings._get_variable_list())
    _maybe_restore_from_checkpoint(column._checkpoint_path(), embeddings)
    # pylint: enable=protected-access
    return embedding_ops.safe_embedding_lookup_sparse(
        embeddings,
        input_tensor,
        sparse_weights=weight_tensor,
        combiner=args.combiner,
        name=column.name + 'weights',
        max_norm=args.max_norm)
def convolution2d(inputs,
                  num_outputs,
                  kernel_size,
                  stride=1,
                  padding='SAME',
                  activation_fn=nn.relu,
                  normalizer_fn=None,
                  normalizer_params=None,
                  weights_initializer=initializers.xavier_initializer(),
                  weights_regularizer=None,
                  biases_initializer=init_ops.zeros_initializer,
                  biases_regularizer=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  scope=None):
    """Adds a 2D convolution followed by an optional normalization layer.

    A `weights` variable holding the convolution kernel is created and
    convolved with `inputs`. When `normalizer_fn` is given (for example
    `batch_norm`) it is applied to the result; otherwise, when
    `biases_initializer` is not None, a `biases` variable is created and
    added. Finally `activation_fn`, if given, is applied.

    Args:
      inputs: a 4-D tensor `[batch_size, height, width, channels]`.
      num_outputs: integer, the number of output filters.
      kernel_size: a list of length 2 `[kernel_height, kernel_width]` of the
        filters. Can be an int if both values are the same.
      stride: a list of length 2 `[stride_height, stride_width]`. Can be an
        int if both strides are the same. Note that presently both strides
        must have the same value.
      padding: one of `VALID` or `SAME`.
      activation_fn: activation function.
      normalizer_fn: normalization function to use instead of `biases`. If
        `normalizer_fn` is provided then `biases_initializer` and
        `biases_regularizer` are ignored and `biases` are not created nor
        added.
      normalizer_params: normalization function parameters.
      weights_initializer: An initializer for the weights.
      weights_regularizer: Optional regularizer for the weights.
      biases_initializer: An initializer for the biases. If None skip biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer, `scope` must be given.
      variables_collections: optional list of collections for all the
        variables or a dictionary containing a different list of collections
        per variable.
      outputs_collections: collection to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_op_scope`.

    Returns:
      a tensor representing the output of the operation.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'Conv', reuse=reuse) as sc:
        dtype = inputs.dtype.base_dtype
        kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
        stride_h, stride_w = utils.two_element_tuple(stride)
        in_channels = utils.last_dimension(inputs.get_shape(), min_rank=4)

        # Convolution kernel.
        kernel_collections = utils.get_variable_collections(
            variables_collections, 'weights')
        kernel = variables.model_variable(
            'weights',
            shape=[kernel_h, kernel_w, in_channels, num_outputs],
            dtype=dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            collections=kernel_collections,
            trainable=trainable)

        strides = [1, stride_h, stride_w, 1]
        net = nn.conv2d(inputs, kernel, strides, padding=padding)

        # Either normalize, or add a bias (never both).
        if normalizer_fn:
            normalizer_params = normalizer_params or {}
            net = normalizer_fn(net, **normalizer_params)
        elif biases_initializer is not None:
            bias_collections = utils.get_variable_collections(
                variables_collections, 'biases')
            bias = variables.model_variable(
                'biases',
                shape=[num_outputs],
                dtype=dtype,
                initializer=biases_initializer,
                regularizer=biases_regularizer,
                collections=bias_collections,
                trainable=trainable)
            net = nn.bias_add(net, bias)

        if activation_fn:
            net = activation_fn(net)
        return utils.collect_named_outputs(outputs_collections, sc.name, net)
def instance_norm(inputs,
                  center=True,
                  scale=True,
                  epsilon=1e-6,
                  activation_fn=None,
                  param_initializers=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  data_format=DATA_FORMAT_NHWC,
                  scope=None):
    """Functional interface for the instance normalization layer.

    Reference: https://arxiv.org/abs/1607.08022.

      "Instance Normalization: The Missing Ingredient for Fast Stylization"
      Dmitry Ulyanov, Andrea Vedaldi, Victor Lempitsky

    Moments are computed per example over every axis except the batch axis
    (axis 0) and the channels axis, then used to normalize `inputs`.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension
        has `batch_size`. The channels axis is the last dimension if
        `data_format` is `NHWC` and the second dimension if `data_format` is
        `NCHW`.
      center: If True, add offset of `beta` to normalized tensor. If False,
        `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is not used.
        When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      epsilon: Small float added to variance to avoid dividing by zero.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      param_initializers: Optional dict of initializers; the keys `'beta'`
        and `'gamma'` are looked up.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer, `scope` must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      data_format: A string. `NHWC` (default) and `NCHW` are supported.
      scope: Optional scope for `variable_scope`.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
      ValueError: If the rank of `inputs` is undefined.
      ValueError: If the channels dimension of `inputs` is undefined.
    """
    inputs = ops.convert_to_tensor(inputs)
    inputs_shape = inputs.shape
    inputs_rank = inputs.shape.ndims

    if inputs_rank is None:
        raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
        raise ValueError('data_format has to be either NCHW or NHWC.')

    with variable_scope.variable_scope(
            scope, 'InstanceNorm', [inputs], reuse=reuse) as sc:
        if data_format == DATA_FORMAT_NCHW:
            reduction_axis = 1
            # For NCHW format, rather than relying on implicit broadcasting,
            # the params are explicitly reshaped to this shape before they
            # are used in the normalization.
            params_shape_broadcast = (
                [1, inputs_shape[1].value] + [1] * (inputs_rank - 2))
        else:
            reduction_axis = inputs_rank - 1
            params_shape_broadcast = None

        # Every axis except the channels axis and the batch axis.
        moments_axes = list(range(inputs_rank))
        del moments_axes[reduction_axis]
        del moments_axes[0]

        params_shape = inputs_shape[reduction_axis:reduction_axis + 1]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s has undefined channels dimension %s.' % (
                    inputs.name, params_shape))

        dtype = inputs.dtype.base_dtype
        if param_initializers is None:
            param_initializers = {}

        def _affine_param(param_name, default_initializer_fn):
            # Creates one per-channel parameter (`beta` or `gamma`) and, for
            # NCHW, reshapes it so it broadcasts against `inputs`.
            collections = utils.get_variable_collections(
                variables_collections, param_name)
            initializer = param_initializers.get(
                param_name, default_initializer_fn())
            param = variables.model_variable(
                param_name,
                shape=params_shape,
                dtype=dtype,
                initializer=initializer,
                collections=collections,
                trainable=trainable)
            if params_shape_broadcast:
                param = array_ops.reshape(param, params_shape_broadcast)
            return param

        # Allocate parameters for the beta and gamma of the normalization.
        beta = (_affine_param('beta', init_ops.zeros_initializer)
                if center else None)
        gamma = (_affine_param('gamma', init_ops.ones_initializer)
                 if scale else None)

        # Calculate the moments (instance activations).
        mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)

        # Compute instance normalization.
        normalized = nn.batch_normalization(
            inputs, mean, variance, beta, gamma, epsilon, name='instancenorm')
        if activation_fn is not None:
            normalized = activation_fn(normalized)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, normalized)
def test(input, scope=None, reuse=None):
    """Creates (or reuses) the `asdf` model variable under `scope`.

    Args:
      input: value passed to `variable_op_scope` as the scope's input.
      scope: optional variable scope name; `'test'` is the default name.
      reuse: forwarded to `variable_op_scope`.

    Returns:
      The trainable `[1, 1]` model variable `asdf`, initialized to 0.
    """
    with variable_scope.variable_op_scope(
            [input], scope, 'test', reuse=reuse):
        asdf = variables.model_variable(
            'asdf',
            [1, 1],
            initializer=tf.constant_initializer(0.),
            trainable=True)
    return asdf
def group_norm(inputs,
               groups=32,
               channels_axis=-1,
               reduction_axes=(-3, -2),
               center=True,
               scale=True,
               epsilon=1e-6,
               activation_fn=None,
               param_initializers=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None,
               mean_close_to_zero=False):
    """Functional interface for the group normalization layer.

    Reference: https://arxiv.org/abs/1803.08494.

      "Group Normalization", Yuxin Wu, Kaiming He

    Args:
      inputs: A Tensor with at least 2 dimensions one which is channels. All
        shape dimensions must be fully defined.
      groups: Integer. Divide the channels into this number of groups over
        which normalization statistics are computed. This number must be
        commensurate with the number of channels in `inputs`.
      channels_axis: An integer. Specifies index of channels axis which will
        be broken into `groups`, each of which whose statistics will be
        computed across. Must be mutually exclusive with `reduction_axes`.
        Preferred usage is to specify negative integers to be agnostic as to
        whether a batch dimension is included.
      reduction_axes: Tuple of integers. Specifies dimensions over which
        statistics will be accumulated. Must be mutually exclusive with
        `channels_axis`. Statistics will not be accumulated across axes not
        specified in `reduction_axes` nor `channel_axis`. Preferred usage is
        to specify negative integers to be agnostic to whether a batch
        dimension is included.

        Some sample usage cases:
          NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
          NCHW format: channels_axis=-3, reduction_axes=[-2, -1]

      center: If True, add offset of `beta` to normalized tensor. If False,
        `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is not used.
        When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      epsilon: Small float added to variance to avoid dividing by zero.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      param_initializers: Optional dict of initializers; the keys `'beta'`
        and `'gamma'` are looked up.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer, `scope` must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      scope: Optional scope for `variable_scope`.
      mean_close_to_zero: The mean of `input` before ReLU will be close to
        zero when batch size >= 4k for Resnet-50 on TPU. If `True`, use
        `nn.sufficient_statistics` and `nn.normalize_moments` to calculate
        the variance. This is the same behavior as `fused` equals `True` in
        batch normalization. If `False`, use `nn.moments` to calculate the
        variance. When `mean` is close to zero, like 1e-4, use `mean` to
        calculate the variance may have poor result due to repeated roundoff
        error and denormalization in `mean`. When `mean` is large, like 1e2,
        sum(`input`^2) is so large that only the high-order digits of the
        elements are being accumulated. Thus, use sum(`input` - `mean`)^2/n
        to calculate the variance has better accuracy compared to
        (sum(`input`^2)/n - `mean`^2) when `mean` is large.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If the rank of `inputs` is undefined.
      ValueError: If rank or channels dimension of `inputs` is undefined.
      ValueError: If number of groups is not commensurate with number of
        channels.
      ValueError: If reduction_axes or channels_axis are out of bounds.
      ValueError: If reduction_axes are not mutually exclusive with
        channels_axis.
    """
    # TODO(shlens): Support partially defined shapes for the inputs.
    inputs = ops.convert_to_tensor(inputs)
    original_shape = inputs.shape

    ndims = inputs.shape.ndims
    if ndims is None:
        raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    if channels_axis > (ndims - 1):
        raise ValueError('Axis is out of bounds.')

    # Standardize the channels_axis to be positive and identify # of
    # channels.
    if channels_axis < 0:
        channels_axis = ndims + channels_axis
    if channels_axis < 0:
        # Still negative after wrapping: the axis was below -ndims.
        raise ValueError('Axis is out of bounds.')
    channels = inputs.shape[channels_axis].value

    if channels is None:
        raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
            inputs.name, channels_axis))

    # Standardize the reduction_axes to be positive.
    reduction_axes = [a + ndims if a < 0 else a for a in reduction_axes]

    for a in reduction_axes:
        # Valid axes are 0 .. ndims - 1; `a == ndims` is already out of
        # bounds and must be rejected before `inputs.shape[a]` is indexed.
        if a < 0 or a >= ndims:
            raise ValueError('Axis is out of bounds.')
        if inputs.shape[a].value is None:
            raise ValueError('Inputs %s has undefined dimensions %d.' % (
                inputs.name, a))
        if channels_axis == a:
            raise ValueError('reduction_axis must be mutually exclusive '
                             'with channels_axis')

    if groups > channels:
        raise ValueError('Invalid groups %d for %d channels.' %
                         (groups, channels))
    if channels % groups != 0:
        raise ValueError('%d channels is not commensurate with %d groups.' %
                         (channels, groups))

    # Determine axes before channels. Some examples of common image formats:
    #  'NCHW': before = [N], after = [HW]
    #  'NHWC': before = [NHW], after = []
    axes_before_channels = inputs.shape.as_list()[:channels_axis]
    axes_after_channels = inputs.shape.as_list()[channels_axis + 1:]

    # Manually broadcast the parameters to conform to the number of groups.
    params_shape_broadcast = ([1] * len(axes_before_channels) +
                              [groups, channels // groups] +
                              [1] * len(axes_after_channels))

    # Reshape the input by the group within the channel dimension.
    inputs_shape = (axes_before_channels + [groups, channels // groups] +
                    axes_after_channels)
    inputs = array_ops.reshape(inputs, inputs_shape)

    # Determine the dimensions across which moments are calculated. The
    # channels axis was split in two, so axes after it shift right by one.
    moments_axes = [channels_axis + 1]
    for a in reduction_axes:
        if a > channels_axis:
            moments_axes.append(a + 1)
        else:
            moments_axes.append(a)

    with variable_scope.variable_scope(
            scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
        # Note that the params_shape is the number of channels always.
        params_shape = [channels]

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        dtype = inputs.dtype.base_dtype
        if param_initializers is None:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=beta_initializer,
                collections=beta_collections,
                trainable=trainable)
            beta = array_ops.reshape(beta, params_shape_broadcast)

        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=gamma_initializer,
                collections=gamma_collections,
                trainable=trainable)
            gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Calculate the moments.
        if mean_close_to_zero:
            # One pass algorithm returns better result when mean is close to
            # zero.
            counts, means_ss, variance_ss, _ = nn.sufficient_statistics(
                inputs, moments_axes, keep_dims=True)
            mean, variance = nn.normalize_moments(
                counts, means_ss, variance_ss, shift=None)
        else:
            mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)

        # Compute normalization.
        # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
        # appropriately so that this operation may be faster.
        gain = math_ops.rsqrt(variance + epsilon)
        offset = -mean * gain
        if gamma is not None:
            gain *= gamma
            offset *= gamma
        if beta is not None:
            offset += beta
        outputs = inputs * gain + offset

        # Collapse the groups into the channel dimension.
        outputs = array_ops.reshape(outputs, original_shape)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, outputs)
def conv2d_leaders(
        inputs,
        num_outputs,
        kernel_size,
        rates=(1,),
        stride=1,
        padding='SAME',
        activation_fn=nn.relu,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=initializers.xavier_initializer(),
        weights_regularizer=None,
        biases_initializer=init_ops.zeros_initializer,
        biases_regularizer=None,
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None,
):
    """Adds a multi-rate 2D convolution that keeps the strongest response.

    A single `weights` kernel is created and convolved with `inputs` once
    per entry of `rates` (a standard convolution for rates <= 1, an a'trous
    convolution otherwise), always with unit stride and `SAME` padding. The
    per-rate results are merged with an element-wise maximum. The requested
    `padding` / `stride` are then applied through a depthwise convolution
    with a filter that is 1 at the kernel center and 0 elsewhere.

    If a `normalizer_fn` is provided (such as `batch_norm`), it is then
    applied. Otherwise, if `normalizer_fn` is None and a
    `biases_initializer` is provided then a `biases` variable is created and
    added to the activations. Finally, if `activation_fn` is not `None`, it
    is applied to the activations as well.

    Args:
      inputs: a 4-D tensor `[batch_size, height, width, channels]`.
      num_outputs: integer, the number of output filters.
      kernel_size: a list of length 2 `[kernel_height, kernel_width]` of the
        filters. Can be an int if both values are the same.
      rates: non-empty iterable of integers. For each rate greater than 1 an
        a'trous convolution with that rate is used; otherwise a standard
        convolution is used.
      stride: a list of length 2 `[stride_height, stride_width]`. Can be an
        int if both strides are the same.
      padding: one of `VALID` or `SAME`.
      activation_fn: activation function.
      normalizer_fn: normalization function to use instead of `biases`. If
        `normalizer_fn` is provided then `biases_initializer` and
        `biases_regularizer` are ignored and `biases` are not created nor
        added.
      normalizer_params: normalization function parameters.
      weights_initializer: An initializer for the weights.
      weights_regularizer: Optional regularizer for the weights.
      biases_initializer: An initializer for the biases. If None skip biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer, `scope` must be given.
      variables_collections: optional list of collections for all the
        variables or a dictionary containing a different list of collections
        per variable.
      outputs_collections: collection to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.

    Returns:
      a tensor representing the output of the operation.

    Raises:
      ValueError: if `rates` is empty.
    """
    rates = list(rates)
    if not rates:
        raise ValueError('`rates` must contain at least one rate.')

    with variable_scope.variable_scope(
            scope, 'Conv', [inputs], reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        dtype = inputs.dtype.base_dtype
        kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
        stride_h, stride_w = utils.two_element_tuple(stride)
        num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4)

        # Weights variable, shared by every rate.
        weights_shape = [kernel_h, kernel_w, num_filters_in, num_outputs]
        weights_collections = utils.get_variable_collections(
            variables_collections, 'weights')
        weights = variables.model_variable(
            'weights',
            shape=weights_shape,
            dtype=dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            collections=weights_collections,
            trainable=trainable)

        # Convolution at different scales.
        outputs_pool = []
        for rate in rates:
            if rate > 1:
                conv = nn.atrous_conv2d(inputs, weights, rate, padding='SAME')
            else:
                conv = nn.conv2d(inputs, weights, [1, 1, 1, 1],
                                 padding='SAME')
            outputs_pool.append(conv)

        # 'Pooling' at different scales: element-wise max, last rate first.
        outputs = None
        for node in reversed(outputs_pool):
            outputs = node if outputs is None else tf.maximum(outputs, node)

        # Fix padding and stride. A bit hacky and not so efficient: run an
        # identity depthwise convolution with the requested padding/stride.
        if padding == 'VALID' or stride_h > 1 or stride_w > 1:
            # The filter acts on `outputs`, which has `num_outputs` channels
            # (not `num_filters_in`), and must be built with a NumPy dtype.
            padfilter = np.zeros(
                shape=(kernel_h, kernel_w, num_outputs, 1),
                dtype=dtype.as_numpy_dtype)
            # Integer division: array indices must be ints.
            center_h = (kernel_h - 1) // 2
            center_w = (kernel_w - 1) // 2
            padfilter[center_h, center_w, :, 0] = 1.
            outputs = tf.nn.depthwise_conv2d(
                outputs, padfilter, [1, stride_h, stride_w, 1],
                padding=padding)

        # Batch norm / bias and activation...
        if normalizer_fn is not None:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)
        else:
            if biases_initializer is not None:
                biases_collections = utils.get_variable_collections(
                    variables_collections, 'biases')
                biases = variables.model_variable(
                    'biases',
                    shape=[num_outputs],
                    dtype=dtype,
                    initializer=biases_initializer,
                    regularizer=biases_regularizer,
                    collections=biases_collections,
                    trainable=trainable)
                outputs = nn.bias_add(outputs, biases)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, outputs)
def _build(self, inputs, is_training=True): """ Args: inputs: A Tensor of shape `(batch_size, height, width, channels)`. Returns: A dict of feature maps to be consumed by an SSD network """ # TODO: Is there a better way to manage scoping in these cases? scope = self.module_name if self.parent_name: scope = self.parent_name + '/' + scope base_net_endpoints = super(SSDFeatureExtractor, self)._build( inputs, is_training=is_training)['end_points'] if self.vgg_16_type: # The original SSD paper uses a modified version of the vgg16 # network, which we'll modify here vgg_network_truncation_endpoint = base_net_endpoints[ scope + '/vgg_16/conv5/conv5_3'] # As it is pointed out in SSD and ParseNet papers, `conv4_3` has a # different features scale compared to other layers, to adjust it # we need to add a spatial normalization before adding the # predictors. vgg_conv4_3_name = scope + '/vgg_16/conv4/conv4_3' vgg_conv4_3 = base_net_endpoints[vgg_conv4_3_name] with tf.variable_scope(vgg_conv4_3_name + '_norm'): inputs_shape = vgg_conv4_3.shape inputs_rank = inputs_shape.ndims dtype = vgg_conv4_3.dtype.base_dtype norm_dim = tf.range(inputs_rank - 1, inputs_rank) params_shape = inputs_shape[-1:] # Normalize. vgg_conv4_3_norm = tf.nn.l2_normalize( vgg_conv4_3, norm_dim, epsilon=1e-12 ) # Scale. 
# TODO use tf.get_variable and initialize # to 20 as described in paper scale = variables.model_variable( 'gamma', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer() ) vgg_conv4_3_norm = tf.multiply(vgg_conv4_3_norm, scale) tf.add_to_collection('FEATURE_MAPS', vgg_conv4_3_norm) # Extra layers for vgg16 as detailed in paper self._init_vgg16_extra_layers() with tf.variable_scope('extra_feature_layers'): net = tf.nn.max_pool( vgg_network_truncation_endpoint, [1, 3, 3, 1], padding='SAME', strides=[1, 1, 1, 1], name='pool5' ) net = self.conv6(net) net = self.activation_fn(net) net = self.conv7(net) net = self.activation_fn(net) tf.add_to_collection('FEATURE_MAPS', net) net = self.conv8_1(net) net = self.activation_fn(net) net = self.conv8_2(net) net = self.activation_fn(net) tf.add_to_collection('FEATURE_MAPS', net) net = self.conv9_1(net) net = self.activation_fn(net) net = self.conv9_2(net) net = self.activation_fn(net) tf.add_to_collection('FEATURE_MAPS', net) net = self.conv10_1(net) net = self.activation_fn(net) net = self.conv10_2(net) net = self.activation_fn(net) tf.add_to_collection('FEATURE_MAPS', net) net = self.conv11_1(net) net = self.activation_fn(net) net = self.conv11_2(net) net = self.activation_fn(net) tf.add_to_collection('FEATURE_MAPS', net) # This parameter determines onto which variables we try to load the # pretrained weights self.pretrained_weights_scope = scope + '/vgg_16' # It's actually an ordered dict return utils.convert_collection_to_dict('FEATURE_MAPS')
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=nn.relu,
                    normalizer_fn=None,
                    normalizer_params=None,
                    weights_initializer=initializers.xavier_initializer(),
                    weights_regularizer=None,
                    biases_initializer=init_ops.zeros_initializer,
                    biases_regularizer=None,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    scope=None):
    """Adds a fully connected layer.

    A `weights` variable of shape `[last_dim_of_inputs, num_outputs]` is
    created and matrix-multiplied with `inputs`. If `normalizer_fn` is given
    it is applied to the product; otherwise, when `biases_initializer` is not
    None, a `biases` variable is created and added. Finally `activation_fn`,
    if given, is applied.

    Note: `inputs` of rank greater than 2 are reshaped to
    `[-1, last_dim_of_inputs]` before the multiply and reshaped back to the
    input shape (with the last dimension replaced by `num_outputs`) afterwards.

    Args:
      inputs: A tensor of at least rank 2 with a known last dimension, e.g.
        `[batch_size, depth]` or `[None, None, None, channels]`.
      num_outputs: Integer, the number of output units in the layer.
      activation_fn: Activation function, or None to skip it.
      normalizer_fn: Normalization function to use instead of `biases`. When
        provided, `biases_initializer` and `biases_regularizer` are ignored
        and `biases` are neither created nor added.
      normalizer_params: Keyword arguments passed to `normalizer_fn`.
      weights_initializer: An initializer for the weights.
      weights_regularizer: Optional regularizer for the weights.
      biases_initializer: An initializer for the biases. If None, skip biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer, `scope` must be given.
      variables_collections: Optional list of collections for all the
        variables, or a dictionary containing a different list of collections
        per variable.
      outputs_collections: Collection to add the outputs to.
      scope: Optional scope for `variable_op_scope`.

    Returns:
      The tensor representing the result of the series of operations.

    Raises:
      ValueError: If `inputs` has rank less than 2 or its last dimension is
        not set (raised by `utils.last_dimension`).
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'fully_connected', reuse=reuse) as layer_scope:
        base_dtype = inputs.dtype.base_dtype
        input_shape = inputs.get_shape()
        depth_in = utils.last_dimension(input_shape, min_rank=2)

        # Output shape in static (Python list) and dynamic (list of scalar
        # tensors) form: the input shape with its last entry replaced.
        static_out_shape = input_shape.as_list()
        static_out_shape[-1] = num_outputs
        dynamic_out_shape = array_ops.unpack(array_ops.shape(inputs))
        dynamic_out_shape[-1] = num_outputs
        needs_flattening = len(static_out_shape) > 2

        weights = variables.model_variable(
            'weights',
            shape=[depth_in, num_outputs],
            dtype=base_dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            collections=utils.get_variable_collections(
                variables_collections, 'weights'))

        if needs_flattening:
            # Collapse every leading dimension so a plain matmul applies.
            inputs = array_ops.reshape(inputs, [-1, depth_in])
        outputs = standard_ops.matmul(inputs, weights)

        if normalizer_fn:
            outputs = normalizer_fn(outputs, **(normalizer_params or {}))
        elif biases_initializer is not None:
            biases = variables.model_variable(
                'biases',
                shape=[num_outputs],
                dtype=base_dtype,
                initializer=biases_initializer,
                regularizer=biases_regularizer,
                collections=utils.get_variable_collections(
                    variables_collections, 'biases'))
            outputs = nn.bias_add(outputs, biases)

        if needs_flattening:
            # Restore the leading dimensions that were collapsed above.
            outputs = array_ops.reshape(
                outputs, array_ops.pack(dynamic_out_shape))
            outputs.set_shape(static_out_shape)

        if activation_fn:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, layer_scope.name, outputs)
def group_norm(inputs,
               groups=32,
               channels_axis=-1,
               reduction_axes=(-3, -2),
               center=True,
               scale=True,
               epsilon=1e-6,
               activation_fn=None,
               param_initializers=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None,
               mean_close_to_zero=False):
    """Functional interface for the group normalization layer.

    Reference: https://arxiv.org/abs/1803.08494.

      "Group Normalization", Yuxin Wu, Kaiming He

    Args:
      inputs: A Tensor with at least 2 dimensions, one of which is channels.
        All shape dimensions must be fully defined.
      groups: Integer. Divide the channels into this number of groups over
        which normalization statistics are computed. This number must be
        commensurate with the number of channels in `inputs`.
      channels_axis: An integer. Specifies index of channels axis which will
        be broken into `groups`, each of which whose statistics will be
        computed across. Must be mutually exclusive with `reduction_axes`.
        Preferred usage is to specify negative integers to be agnostic as to
        whether a batch dimension is included.
      reduction_axes: Tuple of integers. Specifies dimensions over which
        statistics will be accumulated. Must be mutually exclusive with
        `channels_axis`. Statistics will not be accumulated across axes not
        specified in `reduction_axes` nor `channels_axis`. Preferred usage is
        to specify negative integers to be agnostic to whether a batch
        dimension is included.

        Some sample usage cases:
          NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
          NCHW format: channels_axis=-3, reduction_axes=[-2, -1]

      center: If True, add offset of `beta` to normalized tensor. If False,
        `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is not used.
        When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      epsilon: Small float added to variance to avoid dividing by zero.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      param_initializers: Optional dict of initializers; the keys `'beta'`
        and `'gamma'` are read.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      scope: Optional scope for `variable_scope`.
      mean_close_to_zero: The mean of `input` before ReLU will be close to
        zero when batch size >= 4k for Resnet-50 on TPU. If `True`, use
        `nn.sufficient_statistics` and `nn.normalize_moments` to calculate
        the variance. This is the same behavior as `fused` equals `True` in
        batch normalization. If `False`, use `nn.moments` to calculate the
        variance. When `mean` is close to zero, like 1e-4, using `mean` to
        calculate the variance may give poor results due to repeated roundoff
        error and denormalization in `mean`. When `mean` is large, like 1e2,
        sum(`input`^2) is so large that only the high-order digits of the
        elements are being accumulated. Thus, using sum(`input` - `mean`)^2/n
        to calculate the variance has better accuracy compared to
        (sum(`input`^2)/n - `mean`^2) when `mean` is large.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If the rank of `inputs` is undefined.
      ValueError: If the channels dimension or a reduction dimension of
        `inputs` is undefined.
      ValueError: If number of groups is not positive, exceeds the number of
        channels, or is not commensurate with the number of channels.
      ValueError: If reduction_axes or channels_axis are out of bounds.
      ValueError: If reduction_axes are not mutually exclusive with
        channels_axis.
    """
    # TODO(shlens): Support partially defined shapes for the inputs.
    inputs = ops.convert_to_tensor(inputs)
    original_shape = inputs.shape
    ndims = inputs.shape.ndims

    if ndims is None:
        raise ValueError('Inputs %s has undefined rank.' % inputs.name)

    # Standardize the channels_axis to be positive and identify # of
    # channels. The range is checked *after* normalization so that axes
    # below -ndims are rejected too instead of silently wrapping around.
    if channels_axis < 0:
        channels_axis += ndims
    if not 0 <= channels_axis < ndims:
        raise ValueError('Axis is out of bounds.')
    channels = inputs.shape[channels_axis].value

    if channels is None:
        raise ValueError('Inputs %s has undefined channel dimension: %d.' %
                         (inputs.name, channels_axis))

    # Standardize the reduction_axes to be positive.
    reduction_axes = [a + ndims if a < 0 else a for a in reduction_axes]

    for a in reduction_axes:
        # Valid axes are 0 .. ndims - 1; `a == ndims` used to slip through
        # and fail with an IndexError on the shape lookup below.
        if not 0 <= a < ndims:
            raise ValueError('Axis is out of bounds.')
        if inputs.shape[a].value is None:
            raise ValueError('Inputs %s has undefined dimensions %d.' %
                             (inputs.name, a))
        if channels_axis == a:
            raise ValueError('reduction_axis must be mutually exclusive '
                             'with channels_axis')

    # A non-positive group count would otherwise reach the modulo / reshape
    # below and fail with an unrelated error.
    if groups < 1 or groups > channels:
        raise ValueError('Invalid groups %d for %d channels.' %
                         (groups, channels))
    if channels % groups != 0:
        raise ValueError('%d channels is not commensurate with %d groups.' %
                         (channels, groups))

    # Determine axes before channels. Some examples of common image formats:
    #  'NCHW': before = [N], after = [HW]
    #  'NHWC': before = [NHW], after = []
    static_dims = inputs.shape.as_list()
    axes_before_channels = static_dims[:channels_axis]
    axes_after_channels = static_dims[channels_axis + 1:]

    # Manually broadcast the parameters to conform to the number of groups.
    params_shape_broadcast = ([1] * len(axes_before_channels) +
                              [groups, channels // groups] +
                              [1] * len(axes_after_channels))

    # Reshape the input by the group within the channel dimension.
    inputs_shape = (axes_before_channels + [groups, channels // groups] +
                    axes_after_channels)
    inputs = array_ops.reshape(inputs, inputs_shape)

    # Determine the dimensions across which moments are calculated. The
    # channel axis was split in two, so every axis after it shifts by one.
    moments_axes = [channels_axis + 1]
    for a in reduction_axes:
        moments_axes.append(a + 1 if a > channels_axis else a)

    with variable_scope.variable_scope(
            scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
        # Note that the params_shape is the number of channels always.
        params_shape = [channels]

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        dtype = inputs.dtype.base_dtype
        if param_initializers is None:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=beta_initializer,
                collections=beta_collections,
                trainable=trainable)
            beta = array_ops.reshape(beta, params_shape_broadcast)

        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=gamma_initializer,
                collections=gamma_collections,
                trainable=trainable)
            gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Calculate the moments.
        if mean_close_to_zero:
            # One pass algorithm returns better result when mean is close to
            # zero.
            counts, means_ss, variance_ss, _ = nn.sufficient_statistics(
                inputs, moments_axes, keep_dims=True)
            mean, variance = nn.normalize_moments(
                counts, means_ss, variance_ss, shift=None)
        else:
            mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)

        # Compute normalization.
        # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
        # appropriately so that this operation may be faster.
        gain = math_ops.rsqrt(variance + epsilon)
        offset = -mean * gain
        if gamma is not None:
            gain *= gamma
            offset *= gamma
        if beta is not None:
            offset += beta
        outputs = inputs * gain + offset

        # Collapse the groups into the channel dimension.
        outputs = array_ops.reshape(outputs, original_shape)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None):
    """Batch normalization whose train/inference mode is selectable at run time.

    Code modification of tensorflow/contrib/layers/python/layers/layers.py

    Two outputs are built: one normalized with the moments of the current
    batch and one normalized with the moving averages. `is_training` picks
    between them. The moving-average update ops are created regardless of
    `is_training`.

    Args:
      inputs: A tensor of known rank with a fully defined last dimension.
        Moments are computed over all axes except the last.
      decay: Decay for the moving averages.
      center: If True, create a `beta` variable and pass it as the offset.
      scale: If True, create a `gamma` variable and pass it as the scale.
      epsilon: Small float passed to `nn.batch_normalization` as the
        variance epsilon.
      updates_collections: Collections to add the moving-average update ops
        to. If None, the updates are attached as control dependencies of the
        batch-statistics output instead.
      is_training: Either a Python bool or a scalar boolean tensor. When
        true, the output normalized with batch statistics is returned;
        otherwise the one normalized with the moving averages.
      reuse: Whether or not the layer and its variables should be reused.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs to.
      trainable: Passed to the `beta`/`gamma` variables.
      scope: Optional scope for `variable_op_scope`.

    Returns:
      A tensor with the same static shape as `inputs`.

    Raises:
      ValueError: If the rank or the last dimension of `inputs` is undefined.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'BatchNorm', reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        axis = list(range(inputs_rank - 1))
        params_shape = inputs_shape[-1:]
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined last dimension %s.' %
                             (inputs.name, params_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer,
                collections=beta_collections,
                trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer,
                collections=gamma_collections,
                trainable=trainable)

        # Create moving_mean and moving_variance variables and add them to
        # the appropriate collections.
        moving_mean_collections = utils.get_variable_collections(
            variables_collections, 'moving_mean')
        moving_mean = variables.model_variable(
            'moving_mean',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=False,
            collections=moving_mean_collections)
        moving_variance_collections = utils.get_variable_collections(
            variables_collections, 'moving_variance')
        moving_variance = variables.model_variable(
            'moving_variance',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.ones_initializer,
            trainable=False,
            collections=moving_variance_collections)

        # Calculate the moments based on the individual batch.
        mean, variance = nn.moments(inputs, axis, shift=moving_mean)

        # Update the moving_mean and moving_variance moments.
        update_moving_mean = moving_averages.assign_moving_average(
            moving_mean, mean, decay)
        update_moving_variance = moving_averages.assign_moving_average(
            moving_variance, variance, decay)

        if updates_collections is None:
            # Make sure the updates are computed here.
            with ops.control_dependencies(
                    [update_moving_mean, update_moving_variance]):
                train_outputs = nn.batch_normalization(
                    inputs, mean, variance, beta, gamma, epsilon)
        else:
            # Collect the updates to be computed later.
            ops.add_to_collections(updates_collections, update_moving_mean)
            ops.add_to_collections(updates_collections,
                                   update_moving_variance)
            train_outputs = nn.batch_normalization(
                inputs, mean, variance, beta, gamma, epsilon)

        test_outputs = nn.batch_normalization(
            inputs, moving_mean, moving_variance, beta, gamma, epsilon)

        if isinstance(is_training, bool):
            # `tf.cond` rejects Python bools ("pred must not be a Python
            # bool"), which made the default `is_training=True` unusable.
            # A static flag needs no graph-level conditional anyway.
            outputs = train_outputs if is_training else test_outputs
        else:
            outputs = tf.cond(is_training,
                              lambda: train_outputs,
                              lambda: test_outputs)
        outputs.set_shape(inputs_shape)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
def depth_conv2d(inputs,
                 kernel_size,
                 stride=1,
                 channel_multiplier=1,
                 padding='SAME',
                 data_format=DATA_FORMAT_NHWC,
                 rate=1,
                 activation_fn=nn.relu,
                 normalizer_fn=None,
                 normalizer_params=None,
                 weights_initializer=initializers.xavier_initializer(),
                 weights_regularizer=None,
                 biases_initializer=init_ops.zeros_initializer(),
                 biases_regularizer=None,
                 reuse=None,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
                 scope=None):
    """Adds a depthwise 2-D convolution layer.

    Creates a `depthwise_weights` variable of shape
    `[kernel_h, kernel_w, in_channels, channel_multiplier]` and applies
    `nn.depthwise_conv2d`. If `normalizer_fn` is given it is applied to the
    result; otherwise, when `biases_initializer` is not None, a `biases`
    variable with one entry per output channel
    (`in_channels * channel_multiplier`) is added. Finally `activation_fn`,
    if given, is applied.

    Args:
      inputs: A tensor of rank 4 laid out according to `data_format`.
      kernel_size: Kernel height and width; a single int or a pair.
      stride: Stride along height and width; a single int or a pair.
      channel_multiplier: Number of depthwise filters per input channel.
      padding: Padding scheme passed to `nn.depthwise_conv2d`.
      data_format: `DATA_FORMAT_NHWC` (default) or `DATA_FORMAT_NCHW`.
      rate: Dilation rate; a single int or a pair.
      activation_fn: Activation function, or None to skip it.
      normalizer_fn: Normalization function to use instead of `biases`.
      normalizer_params: Keyword arguments passed to `normalizer_fn`.
      weights_initializer: An initializer for the depthwise weights.
      weights_regularizer: Optional regularizer for the depthwise weights.
      biases_initializer: An initializer for the biases. If None, skip biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: Whether or not the layer and its variables should be reused.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs to.
      trainable: Whether the created variables are trainable.
      scope: Optional scope for `variable_scope`.

    Returns:
      A tensor representing the output of the operation.

    Raises:
      ValueError: If `data_format` is neither NCHW nor NHWC.
    """
    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
        raise ValueError('data_format has to be either NCHW or NHWC.')
    layer_variable_getter = _build_variable_getter({
        'bias': 'biases',
        'depthwise_kernel': 'depthwise_weights'
    })

    # NOTE: the default scope name is kept as 'SeparableConv2d' so existing
    # variable names (and checkpoints) stay valid, even though only the
    # depthwise part of a separable convolution is applied here.
    with variable_scope.variable_scope(
            scope,
            'SeparableConv2d', [inputs],
            reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)

        channels_first = data_format.startswith('NC')
        df = 'channels_first' if channels_first else 'channels_last'

        dtype = inputs.dtype.base_dtype
        kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
        stride_h, stride_w = utils.two_element_tuple(stride)
        num_filters_in = utils.channel_dimension(
            inputs.get_shape(), df, min_rank=4)
        weights_collections = utils.get_variable_collections(
            variables_collections, 'weights')

        depthwise_shape = [
            kernel_h, kernel_w, num_filters_in, channel_multiplier
        ]
        depthwise_weights = variables.model_variable(
            'depthwise_weights',
            shape=depthwise_shape,
            dtype=dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            trainable=trainable,
            collections=weights_collections)
        if channels_first:
            strides = [1, 1, stride_h, stride_w]
        else:
            strides = [1, stride_h, stride_w, 1]

        outputs = nn.depthwise_conv2d(
            inputs,
            depthwise_weights,
            strides,
            padding,
            rate=utils.two_element_tuple(rate),
            data_format=data_format)
        # A depthwise convolution emits `channel_multiplier` channels per
        # input channel; sizing the bias by the input channels alone made
        # `bias_add` fail whenever channel_multiplier != 1.
        num_outputs = num_filters_in * channel_multiplier

        if normalizer_fn is not None:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)
        elif biases_initializer is not None:
            biases_collections = utils.get_variable_collections(
                variables_collections, 'biases')
            biases = variables.model_variable(
                'biases',
                shape=[num_outputs],
                dtype=dtype,
                initializer=biases_initializer,
                regularizer=biases_regularizer,
                trainable=trainable,
                collections=biases_collections)
            outputs = nn.bias_add(outputs, biases, data_format=data_format)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
def layer_norm_custom(inputs,
                      center=True,
                      scale=True,
                      activation_fn=None,
                      reuse=None,
                      variables_collections=None,
                      outputs_collections=None,
                      trainable=True,
                      epsilon=1E-12,
                      scope=None):
    """Adds a Layer Normalization layer backed by the custom `cMod` ops.

    Reference: https://arxiv.org/abs/1607.06450.

      "Layer Normalization"

      Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton

    Can be used as a normalizer function for conv2d and fully_connected.

    Which custom op runs depends on the flags:
      * `scale` (with or without `center`): `layer_norm_fused_custom`; when
        `center` is False a constant all-zeros beta is supplied.
      * `center` only: `layer_norm_bias_add_custom`.
      * neither: `layer_norm_custom`.

    Args:
      inputs: A tensor of known rank whose last dimension is fully defined.
        NOTE(review): the exact axes normalized are decided inside the
        `cMod` ops, which are not visible here.
      center: If True, create a `beta` variable and hand it to the op.
      scale: If True, create a `gamma` variable and hand it to the op.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      epsilon: Small value forwarded to the op as `epsilon`; a warning is
        printed when it is not strictly positive.
      scope: Optional scope for `variable_scope`.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If rank or last dimension of `inputs` is undefined.
    """
    with variable_scope.variable_scope(
            scope, 'LayerNorm', [inputs], reuse=reuse) as layer_scope:
        inputs = ops.convert_to_tensor(inputs)
        input_shape = inputs.get_shape()
        if input_shape.ndims is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        base_dtype = inputs.dtype.base_dtype
        params_shape = input_shape[-1:]
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined last dimension %s.' %
                             (inputs.name, params_shape))

        # Learned offset and gain; each exists only when requested.
        beta = None
        gamma = None
        if center:
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=base_dtype,
                initializer=init_ops.zeros_initializer(),
                collections=utils.get_variable_collections(
                    variables_collections, 'beta'),
                trainable=trainable)
        if scale:
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=base_dtype,
                initializer=init_ops.ones_initializer(),
                collections=utils.get_variable_collections(
                    variables_collections, 'gamma'),
                trainable=trainable)

        if epsilon <= 0:
            print("WARNING: epsilon <=0, may result in NaN outputs.")

        if scale:
            if not center:
                # The fused op always takes a beta, so feed it zeros.
                beta = tf.zeros(params_shape, dtype=base_dtype,
                                name="dummy_beta")
            outputs = cMod.layer_norm_fused_custom(
                inputs, gamma, beta, epsilon=epsilon)
        elif center:
            outputs = cMod.layer_norm_bias_add_custom(
                inputs, beta, epsilon=epsilon)
        else:
            outputs = cMod.layer_norm_custom(inputs, epsilon=epsilon)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, layer_scope.original_name_scope, outputs)
def spatial_softmax(features,
                    temperature=None,
                    name=None,
                    variables_collections=None,
                    trainable=True,
                    data_format='NHWC'):
    """Computes the spatial softmax of a convolutional feature map.

    First computes the softmax over the spatial extent of each channel of a
    convolutional feature map. Then computes the expected 2D position of the
    points of maximal activation for each channel, resulting in a set of
    feature keypoints [x1, y1, ... xN, yN] for all N channels.

    Read more here:
    "Learning visual feature spaces for robotic manipulation with
    deep spatial autoencoders." Finn et al., http://arxiv.org/abs/1509.06113.

    Args:
      features: A `Tensor` of size [batch_size, height, width, num_channels]
        for `NHWC`, or [batch_size, num_channels, height, width] for `NCHW`;
        the convolutional feature map. The `x` coordinate runs along the
        first spatial axis and `y` along the second.
      temperature: Softmax temperature (optional). If None, a learnable
        temperature is created.
      name: A name for this operation (optional).
      variables_collections: Collections for the temperature variable.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      data_format: A string. `NHWC` (default) and `NCHW` are supported.

    Returns:
      feature_keypoints: A `Tensor` with size [batch_size, num_channels * 2];
        the expected 2D locations of each channel's feature keypoint
        (coordinates are taken from a grid spanning [-1, 1]). The inner
        dimension is arranged as [x1, y1, ... xN, yN].

    Raises:
      ValueError: If unexpected data_format specified.
      ValueError: If num_channels dimension is unspecified.
    """
    # `data_format` used to be accepted but never read, so NCHW inputs were
    # silently interpreted as NHWC and invalid values went unnoticed.
    if data_format not in ('NHWC', 'NCHW'):
        raise ValueError('data_format has to be either NCHW or NHWC.')
    channels_last = data_format == 'NHWC'

    shape = array_ops.shape(features)
    static_shape = features.shape
    if channels_last:
        height, width, num_channels = shape[1], shape[2], static_shape[3]
    else:
        num_channels, height, width = static_shape[1], shape[2], shape[3]
    if num_channels.value is None:
        raise ValueError('The num_channels dimension of the inputs to '
                         '`spatial_softmax` should be defined. Found `None`.')

    with ops.name_scope(name, 'spatial_softmax', [features]) as name:
        # Create tensors for x and y coordinate values, scaled to range
        # [-1, 1].
        pos_x, pos_y = array_ops.meshgrid(
            math_ops.lin_space(-1., 1., num=height),
            math_ops.lin_space(-1., 1., num=width),
            indexing='ij')
        pos_x = array_ops.reshape(pos_x, [height * width])
        pos_y = array_ops.reshape(pos_y, [height * width])
        if temperature is None:
            temperature_collections = utils.get_variable_collections(
                variables_collections, name + 'temperature')
            temperature = variables.model_variable(
                name + 'temperature',
                shape=(),
                dtype=dtypes.float32,
                initializer=init_ops.ones_initializer(),
                collections=temperature_collections,
                trainable=trainable)

        # Flatten to one row per (batch item, channel) holding that
        # channel's spatial map. NHWC needs the channel axis moved in front
        # of the spatial axes first; NCHW is already in that order.
        if channels_last:
            features = array_ops.transpose(features, [0, 3, 1, 2])
        features = array_ops.reshape(features, [-1, height * width])

        softmax_attention = nn.softmax(features / temperature)
        expected_x = math_ops.reduce_sum(
            pos_x * softmax_attention, [1], keep_dims=True)
        expected_y = math_ops.reduce_sum(
            pos_y * softmax_attention, [1], keep_dims=True)
        expected_xy = array_ops.concat([expected_x, expected_y], 1)
        feature_keypoints = array_ops.reshape(
            expected_xy, [-1, num_channels.value * 2])
        feature_keypoints.set_shape([None, num_channels.value * 2])
    return feature_keypoints
def l2_normalization(inputs,
                     scaling=False,
                     scale_initializer=init_ops.ones_initializer(),
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     data_format='NHWC',
                     trainable=True,
                     scope=None):
    """L2-normalizes the channel vector at every spatial location.

    The normalization runs along the channel axis only (the last axis for
    `NHWC`, axis 1 for `NCHW`). Should be extended in some near future to
    other dimensions, providing a more flexible normalization framework.

    Args:
      inputs: A 4-D tensor with dimensions [batch_size, height, width,
        channels] (`NHWC`) or [batch_size, channels, height, width] (`NCHW`).
      scaling: Whether or not to multiply the normalized output by a
        per-channel `gamma` variable.
      scale_initializer: An initializer for the `gamma` variable.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: Optional list of collections for all the
        variables or a dictionary containing a different list of collection
        per variable.
      outputs_collections: Collection to add the outputs.
      data_format: NHWC or NCHW data format.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If `data_format` is neither NHWC nor NCHW.
    """
    # Reject unsupported formats up front; previously they fell through
    # both branches below and crashed with an unbound-variable error.
    if data_format not in ('NHWC', 'NCHW'):
        raise ValueError('data_format has to be either NCHW or NHWC.')
    channels_last = data_format == 'NHWC'

    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        dtype = inputs.dtype.base_dtype
        if channels_last:
            norm_dim = tf.range(inputs_rank - 1, inputs_rank)
            params_shape = inputs_shape[-1:]
        else:
            norm_dim = tf.range(1, 2)
            # One-element shape holding the channel dimension, mirroring
            # the NHWC branch.
            params_shape = inputs_shape[1:2]

        # Normalize along the channel axis.
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)

        # Additional scaling.
        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=scale_initializer,
                collections=scale_collections,
                trainable=trainable)
            if not channels_last:
                # Give the per-channel scale shape [C, 1, 1] so that it
                # broadcasts over the trailing height and width axes.
                scale = tf.expand_dims(scale, axis=-1)
                scale = tf.expand_dims(scale, axis=-1)
            outputs = tf.multiply(outputs, scale)

        return utils.collect_named_outputs(
            outputs_collections, sc.original_name_scope, outputs)
def batch_norm_mine_old(inputs,
                        decay=0.999,
                        center=True,
                        scale=False,
                        epsilon=0.001,
                        activation_fn=None,
                        param_initializers=None,
                        param_regularizers=None,
                        updates_collections=ops.GraphKeys.UPDATE_OPS,
                        is_training=True,
                        reuse=None,
                        variables_collections=None,
                        outputs_collections=None,
                        trainable=True,
                        batch_weights=None,
                        fused=False,
                        data_format=DATA_FORMAT_NHWC,
                        zero_debias_moving_mean=False,
                        scope=None,
                        renorm=False,
                        renorm_clipping=None,
                        renorm_decay=0.99):
    """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

    Copy of the tensorflow.contrib.layers implementation with a modification
    in the legacy (non core-layer) path: the batch "variance" is the mean of
    `(inputs - moving_mean) ** 2`, i.e. it is measured around the moving mean
    rather than around the batch mean.

    This earlier version of the modification uses the current batch mean and
    variance if `is_training` is True and `moving_mean` / `moving_variance`
    otherwise. This was leading to a large divergence between the results
    depending upon whether `is_training` is set to True or not. Ideally it
    should always use `moving_mean` and `moving_variance`; `batch_norm_mine`
    does this.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension
        has `batch_size`. The normalization is over all but the last
        dimension if `data_format` is `NHWC` and the second dimension if
        `data_format` is `NCHW`.
      decay: Decay for the moving average. Reasonable values for `decay` are
        close to 1.0, typically in the multiple-nines range: 0.999, 0.99,
        0.9, etc. Lower `decay` value (recommend trying `decay`=0.9) if model
        experiences reasonably good training performance but poor validation
        and/or test performance. Try zero_debias_moving_mean=True for
        improved stability.
      center: If True, add offset of `beta` to normalized tensor. If False,
        `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is not used.
        When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      epsilon: Small float added to variance to avoid dividing by zero.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      param_initializers: Optional initializers for beta, gamma, moving mean
        and moving variance.
      param_regularizers: Optional regularizer for beta and gamma. Only
        passed on in the core-layer path; the legacy path below creates beta
        and gamma without a regularizer.
      updates_collections: Collections to collect the update ops for
        computation. The updates_ops need to be executed with the train_op.
        If None, a control dependency would be added to make sure the updates
        are computed in place.
      is_training: Whether or not the layer is in training mode. In training
        mode it would accumulate the statistics of the moments into
        `moving_mean` and `moving_variance` using an exponential moving
        average with the given `decay`. When it is not in training mode then
        it would use the values of the `moving_mean` and the
        `moving_variance`.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      batch_weights: An optional tensor of shape `[batch_size]`, containing a
        frequency weight for each batch item. If present, then the batch
        normalization uses weighted mean and variance. (This can be used to
        correct for bias in training example selection.)
      fused: Use nn.fused_batch_norm if True, nn.batch_normalization
        otherwise.
      data_format: A string. `NHWC` (default) and `NCHW` are supported.
      zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a
        new pair of variables 'moving_mean/biased' and
        'moving_mean/local_step'.
      scope: Optional scope for `variable_scope`.
      renorm: Whether to use Batch Renormalization
        (https://arxiv.org/abs/1702.03275). This adds extra variables during
        training. The inference is the same for either value of this
        parameter.
      renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax'
        to scalar `Tensors` used to clip the renorm correction. The
        correction `(r, d)` is used as
        `corrected_value = normalized_value * r + d`, with `r` clipped to
        [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, dmax are
        set to inf, 0, inf, respectively.
      renorm_decay: Momentum used to update the moving means and standard
        deviations with renorm. Unlike `momentum`, this affects training and
        should be neither too small (which would add noise) nor too large
        (which would give stale estimates). Note that `decay` is still
        applied to get the means and variances for inference.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If `batch_weights` is not None and `fused` is True.
      ValueError: If `param_regularizers` is not None and `fused` is True.
      ValueError: If `renorm` is True and `fused` is True.
      ValueError: If `renorm` is True together with `batch_weights`, a custom
        `updates_collections` or `zero_debias_moving_mean`.
      ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
      ValueError: If the rank of `inputs` is undefined.
      ValueError: If rank or channels dimension of `inputs` is undefined.
    """
    if fused:
        if batch_weights is not None:
            raise ValueError('Weighted mean and variance is not currently '
                             'supported for fused batch norm.')
        if param_regularizers is not None:
            raise ValueError('Regularizers are not currently '
                             'supported for fused batch norm.')
        if renorm:
            raise ValueError('Renorm is not supported for fused batch norm.')
        return _fused_batch_norm(
            inputs,
            decay=decay,
            center=center,
            scale=scale,
            epsilon=epsilon,
            activation_fn=activation_fn,
            param_initializers=param_initializers,
            updates_collections=updates_collections,
            is_training=is_training,
            reuse=reuse,
            variables_collections=variables_collections,
            outputs_collections=outputs_collections,
            trainable=trainable,
            data_format=data_format,
            zero_debias_moving_mean=zero_debias_moving_mean,
            scope=scope)

    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
        raise ValueError('data_format has to be either NCHW or NHWC.')

    layer_variable_getter = _build_variable_getter()
    with variable_scope.variable_scope(
            scope, 'BatchNorm', [inputs], reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)

        # Determine whether we can use the core layer class.
        if (batch_weights is None and
                updates_collections is ops.GraphKeys.UPDATE_OPS and
                not zero_debias_moving_mean):
            # Use the core layer class.
            axis = 1 if data_format == DATA_FORMAT_NCHW else -1
            if not param_initializers:
                param_initializers = {}
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            if not param_regularizers:
                param_regularizers = {}
            beta_regularizer = param_regularizers.get('beta')
            gamma_regularizer = param_regularizers.get('gamma')
            layer = normalization_layers.BatchNormalization(
                axis=axis,
                momentum=decay,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                moving_mean_initializer=moving_mean_initializer,
                moving_variance_initializer=moving_variance_initializer,
                beta_regularizer=beta_regularizer,
                gamma_regularizer=gamma_regularizer,
                trainable=trainable,
                renorm=renorm,
                renorm_clipping=renorm_clipping,
                renorm_momentum=renorm_decay,
                name=sc.name,
                _scope=sc,
                _reuse=reuse)
            outputs = layer.apply(inputs, training=is_training)

            # Add variables to collections.
            _add_variable_to_collections(
                layer.moving_mean, variables_collections, 'moving_mean')
            _add_variable_to_collections(
                layer.moving_variance, variables_collections,
                'moving_variance')
            # Test for presence explicitly instead of relying on the
            # truthiness of a variable object.
            if layer.beta is not None:
                _add_variable_to_collections(
                    layer.beta, variables_collections, 'beta')
            if layer.gamma is not None:
                _add_variable_to_collections(
                    layer.gamma, variables_collections, 'gamma')

            if activation_fn is not None:
                outputs = activation_fn(outputs)
            return utils.collect_named_outputs(outputs_collections,
                                               sc.original_name_scope,
                                               outputs)

        # Not supported by layer class: batch_weights argument,
        # and custom updates_collections. In that case, use the legacy BN
        # implementation.
        # Custom updates collections are not supported because the update
        # logic is different in this case, in particular w.r.t. "forced
        # updates" and update op reuse.
        if renorm:
            raise ValueError('renorm is not supported with batch_weights, '
                             'updates_collections or zero_debias_moving_mean')
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        if batch_weights is not None:
            batch_weights = ops.convert_to_tensor(batch_weights)
            inputs_shape[0:1].assert_is_compatible_with(
                batch_weights.get_shape())
            # Reshape batch weight values so they broadcast across inputs.
            nshape = [-1] + [1 for _ in range(inputs_rank - 1)]
            batch_weights = array_ops.reshape(batch_weights, nshape)

        if data_format == DATA_FORMAT_NCHW:
            moments_axes = [0] + list(range(2, inputs_rank))
            params_shape = inputs_shape[1:2]
            # For NCHW format, rather than relying on implicit broadcasting,
            # we explicitly reshape the params to params_shape_broadcast when
            # computing the moments and the batch normalization.
            params_shape_broadcast = list(
                [1, inputs_shape[1].value] +
                [1 for _ in range(2, inputs_rank)])
        else:
            moments_axes = list(range(inputs_rank - 1))
            params_shape = inputs_shape[-1:]
            params_shape_broadcast = None
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s has undefined channels dimension %s.' % (
                    inputs.name, params_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if not param_initializers:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable('beta',
                                            shape=params_shape,
                                            dtype=dtype,
                                            initializer=beta_initializer,
                                            collections=beta_collections,
                                            trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=gamma_initializer,
                                             collections=gamma_collections,
                                             trainable=trainable)

        # Create moving_mean and moving_variance variables and add them to
        # the appropriate collections. We disable variable partitioning while
        # creating them, because assign_moving_average is not yet supported
        # for partitioned variables.
        partitioner = variable_scope.get_variable_scope().partitioner
        try:
            variable_scope.get_variable_scope().set_partitioner(None)
            moving_mean_collections = utils.get_variable_collections(
                variables_collections, 'moving_mean')
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_mean = variables.model_variable(
                'moving_mean',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_mean_initializer,
                trainable=False,
                collections=moving_mean_collections)
            moving_variance_collections = utils.get_variable_collections(
                variables_collections, 'moving_variance')
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            moving_variance = variables.model_variable(
                'moving_variance',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_variance_initializer,
                trainable=False,
                collections=moving_variance_collections)
        finally:
            variable_scope.get_variable_scope().set_partitioner(partitioner)

        # If `is_training` doesn't have a constant value, because it is a
        # `Tensor`, a `Variable` or `Placeholder` then is_training_value will
        # be None and `need_moments` will be true.
        is_training_value = utils.constant_value(is_training)
        need_moments = is_training_value is None or is_training_value
        if need_moments:
            # Squared deviation of the inputs from the moving mean; its
            # average over `moments_axes` is this function's "variance".
            if data_format == DATA_FORMAT_NCHW:
                # moving_mean has shape [C]. Without this reshape it would
                # broadcast against the LAST axis of the NCHW input instead
                # of the channel axis.
                centering_mean = array_ops.reshape(moving_mean,
                                                   params_shape_broadcast)
            else:
                centering_mean = moving_mean
            squared_deviation = (inputs - centering_mean) ** 2

            # Calculate the moments based on the individual batch.
            if batch_weights is None:
                if data_format == DATA_FORMAT_NCHW:
                    mean, _ = nn.moments(inputs, moments_axes,
                                         keep_dims=True)
                    variance, _ = nn.moments(squared_deviation, moments_axes,
                                             keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, _ = nn.moments(inputs, moments_axes)
                    variance, _ = nn.moments(squared_deviation, moments_axes)
            else:
                if data_format == DATA_FORMAT_NCHW:
                    mean, _ = nn.weighted_moments(inputs, moments_axes,
                                                  batch_weights,
                                                  keep_dims=True)
                    variance, _ = nn.weighted_moments(squared_deviation,
                                                      moments_axes,
                                                      batch_weights,
                                                      keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, _ = nn.weighted_moments(inputs, moments_axes,
                                                  batch_weights)
                    variance, _ = nn.weighted_moments(squared_deviation,
                                                      moments_axes,
                                                      batch_weights)

            moving_vars_fn = lambda: (moving_mean, moving_variance)
            if updates_collections is None:
                def _force_updates():
                    """Internal function forces updates moving_vars if is_training."""
                    update_moving_mean = (
                        moving_averages.assign_moving_average(
                            moving_mean, mean, decay,
                            zero_debias=zero_debias_moving_mean))
                    update_moving_variance = (
                        moving_averages.assign_moving_average(
                            moving_variance, variance, decay,
                            zero_debias=False))
                    with ops.control_dependencies([update_moving_mean,
                                                   update_moving_variance]):
                        return (array_ops.identity(mean),
                                array_ops.identity(variance))
                mean, variance = utils.smart_cond(is_training,
                                                  _force_updates,
                                                  moving_vars_fn)
            else:
                def _delay_updates():
                    """Internal function that delay updates moving_vars if is_training."""
                    update_moving_mean = (
                        moving_averages.assign_moving_average(
                            moving_mean, mean, decay,
                            zero_debias=zero_debias_moving_mean))
                    update_moving_variance = (
                        moving_averages.assign_moving_average(
                            moving_variance, variance, decay,
                            zero_debias=False))
                    return update_moving_mean, update_moving_variance
                update_mean, update_variance = utils.smart_cond(
                    is_training, _delay_updates, moving_vars_fn)
                ops.add_to_collections(updates_collections, update_mean)
                ops.add_to_collections(updates_collections, update_variance)
                # Use computed moments during training and moving_vars
                # otherwise.
                vars_fn = lambda: (mean, variance)
                mean, variance = utils.smart_cond(is_training, vars_fn,
                                                  moving_vars_fn)
        else:
            mean, variance = moving_mean, moving_variance

        if data_format == DATA_FORMAT_NCHW:
            mean = array_ops.reshape(mean, params_shape_broadcast)
            variance = array_ops.reshape(variance, params_shape_broadcast)
            # beta is None when center=False; only reshape what exists.
            if beta is not None:
                beta = array_ops.reshape(beta, params_shape_broadcast)
            if gamma is not None:
                gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Compute batch_normalization.
        outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma,
                                         epsilon)
        outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
def nan_batch_norm(inputs,
                   decay=0.999,
                   center=True,
                   scale=False,
                   epsilon=0.001,
                   is_training=True,
                   reuse=None,
                   variables_collections=None,
                   outputs_collections=None,
                   trainable=False,
                   scope=None):
    """Batch normalization driven by `nanmean` / `nanvar` batch statistics.

    Statistics are reduced over every axis except the last one. Whenever
    `is_training` is true, or cannot be resolved to a constant, the batch
    statistics are folded into `moving_mean` / `moving_variance`, and the
    inputs are normalized with the result of those moving-average
    assignments. Otherwise the stored moving statistics are used directly.

    Args:
      inputs: tensor with a defined rank; its last dimension sizes the
        per-channel variables.
      decay: decay handed to `moving_averages.assign_moving_average`.
      center: if True, create `beta` and pass it as the offset.
      scale: if True, create `gamma` and pass it as the scale.
      epsilon: value forwarded to `tf.nn.batch_normalization`.
      is_training: Python bool or tensor; see above.
      reuse: whether the variables of the scope should be reused.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the output to.
      trainable: forwarded to `gamma` only.
      scope: optional scope name.

    Returns:
      The normalized tensor, with the static shape of `inputs`.

    Raises:
      ValueError: if the rank of `inputs` is undefined.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'NanBatchNorm', reuse=reuse) as bn_scope:
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        if rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        reduce_axes = list(range(rank - 1))
        channel_shape = static_shape[-1:]

        beta = None
        gamma = None
        if center:
            # NOTE(review): beta is always created with trainable=False and
            # ignores the `trainable` argument, unlike gamma below. Confirm
            # that this is intended.
            beta = variables.model_variable(
                'beta',
                shape=channel_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer,
                collections=utils.get_variable_collections(
                    variables_collections, 'beta'),
                trainable=False)
        if scale:
            gamma = variables.model_variable(
                'gamma',
                shape=channel_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer,
                collections=utils.get_variable_collections(
                    variables_collections, 'gamma'),
                trainable=trainable)

        # Running statistics; these are never trainable.
        moving_mean = variables.model_variable(
            'moving_mean',
            shape=channel_shape,
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=False,
            collections=utils.get_variable_collections(
                variables_collections, 'moving_mean'))
        moving_variance = variables.model_variable(
            'moving_variance',
            shape=channel_shape,
            dtype=dtype,
            initializer=init_ops.ones_initializer,
            trainable=False,
            collections=utils.get_variable_collections(
                variables_collections, 'moving_variance'))

        training_flag = utils.constant_value(is_training)
        if training_flag is None or training_flag:
            batch_mean = nanmean(inputs, axis=reduce_axes)
            batch_variance = nanvar(inputs, axis=reduce_axes)
            # Rebind to the assignment results so the normalization below
            # uses the freshly updated averages.
            moving_mean = moving_averages.assign_moving_average(
                moving_mean, batch_mean, decay)
            moving_variance = moving_averages.assign_moving_average(
                moving_variance, batch_variance, decay)

        outputs = tf.nn.batch_normalization(
            inputs, moving_mean, moving_variance, beta, gamma, epsilon)
        outputs.set_shape(static_shape)
        return utils.collect_named_outputs(outputs_collections,
                                           bn_scope.name, outputs)
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None):
    """Batch normalization that builds both the training and inference paths.

    Code modification of tensorflow/contrib/layers/python/layers/layers.py.
    The output normalized with batch statistics and the output normalized
    with the moving averages are both constructed; `is_training` selects
    which one is returned.

    Args:
      inputs: tensor with a defined rank and a fully defined last dimension.
        Moments are taken over all axes except the last.
      decay: decay for the moving averages.
      center: if True, create `beta` and add it as the offset.
      scale: if True, create `gamma` and multiply by it.
      epsilon: small float added to the variance.
      updates_collections: collections that receive the moving-average update
        ops. If None, the updates are attached to the training output through
        a control dependency instead.
      is_training: Python bool or boolean tensor selecting the training
        (batch statistics) or inference (moving statistics) output.
      reuse: whether the variables of the scope should be reused.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the output to.
      trainable: whether `beta` and `gamma` are trainable.
      scope: optional scope name.

    Returns:
      The normalized tensor, with the static shape of `inputs`.

    Raises:
      ValueError: if the rank or the last dimension of `inputs` is undefined.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'BatchNorm', reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        axis = list(range(inputs_rank - 1))
        params_shape = inputs_shape[-1:]
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined last dimension %s.' % (
                inputs.name, params_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer,
                collections=beta_collections,
                trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer,
                collections=gamma_collections,
                trainable=trainable)

        # Create moving_mean and moving_variance variables and add them to
        # the appropriate collections.
        moving_mean_collections = utils.get_variable_collections(
            variables_collections, 'moving_mean')
        moving_mean = variables.model_variable(
            'moving_mean',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=False,
            collections=moving_mean_collections)
        moving_variance_collections = utils.get_variable_collections(
            variables_collections, 'moving_variance')
        moving_variance = variables.model_variable(
            'moving_variance',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.ones_initializer,
            trainable=False,
            collections=moving_variance_collections)

        # Calculate the moments based on the individual batch.
        mean, variance = nn.moments(inputs, axis, shift=moving_mean)

        # Update the moving_mean and moving_variance moments.
        update_moving_mean = moving_averages.assign_moving_average(
            moving_mean, mean, decay)
        update_moving_variance = moving_averages.assign_moving_average(
            moving_variance, variance, decay)
        if updates_collections is None:
            # Make sure the updates are computed here.
            # NOTE(review): this output is built outside the conditional
            # below, so with a non-constant `is_training` tensor the updates
            # presumably still run when the inference branch is selected.
            # Confirm before relying on this mode.
            with ops.control_dependencies([update_moving_mean,
                                           update_moving_variance]):
                train_outputs = nn.batch_normalization(
                    inputs, mean, variance, beta, gamma, epsilon)
        else:
            # Collect the updates to be computed later.
            ops.add_to_collections(updates_collections, update_moving_mean)
            ops.add_to_collections(updates_collections,
                                   update_moving_variance)
            train_outputs = nn.batch_normalization(
                inputs, mean, variance, beta, gamma, epsilon)

        test_outputs = nn.batch_normalization(
            inputs, moving_mean, moving_variance, beta, gamma, epsilon)

        # smart_cond accepts Python bools (including the default
        # is_training=True) as well as tensors; a plain tf.cond rejects a
        # Python bool predicate.
        outputs = utils.smart_cond(is_training,
                                   lambda: train_outputs,
                                   lambda: test_outputs)
        outputs.set_shape(inputs_shape)

        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
def joint_weighted_sum_from_feature_columns(columns_to_tensors,
                                            feature_columns,
                                            num_outputs,
                                            weight_collections=None,
                                            trainable=True,
                                            scope=None):
    """Build a linear prediction whose weights live in one joint variable.

    Every feature column is transformed and asked for its wide embedding
    lookup arguments; the collected arguments are handed to
    `_create_joint_embedding_lookup` in a single call, and a bias variable
    named `bias_weight` is added to the result. Columns that do not implement
    the lookup arguments (the error message names real-valued columns) are
    rejected.

    Args:
      columns_to_tensors: A mapping from feature column to tensors. 'string'
        key means a base feature (not-transformed). It can have
        FeatureColumn as a key too. That means that FeatureColumn is already
        transformed by input pipeline. For example, `inflow` may have handled
        transformations.
      feature_columns: A set containing all the feature columns. All items in
        the set should be instances of classes derived from FeatureColumn.
      num_outputs: An integer specifying number of outputs.
      weight_collections: List of graph collections to which weights are
        added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      A tuple containing:

        * A Tensor which represents predictions of a linear model.
        * The weights returned by `_create_joint_embedding_lookup`.
        * A Variable which is used for bias.

    Raises:
      ValueError: if FeatureColumn cannot be used for linear predictions.
      NotImplementedError: if a column does not provide wide embedding lookup
        arguments.
    """
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='joint_weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        transformer = _Transformer(columns_to_tensors)
        # Deduplicate and iterate in key order so the joint lookup sees the
        # columns in a deterministic sequence.
        ordered_columns = sorted(set(feature_columns),
                                 key=lambda col: col.key)
        lookup_arguments = []
        for column in ordered_columns:
            transformed = transformer.transform(column)
            try:
                # pylint: disable=protected-access
                column_arguments = column._wide_embedding_lookup_arguments(
                    transformed)
                # pylint: enable=protected-access
            except NotImplementedError:
                raise NotImplementedError(
                    'Real-valued columns are not supported. '
                    'Use weighted_sum_from_feature_columns '
                    'instead, or bucketize these columns.')
            lookup_arguments.append(column_arguments)

        variable, predictions_no_bias = _create_joint_embedding_lookup(
            columns_to_tensors,
            lookup_arguments,
            num_outputs,
            trainable,
            weight_collections)

        bias_collections = _add_variable_collection(weight_collections)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=bias_collections)
        _log_variable(bias)
        predictions = nn_ops.bias_add(predictions_no_bias, bias)

        return predictions, variable, bias
def depthwise_convolution2d(
        inputs,
        kernel_size,
        depth_multiplier=1,
        stride=1,
        padding='SAME',
        rate=1,
        activation_fn=nn.relu,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=initializers.xavier_initializer(),
        weights_regularizer=None,
        biases_initializer=init_ops.zeros_initializer(),
        biases_regularizer=None,
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        data_format='NHWC',
        scope=None):
    """Adds a depthwise 2D convolution with optional batch_norm layer.

    This op performs a depthwise convolution that acts separately on
    channels, creating a variable called `depthwise_weights`. Then, if
    `normalizer_fn` is None, it adds bias to the result, creating a variable
    called 'biases', otherwise, the `normalizer_fn` is applied. It finally
    applies an activation function to produce the end result. No pointwise
    convolution is performed.

    Args:
      inputs: A tensor of size [batch_size, height, width, channels] for
        'NHWC'; for any other `data_format` the channel count is read from
        axis 1.
      kernel_size: A list of length 2: [kernel_height, kernel_width] of the
        filters. Can be an int if both values are the same.
      depth_multiplier: The number of depthwise convolution output channels
        for each input channel. The total number of depthwise convolution
        output channels will be equal to `num_filters_in * depth_multiplier`.
      stride: A list of length 2: [stride_height, stride_width], specifying
        the depthwise convolution stride. Can be an int if both strides are
        the same.
      padding: One of 'VALID' or 'SAME'.
      rate: A list of length 2: [rate_height, rate_width], specifying the
        dilation rates for atrous convolution. Can be an int if both rates
        are the same. If any value is larger than one, then both stride
        values need to be one.
      activation_fn: Activation function. The default value is a ReLU
        function. Explicitly set it to None to skip it and maintain a linear
        activation.
      normalizer_fn: Normalization function to use instead of `biases`. If
        `normalizer_fn` is provided then `biases_initializer` and
        `biases_regularizer` are ignored and `biases` are not created nor
        added. Default set to None for no normalizer function.
      normalizer_params: Normalization function parameters.
      weights_initializer: An initializer for the weights.
      weights_regularizer: Optional regularizer for the weights.
      biases_initializer: An initializer for the biases. If None skip biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: Optional list of collections for all the
        variables or a dictionary containing a different list of collection
        per variable.
      outputs_collections: Collection to add the outputs.
      trainable: Whether or not the variables should be trainable or not.
      data_format: 'NHWC' selects channels-last strides [1, h, w, 1]; any
        other value selects [1, 1, h, w]. The value is also forwarded to the
        convolution and bias ops.
      scope: Optional scope for variable_scope.

    Returns:
      A `Tensor` representing the output of the operation.
    """
    with variable_scope.variable_scope(
            scope, 'DepthwiseConv2d', [inputs], reuse=reuse) as layer_scope:
        inputs = ops.convert_to_tensor(inputs)
        dtype = inputs.dtype.base_dtype

        kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
        stride_h, stride_w = utils.two_element_tuple(stride)

        # Channel count and stride layout depend on where the channel axis
        # sits.
        if data_format == 'NHWC':
            in_channels = utils.last_dimension(inputs.get_shape(),
                                               min_rank=4)
            strides = [1, stride_h, stride_w, 1]
        else:
            in_channels = inputs.get_shape().as_list()[1]
            strides = [1, 1, stride_h, stride_w]

        weights_collections = utils.get_variable_collections(
            variables_collections, 'weights')

        # Actually apply depthwise conv instead of separable conv: one
        # [kernel_h, kernel_w] filter bank of `depth_multiplier` filters per
        # input channel.
        depthwise_weights = variables.model_variable(
            'depthwise_weights',
            shape=[kernel_h, kernel_w, in_channels, depth_multiplier],
            dtype=dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            trainable=trainable,
            collections=weights_collections)
        dilation = utils.two_element_tuple(rate)
        outputs = nn.depthwise_conv2d(inputs,
                                      depthwise_weights,
                                      strides,
                                      padding,
                                      rate=dilation,
                                      data_format=data_format)
        out_channels = depth_multiplier * in_channels

        if normalizer_fn is not None:
            # A normalizer replaces the bias entirely.
            outputs = normalizer_fn(outputs, **(normalizer_params or {}))
        elif biases_initializer is not None:
            biases_collections = utils.get_variable_collections(
                variables_collections, 'biases')
            biases = variables.model_variable(
                'biases',
                shape=[out_channels],
                dtype=dtype,
                initializer=biases_initializer,
                regularizer=biases_regularizer,
                trainable=trainable,
                collections=biases_collections)
            outputs = nn.bias_add(outputs, biases, data_format=data_format)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, layer_scope.original_name_scope, outputs)
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
    """A tf.contrib.layer style linear prediction builder based on FeatureColumns.

    Generally a single example in training data is described with feature
    columns. This function generates weighted sum for each num_outputs.
    Weighted sum refers to logits in classification problems. It refers to
    prediction itself for linear regression problems.

    Each column contributes one prediction tensor. Columns that provide wide
    embedding lookup arguments go through `_create_embedding_lookup`; columns
    that raise `NotImplementedError` for them are converted to a dense tensor
    and multiplied by a zero-initialized `weight` variable. The per-column
    predictions are summed and a `bias_weight` variable is added.

    Example:

      ```
      # Building model for training
      feature_columns = (
          real_valued_column("my_feature1"),
          ...
      )
      columns_to_tensor = tf.parse_example(...)
      logits = weighted_sum_from_feature_columns(
          columns_to_tensors=columns_to_tensor,
          feature_columns=feature_columns,
          num_outputs=1)
      loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, labels)
      ```

    Args:
      columns_to_tensors: A mapping from feature column to tensors. 'string'
        key means a base feature (not-transformed). It can have
        FeatureColumn as a key too. That means that FeatureColumn is already
        transformed by input pipeline. For example, `inflow` may have handled
        transformations.
      feature_columns: A set containing all the feature columns. All items in
        the set should be instances of classes derived from FeatureColumn.
      num_outputs: An integer specifying number of outputs.
      weight_collections: List of graph collections to which weights are
        added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      A tuple containing:

        * A Tensor which represents predictions of a linear model.
        * A dictionary which maps feature_column to corresponding Variable.
        * A Variable which is used for bias.

    Raises:
      ValueError: if FeatureColumn cannot be used for linear predictions.
    """
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        per_column_predictions = []
        column_to_variable = {}
        transformer = _Transformer(columns_to_tensors)
        # Deduplicate and visit columns in key order for determinism.
        ordered_columns = sorted(set(feature_columns),
                                 key=lambda col: col.key)
        # pylint: disable=protected-access
        for column in ordered_columns:
            transformed = transformer.transform(column)
            try:
                lookup_arguments = column._wide_embedding_lookup_arguments(
                    transformed)
                variable, predictions = _create_embedding_lookup(
                    column,
                    columns_to_tensors,
                    lookup_arguments,
                    num_outputs,
                    trainable,
                    weight_collections)
            except NotImplementedError:
                # The column has no wide embedding lookup: fall back to a
                # dense matmul against a per-column weight matrix.
                with variable_scope.variable_scope(
                        None,
                        default_name=column.name,
                        values=columns_to_tensors.values()):
                    dense = column._to_dense_tensor(transformed)
                    dense = fc._reshape_real_valued_tensor(
                        dense, 2, column.name)
                    weight = contrib_variables.model_variable(
                        name='weight',
                        shape=[dense.get_shape()[1], num_outputs],
                        initializer=init_ops.zeros_initializer(),
                        trainable=trainable,
                        collections=weight_collections)
                    # Kept as a one-element list, matching what the
                    # bookkeeping calls below are given.
                    variable = [weight]
                    predictions = math_ops.matmul(dense, variable[0],
                                                  name='matmul')
            except ValueError as err:
                raise ValueError(
                    'Error creating weighted sum for column: {}.\n'
                    '{}'.format(column.name, err))
            per_column_predictions.append(predictions)
            column_to_variable[column] = variable
            _log_variable(variable)
            _maybe_restore_from_checkpoint(column._checkpoint_path(),
                                           variable)
        # pylint: enable=protected-access

        predictions_no_bias = math_ops.add_n(per_column_predictions)
        bias_collections = _add_variable_collection(weight_collections)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=bias_collections)
        _log_variable(bias)
        predictions = nn_ops.bias_add(predictions_no_bias, bias)

        return predictions, column_to_variable, bias
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
  """A tf.contrib.layer style linear prediction builder based on FeatureColumns.

  Generally a single example in training data is described with feature
  columns. This function generates weighted sum for each num_outputs. Weighted
  sum refers to logits in classification problems. It refers to prediction
  itself for linear regression problems.

  An example usage of weighted_sum_from_feature_columns is as follows:

    # Building model for training
    columns_to_tensor = tf.parse_example(...)
    logits = weighted_sum_from_feature_columns(
        columns_to_tensors=columns_to_tensor,
        feature_columns=feature_columns,
        num_outputs=1)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, labels)

    where feature_columns can be defined as follows:

    occupation = sparse_column_with_hash_bucket(column_name="occupation",
                                                hash_bucket_size=1000)
    occupation_emb = embedding_column(sparse_id_column=occupation,
                                      dimension=16, combiner="sum")
    age = real_valued_column("age")
    age_buckets = bucketized_column(
        source_column=age,
        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    occupation_x_age = crossed_column(columns=[occupation, age_buckets],
                                      hash_bucket_size=10000)

    feature_columns=[occupation_emb, occupation_x_age]

  Args:
    columns_to_tensors: A mapping from feature column to tensors. 'string' key
      means a base feature (not-transformed). It can have FeatureColumn as a
      key too. That means that FeatureColumn is already transformed by input
      pipeline. For example, `inflow` may have handled transformations.
    feature_columns: A set containing all the feature columns. All items in
      the set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying number of outputs.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). Applies to the
      per-column weights and to the bias.
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:
      * A Tensor which represents predictions of a linear model.
      * A dictionary which maps feature_column to corresponding Variable.
      * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
  """
  check_feature_columns(feature_columns)
  with variable_scope.variable_scope(
      scope,
      default_name='weighted_sum_from_feature_columns',
      values=columns_to_tensors.values()):
    output_tensors = []
    column_to_variable = dict()
    transformer = _Transformer(columns_to_tensors)
    # Sort by key so variables are created in a deterministic order.
    for column in sorted(set(feature_columns), key=lambda x: x.key):
      transformed_tensor = transformer.transform(column)
      try:
        embedding_lookup_arguments = column._to_embedding_lookup_arguments(  # pylint: disable=protected-access
            transformed_tensor)
        variable, predictions = _create_embedding_lookup(
            column,
            columns_to_tensors,
            embedding_lookup_arguments,
            num_outputs,
            trainable,
            weight_collections)
      except NotImplementedError:
        # The column has no embedding lookup form: fall back to a dense
        # tensor multiplied by a weight matrix.
        with variable_scope.variable_scope(
            None,
            default_name=column.name,
            values=columns_to_tensors.values()):
          tensor = column._to_dense_tensor(transformed_tensor)  # pylint: disable=protected-access
          variable = [
              contrib_variables.model_variable(
                  name='weight',
                  shape=[tensor.get_shape()[1], num_outputs],
                  initializer=init_ops.zeros_initializer,
                  # Previously omitted, so `trainable=False` was silently
                  # ignored for dense columns.
                  trainable=trainable,
                  collections=weight_collections)
          ]
          predictions = math_ops.matmul(tensor, variable[0], name='matmul')
      except ValueError as ee:
        raise ValueError(
            'Error creating weighted sum for column: {}.\n'
            '{}'.format(column.name, ee))
      output_tensors.append(predictions)
      column_to_variable[column] = variable
      _log_variable(variable)
      _maybe_restore_from_checkpoint(column._checkpoint_path(), variable)  # pylint: disable=protected-access

    predictions_no_bias = math_ops.add_n(output_tensors)
    bias = contrib_variables.model_variable(
        'bias_weight',
        shape=[num_outputs],
        initializer=init_ops.zeros_initializer,
        # Honor `trainable` for the bias as well as for the column weights.
        trainable=trainable,
        collections=fc._add_variable_collection(weight_collections))  # pylint: disable=protected-access
    _log_variable(bias)
    predictions = nn_ops.bias_add(predictions_no_bias, bias)

    return predictions, column_to_variable, bias
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=nn.relu,
                    normalizer_fn=None,
                    normalizer_params=None,
                    weights_initializer=initializers.xavier_initializer(),
                    weights_regularizer=None,
                    biases_initializer=init_ops.zeros_initializer,
                    biases_regularizer=None,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None):
  """Adds a fully connected layer.

  `fully_connected` creates a variable called `weights`, representing a fully
  connected weight matrix, which is multiplied by the `inputs` to produce a
  `Tensor` of hidden units. If a `normalizer_fn` is provided (such as
  `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is
  None and a `biases_initializer` is provided then a `biases` variable would be
  created and added to the hidden units. Finally, if `activation_fn` is not
  `None`, it is applied to the hidden units as well.

  Note: if `inputs` have a rank greater than 2, then `inputs` is flattened
  prior to the initial matrix multiply by `weights`, and the result is
  reshaped back afterwards.

  Args:
    inputs: A tensor with at least rank 2 and a known value for the last
      dimension, i.e. `[batch_size, depth]`, `[None, None, None, channels]`.
    num_outputs: Integer, the number of output units in the layer.
    activation_fn: activation function.
    normalizer_fn: normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
    normalizer_params: normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables
      or a dictionary containing a different list of collections per variable.
    outputs_collections: collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_op_scope.

  Returns:
    the tensor variable representing the result of the series of operations.

  Raises:
    ValueError: if `num_outputs` is not an `int`, if `inputs` has rank less
      than 2, or if its last dimension is not set.
  """
  if not isinstance(num_outputs, int):
    # Format the message with `%`; the original passed the value as a second
    # exception argument, leaving a literal '%s' in the message.
    raise ValueError(
        'num_outputs should be integer, got %s.' % (num_outputs,))
  with variable_scope.variable_op_scope([inputs],
                                        scope,
                                        'fully_connected',
                                        reuse=reuse) as sc:
    dtype = inputs.dtype.base_dtype
    num_input_units = utils.last_dimension(inputs.get_shape(), min_rank=2)

    # Static and dynamic output shapes: same as the input except for the last
    # dimension, which becomes `num_outputs`.
    static_shape = inputs.get_shape().as_list()
    static_shape[-1] = num_outputs

    out_shape = array_ops.unpack(array_ops.shape(inputs))
    out_shape[-1] = num_outputs

    weights_shape = [num_input_units, num_outputs]
    weights_collections = utils.get_variable_collections(
        variables_collections, 'weights')
    weights = variables.model_variable('weights',
                                       shape=weights_shape,
                                       dtype=dtype,
                                       initializer=weights_initializer,
                                       regularizer=weights_regularizer,
                                       collections=weights_collections,
                                       trainable=trainable)
    if len(static_shape) > 2:
      # Reshape inputs
      inputs = array_ops.reshape(inputs, [-1, num_input_units])
    outputs = standard_ops.matmul(inputs, weights)
    if normalizer_fn:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)
    else:
      if biases_initializer is not None:
        biases_collections = utils.get_variable_collections(
            variables_collections, 'biases')
        biases = variables.model_variable('biases',
                                          shape=[num_outputs,],
                                          dtype=dtype,
                                          initializer=biases_initializer,
                                          regularizer=biases_regularizer,
                                          collections=biases_collections,
                                          trainable=trainable)
        outputs = nn.bias_add(outputs, biases)
    if len(static_shape) > 2:
      # Reshape back outputs
      outputs = array_ops.reshape(outputs, array_ops.pack(out_shape))
      outputs.set_shape(static_shape)
    if activation_fn:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def _embeddings_from_arguments(column,
                               args,
                               weight_collections,
                               trainable,
                               output_rank=2):
  """Creates (or reuses) an embedding variable and looks `args` up in it.

  Args:
    column: the feature column; its `name` and `_checkpoint_path()` are used.
    args: the _DeepEmbeddingLookupArguments for this column.
    weight_collections: collections to store weights in.
    trainable: whether these embeddings should be trainable.
    output_rank: the desired rank of the returned `Tensor`. Inner dimensions
      will be combined to produce the desired rank.

  Returns:
    the embeddings.

  Raises:
    ValueError: if a shared embedding collection holds more than one variable
      or the shared variable's shape does not match the requested shape.
  """

  def _make_table(name, shape):
    # All embedding variables created here share dtype/initializer/collections.
    return contrib_variables.model_variable(
        name=name,
        shape=shape,
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=trainable,
        collections=weight_collections)

  # pylint: disable=protected-access
  ids = layers._inner_flatten(args.input_tensor, output_rank)
  id_weights = (
      layers._inner_flatten(args.weight_tensor, output_rank)
      if args.weight_tensor is not None else None)
  # pylint: enable=protected-access

  if args.hashed:
    # Hashed embeddings use a flat parameter vector and return immediately;
    # no checkpoint restore is attempted on this path.
    hashed_params = _make_table('weights', [args.vocab_size])
    return embedding_ops.hashed_embedding_lookup_sparse(
        hashed_params,
        ids,
        args.dimension,
        combiner=args.combiner,
        name='lookup')

  if args.shared_embedding_name is None:
    table = _make_table('weights', [args.vocab_size, args.dimension])
  else:
    collection_name = (
        'SHARED_EMBEDDING_COLLECTION_' + args.shared_embedding_name.upper())
    graph = ops.get_default_graph()
    existing = graph.get_collection_ref(collection_name)
    wanted_shape = [args.vocab_size, args.dimension]
    if not existing:
      # First user of this shared name: create the variable and register it.
      table = _make_table(args.shared_embedding_name, wanted_shape)
      graph.add_to_collection(collection_name, table)
    else:
      if len(existing) > 1:
        raise ValueError('Collection %s can only contain one '
                         '(partitioned) variable.' % collection_name)
      table = existing[0]
      if table.get_shape() != wanted_shape:
        raise ValueError('The embedding variable with name {} already '
                         'exists, but its shape does not match required '
                         'embedding shape here. Please make sure to use '
                         'different shared_embedding_name for different '
                         'shared embeddings.'.format(
                             args.shared_embedding_name))

  # Normalize to a list of variables: anything that is not a plain Variable
  # is asked for its underlying variable list.
  if isinstance(table, variables.Variable):
    table_vars = [table]
  else:
    table_vars = table._get_variable_list()  # pylint: disable=protected-access
  # pylint: disable=protected-access
  _maybe_restore_from_checkpoint(column._checkpoint_path(), table_vars)
  return embedding_ops.safe_embedding_lookup_sparse(
      table_vars,
      ids,
      sparse_weights=id_weights,
      combiner=args.combiner,
      name=column.name + 'weights',
      max_norm=args.max_norm)
def convolution(inputs,
                num_outputs,
                kernel_size,
                stride=1,
                padding='SAME',
                data_format=None,
                rate=1,
                activation_fn=nn.relu,
                normalizer_fn=None,
                normalizer_params=None,
                weights_normalizer_fn=None,
                weights_normalizer_params=None,
                weights_initializer=initializers.xavier_initializer(),
                weights_regularizer=None,
                biases_initializer=init_ops.zeros_initializer(),
                biases_regularizer=None,
                reuse=None,
                variables_collections=None,
                outputs_collections=None,
                trainable=True,
                scope=None):
  """Adds an N-D convolution followed by an optional batch_norm layer.

  It is required that 1 <= N <= 3.

  `convolution` creates a variable called `weights`, representing the
  convolutional kernel, that is convolved (actually cross-correlated) with the
  `inputs` to produce a `Tensor` of activations. If `weights_normalizer_fn` is
  given, it is applied to `weights` before the convolution. If a
  `normalizer_fn` is provided (such as `batch_norm`), it is then applied.
  Otherwise, if `normalizer_fn` is None and a `biases_initializer` is provided
  then a `biases` variable would be created and added to the activations.
  Finally, if `activation_fn` is not `None`, it is applied to the activations
  as well.

  Performs a'trous convolution with input stride/dilation rate equal to `rate`
  if a value > 1 for any dimension of `rate` is specified. In this case
  `stride` values != 1 are not supported.

  Args:
    inputs: a Tensor of rank N+2 of shape
      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
      not start with "NC" (default), or
      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
      with "NC".
    num_outputs: integer, the number of output filters.
    kernel_size: a sequence of N positive integers specifying the spatial
      dimensions of the filters. Can be a single integer to specify the same
      value for all spatial dimensions.
    stride: a sequence of N positive integers specifying the stride at which
      to compute output. Can be a single integer to specify the same value for
      all spatial dimensions. Specifying any `stride` value != 1 is
      incompatible with specifying any `rate` value != 1.
    padding: one of `"VALID"` or `"SAME"`.
    data_format: A string or None. Specifies whether the channel dimension of
      the `input` and output is the last dimension (default, or if
      `data_format` does not start with "NC"), or the second dimension (if
      `data_format` starts with "NC"). For N=1, the valid values are "NWC"
      (default) and "NCW". For N=2, the valid values are "NHWC" (default) and
      "NCHW". For N=3, currently the only valid value is "NDHWC".
    rate: a sequence of N positive integers specifying the dilation rate to
      use for a'trous convolution. Can be a single integer to specify the same
      value for all spatial dimensions. Specifying any `rate` value != 1 is
      incompatible with specifying any `stride` value != 1.
    activation_fn: activation function, set to None to skip it and maintain
      a linear activation.
    normalizer_fn: normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      Defaults to None for no normalizer function.
    normalizer_params: normalization function parameters.
    weights_normalizer_fn: weights normalization function.
    weights_normalizer_params: weights normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: optional list of collections for all the variables
      or a dictionary containing a different list of collection per variable.
    outputs_collections: collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.

  Returns:
    a tensor representing the output of the operation.

  Raises:
    ValueError: if `data_format` is invalid.
    ValueError: if the rank of `inputs` is unknown or not in [3, 5].
    ValueError: if the number of input channels is unknown.
    ValueError: both 'rate' and `stride` are not uniformly 1.
  """
  # Copied and modified from tensorflow-0.12.0 contrib.layers.convolution,
  # adding the weights_normalizer_* options.
  supported_formats = (None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC')
  if data_format not in supported_formats:
    raise ValueError('Invalid data_format: %r' % (data_format, ))

  with variable_scope.variable_scope(
      scope, 'Conv', [inputs], reuse=reuse) as sc:
    inputs = ops.convert_to_tensor(inputs)
    dtype = inputs.dtype.base_dtype
    input_shape = inputs.get_shape()
    input_rank = input_shape.ndims

    if input_rank is None:
      raise ValueError('Rank of inputs must be known')
    if not 3 <= input_rank <= 5:
      raise ValueError(
          'Rank of inputs is %d, which is not >= 3 and <= 5' % input_rank)

    # Expand scalar kernel/stride/rate arguments to one value per spatial dim.
    conv_dims = input_rank - 2
    kernel_size = utils.n_positive_integers(conv_dims, kernel_size)
    stride = utils.n_positive_integers(conv_dims, stride)
    rate = utils.n_positive_integers(conv_dims, rate)

    # Locate the channel axis: last for channels-last, second for "NC*".
    if data_format is None or data_format.endswith('C'):
      channel_axis = input_rank - 1
    elif data_format.startswith('NC'):
      channel_axis = 1
    else:
      raise ValueError('Invalid data_format')
    num_input_channels = input_shape[channel_axis].value
    if num_input_channels is None:
      raise ValueError('Number of in_channels must be known.')

    kernel_shape = list(kernel_size) + [num_input_channels, num_outputs]
    kernel = variables.model_variable(
        'weights',
        shape=kernel_shape,
        dtype=dtype,
        initializer=weights_initializer,
        regularizer=weights_regularizer,
        collections=utils.get_variable_collections(variables_collections,
                                                   'weights'),
        trainable=trainable)
    if weights_normalizer_fn is not None:
      weights_normalizer_params = weights_normalizer_params or {}
      kernel = weights_normalizer_fn(kernel, **weights_normalizer_params)

    outputs = nn.convolution(
        input=inputs,
        filter=kernel,
        dilation_rate=rate,
        strides=stride,
        padding=padding,
        data_format=data_format)

    # A normalizer replaces the bias; otherwise add a bias if one is requested.
    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)
    elif biases_initializer is not None:
      bias = variables.model_variable(
          'biases',
          shape=[num_outputs],
          dtype=dtype,
          initializer=biases_initializer,
          regularizer=biases_regularizer,
          collections=utils.get_variable_collections(variables_collections,
                                                     'biases'),
          trainable=trainable)
      outputs = nn.bias_add(outputs, bias, data_format=data_format)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections,
                                       sc.original_name_scope, outputs)
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
  """A tf.contrib.layer style linear prediction builder based on FeatureColumns.

  Generally a single example in training data is described with feature
  columns. This function generates weighted sum for each num_outputs. Weighted
  sum refers to logits in classification problems. It refers to prediction
  itself for linear regression problems.

  An example usage of weighted_sum_from_feature_columns is as follows:

    # Building model for training
    columns_to_tensor = tf.parse_example(...)
    logits = weighted_sum_from_feature_columns(
        columns_to_tensors=columns_to_tensor,
        feature_columns=feature_columns,
        num_outputs=1)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, labels)

    where feature_columns can be defined as follows:

    occupation = sparse_column_with_hash_bucket(column_name="occupation",
                                                hash_bucket_size=1000)
    occupation_emb = embedding_column(sparse_id_column=occupation,
                                      dimension=16, combiner="sum")
    age = real_valued_column("age")
    age_buckets = bucketized_column(
        source_column=age,
        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    occupation_x_age = crossed_column(columns=[occupation, age_buckets],
                                      hash_bucket_size=10000)

    feature_columns=[occupation_emb, occupation_x_age]

  Args:
    columns_to_tensors: A mapping from feature column to tensors. 'string' key
      means a base feature (not-transformed). It can have FeatureColumn as a
      key too. That means that FeatureColumn is already transformed by input
      pipeline. For example, `inflow` may have handled transformations.
    feature_columns: A set containing all the feature columns. All items in
      the set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying number of outputs.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). Applies to the
      per-column weights and to the bias.
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:
      * A Tensor which represents predictions of a linear model.
      * A dictionary which maps feature_column to corresponding Variable.
      * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
  """
  check_feature_columns(feature_columns)
  with variable_scope.variable_scope(
      scope,
      default_name='weighted_sum_from_feature_columns',
      values=columns_to_tensors.values()):
    output_tensors = []
    column_to_variable = dict()
    transformer = _Transformer(columns_to_tensors)
    # Sort by key so variables are created in a deterministic order.
    for column in sorted(set(feature_columns), key=lambda x: x.key):
      with variable_scope.variable_scope(
          None,
          default_name=column.name,
          values=columns_to_tensors.values()):
        try:
          transformed_tensor = transformer.transform(column)
          predictions, variable = column.to_weighted_sum(transformed_tensor,
                                                         num_outputs,
                                                         weight_collections,
                                                         trainable)
        except ValueError as e:
          raise ValueError('Error creating weighted sum for column: {}.\n'
                           '{}'.format(column.name, e))
      output_tensors.append(predictions)
      column_to_variable[column] = variable
      _log_variable(variable)

    predictions_no_bias = math_ops.add_n(output_tensors)
    bias = contrib_variables.model_variable(
        'bias_weight',
        shape=[num_outputs],
        initializer=init_ops.zeros_initializer,
        # Previously omitted, so `trainable=False` froze the column weights
        # but left the bias at model_variable's default.
        trainable=trainable,
        collections=fc._add_variable_collection(weight_collections))  # pylint: disable=protected-access
    _log_variable(bias)
    predictions = nn_ops.bias_add(predictions_no_bias, bias)

    return predictions, column_to_variable, bias
def conv2d_leaders(inputs,
                   num_outputs,
                   kernel_size,
                   rates=(1,),
                   stride=1,
                   padding='SAME',
                   activation_fn=nn.relu,
                   normalizer_fn=None,
                   normalizer_params=None,
                   weights_initializer=initializers.xavier_initializer(),
                   weights_regularizer=None,
                   biases_initializer=init_ops.zeros_initializer,
                   biases_regularizer=None,
                   reuse=None,
                   variables_collections=None,
                   outputs_collections=None,
                   trainable=True,
                   scope=None,):
  """Adds a multi-rate 2D convolution with element-wise max over the rates.

  A single `weights` kernel is convolved with `inputs` once per entry of
  `rates` (a standard convolution for rate <= 1, an a'trous convolution
  otherwise), always with 'SAME' padding and unit stride. The per-rate results
  are combined with an element-wise maximum. The requested `padding` and
  `stride` are then emulated by a depthwise convolution with an identity
  kernel. If a `normalizer_fn` is provided (such as `batch_norm`), it is then
  applied. Otherwise, if `normalizer_fn` is None and a `biases_initializer` is
  provided then a `biases` variable is created and added to the activations.
  Finally, if `activation_fn` is not `None`, it is applied to the activations.

  Args:
    inputs: a 4-D tensor `[batch_size, height, width, channels]`.
    num_outputs: integer, the number of output filters.
    kernel_size: a list of length 2 `[kernel_height, kernel_width]` of
      the filters. Can be an int if both values are the same.
    rates: a non-empty sequence of integer dilation rates. Each rate <= 1
      yields a standard convolution; each rate > 1 yields an a'trous
      convolution with that rate.
    stride: a list of length 2 `[stride_height, stride_width]`.
      Can be an int if both strides are the same.
    padding: one of `VALID` or `SAME`.
    activation_fn: activation function.
    normalizer_fn: normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
    normalizer_params: normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: optional list of collections for all the variables
      or a dictionary containing a different list of collection per variable.
    outputs_collections: collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.

  Returns:
    a tensor representing the output of the operation.

  Raises:
    ValueError: if `rates` is empty.
  """
  rates = list(rates)
  if not rates:
    raise ValueError('rates must contain at least one dilation rate.')

  with variable_scope.variable_scope(scope, 'Conv', [inputs],
                                     reuse=reuse) as sc:
    inputs = ops.convert_to_tensor(inputs)
    dtype = inputs.dtype.base_dtype

    # Leading kernel size.
    kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
    stride_h, stride_w = utils.two_element_tuple(stride)
    num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4)

    # Weights variable, shared by the convolutions at every rate.
    weights_shape = [kernel_h, kernel_w, num_filters_in, num_outputs]
    weights_collections = utils.get_variable_collections(
        variables_collections, 'weights')
    weights = variables.model_variable('weights',
                                       shape=weights_shape,
                                       dtype=dtype,
                                       initializer=weights_initializer,
                                       regularizer=weights_regularizer,
                                       collections=weights_collections,
                                       trainable=trainable)

    # Convolution at different scales.
    outputs_pool = []
    for rate in rates:
      if rate > 1:
        conv = nn.atrous_conv2d(inputs, weights, rate, padding='SAME')
      else:
        conv = nn.conv2d(inputs, weights, [1, 1, 1, 1], padding='SAME')
      outputs_pool.append(conv)

    # 'Pooling' across scales: element-wise max, folded from the last rate
    # backwards (same op order as the original reverse-then-fold).
    outputs = outputs_pool[-1]
    for node in reversed(outputs_pool[:-1]):
      outputs = tf.maximum(outputs, node)

    # Fix padding and stride. A bit hacky and not so efficient: a depthwise
    # convolution with a one-hot (identity) kernel crops/subsamples `outputs`.
    # Test the unpacked strides so that a list-valued `stride` works too.
    if padding == 'VALID' or stride_h > 1 or stride_w > 1:
      # The filter must have one slice per channel of `outputs`, i.e.
      # `num_outputs` (not the number of input channels).
      padfilter = np.zeros(shape=(kernel_h, kernel_w, num_outputs, 1),
                           dtype=dtype.as_numpy_dtype)
      # Integer division keeps the center indices valid under Python 3.
      center_h = (kernel_h - 1) // 2
      center_w = (kernel_w - 1) // 2
      padfilter[center_h, center_w, :, 0] = 1.
      outputs = tf.nn.depthwise_conv2d(outputs, padfilter,
                                       [1, stride_h, stride_w, 1],
                                       padding=padding)

    # Batch norm / bias and activation...
    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)
    else:
      if biases_initializer is not None:
        biases_collections = utils.get_variable_collections(
            variables_collections, 'biases')
        biases = variables.model_variable('biases',
                                          shape=[num_outputs, ],
                                          dtype=dtype,
                                          initializer=biases_initializer,
                                          regularizer=biases_regularizer,
                                          collections=biases_collections,
                                          trainable=trainable)
        outputs = nn.bias_add(outputs, biases)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def main(args):
  """Builds the training graph from a config file and runs the training loop.

  Args:
    args: parsed command-line arguments. The attributes read here are
      `configs`, `ngpus`, `checkpoint`, `reset_global_step`, `lr_multiple`,
      `restore_optim` and `ignore_missing_vars`.

  Raises:
    ValueError: if the configured optimizer is not one of 'adam', 'adadelta'
      or 'adagrad', or if the deprecated `max_test_samples` attribute is set
      in the training config.
  """
  config = load_configs(args.configs)
  train_config = config.training
  gpus = range(args.ngpus)

  # get the data and logging (checkpointing) directories:
  data_dir = train_config.datadir
  log_dir = train_config.logdir

  num_steps = 30000000

  # value at which the gradients are clipped
  grad_clip = train_config.gradclip

  if args.checkpoint is not None:
    checkpoint_fname = args.checkpoint
  else:
    print(
        colorize('No checkpoint file specified. Initializing randomly.',
                 'red', bold=True))
    # Placeholder path under log_dir used when no checkpoint was given.
    checkpoint_fname = osp.join(log_dir, 'INVALID')

  opts = {}
  opts['gpu_ids'] = gpus
  opts['log_dir'] = log_dir
  # number of iterations after which to run the summary-op
  opts['n_summary'] = 10
  opts['n_test'] = getattr(train_config, 'n_test', 500)
  # number of iterations after which to save the model
  opts['n_checkpoint'] = train_config.ncheckpoint
  batch_size = train_config.batch

  graph = tf.Graph()
  with graph.as_default():
    global_step = variables.model_variable(
        'global_step',
        shape=[],
        initializer=tf.constant_initializer(args.reset_global_step),
        trainable=False)

    # common model / optimizer parameters:
    lr = args.lr_multiple * tf.train.exponential_decay(
        train_config.lr.start_val,
        global_step,
        train_config.lr.step,
        train_config.lr.decay,
        staircase=True)
    optim_name = train_config.optim.lower()
    if optim_name == 'adam':
      optim = tf.train.AdamOptimizer(lr, name='Adam')
    elif optim_name == 'adadelta':
      optim = tf.train.AdadeltaOptimizer(lr,
                                         rho=0.95,
                                         epsilon=1e-06,
                                         use_locking=False,
                                         name='Adadelta')
    elif optim_name == 'adagrad':
      optim = tf.train.AdagradOptimizer(lr,
                                        use_locking=False,
                                        name='AdaGrad')
    else:
      raise ValueError('Optimizer = %s not supported' % train_config.optim)

    factory = model_factory(IMMModel,
                            config=config.model,
                            global_step=global_step)

    opts['batch_size'] = batch_size

    tf.summary.scalar('lr', lr)  # add a summary

    print(colorize('log_dir: ' + log_dir, 'green', bold=True))
    print(colorize('BATCH-SIZE: %d' % batch_size, 'red', bold=True))

    # dynamic import of a dataset class
    dset_class = import_dataset(train_config.dset)

    # default datasets parameters
    train_dset_params = {}
    test_dset_params = {}
    train_subset = 'train'
    test_subset = 'test'

    if hasattr(train_config, 'train_dset_params'):
      train_dset_params.update(train_config.train_dset_params)
      if 'subset' in train_dset_params:
        # 'subset' is passed explicitly below, so take it out of the kwargs.
        train_subset = train_dset_params.pop('subset')
    if hasattr(train_config, 'test_dset_params'):
      test_dset_params.update(train_config.test_dset_params)
      if 'subset' in test_dset_params:
        # 'subset' is passed explicitly below, so take it out of the kwargs.
        test_subset = test_dset_params.pop('subset')

    train_dset = dset_class(data_dir,
                            subset=train_subset,
                            **train_dset_params)
    train_dset = train_dset.get_dataset(batch_size,
                                        repeat=True,
                                        shuffle=False,
                                        num_preprocess_threads=12)

    if hasattr(train_config, 'max_test_samples'):
      raise ValueError('max_test_samples attribute deprecated')
    test_dset = dset_class(data_dir,
                           subset=test_subset,
                           **test_dset_params)
    test_dset = test_dset.get_dataset(batch_size,
                                      repeat=False,
                                      shuffle=False,
                                      num_preprocess_threads=12)

    # set up inputs
    training_pl = tf.placeholder(tf.bool)
    handle_pl = tf.placeholder(tf.string, shape=[])
    base_iterator = tf.data.Iterator.from_string_handle(
        handle_pl, train_dset.output_types, train_dset.output_shapes)
    inputs = base_iterator.get_next()

    split_gpus = getattr(config.model, 'split_gpus', False)

    # create the network distributed over multi-GPUs:
    loss, train_op, train_summary_op, test_summary_op, _ = tru.setup_training(
        opts,
        graph,
        optim,
        inputs,
        training_pl,
        factory,
        global_step,
        clip_value=grad_clip,
        split_gpus=split_gpus)

    # run the training loop:
    restore_vars = 'all' if args.restore_optim else 'model'
    tru.train_loop(opts,
                   graph,
                   loss,
                   train_dset,
                   training_pl,
                   handle_pl,
                   train_op,
                   train_summary_op,
                   test_summary_op,
                   num_steps,
                   global_step,
                   checkpoint_fname,
                   test_dataset=test_dset,
                   ignore_missing_vars=args.ignore_missing_vars,
                   reset_global_step=args.reset_global_step,
                   vars_to_restore=restore_vars,
                   exclude_vars=[],
                   allow_growth=train_config.allow_growth)
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
    """Build a linear (weighted-sum) prediction from a set of FeatureColumns.

    Every distinct feature column contributes a `[batch, num_outputs]` term;
    the terms are summed and a shared bias is added. The result plays the
    role of logits in classification and of the prediction itself in linear
    regression.

    Example:

      ```
      # Building model for training
      feature_columns = (
          real_valued_column("my_feature1"),
          ...
      )
      columns_to_tensor = tf.io.parse_example(...)
      logits = weighted_sum_from_feature_columns(
          columns_to_tensors=columns_to_tensor,
          feature_columns=feature_columns,
          num_outputs=1)
      loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                     logits=logits)
      ```

    Args:
      columns_to_tensors: A mapping from feature column to tensors. A
        'string' key denotes a base (not yet transformed) feature; a
        FeatureColumn key denotes a feature already transformed by the input
        pipeline. The mapping is copied before use.
      feature_columns: An iterable of FeatureColumn-derived instances.
        Duplicates are dropped and the remainder is processed in order of
        each column's `key`.
      num_outputs: An integer specifying the number of outputs (required;
        the signature defines no default).
      weight_collections: List of graph collections to which weights are
        added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      A tuple containing:

        * A Tensor which represents predictions of a linear model.
        * A dictionary mapping each feature column to the variable(s)
          created for it (a one-element list on the dense fallback path).
        * A Variable which is used for bias.

    Raises:
      ValueError: if a FeatureColumn cannot be used for linear predictions.
        A ValueError raised on the embedding-lookup path is re-raised with
        the offending column's name prepended.
    """
    # Operate on a copy of the mapping rather than on the caller's object.
    columns_to_tensors = columns_to_tensors.copy()
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        per_column_outputs = []
        weights_by_column = {}
        column_transformer = _Transformer(columns_to_tensors)
        # pylint: disable=protected-access
        # Deduplicate, then order by key so variables are always created in
        # the same sequence.
        ordered_columns = sorted(set(feature_columns),
                                 key=lambda col: col.key)
        for column in ordered_columns:
            transformed = column_transformer.transform(column)
            try:
                # Preferred path: columns that provide wide embedding lookup
                # arguments.
                lookup_args = column._wide_embedding_lookup_arguments(
                    transformed)
                column_weights, column_logits = _create_embedding_lookup(
                    column, columns_to_tensors, lookup_args, num_outputs,
                    trainable, weight_collections)
            except NotImplementedError:
                # Fallback path: densify the column and apply a plain matmul
                # with a zero-initialised weight matrix.
                with variable_scope.variable_scope(
                        None,
                        default_name=column.name,
                        values=columns_to_tensors.values()):
                    dense = column._to_dense_tensor(transformed)
                    dense = _maybe_reshape_input_tensor(
                        dense, column.name, output_rank=2)
                    weight = contrib_variables.model_variable(
                        name='weight',
                        shape=[dense.get_shape()[1], num_outputs],
                        initializer=init_ops.zeros_initializer(),
                        trainable=trainable,
                        collections=weight_collections)
                    column_weights = [weight]
                    column_logits = math_ops.matmul(
                        dense, weight, name='matmul')
            except ValueError as err:
                raise ValueError(
                    'Error creating weighted sum for column: {}.\n'
                    '{}'.format(column.name, err))
            per_column_outputs.append(
                array_ops.reshape(column_logits, shape=(-1, num_outputs)))
            weights_by_column[column] = column_weights
            _log_variable(column_weights)
            fc._maybe_restore_from_checkpoint(
                column._checkpoint_path(), column_weights)
        # pylint: enable=protected-access
        summed_outputs = math_ops.add_n(per_column_outputs)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=_add_variable_collection(weight_collections))
        _log_variable(bias)
        predictions = nn_ops.bias_add(summed_outputs, bias)
        return predictions, weights_by_column, bias
def _dnn_ss_logits_and_predictions(net, weights, biases, top_k):
    """Build full-softmax logits and the prediction dict used by EVAL/INFER.

    Args:
      net: Output tensor of the last hidden layer.
      weights: Logit-layer weights, created with shape
        `[n_classes, final_hidden_layer_dim]`; transposed before the matmul.
      biases: Logit-layer biases, created with shape `[n_classes]`.
      top_k: Number of classes requested from `nn.top_k`.

    Returns:
      A `(logits, predictions)` pair, where `predictions` is a dict keyed by
      `_PROBABILITIES`, `_CLASSES` and `_TOP_K`.
    """
    logits = nn.bias_add(
        standard_ops.matmul(net, array_ops.transpose(weights)), biases)
    predictions = {}
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)
    # Only the indices returned by top_k are kept; the values are discarded.
    _, predictions[_TOP_K] = nn.top_k(logits, top_k)
    return logits, predictions


def dnn_sampled_softmax_classifier_model_fn(features, target_indices,
                                            mode, params):
    """model_fn that uses candidate sampling.

    Training uses a sampled softmax loss over `n_samples` candidate classes;
    evaluation and inference compute the full softmax over all `n_classes`.

    Args:
      features: dict of Tensors (depends on data passed to `fit`). The input
        layer calls `features.values()`, so a mapping is required here.
      target_indices: A single Tensor of shape [batch_size, n_labels]
        containing the target indices.
      mode: Represents if this training, evaluation or prediction. See
        `ModeKeys`.
      params: A dict of hyperparameters that are listed below.
        hidden_units- List of hidden units per layer. All layers are fully
          connected. Ex. `[64, 32]` means first layer has 64 nodes and second
          one has 32.
        feature_columns- An iterable containing all the feature columns used
          by the model. All items in the set should be instances of classes
          derived from `FeatureColumn`.
        n_classes- number of target classes. It must be greater than 2.
        n_samples- number of sample target classes. Needs to be tuned - A
          good starting point could be 2% of n_classes.
        n_labels- number of labels in each example.
        top_k- The number of classes to predict.
        optimizer- An instance of `tf.Optimizer` used to train the model. If
          `None`, will use an Adagrad optimizer.
        dropout- When not `None`, the probability we will drop out a given
          coordinate.
        gradient_clip_norm- A float > 0. If provided, gradients are clipped
          to their global norm with this clipping ratio. See
          tf.clip_by_global_norm for more details.
        num_ps_replicas- The number of parameter server replicas.

    Returns:
      A `(predictions, loss, train_op)` tuple whose contents depend on `mode`:
        TRAIN: `(None, loss, train_op)`.
        EVAL: `(predictions, loss, None)`.
        INFER: `(predictions, None, None)`.
      `predictions` is a dict of Tensors and `loss` is a scalar Tensor. Any
      other `mode` falls through and returns `None`.
    """
    hidden_units = params["hidden_units"]
    feature_columns = params["feature_columns"]
    n_classes = params["n_classes"]
    n_samples = params["n_samples"]
    n_labels = params["n_labels"]
    top_k = params["top_k"]
    optimizer = params["optimizer"]
    dropout = params["dropout"]
    gradient_clip_norm = params["gradient_clip_norm"]
    num_ps_replicas = params["num_ps_replicas"]

    parent_scope = "dnn_ss"

    # Setup the input layer partitioner.
    input_layer_partitioner = (
        partitioned_variables.min_max_variable_partitioner(
            max_partitions=num_ps_replicas,
            min_slice_size=64 << 20))  # 64 * 2**20

    # Create the input layer.
    # The tensors go in `values=`; the second positional slot of
    # variable_scope is `default_name`, not `values`.
    with variable_scope.variable_scope(
            parent_scope + "/input_from_feature_columns",
            values=features.values(),
            partitioner=input_layer_partitioner) as scope:
        net = layers.input_from_feature_columns(
            features,
            feature_columns,
            weight_collections=[parent_scope],
            scope=scope)

    # Setup the hidden layer partitioner.
    hidden_layer_partitioner = (
        partitioned_variables.min_max_variable_partitioner(
            max_partitions=num_ps_replicas))

    # Width of the last hidden layer; stays None when hidden_units is empty.
    final_hidden_layer_dim = None
    # Create hidden layers using fully_connected.
    for layer_id, num_hidden_units in enumerate(hidden_units):
        with variable_scope.variable_scope(
                parent_scope + "/hiddenlayer_%d" % layer_id,
                values=[net],
                partitioner=hidden_layer_partitioner) as scope:
            net = layers.fully_connected(
                net,
                num_hidden_units,
                variables_collections=[parent_scope],
                scope=scope)
            final_hidden_layer_dim = num_hidden_units
            # Add dropout if it is enabled (training only).
            if dropout is not None and mode == estimator.ModeKeys.TRAIN:
                net = layers.dropout(net, keep_prob=(1.0 - dropout))

    # Create the weights and biases for the logit layer.
    with variable_scope.variable_scope(
            parent_scope + "/logits",
            values=[net],
            partitioner=hidden_layer_partitioner) as scope:
        dtype = net.dtype.base_dtype
        # Stored as [n_classes, dim], the layout sampled_softmax_loss is
        # given below; the full-softmax path transposes it.
        weights_shape = [n_classes, final_hidden_layer_dim]
        weights = variables.model_variable(
            "weights",
            shape=weights_shape,
            dtype=dtype,
            initializer=initializers.xavier_initializer(),
            trainable=True,
            collections=[parent_scope])
        # NOTE(review): zeros_initializer is passed uncalled here, as in the
        # original; confirm which form the targeted TF version expects.
        biases = variables.model_variable(
            "biases",
            shape=[n_classes],
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=True,
            collections=[parent_scope])

    if mode == estimator.ModeKeys.TRAIN:
        # Call the candidate sampling APIs and calculate the loss.
        sampled_values = nn.learned_unigram_candidate_sampler(
            true_classes=math_ops.to_int64(target_indices),
            num_true=n_labels,
            num_sampled=n_samples,
            unique=True,
            range_max=n_classes)

        sampled_softmax_loss = nn.sampled_softmax_loss(
            weights=weights,
            biases=biases,
            inputs=net,
            labels=math_ops.to_int64(target_indices),
            num_sampled=n_samples,
            num_classes=n_classes,
            num_true=n_labels,
            sampled_values=sampled_values)

        loss = math_ops.reduce_mean(sampled_softmax_loss, name="loss")

        train_op = optimizers.optimize_loss(
            loss=loss,
            global_step=contrib_framework.get_global_step(),
            learning_rate=_DEFAULT_LEARNING_RATE,
            optimizer=_get_optimizer(optimizer),
            clip_gradients=gradient_clip_norm,
            name=parent_scope)
        return None, loss, train_op

    if mode == estimator.ModeKeys.EVAL:
        logits, predictions = _dnn_ss_logits_and_predictions(
            net, weights, biases, top_k)

        # Since the targets have multiple labels, setup the target
        # probabilities as 1.0/n_labels for each of the labels.
        target_one_hot = array_ops.one_hot(
            indices=target_indices,
            depth=n_classes,
            on_value=1.0 / n_labels)
        target_one_hot = math_ops.reduce_sum(
            input_tensor=target_one_hot,
            reduction_indices=[1])

        # Keyword arguments are required by TF 1.x and also accepted by the
        # older (logits, labels) positional signature.
        loss = math_ops.reduce_mean(
            nn.softmax_cross_entropy_with_logits(
                labels=target_one_hot, logits=logits))

        return predictions, loss, None

    if mode == estimator.ModeKeys.INFER:
        _, predictions = _dnn_ss_logits_and_predictions(
            net, weights, biases, top_k)
        return predictions, None, None

    # Any other mode falls off the end and returns None, as before.