def __init__(self, master, component_spec, attr_defaults=None):
  """Initializes the ComponentBuilder from specifications.

  Extracts component attributes (always, merging over the global attribute
  defaults), then builds the component's TF graph pieces: beam-size constants,
  step/total counters, the network unit, and (optionally) a moving average of
  the network parameters.

  Args:
    master: dragnn.MasterBuilder object.
    component_spec: dragnn.ComponentSpec proto to be built.
    attr_defaults: Optional dict of additional component attribute defaults,
        merged on top of the global attribute defaults before extraction.
  """
  self.master = master
  self.num_actions = component_spec.num_actions
  self.name = component_spec.name
  self.spec = component_spec
  # Overwritten below when |use_moving_average| is enabled.
  self.moving_average = None

  # Determine if this component should apply self-normalization.
  self.eligible_for_self_norm = (
      not self.master.hyperparams.self_norm_components_filter or self.name in
      self.master.hyperparams.self_norm_components_filter.split(','))

  # Extract component attributes before make_network(), so the network unit
  # can access them.
  self._attrs = {}
  global_attr_defaults = {
      'locally_normalize': False,
      'output_as_probabilities': False
  }
  # Caller-provided defaults take precedence over the global defaults.
  if attr_defaults:
    global_attr_defaults.update(attr_defaults)
  self._attrs = network_units.get_attrs_with_defaults(
      self.spec.component_builder.parameters, global_attr_defaults)

  do_local_norm = self._attrs['locally_normalize']
  self._output_as_probabilities = self._attrs['output_as_probabilities']
  with tf.variable_scope(self.name):
    self.training_beam_size = tf.constant(self.spec.training_beam_size,
                                          name='TrainingBeamSize')
    self.inference_beam_size = tf.constant(
        self.spec.inference_beam_size, name='InferenceBeamSize')
    self.locally_normalize = tf.constant(do_local_norm,
                                         name='LocallyNormalize')
    # Scalar int32 counters; |_step| also drives the moving-average decay
    # schedule below via |num_updates|.
    self._step = tf.get_variable('step', [],
                                 initializer=tf.zeros_initializer(),
                                 dtype=tf.int32)
    self._total = tf.get_variable('total', [],
                                  initializer=tf.zeros_initializer(),
                                  dtype=tf.int32)

  # Construct network variables.
  self.network = self.make_network(self.spec.network_unit)

  # Construct moving average.
  if self.master.hyperparams.use_moving_average:
    self.moving_average = tf.train.ExponentialMovingAverage(
        decay=self.master.hyperparams.average_weight, num_updates=self._step)
    self.avg_ops = [self.moving_average.apply(self.network.params)]

  # Used to export the cell; see add_cell_input() and add_cell_output().
  self._cell_subgraph_spec = export_pb2.CellSubgraphSpec()
def __init__(self, component):
  """Initializes layers.

  Args:
    component: Parent ComponentBuilderBase object.
  """
  # Declare the solver's output layers; dimensions are filled in later (-1).
  output_names = ('lengths', 'scores', 'logits', 'arcs')
  super(MstSolverNetwork, self).__init__(
      component,
      init_layers=[network_units.Layer(self, n, -1) for n in output_names])

  self._attrs = network_units.get_attrs_with_defaults(
      component.spec.network_unit.parameters,
      defaults={
          'forest': False,
          'loss': 'softmax',
          'crf_max_dynamic_range': 20,
      })

  # This network consumes no fixed features and exactly two linked features.
  num_fixed = len(self._fixed_feature_dims.items())
  num_linked = len(self._linked_feature_dims.items())
  check.Eq(num_fixed, 0, 'Expected no fixed features')
  check.Eq(num_linked, 2, 'Expected two linked features')
  check.In('lengths', self._linked_feature_dims,
           'Missing required linked feature')
  check.In('scores', self._linked_feature_dims,
           'Missing required linked feature')
def __init__(self, component):
  """Initializes layers.

  Args:
    component: Parent ComponentBuilderBase object.
  """
  layer_names = ['lengths', 'scores', 'logits', 'arcs']
  init_layers = [network_units.Layer(self, name, -1) for name in layer_names]
  super(MstSolverNetwork, self).__init__(component, init_layers=init_layers)

  # Attribute defaults for the MST solver network.
  self._attrs = network_units.get_attrs_with_defaults(
      component.spec.network_unit.parameters,
      defaults={
          'forest': False,
          'loss': 'softmax',
          'crf_max_dynamic_range': 20,
      })

  # Validate the feature spec: no fixed features, two linked features, and
  # both required linked features present.
  check.Eq(len(self._fixed_feature_dims.items()), 0,
           'Expected no fixed features')
  check.Eq(len(self._linked_feature_dims.items()), 2,
           'Expected two linked features')
  for required in ('lengths', 'scores'):
    check.In(required, self._linked_feature_dims,
             'Missing required linked feature')
def __init__(self, master, component_spec, attr_defaults=None):
  """Initializes the ComponentBuilder from specifications.

  Args:
    master: dragnn.MasterBuilder object.
    component_spec: dragnn.ComponentSpec proto to be built.
    attr_defaults: Optional dict of component attribute defaults.  If not
        provided or if empty, attributes are not extracted.
  """
  self.master = master
  self.num_actions = component_spec.num_actions
  self.name = component_spec.name
  self.spec = component_spec
  # Overwritten below when |use_moving_average| is enabled.
  self.moving_average = None

  # Determine if this component should apply self-normalization.
  self.eligible_for_self_norm = (
      not self.master.hyperparams.self_norm_components_filter or self.name in
      self.master.hyperparams.self_norm_components_filter.split(','))

  # Extract component attributes before make_network(), so the network unit
  # can access them.  Note: skipped entirely when |attr_defaults| is unset.
  self._attrs = {}
  if attr_defaults:
    self._attrs = network_units.get_attrs_with_defaults(
        self.spec.component_builder.parameters, attr_defaults)

  with tf.variable_scope(self.name):
    self.locally_normalize = tf.constant(False, name='LocallyNormalize')
    # Scalar int32 counters; |_step| also drives the moving-average decay
    # schedule below via |num_updates|.
    self._step = tf.get_variable('step', [],
                                 initializer=tf.zeros_initializer(),
                                 dtype=tf.int32)
    self._total = tf.get_variable('total', [],
                                  initializer=tf.zeros_initializer(),
                                  dtype=tf.int32)

  # Construct network variables.
  self.network = self.make_network(self.spec.network_unit)

  # Construct moving average.
  if self.master.hyperparams.use_moving_average:
    self.moving_average = tf.train.ExponentialMovingAverage(
        decay=self.master.hyperparams.average_weight, num_updates=self._step)
    self.avg_ops = [self.moving_average.apply(self.network.params)]
def __init__(self, master, component_spec, attr_defaults=None):
  """Initializes the ComponentBuilder from specifications.

  Args:
    master: dragnn.MasterBuilder object.
    component_spec: dragnn.ComponentSpec proto to be built.
    attr_defaults: Optional dict of component attribute defaults.  If not
        provided or if empty, attributes are not extracted.
  """
  self.master = master
  self.num_actions = component_spec.num_actions
  self.name = component_spec.name
  self.spec = component_spec
  # Overwritten below when |use_moving_average| is enabled.
  self.moving_average = None

  # Determine if this component should apply self-normalization.
  self.eligible_for_self_norm = (
      not self.master.hyperparams.self_norm_components_filter or self.name in
      self.master.hyperparams.self_norm_components_filter.split(','))

  # Extract component attributes before make_network(), so the network unit
  # can access them.  Note: skipped entirely when |attr_defaults| is unset.
  self._attrs = {}
  if attr_defaults:
    self._attrs = network_units.get_attrs_with_defaults(
        self.spec.component_builder.parameters, attr_defaults)

  with tf.variable_scope(self.name):
    self.training_beam_size = tf.constant(
        self.spec.training_beam_size, name='TrainingBeamSize')
    self.inference_beam_size = tf.constant(
        self.spec.inference_beam_size, name='InferenceBeamSize')
    self.locally_normalize = tf.constant(False, name='LocallyNormalize')
    # Scalar int32 counters; |_step| also drives the moving-average decay
    # schedule below via |num_updates|.
    self._step = tf.get_variable(
        'step', [], initializer=tf.zeros_initializer(), dtype=tf.int32)
    self._total = tf.get_variable(
        'total', [], initializer=tf.zeros_initializer(), dtype=tf.int32)

  # Construct network variables.
  self.network = self.make_network(self.spec.network_unit)

  # Construct moving average.
  if self.master.hyperparams.use_moving_average:
    self.moving_average = tf.train.ExponentialMovingAverage(
        decay=self.master.hyperparams.average_weight, num_updates=self._step)
    self.avg_ops = [self.moving_average.apply(self.network.params)]
def __init__(self, component, additional_attr_defaults=None):
  """Initializes the LSTM base class.

  Parameters used:
    hidden_layer_sizes: Comma-delimited number of hidden units for each layer.
    input_dropout_rate (-1.0): Input dropout rate for each layer.  If < 0.0,
        use the global |dropout_rate| hyperparameter.
    recurrent_dropout_rate (0.8): Recurrent dropout rate.  If < 0.0, use the
        global |recurrent_dropout_rate| hyperparameter.
    layer_norm (True): Whether or not to use layer norm.

  Hyperparameters used:
    dropout_rate: Input dropout rate.
    recurrent_dropout_rate: Recurrent dropout rate.

  Args:
    component: parent ComponentBuilderBase object.
    additional_attr_defaults: Additional attributes for use by derived class.
        Never mutated by this constructor.
  """
  # Copy before merging so the caller's dict is never mutated (the original
  # code updated |additional_attr_defaults| in place).  NOTE: as before, the
  # base defaults below take precedence over identically-named keys in
  # |additional_attr_defaults|.
  attr_defaults = dict(additional_attr_defaults or {})
  attr_defaults.update({
      'layer_norm': True,
      'input_dropout_rate': -1.0,
      'recurrent_dropout_rate': 0.8,
      'hidden_layer_sizes': '256',
  })
  self._attrs = dragnn.get_attrs_with_defaults(
      component.spec.network_unit.parameters, defaults=attr_defaults)

  # Materialize as a list: under Python 3, map() returns a one-shot iterator,
  # but the sizes are consumed by create_hidden_layers() and retained on
  # |self| for later use.
  self._hidden_layer_sizes = list(
      map(int, self._attrs['hidden_layer_sizes'].split(',')))

  # Fall back to the global dropout rate when the per-component rate is
  # negative (i.e. unset).
  self._input_dropout_rate = self._attrs['input_dropout_rate']
  if self._input_dropout_rate < 0.0:
    self._input_dropout_rate = component.master.hyperparams.dropout_rate

  # Recurrent dropout falls back twice: first to the global recurrent rate,
  # then to the global input dropout rate.
  self._recurrent_dropout_rate = self._attrs['recurrent_dropout_rate']
  if self._recurrent_dropout_rate < 0.0:
    self._recurrent_dropout_rate = (
        component.master.hyperparams.recurrent_dropout_rate)
  if self._recurrent_dropout_rate < 0.0:
    self._recurrent_dropout_rate = component.master.hyperparams.dropout_rate

  tf.logging.info('[%s] input_dropout_rate=%s recurrent_dropout_rate=%s',
                  component.name, self._input_dropout_rate,
                  self._recurrent_dropout_rate)

  layers, context_layers = self.create_hidden_layers(component,
                                                     self._hidden_layer_sizes)
  last_layer_dim = layers[-1].dim
  layers.append(
      dragnn.Layer(component, name='last_layer', dim=last_layer_dim))
  layers.append(
      dragnn.Layer(component, name='logits', dim=component.num_actions))

  # Provide initial layers and context layers, so the base class constructor
  # can safely use accessors like get_layer_size().
  super(BaseLSTMNetwork, self).__init__(
      component, init_layers=layers, init_context_layers=context_layers)

  # Allocate parameters for the softmax.
  self._params.append(
      tf.get_variable(
          'weights_softmax', [last_layer_dim, component.num_actions],
          initializer=tf.random_normal_initializer(stddev=1e-4)))
  self._params.append(
      tf.get_variable(
          'bias_softmax', [component.num_actions],
          initializer=tf.zeros_initializer()))
def __init__(self, component, additional_attr_defaults=None):
  """Initializes the LSTM base class.

  Parameters used:
    hidden_layer_sizes: Comma-delimited number of hidden units for each layer.
    input_dropout_rate (-1.0): Input dropout rate for each layer.  If < 0.0,
        use the global |dropout_rate| hyperparameter.
    recurrent_dropout_rate (0.8): Recurrent dropout rate.  If < 0.0, use the
        global |recurrent_dropout_rate| hyperparameter.
    layer_norm (True): Whether or not to use layer norm.

  Hyperparameters used:
    dropout_rate: Input dropout rate.
    recurrent_dropout_rate: Recurrent dropout rate.

  Args:
    component: parent ComponentBuilderBase object.
    additional_attr_defaults: Additional attributes for use by derived class.
        Never mutated by this constructor.
  """
  # Copy before merging so the caller's dict is never mutated (the original
  # code updated |additional_attr_defaults| in place).  NOTE: as before, the
  # base defaults below take precedence over identically-named keys in
  # |additional_attr_defaults|.
  attr_defaults = dict(additional_attr_defaults or {})
  attr_defaults.update({
      'layer_norm': True,
      'input_dropout_rate': -1.0,
      'recurrent_dropout_rate': 0.8,
      'hidden_layer_sizes': '256',
  })
  self._attrs = dragnn.get_attrs_with_defaults(
      component.spec.network_unit.parameters, defaults=attr_defaults)

  # Materialize as a list: under Python 3, map() returns a one-shot iterator,
  # but the sizes are consumed by create_hidden_layers() and retained on
  # |self| for later use.
  self._hidden_layer_sizes = list(
      map(int, self._attrs['hidden_layer_sizes'].split(',')))

  # Fall back to the global dropout rate when the per-component rate is
  # negative (i.e. unset).
  self._input_dropout_rate = self._attrs['input_dropout_rate']
  if self._input_dropout_rate < 0.0:
    self._input_dropout_rate = component.master.hyperparams.dropout_rate

  # Recurrent dropout falls back twice: first to the global recurrent rate,
  # then to the global input dropout rate.
  self._recurrent_dropout_rate = self._attrs['recurrent_dropout_rate']
  if self._recurrent_dropout_rate < 0.0:
    self._recurrent_dropout_rate = (
        component.master.hyperparams.recurrent_dropout_rate)
  if self._recurrent_dropout_rate < 0.0:
    self._recurrent_dropout_rate = component.master.hyperparams.dropout_rate

  tf.logging.info('[%s] input_dropout_rate=%s recurrent_dropout_rate=%s',
                  component.name, self._input_dropout_rate,
                  self._recurrent_dropout_rate)

  layers, context_layers = self.create_hidden_layers(component,
                                                     self._hidden_layer_sizes)
  last_layer_dim = layers[-1].dim
  layers.append(
      dragnn.Layer(component, name='last_layer', dim=last_layer_dim))
  layers.append(
      dragnn.Layer(component, name='logits', dim=component.num_actions))

  # Provide initial layers and context layers, so the base class constructor
  # can safely use accessors like get_layer_size().
  super(BaseLSTMNetwork, self).__init__(
      component, init_layers=layers, init_context_layers=context_layers)

  # Allocate parameters for the softmax.
  self._params.append(
      tf.get_variable(
          'weights_softmax', [last_layer_dim, component.num_actions],
          initializer=tf.random_normal_initializer(stddev=1e-4)))
  self._params.append(
      tf.get_variable(
          'bias_softmax', [component.num_actions],
          initializer=tf.zeros_initializer()))
def MakeAttrs(self, defaults, key=None, value=None):
  """Builds attrs from the |defaults|, optionally overriding one |key|.

  Args:
    defaults: Dict of attribute defaults to extract against.
    key: Optional attribute name to override.
    value: Optional override value; applied only if |key| is also truthy.

  Returns:
    Extracted attribute dict.
  """
  module_spec = spec_pb2.RegisteredModuleSpec()
  if key and value:
    module_spec.parameters[key] = value
  return network_units.get_attrs_with_defaults(module_spec.parameters,
                                               defaults)
def __init__(self, component):
  """Initializes parameters for this Transformer unit.

  Args:
    component: parent ComponentBuilderBase object.

  Parameters used to construct the network:
    num_layers: number of transformer layers (attention + MLP)
    hidden_size: size of hidden layers in MLPs
    filter_size: filter width for each attention head
    num_heads: number of attention heads
    residual_dropout: dropout keep rate for residual layers
    attention_dropout: dropout keep rate for attention weights
    mlp_dropout: dropout keep rate for mlp layers
    initialization: initialization scheme to use for model parameters
    bias_init: initial value for bias parameters
    scale_attention: whether to scale attention parameters by filter_size^-0.5
    layer_norm_residuals: whether to perform layer normalization on residual
        layers
    timing_signal: whether to add a position-wise timing signal to the input
    kernel: kernel width in middle MLP layers
    mlp_layers: number of MLP layers. Must be >= 2.

  Raises:
    ValueError: if mlp_layers < 2.

  The input depth of the first layer is inferred from the total concatenated
  size of the input features, minus 1 to account for the sequence lengths.

  Hyperparameters used:
    dropout_rate: The probability that an input is not dropped. This is the
        default when the |dropout_keep_prob| parameter is unset.
  """
  super(TransformerEncoderNetwork, self).__init__(component)
  default_dropout_rate = component.master.hyperparams.dropout_rate
  self._attrs = network_units.get_attrs_with_defaults(
      component.spec.network_unit.parameters, defaults={
          'num_layers': 4,
          'hidden_size': 256,
          'filter_size': 64,
          'num_heads': 8,
          'residual_drop': default_dropout_rate,
          'attention_drop': default_dropout_rate,
          'mlp_drop': default_dropout_rate,
          'initialization': 'xavier',
          'bias_init': 0.001,
          'scale_attention': True,
          'layer_norm_residuals': True,
          'timing_signal': True,
          'kernel': 1,
          'mlp_layers': 2})

  # Cache attribute values as plain members for concise access below.
  self._num_layers = self._attrs['num_layers']
  self._hidden_size = self._attrs['hidden_size']
  self._filter_size = self._attrs['filter_size']
  self._num_heads = self._attrs['num_heads']
  self._residual_dropout = self._attrs['residual_drop']
  self._attention_dropout = self._attrs['attention_drop']
  self._mlp_dropout = self._attrs['mlp_drop']
  self._initialization = self._attrs['initialization']
  self._bias_init = self._attrs['bias_init']
  self._scale_attn = self._attrs['scale_attention']
  self._layer_norm_res = self._attrs['layer_norm_residuals']
  self._timing_signal = self._attrs['timing_signal']
  self._kernel = self._attrs['kernel']
  self._mlp_depth = self._attrs['mlp_layers']

  # The MLP construction below hard-codes a first, middle, and last layer.
  if self._mlp_depth < 2:
    raise ValueError('TransformerEncoderNetwork needs mlp_layers >= 2')

  # Total attention width across all heads.
  self._combined_filters = self._num_heads * self._filter_size

  self._weights = []
  self._biases = []
  self._layer_norms = {}

  # Hacky: one dimension comes from the lengths input; subtract it.
  # NOTE(review): |_concatenated_input_dim| is presumably set by the base
  # class constructor from the linked/fixed feature dims — confirm.
  self._concatenated_input_dim -= 1

  # Initial projection of inputs, this is mainly to project input down to the
  # right size for residual layers
  proj_shape = [1, 1, self._concatenated_input_dim, self._combined_filters]
  self._weights.append(
      network_units.add_var_initialized('init_proj', proj_shape,
                                        self._initialization))
  self._biases.append(tf.get_variable('init_bias', self._combined_filters,
                                      initializer=tf.constant_initializer(
                                          self._bias_init),
                                      dtype=tf.float32))

  # Allocate per-layer attention, projection, MLP, and layer-norm parameters,
  # each under its own variable scope.
  for i in range(self._num_layers):
    with tf.variable_scope('transform_%d' % i):
      # Attention weights: 3 * self.combined_filters = (q, k, v)
      # We assume that q, k and v all have the same dimension
      attn_shape = [1, 1, self._combined_filters, 3 * self._combined_filters]
      self._weights.append(
          network_units.add_var_initialized('attn_weights', attn_shape,
                                            self._initialization))

      # Attention final projection weights
      proj_shape = [1, 1, self._combined_filters, self._combined_filters]
      self._weights.append(
          network_units.add_var_initialized('proj_weights', proj_shape,
                                            self._initialization))

      # MLP weights
      with tf.variable_scope('mlp'):
        # First MLP layer: combined filters -> hidden size.
        ff_shape = [1, 1, self._combined_filters, self._hidden_size]
        self._weights.append(
            network_units.add_var_initialized('ff_weights_0', ff_shape,
                                              self._initialization))
        # Middle MLP layers (if any): hidden -> hidden with |kernel| width.
        ff_shape = [1, self._kernel, self._hidden_size, self._hidden_size]
        for j in range(1, self._mlp_depth - 1):
          self._weights.append(
              network_units.add_var_initialized('ff_weights_%d' % j, ff_shape,
                                                self._initialization))
        # Last MLP layer: hidden size -> combined filters (residual width).
        ff_shape = [1, 1, self._hidden_size, self._combined_filters]
        self._weights.append(
            network_units.add_var_initialized('ff_weights_%d' %
                                              (self._mlp_depth - 1), ff_shape,
                                              self._initialization))

      # Layer normalization for residual layers
      if self._layer_norm_res:
        attn_layer_norm = network_units.LayerNorm(component,
                                                  'attn_layer_norm_%d' % i,
                                                  self._combined_filters,
                                                  tf.float32)
        self._layer_norms['attn_layer_norm_%d' % i] = attn_layer_norm

        ff_layer_norm = network_units.LayerNorm(component,
                                                'ff_layer_norm_%d' % i,
                                                self._combined_filters,
                                                tf.float32)
        self._layer_norms['ff_layer_norm_%d' % i] = ff_layer_norm

        # Layer norm parameters are not added to self._weights,
        # which means that they are not l2 regularized
        self._params.extend(attn_layer_norm.params + ff_layer_norm.params)

  # Register all weights/biases as parameters; only the weights are subject
  # to l2 regularization.
  self._params.extend(self._weights)
  self._params.extend(self._biases)
  self._regularized_weights.extend(self._weights)

  # Single output layer carrying the final transformer activations.
  self._layers.append(
      network_units.Layer(component, name='transformer_output',
                          dim=self._combined_filters))