class Discriminator(object):
    """LSTM followed by a single-unit Dense layer, with clipping constraints on the weights.

    Both sub-layers are built by hand in the constructor. Every trainable
    weight of the LSTM is given its own ``ClipConstraint(1e-2)``, and the
    Dense layer receives one through ``W_constraint``. The combined weights
    and constraints are exposed as ``self.weights`` and ``self.constraints``.
    """

    def __init__(self, x_k, n_steps, hidden_dim):
        """Create and build the sub-layers.

        Arguments:
            x_k: stored on the instance as ``self.x_k``; not otherwise used here.
            n_steps: sequence length; the LSTM is built for input shape
                ``(None, n_steps, 1)``.
            hidden_dim: number of LSTM units, and the input width the Dense
                layer is built for.
        """

        def make_clip():
            # A fresh constraint object for every weight that needs one.
            return ClipConstraint(1e-2)

        self.x_k = x_k
        self.hidden_dim = hidden_dim

        recurrent = LSTM(hidden_dim)
        recurrent.build((None, n_steps, 1))
        for weight in recurrent.trainable_weights:
            recurrent.constraints[weight] = make_clip()
        self.lstm = recurrent

        projection = Dense(1, W_constraint=make_clip())
        projection.build((None, hidden_dim))
        self.dense = projection

        self.weights = recurrent.trainable_weights + projection.trainable_weights

        # Start from a copy so the LSTM's own constraint dict is left untouched
        # when the Dense constraints are merged in.
        merged_constraints = recurrent.constraints.copy()
        merged_constraints.update(projection.constraints)
        self.constraints = merged_constraints

    def call(self, x):
        """Run ``x`` through the LSTM and then through the Dense layer."""
        hidden = self.lstm.call(x)
        return self.dense.call(hidden)
def call(self, x, **kwargs):
    """Apply ``Dense.call`` to ``x``.

    Extra keyword arguments are accepted so callers may pass them, but they
    are not forwarded to ``Dense.call``.
    """
    output = Dense.call(self, x)
    return output
class NSE(Layer):
    '''
    Simple Neural Semantic Encoder.

    The layer owns three sub-layers that are built and called by hand: an LSTM ``reader`` that
    returns sequences, a Dense ``composer`` and an LSTM ``writer``. The input sequence itself is
    used as the initial memory, and ``K.rnn`` drives ``compose_and_write_step`` over the reader's
    outputs to update that memory one timestep at a time.
    '''

    def __init__(self, output_dim, input_length=None, composer_activation='linear',
                 return_mode='last_output', weights=None, **kwargs):
        '''
        Arguments:
            output_dim (int)
            input_length (int)
            composer_activation (str): activation used in the MLP
            return_mode (str): One of last_output, all_outputs, output_and_memory
                This is analogous to the return_sequences flag in Keras' Recurrent.
                last_output returns only the last h_t
                all_outputs returns the whole sequence of h_ts
                output_and_memory returns the last output and the last memory concatenated
                    (needed if this layer is followed by a MMA-NSE)
            weights (list): Initial weights

        Raises:
            Exception: if return_mode is not one of the three recognized values.
        '''
        self.output_dim = output_dim
        self.input_dim = output_dim  # Equation 2 in the paper makes this assumption.
        self.initial_weights = weights
        self.input_spec = [InputSpec(ndim=3)]
        self.input_length = input_length
        self.composer_activation = composer_activation
        super(NSE, self).__init__(**kwargs)
        self.reader = LSTM(self.output_dim, return_sequences=True,
                           name="{}_reader".format(self.name))
        # TODO: Let the writer use parameter dropout and any consume_less mode.
        # Setting dropout to 0 here to eliminate the need for constants.
        # Setting consume_less to mem to eliminate need for preprocessing
        self.writer = LSTM(self.output_dim, dropout_W=0.0, dropout_U=0.0, consume_less="mem",
                           name="{}_writer".format(self.name))
        self.composer = Dense(self.output_dim * 2, activation=self.composer_activation,
                              name="{}_composer".format(self.name))
        if return_mode not in ["last_output", "all_outputs", "output_and_memory"]:
            raise Exception("Unrecognized return mode: %s" % (return_mode))
        self.return_mode = return_mode

    def get_output_shape_for(self, input_shape):
        '''Return the output shape implied by ``return_mode`` for the given input shape.'''
        input_length = input_shape[1]
        if self.return_mode == "last_output":
            return (input_shape[0], self.output_dim)
        elif self.return_mode == "all_outputs":
            return (input_shape[0], input_length, self.output_dim)
        else:
            # return_mode is output_and_memory. Output will be concatenated to memory.
            return (input_shape[0], input_length + 1, self.output_dim)

    def compute_mask(self, input, mask):
        '''Return the mask that accompanies the output for the current ``return_mode``.'''
        if mask is None or self.return_mode == "last_output":
            return None
        elif self.return_mode == "all_outputs":
            return mask  # (batch_size, input_length)
        else:
            # Return mode is output_and_memory
            # Mask memory corresponding to all the inputs that are masked, and do not mask the output
            # (batch_size, input_length + 1)
            # NOTE(review): the slot prepended for the output is filled with zeros. If 0 means
            # "masked" for the consumer of this mask, that contradicts the comment above and
            # ones would be needed instead -- confirm against the downstream layer.
            return K.cast(K.concatenate([K.zeros_like(mask[:, :1]), mask]), 'uint8')

    def get_composer_input_shape(self, input_shape):
        '''Shape the composer is built for.'''
        # Takes concatenation of output and memory summary
        return (input_shape[0], self.output_dim * 2)

    def get_reader_input_shape(self, input_shape):
        '''Shape the reader is built for; the layer input shape, unchanged.'''
        return input_shape

    def build(self, input_shape):
        '''
        Build the reader, writer and composer, and expose their weights as this layer's
        trainable weights. The three sub-layer input shapes are logged to stderr.
        '''
        self.input_spec = [InputSpec(shape=input_shape)]
        assert self.reader.return_sequences, "The reader has to return sequences!"
        reader_input_shape = self.get_reader_input_shape(input_shape)
        # Will process one timestep at a time
        writer_input_shape = (input_shape[0], 1, self.output_dim * 2)
        composer_input_shape = self.get_composer_input_shape(input_shape)
        # sys.stderr.write is used instead of the Python 2-only ``print >>`` statement so the
        # same message is produced on both Python 2 and Python 3.
        sys.stderr.write("NSE reader input shape: %s\n" % (reader_input_shape,))
        sys.stderr.write("NSE writer input shape: %s\n" % (writer_input_shape,))
        sys.stderr.write("NSE composer input shape: %s\n" % (composer_input_shape,))
        self.reader.build(reader_input_shape)
        self.writer.build(writer_input_shape)
        self.composer.build(composer_input_shape)

        # Aggregate weights of individual components for this layer.
        reader_weights = self.reader.trainable_weights
        writer_weights = self.writer.trainable_weights
        composer_weights = self.composer.trainable_weights
        self.trainable_weights = reader_weights + writer_weights + composer_weights

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def read(self, nse_input, input_mask=None):
        '''
        This method produces the 'read' output (equation 1 in the paper) for all timesteps
        and initializes the memory slot mem_0.

        Input: nse_input (batch_size, input_length, input_dim)
        Outputs:
            o (batch_size, input_length, output_dim)
            [flattened_mem_0]: single-element list, (batch_size, input_length * output_dim)
            o_mask: the reader's mask for ``o``

        While this method simply copies input to mem_0, variants that inherit from this class
        can do something fancier.
        '''
        input_to_read = nse_input
        mem_0 = input_to_read
        flattened_mem_0 = K.batch_flatten(mem_0)
        o = self.reader.call(input_to_read, input_mask)
        o_mask = self.reader.compute_mask(input_to_read, input_mask)
        return o, [flattened_mem_0], o_mask

    @staticmethod
    def summarize_memory(o_t, mem_tm1):
        '''
        This method selects the relevant parts of the memory given the read output and summarizes
        the memory. Implements Equations 2-3 or 8-11 in the paper.
        '''
        # Selecting relevant memory slots, Equation 2
        z_t = K.softmax(K.sum(K.expand_dims(o_t, dim=1) * mem_tm1, axis=2))  # (batch_size, input_length)
        # Summarizing memory, Equation 3
        m_rt = K.sum(K.expand_dims(z_t, dim=2) * mem_tm1, axis=1)  # (batch_size, output_dim)
        return z_t, m_rt

    def compose_memory_and_output(self, output_memory_list):
        '''
        This method takes a list of tensors and applies the composition function on their
        concatenation. Implements equation 4 or 12 in the paper.
        '''
        # Composition, Equation 4. The composer is a Dense layer with output_dim * 2 units, so
        # c_t is (batch_size, output_dim * 2).
        c_t = self.composer.call(K.concatenate(output_memory_list))
        return c_t

    def update_memory(self, z_t, h_t, mem_tm1):
        '''
        This method takes the attention vector (z_t), writer output (h_t) and previous timestep's
        memory (mem_tm1) and updates the memory. Implements equations 6, 14 or 15.
        '''
        # NOTE(review): the tile multiples below are bare ints, not per-axis tuples
        # (``(self.output_dim)`` is just ``self.output_dim``). Whether that tiles the intended
        # axis depends on the backend's ``tile`` -- confirm before changing.
        tiled_z_t = K.tile(K.expand_dims(z_t), (self.output_dim))  # (batch_size, input_length, output_dim)
        input_length = K.shape(mem_tm1)[1]
        # (batch_size, input_length, output_dim)
        tiled_h_t = K.permute_dimensions(K.tile(K.expand_dims(h_t), (input_length)), (0, 2, 1))
        # Updating memory. First term in summation corresponds to selective forgetting and the
        # second term to selective addition. Equation 6.
        mem_t = mem_tm1 * (1 - tiled_z_t) + tiled_h_t * tiled_z_t  # (batch_size, input_length, output_dim)
        return mem_t

    def compose_and_write_step(self, o_t, states):
        '''
        This method is a step function that updates the memory at each time step and produces
        a new output vector (Equations 2 to 6 in the paper).
        The memory_state is flattened because K.rnn requires all states to be of the same shape
        as the output, because it uses the same mask for the output and the states.

        Inputs:
            o_t (batch_size, output_dim)
            states (list[Tensor])
                flattened_mem_tm1 (batch_size, input_length * output_dim)
                writer_h_tm1 (batch_size, output_dim)
                writer_c_tm1 (batch_size, output_dim)

        Outputs:
            h_t (batch_size, output_dim)
            [flattened_mem_t, h_t, writer_c_t]: the states for the next step
        '''
        flattened_mem_tm1, writer_h_tm1, writer_c_tm1 = states
        input_mem_shape = K.shape(flattened_mem_tm1)
        # Floor division keeps the recovered input_length an integer regardless of whether
        # ``/`` means true division in the running interpreter.
        mem_tm1_shape = (input_mem_shape[0], input_mem_shape[1] // self.output_dim, self.output_dim)
        mem_tm1 = K.reshape(flattened_mem_tm1, mem_tm1_shape)  # (batch_size, input_length, output_dim)
        z_t, m_rt = self.summarize_memory(o_t, mem_tm1)
        c_t = self.compose_memory_and_output([o_t, m_rt])
        # Collecting the necessary variables to directly call writer's step function.
        writer_constants = self.writer.get_constants(c_t)  # returns dropouts for W and U (all 1s, see init)
        writer_states = [writer_h_tm1, writer_c_tm1] + writer_constants
        # Making a call to writer's step function, Equation 5
        h_t, [_, writer_c_t] = self.writer.step(c_t, writer_states)  # h_t, writer_c_t: (batch_size, output_dim)
        mem_t = self.update_memory(z_t, h_t, mem_tm1)
        flattened_mem_t = K.batch_flatten(mem_t)
        return h_t, [flattened_mem_t, h_t, writer_c_t]

    def call(self, x, mask=None):
        '''
        Read the input, run the compose-and-write recurrence over the read outputs, and return
        the result selected by ``return_mode``.
        '''
        # input_shape = (batch_size, input_length, input_dim). This needs to be defined in build.
        read_output, initial_memory_states, output_mask = self.read(x, mask)
        initial_write_states = self.writer.get_initial_states(read_output)  # h_0 and c_0 of the writer LSTM
        initial_states = initial_memory_states + initial_write_states
        # last_output: (batch_size, output_dim)
        # all_outputs: (batch_size, input_length, output_dim)
        # last_states:
        #       flattened last memory: (batch_size, input_length * output_dim)
        #       last_output
        #       last_writer_ct
        last_output, all_outputs, last_states = K.rnn(self.compose_and_write_step, read_output,
                                                      initial_states, mask=output_mask)
        if self.return_mode == "last_output":
            return last_output
        elif self.return_mode == "all_outputs":
            return all_outputs
        else:
            # return mode is output_and_memory
            # The step function keeps the memory flattened, so restore its
            # (batch_size, input_length, output_dim) shape before concatenating it with the
            # rank-3 output; this matches the shape promised by get_output_shape_for.
            flattened_last_memory = last_states[0]
            flat_shape = K.shape(flattened_last_memory)
            last_memory = K.reshape(
                flattened_last_memory,
                (flat_shape[0], flat_shape[1] // self.output_dim, self.output_dim))
            expanded_last_output = K.expand_dims(last_output, dim=1)  # (batch_size, 1, output_dim)
            # (batch_size, 1+input_length, output_dim)
            return K.concatenate([expanded_last_output, last_memory], axis=1)

    def get_config(self):
        '''Return the layer config; entries from the base config win on key collisions.'''
        config = {
            'output_dim': self.output_dim,
            'input_length': self.input_length,
            'composer_activation': self.composer_activation,
            'return_mode': self.return_mode,
        }
        base_config = super(NSE, self).get_config()
        config.update(base_config)
        return config
class CNNEncoder(Layer):
    '''
    CNNEncoder is a combination of multiple convolution layers and max pooling layers. This is
    defined as a single layer to be consistent with the other encoders in terms of input and
    output specifications. The input to this "layer" is of shape (batch_size, num_words,
    embedding_size) and the output is of size (batch_size, output_dim).

    The CNN has one convolution layer per each ngram filter size. Each convolution operation gives
    out a vector of size num_filters. The number of times a convolution layer will be used depends
    on the ngram size: input_length - ngram_size + 1. The corresponding maxpooling layer aggregates
    all these outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently the dimensionality of
    the output after maxpooling is len(ngram_filter_sizes) * num_filters.

    We then use a fully connected layer to project in back to the desired output_dim. For more
    details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
    Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.

    Keyword arguments (all read from ``kwargs``):
        num_filters (required), output_dim (required), ngram_filter_sizes (default (2, 3, 4, 5)),
        conv_layer_activation (default 'relu'), l1_regularization and l2_regularization
        (default None, meaning no penalty).
    '''

    def __init__(self, weights=None, **kwargs):
        self.supports_masking = True

        # This is the output dim for each convolutional layer, which is the same as the number of
        # "filters" learned by that layer.
        self.num_filters = kwargs.pop('num_filters')

        # This specifies both the number of convolutional layers we will create and their sizes.
        # Must be a List[int]. The default of (2, 3, 4, 5) will have four convolutional layers,
        # corresponding to encoding ngrams of size 2 to 5 with some number of filters.
        ngram_filter_sizes = kwargs.pop('ngram_filter_sizes', (2, 3, 4, 5))
        self.ngram_filter_sizes = ngram_filter_sizes

        self.output_dim = kwargs.pop('output_dim')

        conv_layer_activation = kwargs.pop('conv_layer_activation', 'relu')
        self.conv_layer_activation = conv_layer_activation

        self.l1_regularization = kwargs.pop("l1_regularization", None)
        self.l2_regularization = kwargs.pop("l2_regularization", None)
        # A coefficient left at None is handed to l1l2 as 0.0 (no penalty) rather than as None,
        # which the regularizer is not guaranteed to treat as "off".
        self.regularizer = lambda: l1l2(l1=self.l1_regularization or 0.0,
                                        l2=self.l2_regularization or 0.0)

        # These are member variables that will be defined during self.build().
        self.convolution_layers = None
        self.max_pooling_layers = None
        self.projection_layer = None

        self.input_spec = [InputSpec(ndim=3)]
        self.initial_weights = weights
        super(CNNEncoder, self).__init__(**kwargs)

    def build(self, input_shape):
        '''Create and build one convolution/max-pooling pair per ngram size, plus the projection.'''
        input_length = input_shape[1]  # number of words
        # We define convolution, maxpooling and dense layers first.
        self.convolution_layers = [Convolution1D(nb_filter=self.num_filters,
                                                 filter_length=ngram_size,
                                                 activation=self.conv_layer_activation,
                                                 W_regularizer=self.regularizer(),
                                                 b_regularizer=self.regularizer())
                                   for ngram_size in self.ngram_filter_sizes]
        # Pooling over the full convolution output length leaves a single timestep per filter.
        self.max_pooling_layers = [MaxPooling1D(pool_length=input_length - ngram_size + 1)
                                   for ngram_size in self.ngram_filter_sizes]
        self.projection_layer = Dense(self.output_dim)
        # Building all layers because these sub-layers are not explicitly part of the
        # computational graph.
        for convolution_layer, max_pooling_layer in zip(self.convolution_layers,
                                                        self.max_pooling_layers):
            convolution_layer.build(input_shape)
            max_pooling_layer.build(convolution_layer.get_output_shape_for(input_shape))
        maxpool_output_dim = self.num_filters * len(self.ngram_filter_sizes)
        projection_input_shape = (input_shape[0], maxpool_output_dim)
        self.projection_layer.build(projection_input_shape)
        # Defining the weights of this "layer" as the set of weights from all convolution
        # and maxpooling layers.
        self.trainable_weights = []
        for layer in self.convolution_layers + self.max_pooling_layers + [self.projection_layer]:
            self.trainable_weights.extend(layer.trainable_weights)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

        super(CNNEncoder, self).build(input_shape)

    def call(self, x, mask=None):
        '''Convolve, max-pool and flatten per ngram size, concatenate, then project.'''
        # Each convolution layer returns output of size (samples, pool_length, num_filters),
        #       where pool_length = num_words - ngram_size + 1
        # Each maxpooling layer returns output of size (samples, 1, num_filters).
        # We need to flatten to remove the second dimension of length 1 from the maxpooled output.
        filter_outputs = [K.batch_flatten(max_pooling_layer.call(convolution_layer.call(x, mask)))
                          for max_pooling_layer, convolution_layer in zip(self.max_pooling_layers,
                                                                          self.convolution_layers)]
        # With a single ngram size there is nothing to concatenate.
        maxpool_output = merge(filter_outputs, mode='concat') if len(filter_outputs) > 1 \
            else filter_outputs[0]
        return self.projection_layer.call(maxpool_output)

    def get_output_shape_for(self, input_shape):
        '''Output is always (batch_size, output_dim).'''
        return (input_shape[0], self.output_dim)

    def compute_mask(self, input, input_mask=None):  # pylint: disable=redefined-builtin
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    def get_config(self):
        '''Return the layer config; entries from the base config win on key collisions.'''
        config = {
            "output_dim": self.output_dim,
            "num_filters": self.num_filters,
            "ngram_filter_sizes": self.ngram_filter_sizes,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization,
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config
class CNNEncoder(MaskedLayer):
    '''
    CNNEncoder is a combination of multiple convolution layers and max pooling layers. This is
    defined as a single layer to be consistent with the other encoders in terms of input and
    output specifications. The input to this "layer" is of shape (batch_size, num_words,
    embedding_dim) and the output is of size (batch_size, output_dim).

    The CNN has one convolution layer per each ngram filter size. Each convolution operation gives
    out a vector of size num_filters. The number of times a convolution layer will be used depends
    on the ngram size: input_length - ngram_size + 1. The corresponding maxpooling layer aggregates
    all these outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently the dimensionality of
    the output after maxpooling is len(ngram_filter_sizes) * num_filters.

    We then use a fully connected layer to project in back to the desired output_dim. For more
    details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
    Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.

    Parameters
    ----------
    units: int
        After doing convolutions, we'll project the collected features into a vector of this size.
        This used to be ``output_dim``, but Keras changed it to ``units``. I prefer the name
        ``output_dim``, so we'll leave the code using ``output_dim``, and just use the name
        ``units`` in the external API.
    num_filters: int
        This is the output dim for each convolutional layer, which is the same as the number of
        "filters" learned by that layer.
    ngram_filter_sizes: Tuple[int], optional (default=(2, 3, 4, 5))
        This specifies both the number of convolutional layers we will create and their sizes. The
        default of (2, 3, 4, 5) will have four convolutional layers, corresponding to encoding
        ngrams of size 2 to 5 with some number of filters.
    conv_layer_activation: str, optional (default='relu')
        Activation passed to every convolution layer.
    l1_regularization: float, optional (default=None)
        L1 coefficient for the convolution kernels and biases; ``None`` means no L1 penalty.
    l2_regularization: float, optional (default=None)
        L2 coefficient for the convolution kernels and biases; ``None`` means no L2 penalty.
    '''

    def __init__(self,
                 units: int,
                 num_filters: int,
                 ngram_filter_sizes: Tuple[int] = (2, 3, 4, 5),
                 conv_layer_activation: str = 'relu',
                 l1_regularization: float = None,
                 l2_regularization: float = None,
                 **kwargs):
        self.num_filters = num_filters
        self.ngram_filter_sizes = ngram_filter_sizes
        self.output_dim = units
        self.conv_layer_activation = conv_layer_activation
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization
        # A coefficient left at None is handed to l1_l2 as 0.0 (no penalty) rather than as None,
        # which the regularizer is not guaranteed to treat as "off".
        self.regularizer = lambda: l1_l2(l1=self.l1_regularization or 0.0,
                                         l2=self.l2_regularization or 0.0)

        # These are member variables that will be defined during self.build().
        self.convolution_layers = None
        self.max_pooling_layers = None
        self.projection_layer = None

        self.input_spec = [InputSpec(ndim=3)]
        super(CNNEncoder, self).__init__(**kwargs)

    @overrides
    def build(self, input_shape):
        input_length = input_shape[1]  # number of words
        # We define convolution, maxpooling and dense layers first.
        self.convolution_layers = [Convolution1D(filters=self.num_filters,
                                                 kernel_size=ngram_size,
                                                 activation=self.conv_layer_activation,
                                                 kernel_regularizer=self.regularizer(),
                                                 bias_regularizer=self.regularizer())
                                   for ngram_size in self.ngram_filter_sizes]
        # ``pool_size`` is the Keras 2 name for the window argument, in line with the
        # ``filters`` / ``kernel_size`` names used above. Pooling over the full convolution
        # output length leaves a single timestep per filter.
        self.max_pooling_layers = [MaxPooling1D(pool_size=input_length - ngram_size + 1)
                                   for ngram_size in self.ngram_filter_sizes]
        self.projection_layer = Dense(self.output_dim)
        # Building all layers because these sub-layers are not explicitly part of the
        # computational graph.
        for convolution_layer, max_pooling_layer in zip(self.convolution_layers,
                                                        self.max_pooling_layers):
            with K.name_scope(convolution_layer.name):
                convolution_layer.build(input_shape)
            with K.name_scope(max_pooling_layer.name):
                max_pooling_layer.build(convolution_layer.compute_output_shape(input_shape))
        maxpool_output_dim = self.num_filters * len(self.ngram_filter_sizes)
        projection_input_shape = (input_shape[0], maxpool_output_dim)
        with K.name_scope(self.projection_layer.name):
            self.projection_layer.build(projection_input_shape)
        # Defining the weights of this "layer" as the set of weights from all convolution
        # and maxpooling layers.
        self.trainable_weights = []
        for layer in self.convolution_layers + self.max_pooling_layers + [self.projection_layer]:
            self.trainable_weights.extend(layer.trainable_weights)

        super(CNNEncoder, self).build(input_shape)

    @overrides
    def call(self, inputs, mask=None):  # pylint: disable=unused-argument
        # Each convolution layer returns output of size (samples, pool_length, num_filters),
        #       where pool_length = num_words - ngram_size + 1
        # Each maxpooling layer returns output of size (samples, 1, num_filters).
        # We need to flatten to remove the second dimension of length 1 from the maxpooled output.
        # TODO(matt): we need to use a convolutional layer here that supports masking.
        filter_outputs = [K.batch_flatten(max_pooling_layer.call(convolution_layer.call(inputs)))
                          for max_pooling_layer, convolution_layer in zip(self.max_pooling_layers,
                                                                          self.convolution_layers)]
        if K.backend() == 'theano':
            # Just using the `call` method on layers does not set the _keras_shape, which is
            # necessary with the theano backend.  So we set it manually here to what we expect the
            # shape to be.
            for filter_output in filter_outputs:
                filter_output._keras_shape = (None, self.num_filters)  # pylint: disable=protected-access
        # With a single ngram size there is nothing to concatenate.
        maxpool_output = Concatenate()(filter_outputs) if len(filter_outputs) > 1 \
            else filter_outputs[0]
        return self.projection_layer.call(maxpool_output)

    @overrides
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)

    @overrides
    def compute_mask(self, inputs, mask=None):
        # pylint: disable=unused-argument
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    @overrides
    def get_config(self):
        # Entries from the base config win on key collisions.
        config = {
            "units": self.output_dim,
            "num_filters": self.num_filters,
            "ngram_filter_sizes": self.ngram_filter_sizes,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization,
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config
class NSE(Layer):
    '''
    Simple Neural Semantic Encoder.

    The layer owns three sub-layers that are built and stepped by hand: an LSTM ``reader``, a
    Dense ``composer`` and an LSTM ``writer``. The input sequence itself is used as the initial
    memory, and ``K.rnn`` drives ``step`` over the raw input, running the reader, composer and
    writer once per timestep. Intermediate tensors are passed through ``TF_PRINT`` for debugging.
    '''

    def __init__(self, output_dim, input_length=None, composer_activation='linear',
                 return_mode='last_output', weights=None, **kwargs):
        '''
        Arguments:
            output_dim (int)
            input_length (int)
            composer_activation (str): activation used in the MLP
            return_mode (str): One of last_output, all_outputs, output_and_memory
                This is analogous to the return_sequences flag in Keras' Recurrent.
                last_output returns only the last h_t
                all_outputs returns the whole sequence of h_ts
                output_and_memory returns the last output and the last memory concatenated
                    (needed if this layer is followed by a MMA-NSE)
            weights (list): Initial weights

        Raises:
            Exception: if return_mode is not one of the three recognized values.
        '''
        self.output_dim = output_dim
        self.input_dim = output_dim  # Equation 2 in the paper makes this assumption.
        self.initial_weights = weights
        self.input_spec = [InputSpec(ndim=3)]
        self.input_length = input_length
        self.composer_activation = composer_activation
        super(NSE, self).__init__(**kwargs)
        self.reader = LSTM(self.output_dim, dropout_W=0.0, dropout_U=0.0, consume_less="gpu",
                           name="{}_reader".format(self.name))
        # TODO: Let the writer use parameter dropout and any consume_less mode.
        # Setting dropout to 0 here to eliminate the need for constants.
        # Setting consume_less to gpu to eliminate need for preprocessing
        self.writer = LSTM(self.output_dim, dropout_W=0.0, dropout_U=0.0, consume_less="gpu",
                           name="{}_writer".format(self.name))
        self.composer = Dense(self.output_dim * 2, activation=self.composer_activation,
                              name="{}_composer".format(self.name))
        if return_mode not in ["last_output", "all_outputs", "output_and_memory"]:
            raise Exception("Unrecognized return mode: %s" % (return_mode))
        print("vj golden NSE.__init__ return_mode is {}".format(return_mode))
        self.return_mode = return_mode

    def get_output_shape_for(self, input_shape):
        '''Return the output shape implied by ``return_mode`` for the given input shape.'''
        input_length = input_shape[1]
        if self.return_mode == "last_output":
            return (input_shape[0], self.output_dim)
        elif self.return_mode == "all_outputs":
            return (input_shape[0], input_length, self.output_dim)
        else:
            # return_mode is output_and_memory. Output will be concatenated to memory.
            return (input_shape[0], input_length + 1, self.output_dim)

    def compute_mask(self, input, mask):
        '''Return the mask that accompanies the output for the current ``return_mode``.'''
        if mask is None or self.return_mode == "last_output":
            return None
        elif self.return_mode == "all_outputs":
            return mask  # (batch_size, input_length)
        else:
            # Return mode is output_and_memory
            # Mask memory corresponding to all the inputs that are masked, and do not mask the output
            # (batch_size, input_length + 1)
            # NOTE(review): the slot prepended for the output is filled with zeros. If 0 means
            # "masked" for the consumer of this mask, that contradicts the comment above and
            # ones would be needed instead -- confirm against the downstream layer.
            return K.cast(K.concatenate([K.zeros_like(mask[:, :1]), mask]), 'uint8')

    def get_composer_input_shape(self, input_shape):
        '''Shape the composer is built for.'''
        # Takes concatenation of output and memory summary
        return (input_shape[0], self.output_dim * 2)

    def get_reader_input_shape(self, input_shape):
        '''Shape the reader is built for; the layer input shape, unchanged.'''
        return input_shape

    def build(self, input_shape):
        '''
        Build the reader, writer and composer, and expose their weights as this layer's
        trainable weights. The three sub-layer input shapes are logged to stderr.
        '''
        self.input_spec = [InputSpec(shape=input_shape)]
        reader_input_shape = self.get_reader_input_shape(input_shape)
        # Will process one timestep at a time
        writer_input_shape = (input_shape[0], 1, self.output_dim * 2)
        composer_input_shape = self.get_composer_input_shape(input_shape)
        # sys.stderr.write is used instead of the Python 2-only ``print >>`` statement so the
        # same message is produced on both Python 2 and Python 3.
        sys.stderr.write("NSE reader input shape: %s\n" % (reader_input_shape,))
        sys.stderr.write("NSE writer input shape: %s\n" % (writer_input_shape,))
        sys.stderr.write("NSE composer input shape: %s\n" % (composer_input_shape,))
        self.reader.build(reader_input_shape)
        self.writer.build(writer_input_shape)
        self.composer.build(composer_input_shape)

        # Aggregate weights of individual components for this layer.
        reader_weights = self.reader.trainable_weights
        writer_weights = self.writer.trainable_weights
        composer_weights = self.composer.trainable_weights
        self.trainable_weights = reader_weights + writer_weights + composer_weights

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def get_initial_states(self, nse_input, input_mask=None):
        '''
        This method produces the reader's initial states and initializes the memory slot mem_0.

        Input: nse_input (batch_size, input_length, input_dim)
        Output: list[Tensors]:
                h_0 (batch_size, output_dim)
                c_0 (batch_size, output_dim)
                flattened_mem_0 (batch_size, input_length * output_dim)

        ``input_mask`` is accepted but not used here.
        While this method simply copies input to mem_0, variants that inherit from this class
        can do something fancier.
        '''
        input_to_read = nse_input
        mem_0 = input_to_read
        flattened_mem_0 = K.batch_flatten(mem_0)
        flattened_mem_0 = TF_PRINT(flattened_mem_0, "get_initial_states.flattened_mem_0",
                                   expected_shape=[BATCH, LENGTH * DIM])
        initial_states = self.reader.get_initial_states(nse_input)
        initial_states += [flattened_mem_0]
        return initial_states

    @staticmethod
    def summarize_memory(o_t, mem_tm1):
        '''
        This method selects the relevant parts of the memory given the read output and summarizes
        the memory. Implements Equations 2-3 or 8-11 in the paper.
        '''
        # Selecting relevant memory slots, Equation 2
        z_t = K.softmax(K.sum(K.expand_dims(o_t, dim=1) * mem_tm1, axis=2))  # (batch_size, input_length)
        z_t = TF_PRINT(z_t, "summarize_memory.z_t", expected_shape=[BATCH, LENGTH])
        # Summarizing memory, Equation 3
        m_rt = K.sum(K.expand_dims(z_t, dim=2) * mem_tm1, axis=1)  # (batch_size, output_dim)
        m_rt = TF_PRINT(m_rt, "summarize_memory.m_rt", expected_shape=[BATCH, DIM])
        return z_t, m_rt

    def compose_memory_and_output(self, output_memory_list):
        '''
        This method takes a list of tensors and applies the composition function on their
        concatenation. Implements equation 4 or 12 in the paper.
        '''
        # Composition, Equation 4
        # NOTE(review): the composer is a Dense layer with output_dim * 2 units, while the debug
        # call below declares [BATCH, DIM] -- confirm how DIM relates to output_dim.
        c_t = self.composer.call(K.concatenate(output_memory_list))
        c_t = TF_PRINT(c_t, "compose_memory_and_output.c_t", expected_shape=[BATCH, DIM])
        return c_t

    def update_memory(self, z_t, h_t, mem_tm1):
        '''
        This method takes the attention vector (z_t), writer output (h_t) and previous timestep's
        memory (mem_tm1) and updates the memory. Implements equations 6, 14 or 15.
        '''
        # Alternative kept for reference, written assuming the equations in the paper are
        # implemented exactly as they are written:
        #   tiled_z_t_trans = K.tile(K.expand_dims(z_t, 1), [1, self.output_dim, 1])
        #   input_length = K.shape(mem_tm1)[1]
        #   tiled_h_t = K.tile(K.expand_dims(h_t, -1), [1, 1, input_length])
        #   mem_t = mem_tm1 * (1 - tiled_z_t_trans) + tiled_h_t * tiled_z_t_trans
        #
        # The code below instead assumes that mem_t is the transpose of what is in the paper,
        # implemented by wrapping a K.permute_dimensions(_, (0, 2, 1)) call around each value.
        tiled_z_t = K.permute_dimensions(
            K.tile(K.expand_dims(z_t, 1), [1, self.output_dim, 1]),
            (0, 2, 1))  # (batch_size, input_length, output_dim)
        input_length = K.shape(mem_tm1)[1]
        # (batch_size, input_length, output_dim)
        tiled_h_t = K.permute_dimensions(
            K.tile(K.expand_dims(h_t, -1), [1, 1, input_length]),
            (0, 2, 1))
        # Updating memory. First term in summation corresponds to selective forgetting and the
        # second term to selective addition. Equation 6.
        mem_t = mem_tm1 * (1 - tiled_z_t) + tiled_h_t * tiled_z_t  # (batch_size, input_length, output_dim)
        mem_t = TF_PRINT(mem_t, "update_memory.mem_t", expected_shape=[BATCH, LENGTH, DIM])
        return mem_t

    @staticmethod
    def split_states(states):
        # This method is a helper for the step function to split the states into reader states,
        # memory and writer states.
        return states[:2], states[2], states[3:]

    def step(self, input_t, states):
        '''
        This method is a step function that updates the memory at each time step and produces
        a new output vector (Equations 1 to 6 in the paper).
        The memory_state is flattened because K.rnn requires all states to be of the same shape
        as the output, because it uses the same mask for the output and the states.

        Inputs:
            input_t (batch_size, input_dim)
            states (sequence of tensors), in the order consumed by ``split_states``:
                reader_h_tm1 (batch_size, output_dim)
                reader_c_tm1 (batch_size, output_dim)
                flattened_mem_tm1 (batch_size, input_length * output_dim)
                writer_h_tm1 (batch_size, output_dim)
                writer_c_tm1 (batch_size, output_dim)

        Outputs:
            h_t (batch_size, output_dim)
            [o_t, reader_c_t, flattened_mem_t, h_t, writer_c_t]: the states for the next step
        '''
        input_t = TF_PRINT(input_t, "step.input_t", expected_shape=[BATCH, DIM])
        reader_states, flattened_mem_tm1, writer_states = self.split_states(states)
        input_mem_shape = K.shape(flattened_mem_tm1)
        # Floor division keeps the recovered input_length an integer regardless of whether
        # ``/`` means true division in the running interpreter.
        mem_tm1_shape = (input_mem_shape[0], input_mem_shape[1] // self.output_dim, self.output_dim)
        mem_tm1 = K.reshape(flattened_mem_tm1, mem_tm1_shape)  # (batch_size, input_length, output_dim)
        mem_tm1 = TF_PRINT(mem_tm1, "step.mem_tm1", expected_shape=[BATCH, LENGTH, DIM])
        reader_constants = self.reader.get_constants(input_t)  # Does not depend on input_t, see init.
        reader_states = reader_states[:2] + tuple(reader_constants) + reader_states[2:]
        o_t, [_, reader_c_t] = self.reader.step(input_t, reader_states)  # o_t, reader_c_t: (batch_size, output_dim)
        o_t = TF_PRINT(o_t, "step.o_t", expected_shape=[BATCH, DIM])
        reader_c_t = TF_PRINT(reader_c_t, "step.reader_c_t", expected_shape=[BATCH, DIM])
        z_t, m_rt = self.summarize_memory(o_t, mem_tm1)
        c_t = self.compose_memory_and_output([o_t, m_rt])
        # Collecting the necessary variables to directly call writer's step function.
        writer_constants = self.writer.get_constants(c_t)  # returns dropouts for W and U (all 1s, see init)
        writer_states += tuple(writer_constants)
        # Making a call to writer's step function, Equation 5
        h_t, [_, writer_c_t] = self.writer.step(c_t, writer_states)  # h_t, writer_c_t: (batch_size, output_dim)
        h_t = TF_PRINT(h_t, "step.h_t", expected_shape=[BATCH, DIM])
        writer_c_t = TF_PRINT(writer_c_t, "step.writer_c_t", expected_shape=[BATCH, DIM])
        mem_t = self.update_memory(z_t, h_t, mem_tm1)
        flattened_mem_t = K.batch_flatten(mem_t)
        flattened_mem_t = TF_PRINT(flattened_mem_t, "step.flattened_mem_t",
                                   expected_shape=[BATCH, LENGTH * DIM])
        return h_t, [o_t, reader_c_t, flattened_mem_t, h_t, writer_c_t]

    def loop(self, x, initial_states, mask):
        # This is a separate method because Ontoaware variants will have to override this to make
        # a call to changingdim rnn.
        last_output, all_outputs, last_states = K.rnn(self.step, x, initial_states, mask=mask)
        last_output = TF_PRINT(last_output, "loop.last_output")
        all_outputs = TF_PRINT(all_outputs, "loop.all_outputs")
        return last_output, all_outputs, last_states

    def call(self, x, mask=None):
        '''
        Run the read/compose/write recurrence over ``x`` and return the result selected by
        ``return_mode``. Any incoming mask is reported and then discarded.
        '''
        # input_shape = (batch_size, input_length, input_dim). This needs to be defined in build.
        # Identity check: ``!=`` on a tensor is not a reliable None test.
        if mask is not None:
            print("vj golden call.mask ={}. Being set to None.".format(mask))
            mask = None
        initial_read_states = self.get_initial_states(x, mask)
        fake_writer_input = K.expand_dims(initial_read_states[0], dim=1)  # (batch_size, 1, output_dim)
        fake_writer_input = TF_PRINT(fake_writer_input, "call.fake_writer_input",
                                     expected_shape=[BATCH, 1, DIM])
        initial_write_states = self.writer.get_initial_states(fake_writer_input)  # h_0 and c_0 of the writer LSTM
        initial_states = initial_read_states + initial_write_states
        # last_output: (batch_size, output_dim)
        # all_outputs: (batch_size, input_length, output_dim)
        # last_states, in the order returned by step():
        #       last reader output and cell state
        #       flattened last memory: (batch_size, input_length * output_dim)
        #       last writer output and cell state
        last_output, all_outputs, last_states = self.loop(x, initial_states, mask)
        if self.return_mode == "last_output":
            return last_output
        elif self.return_mode == "all_outputs":
            return all_outputs
        else:
            # return mode is output_and_memory
            # The memory is the third state (index 0 is the reader output) and is kept flattened
            # by step(), so pick it with split_states and restore its
            # (batch_size, input_length, output_dim) shape before concatenating.
            _, flattened_last_memory, _ = self.split_states(last_states)
            flat_shape = K.shape(flattened_last_memory)
            last_memory = K.reshape(
                flattened_last_memory,
                (flat_shape[0], flat_shape[1] // self.output_dim, self.output_dim))
            expanded_last_output = K.expand_dims(last_output, dim=1)  # (batch_size, 1, output_dim)
            # NOTE(review): these two calls previously passed ``expected_size=``; every other
            # TF_PRINT call in this class uses ``expected_shape=``, so they are aligned here --
            # confirm against TF_PRINT's signature.
            expanded_last_output = TF_PRINT(expanded_last_output, "call.expanded_last_output",
                                            expected_shape=[BATCH, 1, DIM])
            # (batch_size, 1+input_length, output_dim)
            result = K.concatenate([expanded_last_output, last_memory], axis=1)
            result = TF_PRINT(result, "call.result", expected_shape=[BATCH, 1 + LENGTH, DIM])
            return result

    def get_config(self):
        '''Return the layer config; entries from the base config win on key collisions.'''
        config = {
            'output_dim': self.output_dim,
            'input_length': self.input_length,
            'composer_activation': self.composer_activation,
            'return_mode': self.return_mode,
        }
        base_config = super(NSE, self).get_config()
        config.update(base_config)
        return config
class AttentionTransformer(Layer):
    """
    Keras implementation of the multihead attention layers in tensorflow,
    adapted from https://github.com/Kyubyong/transformer

    3 inputs - queries, keys, values (in this order)
    generally: [batch size; length of sequence; features vector]
        queries: A 3d tensor with shape of [N_batches, T_q, C_q].
        keys: A 3d tensor with shape of [N_batches, T_k, C_k].
        values: A 3d tensor with shape of [N_batches, T_v, C_v].
    if called with one input, assumes keys=queries=values as in attention is all you need.
    """

    def __init__(self,
                 usesoftmax=True,
                 num_units=None,
                 num_heads=8,
                 dropout_rate=0,
                 activation='relu',
                 causality=False,
                 usequerymasks=True,
                 **kwargs):
        """Store the hyper-parameters; sub-layers are created in `build`.

        Args:
        ----
          usesoftmax: Boolean. Apply softmax to the scaled attention scores.
          num_units: Output size of the Q/K/V projections. If None, `build`
            sets it to the last dimension of the queries shape.
          num_heads: An int. Number of heads.
          dropout_rate: Rate handed to the `Dropout` layer created in `build`.
          activation: Activation of the three projection `Dense` layers.
          causality: Boolean. If true, units that reference the future are masked.
          usequerymasks: Boolean. Multiply the attention weights by a mask
            derived from the queries.
          **kwargs: Forwarded to `Layer.__init__`.
        """
        self.activation = activation
        self.num_units = num_units
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate
        self.causality = causality
        self.usesoftmax = usesoftmax
        self.usequerymasks = usequerymasks
        Layer.__init__(self, **kwargs)

    def get_config(self):
        """Return the base layer config extended with this layer's hyper-parameters."""
        config = {
            'activation': self.activation,
            'num_units': self.num_units,
            'num_heads': self.num_heads,
            'dropout_rate': self.dropout_rate,
            'causality': self.causality,
            'usesoftmax': self.usesoftmax,
            'usequerymasks': self.usequerymasks,
        }
        base_config = super(AttentionTransformer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        """Create and build the Q/K/V projection layers and the dropout layer.

        Args:
        ----
          input_shape: A single shape, or a list of up to three shapes
            (queries, keys, values); see `_care_inputs`.
        """
        (queries, keys, values) = self._care_inputs(input_shape)
        queries = list(queries)
        keys = list(keys)
        values = list(values)
        if self.num_units is None:
            self.num_units = queries[-1]

        # we will now accept inputs as sequences, so if something is not a sequence it IS a sequence of len 1
        if len(queries) <= 2:
            queries.insert(-1, 1)
        if len(keys) <= 2:
            keys.insert(-1, 1)
        if len(values) <= 2:
            values.insert(-1, 1)

        self.Q_dense = Dense(self.num_units,
                             activation=self.activation,
                             name="Q_dense")
        self.Q_dense.build(queries)
        self.K_dense = Dense(self.num_units,
                             activation=self.activation,
                             name="K_dense")
        self.K_dense.build(keys)
        self.V_dense = Dense(self.num_units,
                             activation=self.activation,
                             name="V_dense")
        self.V_dense.build(values)

        # Only the three projections contribute weights to this layer.
        self.trainable_weights = self.Q_dense.trainable_weights + self.K_dense.trainable_weights + \
                                 self.V_dense.trainable_weights
        self.non_trainable_weights = self.Q_dense.non_trainable_weights + self.K_dense.non_trainable_weights + \
                                     self.V_dense.non_trainable_weights

        self.dropout = Dropout(rate=self.dropout_rate)
        self.built = True
        # a hint about the Keras implementation: it is all called in the sequence: build, compute_output_shape, call

    def _care_inputs(self, inputs):
        """Normalise `inputs` to exactly three items (queries, keys, values).

        A list is shallow-copied, padded by repeating its last element until it
        has three entries and truncated to three; anything else is used for all
        three roles.
        """
        inputs = copy.copy(inputs)
        if (isinstance(inputs, list)):
            while (len(inputs) < 3):
                inputs.append(inputs[-1])
            inputs = inputs[0:3]
        else:
            inputs = [inputs, inputs, inputs]
        return inputs

    def compute_output_shape(self, input_shape):
        """Return the queries shape with its last dimension replaced by `num_units`."""
        (queries, keys, values) = self._care_inputs(input_shape)
        # assert input_shape and len(input_shape) >= 2
        # assert input_shape[-1]
        output_shape = list(queries)
        # (N, T_q, C); `build` sets num_units to the last dimension of the
        # queries shape when the user left it unspecified.
        output_shape[-1] = self.num_units
        return tuple(output_shape)

    def call(self, inputs, training=None):
        """Project the inputs and apply multihead attention.

        Args:
        ----
          inputs: One tensor or a list of up to three tensors
            (queries, keys, values); see `_care_inputs`.
          training: Forwarded to the attention dropout. None keeps the
            dropout layer's default behaviour.

        Returns
        -------
          The result of `multihead_attention_mechanism`.
        """
        # expects 3 inputs as merge layer https://github.com/keras-team/keras/blob/master/keras/layers/merge.py
        (queries, keys, values) = self._care_inputs(inputs)
        if self.num_units is None:  # done in build too
            self.num_units = queries.get_shape().as_list()[-1]

        # we will now accept inputs as sequences, so if something is not a sequence it IS a sequence of len 1
        if len(queries.shape) <= 2:
            queries = tf.expand_dims(queries, -2)
        if len(keys.shape) <= 2:
            keys = tf.expand_dims(keys, -2)
        if len(values.shape) <= 2:
            values = tf.expand_dims(values, -2)

        Q = self.Q_dense.call(queries)  # call is a way how to use a layer inside a layer
        K = self.K_dense.call(keys)
        V = self.V_dense.call(values)

        if len(Q.shape) <= 2:
            Q = tf.expand_dims(Q, -2)
        if len(K.shape) <= 2:
            K = tf.expand_dims(K, -2)
        if len(V.shape) <= 2:
            V = tf.expand_dims(V, -2)

        return self.multihead_attention_mechanism(Q, K, V,
                                                  queries=queries,
                                                  keys=keys,
                                                  num_heads=self.num_heads,
                                                  causality=self.causality,
                                                  usequerymasks=self.usequerymasks,
                                                  scope="multihead_attention",
                                                  usesoftmax=self.usesoftmax,
                                                  reuse=None,
                                                  training=training)

    def normalize(self, inputs, epsilon=1e-8, scope="ln", reuse=None):
        """Applies layer normalization.

        Args:
        ----
          inputs: A tensor with 2 or more dimensions, where the first dimension has
            `batch_size`.
          epsilon: A floating number. A very small number for preventing ZeroDivision Error.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns
        -------
          A tensor with the same shape and data dtype as `inputs`.
        """
        with tf.variable_scope(scope, reuse=reuse):
            inputs_shape = inputs.get_shape()
            params_shape = inputs_shape[-1:]

            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
            # NOTE: beta and gamma are plain tf.Variable objects created on
            # every call; `build` does not add them to this layer's
            # trainable_weights.
            beta = tf.Variable(tf.zeros(params_shape))
            gamma = tf.Variable(tf.ones(params_shape))
            normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
            outputs = gamma * normalized + beta

        return outputs

    def multihead_attention_mechanism(self, Qinp, Kinp, Vinp, queries, keys,
                                      num_heads=8,
                                      causality=False,
                                      usequerymasks=True,
                                      scope="multihead_attention",
                                      usesoftmax=True,
                                      reuse=None,
                                      training=None):
        """Applies multihead attention mechanism. Just the computation without trainable weights.

        Args:
        ----
          Qinp: Projected queries, split into heads along axis 2.
          Kinp: Projected keys, split into heads along axis 2.
          Vinp: Projected values, split into heads along axis 2.
          queries: A 3d tensor with shape of [N, T_q, C_q]. Used for the query
            mask and the residual connection.
          keys: A 3d tensor with shape of [N, T_k, C_k]. Used for the key mask.
          causality: Boolean. If true, units that reference the future are masked.
          num_heads: An int. Number of heads.
          usequerymasks: Boolean. If true, attention weights are multiplied by
            the query mask.
          scope: Optional scope for `variable_scope`.
          usesoftmax: Boolean. If true, softmax is applied to the masked scores.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.
          training: Forwarded to the dropout layer. None keeps the dropout
            layer's default behaviour.

        Returns
        -------
          A 3d tensor with shape of (N, T_q, C)
        """
        assert (len(Qinp.shape) + len(Kinp.shape) + len(Vinp.shape) > 3 * 2)
        with tf.variable_scope(scope, reuse=reuse):
            # Split and concat - for keras, the N dimension is HIDDEN, but in tf we see it!
            Q_ = tf.concat(tf.split(Qinp, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
            K_ = tf.concat(tf.split(Kinp, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
            V_ = tf.concat(tf.split(Vinp, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

            # Multiplication
            # T_q, T_k are the original queries and keys -
            # sequence lengths (and in the application they are the same)
            preoutputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

            # Scale
            preoutputs = preoutputs / (K_.get_shape().as_list()[-1] ** 0.5)

            # Key Masking
            key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
            key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
            key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                                [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(preoutputs) * (-2 ** 32 + 1)
            preoutputs = tf.where(tf.equal(key_masks, 0), paddings, preoutputs)  # (h*N, T_q, T_k)

            # Causality = Future blinding
            if causality:
                diag_vals = tf.ones_like(preoutputs[0, :, :])  # (T_q, T_k)
                tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()  # (T_q, T_k)
                masks = tf.tile(tf.expand_dims(tril, 0),
                                [tf.shape(preoutputs)[0], 1, 1])  # (h*N, T_q, T_k)

                paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
                preoutputs = tf.where(tf.equal(masks, 0), paddings, preoutputs)  # (h*N, T_q, T_k)

            # Activation
            if (usesoftmax):
                preoutputs = tf.nn.softmax(preoutputs)  # (h*N, T_q, T_k)

            # Query Masking
            if usequerymasks:
                query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
                query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
                query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                                      [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
                preoutputs *= query_masks  # broadcasting. (N, T_q, T_k)

            # Honour the caller's training flag instead of dropping it.
            outputs = self.dropout.call(preoutputs, training=training)

            # Weighted sum
            outputs = tf.matmul(outputs, V_)  # ( h*N, T_q, C/h)

            # Restore shape
            outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

            # Residual connection #still the same dimension
            outputs += queries

            # Normalize
            outputs = self.normalize(outputs)  # (N, T_q, C)

        return outputs
class CNNEncoder(Seq2VecEncoder):
    """
    CNNEncoder is a combination of multiple convolutional layers and max pooling layers. This is
    defined as a single layer to be consistent with other encoders in terms of input and output
    specifications.

    Input shape: (batch_size, sequence_length, input_dim).
    Output shape: (batch_size, output_dim).

    The CNN has one convolution layer per kernel size. Each convolution operation gives out a
    vector of size `filters`. The number of times a convolution layer will be used depends on the
    kernel size: input_len - kernel_size + 1. The corresponding max pooling aggregates all these
    outputs from the convolution layer and outputs the max.

    This operation is repeated for every kernel size passed, and consequently the dimensionality
    of the output after max pooling is len(kernel_sizes) * filters. When `units` is given, a fully
    connected layer then projects it to that size.

    References: "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural
    Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.

    Args:
        filters: Integer, the output dim for each convolutional layer.
        kernel_sizes: An integer tuple or list, the kernel sizes of the convolutional layers.
        units: After doing convolutions, we'll project the collected features into a vector of
            this size. If this value is `None`, just return the result of the max pooling.
        conv_layer_activation: string of convolutional layer `Activation`.
        l1_regularization: float, or None for no L1 penalty.
        l2_regularization: float, or None for no L2 penalty.
    """

    def __init__(self,
                 filters=100,
                 kernel_sizes=(2, 3, 4, 5),
                 units=None,
                 conv_layer_activation='relu',
                 l1_regularization=None,
                 l2_regularization=None,
                 **kwargs):
        self.filters = filters
        self.kernel_sizes = kernel_sizes
        self.units = units
        self.conv_layer_activation = conv_layer_activation
        # The raw values (possibly None) are kept so get_config round-trips
        # exactly what the caller passed.
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization
        # l1_l2 expects numeric coefficients; a None passed straight through is
        # not handled uniformly across Keras versions and can end up as NaN in
        # the loss, so "not set" is mapped to an explicit zero penalty.
        self.regularizer = l1_l2(l1=self.l1_regularization or 0.0,
                                 l2=self.l2_regularization or 0.0)
        self.conv_layers = None
        self.projection_layer = None
        self.output_dim = None
        self.input_spec = [InputSpec(ndim=3)]
        super(CNNEncoder, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create and build one Conv1D per kernel size plus the optional projection layer."""
        self.conv_layers = [
            Conv1D(filters=self.filters,
                   kernel_size=kernel_size,
                   activation=self.conv_layer_activation,
                   kernel_regularizer=self.regularizer,
                   bias_regularizer=self.regularizer)
            for kernel_size in self.kernel_sizes
        ]
        for conv_layer in self.conv_layers:
            with K.name_scope(conv_layer.name):
                conv_layer.build(input_shape)

        maxpool_output_dim = self.filters * len(self.kernel_sizes)
        if self.units is not None:
            self.projection_layer = Dense(self.units)
            projection_input_shape = (input_shape[0], maxpool_output_dim)
            with K.name_scope(self.projection_layer.name):
                self.projection_layer.build(projection_input_shape)
            self.output_dim = self.units
            trainable_layers = self.conv_layers + [self.projection_layer]
        else:
            self.projection_layer = None
            self.output_dim = maxpool_output_dim
            trainable_layers = self.conv_layers

        # Define weights of this layer as the set of weights from all layers.
        self.trainable_weights = []
        for layer in trainable_layers:
            self.trainable_weights.extend(layer.trainable_weights)

        super(CNNEncoder, self).build(input_shape)

    def call(self, inputs, mask=None):
        """Convolve, max-pool over axis 1, concatenate and optionally project.

        `mask` is accepted but not used.
        """
        # Each convolution layer returns output of size (batch_size, conv_length, filters),
        # where `conv_length = num_words - kernel_size + 1`. We then do max
        # pooling over each filter for the whole input sequence, just use K.max,
        # giving a result tensor of shape (batch_size, filters), which then
        # gets projected using the projection layer.
        filter_outputs = [
            K.max(conv_layer.call(inputs), axis=1)
            for conv_layer in self.conv_layers
        ]
        # A single kernel size yields a single tensor: nothing to concatenate.
        maxpool_output = Concatenate()(filter_outputs) \
            if len(filter_outputs) > 1 else filter_outputs[0]
        if self.projection_layer is not None:
            result = self.projection_layer.call(maxpool_output)
        else:
            result = maxpool_output
        return result

    def compute_output_shape(self, input_shape):
        """Return (batch dimension, output_dim); `output_dim` is set in `build`."""
        return (input_shape[0], self.output_dim)

    def compute_mask(self, inputs, mask=None):
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    def get_config(self):
        """Return this layer's constructor arguments merged with the base config."""
        config = {
            "filters": self.filters,
            "kernel_sizes": self.kernel_sizes,
            "units": self.units,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config
class Connected(Layer):
    """
    Darknet "connected" layer. Main difference vs. keras Dense layer is that the input is also flattened.
    The same as
    ```
    def get_connected(params):
        activation = get_activation(params.get('activation', "linear"))
        def _connected(x):
            y = Flatten()(x)
            return Dense(params.get('output', 1), activation=activation)(y)
        return Lambda(_connected)
    ```
    - but also has weights.
    """

    def __init__(self, output=1, activation=None, batch_normalize=0, **kwargs):
        """Create the wrapped dense, optional batch-norm and activation layers.

        Args:
            output: number of units of the wrapped Dense layer.
            activation: passed to `get_activation`; the result is used as a
                layer (its `build` and `call` are invoked).
            batch_normalize: truthy to insert a BatchNormalization step
                between the dense layer and the activation.
            **kwargs: forwarded to both `Layer.__init__` and `Dense`.
        """
        self.units = output
        self.batch_normalize = batch_normalize
        super(Connected, self).__init__(**kwargs)
        self.dense_layer = Dense(self.units, **kwargs)
        # TODO: axis check
        if self.batch_normalize:
            self.batchnorm_layer = BatchNormalization(scale=True, center=False)
        self.activation_layer = get_activation(activation)

    def build(self, input_shape):
        """Build the sub-layers for an input that is flattened per sample."""
        super(Connected, self).build(input_shape)
        densed_shape = (input_shape[0], np.prod(input_shape[1:]))
        self.dense_layer.build(densed_shape)
        # `output_shape` is a Keras property describing an already connected
        # layer, not a shape function; compute_output_shape is the call that
        # maps an input shape to an output shape.
        dense_output_shape = self.dense_layer.compute_output_shape(densed_shape)
        if self.batch_normalize:
            self.batchnorm_layer.build(dense_output_shape)
        # The activation consumes the dense (and batch-norm) output.
        self.activation_layer.build(dense_output_shape)

    def call(self, x, training=None):
        """Flatten `x`, then apply dense, optional batch norm and the activation.

        `training` is forwarded to the batch-norm layer; None keeps that
        layer's default behaviour.
        """
        flatten_inputs = K.batch_flatten(x)
        output = self.dense_layer.call(flatten_inputs)
        if self.batch_normalize:
            output = self.batchnorm_layer.call(output, training=training)
        output = self.activation_layer.call(output)
        return output

    def compute_output_shape(self, input_shape):
        """Return the dense layer's output shape for the flattened input shape."""
        dense_input_shape = (input_shape[0], np.prod(input_shape[1:]))
        return self.dense_layer.compute_output_shape(dense_input_shape)

    def set_weights(self, weights):
        """Distribute `weights` over the wrapped layers.

        With batch normalization enabled, `weights` must unpack into five
        items: dense kernel, dense bias, scales, rolling mean and rolling
        variance. Otherwise it is handed to the dense layer unchanged.
        """
        if self.batch_normalize:
            (kernel, bias, scales, rolling_mean, rolling_variance) = weights
            self.dense_layer.set_weights((kernel, bias))
            self.batchnorm_layer.set_weights(
                (scales, rolling_mean, rolling_variance))
        else:
            self.dense_layer.set_weights(weights)

    def get_weights(self):
        """Return the dense weights, followed by the batch-norm weights when enabled."""
        if self.batch_normalize:
            return self.dense_layer.get_weights(
            ) + self.batchnorm_layer.get_weights()
        return self.dense_layer.get_weights()