class HighwayLayer(HiddenLayer):
    """
    Highway layer: a gated mix of a transformed input and the input itself.

    The output is ``t * H(x) + (1 - t) * x`` where ``H`` is the affine map plus
    activation inherited from HiddenLayer (parameters ``W_H``, ``b_H``) and ``t``
    is the transform gate (parameters ``W_T``, ``b_T``).
    """

    def __init__(self, in_dim, activation, hidden_dim=None, transform_gate="sigmoid", prefix="",
                 initializer=default_initializer, dropout=0, verbose=True):
        """
        :param in_dim: input dimension; the output has the same dimension
        :param activation: activation name handed to Activation for the H branch
        :param hidden_dim: optional; when given it must be equal to in_dim
        :param transform_gate: activation name handed to Activation for the gate
        :param prefix: string prepended to the names of the shared parameters
        :param initializer: initializer handed to shared_rand_matrix
        :param dropout: stored on the layer and logged; forward / forward_batch
                        of this class do not apply it
        :param verbose: emit debug log messages while building
        """
        # The carry term (1 - t) * x is added to the transformed input, so the
        # output must have the input's dimension: W_T and W_H are square matrices.
        if hidden_dim is not None:
            assert in_dim == hidden_dim
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        # The base constructor logs the same "Building ..." / architecture lines
        # (it also uses self.__class__.__name__), so it is built silently here to
        # avoid emitting every message twice.
        super(HighwayLayer, self).__init__(in_dim, in_dim, activation, prefix, initializer, dropout, False)
        self.transform_gate = Activation(transform_gate)
        # Re-use the parameters created by the base class for the H branch and
        # rename them so that they are distinguishable from the gate parameters.
        self.W_H, self.W_H.name = self.W, prefix + "W_H"
        self.b_H, self.b_H.name = self.b, prefix + "b_H"
        self.W_T = shared_rand_matrix((self.hidden_dim, self.in_dim), prefix + 'W_T', initializer)
        self.b_T = shared_zero_matrix((self.hidden_dim, ), prefix + 'b_T')
        self.params = [self.W_H, self.W_T, self.b_H, self.b_T]
        # Only the weight matrices take part in the norm penalties.
        self.norm_params = [self.W_H, self.W_T]
        self.l1_norm = T.sum([T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param ** 2) for param in self.norm_params])

        if verbose:
            logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
            logger.debug('Input dimension: %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Activation Func: %s' % self.act.method)
            logger.debug('Transform Gate: %s' % self.transform_gate.method)
            logger.debug('Dropout Rate: %f' % self.dropout)

    def forward(self, x):
        """
        :param x: (in, )
        :return: (in, ) gated combination of the transformed input and x
        """
        # (in, in) (in, ) + (in, ) -> (in, )
        t = self.transform_gate.activate(T.dot(self.W_T, x) + self.b_T)
        # (in, in) (in, ) + (in, ) -> (in, )
        z_t = self.act.activate(T.dot(self.W_H, x) + self.b_H)
        # (in, ) * (in, ) + (in, ) * (in, ) -> (in, )
        return t * z_t + (1 - t) * x

    def forward_batch(self, x):
        """
        :param x: (batch, in)
        :return: (batch, in) gated combination of the transformed input and x
        """
        # (batch, in) (in, in) + (in, ) -> (batch, in)
        t = self.transform_gate.activate(T.dot(x, self.W_T.T) + self.b_T)
        # (batch, in) (in, in) + (in, ) -> (batch, in)
        z_t = self.act.activate(T.dot(x, self.W_H.T) + self.b_H)
        # (batch, in) * (batch, in) + (batch, in) * (batch, in) -> (batch, in)
        return t * z_t + (1 - t) * x
class HiddenLayer(object):
    """
    Fully connected layer: ``act(W x + b)`` followed by dropout_from_layer.
    """

    def __init__(self, in_dim, hidden_dim, activation, prefix="",
                 initializer=default_initializer, dropout=0, verbose=True):
        """
        :param in_dim: input dimension
        :param hidden_dim: output dimension (also exposed as out_dim)
        :param activation: activation name handed to Activation
        :param prefix: string prepended to the names of the shared parameters
        :param initializer: initializer handed to shared_rand_matrix
        :param dropout: rate handed to dropout_from_layer in the forward passes
        :param verbose: emit debug log messages while building
        """
        layer_name = self.__class__.__name__
        if verbose:
            logger.debug('Building {}...'.format(layer_name))

        self.in_dim = in_dim
        self.hidden_dim = self.out_dim = hidden_dim
        self.act = Activation(activation)
        self.dropout = dropout

        # Weight is stored as (hidden, in); the bias starts at zero.
        weight_shape = (self.hidden_dim, self.in_dim)
        self.W = shared_rand_matrix(weight_shape, prefix + 'W', initializer)
        self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b')
        self.params = [self.W, self.b]

        # Only the weight matrix takes part in the norm penalties.
        self.norm_params = [self.W]
        abs_sums = [T.sum(T.abs_(weight)) for weight in self.norm_params]
        square_sums = [T.sum(weight ** 2) for weight in self.norm_params]
        self.l1_norm = T.sum(abs_sums)
        self.l2_norm = T.sum(square_sums)

        if not verbose:
            return
        logger.debug('Architecture of {} built finished'.format(layer_name))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

    def forward(self, x):
        """
        :param x: (dim, )
        :return: activated affine projection of x after dropout_from_layer
        """
        pre_activation = T.dot(self.W, x) + self.b
        hidden = self.act.activate(pre_activation)
        return dropout_from_layer(hidden, self.dropout)

    def forward_batch(self, x):
        """
        :param x: (batch, dim)
        :return: activated affine projection of every row after dropout_from_layer
        """
        # (batch, in) (in, hidden) + (hidden, ) -> (batch, hidden)
        pre_activation = T.dot(x, self.W.T) + self.b
        hidden = self.act.activate(pre_activation)
        return dropout_from_layer(hidden, self.dropout)
class BRNN(object):
    '''
    Bidirectional RNN. This is just a trial for using BRNN as a tool for sentence modeling.
    First trial on the task of sentiment analysis.

    All model parameters live in one flat shared vector `theta`; the individual
    weight matrices and biases are symbolic slices of it, so that a single gradient
    vector can be used to update the whole model.
    '''

    def __init__(self, configs, verbose=True):
        '''
        @configs: Configuration object. The attributes read here are activation,
                  num_input, num_hidden, num_class, bptt, random_seed, dropout,
                  regularization and lambda1.
        @verbose: Bool. Print progress and a summary of the architecture.
        '''
        if verbose:
            pprint('Build Tied weights Bidirectional Recurrent Neural Network')
        self.input = T.matrix(name='input')
        self.truth = T.ivector(name='label')
        self.learn_rate = T.scalar(name='learn rate')
        # Configure Activation function
        self.act = Activation(configs.activation)
        num_input, num_hidden, num_class = configs.num_input, configs.num_hidden, configs.num_class
        # Stack all the variables together into a vector in order to apply the batch
        # updating algorithm. There are two directions, so every RNN weight appears twice.
        num_params = 2 * (num_input * num_hidden + num_hidden * num_hidden + num_hidden) + \
            2 * num_hidden * num_class + num_class
        self.num_params = num_params
        self.theta = theano.shared(value=np.zeros(num_params, dtype=floatX), name='theta', borrow=True)

        # Carve the named parameters out of theta, in the same order in which the
        # initial values are concatenated below.
        offset = 0
        # 1, Feed-forward matrices for both directions
        self.W_forward, offset = self._carve(offset, (num_input, num_hidden), 'W_forward_RNN')
        self.W_backward, offset = self._carve(offset, (num_input, num_hidden), 'W_backward_RNN')
        # 2, Recurrent matrices for both directions
        self.U_forward, offset = self._carve(offset, (num_hidden, num_hidden), 'U_forward_RNN')
        self.U_backward, offset = self._carve(offset, (num_hidden, num_hidden), 'U_backward_RNN')
        # 3, Hidden-layer biases for both directions
        self.b_forward, offset = self._carve(offset, (num_hidden, ), 'b_forward_RNN')
        self.b_backward, offset = self._carve(offset, (num_hidden, ), 'b_backward_RNN')
        # Weight matrix and bias vector of the softmax classifier
        self.W_softmax, offset = self._carve(offset, (2 * num_hidden, num_class), 'W_softmax')
        self.b_softmax, offset = self._carve(offset, (num_class, ), 'b_softmax')
        # Internal invariant: the slices must tile theta exactly.
        assert offset == num_params

        # Initial values, drawn in the same order as the slices above.
        initial_values = (
            self._glorot_uniform(num_input, num_hidden),       # W_forward
            self._glorot_uniform(num_input, num_hidden),       # W_backward
            self._glorot_uniform(num_hidden, num_hidden),      # U_forward
            self._glorot_uniform(num_hidden, num_hidden),      # U_backward
            np.zeros(num_hidden, dtype=floatX),                # b_forward
            np.zeros(num_hidden, dtype=floatX),                # b_backward
            self._glorot_uniform(2 * num_hidden, num_class),   # W_softmax
            np.zeros(num_class, dtype=floatX),                 # b_softmax
        )
        self.theta.set_value(np.concatenate([value.ravel() for value in initial_values]))

        # h[0], zero vector, treated as constants
        self.h_start = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h_start', borrow=True)
        self.h_end = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h_end', borrow=True)

        # recurrent functions used to compress a sequence of input vectors
        # the first dimension should correspond to time
        def forward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_forward) +
                                    T.dot(h_tm1, self.U_forward) + self.b_forward)
            return h_t

        def backward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_backward) +
                                    T.dot(h_tm1, self.U_backward) + self.b_backward)
            return h_t

        # Forward and backward representation over time
        self.forward_h, _ = theano.scan(fn=forward_step, sequences=self.input, outputs_info=[self.h_start],
                                        truncate_gradient=configs.bptt)
        self.backward_h, _ = theano.scan(fn=backward_step, sequences=self.input, outputs_info=[self.h_end],
                                         truncate_gradient=configs.bptt, go_backwards=True)
        # Each direction is summarised by the mean of its hidden states over time
        # (not by the last hidden state).
        self.h_start_star = T.mean(self.forward_h, axis=0)
        self.h_end_star = T.mean(self.backward_h, axis=0)

        # L1, L2 regularization. The matrices have different shapes, so each norm is
        # reduced to a scalar on its own before the scalars are added.
        self.L1_norm = T.sum(T.abs_(self.W_forward)) + T.sum(T.abs_(self.W_backward)) + \
            T.sum(T.abs_(self.U_forward)) + T.sum(T.abs_(self.U_backward)) + \
            T.sum(T.abs_(self.W_softmax))
        self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
            T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2) + \
            T.sum(self.W_softmax ** 2)

        # Build functions to show the learned representation for different sentences
        self.show_forward = theano.function(inputs=[self.input], outputs=self.h_start_star)
        self.show_backward = theano.function(inputs=[self.input], outputs=self.h_end_star)

        # Concatenate the two direction summaries into one vector
        self.h = T.concatenate([self.h_start_star, self.h_end_star], axis=0)
        # Dropout mask on the concatenated vector.
        # NOTE(review): the mask is part of the graph used by `predict` as well, and
        # the kept units are not rescaled -- confirm that this is intended.
        srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
        mask = srng.binomial(n=1, p=1-configs.dropout, size=self.h.shape)
        self.h *= T.cast(mask, floatX)
        # Use concatenated vector as input to the Softmax classifier
        self.output = T.nnet.softmax(T.dot(self.h, self.W_softmax) + self.b_softmax)
        self.pred = T.argmax(self.output, axis=1)

        # Build cost function: mean negative log-likelihood of the true labels,
        # plus the L2 penalty when regularization is switched on.
        self.cost = -T.mean(T.log(self.output)[T.arange(self.truth.shape[0]), self.truth])
        if configs.regularization:
            self.cost += configs.lambda1 * self.L2_norm

        # Gradients with respect to the flat parameter vector and to the input
        self.gradtheta = T.grad(self.cost, self.theta)
        self.gradinput = T.grad(self.cost, self.input)

        # Compute the cost and the gradient to parameters
        self.compute_cost_and_gradient = theano.function(inputs=[self.input, self.truth],
                                                         outputs=[self.cost, self.gradtheta])
        # Compute the gradient to inputs
        self.compute_input_gradient = theano.function(inputs=[self.input, self.truth],
                                                      outputs=self.gradinput)
        # Build prediction function
        self.predict = theano.function(inputs=[self.input], outputs=self.pred)

        if verbose:
            pprint('*' * 50)
            pprint('Finished constructing Bidirectional Recurrent Neural Network (BRNN)')
            pprint('Size of input dimension: %d' % configs.num_input)
            pprint('Size of hidden/recurrent dimension: %d' % configs.num_hidden)
            pprint('Size of output dimension: %d' % configs.num_class)
            pprint('Is regularization applied? %s' % ('yes' if configs.regularization else 'no'))
            if configs.regularization:
                pprint('Coefficient of regularization term: %f' % configs.lambda1)
            pprint('BPTT step: %d' % configs.bptt)
            pprint('Number of free parameters in BRNN: %d' % self.num_params)
            pprint('*' * 50)

    def _carve(self, start, shape, name):
        '''
        Return (symbolic slice of theta, next free index).
        @start: Int. Index in theta where the parameter begins.
        @shape: Tuple of int. Shape of the parameter; matrices are reshaped,
                vectors are returned as plain slices.
        @name: String. Name given to the symbolic slice.
        '''
        size = int(np.prod(shape))
        view = self.theta[start: start + size]
        if len(shape) > 1:
            view = view.reshape(shape)
        view.name = name
        return view, start + size

    @staticmethod
    def _glorot_uniform(fan_in, fan_out):
        '''
        Draw a (fan_in, fan_out) matrix uniformly from
        [-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out))].
        '''
        bound = np.sqrt(6.0 / (fan_in + fan_out))
        return np.asarray(np.random.uniform(low=-bound, high=bound, size=(fan_in, fan_out)),
                          dtype=floatX)

    # This method is used to implement the batch updating algorithm
    def update_params(self, gradtheta, learn_rate):
        '''
        Apply one step of plain gradient descent to theta.
        @gradtheta: Array. A single long vector, the gradient of the cost with
                    respect to theta.
        @learn_rate: Scalar. Step size.
        '''
        theta = self.theta.get_value(borrow=True)
        self.theta.set_value(theta - learn_rate * gradtheta, borrow=True)

    @staticmethod
    def save(fname, model):
        '''
        Pickle `model` into the file `fname`.
        @fname: String. Filename to save the model.
        @model: Object to pickle.
        '''
        # open() rather than file(): the latter only exists in Python 2.
        with open(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        '''
        Load a pickled model from the file `fname`.
        @fname: String. Filename to load the model from.
        '''
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(fname, 'rb') as fin:
            return cPickle.load(fin)
class BRNNEncoder(object):
    '''
    Bidirectional RNN for sequence encoding.

    A sequence (one row per time step) is run through a forward and a backward
    RNN; each direction is averaged over time and the two averages are
    concatenated into the encoding.
    '''

    def __init__(self, config, verbose=True):
        '''
        @config: Configuration object. The attributes read here are activation,
                 num_input, num_hidden, random_seed and bptt.
        @verbose: Bool. Emit debug log messages while building.
        '''
        if verbose:
            logger.debug('Building Bidirectional RNN Encoder...')
        self.input = T.matrix(name='BRNNEncoder_input')
        # Configure Activation function
        self.act = Activation(config.activation)
        # Build Bidirectional RNN
        num_input, num_hidden = config.num_input, config.num_hidden
        self.num_params = 2 * (num_input * num_hidden + num_hidden * num_hidden + num_hidden)
        # Initialize model parameters (this seeds numpy's global random generator)
        np.random.seed(config.random_seed)
        # 1, Feed-forward matrices, uniform in [-1, 1]: W_forward, W_backward
        W_forward_val = self._uniform_matrix(num_input, num_hidden)
        self.W_forward = theano.shared(value=W_forward_val, name='W_forward', borrow=True)
        W_backward_val = self._uniform_matrix(num_input, num_hidden)
        self.W_backward = theano.shared(value=W_backward_val, name='W_backward', borrow=True)
        # 2, Recurrent matrices: the left singular vectors of a uniform random
        # matrix, i.e. an orthogonal initialisation
        U_forward_val, _, _ = np.linalg.svd(self._uniform_matrix(num_hidden, num_hidden))
        self.U_forward = theano.shared(value=U_forward_val, name='U_forward', borrow=True)
        U_backward_val, _, _ = np.linalg.svd(self._uniform_matrix(num_hidden, num_hidden))
        self.U_backward = theano.shared(value=U_backward_val, name='U_backward', borrow=True)
        # 3, Bias parameters for the hidden layer of each direction
        b_forward_val = np.zeros(num_hidden, dtype=floatX)
        self.b_forward = theano.shared(value=b_forward_val, name='b_forward', borrow=True)
        b_backward_val = np.zeros(num_hidden, dtype=floatX)
        self.b_backward = theano.shared(value=b_backward_val, name='b_backward', borrow=True)
        # h[0], zero vectors, treated as constants (not listed in self.params)
        self.h0_forward = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                        name='h0_forward', borrow=True)
        self.h0_backward = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                         name='h0_backward', borrow=True)
        # Stack all the parameters
        self.params = [self.W_forward, self.W_backward, self.U_forward, self.U_backward,
                       self.b_forward, self.b_backward]
        # Compute the forward and backward representation over time
        self.h_forwards, _ = theano.scan(fn=self._forward_step, sequences=self.input,
                                         outputs_info=[self.h0_forward],
                                         truncate_gradient=config.bptt)
        self.h_backwards, _ = theano.scan(fn=self._backward_step, sequences=self.input,
                                          outputs_info=[self.h0_backward],
                                          truncate_gradient=config.bptt, go_backwards=True)
        # Average compressing
        self.h_forward = T.mean(self.h_forwards, axis=0)
        self.h_backward = T.mean(self.h_backwards, axis=0)
        # Concatenate
        self.output = T.concatenate([self.h_forward, self.h_backward], axis=0)
        # L1, L2 regularization. W is (num_input, num_hidden) while U is
        # (num_hidden, num_hidden), so each norm is reduced to a scalar on its own
        # before the scalars are added.
        self.L1_norm = T.sum(T.abs_(self.W_forward)) + T.sum(T.abs_(self.W_backward)) + \
            T.sum(T.abs_(self.U_forward)) + T.sum(T.abs_(self.U_backward))
        self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
            T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2)
        if verbose:
            logger.debug('Finished constructing the structure of BRNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % num_input)
            logger.debug('Size of the hidden dimension: %d' % num_hidden)
            logger.debug('Activation function: %s' % config.activation)

    @staticmethod
    def _uniform_matrix(rows, cols):
        '''
        Draw a (rows, cols) matrix uniformly from [-1, 1] and cast it to floatX.
        '''
        return np.random.uniform(low=-1.0, high=1.0, size=(rows, cols)).astype(floatX)

    def _forward_step(self, x_t, h_tm1):
        '''
        One step of the forward RNN.
        @x_t: Input at the current time step.
        @h_tm1: Hidden state of the previous step.
        '''
        h_t = self.act.activate(T.dot(x_t, self.W_forward) +
                                T.dot(h_tm1, self.U_forward) +
                                self.b_forward)
        return h_t

    def _backward_step(self, x_t, h_tm1):
        '''
        One step of the backward RNN.
        @x_t: Input at the current time step.
        @h_tm1: Hidden state of the previously processed step.
        '''
        h_t = self.act.activate(T.dot(x_t, self.W_backward) +
                                T.dot(h_tm1, self.U_backward) +
                                self.b_backward)
        return h_t

    def encode(self, inputM):
        '''
        @inputM: Theano symbol matrix. Compress the input matrix into output vector.

        Builds the same computation as `self.output`, but for an arbitrary input
        matrix and without truncating the gradient.
        '''
        h_forwards, _ = theano.scan(fn=self._forward_step, sequences=inputM,
                                    outputs_info=[self.h0_forward])
        h_backwards, _ = theano.scan(fn=self._backward_step, sequences=inputM,
                                     outputs_info=[self.h0_backward], go_backwards=True)
        # Averaging
        h_forward = T.mean(h_forwards, axis=0)
        h_backward = T.mean(h_backwards, axis=0)
        # Concatenate
        h = T.concatenate([h_forward, h_backward], axis=0)
        return h
class RNN(object):
    '''
    Basic component for Recurrent Neural Network
    '''

    def __init__(self, configs=None, verbose=True):
        '''
        Basic RNN is an unsupervised component, where the input is a sequence and the output
        is a vector with fixed length
        @configs: Configuration object. The attributes read here are activation,
                  num_input, num_hidden and bptt. Despite the default it is
                  required: the attributes are read unconditionally.
        @verbose: Bool. Print a message when building starts.
        '''
        if verbose:
            pprint('Build Recurrent Neural Network...')
        self.input = T.matrix(name='input', dtype=floatX)
        self.learn_rate = T.scalar(name='learn rate')
        # Configure activation function
        self.act = Activation(configs.activation)
        fan_in = configs.num_input
        fan_out = configs.num_hidden
        # Initialize all the variables in RNN:
        # 1, Feed-forward matrix W, uniform in +/- sqrt(6 / (fan_in + fan_out))
        # 2, Recurrent matrix U, uniform in +/- sqrt(6 / (fan_out + fan_out))
        self.W = theano.shared(value=np.asarray(
            np.random.uniform(low=-np.sqrt(6.0/(fan_in+fan_out)),
                              high=np.sqrt(6.0/(fan_in+fan_out)),
                              size=(fan_in, fan_out)), dtype=floatX),
            name='W', borrow=True)
        self.U = theano.shared(value=np.asarray(
            np.random.uniform(low=-np.sqrt(6.0/(fan_out+fan_out)),
                              high=np.sqrt(6.0/(fan_out+fan_out)),
                              size=(fan_out, fan_out)), dtype=floatX),
            name='U', borrow=True)
        # Bias parameter for the hidden-layer encoder of RNN
        self.b = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='b', borrow=True)
        # h[0], zero vector
        self.h0 = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='h0', borrow=True)
        # Save all the parameters (the initial hidden state is listed as well)
        self.params = [self.W, self.U, self.b, self.h0]

        # recurrent function used to compress a sequence of input vectors
        # the first dimension should correspond to time
        def step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W) +
                                    T.dot(h_tm1, self.U) + self.b)
            return h_t

        # hs is the hidden representation over a time sequence; the encoding is
        # the hidden state of the last step
        self.hs, _ = theano.scan(fn=step, sequences=self.input, outputs_info=[self.h0],
                                 truncate_gradient=configs.bptt)
        self.h = self.hs[-1]
        # L1, L2 regularization. W is (fan_in, fan_out) while U is (fan_out, fan_out),
        # so each norm is reduced to a scalar on its own before the scalars are added.
        self.L1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.L2_norm = T.sum(self.W ** 2) + T.sum(self.U ** 2)
        # Compress function
        self.compress = theano.function(inputs=[self.input], outputs=self.h)

    @staticmethod
    def save(fname, model):
        '''
        Save current RNN model into fname
        @fname: String. Filename to save the model.
        @model: RNN. An instance of RNN class.
        '''
        # open() rather than file(): the latter only exists in Python 2.
        with open(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        '''
        Load an RNN model from fname
        @fname: String. Filename to load the model.
        '''
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(fname, 'rb') as fin:
            return cPickle.load(fin)
class ConvolutionLayer(object):
    """
    Temporal convolution over a sequence of vectors, followed by pooling,
    dropout, bias and activation.
    """

    def __init__(self, in_dim, hidden_dim, kernel_size=3, padding='same', pooling='max', dilation_rate=1.0,
                 activation='relu', prefix="", initializer=GlorotUniformInitializer(), dropout=0.0, verbose=True):
        """
        Init Function for ConvolutionLayer
        :param in_dim: dimension of every input vector
        :param hidden_dim: number of filters (also exposed as out_dim)
        :param kernel_size: number of consecutive time steps covered by a filter
        :param padding: 'same', 'valid'; stored and logged only, the amount of
                        padding actually applied is padding_size
        :param pooling: 'max', 'mean', 'min'
        :param dilation_rate: only used to derive padding_size
        :param activation: activation name handed to Activation
        :param prefix: string prepended to the names of the shared parameters
        :param initializer: initializer handed to shared_rand_matrix
        :param dropout: rate handed to dropout_from_layer
        :param verbose: emit debug log messages while building
        """
        layer_name = self.__class__.__name__
        if verbose:
            logger.debug('Building {}...'.format(layer_name))

        self.in_dim = in_dim
        self.out_dim = self.hidden_dim = hidden_dim
        self.kernel_size = kernel_size
        self.padding = padding
        self.dilation_rate = dilation_rate
        self.pooling = pooling
        self.dropout = dropout
        self.act = Activation(activation)
        # Number of zero rows added at each end of the sequence
        self.padding_size = int(self.dilation_rate * (self.kernel_size - 1))

        # Kernel tensor (kernel_size, hidden, in) and bias (hidden, )
        kernel_shape = (self.kernel_size, self.hidden_dim, self.in_dim)
        self.W = shared_rand_matrix(kernel_shape, prefix + 'W', initializer)
        self.b = shared_zero_matrix((self.hidden_dim,), prefix + 'b')
        self.params = [self.W, self.b]

        # Only the kernel takes part in the norm penalties
        self.norm_params = [self.W]
        self.l1_norm = T.sum(T.abs_(self.W))
        self.l2_norm = T.sum(self.W ** 2)

        if not verbose:
            return
        logger.debug('Architecture of {} built finished'.format(layer_name))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Filter Num (Hidden): %d' % self.hidden_dim)
        logger.debug('Kernel Size (Windows): %d' % self.kernel_size)
        logger.debug('Padding method : %s' % self.padding)
        logger.debug('Dilation Rate : %s' % self.dilation_rate)
        logger.debug('Padding Size : %s' % self.padding_size)
        logger.debug('Pooling method : %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

    def forward_conv(self, x):
        """
        Convolve one (already padded) sequence with every filter.
        :param x: (length, dim)
        :return: (conv_length, hidden_dim); 'valid' mode, so conv_length is
                 length - kernel_size + 1
        """
        # conv2d input layout: (batch size, input channels, rows, columns)
        # here:                (1, 1, length, in_dim)
        image = x.dimshuffle(['x', 'x', 0, 1])
        # conv2d filter layout: (output channels, input channels, rows, columns)
        # here:                 (hidden_dim, 1, kernel_size, in_dim)
        filters = self.W.dimshuffle([1, 'x', 0, 2])
        # result: (1, hidden_dim, conv_length, 1)
        feature_maps = T.nnet.conv2d(image, filters, border_mode='valid')
        # drop the batch and column axes, then put time first:
        # (hidden_dim, conv_length) -> (conv_length, hidden_dim)
        return T.transpose(feature_maps[0, :, :, 0], (1, 0))

    def forward_conv_batch(self, x):
        """
        Convolve a batch of (already padded) sequences with every filter.
        :param x: (batch, length, dim)
        :return: (batch, conv_length, hidden_dim); 'valid' mode, so conv_length is
                 length - kernel_size + 1
        """
        # conv2d input layout: (batch size, input channels, rows, columns)
        # here:                (batch, 1, length, in_dim)
        images = x.dimshuffle([0, 'x', 1, 2])
        # conv2d filter layout: (output channels, input channels, rows, columns)
        # here:                 (hidden_dim, 1, kernel_size, in_dim)
        filters = self.W.dimshuffle([1, 'x', 0, 2])
        # result: (batch, hidden_dim, conv_length, 1)
        feature_maps = T.nnet.conv2d(images, filters, border_mode='valid')
        # drop the column axis, then swap filters and time:
        # (batch, hidden_dim, conv_length) -> (batch, conv_length, hidden_dim)
        return T.transpose(feature_maps[:, :, :, 0], (0, 2, 1))

    def forward(self, x):
        """
        :param x: (length, dim)
        :return: (hidden_dim, )
        """
        pad = self.padding_size
        if pad > 0:
            # (padding_size + length + padding_size, dim)
            x = temporal_padding_2d(x, (pad, pad))
        # When the kernel is longer than the (padded) sequence, extend the end of
        # the sequence so that it is as long as the kernel.
        shortfall = self.kernel_size - x.shape[0]
        extended = temporal_padding_2d(x, (0, shortfall))
        x = ifelse(T.gt(shortfall, 0), extended, x)

        pooled = get_pooling(self.forward_conv(x), self.pooling)
        dropped = dropout_from_layer(pooled, self.dropout)
        return self.act.activate(dropped + self.b)

    def forward_batch(self, x, mask):
        """
        :param x: (batch, length, dim)
        :param mask: (batch, length, )
        :return: the pooled, dropped-out and activated convolution features;
                 the shape is determined by get_pooling_batch
                 (presumably (batch, hidden_dim) -- TODO confirm)
        """
        pad = self.padding_size
        if pad < 0:
            raise RuntimeError("Dilation Rate >= 0")

        padded = x
        if pad > 0:
            # (batch, padding_size + length + padding_size, dim)
            padded = temporal_padding_3d(x, (pad, pad))
        # Bring the mask in line with the convolution output
        mask = temporal_padding_mask(mask, kernel_size=self.kernel_size, padding_size=pad)

        # (batch, conv_length, hidden_dim)
        conv_result = self.forward_conv_batch(padded)
        pooled = get_pooling_batch(conv_result, mask, self.pooling)
        dropped = dropout_from_layer(pooled, self.dropout)
        return self.act.activate(dropped + self.b)
class GrCNNEncoder(object):
    '''
    (Binary) Gated Recursive Convolutional Neural Network Encoder.
    '''

    def __init__(self, config=None, verbose=True):
        '''
        @config: GRCNNConfiger. Configer used to set the architecture of GRCNNEncoder.
                 The attributes read here are scale, activation, num_input,
                 num_hidden and random_seed.
        @verbose: Bool. Emit debug log messages while building.
        '''
        if verbose:
            logger.debug('Building Gated Recursive Convolutional Neural Network Encoder...')

        def uniform(shape):
            # Uniform values in [-1, 1], cast to floatX
            return np.random.uniform(low=-1.0, high=1.0, size=shape).astype(floatX)

        # Scale factor for initializing parameters
        self.scale = config.scale
        # Symbolic input: one row per time step
        self.input = T.matrix(name='GrCNN Encoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Set seed of the (global numpy) random generator
        np.random.seed(config.random_seed)

        # Projection matrix U: scaled uniform values
        U_val = uniform((fan_in, fan_out))
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)

        # W^l, W^r build the central hidden representation; both start as the left
        # singular vectors of a uniform random matrix (orthogonal initialisation)
        Wl_val, _, _ = np.linalg.svd(uniform((fan_out, fan_out)))
        self.Wl = theano.shared(value=Wl_val, name='W_l', borrow=True)
        Wr_val, _, _ = np.linalg.svd(uniform((fan_out, fan_out)))
        self.Wr = theano.shared(value=Wr_val, name='W_r', borrow=True)
        self.Wb = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='Wb', borrow=True)

        # G^l, G^r build the three-way gate coefficients
        self.Gl = theano.shared(value=uniform((fan_out, 3)), name='G_l', borrow=True)
        self.Gr = theano.shared(value=uniform((fan_out, 3)), name='G_r', borrow=True)
        self.Gb = theano.shared(value=np.zeros(3, dtype=floatX), name='Gb', borrow=True)

        # Save all the parameters into one batch
        self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
        # Total number of scalar parameters
        self.num_params = sum(np.prod(param.get_value().shape) for param in self.params)

        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        self.pyramids = self._scan_pyramid(self.hidden0, self.nsteps)
        # First row of the last level, as a row vector
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)

        if verbose:
            logger.debug('Finished constructing the structure of grCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Activation function: %s' % config.activation)

    def _scan_pyramid(self, hidden, nsteps):
        '''
        Apply `_step_prop` nsteps - 1 times, starting from `hidden`, and return
        the levels produced by theano.scan.
        '''
        levels, _ = theano.scan(fn=self._step_prop,
                                sequences=T.arange(nsteps - 1),
                                non_sequences=nsteps,
                                outputs_info=[hidden],
                                n_steps=nsteps - 1)
        return levels

    def _gated_merge(self, left, right):
        '''
        Combine every pair of neighbouring rows into one row.
        @left: Matrix. Left neighbours, one per row.
        @right: Matrix. Right neighbours, aligned row by row with `left`.
        The result is a soft combination of the left row, the right row and a
        newly computed central representation, weighted by a three-way softmax gate.
        '''
        central = self.act.activate(T.dot(left, self.Wl) +
                                    T.dot(right, self.Wr) + self.Wb)
        gates = T.nnet.softmax(T.dot(left, self.Gl) +
                               T.dot(right, self.Gr) + self.Gb)
        # One gate column each, reshaped to a column for broadcasting over features
        left_gate = gates[:, 0].dimshuffle(0, 'x')
        central_gate = gates[:, 1].dimshuffle(0, 'x')
        right_gate = gates[:, 2].dimshuffle(0, 'x')
        return left_gate * left + \
            right_gate * right + \
            central_gate * central

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @iter: Index of the current level, supplied by theano.scan.
        @current_level: Input matrix at current level. The first dimension corresponds to
        the timestamp while the second dimension corresponds to the dimension of hidden representation
        @nsteps: Length of the time sequence.

        The matrix keeps its shape from level to level (needed inside theano.scan);
        only the first nsteps - iter - 1 rows are overwritten with the next level.
        '''
        width = nsteps - iter - 1
        left_rows = current_level[:width]
        right_rows = current_level[1:nsteps - iter]
        next_level = self._gated_merge(left_rows, right_rows)
        return T.set_subtensor(current_level[:width], next_level)

    def _step_prop_reduce(self, current_level):
        '''
        @current_level: Input matrix at current level. The first dimension corresponds to
        the timestamp while the second dimension corresponds to the dimension of hidden representation

        Reduced version of level propagation: the result has one row less than the
        input. Much more memory and time efficient, but cannot be used inside
        theano.scan because theano.scan requires that the input and output through
        timestamps should have the same shape.
        '''
        return self._gated_merge(current_level[:-1], current_level[1:])

    def encode(self, inputM):
        '''
        @inputM: Theano symbol matrix. Compress the input matrix into output vector.
        '''
        projected = T.dot(inputM, self.U)
        # Length of the time sequence
        length = inputM.shape[0]
        levels = self._scan_pyramid(projected, length)
        return levels[-1][0].dimshuffle('x', 0)

    def L2_loss(self):
        '''
        Return L2 norm of the model parameters (weight matrices only, no biases).
        '''
        weights = (self.U, self.Wl, self.Wr, self.Gl, self.Gr)
        total = T.sum(weights[0] ** 2)
        for weight in weights[1:]:
            total = total + T.sum(weight ** 2)
        return total
class ExtGrCNNEncoder(object):
    '''
    An extension of the canonical GrCNN, with more than 1 gate at each local binary window.

    The encoder projects the input sequence with U, then repeatedly merges
    neighbouring rows (one theano.scan step per pyramid level) until a single
    row is left, which is exposed as the encoding of the whole sequence.
    '''
    def __init__(self, config, verbose=True):
        '''
        @config: GrCNNConfiger. Configer used to set the architecture of ExtGrCNNEncoder.
                 The attributes read here are: scale, activation, num_input, num_hidden,
                 random_seed and num_gates.
        @verbose: Bool. Emit debug logging while building when True.
        '''
        if verbose:
            logger.debug('Building Extended Gated Recursive Convolutional Neural Network Encoder...')
        # Scale factor for initializing model parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='ExtGrCNNEncoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        num_gates = config.num_gates
        # Initialize model parameter. NOTE: this seeds the *global* numpy RNG, and the
        # order of the draws below (U, W_l, W_r, G_l, G_r) determines the initial values.
        np.random.seed(config.random_seed)
        # Projection matrix U, of size (fan_in, fan_out)
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)
        # 3rd-tensor to implement the multi-gate GrCNN Encoders, where the first dimension corresponds
        # to the number of gates
        self.Wl = theano.shared(value=self._random_orthogonal_stack(num_gates, fan_out),
                                name='W_l', borrow=True)
        self.Wr = theano.shared(value=self._random_orthogonal_stack(num_gates, fan_out),
                                name='W_r', borrow=True)
        self.Wb = theano.shared(value=np.zeros((num_gates, fan_out), dtype=floatX),
                                name='W_b', borrow=True)
        # Multi-gate choosing functions: num_gates central candidates plus the
        # left and the right child, hence num_gates + 2 columns
        Gl_vals = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, num_gates + 2)).astype(floatX)
        self.Gl = theano.shared(value=Gl_vals, name='G_l', borrow=True)
        Gr_vals = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, num_gates + 2)).astype(floatX)
        self.Gr = theano.shared(value=Gr_vals, name='G_r', borrow=True)
        self.Gb = theano.shared(value=np.zeros(num_gates + 2, dtype=floatX), name='G_b', borrow=True)
        # Stack all the model parameters
        self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
        self.num_params = fan_in * fan_out + 2 * num_gates * fan_out * fan_out + num_gates * fan_out + \
                          2 * (num_gates + 2) * fan_out + num_gates + 2
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        # Building ExtGrCNNEncoder pyramids
        self.pyramids = self._build_pyramids(self.hidden0, self.nsteps)
        # After the last step only the first row of the level matrix is meaningful
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)
        if verbose:
            logger.debug('Finished constructing the structure of ExtGrCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Number of gating functions: %d' % num_gates)
            logger.debug('Number of parameters in ExtGrCNN: %d' % self.num_params)
            logger.debug('Activation function: %s' % config.activation)

    @staticmethod
    def _random_orthogonal_stack(num_gates, dim):
        '''
        Draw num_gates random (dim, dim) matrices and keep the left singular
        vectors of each one, stacked into an array of size (num_gates, dim, dim).
        '''
        # `range` instead of `xrange`, so that this also runs on Python 3
        raw = [np.random.uniform(low=-1.0, high=1.0, size=(dim, dim)).astype(floatX)
               for _ in range(num_gates)]
        return np.asarray([np.linalg.svd(mat)[0] for mat in raw])

    def _build_pyramids(self, hidden, nsteps):
        '''
        Run _step_prop (nsteps - 1) times starting from `hidden` and return the
        stacked per-level matrices produced by theano.scan.
        '''
        pyramids, _ = theano.scan(fn=self._step_prop,
                                  sequences=T.arange(nsteps - 1),
                                  non_sequences=nsteps,
                                  outputs_info=[hidden],
                                  n_steps=nsteps - 1)
        return pyramids

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @iter: Index of the current pyramid level, supplied by theano.scan.
        @current_level: Input matrix at current level. The first dimension corresponds to
                        the time dimension while the second dimension corresponds to the
                        dimension of hidden representation
        @nsteps: Length of the original sequence.

        Returns a matrix with the same shape as current_level whose first
        (nsteps - iter - 1) rows hold the next level; theano.scan requires the
        recurrent state to keep its shape, so the remaining rows are left as they were.
        '''
        # Shifted views of the rows that are still active at this level,
        # each with (nsteps - iter - 1) rows
        left_current_level = current_level[:nsteps - iter - 1]
        right_current_level = current_level[1:nsteps - iter]
        # Compute the temporary central multi-representation, one candidate per gate.
        # Wl / Wr are 3rd-order tensors, so the result has an extra gate axis in the
        # middle: (rows, K, d), K being the number of gates and d the hidden dimension
        multi_centrals = self.act.activate(T.dot(left_current_level, self.Wl) +
                                           T.dot(right_current_level, self.Wr) +
                                           self.Wb)
        # Compute the gating function, of size rows x (K+2)
        multi_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) +
                                     T.dot(right_current_level, self.Gr) +
                                     self.Gb)
        # Softmax-Gating combination: append a broadcastable axis so that the gates
        # can be multiplied with the hidden representations
        multi_gates = multi_gates.dimshuffle(0, 1, 'x')
        # Columns 1..K weight the central candidates ...
        next_level = multi_gates[:, 1:-1, :] * multi_centrals
        next_level = T.sum(next_level, axis=1)
        # ... column 0 weights the left child and the last column the right child
        next_level += multi_gates[:, 0] * left_current_level + multi_gates[:, -1] * right_current_level
        return T.set_subtensor(current_level[:nsteps - iter - 1], next_level)

    def encode(self, inputM):
        '''
        @input: Theano symbolic matrix. Compress the input matrix into output vector. The first
                dimension of inputM should correspond to the time dimension.

        Returns a symbolic matrix with a single row.
        '''
        hidden = T.dot(inputM, self.U)
        nsteps = inputM.shape[0]
        pyramids = self._build_pyramids(hidden, nsteps)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output
class TransEModel(EntityScorer):
    """
    Translation-based relation scorer.

    Every relation owns one translation vector; a triple (e1, r, e2) is scored
    by the activated squared L2 norm of ``e1 + W[r] - e2``.
    """

    def __init__(self, entity_dim, relation_num, activation='iden',
                 initializer=default_initializer, prefix='', verbose=True):
        """
        :param entity_dim: dimension of the entity vectors
        :param relation_num: number of relations
        :param activation: name of the activation applied to the squared distance
        :param initializer: initializer for the relation matrix
        :param prefix: prefix of the shared variable name
        :param verbose: log a summary of the architecture when True
        """
        super(TransEModel, self).__init__()
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        # One translation vector per relation: (relation_num, entity_dim)
        self.W = shared_rand_matrix((relation_num, self.entity_dim),
                                    prefix + 'TransE_R', initializer)
        self.act = Activation(activation)
        self.params = [self.W]
        self.norm_params = [self.W]
        self.l1_norm = T.sum(T.abs_(self.W))
        self.l2_norm = T.sum(self.W**2)
        if not verbose:
            return
        logger.debug(
            'Architecture of TransE Model built finished, summarized as below:'
        )
        logger.debug('Entity Dimension: %d' % self.entity_dim)
        logger.debug('Relation Number: %d' % self.relation_num)
        logger.debug('Initializer: %s' % initializer)
        logger.debug('Activation: %s' % activation)

    def score(self, e1, e2, r_index):
        """
        Score a single triple.

        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: scalar
        """
        relation = self.W[r_index]
        # (entity_dim, ) + (entity_dim, ) - (entity_dim, ) -> (entity_dim, )
        residual = e1 + relation - e2
        # (entity_dim, ) -> scalar
        sq_dist = T.sum(residual**2)
        return self.act.activate(sq_dist)

    def score_batch(self, e1, e2, r_index):
        """
        Score a batch of triples, each with its own relation.

        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: (batch, )
        :return: (batch, )
        """
        # Row-wise lookup of the relation vectors: (batch, entity_dim)
        relations = self.W[r_index]
        residual = e1 + relations - e2
        # (batch, entity_dim) -> (batch, )
        sq_dist = T.sum(residual**2, axis=1)
        return self.act.activate(sq_dist)

    def score_one_relation(self, e1, e2, r_index):
        """
        Score a batch of entity pairs that all share one relation.

        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: scalar
        :return: (batch, )
        """
        # (entity_dim, ) -> (1, entity_dim), broadcast over the batch
        relation = self.W[r_index][None, :]
        residual = e1 + relation - e2
        sq_dist = T.sum(residual**2, axis=1)
        return self.act.activate(sq_dist)
class RecurrentNormEncoder(object):
    """
    Recurrent encoder whose batched step works on 8x8 re-shaped, L2-normalised
    inputs and hidden states.

    NOTE(review): all three parameters are created with the hard-coded shape
    (8, 8) regardless of ``in_dim`` / ``hidden_dim``, and ``_step_batch``
    reshapes to (batch, 8, 8) / (batch, 64). The batched path therefore
    presumably expects in_dim == hidden_dim == 64 -- confirm against callers.
    """

    def __init__(self, in_dim, hidden_dim, pooling, activation, prefix="",
                 initializer=default_initializer, dropout=0, verbose=True):
        """
        :param in_dim: input dimension (stored and logged only)
        :param hidden_dim: hidden dimension, used for the shape of h0
        :param pooling: pooling method name handed to get_pooling / get_pooling_batch
        :param activation: name of the activation function
        :param prefix: prefix of the shared variable names
        :param initializer: initializer for W and U
        :param dropout: dropout rate (stored and logged only)
        :param verbose: log a summary of the architecture when True
        """
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.pooling = pooling
        self.dropout = dropout
        self.act = Activation(activation)
        # Composition Function Weight -- every parameter is a fixed 8x8 matrix
        # Feed-Forward Matrix
        self.W = shared_rand_matrix((8, 8), prefix + 'W_forward', initializer)
        # Bias Term
        self.b = shared_zero_matrix((8, 8), prefix + 'b_forward')
        # Recurrent Matrix
        self.U = shared_rand_matrix((8, 8), prefix + 'U_forward', initializer)
        self.params = [self.W, self.U, self.b]
        self.norm_params = [self.W, self.U]
        # L1, L2 Norm
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        # W and U have the same shape, so their squares are added before the sum
        self.l2_norm = T.sum(self.W**2 + self.U**2)
        if not verbose:
            return
        logger.debug('Architecture of {} built finished'.format(
            self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Pooling methods: %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

    def _step(self, x_t, h_t_1, w, u, b):
        """
        Step function of the single-sequence forward pass.

        :param x_t: input at the current time step
        :param h_t_1: previous hidden state
        :param w: feed-forward matrix
        :param u: recurrent matrix
        :param b: bias
        :return: new hidden state
        """
        pre_activation = T.dot(w, x_t) + T.dot(u, h_t_1) + b
        return self.act.activate(pre_activation)

    def _step_batch(self, x_t, mask, h_t_1, w, u, b):
        """
        Step function of the batched forward pass.

        :param x_t: (batch, 64), re-shaped to (batch, 8, 8) internally
        :param mask: (batch, )
        :param h_t_1: (batch, 64), re-shaped to (batch, 8, 8) internally
        :param w: (8, 8)
        :param u: (8, 8)
        :param b: (8, 8)
        :return: (batch, 64)
        """
        # View the previous state and the input as 8x8 grids
        h_t_1 = T.reshape(h_t_1, (h_t_1.shape[0], 8, 8))
        x_t = T.reshape(x_t, (x_t.shape[0], 8, 8))
        # L2-normalise the input along axis 1 of the grid
        x_norm = x_t.norm(2, axis=1)
        x_t = x_t / x_norm[:, None, :]
        h_t = self.act.activate(T.dot(x_t, w.T) + T.dot(h_t_1, u.T) + b)
        # L2-normalise the new state in the same way
        h_norm = h_t.norm(2, axis=1)
        h_t = h_t / h_norm[:, None, :]
        # Back to flat vectors
        h_t_1 = T.reshape(h_t_1, (h_t_1.shape[0], 64))
        h_t = T.reshape(h_t, (h_t.shape[0], 64))
        # Where the mask is 0 the previous state is carried over unchanged
        keep = mask[:, None]
        return h_t * keep + h_t_1 * (1 - keep)

    def forward_sequence(self, x):
        """
        Scan ``_step`` over the first axis of x.

        :param x: (len, dim)
        :return: the hidden state of every time step
        """
        h0 = shared_zero_matrix((self.hidden_dim, ), 'h0')
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        return hs

    def forward_sequence_batch(self, x, mask, batch_size):
        """
        Scan ``_step_batch`` over the time axis of a padded batch.

        :param x: (batch, max_len, dim)
        :param mask: (batch, max_len)
        :param batch_size: number of rows of the initial hidden state
        :return: (batch, max_len, dim)
        """
        h0 = shared_zero_matrix((batch_size, self.hidden_dim), 'h0')
        # scan iterates over the leading axis, so time has to come first
        x_time_major = T.transpose(x, (1, 0, 2))  # (max_len, batch, dim)
        mask_time_major = T.transpose(mask, (1, 0))  # (max_len, batch)
        hs, _ = theano.scan(
            fn=self._step_batch,
            sequences=[x_time_major, mask_time_major],
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        # (max_len, batch, dim) -> (batch, max_len, dim)
        return T.transpose(hs, (1, 0, 2))

    def forward(self, x):
        """
        Encode one sequence into a fixed-length representation by pooling.

        :param x: (len, dim)
        """
        hidden = self.forward_sequence(x)
        return get_pooling(hidden, self.pooling)

    def forward_batch(self, x, mask, batch_size):
        """
        Encode a padded batch into fixed-length representations by pooling.

        :param x: (batch, max_len, dim)
        :param mask: (batch, max_len)
        :param batch_size: number of rows of the initial hidden state
        """
        # (batch, max_len, dim) -> (batch, dim)
        hidden = self.forward_sequence_batch(x, mask, batch_size)
        return get_pooling_batch(hidden, mask, self.pooling)
class RecurrentEncoder(AbstractRecurrentEncoder):
    """
    Plain (Elman-style) recurrent encoder:
    ``h_t = act(W x_t + U h_{t-1} + b)``.
    """

    def __init__(self, in_dim, hidden_dim, pooling, activation='tanh', prefix="",
                 initializer=default_initializer, dropout=0, verbose=True):
        """
        :param in_dim: input dimension
        :param hidden_dim: hidden dimension
        :param pooling: pooling method name
        :param activation: name of the activation function
        :param prefix: prefix of the shared variable names
        :param initializer: initializer for W and U
        :param dropout: dropout rate
        :param verbose: log a summary of the architecture when True
        """
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(RecurrentEncoder, self).__init__(in_dim, hidden_dim, pooling,
                                               activation, dropout)
        # These are set again here, exactly as the original code does after
        # calling the base-class constructor.
        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.pooling = pooling
        self.dropout = dropout
        self.act = Activation(activation)
        # Composition Function Weight
        # Feed-Forward Matrix (hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim, self.in_dim),
                                    prefix + 'W_forward', initializer)
        # Bias Term (hidden)
        self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b_forward')
        # Recurrent Matrix (hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim, self.hidden_dim),
                                    prefix + 'U_forward', initializer)
        self.params = [self.W, self.U, self.b]
        self.norm_params = [self.W, self.U]
        # L1, L2 Norm
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)
        if not verbose:
            return
        logger.debug('Architecture of {} built finished'.format(
            self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Pooling methods: %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

    def _step(self, x_t, h_t_1, w, u, b):
        """
        Step function of the single-sequence forward pass.

        :param x_t: (in, )
        :param h_t_1: (hidden, )
        :param w: (hidden, in)
        :param u: (hidden, hidden)
        :param b: (hidden, )
        :return: (hidden)
        """
        # (hidden, in) (in, ) + (hidden, hidden) (hidden, ) + (hidden, ) -> hidden
        pre_activation = T.dot(w, x_t) + T.dot(u, h_t_1) + b
        return self.act.activate(pre_activation)

    def _step_batch(self, x_t, mask, h_t_1, w, u, b):
        """
        Step function of the batched forward pass.

        :param x_t: (batch, in)
        :param mask: (batch, )
        :param h_t_1: (batch, hidden)
        :param w: (hidden, in)
        :param u: (hidden, hidden)
        :param b: (hidden)
        :return: (batch, hidden)
        """
        # (batch, in) (in, hidden) + (batch, hidden) (hidden, hidden) + (hidden, ) -> (batch, hidden)
        h_t = self.act.activate(T.dot(x_t, w.T) + T.dot(h_t_1, u.T) + b)
        # Padded positions (mask == 0) carry the previous state over unchanged
        keep = mask[:, None]
        return h_t * keep + h_t_1 * (1 - keep)

    def forward_scan(self, x):
        """
        Scan ``_step`` over the first axis of x, starting from a zero state.

        :param x: (len, dim)
        :return: the hidden state of every time step
        """
        h0 = T.zeros((self.hidden_dim, ))
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        return hs

    def forward_scan_batch(self, x, mask):
        """
        Scan ``_step_batch`` over the time axis of a padded batch.

        :param x: (batch, max_len, dim)
        :param mask: (batch, max_len)
        :return: (batch, max_len, hidden)
        """
        h0 = T.zeros((x.shape[0], self.hidden_dim))
        # scan iterates over the leading axis, so time has to come first
        x_time_major = T.transpose(x, (1, 0, 2))  # (max_len, batch, dim)
        mask_time_major = T.transpose(mask, (1, 0))  # (max_len, batch)
        hs, _ = theano.scan(
            fn=self._step_batch,
            sequences=[x_time_major, mask_time_major],
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        # (max_len, batch, dim) -> (batch, max_len, dim)
        return T.transpose(hs, (1, 0, 2))
class GRUEncoder(AbstractRecurrentEncoder):
    """
    Gated Recurrent Unit encoder (no bias terms).

    W and U stack the reset, update and candidate ("recurrent") weights along
    their first axis, in that order.

    NOTE: the candidate state is passed through ``self.act``. Earlier revisions
    of this class left the candidate linear, so the ``activation`` argument had
    no effect; parameters trained with that revision will not reproduce the
    same outputs.
    """

    def __init__(self, in_dim, hidden_dim, pooling, activation='tanh',
                 gates=("sigmoid", "sigmoid"), prefix="",
                 initializer=default_initializer, dropout=0, verbose=True):
        """
        :param in_dim: input dimension
        :param hidden_dim: hidden dimension
        :param pooling: pooling method name
        :param activation: name of the activation applied to the candidate state
        :param gates: names of the (reset, update) gate activations
        :param prefix: prefix of the shared variable names
        :param initializer: initializer for W and U
        :param dropout: dropout rate
        :param verbose: log a summary of the architecture when True
        """
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(GRUEncoder, self).__init__(in_dim, hidden_dim, pooling,
                                         activation, dropout)
        self.reset_gate, self.update_gate = Activation(gates[0]), Activation(
            gates[1])
        # W [reset, update, recurrent] (3 * hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim * 3, self.in_dim),
                                    prefix + 'W', initializer)
        # U [reset, update, recurrent] (3 * hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim * 3, self.hidden_dim),
                                    prefix + 'U', initializer)
        # This encoder deliberately has no bias parameter
        self.params = [self.W, self.U]
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)
        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension: %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods: %s' % self.pooling)
            logger.debug('Activation Func: %s' % self.act.method)
            logger.debug('Reset Gate: %s' % self.reset_gate.method)
            logger.debug('Update Gate: %s' % self.update_gate.method)
            logger.debug('Dropout Rate: %f' % self.dropout)

    def _cell(self, x_t, h_t_1, w, u):
        """
        One GRU update shared by the single and the batched step function.

        :param x_t: (in, ) or (batch, in)
        :param h_t_1: (hidden, ) or (batch, hidden)
        :param w: (3 * hidden, in)
        :param u: (3 * hidden, hidden)
        :return: new hidden state, same shape as h_t_1
        """
        gate_rows = self.hidden_dim * 2
        # (hidden * 2, in)
        reset_update_w = w[:gate_rows, :]
        # (hidden * 2, hidden)
        reset_update_u = u[:gate_rows, :]
        # (hidden, in)
        recurrent_w = w[gate_rows:, :]
        # (hidden, hidden)
        recurrent_u = u[gate_rows:, :]
        # (..., in) dot (in, hidden * 2) + (..., hidden) dot (hidden, hidden * 2) -> (..., hidden * 2)
        pre_calc = T.dot(x_t, reset_update_w.T) + T.dot(h_t_1,
                                                        reset_update_u.T)
        # (..., hidden * 2) -> (..., hidden) (..., hidden)
        # ndarray_slice(pre_calc, n, size) presumably selects the n-th chunk of
        # width `size` along the last axis -- verify against the helper
        reset_t = self.reset_gate.activate(
            ndarray_slice(pre_calc, 0, self.hidden_dim))
        update_t = self.update_gate.activate(
            ndarray_slice(pre_calc, 1, self.hidden_dim))
        # Candidate state: the reset gate scales the previous state before the
        # recurrent projection, and the configured activation is applied
        g_t = self.act.activate(
            T.dot(x_t, recurrent_w.T) + T.dot(h_t_1 * reset_t, recurrent_u.T))
        # Interpolate between the previous state and the candidate
        return update_t * h_t_1 + (1 - update_t) * g_t

    def _step(self, x_t, h_t_1, w, u):
        """
        Step function of the single-sequence forward pass.

        :param x_t: (in, )
        :param h_t_1: (hidden, )
        :param w: (3 * hidden, in)
        :param u: (3 * hidden, hidden)
        :return: (hidden, )
        """
        return self._cell(x_t, h_t_1, w, u)

    def _step_batch(self, x_t, m_t, h_t_1, w, u):
        """
        Step function of the batched forward pass.

        :param x_t: (batch, in)
        :param m_t: (batch, ) mask of the current time step
        :param h_t_1: (batch, hidden)
        :param w: (3 * hidden, in)
        :param u: (3 * hidden, hidden)
        :return: (batch, hidden)
        """
        h_t = self._cell(x_t, h_t_1, w, u)
        # Padded positions (mask == 0) carry the previous state over unchanged
        h_t = m_t[:, None] * h_t + (1. - m_t)[:, None] * h_t_1
        # (batch, hidden)
        return h_t

    def forward_scan(self, x):
        """
        Scan ``_step`` over the first axis of x, starting from a zero state.

        :param x: (len, dim)
        :return: the hidden state of every time step
        """
        h0 = shared_zero_matrix((self.hidden_dim, ), 'h0_forward')
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0],
            non_sequences=[self.W, self.U],
        )
        return hs

    def forward_scan_batch(self, x, mask):
        """
        Scan ``_step_batch`` over the time axis of a padded batch.

        :param x: (batch, max_len, dim)
        :param mask: (batch, max_len)
        :return: (batch, max_len, hidden)
        """
        h0 = T.zeros((x.shape[0], self.hidden_dim))
        hs, _ = theano.scan(
            fn=self._step_batch,
            # scan iterates over the leading axis, so time has to come first
            sequences=[T.transpose(x, (1, 0, 2)),
                       T.transpose(mask, (1, 0))],
            outputs_info=[
                h0,
            ],
            non_sequences=[self.W, self.U],
        )
        return T.transpose(hs, (1, 0, 2))
class LSTMEncoder(AbstractRecurrentEncoder):
    """
    LSTM encoder.

    W, U and b stack the input-gate, forget-gate, output-gate and candidate
    ("recurrent") parameters along their first axis, in that order.
    """

    def __init__(self, in_dim, hidden_dim, pooling, activation='tanh',
                 gates=("sigmoid", "sigmoid", "sigmoid"), prefix="",
                 initializer=OrthogonalInitializer(), dropout=0, verbose=True):
        """
        :param in_dim: input dimension
        :param hidden_dim: hidden dimension
        :param pooling: pooling method name
        :param activation: name of the activation for the candidate and the cell output
        :param gates: names of the (input, forget, output) gate activations
        :param prefix: prefix of the shared variable names
        :param initializer: initializer for W and U
        :param dropout: dropout rate
        :param verbose: log a summary of the architecture when True
        """
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(LSTMEncoder, self).__init__(in_dim, hidden_dim, pooling,
                                          activation, dropout)
        self.in_gate, self.forget_gate, self.out_gate = Activation(
            gates[0]), Activation(gates[1]), Activation(gates[2])
        # W [in, forget, output, recurrent] (4 * hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim * 4, self.in_dim),
                                    prefix + 'W', initializer)
        # U [in, forget, output, recurrent] (4 * hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim * 4, self.hidden_dim),
                                    prefix + 'U', initializer)
        # b [in, forget, output, recurrent] (4 * hidden,)
        self.b = shared_zero_matrix((self.hidden_dim * 4, ), prefix + 'b')
        self.params = [self.W, self.U, self.b]
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)
        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension: %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods: %s' % self.pooling)
            logger.debug('Activation Func: %s' % self.act.method)
            logger.debug('Input Gate: %s' % self.in_gate.method)
            logger.debug('Forget Gate: %s' % self.forget_gate.method)
            logger.debug('Output Gate: %s' % self.out_gate.method)
            logger.debug('Dropout Rate: %f' % self.dropout)

    def _cell(self, pre_calc, c_t_1):
        """
        Split the stacked pre-activations into gates and compute the new
        hidden and cell state.

        :param pre_calc: (..., 4 * hidden) stacked pre-activations
        :param c_t_1: (..., hidden) previous cell state
        :return: (h_t, c_t), both (..., hidden)
        """
        # ndarray_slice(pre_calc, n, size) presumably selects the n-th chunk of
        # width `size` along the last axis -- verify against the helper
        i_t = self.in_gate.activate(ndarray_slice(pre_calc, 0, self.hidden_dim))
        f_t = self.forget_gate.activate(
            ndarray_slice(pre_calc, 1, self.hidden_dim))
        o_t = self.out_gate.activate(
            ndarray_slice(pre_calc, 2, self.hidden_dim))
        g_t = self.act.activate(ndarray_slice(pre_calc, 3, self.hidden_dim))
        # Forget part of the old cell and write the gated candidate into it
        c_t = f_t * c_t_1 + i_t * g_t
        h_t = o_t * self.act.activate(c_t)
        return h_t, c_t

    def _step(self, x_t, h_t_1, c_t_1, w, u, b):
        """
        Step function of the single-sequence forward pass.

        :param x_t: (in, )
        :param h_t_1: (hidden, )
        :param c_t_1: (hidden, )
        :param w: (4 * hidden, in)
        :param u: (4 * hidden, hidden)
        :param b: (4 * hidden, )
        :return: (h_t, c_t), both (hidden, )
        """
        # (4 * hidden, in) (in, ) + (4 * hidden, hidden) (hidden, ) + (4 * hidden, ) -> (4 * hidden, )
        pre_calc = T.dot(w, x_t) + T.dot(u, h_t_1) + b
        return self._cell(pre_calc, c_t_1)

    def _step_batch(self, x_t, m_t, h_t_1, c_t_1, w, u, b):
        """
        Step function of the batched forward pass.

        :param x_t: (batch, in)
        :param m_t: (batch, ) mask of the current time step
        :param h_t_1: (batch, hidden)
        :param c_t_1: (batch, hidden)
        :param w: (4 * hidden, in)
        :param u: (4 * hidden, hidden)
        :param b: (4 * hidden, )
        :return: (h_t, c_t), both (batch, hidden)
        """
        # (batch, in) (in, hidden * 4) + (batch, hidden) (hidden, hidden * 4) + (hidden * 4)
        # -> (batch, hidden * 4)
        pre_calc = T.dot(x_t, w.T) + T.dot(h_t_1, u.T) + b
        h_t, c_t = self._cell(pre_calc, c_t_1)
        # Padded positions (mask == 0) carry both states over unchanged
        c_t = m_t[:, None] * c_t + (1. - m_t)[:, None] * c_t_1
        h_t = m_t[:, None] * h_t + (1. - m_t)[:, None] * h_t_1
        # (batch, hidden) (batch, hidden)
        return h_t, c_t

    def forward_scan(self, x):
        """
        Scan ``_step`` over the first axis of x, starting from zero states.

        :param x: (len, dim)
        :return: the hidden state of every time step
        """
        h0 = shared_zero_matrix((self.hidden_dim, ), 'h0_forward')
        c0 = shared_zero_matrix((self.hidden_dim, ), 'c0_forward')
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0, c0],
            non_sequences=[self.W, self.U, self.b],
        )
        # scan returns [hidden states, cell states]; only the former is exposed
        return hs[0]

    def forward_scan_batch(self, x, mask):
        """
        Scan ``_step_batch`` over the time axis of a padded batch.

        :param x: (batch, max_len, dim)
        :param mask: (batch, max_len)
        :return: (batch, max_len, hidden)
        """
        h0 = T.zeros((x.shape[0], self.hidden_dim))
        c0 = T.zeros((x.shape[0], self.hidden_dim))
        hs, _ = theano.scan(
            fn=self._step_batch,
            # scan iterates over the leading axis, so time has to come first
            sequences=[T.transpose(x, (1, 0, 2)),
                       T.transpose(mask, (1, 0))],
            outputs_info=[h0, c0],
            non_sequences=[self.W, self.U, self.b],
        )
        # Hidden states only: (max_len, batch, hidden) -> (batch, max_len, hidden)
        return T.transpose(hs[0], (1, 0, 2))
class RecursiveEncoder(object):
    """
    Recursive auto-encoder: two child vectors are composed into one parent
    vector, and the parent is decoded back into two children to obtain a
    reconstruction loss.
    """

    def __init__(self, in_dim, hidden_dim, initializer=default_initializer,
                 normalize=True, dropout=0, reconstructe=True,
                 activation="tanh", verbose=True):
        """
        :param in_dim: input dimension
        :param hidden_dim: hidden dimension, has to be equal to in_dim
        :param initializer: random initializer
        :param normalize: whether composed / reconstructed vectors are L2-normalized
        :param dropout: dropout rate
        :param reconstructe: kept for interface compatibility; not read here
        :param activation: activation function
        :param verbose: whether to emit debug log output
        :return:
        """
        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        # A parent vector is fed back in as a child, so both sizes must agree
        assert self.in_dim == self.hidden_dim
        self.initializer = initializer
        self.normalize = normalize
        self.dropout = dropout
        self.verbose = verbose
        self.act = Activation(activation)
        # Composition Function Weight
        # (dim, 2 * dim)
        self.W = shared_rand_matrix((self.hidden_dim, 2 * self.in_dim), 'W',
                                    initializer=initializer)
        # (dim, )
        self.b = shared_zero_matrix((self.hidden_dim, ), 'b')
        # Reconstruction Function Weight
        # (2 * dim, dim)
        self.Wr = shared_rand_matrix((2 * self.in_dim, self.hidden_dim), 'Wr',
                                     initializer=initializer)
        # (2 * dim, )
        self.br = shared_zero_matrix((self.in_dim * 2, ), 'br')
        self.params = [self.W, self.b, self.Wr, self.br]
        self.norm_params = [self.W, self.Wr]
        self.l1_norm = sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = sum([T.sum(param**2) for param in self.norm_params])
        if verbose:
            logger.debug(
                'Architecture of RAE built finished, summarized as below: ')
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Normalize: %s' % self.normalize)
            # Log the activation's name, as the other encoders in this file do
            logger.debug('Activation: %s' % self.act.method)
            logger.debug('Dropout Rate: %s' % self.dropout)

    def compose(self, left_v, right_v):
        """
        Compose two child vectors and compute the reconstruction loss.

        :param left_v: (dim, )
        :param right_v: (dim, )
        :return: parent vector (dim, ), reconstruction loss (scalar)
        """
        # (dim, ) (dim, ) -> (2 * dim, )
        v = T.concatenate([left_v, right_v])
        # (dim, 2 * dim) dot (2 * dim, ) -> (dim, )
        z = self.act.activate(self.b + T.dot(self.W, v))
        if self.normalize:
            # epsilon guards against a zero norm, as in compose_batch
            z = z / (z.norm(2) + epsilon)
        # (2 * dim, dim) dot (dim, ) -> (2 * dim, )
        r = self.act.activate(self.br + T.dot(self.Wr, z))
        w_left_r, w_right_r = r[:self.hidden_dim], r[self.hidden_dim:]
        if self.normalize:
            w_left_r = w_left_r / (w_left_r.norm(2) + epsilon)
            w_right_r = w_right_r / (w_right_r.norm(2) + epsilon)
        loss_rec = T.sum((w_left_r - left_v)**2) + T.sum(
            (w_right_r - right_v)**2)
        return z, loss_rec

    def encode(self, seq, vecs, loss_rec):
        """
        One scan step of the single-sequence composition.

        :param seq: (3, ) indices (left child, right child, parent slot)
        :param vecs: (2 * length - 1, dim) all node vectors built so far
        :param loss_rec: loss of the previous step; supplied by scan, not read
        :return: updated vecs, reconstruction loss of this step
        """
        # vecs[seq[0]] and vecs[seq[1]] ==> vecs[seq[2]]
        w_left, w_right = vecs[seq[0]], vecs[seq[1]]
        z, loss_rec = self.compose(w_left, w_right)
        return T.set_subtensor(vecs[seq[2]], z), loss_rec

    def forward(self, x, seq):
        """
        :param x: (length, dim)
        :param seq: (length - 1, 3)
        :return: root vector (dim, ), summed reconstruction loss (scalar)
        """
        # (length, dim) -> (2 * length - 1, dim)
        vector = T.concatenate([x, T.zeros_like(x)[:-1, :]], axis=0)
        # scan length-1 times
        hs, _ = theano.scan(fn=self.encode,
                            sequences=seq,
                            outputs_info=[vector, shared_scalar(0)],
                            name="compose_phrase")
        # Last row of the node matrix after the last step
        comp_vec_init = hs[0][-1][-1]
        comp_rec_init = T.sum(hs[1])
        # A single-word input is never composed, so its (optionally normalized)
        # vector is used directly, with a zero loss
        if self.normalize:
            hidden = x[0] / (x[0].norm(2) + epsilon)
        else:
            hidden = x[0]
        comp_vec = ifelse(x.shape[0] > 1, comp_vec_init, hidden)
        comp_rec = ifelse(x.shape[0] > 1, comp_rec_init, shared_zero_scalar())
        return comp_vec, comp_rec

    def compose_batch(self, left, right, W, b, Wr, br):
        """
        Composition function: one composition step for every phrase of a batch.

        :param left: (batch, dim)
        :param right: (batch, dim)
        :param W: (dim, 2 * dim)
        :param b: (dim, )
        :param Wr: (2 * dim, dim)
        :param br: (2 * dim, )
        :return: parent vectors (batch, dim), reconstruction loss (batch, )
        """
        # [(batch, dim) (batch, dim)] -> (batch, 2 * dim)
        v = T.concatenate([left, right], axis=1)
        # (batch, 2 * dim) dot (dim, 2 * dim)T -> (batch, dim)
        z = self.act.activate(b + T.dot(v, W.T))
        if self.normalize:
            # (batch, dim) -> (batch, dim) normalize by row
            z = z / (z.norm(2, axis=1)[:, None] + epsilon)
        # (batch, dim) dot (2 * dim, dim)T -> (batch, 2 * dim)
        r = self.act.activate(br + T.dot(z, Wr.T))
        # (batch, 2 * dim) -> [(batch, dim) (batch, dim)]
        left_r, right_r = r[:, :self.hidden_dim], r[:, self.hidden_dim:]
        if self.normalize:
            # (batch, dim) -> (batch, dim) normalize by row
            left_r /= (left_r.norm(2, axis=1)[:, None] + epsilon)
            # (batch, dim) -> (batch, dim) normalize by row
            right_r /= (right_r.norm(2, axis=1)[:, None] + epsilon)
        # (batch, )
        loss_rec = T.sum((left_r - left)**2, axis=1) + T.sum(
            (right_r - right)**2, axis=1)
        # (batch, dim) (batch)
        return z, loss_rec

    def encode_batch(self, _seq, _mask, _input, _pre, loss_rec, W, b, Wr, br,
                     range_index):
        """
        Scan step executed once per composition while building the phrase
        representations of a batch.

        :param _seq: (batch, 3)
        :param _mask: (batch, )
        :param _input: (batch, word * 2 - 1, dim)
        :param _pre: (batch, dim)
        :param loss_rec: (batch, ) loss of the previous step; supplied by scan, not read
        :param W: (dim, 2 * dim)
        :param b: (dim, )
        :param Wr: (2 * dim, dim)
        :param br: (2 * dim, )
        :param range_index: (batch, ) row indices 0 .. batch - 1
        :return: updated _input, composed vectors (batch, dim), loss (batch, )
        """
        left = _seq[:, 0]
        right = _seq[:, 1]
        # (batch, dim)
        left_vec = _input[range_index, left]
        # (batch, dim)
        right_vec = _input[range_index, right]
        # (batch, dim) (batch, dim) -> (batch, dim), (batch, )
        left_right, loss_rec = self.compose_batch(left_vec, right_vec, W, b,
                                                  Wr, br)
        # (batch, dim)
        # A mask of 0 means this step lies beyond the length of the phrase: the
        # computation is redundant, so the previous step's result is carried over
        left_right = _mask[:, None] * left_right + (1. - _mask[:, None]) * _pre
        # (batch, )
        # For the same reason the loss of such a step is zeroed by the mask
        loss_rec *= _mask
        # (batch, word * 2 - 1, dim), (batch, dim), (batch, )
        return T.set_subtensor(_input[range_index, _seq[:, 2]],
                               left_right), left_right, loss_rec

    def forward_batch(self, x, mask, seqs):
        """
        :param x: (batch, length, dim)
        :param mask: (batch, length)
        :param seqs: (batch, length - 1, 3)
        :return: phrase vectors (batch, dim), summed reconstruction loss (batch, )
        """
        zeros_rec = T.zeros((x.shape[0], ))
        # (batch, length, dim) -> (batch, 2 * length - 1, dim)
        vector = T.concatenate([x, T.zeros_like(x)[:, :-1, :]], axis=1)
        # scan only iterates over the first axis of a tensor, so the inputs are transposed
        # (batch, length - 1, 3) -> (length - 1, batch, 3)
        seqs = T.transpose(seqs, axes=(1, 0, 2))
        # (batch, length) -> (length, batch)
        mask = T.transpose(mask, axes=(1, 0))
        range_index = T.arange(x.shape[0])
        result, _ = theano.scan(
            fn=self.encode_batch,  # composes one step for every phrase of the batch
            sequences=[seqs, mask[1:]],  # scan over composition paths and masks
            # A phrase of length n needs n - 1 compositions, so for a phrase of
            # length 1 the mask is already 0 in the first iteration. The first
            # word (index 0 of vector) is therefore used as the initial value
            # and is returned as-is for such phrases.
            outputs_info=[vector, vector[:, 0, :], zeros_rec],
            non_sequences=[self.W, self.b, self.Wr, self.br, range_index],
            name="compose_scan")
        phrases, pres, loss_recs = result
        # (word - 1, batch, dim) -> (batch, dim)
        # The result of the last composition step is the final representation
        phrases = pres[-1]
        sum_loss_recs = T.sum(loss_recs, axis=0)
        # (batch, dim)
        # Normalize; epsilon guards against a zero norm, as in compose_batch
        if self.normalize:
            phrases = phrases / (phrases.norm(2, axis=1)[:, None] + epsilon)
        return phrases, sum_loss_recs
class NeuralTensorModel(EntityScorer):
    """
    Neural Tensor Network relation scorer.

    For every relation, each of the ``hidden`` slices of W defines a bilinear
    form ``e1^T W[:, :, k] e2``. With ``keep_normal`` a linear layer V over the
    concatenated entities and a bias b are added before the activation. The
    activated slice values are combined with the per-relation vector U.
    """

    def __init__(self, entity_dim, relation_num, activation='tanh', hidden=5,
                 keep_normal=False, initializer=default_initializer, prefix='',
                 verbose=True):
        """
        :param entity_dim: dimension of the entity vectors
        :param relation_num: number of relations
        :param activation: name of the activation function
        :param hidden: number of tensor slices
        :param keep_normal: add the linear layer V and the bias b when True
        :param initializer: initializer for W (and V)
        :param prefix: prefix of the shared variable names
        :param verbose: log a summary of the architecture when True
        """
        super(NeuralTensorModel, self).__init__()
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        self.hidden = hidden
        # Slice indices 0 .. hidden - 1, scanned over when scoring
        self.slice_seq = T.arange(hidden)
        self.keep_normal = keep_normal
        # (relation_num, entity_dim, entity_dim, hidden)
        self.W = shared_rand_matrix(
            (relation_num, self.entity_dim, self.entity_dim, self.hidden),
            prefix + 'NTN_W', initializer)
        # (relation_num, hidden)
        self.U = shared_ones_matrix((relation_num, self.hidden),
                                    name=prefix + 'NTN_U')
        if keep_normal:
            # (relation_num, 2 * entity_dim, hidden)
            self.V = shared_rand_matrix(
                (relation_num, self.entity_dim * 2, self.hidden),
                prefix + 'NTN_V', initializer)
            # (relation_num, hidden)
            self.b = shared_zero_matrix((relation_num, self.hidden),
                                        name=prefix + 'NTN_B')
            self.params = [self.W, self.V, self.U, self.b]
            self.norm_params = [self.W, self.V, self.U, self.b]
        else:
            # U stays at its all-ones initial value here: it is used for
            # scoring but is not listed as a parameter
            self.params = [self.W]
            self.norm_params = [self.W]
        self.act = Activation(activation)
        self.l1_norm = T.sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param**2) for param in self.norm_params])
        if verbose:
            logger.debug(
                'Architecture of Tensor Model built finished, summarized as below:'
            )
            logger.debug('Entity Dimension: %d' % self.entity_dim)
            logger.debug('Hidden Dimension: %d' % self.hidden)
            logger.debug('Relation Number: %d' % self.relation_num)
            logger.debug('Initializer: %s' % initializer)
            logger.debug('Activation: %s' % activation)

    @staticmethod
    def step(_slice, e1, e2, w):
        """
        Bilinear form of one tensor slice for a single entity pair.

        :param _slice: scalar
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param w : (entity_dim, entity_dim, hidden)
        :return: scalar
        """
        # The slice axis of w is the last one (as in step_relation and
        # step_batch), so the slice has to be taken there and not on axis 0.
        # (entity_dim, ) dot (entity_dim, entity_dim) dot (entity_dim) -> scalar
        return T.dot(e1, T.dot(w[:, :, _slice], e2))

    @staticmethod
    def step_relation(_slice, e1, e2, w):
        """
        Bilinear form of one tensor slice for a batch that shares one relation.

        :param _slice: scalar
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param w : (entity_dim, entity_dim, hidden)
        :return: (batch, )
        """
        # (batch, entity_dim, ) dot (entity_dim, entity_dim) -> (batch, entity_dim)
        hidden = T.dot(e1, w[:, :, _slice])
        # (batch, entity_dim) dot (batch, entity_dim, ) -> (batch, )
        hidden = T.sum(hidden * e2, axis=1)
        return hidden

    @staticmethod
    def step_batch(_slice, e1, e2, w):
        """
        Bilinear form of one tensor slice for a batch with per-row relations.

        :param _slice: scalar
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param w : (batch, entity_dim, entity_dim, hidden)
        :return: (batch, )
        """
        # (batch, entity_dim, ) dot (batch, entity_dim, entity_dim) -> (batch, entity_dim)
        hidden = T.batched_dot(e1, w[:, :, :, _slice])
        # (batch, entity_dim) dot (batch, entity_dim, ) -> (batch, )
        hidden = T.sum(hidden * e2, axis=1)
        return hidden

    def score(self, e1, e2, r_index):
        """
        Score a single triple.

        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: scalar
        """
        # (entity_dim, ) dot (entity_dim, entity_dim, hidden) dot (entity_dim, ) -> (hidden, )
        hidden1_sep, _ = theano.scan(fn=self.step,
                                     sequences=[self.slice_seq],
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='single_scan')
        hidden1 = T.concatenate([hidden1_sep])
        if self.keep_normal:
            # (2 * entity_dim, ) dot (2 * entity_dim, hidden) -> (hidden, )
            hidden2 = T.dot(T.concatenate([e1, e2]), self.V[r_index])
            # (hidden, ) + (hidden, ) + (hidden, ) -> (hidden, )
            hidden = hidden1 + hidden2 + self.b[r_index]
        else:
            hidden = hidden1
        # (hidden, ) -> (hidden, )
        act_hidden = self.act.activate(hidden)
        # (hidden, ) dot (hidden, ) -> scalar
        return T.dot(act_hidden, self.U[r_index])

    def score_batch(self, e1, e2, r_index):
        """
        Score a batch of triples, each with its own relation.

        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: (batch, )
        :return: (batch, )
        """
        # (batch, entity_dim) dot (batch, entity_dim, entity_dim, hidden) dot (batch, entity_dim) -> hidden * (batch, )
        hidden1_sep, _ = theano.scan(fn=self.step_batch,
                                     sequences=[self.slice_seq],
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='batch_scan')
        # hidden * (batch, ) -> (batch, hidden)
        hidden1 = T.concatenate([hidden1_sep], axis=1).transpose()
        if self.keep_normal:
            # (batch, 2 * entity_dim) dot (batch, 2 * entity_dim, hidden) -> (batch, hidden, )
            hidden2 = T.batched_dot(T.concatenate([e1, e2], axis=1),
                                    self.V[r_index])
            # (batch, hidden) + (batch, hidden) + (batch, hidden) -> (batch, hidden)
            hidden = hidden1 + hidden2 + self.b[r_index]
        else:
            hidden = hidden1
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) dot (batch, hidden) -> (batch, )
        return T.sum(act_hidden * self.U[r_index], axis=1)

    def score_one_relation(self, e1, e2, r_index):
        """
        Score a batch of entity pairs that all share one relation.

        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: scalar
        :return: (batch, )
        """
        # (batch, entity_dim) dot (entity_dim, entity_dim, hidden) dot (batch, entity_dim) -> hidden * (batch, )
        hidden1_sep, _ = theano.scan(fn=self.step_relation,
                                     sequences=self.slice_seq,
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='relation_scan')
        # hidden * (batch, ) -> (batch, hidden)
        hidden1 = T.concatenate([hidden1_sep], axis=1).transpose()
        if self.keep_normal:
            # (batch, 2 * entity_dim) dot (2 * entity_dim, hidden) -> (batch, hidden)
            hidden2 = T.dot(T.concatenate([e1, e2], axis=1), self.V[r_index])
            # (batch, hidden) + (batch, hidden) + (hidden) -> (batch, hidden)
            hidden = hidden1 + hidden2 + self.b[r_index][None, :]
        else:
            hidden = hidden1
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) dot (hidden, ) -> (batch, ), U broadcast over the batch
        return T.sum(act_hidden * self.U[r_index], axis=1)
class BRNN(object):
    '''
    Bidirectional RNN classifier.
    This is just a trial for using BRNN as a tool for sentence modeling. First trial
    on the task of sentiment analysis.

    Every trainable parameter is a (reshaped) slice of the single flat shared
    vector ``theta``, so the whole model can be updated from one gradient vector
    (see ``update_params``).
    '''

    def __init__(self, configs, verbose=True):
        '''
        Build the symbolic graph and compile the Theano functions.
        @configs: Configuration object. The attributes read here are: activation,
                  num_input, num_hidden, num_class, bptt, random_seed, dropout,
                  regularization and lambda1.
        @verbose: Boolean. Print a summary of the architecture when True.
        '''
        if verbose:
            pprint('Build Tied weights Bidirectional Recurrent Neural Network')
        self.input = T.matrix(name='input')
        self.truth = T.ivector(name='label')
        self.learn_rate = T.scalar(name='learn rate')
        # Configure Activation function
        self.act = Activation(configs.activation)
        # Build bidirectional RNN
        num_input, num_hidden, num_class = configs.num_input, configs.num_hidden, configs.num_class
        # Stack all the variables together into a vector in order to apply the batch updating
        # algorithm. Since there are two directions for the RNN, all the weight matrices
        # associated with the RNN are duplicated.
        num_params = 2 * (num_input * num_hidden +
                          num_hidden * num_hidden +
                          num_hidden) + \
            2 * num_hidden * num_class + \
            num_class
        self.num_params = num_params
        self.theta = theano.shared(value=np.zeros(num_params, dtype=floatX),
                                   name='theta', borrow=True)

        def uniform_init(rows, cols):
            # Uniform samples in [-sqrt(6 / (rows + cols)), sqrt(6 / (rows + cols))]
            bound = np.sqrt(6.0 / (rows + cols))
            return np.asarray(np.random.uniform(low=-bound, high=bound, size=(rows, cols)),
                              dtype=floatX)

        # Incremental index into theta
        param_idx = 0
        # 1, Feed-forward matrix for forward direction: W_forward
        self.W_forward = self.theta[param_idx:param_idx + num_input * num_hidden].reshape(
            (num_input, num_hidden))
        self.W_forward.name = 'W_forward_RNN'
        W_forward_init = uniform_init(num_input, num_hidden)
        param_idx += num_input * num_hidden
        # 1, Feed-forward matrix for backward direction: W_backward
        self.W_backward = self.theta[param_idx:param_idx + num_input * num_hidden].reshape(
            (num_input, num_hidden))
        self.W_backward.name = 'W_backward_RNN'
        W_backward_init = uniform_init(num_input, num_hidden)
        param_idx += num_input * num_hidden
        # 2, Recurrent matrix for forward direction: U_forward
        self.U_forward = self.theta[param_idx:param_idx + num_hidden * num_hidden].reshape(
            (num_hidden, num_hidden))
        self.U_forward.name = 'U_forward_RNN'
        U_forward_init = uniform_init(num_hidden, num_hidden)
        param_idx += num_hidden * num_hidden
        # 2, Recurrent matrix for backward direction: U_backward
        self.U_backward = self.theta[param_idx:param_idx + num_hidden * num_hidden].reshape(
            (num_hidden, num_hidden))
        self.U_backward.name = 'U_backward_RNN'
        U_backward_init = uniform_init(num_hidden, num_hidden)
        param_idx += num_hidden * num_hidden
        # 3, Bias parameter for the hidden-layer forward direction RNN
        self.b_forward = self.theta[param_idx:param_idx + num_hidden]
        self.b_forward.name = 'b_forward_RNN'
        b_forward_init = np.zeros(num_hidden, dtype=floatX)
        param_idx += num_hidden
        # 3, Bias parameter for the hidden-layer backward direction RNN
        self.b_backward = self.theta[param_idx:param_idx + num_hidden]
        self.b_backward.name = 'b_backward_RNN'
        b_backward_init = np.zeros(num_hidden, dtype=floatX)
        param_idx += num_hidden
        # Weight matrix for softmax function
        self.W_softmax = self.theta[param_idx:param_idx + 2 * num_hidden * num_class].reshape(
            (2 * num_hidden, num_class))
        self.W_softmax.name = 'W_softmax'
        W_softmax_init = uniform_init(2 * num_hidden, num_class)
        param_idx += 2 * num_hidden * num_class
        # Bias vector for softmax function
        self.b_softmax = self.theta[param_idx:param_idx + num_class]
        self.b_softmax.name = 'b_softmax'
        b_softmax_init = np.zeros(num_class, dtype=floatX)
        param_idx += num_class
        # Set all the default parameters into theta; the order must match the slicing above
        self.theta.set_value(
            np.concatenate([
                x.ravel()
                for x in (W_forward_init, W_backward_init, U_forward_init,
                          U_backward_init, b_forward_init, b_backward_init,
                          W_softmax_init, b_softmax_init)
            ]))
        assert param_idx == num_params
        # h[0], zero vector, treated as constants (they are not part of theta)
        self.h_start = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                     name='h_start', borrow=True)
        self.h_end = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                   name='h_end', borrow=True)

        # recurrent function used to compress a sequence of input vectors
        # the first dimension should correspond to time
        def forward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_forward) +
                                    T.dot(h_tm1, self.U_forward) + self.b_forward)
            return h_t

        def backward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_backward) +
                                    T.dot(h_tm1, self.U_backward) + self.b_backward)
            return h_t

        # Forward and backward representation over time
        self.forward_h, _ = theano.scan(fn=forward_step,
                                        sequences=self.input,
                                        outputs_info=[self.h_start],
                                        truncate_gradient=configs.bptt)
        self.backward_h, _ = theano.scan(fn=backward_step,
                                         sequences=self.input,
                                         outputs_info=[self.h_end],
                                         truncate_gradient=configs.bptt,
                                         go_backwards=True)
        # Summarise each direction by averaging its hidden states over time
        self.h_start_star = T.mean(self.forward_h, axis=0)
        self.h_end_star = T.mean(self.backward_h, axis=0)
        # L1, L2 regularization.
        # Each matrix is reduced to a scalar before the terms are added: the
        # matrices have different shapes, so they cannot be added elementwise.
        self.L1_norm = T.sum(T.abs_(self.W_forward)) + T.sum(T.abs_(self.W_backward)) + \
            T.sum(T.abs_(self.U_forward)) + T.sum(T.abs_(self.U_backward)) + \
            T.sum(T.abs_(self.W_softmax))
        self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
            T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2) + \
            T.sum(self.W_softmax ** 2)
        # Build function to show the learned representation for different sentences
        self.show_forward = theano.function(inputs=[self.input], outputs=self.h_start_star)
        self.show_backward = theano.function(inputs=[self.input], outputs=self.h_end_star)
        ##################################################################################
        # Correlated BRNN
        ##################################################################################
        # Concatenate these two vectors into one
        self.h = T.concatenate([self.h_start_star, self.h_end_star], axis=0)
        # Dropout parameter
        # NOTE(review): the mask is part of the graph behind self.output, so it is also
        # applied by self.predict, and the kept units are not rescaled -- confirm intended.
        srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
        mask = srng.binomial(n=1, p=1 - configs.dropout, size=self.h.shape)
        self.h *= T.cast(mask, floatX)
        # Use concatenated vector as input to the Softmax/MLP classifier
        self.output = T.nnet.softmax(
            T.dot(self.h, self.W_softmax) + self.b_softmax)
        self.pred = T.argmax(self.output, axis=1)
        # Build cost function: mean negative log-likelihood of the true labels
        self.cost = -T.mean(
            T.log(self.output)[T.arange(self.truth.shape[0]), self.truth])
        if configs.regularization:
            self.cost += configs.lambda1 * self.L2_norm
        # Compute gradient
        self.gradtheta = T.grad(self.cost, self.theta)
        self.gradinput = T.grad(self.cost, self.input)
        # Build objective function
        # Compute the gradients to parameters
        self.compute_cost_and_gradient = theano.function(
            inputs=[self.input, self.truth],
            outputs=[self.cost, self.gradtheta])
        # Compute the gradients to inputs
        self.compute_input_gradient = theano.function(
            inputs=[self.input, self.truth],
            outputs=self.gradinput)
        # Build prediction function
        self.predict = theano.function(inputs=[self.input], outputs=self.pred)
        if verbose:
            pprint('*' * 50)
            pprint('Finished constructing Bidirectional Recurrent Neural Network (BRNN)')
            pprint('Size of input dimension: %d' % configs.num_input)
            pprint('Size of hidden/recurrent dimension: %d' % configs.num_hidden)
            pprint('Size of output dimension: %d' % configs.num_class)
            pprint('Is regularization applied? %s' %
                   ('yes' if configs.regularization else 'no'))
            if configs.regularization:
                pprint('Coefficient of regularization term: %f' % configs.lambda1)
            pprint('BPTT step: %d' % configs.bptt)
            pprint('Number of free parameters in BRNN: %d' % self.num_params)
            pprint('*' * 50)

    # This method is used to implement the batch updating algorithm
    def update_params(self, gradtheta, learn_rate):
        '''
        Apply one plain gradient-descent step to theta.
        @gradtheta: Gradient with respect to theta, as a single long vector.
        @learn_rate: Step size.
        '''
        theta = self.theta.get_value(borrow=True)
        self.theta.set_value(theta - learn_rate * gradtheta, borrow=True)

    @staticmethod
    def save(fname, model):
        '''
        Pickle model into the file fname.
        @fname: String. Filename to save the model.
        @model: Object to pickle.
        '''
        with open(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        '''
        Unpickle and return the object stored in fname.
        @fname: String. Filename to load the model.
        '''
        # SECURITY: unpickling can execute arbitrary code; only load trusted files.
        with open(fname, 'rb') as fin:
            return cPickle.load(fin)
class SingleLayerModel(EntityScorer):
    """
    Single-layer scorer for (entity, relation, entity) triples.
    Each relation r owns two projections W_1[r], W_2[r] and an output vector u[r];
    the score is u[r] . act(W_1[r] e1 + W_2[r] e2).
    """

    def __init__(self, entity_dim, relation_num, hidden=50, activation='tanh',
                 initializer=default_initializer, prefix='', verbose=True):
        """
        :param entity_dim: dimension of the entity vectors
        :param relation_num: number of relations (first axis of every parameter)
        :param hidden: size of the hidden layer
        :param activation: name handed to Activation
        :param initializer: initializer handed to shared_rand_matrix
        :param prefix: string prepended to every parameter name
        :param verbose: log a summary of the architecture when True
        """
        super(SingleLayerModel, self).__init__()
        self.hidden = hidden
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        projection_shape = (relation_num, self.hidden, self.entity_dim)
        output_shape = (relation_num, self.hidden, )
        # (relation_num, hidden, entity_dim)
        self.W_1 = shared_rand_matrix(projection_shape, prefix + 'SingleLayer_W1', initializer)
        # (relation_num, hidden, entity_dim)
        self.W_2 = shared_rand_matrix(projection_shape, prefix + 'SingleLayer_W2', initializer)
        # (relation_num, hidden, )
        self.u = shared_ones_matrix(output_shape, prefix + 'SingleLayer_u')
        self.act = Activation(activation)
        self.params = [self.W_1, self.W_2, self.u]
        self.norm_params = [self.W_1, self.W_2, self.u]
        # Regularization terms, one scalar per parameter, added in a fixed order
        abs_w1 = T.sum(T.abs_(self.W_1))
        abs_w2 = T.sum(T.abs_(self.W_2))
        abs_u = T.sum(T.abs_(self.u))
        self.l1_norm = abs_w1 + abs_w2 + abs_u
        sq_w1 = T.sum(self.W_1 ** 2)
        sq_w2 = T.sum(self.W_2 ** 2)
        sq_u = T.sum(self.u ** 2)
        self.l2_norm = sq_w1 + sq_w2 + sq_u
        if not verbose:
            return
        summary = [
            'Architecture of Single Layer Model built finished, summarized as below:',
            'Entity Dimension: %d' % self.entity_dim,
            'Hidden Dimension: %d' % self.hidden,
            'Relation Number: %d' % self.relation_num,
            'Initializer: %s' % initializer,
            'Activation: %s' % activation,
        ]
        for message in summary:
            logger.debug(message)

    def score(self, e1, e2, r_index):
        """
        Score a single triple.
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: scalar score
        """
        # (hidden, entity_dim) dot (entity_dim, ) -> (hidden, )
        from_e1 = T.dot(self.W_1[r_index], e1)
        from_e2 = T.dot(self.W_2[r_index], e2)
        # (hidden, ) -> (hidden, )
        activated = self.act.activate(from_e1 + from_e2)
        # (hidden, ) dot (hidden, ) -> scalar
        return T.dot(self.u[r_index], activated)

    def score_batch(self, e1, e2, r_index):
        """
        Score a batch of triples, each with its own relation.
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: (batch, )
        :return: (batch, )
        """
        # (batch, hidden, entity_dim) dot (batch, entity_dim) -> (batch, hidden)
        from_e1 = T.batched_dot(self.W_1[r_index], e1)
        from_e2 = T.batched_dot(self.W_2[r_index], e2)
        # (batch, hidden) -> (batch, hidden)
        activated = self.act.activate(from_e1 + from_e2)
        # (batch, hidden) * (batch, hidden), summed over hidden -> (batch, )
        weighted = activated * self.u[r_index]
        return T.sum(weighted, axis=1)

    def score_one_relation(self, e1, e2, r_index):
        """
        Score a batch of entity pairs under one shared relation.
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: scalar
        :return: (batch, )
        """
        w1_t = self.W_1[r_index].T
        w2_t = self.W_2[r_index].T
        # (batch, entity_dim) dot (entity_dim, hidden) -> (batch, hidden)
        pre_activation = T.dot(e1, w1_t) + T.dot(e2, w2_t)
        # (batch, hidden) -> (batch, hidden)
        activated = self.act.activate(pre_activation)
        # (batch, hidden) dot (hidden, ) -> (batch, )
        return T.dot(activated, self.u[r_index])
class RNN(object):
    '''
    Basic component for Recurrent Neural Network
    '''

    def __init__(self, configs=None, verbose=True):
        '''
        Basic RNN is an unsupervised component, where the input is a sequence and the
        output is a vector with fixed length
        @configs: Configuration object. Required despite the None default: the
                  attributes activation, num_input, num_hidden and bptt are read.
        @verbose: Boolean. Print a progress message when True.
        '''
        if verbose:
            pprint('Build Recurrent Neural Network...')
        self.input = T.matrix(name='input', dtype=floatX)
        self.learn_rate = T.scalar(name='learn rate')
        # Configure activation function
        self.act = Activation(configs.activation)
        fan_in = configs.num_input
        fan_out = configs.num_hidden
        # Initialize all the variables in RNN, including:
        # 1, Feed-forward matrix W
        # 2, Recurrent matrix U
        # 3, Hidden bias b and initial state h0
        w_bound = np.sqrt(6.0 / (fan_in + fan_out))
        self.W = theano.shared(value=np.asarray(np.random.uniform(low=-w_bound,
                                                                  high=w_bound,
                                                                  size=(fan_in, fan_out)),
                                                dtype=floatX),
                               name='W', borrow=True)
        u_bound = np.sqrt(6.0 / (fan_out + fan_out))
        self.U = theano.shared(value=np.asarray(np.random.uniform(low=-u_bound,
                                                                  high=u_bound,
                                                                  size=(fan_out, fan_out)),
                                                dtype=floatX),
                               name='U', borrow=True)
        # Bias parameter for the hidden-layer encoder of RNN
        self.b = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='b', borrow=True)
        # h[0], zero vector
        self.h0 = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='h0', borrow=True)
        # Save all the parameters (h0 is listed as a parameter too)
        self.params = [self.W, self.U, self.b, self.h0]

        # recurrent function used to compress a sequence of input vectors
        # the first dimension should correspond to time
        def step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W) +
                                    T.dot(h_tm1, self.U) + self.b)
            return h_t

        # hs is the hidden representation over a time sequence
        self.hs, _ = theano.scan(fn=step,
                                 sequences=self.input,
                                 outputs_info=[self.h0],
                                 truncate_gradient=configs.bptt)
        # The last hidden state is the fixed-length encoding of the sequence
        self.h = self.hs[-1]
        # L1, L2 regularization.
        # W is (fan_in, fan_out) and U is (fan_out, fan_out), so each matrix is
        # reduced to a scalar first instead of being added elementwise.
        self.L1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.L2_norm = T.sum(self.W ** 2) + T.sum(self.U ** 2)
        # Compress function
        self.compress = theano.function(inputs=[self.input], outputs=self.h)

    @staticmethod
    def save(fname, model):
        '''
        Save current RNN model into fname
        @fname: String. Filename to save the model.
        @model: RNN. An instance of RNN class.
        '''
        with open(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        '''
        Load an RNN model from fname
        @fname: String. Filename to load the model.
        '''
        # SECURITY: unpickling can execute arbitrary code; only load trusted files.
        with open(fname, 'rb') as fin:
            return cPickle.load(fin)
class NNWordBasedAttention(WordBasedAttention):
    """
    Neural Machine Translation By Jointly Learning To Align and Translate
    Dzmitry Bahdanau, KyungHyun Cho, and Yoshua Bengio
    In Proceedings of ICLR 2015
    http://arxiv.org/abs/1409.0473v3

    Additive attention: score = v . act(word W + sequence U).
    """

    def __init__(self, word_dim, seq_dim, hidden_dim, activation='tanh',
                 initializer=default_initializer):
        """
        :param word_dim: dimension of the query word vector
        :param seq_dim: dimension of each sequence element
        :param hidden_dim: size of the attention hidden layer
        :param activation: name handed to Activation
        :param initializer: initializer used for W, U, v and forwarded to the base class
        """
        # Forward the caller's initializer rather than always using the default one
        super(NNWordBasedAttention, self).__init__(word_dim=word_dim,
                                                   seq_dim=seq_dim,
                                                   initializer=initializer)
        self.hidden_dim = hidden_dim
        # (word_dim, hidden_dim)
        self.W = shared_rand_matrix((self.word_dim, self.hidden_dim), 'Attention_W', initializer)
        # (seq_dim, hidden_dim)
        self.U = shared_rand_matrix((self.seq_dim, self.hidden_dim), 'Attention_U', initializer)
        # (hidden_dim, )
        self.v = shared_rand_matrix((self.hidden_dim, ), 'Attention_v', initializer)
        self.act = Activation(activation)
        # U and v take part in the score as much as W does, so all three are
        # exposed as parameters (previously only W was listed).
        self.params = [self.W, self.U, self.v]
        self.norm_params = [self.W, self.U, self.v]

    def score(self, word, sequence, ):
        """
        :param word: (word_dim, )
        :param sequence: (length, seq_dim)
        :return: score: (length, )
        """
        # (word_dim, ) dot (word_dim, hidden_dim) -> (hidden_dim, )
        hidden1 = T.dot(word, self.W)
        # (length, seq_dim) dot (seq_dim, hidden_dim) -> (length, hidden_dim)
        hidden2 = T.dot(sequence, self.U)
        # (hidden_dim, ) + (length, hidden_dim) -> (length, hidden_dim)
        hidden = hidden1[None, :] + hidden2
        # (length, hidden_dim) -> (length, hidden_dim)
        act_hidden = self.act.activate(hidden)
        # (length, hidden_dim) dot (hidden_dim, ) -> (length, )
        score = T.dot(act_hidden, self.v)
        return score

    def score_batch(self, word, sequence, ):
        """
        :param word: (batch, word_dim)
        :param sequence: (batch, length, seq_dim)
        :return: score: (batch, length, )
        """
        # (batch, word_dim) dot (word_dim, hidden_dim) -> (batch, hidden_dim)
        hidden1 = T.dot(word, self.W)
        # (batch, length, seq_dim) dot (seq_dim, hidden_dim) -> (batch, length, hidden_dim)
        hidden2 = T.dot(sequence, self.U)
        # (batch, hidden_dim) + (batch, length, hidden_dim) -> (batch, length, hidden_dim)
        hidden = hidden1[:, None, :] + hidden2
        # (batch, length, hidden_dim) -> (batch, length, hidden_dim)
        act_hidden = self.act.activate(hidden)
        # (batch, length, hidden_dim) dot (hidden_dim, ) -> (batch, length, )
        score = T.dot(act_hidden, self.v)
        return score
class BRNNEncoder(object):
    '''
    Bidirectional RNN for sequence encoding.
    '''

    def __init__(self, config, verbose=True):
        '''
        @config: Configuration object. The attributes read here are activation,
                 num_input, num_hidden, random_seed and bptt.
        @verbose: Boolean. Log a summary of the architecture when True.
        '''
        if verbose:
            logger.debug('Building Bidirectional RNN Encoder...')
        self.input = T.matrix(name='BRNNEncoder_input')
        # Configure Activation function
        self.act = Activation(config.activation)
        # Build Bidirectional RNN
        num_input, num_hidden = config.num_input, config.num_hidden
        self.num_params = 2 * (num_input * num_hidden +
                               num_hidden * num_hidden + num_hidden)
        # Initialize model parameters (this seeds numpy's global random generator)
        np.random.seed(config.random_seed)
        # 1, Feed-forward matrix for forward direction: W_forward
        W_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden))
        W_forward_val = W_forward_val.astype(floatX)
        self.W_forward = theano.shared(value=W_forward_val, name='W_forward', borrow=True)
        # 1, Feed-forward matrix for backward direction: W_backward
        W_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden))
        W_backward_val = W_backward_val.astype(floatX)
        self.W_backward = theano.shared(value=W_backward_val, name='W_backward', borrow=True)
        # 2, Recurrent matrix for forward direction: U_forward
        # (left singular vectors of a random matrix)
        U_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden))
        U_forward_val = U_forward_val.astype(floatX)
        U_forward_val, _, _ = np.linalg.svd(U_forward_val)
        self.U_forward = theano.shared(value=U_forward_val, name='U_forward', borrow=True)
        # 2, Recurrent matrix for backward direction: U_backward
        U_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden))
        U_backward_val = U_backward_val.astype(floatX)
        U_backward_val, _, _ = np.linalg.svd(U_backward_val)
        self.U_backward = theano.shared(value=U_backward_val, name='U_backward', borrow=True)
        # 3, Bias parameter for the hidden-layer forward direction RNN
        b_forward_val = np.zeros(num_hidden, dtype=floatX)
        self.b_forward = theano.shared(value=b_forward_val, name='b_forward', borrow=True)
        # 3, Bias parameter for the hidden-layer backward direction RNN
        b_backward_val = np.zeros(num_hidden, dtype=floatX)
        self.b_backward = theano.shared(value=b_backward_val, name='b_backward', borrow=True)
        # h[0], zero vectors, treated as constants (not listed in params)
        self.h0_forward = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                        name='h0_forward', borrow=True)
        self.h0_backward = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                         name='h0_backward', borrow=True)
        # Stack all the parameters
        self.params = [
            self.W_forward, self.W_backward, self.U_forward, self.U_backward,
            self.b_forward, self.b_backward
        ]
        # Compute the forward and backward representation over time
        self.h_forwards, _ = theano.scan(fn=self._forward_step,
                                         sequences=self.input,
                                         outputs_info=[self.h0_forward],
                                         truncate_gradient=config.bptt)
        self.h_backwards, _ = theano.scan(fn=self._backward_step,
                                          sequences=self.input,
                                          outputs_info=[self.h0_backward],
                                          truncate_gradient=config.bptt,
                                          go_backwards=True)
        # Average compressing
        self.h_forward = T.mean(self.h_forwards, axis=0)
        self.h_backward = T.mean(self.h_backwards, axis=0)
        # Concatenate
        self.output = T.concatenate([self.h_forward, self.h_backward], axis=0)
        # L1, L2 regularization.
        # W_* is (num_input, num_hidden) and U_* is (num_hidden, num_hidden), so each
        # matrix is reduced to a scalar first instead of being added elementwise.
        self.L1_norm = T.sum(T.abs_(self.W_forward)) + T.sum(T.abs_(self.W_backward)) + \
            T.sum(T.abs_(self.U_forward)) + T.sum(T.abs_(self.U_backward))
        self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
            T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2)
        if verbose:
            logger.debug('Finished constructing the structure of BRNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % num_input)
            logger.debug('Size of the hidden dimension: %d' % num_hidden)
            logger.debug('Activation function: %s' % config.activation)

    def _forward_step(self, x_t, h_tm1):
        '''
        One recurrence step of the forward-direction RNN.
        '''
        h_t = self.act.activate(T.dot(x_t, self.W_forward) +
                                T.dot(h_tm1, self.U_forward) +
                                self.b_forward)
        return h_t

    def _backward_step(self, x_t, h_tm1):
        '''
        One recurrence step of the backward-direction RNN.
        '''
        h_t = self.act.activate(T.dot(x_t, self.W_backward) +
                                T.dot(h_tm1, self.U_backward) +
                                self.b_backward)
        return h_t

    def encode(self, inputM):
        '''
        @inputM: Theano symbol matrix. Compress the input matrix into output vector.
        Builds the same graph as the constructor does for self.input, except that
        no truncate_gradient is passed to scan here.
        '''
        h_forwards, _ = theano.scan(fn=self._forward_step,
                                    sequences=inputM,
                                    outputs_info=[self.h0_forward])
        h_backwards, _ = theano.scan(fn=self._backward_step,
                                     sequences=inputM,
                                     outputs_info=[self.h0_backward],
                                     go_backwards=True)
        # Averaging
        h_forward = T.mean(h_forwards, axis=0)
        h_backward = T.mean(h_backwards, axis=0)
        # Concatenate
        h = T.concatenate([h_forward, h_backward], axis=0)
        return h
class GrCNNEncoder(object):
    '''
    (Binary) Gated Recursive Convolutional Neural Network Encoder.
    '''

    def __init__(self, config=None, verbose=True):
        '''
        @config: GRCNNConfiger. Configer used to set the architecture of GRCNNEncoder.
                 Required despite the None default: the attributes scale, activation,
                 num_input, num_hidden and random_seed are read.
        @verbose: Boolean. Log a summary of the architecture when True.
        '''
        if verbose:
            logger.debug('Building Gated Recursive Convolutional Neural Network Encoder...')
        # Scale factor for initializing parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='GrCNN Encoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameters
        # Set seed of the random generator (numpy's global generator)
        np.random.seed(config.random_seed)
        # Projection matrix U: uniform in [-1, 1), multiplied by self.scale
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)
        # W^l, W^r, parameters used to construct the central hidden representation.
        # Both are the left singular vectors of a random matrix and are not rescaled.
        Wl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wl_val = Wl_val.astype(floatX)
        Wl_val, _, _ = np.linalg.svd(Wl_val)
        self.Wl = theano.shared(value=Wl_val, name='W_l', borrow=True)
        Wr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wr_val = Wr_val.astype(floatX)
        Wr_val, _, _ = np.linalg.svd(Wr_val)
        self.Wr = theano.shared(value=Wr_val, name='W_r', borrow=True)
        self.Wb = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='Wb', borrow=True)
        # G^l, G^r, parameters used to construct the three-way coefficients
        Gl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
        Gl_val = Gl_val.astype(floatX)
        self.Gl = theano.shared(value=Gl_val, name='G_l', borrow=True)
        Gr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
        Gr_val = Gr_val.astype(floatX)
        self.Gr = theano.shared(value=Gr_val, name='G_r', borrow=True)
        self.Gb = theano.shared(value=np.zeros(3, dtype=floatX), name='Gb', borrow=True)
        # Save all the parameters into one batch
        self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
        # Compute the total number of parameters; borrow=True avoids copying each
        # array just to read its shape.
        self.num_params = sum(int(np.prod(param.get_value(borrow=True).shape))
                              for param in self.params)
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        # scan calls _step_prop(level_index, previous_level, nsteps) nsteps-1 times
        self.pyramids, _ = theano.scan(fn=self._step_prop,
                                       sequences=T.arange(self.nsteps-1),
                                       non_sequences=self.nsteps,
                                       outputs_info=[self.hidden0],
                                       n_steps=self.nsteps-1)
        # First row of the last level, as a row matrix
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)
        if verbose:
            logger.debug('Finished constructing the structure of grCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        Shape-preserving level propagation, usable as the step function of theano.scan.
        @iter: Index of the current level, as supplied by scan.
        @current_level: Input matrix at current level. The first dimension corresponds to
        the timestamp while the second dimension corresponds to the dimension of hidden
        representation
        @nsteps: Length of the time sequence.
        Returns current_level with its first nsteps-iter-1 rows replaced by the next level.
        '''
        # Build shifted matrices over the rows that are still active at this level.
        # The full matrix is returned below because theano.scan needs the input and
        # output of a step to keep the same shape.
        left_current_level = current_level[:nsteps-iter-1]
        right_current_level = current_level[1:nsteps-iter]
        # Central hidden representation of every adjacent (left, right) pair
        central_current_level = self.act.activate(T.dot(left_current_level, self.Wl) +
                                                  T.dot(right_current_level, self.Wr) +
                                                  self.Wb)
        # Three-way gating coefficients for every pair, normalised by softmax
        current_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) +
                                       T.dot(right_current_level, self.Gr) +
                                       self.Gb)
        left_gate, central_gate, right_gate = \
            current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Build next level of hidden representation using soft combination
        next_level = left_gate * left_current_level + \
            right_gate * right_current_level + \
            central_gate * central_current_level
        return T.set_subtensor(current_level[:nsteps-iter-1], next_level)

    def _step_prop_reduce(self, current_level):
        '''
        @current_level: Input matrix at current level. The first dimension corresponds to
        the timestamp while the second dimension corresponds to the dimension of hidden
        representation

        Reduced version of level propagation, much more memory and time efficient
        implementation, but cannot be used inside theano.scan because theano.scan requires
        that the input and output through timestamps should have the same shape.
        Returns a matrix with one row fewer than current_level.
        '''
        # Adjacent (left, right) pairs over the whole level
        right_current_level = current_level[1:]
        left_current_level = current_level[:-1]
        # Central hidden representation of every pair
        central_current_level = self.act.activate(T.dot(left_current_level, self.Wl) +
                                                  T.dot(right_current_level, self.Wr) +
                                                  self.Wb)
        # Three-way gating coefficients for every pair, normalised by softmax
        current_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) +
                                       T.dot(right_current_level, self.Gr) +
                                       self.Gb)
        left_gate, central_gate, right_gate = \
            current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Build next level of hidden representation using soft combination
        next_level = left_gate * left_current_level + \
            right_gate * right_current_level + \
            central_gate * central_current_level
        return next_level

    def encode(self, inputM):
        '''
        @inputM: Theano symbol matrix. Compress the input matrix into output vector.
        Builds the same pyramid as the constructor does for self.input.
        '''
        hidden = T.dot(inputM, self.U)
        # Length of the time sequence
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop,
                                  sequences=T.arange(nsteps-1),
                                  non_sequences=nsteps,
                                  outputs_info=[hidden],
                                  n_steps=nsteps-1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output

    def L2_loss(self):
        '''
        Return the sum of squared entries of the weight matrices U, Wl, Wr, Gl and Gr
        (the bias vectors Wb and Gb are not included).
        '''
        return T.sum(self.U ** 2) + T.sum(self.Wl ** 2) + T.sum(self.Wr ** 2) + \
            T.sum(self.Gl ** 2) + T.sum(self.Gr ** 2)
class ExtGrCNNEncoder(object):
    '''
    An extension of the canonical GrCNN, with more than 1 gate at each local binary window.
    '''

    def __init__(self, config, verbose=True):
        '''
        @config: GrCNNConfiger. Configer used to set the architecture of ExtGrCNNEncoder.
                 The attributes read here are scale, activation, num_input, num_hidden,
                 random_seed and num_gates.
        @verbose: Boolean. Log a summary of the architecture when True.
        '''
        if verbose:
            logger.debug(
                'Building Extended Gated Recursive Convolutional Neural Network Encoder...'
            )
        # Scale factor for initializing model parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='ExtGrCNNEncoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameter (this seeds numpy's global random generator)
        np.random.seed(config.random_seed)
        # Projection matrix U: uniform in [-1, 1), multiplied by self.scale
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)
        # 3rd-tensor to implement the multi-gate GrCNN Encoders, where the first dimension
        # corresponds to the number of gates. Each slice holds the left singular vectors
        # of a random matrix.
        Wl_vals = [
            np.random.uniform(low=-1.0, high=1.0,
                              size=(fan_out, fan_out)).astype(floatX)
            for _ in range(config.num_gates)
        ]
        Wl_vals = [np.linalg.svd(Wl_val)[0] for Wl_val in Wl_vals]
        Wl_vals = np.asarray(Wl_vals)
        self.Wl = theano.shared(value=Wl_vals, name='W_l', borrow=True)
        Wr_vals = [
            np.random.uniform(low=-1.0, high=1.0,
                              size=(fan_out, fan_out)).astype(floatX)
            for _ in range(config.num_gates)
        ]
        Wr_vals = [np.linalg.svd(Wr_val)[0] for Wr_val in Wr_vals]
        Wr_vals = np.asarray(Wr_vals)
        self.Wr = theano.shared(value=Wr_vals, name='W_r', borrow=True)
        self.Wb = theano.shared(value=np.zeros((config.num_gates, fan_out), dtype=floatX),
                                name='W_b', borrow=True)
        # Multi-gate choosing functions: num_gates central candidates plus the
        # left and right children
        Gl_vals = np.random.uniform(low=-1.0, high=1.0,
                                    size=(fan_out, config.num_gates + 2)).astype(floatX)
        self.Gl = theano.shared(value=Gl_vals, name='G_l', borrow=True)
        Gr_vals = np.random.uniform(low=-1.0, high=1.0,
                                    size=(fan_out, config.num_gates + 2)).astype(floatX)
        self.Gr = theano.shared(value=Gr_vals, name='G_r', borrow=True)
        self.Gb = theano.shared(value=np.zeros(config.num_gates + 2, dtype=floatX),
                                name='G_b', borrow=True)
        # Stack all the model parameters
        self.params = [
            self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb
        ]
        # Sizes of U, Wl + Wr, Wb, Gl + Gr and Gb, in that order
        self.num_params = fan_in * fan_out + 2 * config.num_gates * fan_out * fan_out + \
            config.num_gates * fan_out + \
            2 * (config.num_gates + 2) * fan_out + config.num_gates + 2
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        # Building ExtGrCNNEncoder pyramids;
        # scan calls _step_prop(level_index, previous_level, nsteps)
        self.pyramids, _ = theano.scan(fn=self._step_prop,
                                       sequences=T.arange(self.nsteps - 1),
                                       non_sequences=self.nsteps,
                                       outputs_info=[self.hidden0],
                                       n_steps=self.nsteps - 1)
        # First row of the last level, as a row matrix
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)
        if verbose:
            logger.debug('Finished constructing the structure of ExtGrCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Number of gating functions: %d' % config.num_gates)
            logger.debug('Number of parameters in ExtGrCNN: %d' % self.num_params)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        Shape-preserving level propagation, usable as the step function of theano.scan.
        @iter: Index of the current level, as supplied by scan.
        @current_level: Input matrix at current level. The first dimension corresponds to
        the time dimension while the second dimension corresponds to the dimension of
        hidden representation
        @nsteps: Length of the time sequence.
        Returns current_level with its first nsteps-iter-1 rows replaced by the next level.
        '''
        # Build shifted matrices over the rows that are still active at this level.
        # The full matrix is returned below because theano.scan needs the input and
        # output of a step to keep the same shape.
        left_current_level = current_level[:nsteps - iter - 1]
        right_current_level = current_level[1:nsteps - iter]
        # Compute the temporary central multi-representation, of size TxKxd, where T is
        # the number of active rows, K is the number of gates and d is the dimension of
        # hidden representation
        multi_centrals = self.act.activate(
            T.dot(left_current_level, self.Wl) +
            T.dot(right_current_level, self.Wr) + self.Wb)
        # Compute the gating function, of size Tx(K+2)
        multi_gates = T.nnet.softmax(
            T.dot(left_current_level, self.Gl) +
            T.dot(right_current_level, self.Gr) + self.Gb)
        # Softmax-Gating combination: column 0 weights the left child, columns 1..K
        # weight the K central candidates and the last column weights the right child
        multi_gates = multi_gates.dimshuffle(0, 1, 'x')
        next_level = multi_gates[:, 1:-1, :] * multi_centrals
        next_level = T.sum(next_level, axis=1)
        next_level += multi_gates[:, 0] * left_current_level + \
            multi_gates[:, -1] * right_current_level
        return T.set_subtensor(current_level[:nsteps - iter - 1], next_level)

    def encode(self, inputM):
        '''
        @inputM: Theano symbolic matrix. Compress the input matrix into output vector.
        The first dimension of inputM should correspond to the time dimension.
        '''
        hidden = T.dot(inputM, self.U)
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop,
                                  sequences=T.arange(nsteps - 1),
                                  non_sequences=nsteps,
                                  outputs_info=[hidden],
                                  n_steps=nsteps - 1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output