def __init__(self, filters, kernel_size, stride=1, padding='same', activation=None):
    """
    Params:
        filters: number of filters
        kernel_size: shape of the kernel
        stride: the stride
        padding: 'valid' or 'same'
        activation: activation function
    """
    self.filters = filters
    num_weights = kernel_size[0] * kernel_size[1]
    self.kernel_size = kernel_size
    self.weights = None
    self.bias = None
    self.padding = (kernel_size[0] - 1) // 2 if padding == 'same' else 0
    self.stride = stride
    self.output_units = []
    self.activation = Activation(activation)

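# A minimal sketch (not part of the class above) of why 'same' padding is
# (kernel_size - 1) // 2: with stride 1 and an odd kernel, that padding
# preserves the spatial size under the standard convolution arithmetic.
def conv_output_length(n, k, stride=1, padding='same'):
    # out = (n + 2p - k) // s + 1
    p = (k - 1) // 2 if padding == 'same' else 0
    return (n + 2 * p - k) // stride + 1

assert conv_output_length(28, 3) == 28                   # 'same' keeps 28
assert conv_output_length(28, 3, padding='valid') == 26  # 'valid' shrinks
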
def __init__(self, entity_dim, relation_num, activation='iden',
             initializer=default_initializer, prefix='', verbose=True):
    super(TransEModel, self).__init__()
    self.entity_dim = entity_dim
    self.relation_num = relation_num
    # (relation_num, entity_dim)
    self.W = shared_rand_matrix((relation_num, self.entity_dim),
                                prefix + 'TransE_R', initializer)
    self.act = Activation(activation)
    self.params = [self.W]
    self.norm_params = [self.W]
    self.l1_norm = T.sum(T.abs_(self.W))
    self.l2_norm = T.sum(self.W ** 2)
    if verbose:
        logger.debug('Architecture of TransE Model built finished, summarized as below:')
        logger.debug('Entity Dimension: %d' % self.entity_dim)
        logger.debug('Relation Number: %d' % self.relation_num)
        logger.debug('Initializer: %s' % initializer)
        logger.debug('Activation: %s' % activation)

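# A NumPy sketch of the TransE scoring idea the relation embedding above
# supports (Bordes et al., 2013): a triple (head, relation, tail) is
# plausible when head + relation is close to tail. Names are illustrative.
import numpy as np

def transe_score(head, relation, tail, norm=1):
    # Lower is better: distance between the translated head and the tail.
    return np.linalg.norm(head + relation - tail, ord=norm)

h = np.array([0.1, 0.2]); r = np.array([0.3, -0.1]); t = np.array([0.4, 0.1])
print(transe_score(h, r, t))  # 0.0 for a perfectly consistent triple
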
def feedforward(self, x):
    self.layer_1_output = Activation.sigmoid(np.dot(x, self.weights_1))
    self.layer_2_output = Activation.sigmoid(np.dot(self.layer_1_output, self.weights_2))
    self.output = self.layer_2_output
    return self.output

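# A self-contained NumPy sketch of the same two-layer forward pass, assuming
# sigmoid activations and no bias terms, as in the method above.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 3))          # batch of 4 inputs, 3 features each
weights_1 = rng.normal(size=(3, 5))  # input -> hidden
weights_2 = rng.normal(size=(5, 2))  # hidden -> output

hidden = sigmoid(x @ weights_1)
output = sigmoid(hidden @ weights_2)
print(output.shape)  # (4, 2)
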
def __init__(self, in_dim, activation, hidden_dim=None, transform_gate="sigmoid",
             prefix="", initializer=default_initializer, dropout=0, verbose=True):
    # By construction the dimensions of in_dim and out_dim have to match,
    # and hence W_T and W_H are square matrices.
    if hidden_dim is not None:
        assert in_dim == hidden_dim
    if verbose:
        logger.debug('Building {}...'.format(self.__class__.__name__))
    super(HighwayLayer, self).__init__(in_dim, in_dim, activation, prefix,
                                       initializer, dropout, verbose)
    self.transform_gate = Activation(transform_gate)
    self.W_H, self.W_H.name = self.W, prefix + "W_H"
    self.b_H, self.b_H.name = self.b, prefix + "b_H"
    self.W_T = shared_rand_matrix((self.hidden_dim, self.in_dim), prefix + 'W_T', initializer)
    self.b_T = shared_zero_matrix((self.hidden_dim,), prefix + 'b_T')
    self.params = [self.W_H, self.W_T, self.b_H, self.b_T]
    self.norm_params = [self.W_H, self.W_T]
    self.l1_norm = T.sum([T.sum(T.abs_(param)) for param in self.norm_params])
    self.l2_norm = T.sum([T.sum(param ** 2) for param in self.norm_params])
    if verbose:
        logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Transform Gate: %s' % self.transform_gate.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

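# A minimal NumPy sketch of the highway combination this layer parameterizes
# (Srivastava et al., 2015): y = t * H(x) + (1 - t) * x, where t is the
# transform gate. Parameter names mirror the layer above but are illustrative.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
dim = 4
x = rng.normal(size=dim)
W_H, b_H = rng.normal(size=(dim, dim)), np.zeros(dim)
W_T, b_T = rng.normal(size=(dim, dim)), np.zeros(dim)

H = np.tanh(W_H @ x + b_H)     # candidate transform
t = sigmoid(W_T @ x + b_T)     # transform gate in (0, 1)
y = t * H + (1.0 - t) * x      # gated mix of transform and carry
print(y.shape)  # (4,)
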
def __init__(self, configs=None, verbose=True):
    '''
    Basic RNN is an unsupervised component, where the input is a sequence
    and the output is a vector with fixed length.
    '''
    if verbose:
        pprint('Build Recurrent Neural Network...')
    self.input = T.matrix(name='input', dtype=floatX)
    self.learn_rate = T.scalar(name='learn rate')
    # Configure activation function
    self.act = Activation(configs.activation)
    fan_in = configs.num_input
    fan_out = configs.num_hidden
    # Initialize all the variables in the RNN:
    # 1. Feed-forward matrix W and bias b
    # 2. Recurrent matrix U and initial hidden state h0
    self.W = theano.shared(value=np.asarray(
        np.random.uniform(low=-np.sqrt(6.0 / (fan_in + fan_out)),
                          high=np.sqrt(6.0 / (fan_in + fan_out)),
                          size=(fan_in, fan_out)), dtype=floatX),
        name='W', borrow=True)
    self.U = theano.shared(value=np.asarray(
        np.random.uniform(low=-np.sqrt(6.0 / (fan_out + fan_out)),
                          high=np.sqrt(6.0 / (fan_out + fan_out)),
                          size=(fan_out, fan_out)), dtype=floatX),
        name='U', borrow=True)
    # Bias parameter for the hidden-layer encoder of the RNN
    self.b = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='b', borrow=True)
    # h[0], zero vector
    self.h0 = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='h0', borrow=True)
    # Save all the parameters
    self.params = [self.W, self.U, self.b, self.h0]

    # Recurrent function used to compress a sequence of input vectors;
    # the first dimension should correspond to time.
    def step(x_t, h_tm1):
        h_t = self.act.activate(T.dot(x_t, self.W) + T.dot(h_tm1, self.U) + self.b)
        return h_t

    # h is the hidden representation over a time sequence
    self.hs, _ = theano.scan(fn=step, sequences=self.input,
                             outputs_info=[self.h0],
                             truncate_gradient=configs.bptt)
    self.h = self.hs[-1]
    # L1, L2 regularization (summed per matrix; W and U have different shapes)
    self.L1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
    self.L2_norm = T.sum(self.W ** 2) + T.sum(self.U ** 2)
    # Compress function
    self.compress = theano.function(inputs=[self.input], outputs=self.h)

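# A NumPy sketch of what the scan above computes: fold a sequence into the
# final hidden state via h_t = tanh(x_t @ W + h_{t-1} @ U + b).
import numpy as np

rng = np.random.default_rng(0)
fan_in, fan_out, steps = 3, 5, 7
W = rng.normal(size=(fan_in, fan_out)) * 0.1
U = rng.normal(size=(fan_out, fan_out)) * 0.1
b = np.zeros(fan_out)
xs = rng.normal(size=(steps, fan_in))  # time on the first axis

h = np.zeros(fan_out)  # h[0]
for x_t in xs:
    h = np.tanh(x_t @ W + h @ U + b)
print(h.shape)  # (5,) -- the fixed-length "compressed" representation
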
def __init__(self, in_dim, hidden_dim, initializer=default_initializer,
             normalize=True, dropout=0, reconstructe=True,
             activation="tanh", verbose=True):
    """
    :param in_dim: input dimension
    :param hidden_dim: hidden dimension
    :param initializer: random initializer
    :param normalize: whether to normalize
    :param dropout: dropout rate
    :param activation: activation function
    :param verbose: whether to emit debug logging
    """
    self.in_dim = in_dim
    self.out_dim = hidden_dim
    self.hidden_dim = hidden_dim
    assert self.in_dim == self.hidden_dim
    self.initializer = initializer
    self.normalize = normalize
    self.dropout = dropout
    self.verbose = verbose
    self.act = Activation(activation)
    # Composition function weight
    # (dim, 2 * dim)
    self.W = shared_rand_matrix((self.hidden_dim, 2 * self.in_dim), 'W', initializer=initializer)
    # (dim, )
    self.b = shared_zero_matrix((self.hidden_dim, ), 'b')
    # Reconstruction function weight
    # (2 * dim, dim)
    self.Wr = shared_rand_matrix((2 * self.in_dim, self.hidden_dim), 'Wr', initializer=initializer)
    # (2 * dim, )
    self.br = shared_zero_matrix((self.in_dim * 2, ), 'br')
    self.params = [self.W, self.b, self.Wr, self.br]
    self.norm_params = [self.W, self.Wr]
    self.l1_norm = sum([T.sum(T.abs_(param)) for param in self.norm_params])
    self.l2_norm = sum([T.sum(param ** 2) for param in self.norm_params])
    if verbose:
        logger.debug('Architecture of RAE built finished, summarized as below:')
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Normalize: %s' % self.normalize)
        logger.debug('Activation: %s' % self.act)
        logger.debug('Dropout Rate: %s' % self.dropout)

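# A minimal NumPy sketch of one recursive-autoencoder step with the shapes
# above: compose two children into a parent, then reconstruct the children.
# The normalization step is optional, matching the `normalize` flag.
import numpy as np

rng = np.random.default_rng(0)
dim = 4
W, b = rng.normal(size=(dim, 2 * dim)) * 0.1, np.zeros(dim)
Wr, br = rng.normal(size=(2 * dim, dim)) * 0.1, np.zeros(2 * dim)

left, right = rng.normal(size=dim), rng.normal(size=dim)
children = np.concatenate([left, right])        # (2 * dim,)
parent = np.tanh(W @ children + b)              # (dim,) composition
parent /= np.linalg.norm(parent)                # optional normalization
recon = np.tanh(Wr @ parent + br)               # (2 * dim,) reconstruction
recon_error = np.sum((recon - children) ** 2)   # training signal
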
def __init__(self, in_dim, hidden_dim, kernel_size=3, padding='same', pooling='max',
             dilation_rate=1.0, activation='relu', prefix="",
             initializer=GlorotUniformInitializer(), dropout=0.0, verbose=True):
    """
    Init function for ConvolutionLayer
    :param in_dim:
    :param hidden_dim:
    :param kernel_size:
    :param padding: 'same', 'valid'
    :param pooling: 'max', 'mean', 'min'
    :param dilation_rate:
    :param activation:
    :param prefix:
    :param initializer:
    :param dropout:
    :param verbose:
    """
    if verbose:
        logger.debug('Building {}...'.format(self.__class__.__name__))
    self.in_dim = in_dim
    self.out_dim = hidden_dim
    self.hidden_dim = hidden_dim
    self.kernel_size = kernel_size
    self.padding = padding
    self.dilation_rate = dilation_rate
    self.pooling = pooling
    self.dropout = dropout
    self.act = Activation(activation)
    self.padding_size = int(self.dilation_rate * (self.kernel_size - 1))
    # Composition function weight
    # Kernel matrix (kernel_size, hidden, in)
    self.W = shared_rand_matrix((self.kernel_size, self.hidden_dim, self.in_dim),
                                prefix + 'W', initializer)
    # Bias term (hidden,)
    self.b = shared_zero_matrix((self.hidden_dim,), prefix + 'b')
    self.params = [self.W, self.b]
    self.norm_params = [self.W]
    # L1, L2 Norm
    self.l1_norm = T.sum(T.abs_(self.W))
    self.l2_norm = T.sum(self.W ** 2)
    if verbose:
        logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Filter Num (Hidden): %d' % self.hidden_dim)
        logger.debug('Kernel Size (Windows): %d' % self.kernel_size)
        logger.debug('Padding method : %s' % self.padding)
        logger.debug('Dilation Rate : %s' % self.dilation_rate)
        logger.debug('Padding Size : %s' % self.padding_size)
        logger.debug('Pooling method : %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

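# A simplified NumPy sketch of the 1-D convolution over a sequence that this
# layer computes, with W shaped (kernel_size, hidden, in) as above. For
# brevity it pads (k - 1) // 2 per side (a simplification of the layer's
# padding_size bookkeeping) and applies ReLU then max pooling over time.
import numpy as np

rng = np.random.default_rng(0)
length, in_dim, hidden_dim, kernel_size = 7, 3, 4, 3
x = rng.normal(size=(length, in_dim))
W = rng.normal(size=(kernel_size, hidden_dim, in_dim)) * 0.1
b = np.zeros(hidden_dim)

pad = (kernel_size - 1) // 2
x_pad = np.pad(x, ((pad, pad), (0, 0)))  # 'same'-style padding
out = np.stack([
    sum(W[k] @ x_pad[t + k] for k in range(kernel_size)) + b
    for t in range(x_pad.shape[0] - kernel_size + 1)
])                           # (length, hidden_dim)
out = np.maximum(out, 0.0)   # ReLU
pooled = out.max(axis=0)     # 'max' pooling over time -> (hidden_dim,)
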
def __init__(self, entity_dim, relation_num, activation='tanh', hidden=5,
             keep_normal=False, initializer=default_initializer, prefix='', verbose=True):
    super(NeuralTensorModel, self).__init__()
    self.entity_dim = entity_dim
    self.relation_num = relation_num
    self.hidden = hidden
    self.slice_seq = T.arange(hidden)
    self.keep_normal = keep_normal
    # (relation_num, entity_dim, entity_dim, hidden)
    self.W = shared_rand_matrix((relation_num, self.entity_dim, self.entity_dim, self.hidden),
                                prefix + 'NTN_W', initializer)
    # (relation_num, hidden)
    self.U = shared_ones_matrix((relation_num, self.hidden), name=prefix + 'NTN_U')
    if keep_normal:
        # (relation_num, 2 * entity_dim, hidden)
        self.V = shared_rand_matrix((relation_num, self.entity_dim * 2, self.hidden),
                                    prefix + 'NTN_V', initializer)
        # (relation_num, hidden)
        self.b = shared_zero_matrix((relation_num, self.hidden), name=prefix + 'NTN_B')
        self.params = [self.W, self.V, self.U, self.b]
        self.norm_params = [self.W, self.V, self.U, self.b]
    else:
        self.params = [self.W]
        self.norm_params = [self.W]
    self.act = Activation(activation)
    self.l1_norm = T.sum([T.sum(T.abs_(param)) for param in self.norm_params])
    self.l2_norm = T.sum([T.sum(param ** 2) for param in self.norm_params])
    if verbose:
        logger.debug('Architecture of Tensor Model built finished, summarized as below:')
        logger.debug('Entity Dimension: %d' % self.entity_dim)
        logger.debug('Hidden Dimension: %d' % self.hidden)
        logger.debug('Relation Number: %d' % self.relation_num)
        logger.debug('Initializer: %s' % initializer)
        logger.debug('Activation: %s' % activation)

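# A NumPy sketch of the Neural Tensor Network score for a single relation
# (Socher et al., 2013): u^T f(e1^T W e2 + V [e1; e2] + b). Names are
# illustrative; the class above stores one such parameter set per relation.
import numpy as np

rng = np.random.default_rng(0)
dim, hidden = 4, 5
e1, e2 = rng.normal(size=dim), rng.normal(size=dim)
W = rng.normal(size=(dim, dim, hidden)) * 0.1   # bilinear tensor
V = rng.normal(size=(2 * dim, hidden)) * 0.1    # standard-layer weights
b = np.zeros(hidden)
u = np.ones(hidden)

bilinear = np.einsum('i,ijk,j->k', e1, W, e2)   # (hidden,)
standard = np.concatenate([e1, e2]) @ V         # (hidden,)
score = u @ np.tanh(bilinear + standard + b)    # scalar plausibility
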
class HiddenLayer(object):
    def __init__(self, in_dim, hidden_dim, activation, prefix="",
                 initializer=default_initializer, dropout=0, verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = hidden_dim
        self.act = Activation(activation)
        self.dropout = dropout
        self.W = shared_rand_matrix((self.hidden_dim, self.in_dim), prefix + 'W', initializer)
        self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b')
        self.params = [self.W, self.b]
        self.norm_params = [self.W]
        self.l1_norm = T.sum([T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param ** 2) for param in self.norm_params])
        if verbose:
            logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
            logger.debug('Input dimension: %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Activation Func: %s' % self.act.method)
            logger.debug('Dropout Rate: %f' % self.dropout)

    def forward(self, x):
        """
        :param x: (dim, )
        """
        output = self.act.activate(T.dot(self.W, x) + self.b)
        return dropout_from_layer(output, self.dropout)

    def forward_batch(self, x):
        """
        :param x: (batch, dim)
        """
        # (batch, in) (in, hidden) + (None, hidden) -> (batch, hidden)
        output = self.act.activate(T.dot(x, self.W.T) + self.b)
        return dropout_from_layer(output, self.dropout)

def train(dataset):
    config_options = globals.config
    task_path = config_options.get("Data", dataset)
    loss = config_options.get('Train', 'loss')
    activation = config_options.get('Train', 'activation')
    if dataset == "classify":
        Xtrain = z_norm(load_mnist_X(task_path + "classf_Xtrain.txt"))
        Xtest = z_norm(load_mnist_X(task_path + "classf_Xtest.txt"))
        Xval = z_norm(load_mnist_X(task_path + "classf_XVal.txt"))
        ytrain = load_mnist_Y(task_path + "classf_ytrain.txt")
        ytest = load_mnist_Y(task_path + "classf_ytest.txt")
        yval = load_mnist_Y(task_path + "classf_yVal.txt")
    elif dataset == "regression":
        Xtrain = z_norm(load_regression_X(task_path + "regr_Xtrain.txt"))
        Xtest = z_norm(load_regression_X(task_path + "regr_Xtest.txt"))
        Xval = z_norm(load_regression_X(task_path + "regr_Xval.txt"))
        ytrain = load_regression_Y(task_path + "regr_ytrain.txt")
        ytest = load_regression_Y(task_path + "regr_ytest.txt")
        yval = load_regression_Y(task_path + "regr_yval.txt")
    else:
        logger.warning("Invalid task.")
        return
    logger.info("Load data complete.")
    # Build model
    N, input_dim = Xtrain.shape
    model = Model()
    model.add(Layer(output_dim=globals.layer_dim, input_dim=input_dim))
    model.add(Activation(activation=activation))
    model.add(Layer(output_dim=globals.output_dim))
    model.compile(loss=loss)
    history = model.fit(Xtrain, ytrain, batch_size=N,
                        iterations=globals.iterations,
                        validation_data=(Xval, yval))
    # Save result
    result_dir = config_options.get('Result', 'result-dir')
    file_name = "_".join([dataset, activation, str(globals.alpha), str(globals.lam),
                          str(globals.layer_dim), str(globals.iterations)]) + ".txt"
    file_path = result_dir + file_name
    writeFile(file_path, "")
    for datum in history:
        datum = [str(x) for x in datum]
        line = "\t".join(datum) + "\n"
        writeFile(file_path, line, 'a')
    print model.loss.mse(Xval, yval)
    print model.loss.mse(Xtest, ytest)

def __init__(self, in_dim, hidden_dim, pooling, activation='tanh', prefix="",
             initializer=default_initializer, dropout=0, verbose=True):
    if verbose:
        logger.debug('Building {}...'.format(self.__class__.__name__))
    super(RecurrentEncoder, self).__init__(in_dim, hidden_dim, pooling, activation, dropout)
    self.in_dim = in_dim
    self.out_dim = hidden_dim
    self.hidden_dim = hidden_dim
    self.pooling = pooling
    self.dropout = dropout
    self.act = Activation(activation)
    # Composition function weights
    # Feed-forward matrix (hidden, in)
    self.W = shared_rand_matrix((self.hidden_dim, self.in_dim), prefix + 'W_forward', initializer)
    # Bias term (hidden,)
    self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b_forward')
    # Recurrent matrix (hidden, hidden)
    self.U = shared_rand_matrix((self.hidden_dim, self.hidden_dim), prefix + 'U_forward', initializer)
    self.params = [self.W, self.U, self.b]
    self.norm_params = [self.W, self.U]
    # L1, L2 Norm
    self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
    self.l2_norm = T.sum(self.W ** 2) + T.sum(self.U ** 2)
    if verbose:
        logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Pooling methods: %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

def __init__(self, in_dim, hidden_dim, kernel_sizes=[3, 4, 5], padding='same', pooling='max',
             dilation_rate=1.0, activation='relu', prefix="",
             initializer=GlorotUniformInitializer(), dropout=0.0, verbose=True):
    """
    Init function for the multi-kernel convolution encoder, which stacks one
    ConvolutionLayer per kernel size and concatenates their pooled outputs.
    :param in_dim:
    :param hidden_dim:
    :param kernel_sizes:
    :param padding: 'same', 'valid'
    :param pooling: 'max', 'mean', 'min'
    :param dilation_rate:
    :param activation:
    :param prefix:
    :param initializer:
    :param dropout:
    :param verbose:
    """
    if verbose:
        logger.debug('Building {}...'.format(self.__class__.__name__))
    self.conv_layers = list()
    self.in_dim = in_dim
    self.out_dim = hidden_dim * len(kernel_sizes)
    self.hidden_dim = hidden_dim
    self.kernel_sizes = kernel_sizes
    self.padding = padding
    self.dilation_rate = dilation_rate
    self.pooling = pooling
    self.dropout = dropout
    self.act = Activation(activation)
    self.params = list()
    self.norm_params = list()
    # L1, L2 norm, accumulated over the per-kernel-size sub-layers
    self.l1_norm = 0
    self.l2_norm = 0
    for filter_hs in kernel_sizes:
        self.conv_layers.append(ConvolutionLayer(in_dim=self.in_dim,
                                                 hidden_dim=hidden_dim,
                                                 kernel_size=filter_hs,
                                                 padding=self.padding,
                                                 pooling=self.pooling,
                                                 dilation_rate=self.dilation_rate,
                                                 activation=activation,
                                                 prefix=prefix + "filter%s_" % filter_hs,
                                                 initializer=initializer,
                                                 dropout=dropout,
                                                 verbose=verbose))
        self.params += self.conv_layers[-1].params
        self.norm_params += self.conv_layers[-1].norm_params
        self.l1_norm += self.conv_layers[-1].l1_norm
        self.l2_norm += self.conv_layers[-1].l2_norm
    if verbose:
        logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Filter Num (Hidden): %d' % self.hidden_dim)
        logger.debug('Kernel Size (Windows): %s' % self.kernel_sizes)
        logger.debug('Padding method : %s' % self.padding)
        logger.debug('Dilation Rate : %s' % self.dilation_rate)
        logger.debug('Pooling method : %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

def __init__(self, verbose=True):
    if verbose:
        logger.debug('Build Multilayer Perceptron Ranking model...')
    # Positive input setting
    self.inputPL = T.matrix(name='inputPL', dtype=floatX)
    self.inputPR = T.matrix(name='inputPR', dtype=floatX)
    # Negative input setting
    self.inputNL = T.matrix(name='inputNL', dtype=floatX)
    self.inputNR = T.matrix(name='inputNR', dtype=floatX)
    # Standard input setting
    self.inputL = T.matrix(name='inputL', dtype=floatX)
    self.inputR = T.matrix(name='inputR', dtype=floatX)
    # Build activation function
    self.act = Activation('tanh')
    # Connect input matrices
    self.inputP = T.concatenate([self.inputPL, self.inputPR], axis=1)
    self.inputN = T.concatenate([self.inputNL, self.inputNR], axis=1)
    self.input = T.concatenate([self.inputL, self.inputR], axis=1)
    # Build hidden layer
    self.hidden_layer = HiddenLayer(self.input, (2*edim, args.hidden), act=self.act)
    self.hidden = self.hidden_layer.output
    self.hiddenP = self.hidden_layer.encode(self.inputP)
    self.hiddenN = self.hidden_layer.encode(self.inputN)
    # Dropout parameter
    # srng = T.shared_randomstreams.RandomStreams(args.seed)
    # mask = srng.binomial(n=1, p=1-args.dropout, size=self.hidden.shape)
    # maskP = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenP.shape)
    # maskN = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenN.shape)
    # self.hidden *= T.cast(mask, floatX)
    # self.hiddenP *= T.cast(maskP, floatX)
    # self.hiddenN *= T.cast(maskN, floatX)
    # Build linear output layer
    self.score_layer = ScoreLayer(self.hidden, args.hidden)
    self.output = self.score_layer.output
    self.scoreP = self.score_layer.encode(self.hiddenP)
    self.scoreN = self.score_layer.encode(self.hiddenN)
    # Stack all the parameters
    self.params = []
    self.params += self.hidden_layer.params
    self.params += self.score_layer.params
    # Build cost function: pairwise hinge loss over positive/negative scores
    self.cost = T.mean(T.maximum(T.zeros_like(self.scoreP), 1.0 - self.scoreP + self.scoreN))
    # Construct the gradient of the cost function with respect to the model parameters
    self.gradparams = T.grad(self.cost, self.params)
    # Count the total number of parameters: the hidden layer maps 2*edim ->
    # args.hidden (weights plus bias) and the score layer maps args.hidden -> 1
    self.num_params = 2 * edim * args.hidden + args.hidden + args.hidden + 1
    # Build class methods
    self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output)
    self.compute_cost_and_gradient = theano.function(
        inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
        outputs=self.gradparams + [self.cost, self.scoreP, self.scoreN])
    self.show_scores = theano.function(
        inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
        outputs=[self.scoreP, self.scoreN])
    if verbose:
        logger.debug('Architecture of MLP Ranker built finished, summarized below: ')
        logger.debug('Input dimension: %d' % edim)
        logger.debug('Hidden dimension: %d' % args.hidden)
        logger.debug('Total number of parameters used in the model: %d' % self.num_params)

def __init__(self, input_units, output_units, activation=None):
    """
    Params:
        input_units: number of input nodes
        output_units: number of output nodes
        activation: the activation layer
    """
    # self.weights = np.random.normal(0.0, 1.0/np.sqrt(input_units), (input_units, output_units))
    # self.bias = np.random.normal(0.0, 1.0/np.sqrt(input_units), (1, output_units))
    # self.weights = np.random.uniform(-0.01, 0.01, (input_units, output_units))
    self.weights = np.linspace(-0.01, 0.01, num=input_units * output_units)
    self.weights = self.weights.reshape((input_units, output_units))
    self.bias = np.zeros((1, output_units))
    self.activation = Activation(activation)
    # Initialize other state as zero
    self.output_units = None
    self.grad_weights = 0
    self.grad_bias = 0

def __init__(self, input_size, output_size, hidden_size, n_layers, act_type):
    '''
    Multilayer Perceptron
    ----------------------
    :param input_size: dimension of input features
    :param output_size: dimension of output features
    :param hidden_size: a list containing the hidden size for each hidden layer
    :param n_layers: number of layers
    :param act_type: type of activation function for each hidden layer; can be none, sigmoid, tanh, or relu
    '''
    super(MLP, self).__init__()
    # The total layer number should be the hidden layer number + 1 (output layer)
    assert len(hidden_size) + 1 == n_layers, 'total layer number should be hidden layer number + 1'
    # Define the activation function via the Activation class in activations.py
    self.act = Activation(act_type)
    # Initialize a list to hold the layers
    layers = nn.ModuleList()
    if n_layers == 1:
        # With n_layers == 1, the MLP degenerates to a single Linear layer
        layer = Linear(input_size, output_size)
        layers.append(layer)
        layers.append(self.act)
    # TODO 4: Finish MLP with at least 2 layers
    else:
        # Step 1: initialize the input layer
        layer = Linear(input_size, hidden_size[0])
        # Step 2: append the input layer and the activation layer
        layers.append(layer)
        layers.append(self.act)
        # Step 3: construct the hidden layers and add them to layers.
        # The output size of hidden layer i is hidden_size[i], so its
        # input size is hidden_size[i - 1].
        for i in range(1, n_layers - 1):
            layer = Linear(hidden_size[i - 1], hidden_size[i])
            layers.append(layer)
            layers.append(self.act)
        # Step 4: initialize the output layer and append it; its output size
        # is output_size, and no activation layer follows it.
        layer = Linear(hidden_size[-1], output_size)
        layers.append(layer)
    # End TODO 4
    # Use nn.Sequential to assemble the neural network
    self.net = nn.Sequential(*layers)

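# A hedged usage sketch for the MLP above, assuming the class and its
# Linear / Activation helpers are torch nn.Modules in scope; mlp.net is the
# nn.Sequential assembled in __init__.
import torch

mlp = MLP(input_size=784, output_size=10, hidden_size=[256, 128],
          n_layers=3, act_type='relu')
logits = mlp.net(torch.randn(32, 784))  # (32, 10): batch of 32 predictions
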
def __init__(self, in_dim, hidden_dim, pooling, activation='tanh',
             gates=("sigmoid", "sigmoid", "sigmoid"), prefix="",
             initializer=OrthogonalInitializer(), dropout=0, verbose=True):
    if verbose:
        logger.debug('Building {}...'.format(self.__class__.__name__))
    super(LSTMEncoder, self).__init__(in_dim, hidden_dim, pooling, activation, dropout)
    self.in_gate, self.forget_gate, self.out_gate = \
        Activation(gates[0]), Activation(gates[1]), Activation(gates[2])
    # W [in, forget, output, recurrent] (4 * hidden, in)
    self.W = shared_rand_matrix((self.hidden_dim * 4, self.in_dim), prefix + 'W', initializer)
    # U [in, forget, output, recurrent] (4 * hidden, hidden)
    self.U = shared_rand_matrix((self.hidden_dim * 4, self.hidden_dim), prefix + 'U', initializer)
    # b [in, forget, output, recurrent] (4 * hidden,)
    self.b = shared_zero_matrix((self.hidden_dim * 4, ), prefix + 'b')
    self.params = [self.W, self.U, self.b]
    self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
    self.l2_norm = T.sum(self.W ** 2) + T.sum(self.U ** 2)
    if verbose:
        logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Pooling methods: %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Input Gate: %s' % self.in_gate.method)
        logger.debug('Forget Gate: %s' % self.forget_gate.method)
        logger.debug('Output Gate: %s' % self.out_gate.method)
        logger.debug('Dropout Rate: %f' % self.dropout)

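# A NumPy sketch of one LSTM step under the stacked parameter layout above,
# taking the block order to be the [in, forget, output, recurrent(candidate)]
# labeling in the comments; names here are illustrative only.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
in_dim, hid = 3, 4
W = rng.normal(size=(4 * hid, in_dim)) * 0.1
U = rng.normal(size=(4 * hid, hid)) * 0.1
b = np.zeros(4 * hid)

x, h, c = rng.normal(size=in_dim), np.zeros(hid), np.zeros(hid)
pre = W @ x + U @ h + b                                  # (4 * hid,)
i, f, o, g = (pre[k * hid:(k + 1) * hid] for k in range(4))
c = sigmoid(f) * c + sigmoid(i) * np.tanh(g)             # cell state update
h = sigmoid(o) * np.tanh(c)                              # hidden state update
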
def __init__(self, entity_dim, relation_num, hidden=50, activation='tanh',
             initializer=default_initializer, prefix='', verbose=True):
    super(SingleLayerModel, self).__init__()
    self.hidden = hidden
    self.entity_dim = entity_dim
    self.relation_num = relation_num
    # (relation_num, hidden, entity_dim)
    self.W_1 = shared_rand_matrix((relation_num, self.hidden, self.entity_dim),
                                  prefix + 'SingleLayer_W1', initializer)
    # (relation_num, hidden, entity_dim)
    self.W_2 = shared_rand_matrix((relation_num, self.hidden, self.entity_dim),
                                  prefix + 'SingleLayer_W2', initializer)
    # (relation_num, hidden)
    self.u = shared_ones_matrix((relation_num, self.hidden), prefix + 'SingleLayer_u')
    self.act = Activation(activation)
    self.params = [self.W_1, self.W_2, self.u]
    self.norm_params = [self.W_1, self.W_2, self.u]
    self.l1_norm = T.sum(T.abs_(self.W_1)) + T.sum(T.abs_(self.W_2)) + T.sum(T.abs_(self.u))
    self.l2_norm = T.sum(self.W_1 ** 2) + T.sum(self.W_2 ** 2) + T.sum(self.u ** 2)
    if verbose:
        logger.debug('Architecture of Single Layer Model built finished, summarized as below:')
        logger.debug('Entity Dimension: %d' % self.entity_dim)
        logger.debug('Hidden Dimension: %d' % self.hidden)
        logger.debug('Relation Number: %d' % self.relation_num)
        logger.debug('Initializer: %s' % initializer)
        logger.debug('Activation: %s' % activation)

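# A NumPy sketch of the Single Layer Model score (Socher et al., 2013) for
# one relation: u^T f(W1 e1 + W2 e2). Names are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
dim, hidden = 4, 50
e1, e2 = rng.normal(size=dim), rng.normal(size=dim)
W1 = rng.normal(size=(hidden, dim)) * 0.1
W2 = rng.normal(size=(hidden, dim)) * 0.1
u = np.ones(hidden)

score = u @ np.tanh(W1 @ e1 + W2 @ e2)  # scalar plausibility
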
def __init__(self, in_dim, hidden_dim, pooling, activation='tanh', dropout=0):
    self.in_dim = in_dim
    self.out_dim = hidden_dim
    self.hidden_dim = hidden_dim
    self.pooling = pooling
    self.dropout = dropout
    self.act = Activation(activation)

def __init__(self, name, n_inputs, n_outputs, activation=None, use_bias=True,
             weights=None, biases=None):
    super().__init__(name)
    self.n_inputs = n_inputs
    self.n_outputs = n_outputs
    self.use_bias = use_bias
    if activation is None:
        activation = Activation.getInitialized("tanh")
    else:
        if not Activation.isObjectRegistered(activation):
            if isinstance(activation, dict):
                activation = Activation(**activation)
            elif isinstance(activation, str):
                activation = Activation(class_name=activation)
            else:
                raise Exception("{} is not a registered activation. Use {}".format(
                    activation, Activation.registeredClasses()))
    self.activation = activation
    if weights is None:
        # Uniform between -1 and 1
        self.weights = np.random.random((n_outputs, n_inputs)) * 2 - 1
    else:
        assert isinstance(weights, np.ndarray)
        assert weights.shape == (n_outputs, n_inputs)
        self.weights = weights
    if biases is None:
        # Uniform between -1 and 1, then scaled down
        self.biases = (np.random.random((n_outputs, 1)) * 2 - 1) * 0.001
    else:
        assert isinstance(biases, np.ndarray)
        assert biases.shape == (n_outputs, 1)
        self.biases = biases
    # Mutation mask ... create only once.
    self.mutation_mask = np.zeros_like(self.weights)

def load(self, folder):
    # Deduce all import parameters from the saved files
    try:
        # Load weights & biases
        self.weight = np.load(f"{folder}/weight.npy")
        self.bias = np.load(f"{folder}/bias.npy")
        # Load dimensions and the activation function
        with open(f"{folder}/dense.json", "r") as file:
            data = json.load(file)
        self.inputDim = data["inputDim"]
        self.outputDim = data["outputDim"]
        self.activation = Activation.funcFromStr(data["activation"])
    except Exception as e:
        print(e)

def __init__(self, in_features, out_features, input_layer=False, fully_connected=True):
    self.in_features = in_features
    self.out_features = out_features
    self.fully_connected = fully_connected
    # changed from v0.0.0
    # self.weights = np.random.randn(out_features, in_features)
    self.bias = np.random.randn(out_features)
    # last part for emphasis
    # self.next_layer = None
    self.prev_layer = None
    self.input_layer = input_layer
    self.variables = 0
    self.activation = Activation()

def testAE(self):
    # Set parameters
    input = T.matrix(name='input')
    num_in, num_out = 784, 500
    act = Activation('sigmoid')
    is_denoising, is_sparse = True, False
    lambda1 = 1e-4
    mask = 0.7
    rng = RandomStreams(42)
    start_time = time.time()
    ae = AutoEncoder(input, (num_in, num_out), act, is_denoising, is_sparse,
                     lambda1, mask, rng, verbose=True)
    end_time = time.time()
    pprint('Time used to build the AutoEncoder: %f seconds.' % (end_time - start_time))
    batch_size = 1000
    num_batches = self.training_set.shape[0] / batch_size
    nepoch = 50
    learn_rate = 1
    start_time = time.time()
    for i in xrange(nepoch):
        rate = learn_rate
        for j in xrange(num_batches):
            train_set = self.training_set[j * batch_size: (j + 1) * batch_size, :]
            cost = ae.train(train_set, rate)
            pprint('epoch %d, batch %d, cost = %f' % (i, j, cost))
    end_time = time.time()
    pprint('Time used for training AutoEncoder: %f seconds.' % (end_time - start_time))
    image = PIL.Image.fromarray(imgutils.tile_raster_images(
        X=ae.encode_layer.W.get_value(borrow=True).T,
        img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1)))
    image.save('filters_corruption_%.2f.png' % mask)
    AutoEncoder.save('./autoencoder-mnist.model', ae)

class GrCNNEncoder(object):
    '''
    (Binary) Gated Recursive Convolutional Neural Network Encoder.
    '''
    def __init__(self, config=None, verbose=True):
        '''
        @config: GRCNNConfiger. Configer used to set the architecture of GRCNNEncoder.
        '''
        if verbose:
            logger.debug('Building Gated Recursive Convolutional Neural Network Encoder...')
        # Scale factor for initializing parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='GrCNN Encoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameters; set the seed of the random generator
        np.random.seed(config.random_seed)
        # Projection matrix U; initialize the recurrent matrices as orthogonal matrices
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)
        # W^l, W^r, parameters used to construct the central hidden representation
        Wl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wl_val = Wl_val.astype(floatX)
        Wl_val, _, _ = np.linalg.svd(Wl_val)
        # Wl_val *= self.scale
        self.Wl = theano.shared(value=Wl_val, name='W_l', borrow=True)
        Wr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wr_val = Wr_val.astype(floatX)
        Wr_val, _, _ = np.linalg.svd(Wr_val)
        # Wr_val *= self.scale
        self.Wr = theano.shared(value=Wr_val, name='W_r', borrow=True)
        self.Wb = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='Wb', borrow=True)
        # G^l, G^r, parameters used to construct the three-way coefficients
        Gl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3)).astype(floatX)
        self.Gl = theano.shared(value=Gl_val, name='G_l', borrow=True)
        Gr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3)).astype(floatX)
        self.Gr = theano.shared(value=Gr_val, name='G_r', borrow=True)
        self.Gb = theano.shared(value=np.zeros(3, dtype=floatX), name='Gb', borrow=True)
        # Save all the parameters into one batch
        self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
        # Compute the total number of parameters
        self.num_params = reduce(lambda x, y: x + np.prod(y.get_value().shape), self.params, 0)
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        self.pyramids, _ = theano.scan(fn=self._step_prop,
                                       sequences=T.arange(self.nsteps - 1),
                                       non_sequences=self.nsteps,
                                       outputs_info=[self.hidden0],
                                       n_steps=self.nsteps - 1)
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)
        if verbose:
            logger.debug('Finished constructing the structure of GrCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @current_level: Input matrix at the current level. The first dimension corresponds
        to the timestamp and the second to the dimension of the hidden representation.
        '''
        # Build shifted matrices; due to the constraints of theano.scan we have to keep
        # the shape of the input and output matrix
        left_current_level = current_level[:nsteps - iter - 1]
        right_current_level = current_level[1:nsteps - iter]
        # Compute the temporary central hidden representation, of size Txd; only the
        # first T-1 rows, i.e., the (T-1)xd sub-matrix, matter.
        central_current_level = self.act.activate(T.dot(left_current_level, self.Wl) +
                                                  T.dot(right_current_level, self.Wr) + self.Wb)
        # Compute the gating function, of size Tx3. Again, theano.scan forces us to keep
        # the full size, but only the first (T-1)x3 sub-matrix is actually used.
        current_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) +
                                       T.dot(right_current_level, self.Gr) + self.Gb)
        left_gate, central_gate, right_gate = \
            current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Build the next level of hidden representation by soft combination,
        # a matrix of size (T-1)xd
        next_level = left_gate * left_current_level + \
                     right_gate * right_current_level + \
                     central_gate * central_current_level
        return T.set_subtensor(current_level[:nsteps - iter - 1], next_level)

    def _step_prop_reduce(self, current_level):
        '''
        @current_level: Input matrix at the current level; the first dimension is time,
        the second the hidden dimension. Reduced version of level propagation: a much
        more memory- and time-efficient implementation, but it cannot be used inside
        theano.scan because scan requires the input and output to keep the same shape
        across timestamps.
        '''
        right_current_level = current_level[1:]
        left_current_level = current_level[:-1]
        central_current_level = self.act.activate(T.dot(left_current_level, self.Wl) +
                                                  T.dot(right_current_level, self.Wr) + self.Wb)
        current_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) +
                                       T.dot(right_current_level, self.Gr) + self.Gb)
        left_gate, central_gate, right_gate = \
            current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Soft combination into the next level, a matrix of size (T-1)xd
        next_level = left_gate * left_current_level + \
                     right_gate * right_current_level + \
                     central_gate * central_current_level
        return next_level

    def encode(self, inputM):
        '''
        @inputM: Theano symbolic matrix. Compress the input matrix into an output vector.
        '''
        hidden = T.dot(inputM, self.U)
        # Length of the time sequence
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop,
                                  sequences=T.arange(nsteps - 1),
                                  non_sequences=nsteps,
                                  outputs_info=[hidden],
                                  n_steps=nsteps - 1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output

    def L2_loss(self):
        '''
        Return the L2 norm of the model parameters.
        '''
        return T.sum(self.U ** 2) + T.sum(self.Wl ** 2) + T.sum(self.Wr ** 2) + \
               T.sum(self.Gl ** 2) + T.sum(self.Gr ** 2)

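# A NumPy sketch of one pyramid level of the (binary) GrCNN above: every
# adjacent pair (h_t, h_{t+1}) is merged through a softmax gate over
# {left, central, right}, shrinking the sequence length by one per level.
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
T_len, d = 5, 4
level = rng.normal(size=(T_len, d))
Wl, Wr, Wb = rng.normal(size=(d, d)) * 0.1, rng.normal(size=(d, d)) * 0.1, np.zeros(d)
Gl, Gr, Gb = rng.normal(size=(d, 3)) * 0.1, rng.normal(size=(d, 3)) * 0.1, np.zeros(3)

left, right = level[:-1], level[1:]                  # (T-1, d) each
central = np.tanh(left @ Wl + right @ Wr + Wb)       # candidate merges
gates = softmax(left @ Gl + right @ Gr + Gb)         # (T-1, 3)
next_level = gates[:, 0:1] * left + gates[:, 1:2] * central + gates[:, 2:3] * right
print(next_level.shape)  # (4, 4): one level higher in the pyramid
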
class ExtGrCNNEncoder(object):
    '''
    An extension of the canonical GrCNN, with more than 1 gate at each local binary window.
    '''
    def __init__(self, config, verbose=True):
        '''
        @config: GrCNNConfiger. Configer used to set the architecture of ExtGrCNNEncoder.
        '''
        if verbose:
            logger.debug('Building Extended Gated Recursive Convolutional Neural Network Encoder...')
        # Scale factor for initializing model parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='ExtGrCNNEncoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameters
        np.random.seed(config.random_seed)
        # Projection matrix U
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)
        # 3rd-order tensors implementing the multi-gate GrCNN encoder, where the first
        # dimension corresponds to the number of gates
        Wl_vals = [np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out)).astype(floatX)
                   for _ in xrange(config.num_gates)]
        Wl_vals = [np.linalg.svd(Wl_val)[0] for Wl_val in Wl_vals]
        Wl_vals = np.asarray(Wl_vals)
        self.Wl = theano.shared(value=Wl_vals, name='W_l', borrow=True)
        Wr_vals = [np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out)).astype(floatX)
                   for _ in xrange(config.num_gates)]
        Wr_vals = [np.linalg.svd(Wr_val)[0] for Wr_val in Wr_vals]
        Wr_vals = np.asarray(Wr_vals)
        self.Wr = theano.shared(value=Wr_vals, name='W_r', borrow=True)
        self.Wb = theano.shared(value=np.zeros((config.num_gates, fan_out), dtype=floatX),
                                name='W_b', borrow=True)
        # Multi-gate choosing functions
        Gl_vals = np.random.uniform(low=-1.0, high=1.0,
                                    size=(fan_out, config.num_gates + 2)).astype(floatX)
        self.Gl = theano.shared(value=Gl_vals, name='G_l', borrow=True)
        Gr_vals = np.random.uniform(low=-1.0, high=1.0,
                                    size=(fan_out, config.num_gates + 2)).astype(floatX)
        self.Gr = theano.shared(value=Gr_vals, name='G_r', borrow=True)
        self.Gb = theano.shared(value=np.zeros(config.num_gates + 2, dtype=floatX),
                                name='G_b', borrow=True)
        # Stack all the model parameters
        self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
        self.num_params = fan_in * fan_out + 2 * config.num_gates * fan_out * fan_out + \
                          config.num_gates * fan_out + \
                          2 * (config.num_gates + 2) * fan_out + config.num_gates + 2
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        # Build the ExtGrCNNEncoder pyramid
        self.pyramids, _ = theano.scan(fn=self._step_prop,
                                       sequences=T.arange(self.nsteps - 1),
                                       non_sequences=self.nsteps,
                                       outputs_info=[self.hidden0],
                                       n_steps=self.nsteps - 1)
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)
        if verbose:
            logger.debug('Finished constructing the structure of ExtGrCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Number of gating functions: %d' % config.num_gates)
            logger.debug('Number of parameters in ExtGrCNN: %d' % self.num_params)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @current_level: Input matrix at the current level. The first dimension corresponds
        to time and the second to the dimension of the hidden representation.
        '''
        # Build shifted matrices; due to the constraints of theano.scan we have to keep
        # the shape of the input and output matrix, of size Txd
        left_current_level = current_level[:nsteps - iter - 1]
        right_current_level = current_level[1:nsteps - iter]
        # Compute the temporary central multi-representation, of size TxKxd, where T is
        # the time dimension, K the number of gates and d the hidden dimension
        multi_centrals = self.act.activate(T.dot(left_current_level, self.Wl) +
                                           T.dot(right_current_level, self.Wr) + self.Wb)
        # Compute the gating function, of size Tx(K+2)
        multi_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) +
                                     T.dot(right_current_level, self.Gr) + self.Gb)
        # Softmax-gated combination
        multi_gates = multi_gates.dimshuffle(0, 1, 'x')
        next_level = multi_gates[:, 1:-1, :] * multi_centrals
        next_level = T.sum(next_level, axis=1)
        next_level += multi_gates[:, 0] * left_current_level + \
                      multi_gates[:, -1] * right_current_level
        return T.set_subtensor(current_level[:nsteps - iter - 1], next_level)

    def encode(self, inputM):
        '''
        @inputM: Theano symbolic matrix. Compress the input matrix into an output vector.
        The first dimension of inputM should correspond to the time dimension.
        '''
        hidden = T.dot(inputM, self.U)
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop,
                                  sequences=T.arange(nsteps - 1),
                                  non_sequences=nsteps,
                                  outputs_info=[hidden],
                                  n_steps=nsteps - 1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output

def __init__(self, config, verbose=True):
    if verbose:
        logger.debug('Building Bidirectional RNN Encoder...')
    self.input = T.matrix(name='BRNNEncoder_input')
    # Configure activation function
    self.act = Activation(config.activation)
    # Build bidirectional RNN
    num_input, num_hidden = config.num_input, config.num_hidden
    self.num_params = 2 * (num_input * num_hidden + num_hidden * num_hidden + num_hidden)
    # Initialize model parameters
    np.random.seed(config.random_seed)
    # 1. Feed-forward matrix for the forward direction: W_forward
    W_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden))
    W_forward_val = W_forward_val.astype(floatX)
    self.W_forward = theano.shared(value=W_forward_val, name='W_forward', borrow=True)
    # 1. Feed-forward matrix for the backward direction: W_backward
    W_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden))
    W_backward_val = W_backward_val.astype(floatX)
    self.W_backward = theano.shared(value=W_backward_val, name='W_backward', borrow=True)
    # 2. Recurrent matrix for the forward direction: U_forward (orthogonalized)
    U_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden))
    U_forward_val = U_forward_val.astype(floatX)
    U_forward_val, _, _ = np.linalg.svd(U_forward_val)
    self.U_forward = theano.shared(value=U_forward_val, name='U_forward', borrow=True)
    # 2. Recurrent matrix for the backward direction: U_backward (orthogonalized)
    U_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden))
    U_backward_val = U_backward_val.astype(floatX)
    U_backward_val, _, _ = np.linalg.svd(U_backward_val)
    self.U_backward = theano.shared(value=U_backward_val, name='U_backward', borrow=True)
    # 3. Bias parameter for the forward-direction hidden layer
    b_forward_val = np.zeros(num_hidden, dtype=floatX)
    self.b_forward = theano.shared(value=b_forward_val, name='b_forward', borrow=True)
    # 3. Bias parameter for the backward-direction hidden layer
    b_backward_val = np.zeros(num_hidden, dtype=floatX)
    self.b_backward = theano.shared(value=b_backward_val, name='b_backward', borrow=True)
    # h[0], zero vectors, treated as constants
    self.h0_forward = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                    name='h0_forward', borrow=True)
    self.h0_backward = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                     name='h0_backward', borrow=True)
    # Stack all the parameters
    self.params = [self.W_forward, self.W_backward, self.U_forward, self.U_backward,
                   self.b_forward, self.b_backward]
    # Compute the forward and backward representations over time
    self.h_forwards, _ = theano.scan(fn=self._forward_step,
                                     sequences=self.input,
                                     outputs_info=[self.h0_forward],
                                     truncate_gradient=config.bptt)
    self.h_backwards, _ = theano.scan(fn=self._backward_step,
                                      sequences=self.input,
                                      outputs_info=[self.h0_backward],
                                      truncate_gradient=config.bptt,
                                      go_backwards=True)
    # Average compression over time
    self.h_forward = T.mean(self.h_forwards, axis=0)
    self.h_backward = T.mean(self.h_backwards, axis=0)
    # Concatenate both directions
    self.output = T.concatenate([self.h_forward, self.h_backward], axis=0)
    # L1, L2 regularization (summed per matrix; W and U have different shapes)
    self.L1_norm = T.sum(T.abs_(self.W_forward)) + T.sum(T.abs_(self.W_backward)) + \
                   T.sum(T.abs_(self.U_forward)) + T.sum(T.abs_(self.U_backward))
    self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
                   T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2)
    if verbose:
        logger.debug('Finished constructing the structure of BRNN Encoder: ')
        logger.debug('Size of the input dimension: %d' % num_input)
        logger.debug('Size of the hidden dimension: %d' % num_hidden)
        logger.debug('Activation function: %s' % config.activation)

def __init__(self, configs=None, verbose=True):
    '''
    @configs: CNNConfiger. Configer used to set the architecture of CNN.
    '''
    if verbose:
        pprint("Building Convolutional Neural Network...")
    # Make theano symbolic tensors for input and ground-truth label
    self.input = T.tensor4(name='input', dtype=floatX)
    self.truth = T.ivector(name='label')
    self.learn_rate = T.scalar(name='learn rate')
    self.batch_size = configs.batch_size
    self.image_row = configs.image_row
    self.image_col = configs.image_col
    # There may be multiple convolution-pooling layers and multilayer perceptrons.
    self.convpool_layers = []
    self.hidden_layers = []
    self.softmax_layers = []
    # Configure activation function
    self.act = Activation(configs.activation)
    # Configuration should be valid
    assert configs.num_convpool == len(configs.convs)
    assert configs.num_convpool == len(configs.pools)
    assert configs.num_hidden == len(configs.hiddens)
    assert configs.num_softmax == len(configs.softmaxs)
    # Construct random number generator
    srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
    # Build architecture of CNN
    # Convolution and pooling layers
    image_shapes, filter_shapes = [], []
    for i in xrange(configs.num_convpool):
        if i == 0:
            image_shapes.append((self.batch_size, 1, self.image_row, self.image_col))
            filter_shapes.append((configs.convs[i][0], 1,
                                  configs.convs[i][1], configs.convs[i][2]))
        else:
            image_shapes.append((self.batch_size, configs.convs[i-1][0],
                                 (image_shapes[i-1][2] - configs.convs[i-1][1] + 1) / configs.pools[i-1][0],
                                 (image_shapes[i-1][3] - configs.convs[i-1][2] + 1) / configs.pools[i-1][1]))
            filter_shapes.append((configs.convs[i][0], configs.convs[i-1][0],
                                  configs.convs[i][1], configs.convs[i][2]))
    for i in xrange(configs.num_convpool):
        if i == 0:
            current_input = self.input
        else:
            current_input = self.convpool_layers[i-1].output
        self.convpool_layers.append(LeNetConvPoolLayer(input=current_input,
                                                       filter_shape=filter_shapes[i],
                                                       image_shape=image_shapes[i],
                                                       poolsize=configs.pools[i],
                                                       act=self.act))
    # Multilayer perceptron layers
    for i in xrange(configs.num_hidden):
        if i == 0:
            current_input = T.flatten(self.convpool_layers[configs.num_convpool-1].output, 2)
        else:
            current_input = self.hidden_layers[i-1].output
        # Add dropout to the hidden layer's output (the mask must be applied to
        # the layer's symbolic output, not the layer object itself)
        hidden_layer = HiddenLayer(current_input, configs.hiddens[i], act=self.act)
        mask = srng.binomial(n=1, p=1-configs.dropout, size=hidden_layer.output.shape)
        hidden_layer.output *= T.cast(mask, floatX)
        self.hidden_layers.append(hidden_layer)
    # Softmax layer; in most cases the architecture contains only one softmax layer
    for i in xrange(configs.num_softmax):
        if i == 0:
            current_input = self.hidden_layers[configs.num_hidden-1].output
        else:
            current_input = self.softmax_layers[i-1].output
        self.softmax_layers.append(SoftmaxLayer(current_input, configs.softmaxs[i]))
    # Output
    self.pred = self.softmax_layers[configs.num_softmax-1].prediction()
    # Build cost function with ground truth provided
    self.cost = self.softmax_layers[configs.num_softmax-1].NLL_loss(self.truth)
    # Stack all the parameters
    self.params = []
    for convpool_layer in self.convpool_layers:
        self.params.extend(convpool_layer.params)
    for hidden_layer in self.hidden_layers:
        self.params.extend(hidden_layer.params)
    for softmax_layer in self.softmax_layers:
        self.params.extend(softmax_layer.params)
    # Compute gradient of self.cost with respect to network parameters
    self.gradparams = T.grad(self.cost, self.params)
    # Stochastic gradient descent updates
    self.updates = []
    for param, gradparam in zip(self.params, self.gradparams):
        self.updates.append((param, param - self.learn_rate * gradparam))
    # Build objective function
    self.objective = theano.function(inputs=[self.input, self.truth, self.learn_rate],
                                     outputs=self.cost, updates=self.updates)
    # Build prediction function
    self.predict = theano.function(inputs=[self.input], outputs=self.pred)
    if verbose:
        pprint('Architecture building finished, summarized as below: ')
        pprint('There are %d layers (not including the input layer) altogether: ' %
               (configs.num_convpool * 2 + configs.num_hidden + configs.num_softmax))
        pprint('%d convolution layers + %d maxpooling layers.' %
               (len(self.convpool_layers), len(self.convpool_layers)))
        pprint('%d hidden layers.' % (len(self.hidden_layers)))
        pprint('%d softmax layers.' % (len(self.softmax_layers)))
        pprint('=' * 50)
        pprint('Detailed architecture of each layer: ')
        pprint('-' * 50)
        pprint('Convolution and Pooling layers: ')
        for i in xrange(len(self.convpool_layers)):
            pprint('Convolution Layer %d: ' % i)
            pprint('%d feature maps, each with a filter kernel of size (%d, %d)' %
                   (configs.convs[i][0], configs.convs[i][1], configs.convs[i][2]))
        pprint('-' * 50)
        pprint('Hidden layers: ')
        for i in xrange(len(self.hidden_layers)):
            pprint('Hidden Layer %d: ' % i)
            pprint('Input dimension: %d, Output dimension: %d' %
                   (configs.hiddens[i][0], configs.hiddens[i][1]))
        pprint('-' * 50)
        pprint('Softmax layers: ')
        for i in xrange(len(self.softmax_layers)):
            pprint('Softmax Layer %d: ' % i)
            pprint('Input dimension: %d, Output dimension: %d' %
                   (configs.softmaxs[i][0], configs.softmaxs[i][1]))

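# The image-shape bookkeeping above follows standard LeNet arithmetic for a
# 'valid' convolution followed by non-overlapping pooling; a quick check:
def convpool_output_size(n, kernel, pool):
    return (n - kernel + 1) // pool

# 28x28 input, 5x5 kernels, 2x2 pooling -> 12x12 feature maps
assert convpool_output_size(28, 5, 2) == 12
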
class NNWordBasedAttention(WordBasedAttention):
    """
    Neural Machine Translation by Jointly Learning to Align and Translate
    Dzmitry Bahdanau, KyungHyun Cho, and Yoshua Bengio
    In Proceedings of ICLR 2015
    http://arxiv.org/abs/1409.0473v3
    """
    def __init__(self, word_dim, seq_dim, hidden_dim, activation='tanh',
                 initializer=default_initializer):
        super(NNWordBasedAttention, self).__init__(word_dim=word_dim, seq_dim=seq_dim,
                                                   initializer=initializer)
        self.hidden_dim = hidden_dim
        # (word_dim, hidden_dim)
        self.W = shared_rand_matrix((self.word_dim, self.hidden_dim), 'Attention_W', initializer)
        # (seq_dim, hidden_dim)
        self.U = shared_rand_matrix((self.seq_dim, self.hidden_dim), 'Attention_U', initializer)
        # (hidden_dim, )
        self.v = shared_rand_matrix((self.hidden_dim, ), 'Attention_v', initializer)
        self.act = Activation(activation)
        self.params = [self.W]
        self.norm_params = [self.W]

    def score(self, word, sequence):
        """
        :param word: (word_dim, )
        :param sequence: (length, seq_dim)
        :return: score: (length, )
        """
        # (word_dim, ) dot (word_dim, hidden_dim) -> (hidden_dim, )
        hidden1 = T.dot(word, self.W)
        # (length, seq_dim) dot (seq_dim, hidden_dim) -> (length, hidden_dim)
        hidden2 = T.dot(sequence, self.U)
        # (hidden_dim, ) + (length, hidden_dim) -> (length, hidden_dim)
        hidden = hidden1[None, :] + hidden2
        # (length, hidden_dim) -> (length, hidden_dim)
        act_hidden = self.act.activate(hidden)
        # (length, hidden_dim) dot (hidden_dim, ) -> (length, )
        score = T.dot(act_hidden, self.v)
        return score

    def score_batch(self, word, sequence):
        """
        :param word: (batch, word_dim)
        :param sequence: (batch, length, seq_dim)
        :return: score: (batch, length)
        """
        # (batch, word_dim) dot (word_dim, hidden_dim) -> (batch, hidden_dim)
        hidden1 = T.dot(word, self.W)
        # (batch, length, seq_dim) dot (seq_dim, hidden_dim) -> (batch, length, hidden_dim)
        hidden2 = T.dot(sequence, self.U)
        # (batch, hidden_dim) + (batch, length, hidden_dim) -> (batch, length, hidden_dim)
        hidden = hidden1[:, None, :] + hidden2
        # (batch, length, hidden_dim) -> (batch, length, hidden_dim)
        act_hidden = self.act.activate(hidden)
        # (batch, length, hidden_dim) dot (hidden_dim, ) -> (batch, length)
        score = T.dot(act_hidden, self.v)
        return score

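# A NumPy sketch of the additive attention above (Bahdanau et al., 2015):
# score each sequence position against a query word, then softmax the scores
# into weights and average the sequence. Names here are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
word_dim, seq_dim, hidden_dim, length = 4, 6, 5, 3
word = rng.normal(size=word_dim)
sequence = rng.normal(size=(length, seq_dim))
W = rng.normal(size=(word_dim, hidden_dim)) * 0.1
U = rng.normal(size=(seq_dim, hidden_dim)) * 0.1
v = rng.normal(size=hidden_dim)

scores = np.tanh(word @ W + sequence @ U) @ v     # (length,)
weights = np.exp(scores) / np.exp(scores).sum()   # softmax attention weights
context = weights @ sequence                      # (seq_dim,) weighted average
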
class BRNN(object):
    '''
    Bidirectional RNN. This is just a trial for using a BRNN as a tool for
    sentence modeling. First trial on the task of sentiment analysis.
    '''

    def __init__(self, configs, verbose=True):
        if verbose:
            pprint('Building tied-weights Bidirectional Recurrent Neural Network')
        self.input = T.matrix(name='input')
        self.truth = T.ivector(name='label')
        self.learn_rate = T.scalar(name='learn rate')
        # Configure activation function
        self.act = Activation(configs.activation)
        # Build bidirectional RNN with tied weights
        num_input, num_hidden, num_class = configs.num_input, configs.num_hidden, configs.num_class
        # Stack all the variables together into a single vector so that the
        # batch updating algorithm can be applied. Since the RNN runs in two
        # directions, every weight matrix associated with the RNN is duplicated.
        num_params = 2 * (num_input * num_hidden +
                          num_hidden * num_hidden +
                          num_hidden) + \
                     2 * num_hidden * num_class + \
                     num_class
        self.num_params = num_params
        self.theta = theano.shared(value=np.zeros(num_params, dtype=floatX),
                                   name='theta', borrow=True)
        # Incremental index into theta
        param_idx = 0
        # 1. Feed-forward matrix for the forward direction: W_forward
        self.W_forward = self.theta[param_idx: param_idx + num_input * num_hidden].reshape((num_input, num_hidden))
        self.W_forward.name = 'W_forward_RNN'
        W_forward_init = np.asarray(
            np.random.uniform(low=-np.sqrt(6.0 / (num_input + num_hidden)),
                              high=np.sqrt(6.0 / (num_input + num_hidden)),
                              size=(num_input, num_hidden)), dtype=floatX)
        param_idx += num_input * num_hidden
        # 1. Feed-forward matrix for the backward direction: W_backward
        self.W_backward = self.theta[param_idx: param_idx + num_input * num_hidden].reshape((num_input, num_hidden))
        self.W_backward.name = 'W_backward_RNN'
        W_backward_init = np.asarray(
            np.random.uniform(low=-np.sqrt(6.0 / (num_input + num_hidden)),
                              high=np.sqrt(6.0 / (num_input + num_hidden)),
                              size=(num_input, num_hidden)), dtype=floatX)
        param_idx += num_input * num_hidden
        # 2. Recurrent matrix for the forward direction: U_forward
        self.U_forward = self.theta[param_idx: param_idx + num_hidden * num_hidden].reshape((num_hidden, num_hidden))
        self.U_forward.name = 'U_forward_RNN'
        U_forward_init = np.asarray(
            np.random.uniform(low=-np.sqrt(6.0 / (num_hidden + num_hidden)),
                              high=np.sqrt(6.0 / (num_hidden + num_hidden)),
                              size=(num_hidden, num_hidden)), dtype=floatX)
        param_idx += num_hidden * num_hidden
        # 2. Recurrent matrix for the backward direction: U_backward
        self.U_backward = self.theta[param_idx: param_idx + num_hidden * num_hidden].reshape((num_hidden, num_hidden))
        self.U_backward.name = 'U_backward_RNN'
        U_backward_init = np.asarray(
            np.random.uniform(low=-np.sqrt(6.0 / (num_hidden + num_hidden)),
                              high=np.sqrt(6.0 / (num_hidden + num_hidden)),
                              size=(num_hidden, num_hidden)), dtype=floatX)
        param_idx += num_hidden * num_hidden
        # 3. Bias vector for the forward-direction hidden layer
        self.b_forward = self.theta[param_idx: param_idx + num_hidden]
        self.b_forward.name = 'b_forward_RNN'
        b_forward_init = np.zeros(num_hidden, dtype=floatX)
        param_idx += num_hidden
        # 3. Bias vector for the backward-direction hidden layer
        self.b_backward = self.theta[param_idx: param_idx + num_hidden]
        self.b_backward.name = 'b_backward_RNN'
        b_backward_init = np.zeros(num_hidden, dtype=floatX)
        param_idx += num_hidden
        # Weight matrix for the softmax function
        self.W_softmax = self.theta[param_idx: param_idx + 2 * num_hidden * num_class].reshape((2 * num_hidden, num_class))
        self.W_softmax.name = 'W_softmax'
        W_softmax_init = np.asarray(
            np.random.uniform(low=-np.sqrt(6.0 / (2 * num_hidden + num_class)),
                              high=np.sqrt(6.0 / (2 * num_hidden + num_class)),
                              size=(2 * num_hidden, num_class)), dtype=floatX)
        param_idx += 2 * num_hidden * num_class
        # Bias vector for the softmax function
        self.b_softmax = self.theta[param_idx: param_idx + num_class]
        self.b_softmax.name = 'b_softmax'
        b_softmax_init = np.zeros(num_class, dtype=floatX)
        param_idx += num_class
        # Set all the initial parameter values into theta
        self.theta.set_value(np.concatenate([x.ravel() for x in
            (W_forward_init, W_backward_init, U_forward_init, U_backward_init,
             b_forward_init, b_backward_init, W_softmax_init, b_softmax_init)]))
        assert param_idx == num_params
        # h[0]: zero vectors, treated as constants
        self.h_start = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                     name='h_start', borrow=True)
        self.h_end = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                   name='h_end', borrow=True)

        # Recurrent functions used to compress a sequence of input vectors;
        # the first dimension corresponds to time.
        def forward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_forward) +
                                    T.dot(h_tm1, self.U_forward) + self.b_forward)
            return h_t

        def backward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_backward) +
                                    T.dot(h_tm1, self.U_backward) + self.b_backward)
            return h_t

        # Forward and backward representations over time
        self.forward_h, _ = theano.scan(fn=forward_step, sequences=self.input,
                                        outputs_info=[self.h_start],
                                        truncate_gradient=configs.bptt)
        self.backward_h, _ = theano.scan(fn=backward_step, sequences=self.input,
                                         outputs_info=[self.h_end],
                                         truncate_gradient=configs.bptt,
                                         go_backwards=True)
        # Use the mean over time rather than the final hidden states
        # self.h_start_star = self.forward_h[-1]
        # self.h_end_star = self.backward_h[-1]
        self.h_start_star = T.mean(self.forward_h, axis=0)
        self.h_end_star = T.mean(self.backward_h, axis=0)
        # L1, L2 regularization
        self.L1_norm = T.sum(T.abs_(self.W_forward) + T.abs_(self.W_backward) +
                             T.abs_(self.U_forward) + T.abs_(self.U_backward) +
                             T.abs_(self.W_softmax))
        self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
                       T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2) + \
                       T.sum(self.W_softmax ** 2)
        # Functions to show the learned representations of different sentences
        self.show_forward = theano.function(inputs=[self.input], outputs=self.h_start_star)
        self.show_backward = theano.function(inputs=[self.input], outputs=self.h_end_star)
        ##################################################################################
        # Correlated BRNN
        ##################################################################################
        # Concatenate the two direction vectors into one
        self.h = T.concatenate([self.h_start_star, self.h_end_star], axis=0)
        # Dropout mask
        srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
        mask = srng.binomial(n=1, p=1 - configs.dropout, size=self.h.shape)
        self.h *= T.cast(mask, floatX)
        # Use the concatenated vector as input to the Softmax/MLP classifier
        self.output = T.nnet.softmax(T.dot(self.h, self.W_softmax) + self.b_softmax)
        self.pred = T.argmax(self.output, axis=1)
        # Build cost function (negative log-likelihood)
        self.cost = -T.mean(T.log(self.output)[T.arange(self.truth.shape[0]), self.truth])
        if configs.regularization:
            self.cost += configs.lambda1 * self.L2_norm
        # Compute gradients
        self.gradtheta = T.grad(self.cost, self.theta)
        self.gradinput = T.grad(self.cost, self.input)
        # Compute the cost and the gradients w.r.t. the parameters
        self.compute_cost_and_gradient = theano.function(
            inputs=[self.input, self.truth], outputs=[self.cost, self.gradtheta])
        # Compute the gradients w.r.t. the inputs
        self.compute_input_gradient = theano.function(
            inputs=[self.input, self.truth], outputs=self.gradinput)
        # Build prediction function
        self.predict = theano.function(inputs=[self.input], outputs=self.pred)
        if verbose:
            pprint('*' * 50)
            pprint('Finished constructing Bidirectional Recurrent Neural Network (BRNN)')
            pprint('Size of input dimension: %d' % configs.num_input)
            pprint('Size of hidden/recurrent dimension: %d' % configs.num_hidden)
            pprint('Size of output dimension: %d' % configs.num_class)
            pprint('Is regularization applied? %s' % ('yes' if configs.regularization else 'no'))
            if configs.regularization:
                pprint('Coefficient of regularization term: %f' % configs.lambda1)
            pprint('BPTT step: %d' % configs.bptt)
            pprint('Number of free parameters in BRNN: %d' % self.num_params)
            pprint('*' * 50)

    # Implements the batch updating algorithm: plain stochastic gradient descent
    def update_params(self, gradtheta, learn_rate):
        # gradtheta is a single flat vector matching the layout of self.theta
        theta = self.theta.get_value(borrow=True)
        self.theta.set_value(theta - learn_rate * gradtheta, borrow=True)

    @staticmethod
    def save(fname, model):
        with file(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        with file(fname, 'rb') as fin:
            return cPickle.load(fin)
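# --- Hedged usage sketch (added; the config fields below are assumptions
# inferred from the attribute accesses in BRNN.__init__) ---
#
#   configs = SomeConfig(num_input=50, num_hidden=64, num_class=2,
#                        activation='tanh', bptt=-1, dropout=0.0,
#                        random_seed=1234, regularization=False)
#   brnn = BRNN(configs)
#   for x, y in training_data:        # x: (time, num_input); y: int32 labels
#       cost, grad = brnn.compute_cost_and_gradient(x, y)
#       brnn.update_params(grad, learn_rate=0.1)
#   BRNN.save('brnn.pkl', brnn)       # pickle the whole model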
class TransEModel(EntityScorer):
    def __init__(self, entity_dim, relation_num, activation='iden',
                 initializer=default_initializer, prefix='', verbose=True):
        super(TransEModel, self).__init__()
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        # (relation_num, entity_dim): one translation vector per relation
        self.W = shared_rand_matrix((relation_num, self.entity_dim),
                                    prefix + 'TransE_R', initializer)
        self.act = Activation(activation)
        self.params = [self.W]
        self.norm_params = [self.W]
        self.l1_norm = T.sum(T.abs_(self.W))
        self.l2_norm = T.sum(self.W ** 2)
        if verbose:
            logger.debug('Architecture of TransE Model built, summarized as below:')
            logger.debug('Entity Dimension: %d' % self.entity_dim)
            logger.debug('Relation Number: %d' % self.relation_num)
            logger.debug('Initializer: %s' % initializer)
            logger.debug('Activation: %s' % activation)

    def score(self, e1, e2, r_index):
        """
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: scalar
        """
        # (entity_dim, ) + (entity_dim, ) - (entity_dim, ) -> (entity_dim, )
        hidden = e1 + self.W[r_index] - e2
        # (entity_dim, ) -> scalar
        d = T.sum(hidden ** 2)
        return self.act.activate(d)

    def score_batch(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param r_index: (batch, )
        :return: (batch, )
        """
        # (batch, entity_dim) + (batch, entity_dim) - (batch, entity_dim) -> (batch, entity_dim)
        hidden = e1 + self.W[r_index] - e2
        d = T.sum(hidden ** 2, axis=1)
        return self.act.activate(d)

    def score_one_relation(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param r_index: scalar
        :return: (batch, )
        """
        # Broadcast the single relation vector across the batch
        hidden = e1 + self.W[r_index][None, :] - e2
        d = T.sum(hidden ** 2, axis=1)
        return self.act.activate(d)
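# --- Hedged reference computation (added; not part of the original source) ---
# TransE scores a triple (e1, r, e2) by the squared distance ||e1 + r - e2||^2,
# which is what score() computes when the activation is the identity. A NumPy
# check with made-up dimensions:
import numpy as np

entity_dim, relation_num = 4, 3
R = np.random.randn(relation_num, entity_dim)     # plays the role of self.W
e1, e2 = np.random.randn(entity_dim), np.random.randn(entity_dim)
r_index = 1

d = np.sum((e1 + R[r_index] - e2) ** 2)           # scalar distance; lower is better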
class NeuralTensorModel(EntityScorer):
    def __init__(self, entity_dim, relation_num, activation='tanh', hidden=5,
                 keep_normal=False, initializer=default_initializer, prefix='',
                 verbose=True):
        super(NeuralTensorModel, self).__init__()
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        self.hidden = hidden
        self.slice_seq = T.arange(hidden)
        self.keep_normal = keep_normal
        # (relation_num, entity_dim, entity_dim, hidden)
        self.W = shared_rand_matrix(
            (relation_num, self.entity_dim, self.entity_dim, self.hidden),
            prefix + 'NTN_W', initializer)
        # (relation_num, hidden)
        self.U = shared_ones_matrix((relation_num, self.hidden), name=prefix + 'NTN_U')
        if keep_normal:
            # (relation_num, 2 * entity_dim, hidden)
            self.V = shared_rand_matrix(
                (relation_num, self.entity_dim * 2, self.hidden),
                prefix + 'NTN_V', initializer)
            # (relation_num, hidden)
            self.b = shared_zero_matrix((relation_num, self.hidden), name=prefix + 'NTN_B')
            self.params = [self.W, self.V, self.U, self.b]
            self.norm_params = [self.W, self.V, self.U, self.b]
        else:
            self.params = [self.W]
            self.norm_params = [self.W]
        self.act = Activation(activation)
        self.l1_norm = T.sum([T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param ** 2) for param in self.norm_params])
        if verbose:
            logger.debug('Architecture of Neural Tensor Model built, summarized as below:')
            logger.debug('Entity Dimension: %d' % self.entity_dim)
            logger.debug('Hidden Dimension: %d' % self.hidden)
            logger.debug('Relation Number: %d' % self.relation_num)
            logger.debug('Initializer: %s' % initializer)
            logger.debug('Activation: %s' % activation)

    @staticmethod
    def step(_slice, e1, e2, w):
        """
        :param _slice: scalar
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param w: (entity_dim, entity_dim, hidden)
        :return: scalar
        """
        # Index the slice on the last axis; the original indexed the first
        # axis (w[_slice]), which has the wrong shape.
        # (entity_dim, ) dot (entity_dim, entity_dim) dot (entity_dim, ) -> scalar
        return T.dot(e1, T.dot(w[:, :, _slice], e2))

    @staticmethod
    def step_relation(_slice, e1, e2, w):
        """
        :param _slice: scalar
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param w: (entity_dim, entity_dim, hidden)
        :return: (batch, )
        """
        # (batch, entity_dim) dot (entity_dim, entity_dim) -> (batch, entity_dim)
        hidden = T.dot(e1, w[:, :, _slice])
        # (batch, entity_dim) * (batch, entity_dim) summed -> (batch, )
        hidden = T.sum(hidden * e2, axis=1)
        return hidden

    @staticmethod
    def step_batch(_slice, e1, e2, w):
        """
        :param _slice: scalar
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param w: (batch, entity_dim, entity_dim, hidden)
        :return: (batch, )
        """
        # (batch, entity_dim) batched_dot (batch, entity_dim, entity_dim) -> (batch, entity_dim)
        hidden = T.batched_dot(e1, w[:, :, :, _slice])
        # (batch, entity_dim) * (batch, entity_dim) summed -> (batch, )
        hidden = T.sum(hidden * e2, axis=1)
        return hidden

    def score(self, e1, e2, r_index):
        """
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: scalar
        """
        # (entity_dim, ) dot (entity_dim, entity_dim, hidden) dot (entity_dim, ) -> (hidden, )
        hidden1_sep, _ = theano.scan(fn=self.step,
                                     sequences=[self.slice_seq],
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='single_scan')
        hidden1 = T.concatenate([hidden1_sep])
        if self.keep_normal:
            # (2 * entity_dim, ) dot (2 * entity_dim, hidden) -> (hidden, )
            hidden2 = T.dot(T.concatenate([e1, e2]), self.V[r_index])
            # (hidden, ) + (hidden, ) + (hidden, ) -> (hidden, )
            hidden = hidden1 + hidden2 + self.b[r_index]
        else:
            hidden = hidden1
        # (hidden, ) -> (hidden, )
        act_hidden = self.act.activate(hidden)
        # (hidden, ) dot (hidden, ) -> scalar
        return T.dot(act_hidden, self.U[r_index])
    def score_batch(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param r_index: (batch, )
        :return: (batch, )
        """
        # hidden scan steps, each producing (batch, ) -> (hidden, batch)
        hidden1_sep, _ = theano.scan(fn=self.step_batch,
                                     sequences=[self.slice_seq],
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='batch_scan')
        # (hidden, batch) -> (batch, hidden)
        hidden1 = T.concatenate([hidden1_sep], axis=1).transpose()
        if self.keep_normal:
            # (batch, 2 * entity_dim) batched_dot (batch, 2 * entity_dim, hidden) -> (batch, hidden)
            hidden2 = T.batched_dot(T.concatenate([e1, e2], axis=1), self.V[r_index])
            # (batch, hidden) + (batch, hidden) + (batch, hidden) -> (batch, hidden)
            hidden = hidden1 + hidden2 + self.b[r_index]
        else:
            hidden = hidden1
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) * (batch, hidden) summed -> (batch, )
        return T.sum(act_hidden * self.U[r_index], axis=1)

    def score_one_relation(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param r_index: scalar
        :return: (batch, )
        """
        # hidden scan steps, each producing (batch, ) -> (hidden, batch)
        hidden1_sep, _ = theano.scan(fn=self.step_relation,
                                     sequences=self.slice_seq,
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='relation_scan')
        # (hidden, batch) -> (batch, hidden)
        hidden1 = T.concatenate([hidden1_sep], axis=1).transpose()
        if self.keep_normal:
            # (batch, 2 * entity_dim) dot (2 * entity_dim, hidden) -> (batch, hidden)
            hidden2 = T.dot(T.concatenate([e1, e2], axis=1), self.V[r_index])
            # (batch, hidden) + (batch, hidden) + (hidden, ) -> (batch, hidden)
            hidden = hidden1 + hidden2 + self.b[r_index][None, :]
        else:
            hidden = hidden1
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) * (batch, hidden) summed -> (batch, )
        return T.sum(act_hidden * self.U[r_index], axis=1)
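# --- Hedged reference computation (added; not part of the original source) ---
# For a single relation, the tensor term of the Neural Tensor Network scores
# slice k as e1^T W[:, :, k] e2; with keep_normal it adds V^T [e1; e2] + b
# before the nonlinearity and the final u^T weighting. NumPy sketch of the
# tensor-only path, with illustrative sizes:
import numpy as np

entity_dim, hidden = 4, 5
W = np.random.randn(entity_dim, entity_dim, hidden)   # one relation's tensor
u = np.ones(hidden)
e1, e2 = np.random.randn(entity_dim), np.random.randn(entity_dim)

bilinear = np.array([e1.dot(W[:, :, k]).dot(e2) for k in range(hidden)])
score = np.tanh(bilinear).dot(u)                      # scalar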
class SingleLayerModel(EntityScorer):
    def __init__(self, entity_dim, relation_num, hidden=50, activation='tanh',
                 initializer=default_initializer, prefix='', verbose=True):
        super(SingleLayerModel, self).__init__()
        self.hidden = hidden
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        # (relation_num, hidden, entity_dim)
        self.W_1 = shared_rand_matrix((relation_num, self.hidden, self.entity_dim),
                                      prefix + 'SingleLayer_W1', initializer)
        # (relation_num, hidden, entity_dim)
        self.W_2 = shared_rand_matrix((relation_num, self.hidden, self.entity_dim),
                                      prefix + 'SingleLayer_W2', initializer)
        # (relation_num, hidden)
        self.u = shared_ones_matrix((relation_num, self.hidden), prefix + 'SingleLayer_u')
        self.act = Activation(activation)
        self.params = [self.W_1, self.W_2, self.u]
        self.norm_params = [self.W_1, self.W_2, self.u]
        self.l1_norm = T.sum(T.abs_(self.W_1)) + T.sum(T.abs_(self.W_2)) + T.sum(T.abs_(self.u))
        self.l2_norm = T.sum(self.W_1 ** 2) + T.sum(self.W_2 ** 2) + T.sum(self.u ** 2)
        if verbose:
            logger.debug('Architecture of Single Layer Model built, summarized as below:')
            logger.debug('Entity Dimension: %d' % self.entity_dim)
            logger.debug('Hidden Dimension: %d' % self.hidden)
            logger.debug('Relation Number: %d' % self.relation_num)
            logger.debug('Initializer: %s' % initializer)
            logger.debug('Activation: %s' % activation)

    def score(self, e1, e2, r_index):
        """
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: scalar
        """
        # (hidden, entity_dim) dot (entity_dim, ) + (hidden, entity_dim) dot (entity_dim, ) -> (hidden, )
        hidden = T.dot(self.W_1[r_index], e1) + T.dot(self.W_2[r_index], e2)
        # (hidden, ) -> (hidden, )
        act_hidden = self.act.activate(hidden)
        # (hidden, ) dot (hidden, ) -> scalar
        return T.dot(self.u[r_index], act_hidden)

    def score_batch(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param r_index: (batch, )
        :return: (batch, )
        """
        # (batch, hidden, entity_dim) batched_dot (batch, entity_dim) -> (batch, hidden)
        hidden = T.batched_dot(self.W_1[r_index], e1)
        hidden += T.batched_dot(self.W_2[r_index], e2)
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) * (batch, hidden) summed -> (batch, )
        return T.sum(act_hidden * self.u[r_index], axis=1)

    def score_one_relation(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param r_index: scalar
        :return: (batch, )
        """
        # (batch, entity_dim) dot (entity_dim, hidden) + (batch, entity_dim) dot (entity_dim, hidden) -> (batch, hidden)
        hidden = T.dot(e1, self.W_1[r_index].transpose()) + \
                 T.dot(e2, self.W_2[r_index].transpose())
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) dot (hidden, ) -> (batch, )
        return T.dot(act_hidden, self.u[r_index])
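# --- Hedged reference computation (added; not part of the original source) ---
# The Single Layer Model drops the tensor term and scores an entity pair as
# u^T tanh(W1 e1 + W2 e2). NumPy check with illustrative sizes:
import numpy as np

entity_dim, hidden = 4, 5
W1 = np.random.randn(hidden, entity_dim)
W2 = np.random.randn(hidden, entity_dim)
u = np.ones(hidden)
e1, e2 = np.random.randn(entity_dim), np.random.randn(entity_dim)

score = u.dot(np.tanh(W1.dot(e1) + W2.dot(e2)))   # scalar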
def __init__(self, config=None, verbose=True):
    '''
    @config: GRCNNConfiger. Configer used to set the architecture of GRCNNEncoder.
    '''
    if verbose:
        logger.debug('Building Gated Recursive Convolutional Neural Network Encoder...')
    # Scale factor for initializing parameters
    self.scale = config.scale
    # Theano symbolic tensor for the input
    self.input = T.matrix(name='GrCNN Encoder input', dtype=floatX)
    # Configure activation function
    self.act = Activation(config.activation)
    fan_in, fan_out = config.num_input, config.num_hidden
    # Initialize model parameters; fix the seed of the random generator first
    np.random.seed(config.random_seed)
    # Projection matrix U: uniform initialization, then scaled
    U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
    U_val = U_val.astype(floatX)
    U_val *= self.scale
    self.U = theano.shared(value=U_val, name='U', borrow=True)
    self.hidden0 = T.dot(self.input, self.U)
    # W^l, W^r: parameters used to construct the central hidden representation.
    # Both are initialized as orthogonal matrices (via SVD).
    Wl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
    Wl_val = Wl_val.astype(floatX)
    Wl_val, _, _ = np.linalg.svd(Wl_val)
    # Wl_val *= self.scale
    self.Wl = theano.shared(value=Wl_val, name='W_l', borrow=True)
    Wr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
    Wr_val = Wr_val.astype(floatX)
    Wr_val, _, _ = np.linalg.svd(Wr_val)
    # Wr_val *= self.scale
    self.Wr = theano.shared(value=Wr_val, name='W_r', borrow=True)
    self.Wb = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='Wb', borrow=True)
    # G^l, G^r: parameters used to construct the three-way gating coefficients
    Gl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
    Gl_val = Gl_val.astype(floatX)
    self.Gl = theano.shared(value=Gl_val, name='G_l', borrow=True)
    Gr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
    Gr_val = Gr_val.astype(floatX)
    self.Gr = theano.shared(value=Gr_val, name='G_r', borrow=True)
    self.Gb = theano.shared(value=np.zeros(3, dtype=floatX), name='Gb', borrow=True)
    # Gather all the parameters into one list
    self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
    # Total number of free parameters
    self.num_params = reduce(lambda x, y: x + np.prod(y.get_value().shape), self.params, 0)
    # Length of the time sequence
    self.nsteps = self.input.shape[0]
    self.pyramids, _ = theano.scan(fn=self._step_prop,
                                   sequences=T.arange(self.nsteps - 1),
                                   non_sequences=self.nsteps,
                                   outputs_info=[self.hidden0],
                                   n_steps=self.nsteps - 1)
    self.output = self.pyramids[-1][0].dimshuffle('x', 0)
    # Compression -- encoding function
    self.compress = theano.function(inputs=[self.input], outputs=self.output)
    if verbose:
        logger.debug('Finished constructing the structure of GrCNN Encoder:')
        logger.debug('Size of the input dimension: %d' % fan_in)
        logger.debug('Size of the hidden dimension: %d' % fan_out)
        logger.debug('Activation function: %s' % config.activation)
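# --- Hedged usage sketch (added; the enclosing class name and the config
# fields below are assumptions inferred from the attribute accesses above) ---
#
#   encoder = GRCNNEncoder(config)     # class name assumed from the docstring
#   sent = np.random.randn(12, config.num_input).astype(floatX)
#   vec = encoder.compress(sent)       # (1, num_hidden) sentence encoding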
images_path = './data/training_images'
annotations_path = './data/annotations'
classes_file = './data/classes.txt'

X, y = prepare_dataset(images_path, annotations_path, classes_file)

'''TRAINING PROCEDURE'''
from models_final import Sequential
from convolutions_final import Conv2D
from normalizations import BatchNormalization
from poolings import MaxPool2D
from dense_final import Flatten, Dense
from activations import Activation

model = Sequential()
model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer1", X.shape))
model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer1"))
model.add(Activation('relu'))
model.add(BatchNormalization(1, 0, 1e-5))
model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer2"))
model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer2"))
model.add(Activation('relu'))
model.add(BatchNormalization(1, 0, 1e-5))
model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer3"))
model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer3"))
model.add(Activation('relu'))
model.add(BatchNormalization(1, 0, 1e-5))
model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer4"))
model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer4"))
model.add(Activation('relu'))
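# --- Hedged sanity check (added; not part of the original source) ---
# With 'valid' convolutions (3x3 kernel, stride 1) and 'valid' max pooling
# (2x2 window, stride 2), each conv/pool block maps a spatial size n to
# floor((n - 2 - 2) / 2) + 1. The helper below is a hypothetical utility for
# confirming the input images are large enough for the four blocks above:
def valid_conv_pool_size(n, blocks=4, kernel=3, pool=2):
    for _ in range(blocks):
        n = n - (kernel - 1)         # 'valid' convolution, stride 1
        n = (n - pool) // pool + 1   # 'valid' max pooling, stride 2
    return n

# e.g. a 64x64 input leaves a 2x2 feature map after the four blocks
assert valid_conv_pool_size(64) == 2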