def __init__(self, in_dim, hidden_dim, activation, prefix="", initializer=default_initializer, dropout=0, verbose=True): if verbose: logger.debug('Building {}...'.format(self.__class__.__name__)) self.in_dim = in_dim self.hidden_dim = hidden_dim self.out_dim = hidden_dim self.act = Activation(activation) self.dropout = dropout self.W = shared_rand_matrix((self.hidden_dim, self.in_dim), prefix + 'W', initializer) self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b') self.params = [self.W, self.b] self.norm_params = [self.W] self.l1_norm = T.sum( [T.sum(T.abs_(param)) for param in self.norm_params]) self.l2_norm = T.sum([T.sum(param**2) for param in self.norm_params]) if verbose: logger.debug('Architecture of {} built finished'.format( self.__class__.__name__)) logger.debug('Input dimension: %d' % self.in_dim) logger.debug('Hidden dimension: %d' % self.hidden_dim) logger.debug('Activation Func: %s' % self.act.method) logger.debug('Dropout Rate: %f' % self.dropout)
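Only the constructor appears above; a minimal sketch of the affine-plus-activation mapping these shapes imply, written as a hypothetical free function (not the class's real forward/encode method) and assuming the `activate` method that is used elsewhere in this code:

import theano.tensor as T

def dense_forward(x, W, b, act):
    # x: (batch, in_dim); W: (hidden_dim, in_dim); b: (hidden_dim,)
    # dropout, if enabled, would mask this output with a binomial mask
    return act.activate(T.dot(x, W.T) + b)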
def __init__(self, filters, kernel_size, stride=1, padding='same', activation=None): """ Params: filters: Number of Filters kernel_size: shape of the kernel stride: the stride padding: valid or same activation: activation function """ self.filters = filters num_weights = kernel_size[0] * kernel_size[1] self.kernel_size = kernel_size self.weights = None self.bias = None self.padding = (kernel_size[0] - 1) // 2 if padding == 'same' else 0 self.stride = stride self.output_units = [] self.activation = Activation(activation)
def __init__(self, in_dim, activation, hidden_dim=None, transform_gate="sigmoid", prefix="", initializer=default_initializer, dropout=0, verbose=True): # By construction the dimensions of in_dim and out_dim have to match, and hence W_T and W_H are square matrices. if hidden_dim is not None: assert in_dim == hidden_dim if verbose: logger.debug('Building {}...'.format(self.__class__.__name__)) super(HighwayLayer, self).__init__(in_dim, in_dim, activation, prefix, initializer, dropout, verbose) self.transform_gate = Activation(transform_gate) self.W_H, self.W_H.name = self.W, prefix + "W_H" self.b_H, self.b_H.name = self.b, prefix + "b_H" self.W_T = shared_rand_matrix((self.hidden_dim, self.in_dim), prefix + 'W_T', initializer) self.b_T = shared_zero_matrix((self.hidden_dim,), prefix + 'b_T') self.params = [self.W_H, self.W_T, self.b_H, self.b_T] self.norm_params = [self.W_H, self.W_T] self.l1_norm = T.sum([T.sum(T.abs_(param)) for param in self.norm_params]) self.l2_norm = T.sum([T.sum(param ** 2) for param in self.norm_params]) if verbose: logger.debug('Architecture of {} built finished'.format(self.__class__.__name__)) logger.debug('Input dimension: %d' % self.in_dim) logger.debug('Hidden dimension: %d' % self.hidden_dim) logger.debug('Activation Func: %s' % self.act.method) logger.debug('Transform Gate: %s' % self.transform_gate.method) logger.debug('Dropout Rate: %f' % self.dropout)
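For reference, the tied input/output dimensions above imply the standard highway transform y = T(x) * H(x) + (1 - T(x)) * x. A hedged sketch (not the class's actual forward method), reusing the `activate` convention seen elsewhere in this code:

import theano.tensor as T

def highway_forward(x, W_H, b_H, W_T, b_T, act, transform_gate):
    h = act.activate(T.dot(x, W_H.T) + b_H)               # candidate H(x)
    t = transform_gate.activate(T.dot(x, W_T.T) + b_T)    # transform gate T(x)
    return t * h + (1.0 - t) * x                          # carry gate fixed to 1 - T(x)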
def __init__(self, entity_dim, relation_num, activation='iden', initializer=default_initializer, prefix='', verbose=True):
    super(TransEModel, self).__init__()
    self.entity_dim = entity_dim
    self.relation_num = relation_num
    # Relation translation vectors: (relation_num, entity_dim)
    self.W = shared_rand_matrix((relation_num, self.entity_dim), prefix + 'TransE_R', initializer)
    self.act = Activation(activation)
    self.params = [self.W]
    self.norm_params = [self.W]
    self.l1_norm = T.sum(T.abs_(self.W))
    self.l2_norm = T.sum(self.W ** 2)
    if verbose:
        logger.debug('Architecture of TransE Model built finished, summarized as below:')
        logger.debug('Entity Dimension: %d' % self.entity_dim)
        logger.debug('Relation Number: %d' % self.relation_num)
        logger.debug('Initializer: %s' % initializer)
        logger.debug('Activation: %s' % activation)
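The constructor only stores per-relation translation vectors; a hedged sketch of the TransE-style score they suggest (h + r ≈ t), not the model's actual scoring code:

import theano.tensor as T

def transe_score(head, tail, relation_index, W):
    # head, tail: (entity_dim,) entity embeddings; W: (relation_num, entity_dim)
    r = W[relation_index]
    return -T.sum(T.abs_(head + r - tail))   # negative L1 distance, larger is better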
def __init__(self, configs=None, verbose=True): ''' Basic RNN is an unsupervised component, where the input is a sequence and the output is a vector with fixed length ''' if verbose: pprint('Build Recurrent Neural Network...') self.input = T.matrix(name='input', dtype=floatX) self.learn_rate = T.scalar(name='learn rate') # Configure activation function self.act = Activation(configs.activation) fan_in = configs.num_input fan_out = configs.num_hidden # Initialize all the variables in RNN, including: # 1, Feed-forward matrix, feed-forward bias, W, W_b # 2, Recurrent matrix, recurrent bias, U, U_b self.W = theano.shared(value=np.asarray(np.random.uniform( low=-np.sqrt(6.0 / (fan_in + fan_out)), high=np.sqrt(6.0 / (fan_in + fan_out)), size=(fan_in, fan_out)), dtype=floatX), name='W', borrow=True) self.U = theano.shared(value=np.asarray(np.random.uniform( low=-np.sqrt(6.0 / (fan_out + fan_out)), high=np.sqrt(6.0 / (fan_out + fan_out)), size=(fan_out, fan_out)), dtype=floatX), name='U', borrow=True) # Bias parameter for the hidden-layer encoder of RNN self.b = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='b', borrow=True) # h[0], zero vector self.h0 = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='h0', borrow=True) # Save all the parameters self.params = [self.W, self.U, self.b, self.h0] # recurrent function used to compress a sequence of input vectors # the first dimension should correspond to time def step(x_t, h_tm1): h_t = self.act.activate(T.dot(x_t, self.W) + \ T.dot(h_tm1, self.U) + self.b) return h_t # h is the hidden representation over a time sequence self.hs, _ = theano.scan(fn=step, sequences=self.input, outputs_info=[self.h0], truncate_gradient=configs.bptt) self.h = self.hs[-1] # L1, L2 regularization self.L1_norm = T.sum(T.abs_(self.W) + T.abs_(self.U)) self.L2_norm = T.sum(self.W**2) + T.sum(self.U**2) # Compress function self.compress = theano.function(inputs=[self.input], outputs=self.h)
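A hypothetical usage of the compiled `compress` function above; the enclosing class name `RNNEncoder` and the `configs` values are assumptions:

import numpy as np

rnn = RNNEncoder(configs=configs)                                  # assumed class name
sequence = np.random.randn(20, configs.num_input).astype(floatX)   # (time, num_input), time-major
vec = rnn.compress(sequence)                                       # fixed-length vector, shape (num_hidden,)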
def train(dataset): config_options = globals.config task_path = config_options.get("Data", dataset) loss = config_options.get('Train', 'loss') activation = config_options.get('Train', 'activation') if dataset == "classify": Xtrain = z_norm(load_mnist_X(task_path + "classf_Xtrain.txt")) Xtest = z_norm(load_mnist_X(task_path + "classf_Xtest.txt")) Xval = z_norm(load_mnist_X(task_path + "classf_XVal.txt")) ytrain = load_mnist_Y(task_path + "classf_ytrain.txt") ytest = load_mnist_Y(task_path + "classf_ytest.txt") yval = load_mnist_Y(task_path + "classf_yVal.txt") elif dataset == "regression": Xtrain = z_norm(load_regression_X(task_path + "regr_Xtrain.txt")) Xtest = z_norm(load_regression_X(task_path + "regr_Xtest.txt")) Xval = z_norm(load_regression_X(task_path + "regr_Xval.txt")) ytrain = load_regression_Y(task_path + "regr_ytrain.txt") ytest = load_regression_Y(task_path + "regr_ytest.txt") yval = load_regression_Y(task_path + "regr_yval.txt") else: logger.warning("Invalid task.") return logger.info("Load data complete.") # build model N, input_dim = Xtrain.shape model = Model() model.add(Layer(output_dim=globals.layer_dim, input_dim=input_dim)) model.add(Activation(activation=activation)) model.add(Layer(output_dim=globals.output_dim)) model.compile(loss=loss) history = model.fit(Xtrain, ytrain, batch_size=N, iterations=globals.iterations, validation_data=(Xval, yval)) # save result result_dir = config_options.get('Result', 'result-dir') file_name = "_".join([ dataset, activation, str(globals.alpha), str(globals.lam), str(globals.layer_dim), str(globals.iterations) ]) + ".txt" file_path = result_dir + file_name writeFile(file_path, "") for datum in history: datum = [str(x) for x in datum] line = "\t".join(datum) + "\n" writeFile(file_path, line, 'a') print model.loss.mse(Xval, yval) print model.loss.mse(Xtest, ytest)
def __init__(self, in_dim, hidden_dim, kernel_sizes=[3, 4, 5], padding='same', pooling='max', dilation_rate=1.0, activation='relu', prefix="", initializer=GlorotUniformInitializer(), dropout=0.0, verbose=True): """ Init Function for ConvolutionLayer :param in_dim: :param hidden_dim: :param kernel_sizes: :param padding: 'same', 'valid' :param pooling: 'max', 'mean', 'min' :param dilation_rate: :param activation: :param prefix: :param initializer: :param dropout: :param verbose: """ if verbose: logger.debug('Building {}...'.format(self.__class__.__name__)) self.conv_layers = list() self.in_dim = in_dim self.out_dim = hidden_dim * len(kernel_sizes) self.hidden_dim = hidden_dim self.kernel_sizes = kernel_sizes self.padding = padding self.dilation_rate = dilation_rate self.pooling = pooling self.dropout = dropout self.act = Activation(activation) self.params = list() self.norm_params = list() # L1, L2 Norm self.l1_norm = 0 self.l2_norm = 0 for filter_hs in kernel_sizes: self.conv_layers.append(ConvolutionLayer(in_dim=self.in_dim, hidden_dim=hidden_dim, kernel_size=filter_hs, padding=self.padding, pooling=self.pooling, dilation_rate=self.dilation_rate, activation=activation, prefix=prefix+"filter%s_" % filter_hs, initializer=initializer, dropout=dropout, verbose=verbose)) self.params += self.conv_layers[-1].params self.norm_params += self.conv_layers[-1].norm_params self.l1_norm += self.conv_layers[-1].l1_norm self.l2_norm += self.conv_layers[-1].l2_norm if verbose: logger.debug('Architecture of {} built finished'.format(self.__class__.__name__)) logger.debug('Input dimension: %d' % self.in_dim) logger.debug('Filter Num (Hidden): %d' % self.hidden_dim) logger.debug('Kernel Size (Windows): %s' % self.kernel_sizes) logger.debug('Padding method : %s' % self.padding) logger.debug('Dilation Rate : %s' % self.dilation_rate) logger.debug('Pooling method : %s' % self.pooling) logger.debug('Activation Func: %s' % self.act.method) logger.debug('Dropout Rate: %f' % self.dropout)
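The constructor wires one ConvolutionLayer per kernel size; a hedged sketch of how their pooled outputs are typically concatenated to give `out_dim = hidden_dim * len(kernel_sizes)` (the sub-layer `forward` name is an assumption, not the class's real API):

import theano.tensor as T

def multi_kernel_forward(x, conv_layers):
    # each sub-layer pools over time to (batch, hidden_dim);
    # concatenating along the feature axis matches self.out_dim above
    return T.concatenate([layer.forward(x) for layer in conv_layers], axis=1)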
def __init__(self, in_dim, hidden_dim, initializer=default_initializer, normalize=True, dropout=0, reconstructe=True, activation="tanh", verbose=True):
    """
    :param in_dim: input dimension
    :param hidden_dim: hidden dimension
    :param initializer: random weight initializer
    :param normalize: whether to normalize the composed representation
    :param dropout: dropout rate
    :param activation: activation function
    :param verbose: whether to emit debug-level log output
    :return:
    """
    self.in_dim = in_dim
    self.out_dim = hidden_dim
    self.hidden_dim = hidden_dim
    assert self.in_dim == self.hidden_dim
    self.initializer = initializer
    self.normalize = normalize
    self.dropout = dropout
    self.verbose = verbose
    self.act = Activation(activation)
    # Composition Function Weight
    # (dim, 2 * dim)
    self.W = shared_rand_matrix((self.hidden_dim, 2 * self.in_dim), 'W', initializer=initializer)
    # (dim, )
    self.b = shared_zero_matrix((self.hidden_dim, ), 'b')
    # Reconstruction Function Weight
    # (2 * dim, dim)
    self.Wr = shared_rand_matrix((2 * self.in_dim, self.hidden_dim), 'Wr', initializer=initializer)
    # (2 * dim, )
    self.br = shared_zero_matrix((self.in_dim * 2, ), 'br')
    self.params = [self.W, self.b, self.Wr, self.br]
    self.norm_params = [self.W, self.Wr]
    self.l1_norm = sum([T.sum(T.abs_(param)) for param in self.norm_params])
    self.l2_norm = sum([T.sum(param ** 2) for param in self.norm_params])
    if verbose:
        logger.debug('Architecture of RAE built finished, summarized as below: ')
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Normalize: %s' % self.normalize)
        logger.debug('Activation: %s' % self.act)
        logger.debug('Dropout Rate: %s' % self.dropout)
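A hedged sketch of the composition and reconstruction steps these weights imply (a standard recursive autoencoder); the class's own methods are not shown here:

import theano.tensor as T

def compose(c1, c2, W, b, act, normalize=True):
    # c1, c2: (in_dim,) children; W: (hidden_dim, 2 * in_dim)
    p = act.activate(T.dot(W, T.concatenate([c1, c2])) + b)
    if normalize:
        p = p / T.sqrt(T.sum(p ** 2))   # unit-length parent, as the `normalize` flag suggests
    return p

def reconstruct(p, Wr, br, act):
    # (2 * in_dim,) reconstruction, to be split back into the two children
    return act.activate(T.dot(Wr, p) + br)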
def __init__(self, verbose=True): if verbose: logger.debug('Build Multilayer Perceptron Ranking model...') # Positive input setting self.inputPL = T.matrix(name='inputPL', dtype=floatX) self.inputPR = T.matrix(name='inputPR', dtype=floatX) # Negative input setting self.inputNL = T.matrix(name='inputNL', dtype=floatX) self.inputNR = T.matrix(name='inputNR', dtype=floatX) # Standard input setting self.inputL = T.matrix(name='inputL', dtype=floatX) self.inputR = T.matrix(name='inputR', dtype=floatX) # Build activation function self.act = Activation('tanh') # Connect input matrices self.inputP = T.concatenate([self.inputPL, self.inputPR], axis=1) self.inputN = T.concatenate([self.inputNL, self.inputNR], axis=1) self.input = T.concatenate([self.inputL, self.inputR], axis=1) # Build hidden layer self.hidden_layer = HiddenLayer(self.input, (2*edim, args.hidden), act=self.act) self.hidden = self.hidden_layer.output self.hiddenP = self.hidden_layer.encode(self.inputP) self.hiddenN = self.hidden_layer.encode(self.inputN) # Dropout parameter #srng = T.shared_randomstreams.RandomStreams(args.seed) #mask = srng.binomial(n=1, p=1-args.dropout, size=self.hidden.shape) #maskP = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenP.shape) #maskN = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenN.shape) #self.hidden *= T.cast(mask, floatX) #self.hiddenP *= T.cast(maskP, floatX) #self.hiddenN *= T.cast(maskN, floatX) # Build linear output layer self.score_layer = ScoreLayer(self.hidden, args.hidden) self.output = self.score_layer.output self.scoreP = self.score_layer.encode(self.hiddenP) self.scoreN = self.score_layer.encode(self.hiddenN) # Stack all the parameters self.params = [] self.params += self.hidden_layer.params self.params += self.score_layer.params # Build cost function self.cost = T.mean(T.maximum(T.zeros_like(self.scoreP), 1.0-self.scoreP+self.scoreN)) # Construct the gradient of the cost function with respect to the model parameters self.gradparams = T.grad(self.cost, self.params) # Count the total number of parameters in this model self.num_params = edim * args.hidden + args.hidden + args.hidden + 1 # Build class method self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output) self.compute_cost_and_gradient = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=self.gradparams+[self.cost, self.scoreP, self.scoreN]) self.show_scores = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.scoreP, self.scoreN]) if verbose: logger.debug('Architecture of MLP Ranker built finished, summarized below: ') logger.debug('Input dimension: %d' % edim) logger.debug('Hidden dimension: %d' % args.hidden) logger.debug('Total number of parameters used in the model: %d' % self.num_params)
def __init__(self, input_size, output_size, hidden_size, n_layers, act_type): ''' Multilayer Perceptron ---------------------- :param input_size: dimension of input features :param output_size: dimension of output features :param hidden_size: a list containing hidden size for each hidden layer :param n_layers: number of layers :param act_type: type of activation function for each hidden layer, can be none, sigmoid, tanh, or relu ''' super(MLP, self).__init__() # total layer number should be hidden layer number + 1 (output layer) assert len( hidden_size ) + 1 == n_layers, 'total layer number should be hidden layer number + 1' # define the activation function by activation function in activations.py self.act = Activation(act_type) # initialize a list to save layers layers = nn.ModuleList() if n_layers == 1: # if n_layers == 1, MLP degenerates to a Linear layer layer = Linear(input_size, output_size) # append the layer into layers layers.append(layer) layers.append(self.act) # TODO 4: Finish MLP with at least 2 layers else: # step 1: initialize the input layer layer = Linear(input_size, hidden_size[0]) # step 2: append the input layer and the activation layer into layers layers.append(layer) layers.append(self.act) # step 3: construct the hidden layers and add it to layers for i in range(1, n_layers - 1): #initialize a hidden layer and activation layer # hint: Noting that the output size of a hidden layer is hidden_size[i], so what is its input size? layer = Linear(hidden_size[i - 1], hidden_size[i]) layers.append(layer) layers.append(self.act) # step 4: initialize the output layer and append the layer into layers # hint: what is the output size of the output layer? # hint: here we do not need activation layer layer = Linear(hidden_size[-1], output_size) layers.append(layer) # End TODO 4 #Use nn.Sequential to get the neural network self.net = nn.Sequential(*layers)
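An example instantiation with hypothetical sizes, assuming the module's forward pass simply applies `self.net`:

import torch

# 3 layers total: 784 -> 256 -> 128 (hidden) -> 10 (output)
model = MLP(input_size=784, output_size=10, hidden_size=[256, 128], n_layers=3, act_type='relu')
x = torch.randn(32, 784)
logits = model(x)   # shape (32, 10)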
def __init__(self, in_dim, hidden_dim, pooling, activation='tanh', gates=("sigmoid", "sigmoid", "sigmoid"), prefix="", initializer=OrthogonalInitializer(), dropout=0, verbose=True):
    if verbose:
        logger.debug('Building {}...'.format(self.__class__.__name__))
    super(LSTMEncoder, self).__init__(in_dim, hidden_dim, pooling, activation, dropout)
    self.in_gate, self.forget_gate, self.out_gate = Activation(gates[0]), Activation(gates[1]), Activation(gates[2])
    # W [in, forget, output, recurrent] (4 * hidden, in)
    self.W = shared_rand_matrix((self.hidden_dim * 4, self.in_dim), prefix + 'W', initializer)
    # U [in, forget, output, recurrent] (4 * hidden, hidden)
    self.U = shared_rand_matrix((self.hidden_dim * 4, self.hidden_dim), prefix + 'U', initializer)
    # b [in, forget, output, recurrent] (4 * hidden,)
    self.b = shared_zero_matrix((self.hidden_dim * 4, ), prefix + 'b')
    self.params = [self.W, self.U, self.b]
    self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
    self.l2_norm = T.sum(self.W ** 2) + T.sum(self.U ** 2)
    if verbose:
        logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
        logger.debug('Input dimension: %d' % self.in_dim)
        logger.debug('Hidden dimension: %d' % self.hidden_dim)
        logger.debug('Pooling methods: %s' % self.pooling)
        logger.debug('Activation Func: %s' % self.act.method)
        logger.debug('Input Gate: %s' % self.in_gate.method)
        logger.debug('Forget Gate: %s' % self.forget_gate.method)
        logger.debug('Output Gate: %s' % self.out_gate.method)
        logger.debug('Dropout Rate: %f' % self.dropout)
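A hedged sketch of one recurrence step using the stacked gate parameters above, with the slice order following the `[in, forget, output, recurrent]` comment; this is not the class's actual scan step:

import theano.tensor as T

def lstm_step(x_t, h_tm1, c_tm1, W, U, b, in_gate, forget_gate, out_gate, act, hidden_dim):
    pre = T.dot(W, x_t) + T.dot(U, h_tm1) + b                 # (4 * hidden_dim,)
    i = in_gate.activate(pre[0 * hidden_dim:1 * hidden_dim])
    f = forget_gate.activate(pre[1 * hidden_dim:2 * hidden_dim])
    o = out_gate.activate(pre[2 * hidden_dim:3 * hidden_dim])
    g = act.activate(pre[3 * hidden_dim:4 * hidden_dim])      # candidate ("recurrent") slice
    c_t = f * c_tm1 + i * g
    h_t = o * act.activate(c_t)
    return h_t, c_t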
def __init__(self, in_dim, hidden_dim, pooling, activation='tanh', dropout=0): self.in_dim = in_dim self.out_dim = hidden_dim self.hidden_dim = hidden_dim self.pooling = pooling self.dropout = dropout self.act = Activation(activation)
def __init__(self, in_dim, hidden_dim, kernel_size=3, padding='same', pooling='max', dilation_rate=1.0, activation='relu', prefix="", initializer=GlorotUniformInitializer(), dropout=0.0, verbose=True): """ Init Function for ConvolutionLayer :param in_dim: :param hidden_dim: :param kernel_size: :param padding: 'same', 'valid' :param pooling: 'max', 'mean', 'min' :param dilation_rate: :param activation: :param prefix: :param initializer: :param dropout: :param verbose: """ if verbose: logger.debug('Building {}...'.format(self.__class__.__name__)) self.in_dim = in_dim self.out_dim = hidden_dim self.hidden_dim = hidden_dim self.kernel_size = kernel_size self.padding = padding self.dilation_rate = dilation_rate self.pooling = pooling self.dropout = dropout self.act = Activation(activation) self.padding_size = int(self.dilation_rate * (self.kernel_size - 1)) # Composition Function Weight # Kernel Matrix (kernel_size, hidden, in) self.W = shared_rand_matrix((self.kernel_size, self.hidden_dim, self.in_dim), prefix + 'W', initializer) # Bias Term (hidden) self.b = shared_zero_matrix((self.hidden_dim,), prefix + 'b') self.params = [self.W, self.b] self.norm_params = [self.W] # L1, L2 Norm self.l1_norm = T.sum(T.abs_(self.W)) self.l2_norm = T.sum(self.W ** 2) if verbose: logger.debug('Architecture of {} built finished'.format(self.__class__.__name__)) logger.debug('Input dimension: %d' % self.in_dim) logger.debug('Filter Num (Hidden): %d' % self.hidden_dim) logger.debug('Kernel Size (Windows): %d' % self.kernel_size) logger.debug('Padding method : %s' % self.padding) logger.debug('Dilation Rate : %s' % self.dilation_rate) logger.debug('Padding Size : %s' % self.padding_size) logger.debug('Pooling method : %s' % self.pooling) logger.debug('Activation Func: %s' % self.act.method) logger.debug('Dropout Rate: %f' % self.dropout)
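A hedged sketch of the pooling options named in the docstring, applied over the time axis of an assumed (batch, time, hidden_dim) convolution output; the class's own pooling code is not shown:

import theano.tensor as T

def pool_over_time(conv_out, pooling):
    if pooling == 'max':
        return T.max(conv_out, axis=1)
    elif pooling == 'mean':
        return T.mean(conv_out, axis=1)
    elif pooling == 'min':
        return T.min(conv_out, axis=1)
    raise ValueError('Unknown pooling method: %s' % pooling)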
def __init__(self, name, n_inputs, n_outputs, activation=None, use_bias=True, weights=None, biases=None):
    super().__init__(name)
    self.n_inputs = n_inputs
    self.n_outputs = n_outputs
    self.use_bias = use_bias
    if activation is None:
        activation = Activation.getInitialized("tanh")
    else:
        if not Activation.isObjectRegistered(activation):
            if isinstance(activation, dict):
                activation = Activation(**activation)
            elif isinstance(activation, str):
                activation = Activation(class_name=activation)
            else:
                raise Exception("{} is not a registered activation. "
                                "Use {}".format(activation, Activation.registeredClasses()))
    self.activation = activation
    if weights is None:
        # Uniform between -1 and 1
        self.weights = (np.random.random((n_outputs, n_inputs)) * 2 - 1)
    else:
        assert isinstance(weights, np.ndarray)
        assert weights.shape == (n_outputs, n_inputs)
        self.weights = weights
    if biases is None:
        # Uniform between -0.001 and 0.001
        self.biases = (np.random.random((n_outputs, 1)) * 2 - 1) * 0.001
    else:
        assert isinstance(biases, np.ndarray)
        assert biases.shape == (n_outputs, 1)
        self.biases = biases
    # Mutation mask ... create only once.
    self.mutation_mask = np.zeros_like(self.weights)
def __init__(self, entity_dim, relation_num, activation='tanh', hidden=5, keep_normal=False, initializer=default_initializer, prefix='', verbose=True):
    super(NeuralTensorModel, self).__init__()
    self.entity_dim = entity_dim
    self.relation_num = relation_num
    self.hidden = hidden
    self.slice_seq = T.arange(hidden)
    self.keep_normal = keep_normal
    # (relation_num, entity_dim, entity_dim, hidden)
    self.W = shared_rand_matrix((relation_num, self.entity_dim, self.entity_dim, self.hidden), prefix + 'NTN_W', initializer)
    # (relation_num, hidden)
    self.U = shared_ones_matrix((relation_num, self.hidden), name=prefix + 'NTN_U')
    if keep_normal:
        # (relation_num, 2 * entity_dim, hidden)
        self.V = shared_rand_matrix((relation_num, self.entity_dim * 2, self.hidden), prefix + 'NTN_V', initializer)
        # (relation_num, hidden)
        self.b = shared_zero_matrix((relation_num, self.hidden), name=prefix + 'NTN_B')
        self.params = [self.W, self.V, self.U, self.b]
        self.norm_params = [self.W, self.V, self.U, self.b]
    else:
        self.params = [self.W]
        self.norm_params = [self.W]
    self.act = Activation(activation)
    self.l1_norm = T.sum([T.sum(T.abs_(param)) for param in self.norm_params])
    self.l2_norm = T.sum([T.sum(param ** 2) for param in self.norm_params])
    if verbose:
        logger.debug('Architecture of Tensor Model built finished, summarized as below:')
        logger.debug('Entity Dimension: %d' % self.entity_dim)
        logger.debug('Hidden Dimension: %d' % self.hidden)
        logger.debug('Relation Number: %d' % self.relation_num)
        logger.debug('Initializer: %s' % initializer)
        logger.debug('Activation: %s' % activation)
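These parameters match the Neural Tensor Network of Socher et al. (2013); a hedged per-relation scoring sketch with `keep_normal` enabled, not the model's actual implementation:

import theano.tensor as T

def ntn_score(e1, e2, W_r, V_r, b_r, u_r, act):
    # e1, e2: (entity_dim,); W_r: (entity_dim, entity_dim, hidden); V_r: (2 * entity_dim, hidden)
    bilinear = T.tensordot(e1, T.tensordot(e2, W_r, axes=[[0], [1]]), axes=[[0], [0]])  # (hidden,)
    standard = T.dot(T.concatenate([e1, e2]), V_r) + b_r                                # (hidden,)
    return T.dot(u_r, act.activate(bilinear + standard))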
def __init__(self, in_features, out_features, input_layer=False, fully_connected=True): self.in_features = in_features self.out_features = out_features self.fully_connected = fully_connected # changed from v0.0.0 # self.weights = np.random.randn(out_features, in_features) self.bias = np.random.randn(out_features) # last part for emphasis # self.next_layer = None self.prev_layer = None self.input_layer = input_layer self.variables = 0 self.activation = Activation()
def __init__(self, in_dim, hidden_dim, pooling, activation='tanh', prefix="", initializer=default_initializer, dropout=0, verbose=True): if verbose: logger.debug('Building {}...'.format(self.__class__.__name__)) super(RecurrentEncoder, self).__init__(in_dim, hidden_dim, pooling, activation, dropout) self.in_dim = in_dim self.out_dim = hidden_dim self.hidden_dim = hidden_dim self.pooling = pooling self.dropout = dropout self.act = Activation(activation) # Composition Function Weight # Feed-Forward Matrix (hidden, in) self.W = shared_rand_matrix((self.hidden_dim, self.in_dim), prefix + 'W_forward', initializer) # Bias Term (hidden) self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b_forward') # Recurrent Matrix (hidden, hidden) self.U = shared_rand_matrix((self.hidden_dim, self.hidden_dim), prefix + 'U_forward', initializer) self.params = [self.W, self.U, self.b] self.norm_params = [self.W, self.U] # L1, L2 Norm self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U)) self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2) if verbose: logger.debug('Architecture of {} built finished'.format( self.__class__.__name__)) logger.debug('Input dimension: %d' % self.in_dim) logger.debug('Hidden dimension: %d' % self.hidden_dim) logger.debug('Pooling methods: %s' % self.pooling) logger.debug('Activation Func: %s' % self.act.method) logger.debug('Dropout Rate: %f' % self.dropout)
def testAE(self): # Set parameters input = T.matrix(name='input') num_in, num_out = 784, 500 act = Activation('sigmoid') is_denoising, is_sparse = True, False lambda1 = 1e-4 mask = 0.7 rng = RandomStreams(42) start_time = time.time() ae = AutoEncoder(input, (num_in, num_out), act, is_denoising, is_sparse, lambda1, mask, rng, verbose=True) end_time = time.time() pprint('Time used to build the AutoEncoder: %f seconds.' % (end_time - start_time)) batch_size = 1000 num_batches = self.training_set.shape[0] / batch_size nepoch = 50 learn_rate = 1 start_time = time.time() for i in xrange(nepoch): rate = learn_rate for j in xrange(num_batches): train_set = self.training_set[j * batch_size:(j + 1) * batch_size, :] cost = ae.train(train_set, rate) pprint('epoch %d, batch %d, cost = %f' % (i, j, cost)) end_time = time.time() pprint('Time used for training AutoEncoder: %f seconds.' % (end_time - start_time)) image = PIL.Image.fromarray( imgutils.tile_raster_images( X=ae.encode_layer.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_%.2f.png' % mask) AutoEncoder.save('./autoencoder-mnist.model', ae)
def __init__(self, input_units, output_units, activation = None): """ Params: input_units = Number of input nodes output_units = Number of output nodes activation = The activation layer """ # self.weights = np.random.normal(0.0, 1.0/np.sqrt(input_units), (input_units, output_units)) # self.bias = np.random.normal(0.0, 1.0/np.sqrt(input_units), (1, output_units)) # self.weights = np.random.uniform(-0.01, 0.01, (input_units, output_units)) self.weights = np.linspace(-0.01, 0.01, num = input_units*output_units) self.weights = self.weights.reshape((input_units, output_units)) self.bias = np.zeros((1,output_units)) self.activation = Activation(activation) # Initialize Other Things as Zero self.output_units = None self.grad_weights = 0 self.grad_bias = 0
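A hedged numpy sketch of the forward and backward passes that `grad_weights` and `grad_bias` suggest; these are not the class's actual methods, and the stored Activation would be applied on top of the affine output:

import numpy as np

def linear_forward(x, weights, bias):
    # x: (batch, input_units) -> (batch, output_units), pre-activation
    return np.dot(x, weights) + bias

def linear_backward(x, weights, grad_output):
    grad_weights = np.dot(x.T, grad_output)              # accumulated into self.grad_weights
    grad_bias = grad_output.sum(axis=0, keepdims=True)   # accumulated into self.grad_bias
    grad_input = np.dot(grad_output, weights.T)          # passed back to the previous layer
    return grad_weights, grad_bias, grad_input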
def __init__(self, entity_dim, relation_num, hidden=50, activation='tanh', initializer=default_initializer, prefix='', verbose=True): super(SingleLayerModel, self).__init__() self.hidden = hidden self.entity_dim = entity_dim self.relation_num = relation_num # (relation_num, k, entity_dim) self.W_1 = shared_rand_matrix( (relation_num, self.hidden, self.entity_dim), prefix + 'SingleLayer_W1', initializer) # (relation_num, k, entity_dim) self.W_2 = shared_rand_matrix( (relation_num, self.hidden, self.entity_dim), prefix + 'SingleLayer_W2', initializer) # (relation_num, k, ) self.u = shared_ones_matrix(( relation_num, self.hidden, ), prefix + 'SingleLayer_u') self.act = Activation(activation) self.params = [self.W_1, self.W_2, self.u] self.norm_params = [self.W_1, self.W_2, self.u] self.l1_norm = T.sum(T.abs_(self.W_1)) + T.sum(T.abs_( self.W_2)) + T.sum(T.abs_(self.u)) self.l2_norm = T.sum(self.W_1**2) + T.sum(self.W_2**2) + T.sum(self.u** 2) if verbose: logger.debug( 'Architecture of Single Layer Model built finished, summarized as below:' ) logger.debug('Entity Dimension: %d' % self.entity_dim) logger.debug('Hidden Dimension: %d' % self.hidden) logger.debug('Relation Number: %d' % self.relation_num) logger.debug('Initializer: %s' % initializer) logger.debug('Activation: %s' % activation)
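These parameters correspond to the single-layer (non-tensor) baseline from the same family of knowledge-base models; a hedged per-relation scoring sketch, not the class's actual code:

import theano.tensor as T

def single_layer_score(e1, e2, W1_r, W2_r, u_r, act):
    # W1_r, W2_r: (hidden, entity_dim); e1, e2: (entity_dim,); u_r: (hidden,)
    return T.dot(u_r, act.activate(T.dot(W1_r, e1) + T.dot(W2_r, e2)))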
def __init__(self, word_dim, seq_dim, hidden_dim, activation='tanh', initializer=default_initializer):
    # Pass the caller-supplied initializer through to the base class instead of always using the default
    super(NNWordBasedAttention, self).__init__(word_dim=word_dim, seq_dim=seq_dim, initializer=initializer)
    self.hidden_dim = hidden_dim
    # W: (word_dim, hidden_dim), U: (seq_dim, hidden_dim), v: (hidden_dim,)
    self.W = shared_rand_matrix((self.word_dim, self.hidden_dim), 'Attention_W', initializer)
    self.U = shared_rand_matrix((self.seq_dim, self.hidden_dim), 'Attention_U', initializer)
    self.v = shared_rand_matrix((self.hidden_dim, ), 'Attention_v', initializer)
    self.act = Activation(activation)
    # Register all three attention weights so they are trained and regularized
    self.params = [self.W, self.U, self.v]
    self.norm_params = [self.W, self.U, self.v]
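A hedged sketch of the additive attention energies these weights suggest (a softmax over them would give the attention distribution); the class's own scoring method is not shown:

import theano.tensor as T

def attention_energies(words, seq, W, U, v, act):
    # words: (n_words, word_dim); seq: (seq_dim,) summary vector
    hidden = act.activate(T.dot(words, W) + T.dot(seq, U))   # (n_words, hidden_dim)
    return T.dot(hidden, v)                                  # unnormalized energies, shape (n_words,)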
def __init__(self, configs=None, verbose=True):
    '''
    @config: CNNConfiger. Configer used to set the architecture of CNN.
    '''
    if verbose:
        pprint("Building Convolutional Neural Network...")
    # Make theano symbolic tensors for input and ground-truth label
    self.input = T.tensor4(name='input', dtype=floatX)
    self.truth = T.ivector(name='label')
    self.learn_rate = T.scalar(name='learn rate')
    self.batch_size = configs.batch_size
    self.image_row = configs.image_row
    self.image_col = configs.image_col
    # There may be multiple convolution-pooling layers and multilayer perceptrons.
    self.convpool_layers = []
    self.hidden_layers = []
    self.softmax_layers = []
    # Configure activation function
    self.act = Activation(configs.activation)
    # Configuration should be valid
    assert configs.num_convpool == len(configs.convs)
    assert configs.num_convpool == len(configs.pools)
    assert configs.num_hidden == len(configs.hiddens)
    assert configs.num_softmax == len(configs.softmaxs)
    # Construct random number generator
    srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
    # Build architecture of CNN
    # Convolution and Pooling layers
    image_shapes, filter_shapes = [], []
    for i in xrange(configs.num_convpool):
        if i == 0:
            image_shapes.append((self.batch_size, 1, self.image_row, self.image_col))
            filter_shapes.append((configs.convs[i][0], 1, configs.convs[i][1], configs.convs[i][2]))
        else:
            image_shapes.append((self.batch_size, configs.convs[i - 1][0],
                                 (image_shapes[i - 1][2] - configs.convs[i - 1][1] + 1) / configs.pools[i - 1][0],
                                 (image_shapes[i - 1][3] - configs.convs[i - 1][2] + 1) / configs.pools[i - 1][1]))
            filter_shapes.append((configs.convs[i][0], configs.convs[i - 1][0], configs.convs[i][1], configs.convs[i][2]))
    for i in xrange(configs.num_convpool):
        if i == 0:
            current_input = self.input
        else:
            current_input = self.convpool_layers[i - 1].output
        self.convpool_layers.append(LeNetConvPoolLayer(input=current_input, filter_shape=filter_shapes[i],
                                                       image_shape=image_shapes[i], poolsize=configs.pools[i],
                                                       act=self.act))
    # Multilayer perceptron layers
    for i in xrange(configs.num_hidden):
        if i == 0:
            current_input = T.flatten(self.convpool_layers[configs.num_convpool - 1].output, 2)
        else:
            current_input = self.hidden_layers[i - 1].output
        # Adding dropout to hidden layers: mask the layer's output tensor
        hidden_layer = HiddenLayer(current_input, configs.hiddens[i], act=self.act)
        mask = srng.binomial(n=1, p=1 - configs.dropout, size=hidden_layer.output.shape)
        hidden_layer.output = hidden_layer.output * T.cast(mask, floatX)
        self.hidden_layers.append(hidden_layer)
    # Softmax layer; in most cases the architecture will only contain one softmax layer
    for i in xrange(configs.num_softmax):
        if i == 0:
            current_input = self.hidden_layers[configs.num_hidden - 1].output
        else:
            current_input = self.softmax_layers[i - 1].output
        self.softmax_layers.append(SoftmaxLayer(current_input, configs.softmaxs[i]))
    # Output
    self.pred = self.softmax_layers[configs.num_softmax - 1].prediction()
    # Cost function with ground truth provided
    self.cost = self.softmax_layers[configs.num_softmax - 1].NLL_loss(self.truth)
    # Stack all the parameters
    self.params = []
    for convpool_layer in self.convpool_layers:
        self.params.extend(convpool_layer.params)
    for hidden_layer in self.hidden_layers:
        self.params.extend(hidden_layer.params)
    for softmax_layer in self.softmax_layers:
        self.params.extend(softmax_layer.params)
    # Compute gradient of self.cost with respect to network parameters
    self.gradparams = T.grad(self.cost, self.params)
    # Stochastic gradient descent learning algorithm
    self.updates = []
    for param, gradparam in zip(self.params, self.gradparams):
        self.updates.append((param, param - self.learn_rate * gradparam))
    # Build objective function
    self.objective = theano.function(inputs=[self.input, self.truth, self.learn_rate],
                                     outputs=self.cost, updates=self.updates)
    # Build prediction function
    self.predict = theano.function(inputs=[self.input], outputs=self.pred)
    if verbose:
        pprint('Architecture building finished, summarized as below: ')
        pprint('There are %d layers (not including the input layer) altogether: '
               % (configs.num_convpool * 2 + configs.num_hidden + configs.num_softmax))
        pprint('%d convolution layers + %d maxpooling layers.' % (len(self.convpool_layers), len(self.convpool_layers)))
        pprint('%d hidden layers.' % (len(self.hidden_layers)))
        pprint('%d softmax layers.' % (len(self.softmax_layers)))
        pprint('=' * 50)
        pprint('Detailed architecture of each layer: ')
        pprint('-' * 50)
        pprint('Convolution and Pooling layers: ')
        for i in xrange(len(self.convpool_layers)):
            pprint('Convolution Layer %d: ' % i)
            pprint('%d feature maps, each with a filter kernel of size (%d, %d)'
                   % (configs.convs[i][0], configs.convs[i][1], configs.convs[i][2]))
        pprint('-' * 50)
        pprint('Hidden layers: ')
        for i in xrange(len(self.hidden_layers)):
            pprint('Hidden Layer %d: ' % i)
            pprint('Input dimension: %d, Output dimension: %d' % (configs.hiddens[i][0], configs.hiddens[i][1]))
        pprint('-' * 50)
        pprint('Softmax layers: ')
        for i in xrange(len(self.softmax_layers)):
            pprint('Softmax Layer %d: ' % i)
            pprint('Input dimension: %d, Output dimension: %d' % (configs.softmaxs[i][0], configs.softmaxs[i][1]))
def __init__(self, config=None, verbose=True): # Construct two GrCNNEncoders for matching two sentences self.encoderL = GrCNNEncoder(config, verbose) self.encoderR = GrCNNEncoder(config, verbose) # Link the parameters of two parts self.params = [] self.params += self.encoderL.params self.params += self.encoderR.params # Build three kinds of inputs: # 1, inputL, inputR. This pair is used for computing the score after training # 2, inputPL, inputPR. This part is used for training positive pairs # 3, inputNL, inputNR. This part is used for training negative pairs self.inputL = self.encoderL.input self.inputR = self.encoderR.input # Positive self.inputPL = T.matrix(name='inputPL', dtype=floatX) self.inputPR = T.matrix(name='inputPR', dtype=floatX) # Negative self.inputNL = T.matrix(name='inputNL', dtype=floatX) self.inputNR = T.matrix(name='inputNR', dtype=floatX) # Linking input-output mapping self.hiddenL = self.encoderL.output self.hiddenR = self.encoderR.output # Positive self.hiddenPL = self.encoderL.encode(self.inputPL) self.hiddenPR = self.encoderR.encode(self.inputPR) # Negative self.hiddenNL = self.encoderL.encode(self.inputNL) self.hiddenNR = self.encoderR.encode(self.inputNR) # Activation function self.act = Activation(config.activation) # MLP Component self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=1) self.hiddenP = T.concatenate([self.hiddenPL, self.hiddenPR], axis=1) self.hiddenN = T.concatenate([self.hiddenNL, self.hiddenNR], axis=1) # Build hidden layer self.hidden_layer = HiddenLayer( self.hidden, (2 * config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output self.compressed_hiddenP = self.hidden_layer.encode(self.hiddenP) self.compressed_hiddenN = self.hidden_layer.encode(self.hiddenN) # Accumulate parameters self.params += self.hidden_layer.params # Dropout parameter srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hidden.shape) maskP = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hiddenP.shape) maskN = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hiddenN.shape) self.compressed_hidden *= T.cast(mask, floatX) self.compressed_hiddenP *= T.cast(maskP, floatX) self.compressed_hiddenN *= T.cast(maskN, floatX) # Score layers self.score_layer = ScoreLayer(self.compressed_hidden, config.num_mlp) self.output = self.score_layer.output self.scoreP = self.score_layer.encode(self.compressed_hiddenP) self.scoreN = self.score_layer.encode(self.compressed_hiddenN) # Accumulate parameters self.params += self.score_layer.params # Build cost function self.cost = T.mean( T.maximum(T.zeros_like(self.scoreP), 1.0 - self.scoreP + self.scoreN)) # Construct the gradient of the cost function with respect to the model parameters self.gradparams = T.grad(self.cost, self.params) # Compute the total number of parameters in the model self.num_params_encoder = config.num_input * config.num_hidden + \ config.num_hidden * config.num_hidden * 2 + \ config.num_hidden + \ config.num_hidden * 3 * 2 + \ 3 self.num_params_encoder *= 2 self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + \ config.num_mlp + \ config.num_mlp + 1 self.num_params = self.num_params_encoder + self.num_params_classifier # Build class methods self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output) self.compute_cost_and_gradient = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, 
self.inputNR], outputs=self.gradparams + [self.cost, self.scoreP, self.scoreN]) self.show_scores = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.scoreP, self.scoreN]) self.show_hiddens = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.hiddenP, self.hiddenN]) self.show_inputs = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR]) if verbose: logger.debug( 'Architecture of GrCNNMatchScorer built finished, summarized below: ' ) logger.debug('Input dimension: %d' % config.num_input) logger.debug( 'Hidden dimension inside GrCNNMatchScorer pyramid: %d' % config.num_hidden) logger.debug('Hidden dimension MLP: %d' % config.num_mlp) logger.debug('There are 2 GrCNNEncoders used in model.') logger.debug('Total number of parameters used in the model: %d' % self.num_params)
def __init__(self, config, verbose=True): ''' @config: GRCNNConfiger. Configer used to set the architecture of GRCNNEncoder. ''' self.encoder = GrCNNEncoder(config, verbose) # Link two parts self.input = self.encoder.input # Activation function self.act = Activation(config.activation) # Extract the hierarchical representation, the pyramids, from the encoder # Combine the original time series and the compressed time series self.pyramids = self.encoder.pyramids self.pyramids = T.concatenate([ self.encoder.hidden0.dimshuffle('x', 0, 1), self.encoder.pyramids ]) self.nsteps = self.pyramids.shape[0] # Use another scan function to compress each hierarchical representation # into the vector representation self.hierarchies, _ = theano.scan( fn=self._step_compress, sequences=[T.arange(self.nsteps, 0, -1), self.pyramids]) # Global classifier, MLP, mixture of experts self.hidden_layer = HiddenLayer(self.hierarchies, (config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) # Adding dropout support self.hidden = self.hidden_layer.output srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1 - config.dropout, size=self.hidden.shape) self.hidden *= T.cast(mask, floatX) # Connect the hidden layer after dropout to a logistic output layer self.output_layer = LogisticLayer(self.hidden, config.num_mlp) self.experts = self.output_layer.output # Global weighting mechanism, voting weights self.weight_layer = theano.shared( name='Weighting vector', value=np.random.rand(config.num_hidden).astype(floatX)) self.weights = T.nnet.softmax( T.dot(self.hierarchies, self.weight_layer)) # Compute the total number of parameters in the model self.num_params = self.encoder.num_params + self.hidden_layer.num_params + \ self.output_layer.num_params + config.num_hidden # Final decision, bagging self.score = T.sum(T.flatten(self.experts) * T.flatten(self.weights)) # Prediction for classification self.pred = self.score >= 0.5 # Stack all the parameters self.params = [] self.params += self.encoder.params self.params += self.hidden_layer.params self.params += self.output_layer.params self.params += [self.weight_layer] # Build objective function for binary classification problem self.truth = T.iscalar(name='label') self.cost = -self.truth * T.log((self.score+np.finfo(float).eps) / (1+2*np.finfo(float).eps)) - \ (1-self.truth) * T.log((1.0-self.score+np.finfo(float).eps) / (1+2*np.finfo(float).eps)) ## Weight Decay if config.weight_decay: self.regularizer = self.encoder.L2_loss() + self.hidden_layer.L2_loss() + \ self.output_layer.L2_loss() + T.sum(self.weight_layer ** 2) self.regularizer *= config.weight_decay_parameter self.cost += self.regularizer # Construct gradient vectors self.gradparams = T.grad(self.cost, self.params) # Construct gradient for the input matrix, fine-tuning self.input_grads = T.grad(self.cost, self.input) # Build and compile theano functions self.predict = theano.function(inputs=[self.input], outputs=self.pred) self.bagging = theano.function(inputs=[self.input], outputs=self.score) self.compute_gradient_and_cost = theano.function( inputs=[self.input, self.truth], outputs=self.gradparams + [self.cost, self.pred]) self.compute_input_gradient = theano.function( inputs=[self.input, self.truth], outputs=self.input_grads) # Theano functions for debugging purposes self.show_weights = theano.function(inputs=[self.input], outputs=self.weights) self.show_scores = theano.function(inputs=[self.input], outputs=self.experts) self.show_hierarchy = 
theano.function(inputs=[self.input], outputs=self.hierarchies) self.show_prob = theano.function(inputs=[self.input], outputs=self.score) self.show_cost = theano.function(inputs=[self.input, self.truth], outputs=self.cost) if verbose: logger.debug('GrCNNBagger built finished...') logger.debug( 'Hierarchical structure of GrCNN for classification...') logger.debug('Total number of parameters in the model: %d' % self.num_params)
images_path = './data/training_images' annotations_path = './data/annotations' classes_file = './data/classes.txt' X, y = prepare_dataset(images_path, annotations_path, classes_file) '''TRAINING PROCEDURE''' from models_final import Sequential from convolutions_final import Conv2D from normalizations import BatchNormalization from poolings import MaxPool2D from dense_final import Flatten, Dense from activations import Activation model = Sequential() model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer1", X.shape)) model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer1")) model.add(Activation('relu')) model.add(BatchNormalization(1, 0, 1e-5)) model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer2")) model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer2")) model.add(Activation('relu')) model.add(BatchNormalization(1, 0, 1e-5)) model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer3")) model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer3")) model.add(Activation('relu')) model.add(BatchNormalization(1, 0, 1e-5)) model.add(Conv2D(10, (3, 3), 1, "valid", "convLayer4")) model.add(MaxPool2D((2, 2), 2, "valid", "poolLayer4")) model.add(Activation('relu'))
def testTrain(self): ''' Train Auto-Encoder + SoftmaxLayer on batch learning mode. ''' input_dim, hidden_dim = self.max_length * self.word_embedding.embedding_dim( ), 500 # Build AutoEncoder + SoftmaxLayer start_time = time.time() seed = 1991 input_matrix = T.matrix(name='input') num_in, num_out = input_dim, hidden_dim act = Activation('tanh') is_denoising, is_sparse = True, False lambda1, mask = 1e-4, 0.5 rng = RandomStreams(seed) sent_model = SentModel(input_matrix, (num_in, num_out), act, is_denoising, is_sparse, lambda1, mask, rng, verbose=True) end_time = time.time() pprint('Time used to build the model: %f seconds.' % (end_time - start_time)) # Loading training data and start batch training mode num_batch = self.num_sent / self.batch_size learn_rate = 0.1 # Pretraining pprint('Start pretraining...') start_time = time.time() for i in xrange(self.nepoch): # Batch training pprint('Training epoch: %d' % i) for j in xrange(num_batch): train_set = np.zeros( (self.batch_size, self.max_length * self.word_embedding.embedding_dim()), dtype=floatX) train_txt = self.train_txt[j * self.batch_size:(j + 1) * self.batch_size] for k, sent in enumerate(train_txt): words = sent.split() vectors = np.asarray( [self.word_embedding.wordvec(word) for word in words]) vectors = vectors.flatten() train_set[k, :vectors.shape[0]] = vectors rate = learn_rate cost = sent_model.pretrain(train_set, rate) if (j + 1) % 500 == 0: pprint('Training epoch: %d, Number batch: %d, cost = %f' % (i, j, cost)) # Saving temporary pretraining model in .gz with gzip.GzipFile('./large_pretrain.sent.gz', 'wb') as fout: cPickle.dump(sent_model, fout) end_time = time.time() pprint('Time used for pretraining: %f minutes.' % ((end_time - start_time) / 60.0)) # Fine tuning pprint('Start fine-tuning...') start_time = time.time() for i in xrange(self.nepoch): # Batch training pprint('Training epoch: %d' % i) for j in xrange(num_batch): train_set = np.zeros( (self.batch_size, self.max_length * self.word_embedding.embedding_dim()), dtype=floatX) train_txt = self.train_txt[j * self.batch_size:(j + 1) * self.batch_size] for k, sent in enumerate(train_txt): words = sent.split() vectors = np.asarray( [self.word_embedding.wordvec(word) for word in words]) vectors = vectors.flatten() train_set[k, :vectors.shape[0]] = vectors rate = learn_rate cost = sent_model.finetune(train_set, rate) if (j + 1) % 500 == 0: pprint('Training epoch: %d, Number batch: %d, cost = %f' % (i, j, cost)) # Saving temporary fine-tuning model in .gz with gzip.GzipFile('./large_finetune.sent.gz', 'wb') as fout: cPickle.dump(sent_model, fout) end_time = time.time() pprint('Time used for fine-tuning: %f minutes.' % ((end_time - start_time) / 60.0))
def __init__(self, config, verbose=True): # Construct two BRNNEncoders for matching two sentences self.encoderL = BRNNEncoder(config, verbose) self.encoderR = BRNNEncoder(config, verbose) # Link two parts self.params = [] self.params += self.encoderL.params self.params += self.encoderR.params # Set up input # Note that there are three kinds of inputs altogether, including: # 1, inputL, inputR. This pair is used for computing the score after training # 2, inputPL, inputPR. This pair is used for training positive pairs # 3, inputNL, inputNR. This pair is used for training negative pairs self.inputL = self.encoderL.input self.inputR = self.encoderR.input # Positive self.inputPL = T.matrix(name='inputPL', dtype=floatX) self.inputPR = T.matrix(name='inputPR', dtype=floatX) # Negative self.inputNL = T.matrix(name='inputNL', dtype=floatX) self.inputNR = T.matrix(name='inputNR', dtype=floatX) # Get output of two BRNNEncoders self.hiddenL = self.encoderL.output self.hiddenR = self.encoderR.output # Positive Hidden self.hiddenPL = self.encoderL.encode(self.inputPL) self.hiddenPR = self.encoderR.encode(self.inputPR) # Negative Hidden self.hiddenNL = self.encoderL.encode(self.inputNL) self.hiddenNR = self.encoderR.encode(self.inputNR) # Activation function self.act = Activation(config.activation) self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=0) self.hiddenP = T.concatenate([self.hiddenPL, self.hiddenPR], axis=0) self.hiddenN = T.concatenate([self.hiddenNL, self.hiddenNR], axis=0) # Build hidden layer self.hidden_layer = HiddenLayer( self.hidden, (4 * config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output self.compressed_hiddenP = self.hidden_layer.encode(self.hiddenP) self.compressed_hiddenN = self.hidden_layer.encode(self.hiddenN) # Accumulate parameters self.params += self.hidden_layer.params # Dropout parameter srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hidden.shape) maskP = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hiddenP.shape) maskN = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hiddenN.shape) self.compressed_hidden *= T.cast(mask, floatX) self.compressed_hiddenP *= T.cast(maskP, floatX) self.compressed_hiddenN *= T.cast(maskN, floatX) # Score layer self.score_layer = ScoreLayer(self.compressed_hidden, config.num_mlp) self.output = self.score_layer.output self.scoreP = self.score_layer.encode(self.compressed_hiddenP) self.scoreN = self.score_layer.encode(self.compressed_hiddenN) # Accumulate parameters self.params += self.score_layer.params # Build cost function self.cost = T.mean( T.maximum(T.zeros_like(self.scoreP), 1.0 - self.scoreP + self.scoreN)) # Construct the total number of parameters in the model self.gradparams = T.grad(self.cost, self.params) # Compute the total number of parameters in the model self.num_params_encoder = self.encoderL.num_params + self.encoderR.num_params self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + config.num_mlp + \ config.num_mlp + 1 self.num_params = self.num_params_encoder + self.num_params_classifier # Build class functions self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output) # Compute the gradient of the objective function and cost and prediction self.compute_cost_and_gradient = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=self.gradparams + [self.cost, 
self.scoreP, self.scoreN]) # Output function for debugging purpose self.show_scores = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.scoreP, self.scoreN]) self.show_hiddens = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.hiddenP, self.hiddenN]) if verbose: logger.debug( 'Architecture of BRNNMatchScorer built finished, summarized below: ' ) logger.debug('Input dimension: %d' % config.num_input) logger.debug('Hidden dimension of RNN: %d' % config.num_hidden) logger.debug('Hidden dimension of MLP: %d' % config.num_mlp) logger.debug('There are 2 BRNNEncoders used in the model.') logger.debug('Total number of parameters in this model: %d' % self.num_params)
def __init__(self, config, verbose=True): # Construct two BRNNEncoders for matching two sentences self.encoderL = BRNNEncoder(config, verbose) self.encoderR = BRNNEncoder(config, verbose) # Link two parts self.params = [] self.params += self.encoderL.params self.params += self.encoderR.params # Set up input self.inputL = self.encoderL.input self.inputR = self.encoderR.input # Get output of two BRNNEncoders self.hiddenL = self.encoderL.output self.hiddenR = self.encoderR.output # Activation function self.act = Activation(config.activation) # MLP Component self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=0) self.hidden_layer = HiddenLayer( self.hidden, (4 * config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output # Accumulate parameters self.params += self.hidden_layer.params # Dropout parameter srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hidden.shape) self.compressed_hidden *= T.cast(mask, floatX) # Logistic regression self.logistic_layer = LogisticLayer(self.compressed_hidden, config.num_mlp) self.output = self.logistic_layer.output self.pred = self.logistic_layer.pred # Accumulate parameters self.params += self.logistic_layer.params # Compute the total number of parameters in the model self.num_params_encoder = self.encoderL.num_params + self.encoderR.num_params self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + config.num_mlp + \ config.num_mlp + 1 self.num_params = self.num_params_encoder + self.num_params_classifier # Build target function self.truth = T.ivector(name='label') self.cost = self.logistic_layer.NLL_loss(self.truth) # Build computational graph and compute the gradients of the model parameters # with respect to the cost function self.gradparams = T.grad(self.cost, self.params) # Compile theano function self.objective = theano.function( inputs=[self.inputL, self.inputR, self.truth], outputs=self.cost) self.predict = theano.function(inputs=[self.inputL, self.inputR], outputs=self.pred) # Compute the gradient of the objective function and cost and prediction self.compute_cost_and_gradient = theano.function( inputs=[self.inputL, self.inputR, self.truth], outputs=self.gradparams + [self.cost, self.pred]) # Output function for debugging purpose self.show_hidden = theano.function(inputs=[self.inputL, self.inputR], outputs=self.hidden) self.show_compressed_hidden = theano.function( inputs=[self.inputL, self.inputR], outputs=self.compressed_hidden) self.show_output = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output) if verbose: logger.debug( 'Architecture of BRNNMatcher built finished, summarized below: ' ) logger.debug('Input dimension: %d' % config.num_input) logger.debug('Hidden dimension of RNN: %d' % config.num_hidden) logger.debug('Hidden dimension of MLP: %d' % config.num_mlp) logger.debug('Number of parameters in the encoder part: %d' % self.num_params_encoder) logger.debug('Number of parameters in the classifier: %d' % self.num_params_classifier) logger.debug('Total number of parameters in this model: %d' % self.num_params)
def __init__(self, config, verbose=True): if verbose: logger.debug('Building Bidirectional RNN Encoder...') self.input = T.matrix(name='BRNNEncoder_input') # Configure Activation function self.act = Activation(config.activation) # Build Bidirectional RNN num_input, num_hidden = config.num_input, config.num_hidden self.num_params = 2 * (num_input * num_hidden + num_hidden * num_hidden + num_hidden) # Initialize model parameters np.random.seed(config.random_seed) # 1, Feed-forward matrix for forward direction: W_forward W_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden)) W_forward_val = W_forward_val.astype(floatX) self.W_forward = theano.shared(value=W_forward_val, name='W_forward', borrow=True) # 1, Feed-forward matrix for backward direction: W_backward W_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden)) W_backward_val = W_backward_val.astype(floatX) self.W_backward = theano.shared(value=W_backward_val, name='W_backward', borrow=True) # 2, Recurrent matrix for forward direction: U_forward U_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden)) U_forward_val = U_forward_val.astype(floatX) U_forward_val, _, _ = np.linalg.svd(U_forward_val) self.U_forward = theano.shared(value=U_forward_val, name='U_forward', borrow=True) # 2, Recurrent matrix for backward direction: U_backward U_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden)) U_backward_val = U_backward_val.astype(floatX) U_backward_val, _, _ = np.linalg.svd(U_backward_val) self.U_backward = theano.shared(value=U_backward_val, name='U_backward', borrow=True) # 3, Bias parameter for the hidden-layer forward direction RNN b_forward_val = np.zeros(num_hidden, dtype=floatX) self.b_forward = theano.shared(value=b_forward_val, name='b_forward', borrow=True) # 3, Bias parameter for the hidden-layer backward direction RNN b_backward_val = np.zeros(num_hidden, dtype=floatX) self.b_backward = theano.shared(value=b_backward_val, name='b_backward', borrow=True) # h[0], zero vectors, treated as constants self.h0_forward = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h0_forward', borrow=True) self.h0_backward = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h0_backward', borrow=True) # Stack all the parameters self.params = [ self.W_forward, self.W_backward, self.U_forward, self.U_backward, self.b_forward, self.b_backward ] # Compute the forward and backward representation over time self.h_forwards, _ = theano.scan(fn=self._forward_step, sequences=self.input, outputs_info=[self.h0_forward], truncate_gradient=config.bptt) self.h_backwards, _ = theano.scan(fn=self._backward_step, sequences=self.input, outputs_info=[self.h0_backward], truncate_gradient=config.bptt, go_backwards=True) # Average compressing self.h_forward = T.mean(self.h_forwards, axis=0) self.h_backward = T.mean(self.h_backwards, axis=0) # Concatenate self.output = T.concatenate([self.h_forward, self.h_backward], axis=0) # L1, L2 regularization self.L1_norm = T.sum( T.abs_(self.W_forward) + T.abs_(self.W_backward) + T.abs_(self.U_forward) + T.abs_(self.U_backward)) self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \ T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2) if verbose: logger.debug( 'Finished constructing the structure of BRNN Encoder: ') logger.debug('Size of the input dimension: %d' % num_input) logger.debug('Size of the hidden dimension: %d' % num_hidden) logger.debug('Activation function: 
%s' % config.activation)
def __init__(self, configs, verbose=True):
    if verbose:
        pprint('Building tied-weights Bidirectional Recurrent Neural Network')
    self.input = T.matrix(name='input')
    self.truth = T.ivector(name='label')
    self.learn_rate = T.scalar(name='learn rate')
    # Configure activation function
    self.act = Activation(configs.activation)
    # Build bidirectional RNN with tied weights
    num_input, num_hidden, num_class = configs.num_input, configs.num_hidden, configs.num_class
    # Stack all parameters into a single flat vector so that batch update algorithms can be applied.
    # Since the RNN runs in two directions, every weight matrix and bias associated with the RNN is duplicated.
    num_params = 2 * (num_input * num_hidden +
                      num_hidden * num_hidden +
                      num_hidden) + \
                 2 * num_hidden * num_class + \
                 num_class
    self.num_params = num_params
    self.theta = theano.shared(value=np.zeros(num_params, dtype=floatX),
                               name='theta', borrow=True)
    # Incremental index into theta
    param_idx = 0
    # 1. Feed-forward matrix for the forward direction: W_forward
    self.W_forward = self.theta[param_idx:param_idx + num_input * num_hidden].reshape((num_input, num_hidden))
    self.W_forward.name = 'W_forward_RNN'
    W_forward_init = np.asarray(np.random.uniform(
        low=-np.sqrt(6.0 / (num_input + num_hidden)),
        high=np.sqrt(6.0 / (num_input + num_hidden)),
        size=(num_input, num_hidden)), dtype=floatX)
    param_idx += num_input * num_hidden
    # 1. Feed-forward matrix for the backward direction: W_backward
    self.W_backward = self.theta[param_idx:param_idx + num_input * num_hidden].reshape((num_input, num_hidden))
    self.W_backward.name = 'W_backward_RNN'
    W_backward_init = np.asarray(np.random.uniform(
        low=-np.sqrt(6.0 / (num_input + num_hidden)),
        high=np.sqrt(6.0 / (num_input + num_hidden)),
        size=(num_input, num_hidden)), dtype=floatX)
    param_idx += num_input * num_hidden
    # 2. Recurrent matrix for the forward direction: U_forward
    self.U_forward = self.theta[param_idx:param_idx + num_hidden * num_hidden].reshape((num_hidden, num_hidden))
    self.U_forward.name = 'U_forward_RNN'
    U_forward_init = np.asarray(np.random.uniform(
        low=-np.sqrt(6.0 / (num_hidden + num_hidden)),
        high=np.sqrt(6.0 / (num_hidden + num_hidden)),
        size=(num_hidden, num_hidden)), dtype=floatX)
    param_idx += num_hidden * num_hidden
    # 2. Recurrent matrix for the backward direction: U_backward
    self.U_backward = self.theta[param_idx:param_idx + num_hidden * num_hidden].reshape((num_hidden, num_hidden))
    self.U_backward.name = 'U_backward_RNN'
    U_backward_init = np.asarray(np.random.uniform(
        low=-np.sqrt(6.0 / (num_hidden + num_hidden)),
        high=np.sqrt(6.0 / (num_hidden + num_hidden)),
        size=(num_hidden, num_hidden)), dtype=floatX)
    param_idx += num_hidden * num_hidden
    # 3. Bias vector for the forward-direction hidden layer
    self.b_forward = self.theta[param_idx:param_idx + num_hidden]
    self.b_forward.name = 'b_forward_RNN'
    b_forward_init = np.zeros(num_hidden, dtype=floatX)
    param_idx += num_hidden
    # 3. Bias vector for the backward-direction hidden layer
    self.b_backward = self.theta[param_idx:param_idx + num_hidden]
    self.b_backward.name = 'b_backward_RNN'
    b_backward_init = np.zeros(num_hidden, dtype=floatX)
    param_idx += num_hidden
    # Weight matrix for the softmax classifier
    self.W_softmax = self.theta[param_idx:param_idx + 2 * num_hidden * num_class].reshape((2 * num_hidden, num_class))
    self.W_softmax.name = 'W_softmax'
    W_softmax_init = np.asarray(np.random.uniform(
        low=-np.sqrt(6.0 / (2 * num_hidden + num_class)),
        high=np.sqrt(6.0 / (2 * num_hidden + num_class)),
        size=(2 * num_hidden, num_class)), dtype=floatX)
    param_idx += 2 * num_hidden * num_class
    # Bias vector for the softmax classifier
    self.b_softmax = self.theta[param_idx:param_idx + num_class]
    self.b_softmax.name = 'b_softmax'
    b_softmax_init = np.zeros(num_class, dtype=floatX)
    param_idx += num_class
    # Write all the initial parameter values into theta
    self.theta.set_value(np.concatenate([
        x.ravel() for x in (W_forward_init, W_backward_init,
                            U_forward_init, U_backward_init,
                            b_forward_init, b_backward_init,
                            W_softmax_init, b_softmax_init)
    ]))
    assert param_idx == num_params
    # h[0]: zero vectors, treated as constants
    self.h_start = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h_start', borrow=True)
    self.h_end = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h_end', borrow=True)

    # Recurrent step functions used to compress a sequence of input vectors;
    # the first dimension of the input corresponds to time.
    def forward_step(x_t, h_tm1):
        h_t = self.act.activate(T.dot(x_t, self.W_forward) +
                                T.dot(h_tm1, self.U_forward) + self.b_forward)
        return h_t

    def backward_step(x_t, h_tm1):
        h_t = self.act.activate(T.dot(x_t, self.W_backward) +
                                T.dot(h_tm1, self.U_backward) + self.b_backward)
        return h_t

    # Forward and backward representations over time
    self.forward_h, _ = theano.scan(fn=forward_step, sequences=self.input,
                                    outputs_info=[self.h_start],
                                    truncate_gradient=configs.bptt)
    self.backward_h, _ = theano.scan(fn=backward_step, sequences=self.input,
                                     outputs_info=[self.h_end],
                                     truncate_gradient=configs.bptt,
                                     go_backwards=True)
    # Sequence representation: mean over time
    # (the commented-out lines keep the final hidden state instead)
    # self.h_start_star = self.forward_h[-1]
    # self.h_end_star = self.backward_h[-1]
    self.h_start_star = T.mean(self.forward_h, axis=0)
    self.h_end_star = T.mean(self.backward_h, axis=0)
    # L1, L2 regularization (summed per matrix, since the matrices have different shapes)
    self.L1_norm = T.sum(T.abs_(self.W_forward)) + T.sum(T.abs_(self.W_backward)) + \
                   T.sum(T.abs_(self.U_forward)) + T.sum(T.abs_(self.U_backward)) + \
                   T.sum(T.abs_(self.W_softmax))
    self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
                   T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2) + \
                   T.sum(self.W_softmax ** 2)
    # Functions that expose the learned representation of a sentence
    self.show_forward = theano.function(inputs=[self.input], outputs=self.h_start_star)
    self.show_backward = theano.function(inputs=[self.input], outputs=self.h_end_star)
    ##################################################################################
    # Correlated BRNN
    ##################################################################################
    # Concatenate the two directional representations into one vector
    self.h = T.concatenate([self.h_start_star, self.h_end_star], axis=0)
    # Dropout mask (applied unconditionally: no inverted-dropout rescaling and no separate test-time graph)
    srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
    mask = srng.binomial(n=1, p=1 - configs.dropout, size=self.h.shape)
    self.h *= T.cast(mask, floatX)
    # Use the concatenated vector as input to the softmax classifier
    self.output = T.nnet.softmax(T.dot(self.h, self.W_softmax) + self.b_softmax)
    self.pred = T.argmax(self.output, axis=1)
    # Build the cost function: negative log-likelihood, plus an optional L2 penalty
    self.cost = -T.mean(T.log(self.output)[T.arange(self.truth.shape[0]), self.truth])
    if configs.regularization:
        self.cost += configs.lambda1 * self.L2_norm
    # Gradients with respect to the parameters and to the input
    self.gradtheta = T.grad(self.cost, self.theta)
    self.gradinput = T.grad(self.cost, self.input)
    # Compute the cost together with the gradient of the parameters
    self.compute_cost_and_gradient = theano.function(inputs=[self.input, self.truth],
                                                     outputs=[self.cost, self.gradtheta])
    # Compute the gradient with respect to the input
    self.compute_input_gradient = theano.function(inputs=[self.input, self.truth],
                                                  outputs=self.gradinput)
    # Build the prediction function
    self.predict = theano.function(inputs=[self.input], outputs=self.pred)
    if verbose:
        pprint('*' * 50)
        pprint('Finished constructing Bidirectional Recurrent Neural Network (BRNN)')
        pprint('Size of input dimension: %d' % configs.num_input)
        pprint('Size of hidden/recurrent dimension: %d' % configs.num_hidden)
        pprint('Size of output dimension: %d' % configs.num_class)
        pprint('Is regularization applied? %s' % ('yes' if configs.regularization else 'no'))
        if configs.regularization:
            pprint('Coefficient of regularization term: %f' % configs.lambda1)
        pprint('BPTT step: %d' % configs.bptt)
        pprint('Number of free parameters in BRNN: %d' % self.num_params)
        pprint('*' * 50)
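For context, the sketch below shows how this constructor might be driven end to end. It is an assumption-laden illustration, not part of the original source: the `BRNNConfigs` container, its field values, the class name `TiedBRNN`, and the single manual gradient step are all placeholders. The grounded point is that, because every parameter lives in the flat `theta` vector, `compute_cost_and_gradient` returns exactly the cost/gradient pair a batch optimizer, or a hand-written update like the one below, needs.

import numpy as np
import theano

class BRNNConfigs(object):                  # hypothetical stand-in for the real `configs` object
    num_input = 50                          # word-vector dimension
    num_hidden = 64                         # recurrent state dimension
    num_class = 2                           # number of output labels
    activation = 'tanh'
    dropout = 0.0                           # 0 keeps predict() deterministic, since the mask is always applied
    random_seed = 1234
    bptt = -1                               # -1 backpropagates through the whole sequence
    regularization = True
    lambda1 = 1e-4

configs = BRNNConfigs()
brnn = TiedBRNN(configs)                    # hypothetical class name; only its __init__ is shown above

# One sentence is a (time_steps, num_input) matrix; the label is a length-1 int32 vector.
sentence = np.random.randn(12, configs.num_input).astype(theano.config.floatX)
label = np.asarray([1], dtype=np.int32)

# One manual gradient step on the flat parameter vector; the same cost/gradient pair
# could instead be handed to a full-batch optimizer such as L-BFGS.
cost, grad = brnn.compute_cost_and_gradient(sentence, label)
brnn.theta.set_value(brnn.theta.get_value() - 0.01 * grad)
print(cost, brnn.predict(sentence))

Packing everything into one shared vector is what the "batch updating algorithm" comment in the constructor refers to: the optimizer only ever sees a single flat array of length num_params.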