import sys

import numpy as np
import theano
import theano.tensor as T

# NOTE: ReLU, LeNetConvPoolLayer, MLPDropout, shared_dataset and
# sgd_updates_adadelta are assumed to be imported/defined elsewhere in this
# project; they are not defined in this file.


class SentenceCNN(object):
    """Convolutional network over sentences represented as 'images' of stacked
    word vectors: parallel n-gram convolution + max-pooling layers feeding a
    dropout MLP classifier, trained with adadelta."""

    def __init__(self, U, sentence_dim=200, wv_dim=100, ngram_filters=[3, 4, 5],
                 dropout=[0.5], hidden=[100, 1], activations=[ReLU], batch_size=50):
        super(SentenceCNN, self).__init__()
        self.rng = np.random.RandomState(3435)

        # -- U is a matrix with U.shape = (size_vocab, wv_dim)
        self.U = U
        # -- the maximum sentence length...
        self.sentence_dim = sentence_dim
        self.wv_dim = wv_dim
        # -- list of filter sizes we want to consider...
        self.ngram_filters = ngram_filters
        self.dropout = dropout
        self.hidden = hidden
        self.batch_size = batch_size
        self.feature_maps = hidden[0]

        filter_shapes = []
        pool_sizes = []
        # -- get the conv parameters for each ngram...
        for ngram in ngram_filters:
            # -- we want to look at (ngram, wv_dim) sized patches...
            filter_shapes.append((self.feature_maps, 1, ngram, wv_dim))
            # -- pool over everything the filter produces (the second term is
            #    always 1, since the filters span the full word-vector width)
            pool_sizes.append((sentence_dim - ngram + 1, wv_dim - wv_dim + 1))

        # -- define the model architecture
        # -- this is the index of the minibatch within the dataset...
        self.index = T.lscalar()
        # -- this is the matrix of indices for words in a sentence...
        self.x = T.matrix('x')
        # -- this is the vector of target values...
        self.y = T.ivector('y')
        # self.y = T.fvector('y')
        # self.y = T.matrix('y')

        # -- initialize our word vectors!
        self.Words = theano.shared(value=self.U, name="Words")
        self.zero_vec_tensor = T.vector()
        self.zero_vec = np.zeros(wv_dim).astype('float32')
        # -- function that resets row 0 (the padding word) to the zero vector
        self.set_zero = theano.function(
            [self.zero_vec_tensor],
            updates=[(self.Words, T.set_subtensor(self.Words[0, :], self.zero_vec_tensor))]
        )

        # -- make the actual image from the word vectors!
        self.sentence_image = self.Words[T.cast(self.x.flatten(), dtype="int32")].reshape(
            (self.x.shape[0], 1, self.x.shape[1], self.Words.shape[1])
        )

        # -- build one conv/pool branch per filter size
        self.conv_layers = []
        self.conv_output_buffer = []

        CONVOLUTION_NONLINEARITY = 'relu'
        STATIC_WV = False

        for i in xrange(len(ngram_filters)):
            # -- get the filter sizes for this particular ngram filter.
            filter_shape = filter_shapes[i]
            pool_size = pool_sizes[i]
            conv_layer = LeNetConvPoolLayer(
                rng=self.rng,
                input=self.sentence_image,
                image_shape=(self.batch_size, 1, self.sentence_dim, self.wv_dim),
                filter_shape=filter_shape,
                poolsize=pool_size,
                non_linear=CONVOLUTION_NONLINEARITY
            )
            conv_out = conv_layer.output.flatten(2)
            self.conv_layers.append(conv_layer)
            self.conv_output_buffer.append(conv_out)

        # -- concatenate the parallel branch outputs into one tensor!
        self.conv_outputs = T.concatenate(self.conv_output_buffer, 1)

        # -- the flattened, concatenated conv outputs form the MLP input layer
        self.hidden[0] = self.feature_maps * len(ngram_filters)

        # self.fully_connected = MLPDropout(self.rng, input=self.conv_outputs,
        #                                   layer_sizes=self.hidden, activations=activations,
        #                                   dropout_rates=dropout, classifier=False)
        self.fully_connected = MLPDropout(
            self.rng,
            input=self.conv_outputs,
            layer_sizes=self.hidden,
            activations=activations,
            dropout_rates=dropout,
            classifier=True
        )

        # -- define parameters of the model and update functions using adadelta
        self.params = self.fully_connected.params
        for conv_layer in self.conv_layers:
            self.params += conv_layer.params
        if not STATIC_WV:
            # if word vectors are allowed to change, add them as model parameters
            self.params += [self.Words]

        lr_decay = 0.95
        sqr_norm_lim = 9

        # now, need to hack away at the MLP class...
        self.cost = self.fully_connected.cost(self.y)
        self.dropout_cost = self.fully_connected.dropout_cost(self.y)
        self.grad_updates = sgd_updates_adadelta(self.params, self.dropout_cost,
                                                 lr_decay, 1e-6, sqr_norm_lim)

    def fit(self, X, y, validation, n_epochs=5):
        '''
        `validation` is a *mandatory* tuple of length 2, with elements (X_val, y_val)
        '''
        sys.stdout.write('Building datasets...')
        X_val, y_val = validation
        np.random.seed(3435)
        batch_size = self.batch_size

        # -- pad the training data with random duplicates so it divides evenly into batches
        if X.shape[0] % batch_size > 0:
            extra_data_num = batch_size - X.shape[0] % batch_size
            # -- get random ix from first dimension
            shuffle_ix = np.random.permutation(X.shape[0])
            extra_X = X[shuffle_ix[:extra_data_num]]
            extra_y = y[shuffle_ix[:extra_data_num]]
            X_training = np.append(X, extra_X, axis=0)
            y_training = np.append(y, extra_y, axis=0)
        else:
            X_training = X
            y_training = y

        # -- shuffle the (padded) training data
        shuffle_ix = np.random.permutation(X_training.shape[0])
        X_training = X_training[shuffle_ix]
        y_training = y_training[shuffle_ix]

        # -- find the number of batches we can train on...
        n_batches = X_training.shape[0] / batch_size
        # -- hold out the last ~10% of batches as an internal dev set
        n_train_batches = int(np.round(n_batches * 0.9))

        val_set_x = X_val
        val_set_y = y_val

        sys.stdout.write('done.\nCopying to GPU/CPU shared env...')
        # -- get our training and dev sets...
        train_set_x, train_set_y = shared_dataset((
            X_training[:n_train_batches * batch_size, :],
            y_training[:n_train_batches * batch_size]
        ))
        dev_set_x, dev_set_y = shared_dataset((
            X_training[n_train_batches * batch_size:, :],
            y_training[n_train_batches * batch_size:]
        ))
        n_dev_batches = n_batches - n_train_batches

        # compile theano functions to get train/dev errors
        sys.stdout.write('done.\nCompiling symbolic graph...')
        # -- gets the error on the dev set...
        validate_model = theano.function(
            [self.index],
            self.fully_connected.errors(self.y),
            givens={
                self.x: dev_set_x[self.index * batch_size: (self.index + 1) * batch_size],
                self.y: dev_set_y[self.index * batch_size: (self.index + 1) * batch_size]
            }
        )
        # -- gets the error on the training set...
        test_model = theano.function(
            [self.index],
            self.fully_connected.errors(self.y),
            givens={
                self.x: train_set_x[self.index * batch_size: (self.index + 1) * batch_size],
                self.y: train_set_y[self.index * batch_size: (self.index + 1) * batch_size]
            }
        )
        # -- actually trains the model!
        train_model = theano.function(
            [self.index],
            self.cost,
            updates=self.grad_updates,
            givens={
                self.x: train_set_x[self.index * batch_size: (self.index + 1) * batch_size],
                self.y: train_set_y[self.index * batch_size: (self.index + 1) * batch_size]
            }
        )

        sys.stdout.write('done.\nBuilding predictive model...')
        # -- a separate graph that runs the trained layers over the held-out
        #    validation set in a single batch
        test_pred_layers = []
        test_size = val_set_x.shape[0]
        test_layer0_input = self.Words[T.cast(self.x.flatten(), dtype="int32")].reshape(
            (test_size, 1, self.sentence_dim, self.Words.shape[1])
        )
        for conv_layer in self.conv_layers:
            test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
            test_pred_layers.append(test_layer0_output.flatten(2))
        test_layer1_input = T.concatenate(test_pred_layers, 1)
        test_y_pred = self.fully_connected.predict(test_layer1_input)
        test_error = self.fully_connected.cost(self.y)
        # -- function to test the model on the held-out validation set.
        test_model_all = theano.function([self.x, self.y], test_error)
        sys.stdout.write('done.\n')

        # start training over mini-batches
        print 'Starting training...'
        epoch = 0
        # -- val_perf is a mean error rate, so lower is better; start from +inf
        best_val_perf = np.inf
        val_perf = 0
        test_perf = 0
        cost_epoch = 0
        shuffle_batch = True

        while (epoch < n_epochs):
            epoch = epoch + 1
            if shuffle_batch:
                for minibatch_index in np.random.permutation(range(n_train_batches)):
                    cost_epoch = train_model(minibatch_index)
                    # -- keep the padding word (index 0) pinned to the zero vector
                    self.set_zero(self.zero_vec)
            else:
                for minibatch_index in xrange(n_train_batches):
                    cost_epoch = train_model(minibatch_index)
                    self.set_zero(self.zero_vec)

            train_losses = [test_model(i) for i in xrange(n_train_batches)]
            train_perf = np.mean(train_losses)
            val_losses = [validate_model(i) for i in xrange(n_dev_batches)]
            val_perf = np.mean(val_losses)
            print('epoch %i, train error %f, val error %f' % (epoch, train_perf, val_perf))

            if val_perf <= best_val_perf:
                best_val_perf = val_perf
                test_loss = test_model_all(val_set_x, val_set_y)
                test_perf = test_loss

        return test_perf
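
# ---------------------------------------------------------------------------
# The helpers referenced above (ReLU, LeNetConvPoolLayer, MLPDropout,
# shared_dataset, sgd_updates_adadelta) come from the project's layer/optimizer
# code and are not shown in this file. As a rough illustration only, minimal
# sketches of the two simplest ones, consistent with how they are used above
# (assumptions, not the project's actual implementations), could look like:
#
#     def ReLU(x):
#         return T.maximum(0.0, x)
#
#     def shared_dataset(data_xy, borrow=True):
#         data_x, data_y = data_xy
#         shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
#                                  borrow=borrow)
#         shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
#                                  borrow=borrow)
#         return shared_x, T.cast(shared_y, 'int32')
#
# Below is a hypothetical usage sketch with random toy data, just to show the
# expected shapes and the train/validate call. The sizes, the binary-label
# setup and the hidden=[100, 2] output layer are illustrative assumptions,
# not values prescribed by this module.

if __name__ == '__main__':
    vocab_size, wv_dim, sentence_dim = 5000, 100, 200

    # -- pre-trained word vectors, with row 0 reserved for the padding word
    U = np.random.randn(vocab_size, wv_dim).astype('float32')
    U[0] = 0.0

    # -- sentences encoded as padded matrices of word indices
    X = np.random.randint(0, vocab_size, size=(500, sentence_dim)).astype('float32')
    y = np.random.randint(0, 2, size=500).astype('int32')
    X_val = np.random.randint(0, vocab_size, size=(100, sentence_dim)).astype('float32')
    y_val = np.random.randint(0, 2, size=100).astype('int32')

    model = SentenceCNN(U, sentence_dim=sentence_dim, wv_dim=wv_dim,
                        ngram_filters=[3, 4, 5], dropout=[0.5],
                        hidden=[100, 2], activations=[ReLU], batch_size=50)
    held_out_error = model.fit(X, y, validation=(X_val, y_val), n_epochs=5)
    print 'held-out error: %f' % held_out_error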