def addLayer(self, idx):
    """
    :param idx: index of the layer (in the list passed to initialize the network) to be added.
        Note that 0 is the input layer.
    :return: a list of newly created variables that have to be initialized.
    """
    with tf.variable_scope('forward_variables', reuse=False):
        self.layers = self.layers[:-1]
        print 'layers len', len(self.layers)
        if len(self.layers) == 0:
            inpt = self.input
        else:
            inpt = self.layers[-1].activations
        self.layers.append(
            HiddenLayer(self.layer_dims[idx - 1], self.layer_dims[idx], inpt,
                        'layer' + str(idx)))
        self.layers.append(
            LinearLayer(self.layer_dims[-2], self.layer_dims[-1],
                        self.layers[-1].activations,
                        str(idx) + 'layerNet_output'))
        self.__buildLossGraph__()
    params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope='forward_variables/layer' + str(idx))
    params += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope='forward_variables_' + str(idx))
    params += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope='forward_variables/' + str(idx) + 'layerNet_output')
    print 'params are ', params
    self.buildEvalGraph()
    self.buildSummaryGraph()
    return params
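# Usage sketch for addLayer (an assumption, not part of the original code: `net` is an
# instance of this class and `sess` is an active tf.Session). Only the variables returned
# by addLayer need to be initialized, so previously trained weights keep their values.
new_params = net.addLayer(2)
sess.run(tf.variables_initializer(new_params))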
def __buildFullGraph__(self):
    inpt = self.input
    for idx in range(1, len(self.layer_dims) - 1):
        self.layers.append(
            HiddenLayer(self.layer_dims[idx - 1], self.layer_dims[idx], inpt,
                        'layer' + str(idx)))
        inpt = self.layers[-1].activations
    return inpt
def forward_propagation(self, x):
    """Forward propagation of a single sample."""
    tau = len(x)
    prev_h = sp.zeros(self.n_hiddens)
    cells = [None for i in range(tau)]
    for i in range(tau):
        # Compute the hidden state
        time_input = x[i]
        hidden = HiddenLayer()
        hidden.forward(self.U, time_input, self.W, prev_h, self.b)
        # Compute the output
        prev_h = hidden.h
        output = OutputLayer()
        output.forward(self.V, hidden.h, self.c)
        cells[i] = (hidden, output)
    return cells
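# Consumption sketch for forward_propagation (assumptions: `rnn` is an instance of this
# class, `x` is one input sequence, and each OutputLayer is assumed to expose its softmax
# output as an attribute, here called `o`; that attribute name is not shown above).
cells = rnn.forward_propagation(x)
predicted_labels = [output.o.argmax() for hidden, output in cells]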
def __init_layers(self, layer_spec):
    self.layers = []
    last_index = len(layer_spec) - 1
    for i, size in enumerate(layer_spec):
        if i == 0:
            self.layers.append(InputLayer(size, self.activation_fn))
        elif i == last_index:
            self.layers.append(OutputLayer(size, self.activation_fn))
        else:
            self.layers.append(HiddenLayer(size, self.activation_fn))
    for i in range(len(self.layers) - 1):
        self.__join_layers(self.layers[i], self.layers[i + 1])
def __init__(self, architecture=[784, 100, 10], activation='sigmoid',
             learning_rate=0.1, momentum=0.5, weight_decay=1e-4,
             dropout=0.5, early_stopping=True, seed=99):
    """ Neural network model initializer. """
    # Attributes
    self.architecture = architecture
    self.activation = activation
    self.learning_rate = learning_rate
    self.momentum = momentum
    self.weight_decay = weight_decay
    self.dropout = dropout
    self.early_stopping = early_stopping
    self.seed = seed

    # Turn `activation` and `learning_rate` to class instances
    if not isinstance(self.activation, Activation):
        self.activation = Activation(self.activation)
    if not isinstance(self.learning_rate, LearningRate):
        self.learning_rate = LearningRate(self.learning_rate)

    # Initialize a list of layers
    self.layers = []
    for i, (n_in, n_out) in enumerate(zip(architecture[:-2], architecture[1:-1])):
        l = HiddenLayer('layer{}'.format(i), n_in, n_out, self.activation,
                        self.learning_rate, self.momentum, self.weight_decay,
                        self.dropout, self.seed + i)
        self.layers.append(l)

    # Output layer
    n_in, n_out = architecture[-2], architecture[-1]
    l = OutputLayer('output_layer', n_in, n_out, self.learning_rate,
                    self.momentum, self.weight_decay, self.dropout,
                    self.seed + i + 1)
    self.layers.append(l)

    # Training updates
    self.epoch = 0
    self.training_error = []
    self.validation_error = []
    self.training_loss = []
    self.validation_loss = []
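# For reference, the hidden-layer loop pairs consecutive sizes from `architecture` and
# leaves the final pair to the output layer; a quick check with the default architecture:
architecture = [784, 100, 10]
print list(zip(architecture[:-2], architecture[1:-1]))  # [(784, 100)] -> one hidden layer
print (architecture[-2], architecture[-1])              # (100, 10)   -> the output layer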
def build_layers(self):
    layers = []
    for i, layer_desc in enumerate(self.layer_descriptions):
        units = self._calc_num_units(i)
        constructor_params = {
            'n_in': units[0],
            'n_out': units[1],
            'batch_size': self.batch_size,
            'k': layer_desc['k'],
            'activation': layer_desc['activation'],
            'name': 'l_layer_%d' % i
        }
        layers.append(HiddenLayer(**constructor_params))
    return layers
def process(train_source_file, train_target_file, dev_source_file,
            dev_target_file, test_source_file, test_target_predictions):
    train_source_data = get_data(train_source_file)
    train_target_data = get_data(train_target_file)
    dev_source_data = get_data(dev_source_file)
    dev_target_data = get_data(dev_target_file)
    test_source_data = get_data(test_source_file)

    source_words = set(itertools.chain(*(train_source_data + dev_source_data)))
    target_words = set(itertools.chain(*(train_target_data + dev_target_data)))
    source_word_to_idx = dict((v, i) for i, v in enumerate(source_words))
    target_word_to_idx = dict((v, i) for i, v in enumerate(target_words))
    target_idx_to_word = dict((i, v) for i, v in enumerate(target_words))

    # Preparing data
    train_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in train_source_data]
    dev_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in dev_source_data]
    train_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in train_target_data]
    dev_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in dev_target_data]
    test_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in test_source_data]

    # Changing the input numpy arrays to tensor vectors
    source_sentence = T.ivector()
    target_sentence = T.ivector()
    target_gold = T.ivector()

    source_word_embedding = 128
    target_word_embedding = 128
    source_hidden_embedding = 256
    target_hidden_embedding = 256

    hyper_params = []
    vocab_source_size = len(source_words)
    vocab_target_size = len(target_words)

    source_lookup = EmbeddingLayer(vocab_source_size, source_word_embedding)
    target_lookup = EmbeddingLayer(vocab_target_size, target_word_embedding)
    hyper_params += source_lookup.params + target_lookup.params

    source_lstm_forward = LSTM(source_word_embedding, source_hidden_embedding, with_batch=False)
    target_lstm = LSTM(256, target_hidden_embedding, with_batch=False)
    hyper_params += source_lstm_forward.params + target_lstm.params[:-1]  # Removing the last output

    tanh_layer = HiddenLayer(source_hidden_embedding, target_word_embedding, activation='tanh')
    # weighted_attention_vector + target_sentence_embedding + last encoded vector
    softmax_layer = HiddenLayer(source_hidden_embedding + target_hidden_embedding,
                                vocab_target_size, activation='softmax')
    hyper_params += softmax_layer.params

    # Getting the source and target embeddings
    source_sentence_emb = source_lookup.link(source_sentence)
    target_sentence_emb = target_lookup.link(target_sentence)
    last_h = source_lstm_forward.link(source_sentence_emb)

    # Repeating the last encoder_output for target word length times
    # First changing the last encoder_output into a row and vector and repeating target word length times
    broadcast_source_context = T.repeat(last_h.dimshuffle('x', 0),
                                        target_sentence_emb.shape[0], axis=0)
    broadcast_source_context = tanh_layer.link(broadcast_source_context)
    target_sentence_emb = T.concatenate((target_sentence_emb, broadcast_source_context), axis=1)

    target_lstm.h_0 = last_h
    target_lstm.link(target_sentence_emb)

    # Attention
    ht = target_lstm.h.dot(source_lstm_forward.h.transpose())
    # Normalizing across rows to get attention probabilities
    attention_weights = T.nnet.softmax(ht)
    # Weighted source_context_vector based on attention probabilities
    attention_weighted_vector = attention_weights.dot(source_lstm_forward.h)
    # Concatenating the hidden state from lstm and weighted source_context_vector
    pred = T.concatenate([attention_weighted_vector, target_lstm.h], axis=1)
    # Final softmax to get the best translation word
    prediction = softmax_layer.link(pred)

    # Computing the cross-entropy loss
    loss = T.nnet.categorical_crossentropy(prediction, target_gold).mean()
    updates = LearningMethod(clip=5.0).get_updates('adam', loss, hyper_params)

    # For training
    train_function = theano.function(
        inputs=[source_sentence, target_sentence, target_gold],
        outputs=loss,
        updates=updates
    )
    # For prediction
    predict_function = theano.function(
        inputs=[source_sentence, target_sentence],
        outputs=prediction,
    )

    def get_translations(source_sentences):
        translated_sentences = []
        for sentence in source_sentences:
            source_sentence = np.array(sentence).astype(np.int32)
            translated_so_far = [target_word_to_idx['<s>']]
            while True:
                next_word = predict_function(source_sentence, translated_so_far).argmax(axis=1)[-1]  # Get the last translated word
                translated_so_far.append(next_word)
                if next_word == target_word_to_idx['</s>']:
                    translated_sentences.append([target_idx_to_word[x] for x in translated_so_far])
                    break
        return translated_sentences

    iterations = 100
    batch_size = 10000
    c = 0
    best_score = -1.0 * sys.maxint
    dev_preds = []
    test_preds = []
    dev_best_preds = []
    test_best_preds = []

    for i in xrange(iterations):
        print 'Iteration {}'.format(i)
        random_indexes = range(len(train_source_data))
        np.random.shuffle(random_indexes)
        loss = []
        for sent_no, index in enumerate(random_indexes):
            src_vector = np.array(train_source_data[index]).astype(np.int32)
            tgt_vector = np.array(train_target_data[index]).astype(np.int32)
            c = train_function(src_vector, tgt_vector[:-1], tgt_vector[1:])
            loss.append(c)
            if sent_no % batch_size == 0 and sent_no > 0:
                dev_preds = get_translations(dev_source_data)
                dev_bleu_score = get_bleu(dev_preds)
                if dev_bleu_score > best_score:
                    best_score = dev_bleu_score
                    dev_best_preds = dev_preds[:]
                    # Decoding the test once the dev reaches the baseline
                    if dev_bleu_score >= 28:
                        test_preds = get_translations(test_source_data)
                        test_best_preds = test_preds[:]
                print 'Dev bleu score {}'.format(dev_bleu_score)
        print 'Iteration: {} Loss {}'.format(i, 1.0 * (sum(loss)) / len(loss))

    dev_output_fp = open('dev_output.txt', 'w')
    test_output_fp = open(test_target_predictions, 'w')
    for pred in dev_best_preds:
        dev_output_fp.write(' '.join(pred) + '\n')
    dev_output_fp.close()
    for pred in test_best_preds:
        test_output_fp.write(' '.join(pred) + '\n')
    test_output_fp.close()
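# A defensive variant of the nested get_translations helper above (a sketch: `max_len`
# is a hypothetical cap, and the same enclosing scope -- predict_function,
# target_word_to_idx, target_idx_to_word -- is assumed). It stops greedy decoding after
# a fixed number of steps so a model that never emits '</s>' cannot loop forever.
def get_translations_capped(source_sentences, max_len=50):
    translated_sentences = []
    for sentence in source_sentences:
        source_sentence = np.array(sentence).astype(np.int32)
        translated_so_far = [target_word_to_idx['<s>']]
        for _ in xrange(max_len):
            next_word = predict_function(source_sentence, translated_so_far).argmax(axis=1)[-1]
            translated_so_far.append(next_word)
            if next_word == target_word_to_idx['</s>']:
                break
        translated_sentences.append([target_idx_to_word[x] for x in translated_so_far])
    return translated_sentences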
def main():
    config = ConfigParser.ConfigParser()
    train_src = load_data(config.get("Data", "train_src"))
    dev_src = load_data(config.get("Data", "dev_src"))
    test_src = load_data(config.get("Data", "test_src"))
    train_tgt = load_data(config.get("Data", "train_tgt"))
    dev_tgt = load_data(config.get("Data", "dev_tgt"))
    test_tgt = load_data(config.get("Data", "test_tgt"))
    assert len(train_src) == len(train_tgt)

    UD_path = config.get("Path", "UD")
    sys.path.append(UD_path + "/")

    words_src = get_words(train_src + dev_src)
    words_tgt = get_words(train_tgt + dev_tgt)
    source_word2ind = {word: ind for ind, word in enumerate(words_src)}
    source_ind2word = {ind: word for ind, word in enumerate(words_src)}
    target_word2ind = {word: ind for ind, word in enumerate(words_tgt)}
    target_ind2word = {ind: word for ind, word in enumerate(words_tgt)}

    # In[24]:

    #
    # Model
    #
    src_emb_dim = 256        # source word embedding dimension
    tgt_emb_dim = 256        # target word embedding dimension
    src_lstm_hid_dim = 512   # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    proj_dim = 104           # size of the first projection layer
    dropout = 0.5            # dropout rate
    n_src = len(source_word2ind)  # number of words in the source language
    n_tgt = len(target_word2ind)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name="src_lookup")  # lookup table for source words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name="tgt_lookup")  # lookup table for target words
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_for", with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_rev", with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name="tgt_lstm", with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt,
                              name="proj_layer1", activation="softmax")
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim,
                              name="proj_layer2", activation="tanh")
    params += proj_layer1.params  # + proj_layer2.params

    # Train status
    is_train = T.iscalar("is_train")
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)
    print "src_sentence_emb", src_sentence_emb.eval({src_sentence: src_sentence_t}).shape
    print "tgt_sentence_emb", tgt_sentence_emb.eval({tgt_sentence: tgt_sentence_t}).shape

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])
    print "src_lstm_for.h", src_lstm_for.h.eval({src_sentence: src_sentence_t}).shape
    print "src_lstm_rev.h", src_lstm_rev.h.eval({src_sentence: src_sentence_t}).shape

    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)
    print "src_context", src_context.eval({src_sentence: src_sentence_t}).shape

    tgt_lstm.h_0 = src_context[-1]
    print "tgt sentence emb", tgt_sentence_emb.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
    tgt_lstm.link(tgt_sentence_emb)
    print "tgt_lstm.h", tgt_lstm.h.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    print "transition", transition.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)
    print "transition_last", transition_last.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    prediction = proj_layer1.link(transition_last)
    print "prediction", prediction.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)
    print "cost", cost.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t, tgt_gold: tgt_gold_t})

    # In[26]:
    updates = LearningMethod(clip=5.0).get_updates("adam", cost, params)

    # In[27]:
    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold],
                              outputs=cost, updates=updates)

    # In[28]:
    f_eval = theano.function(inputs=[src_sentence, tgt_sentence], outputs=prediction)
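# The shape-debugging .eval() calls above rely on sample index vectors (src_sentence_t,
# tgt_sentence_t, tgt_gold_t) and on a `beta` coefficient that this snippet never defines.
# A hypothetical setup (assuming numpy is imported as np), reusing the lookups built
# above and mirroring the beta = 500 used in the variant below:
beta = 500
src_sentence_t = np.array([source_word2ind[w] for w in train_src[0]]).astype(np.int32)
tgt_sentence_t = np.array([target_word2ind[w] for w in train_tgt[0]][:-1]).astype(np.int32)
tgt_gold_t = np.array([target_word2ind[w] for w in train_tgt[0]][1:]).astype(np.int32)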
def main():
    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256        # source word embedding dimension
    tgt_emb_dim = 256        # target word embedding dimension
    src_lstm_hid_dim = 512   # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5            # dropout rate
    n_src = len(source_word2idx)  # number of words in the source language
    n_tgt = len(target_word2idx)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    # lookup table for source words
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')
    # lookup table for target words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_for', with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_rev', with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name='tgt_lstm', with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt,
                              name='proj_layer1', activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim,
                              name='proj_layer2', activation='tanh')
    # proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500

    # Train status
    is_train = T.iscalar('is_train')
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])
    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)

    tgt_lstm.h_0 = src_context[-1]
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0),
                                    tgt_sentence_emb.shape[0], axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)
    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context), axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)
    prediction = proj_layer1.link(transition_last)

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)

    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold],
                              outputs=cost, updates=updates)
    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None

    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50
    n_epochs = 10
    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            new_cost = f_train(
                np.array([source_word2idx[x] for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][1:]).astype(np.int32))
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_predictions(source_word2idx, target_word2idx,
                                              target_idx2word, f_eval, mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(source_word2idx, target_word2idx,
                                                      target_idx2word, f_eval, mode="test")
                    print 'Found new best validation score %f ' % (best_valid_score)
                log.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, bleu))

        # Store after epoch
        fout = open('output' + str(i) + '.txt', 'w')
        for line in best_test_preds:
            fout.write(' '.join(line) + '\n')
        fout.close()

    log.close()
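# get_predictions and get_validation_bleu are referenced above but not shown. A
# hypothetical stand-in for the BLEU helper (assumptions: NLTK is available, valid_preds
# is a list of token lists, and a reference list `valid_tgt` exists in scope), returning
# a percentage so the comparisons against best_valid_score behave as above:
from nltk.translate.bleu_score import corpus_bleu

def get_validation_bleu(valid_preds):
    references = [[ref] for ref in valid_tgt]
    return 100.0 * corpus_bleu(references, valid_preds)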
def create_cell(self):
    # FIXME: Should create cells based on network structure
    hidden = HiddenLayer(self.hidden_size)
    output = OutputLayer(self.hidden_size)
    return (hidden, output)
def test_train_little_only(
    rng,
    batch_size,
    learning_rate,
    n_hids,
    n_epochs=1000,
    L1_reg=0.0,
    L2_reg=0.0001,
    zero_last_layer_params=False,
):
    def summarize_rates():
        print "Learning rate: ", learning_rate.rate

    l_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX),
                             name='learning_rate')

    index = T.lscalar('index')
    l_x = T.matrix('l_x', dtype=config.floatX)
    y = T.ivector('y')

    print "Loading Data"
    print "... MNIST"
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print "Building models"
    print "... Building layers"
    # Create network structure
    x_size = train_set_x.shape[1].eval()
    y_size = train_set_y.shape[0].eval()
    n_in = x_size
    n_out = n_hids
    l_layers = []
    b_layers = []
    l_params = None

    # Shared variable used for always activating one block in a layer as in the
    # input and output layer
    one_block_idxs = shared(np.zeros((batch_size, 1), dtype='int64'),
                            name='one_block_idxs')

    l_layers.append(
        HiddenLayer(n_in, n_out, batch_size, k=0.1, activation=T.tanh,
                    name='l_layer_' + str(len(l_layers))))

    n_in = n_out
    # l_layers.append(
    #     HiddenLayer(
    #         n_in,
    #         n_out,
    #         batch_size,
    #         k=0.1,
    #         activation=T.tanh,
    #         name='l_layer_' + str(len(l_layers))
    #     )
    # )

    n_out = 10
    l_layers.append(
        HiddenLayer(n_in, n_out, batch_size, k=1, activation=T.nnet.softmax,
                    name='l_layer_' + str(len(l_layers))))
    if zero_last_layer_params:
        l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value())
        l_layers[-1].b.set_value(0 * l_layers[-1].b.get_value())

    for layer in l_layers:
        print "\t%s" % layer

    #for l_layer in l_layers:
    #    for param in l_layer.params:
    #        param.set_value(np.ones_like(param.get_value()))

    print "... Building top active updates"
    top_active = []
    l_activation = l_x
    for i in range(len(l_layers)):
        l_activation = l_layers[i].output(l_activation)

    print "... Building costs and errors"
    l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y),
                                L1_reg, L2_reg)
    l_error = l_layers[-1].error(l_activation, y)

    print "... Building parameter updates"
    l_grads = []
    l_param_updates = []
    for i in range(len(l_layers)):
        for param in l_layers[i].params:
            gparam = T.grad(l_cost, param)
            l_grads.append(gparam)
            l_param_updates.append((param, param - l_learning_rate * gparam))

    print "... Compiling little net train function"
    l_updates = l_param_updates
    l_train_model = function(
        [index], [l_cost, l_x, y],
        updates=l_updates,
        givens={
            l_x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net test function"
    l_test_model = function(
        [index], l_error,
        givens={
            l_x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net validate function"
    l_validate_model = function(
        [index], l_error,
        givens={
            l_x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "Training"
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 10  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    this_validation_loss = 0
    this_validation_loss_l = 0
    this_validation_loss_b = 0
    best_validation_loss = np.inf
    best_validation_loss_l = best_validation_loss
    best_validation_loss_b = best_validation_loss
    best_iter = 0
    test_score = 0.
    test_score_l = 0.
    accum_l = 0
    epoch = 0
    train_time_accum_l = 0
    done_looping = False

    timers = ['train', 'valid', 'train']
    ts = TS(['epoch', 'valid'])
    ts_l = TS(timers)

    summarize_rates()

    ts.start()
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        ts.start('epoch')
        for minibatch_index in xrange(n_train_batches):
            ts_l.start('train')
            minibatch_avg_cost_l = l_train_model(minibatch_index)
            ts_l.end('train')
            minibatch_avg_cost_l = minibatch_avg_cost_l[0]
            if np.isnan(minibatch_avg_cost_l):
                print "minibatch_avg_cost_l: %f" % minibatch_avg_cost_l
                ipdb.set_trace()
            accum_l = accum_l + minibatch_avg_cost_l

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                ts.end('epoch')
                ts.reset('epoch')
                ts_l.reset('train')
                accum_l = accum_l / validation_frequency
                l_summary = ("minibatch_avg_cost_l: %f, time: %f"
                             % (accum_l, ts_l.accumed['train'][-1][1]))
                accum_l = 0
                train_time_accum_l = 0
                print "%s" % (l_summary)

                # compute zero-one loss on validation set
                summary = ('epoch %i, minibatch %i/%i'
                           % (epoch, minibatch_index + 1, n_train_batches))
                validation_losses_l = [
                    l_validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss_l = np.mean(validation_losses_l)
                l_summary = ('little validation error %f %% '
                             % (this_validation_loss_l * 100.))
                print("%s %s" % (summary, l_summary))
                #ipdb.set_trace()

                # if we got the best validation score until now
                this_validation_loss = this_validation_loss_l
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss_l = this_validation_loss_l
                    best_validation_loss = best_validation_loss_l
                    best_iter = iter

                    # test it on the test set
                    test_losses_l = [
                        l_test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score_l = np.mean(test_losses_l)
                    l_summary = 'little: %f' % (test_score_l * 100.)

                    print('     epoch %i, minibatch %i/%i,'
                          ' test error of best model %s'
                          % (epoch, minibatch_index + 1, n_train_batches, l_summary))

                learning_rate.update()
                l_learning_rate.set_value(learning_rate.rate)
                summarize_rates()

            if patience <= iter:
                done_looping = True
                break
    ts.end()
    print('Optimization complete. Best validation score of %f %% '
          'obtained at iteration %i, with test performance %f %%'
          % (best_validation_loss_l * 100., best_iter + 1, test_score_l * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %s' % ts)
    return ts.diffs['epoch']
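# add_regularization is used above but not defined in this snippet. A plausible sketch
# (an assumption, not the original implementation): add L1/L2 penalties over each
# layer's weight matrix to the given cost expression.
def add_regularization(layers, cost, L1_reg, L2_reg):
    for layer in layers:
        cost = cost + L1_reg * T.abs_(layer.W).sum() + L2_reg * (layer.W ** 2).sum()
    return cost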
def test_big_and_little_train_big(rng, batch_size, learning_rate, momentum_rate,
                                  n_epochs=1000, L1_reg=0.0, L2_reg=0.0001,
                                  restore_parameters=False, select_top_active=False,
                                  mult_small_net_params=False, zero_last_layer_params=False,
                                  train_little_net=False, train_big_net=True):
    def summarize_rates():
        print "Learning rate: ", learning_rate.rate, \
            "Momentum: ", momentum.get_value()

    assert (train_big_net or train_little_net)

    l_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX),
                             name='learning_rate')
    b_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX),
                             name='learning_rate')
    momentum = shared(np.array(momentum_rate.rate, dtype=config.floatX),
                      name='momentum')

    index = T.lscalar('index')
    l_x = T.matrix('l_x', dtype=config.floatX)
    b_x = T.tensor3('b_x', dtype=config.floatX)
    y = T.ivector('y')

    print "Loading Data"
    print "... MNIST"
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print "Building models"
    print "... Building layers"
    # Create network structure
    x_size = train_set_x.shape[1].eval()
    y_size = train_set_y.shape[0].eval()
    n_in = x_size
    n_units_per = 1
    n_out = 5000
    l_layers = []
    b_layers = []
    l_params = None

    # Shared variable used for always activating one block in a layer as in the
    # input and output layer
    one_block_idxs = shared(np.zeros((batch_size, 1), dtype='int64'),
                            name='one_block_idxs')

    l_layers.append(
        HiddenLayer(n_in, n_out, batch_size, k=0.1, activation=T.tanh,
                    name='l_layer_' + str(len(l_layers))))
    if mult_small_net_params:
        l_params = l_layers[-1].params
    b_layers.append(
        HiddenBlockLayer((1, x_size), (n_out, n_units_per), one_block_idxs,
                         l_layers[-1].top_active, batch_size, activation=T.tanh,
                         name='b_layer_' + str(len(b_layers)),
                         l_params=l_params,
                         l_param_map=[('x', 1, 0, 'x'), (0, 'x')]))

    n_in = n_out
    l_layers.append(
        HiddenLayer(n_in, n_out, batch_size, k=0.1, activation=T.tanh,
                    name='l_layer_' + str(len(l_layers))))
    if mult_small_net_params:
        l_params = l_layers[-1].params
    b_layers.append(
        HiddenBlockLayer(
            (n_in, n_units_per),
            (n_out, n_units_per),
            l_layers[-2].top_active,
            l_layers[-1].top_active,
            #out_idxs_n,
            batch_size,
            activation=T.tanh,
            name='b_layer_' + str(len(b_layers)),
            l_params=l_params,
            l_param_map=[(0, 1, 'x', 'x'), (0, 'x')]))

    n_out = 10
    l_layers.append(
        HiddenLayer(n_in, n_out, batch_size, k=1, activation=T.nnet.softmax,
                    name='l_layer_' + str(len(l_layers))))
    if zero_last_layer_params:
        l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value())
        l_layers[-1].b.set_value(0 * l_layers[-1].b.get_value())
    if mult_small_net_params:
        l_params = l_layers[-1].params
    b_layers.append(
        HiddenBlockLayer((n_in, n_units_per), (1, n_out),
                         l_layers[-2].top_active, one_block_idxs, batch_size,
                         None,
                         name='b_layer_' + str(len(b_layers)),
                         l_params=l_params,
                         l_param_map=[(0, 'x', 'x', 1), ('x', 0)]))
    if zero_last_layer_params:
        b_layers[-1].W.set_value(0 * b_layers[-1].W.get_value())
        b_layers[-1].b.set_value(0 * b_layers[-1].b.get_value())

    if train_little_net or select_top_active:
        for layer in l_layers:
            print "\t%s" % layer
    if train_big_net:
        for layer in b_layers:
            print layer

    if restore_parameters:
        print "... Restoring weights of little model"
        restore_parameters('parameters_20_20_l1_0.0001_l2_0.0001.pkl', l_layers)

    #for l_layer in l_layers:
    #    for param in l_layer.params:
    #        param.set_value(np.ones_like(param.get_value()))

    print "... Building top active updates"
    top_active = []
    l_activation = l_x
    b_activation = b_x
    b_activations = [b_activation]
    for i in range(len(l_layers)):
        l_activation = l_layers[i].output(l_activation)
        b_activation = b_layers[i].output(b_activation)
        b_activations.append(b_activation)
        top_active.append((l_layers[i].top_active,
                           T.argsort(T.abs_(l_activation))[:, :l_layers[i].k]))

    print "... Building costs and errors"
    l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y),
                                L1_reg, L2_reg)
    l_error = l_layers[-1].error(l_activation, y)

    # T.nnet.softmax takes a matrix not a tensor so we only calculate the
    # linear component at the last layer and here we reshape and then
    # apply the softmax
    #b_activation = T.nnet.softmax(((b_activation*b_activation)**2).sum(axis=2))
    #b_activation = relu_softmax(((b_activation*b_activation)**2).sum(axis=2))
    #b_activation = T.nnet.softmax(T.mean(b_activation, axis=2))
    #b_activation = relu_softmax(T.mean(b_activation, axis=2))
    #b_activation = T.nnet.softmax(T.max(b_activation, axis=2))
    #b_activation = relu_softmax(T.max(b_activation, axis=2))
    b_shp = b_activation.shape
    #b_activation = relu_softmax(b_activation.reshape((b_shp[0], b_shp[2])))
    b_activation = T.nnet.softmax(b_activation.reshape((b_shp[0], b_shp[2])))
    b_activations.append(b_activation)
    b_cost = add_regularization(b_layers, b_layers[-1].cost(b_activation, y),
                                L1_reg, L2_reg)
    b_error = b_layers[-1].error(b_activation, y)

    print "... Building parameter updates"
    l_grads = []
    l_param_updates = []
    b_grads = []
    b_param_updates = []
    for i in range(len(l_layers)):
        for param in l_layers[i].params:
            gparam = T.grad(l_cost, param)
            l_grads.append(gparam)
            l_param_updates.append((param, param - l_learning_rate * gparam))

        for param in b_layers[i].params:
            b_gparam = T.grad(
                b_cost, param,
                #consider_constant=[b_layers[i].in_idxs, b_layers[i].out_idxs]
            )
            b_velocity = shared(
                np.zeros_like(param.get_value(), dtype=theano.config.floatX),
                param.name + '_velocity')
            b_param_updates.append(
                (b_velocity, momentum * b_velocity - b_learning_rate * b_gparam))
            b_grads.append(b_gparam)
            b_param_updates.append((param, param + b_velocity))

        #if b_layers[i].l_params is not None:
            #for param in b_layers[i].l_params:
                #l_gparam = T.grad(
                #    b_cost,
                #    param
                #)
                #l_velocity = shared(
                #    np.zeros_like(param.get_value()),
                #    param.name + '_velocity'
                #)
                #b_param_updates.append((
                #    l_velocity, momentum*l_velocity - b_learning_rate*l_gparam
                #))
                #l_grads.append(l_gparam)
                #b_param_updates.append((param, param + l_velocity))
                #b_param_updates.append((
                #    param, param - 0.0001*l_gparam
                #))

    print "... Compiling little net train function"
    l_updates = []
    if select_top_active:
        l_updates = l_updates + top_active
    if train_little_net:
        l_updates = l_updates + l_param_updates
    l_train_model = function(
        [index], [l_cost, l_x, y],
        updates=l_updates,
        givens={
            l_x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net train function"
    temp = train_set_x.get_value(borrow=True, return_internal_type=True)
    train_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True, name='train_set_x_b')
    b_updates = []
    if train_big_net:
        b_updates = b_updates + b_param_updates
    b_train_model = function(
        [index], [b_cost],
        updates=b_updates,
        givens={
            b_x: train_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    #theano.printing.debugprint(b_train_model)
    #ipdb.set_trace()

    # verify_layers(batch_size, b_layers, train_set_x_b, train_set_y)
    # temp = verify_cost(
    #     b_cost,
    #     b_layers,
    #     b_x,
    #     y,
    #     batch_size,
    #     train_set_x_b,
    #     train_set_y
    # )
    # T.verify_grad(
    #     temp,
    #     [b_layers[0].W.get_value(), b_layers[1].W.get_value()],
    #     rng=rng
    # )

    print "... Compiling little net test function"
    l_test_model = function(
        [index], l_error,
        givens={
            l_x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net test function"
    temp = test_set_x.get_value(borrow=True, return_internal_type=True)
    test_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                          borrow=True, name='test_set_x_b')
    b_test_model = function(
        [index], b_error,
        givens={
            b_x: test_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net validate function"
    l_validate_model = function(
        [index], l_error,
        givens={
            l_x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net validate function"
    temp = valid_set_x.get_value(borrow=True, return_internal_type=True)
    valid_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True, name='valid_set_x_b')
    b_validate_model = function(
        [index], b_error,
        givens={
            b_x: valid_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "Training"
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 10  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    this_validation_loss = 0
    this_validation_loss_l = 0
    this_validation_loss_b = 0
    best_validation_loss = np.inf
    best_validation_loss_l = best_validation_loss
    best_validation_loss_b = best_validation_loss
    best_iter = 0
    test_score = 0.
    test_score_l = 0.
    test_score_b = 0.
    accum_l = 0
    accum_b = 0
    epoch = 0
    train_time_accum_l = 0
    train_time_accum_b = 0
    done_looping = False

    timers = ['train', 'valid', 'train']
    ts = TS(['epoch', 'valid'])
    ts_l = TS(timers)
    ts_b = TS(timers)

    summarize_rates()

    ts.start()
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        ts.start('epoch')
        for minibatch_index in xrange(n_train_batches):
            if train_little_net or select_top_active:
                ts_l.start('train')
                minibatch_avg_cost_l = l_train_model(minibatch_index)
                ts_l.end('train')
                minibatch_avg_cost_l = minibatch_avg_cost_l[0]
                if np.isnan(minibatch_avg_cost_l):
                    print "minibatch_avg_cost_l: %f" % minibatch_avg_cost_l
                    ipdb.set_trace()
                accum_l = accum_l + minibatch_avg_cost_l

            if train_big_net:
                ts_b.start('train')
                minibatch_avg_cost_b = b_train_model(minibatch_index)
                ts_b.end('train')
                minibatch_avg_cost_b = minibatch_avg_cost_b[0]
                accum_b = accum_b + minibatch_avg_cost_b

            #print "minibatch_avg_cost: " + str(minibatch_avg_cost) + " minibatch_avg_cost_b: " + str(minibatch_avg_cost_b)
            #print l_layers[0].W.get_value().sum(), l_layers[1].W.get_value().sum(), b_layers[0].W.get_value().sum(), b_layers[1].W.get_value().sum()
            #print "A: ", np.max(np.abs(b_layers[0].W.get_value())), np.max(np.abs(b_layers[0].b.get_value())), np.max(np.abs(b_layers[1].W.get_value())), np.max(np.abs(b_layers[1].b.get_value()))
            #print "B: ", np.abs(b_layers[0].W.get_value()).sum(), np.abs(b_layers[0].b.get_value()).sum(), np.abs(b_layers[1].W.get_value()).sum(), np.abs(b_layers[1].b.get_value()).sum()
            #print "C: ", np.abs(np.array(minibatch_avg_cost_b[1])).sum(), np.abs(np.array(minibatch_avg_cost_b[2])).sum(), np.abs(np.array(minibatch_avg_cost_b[3])).sum(), np.abs(np.array(minibatch_avg_cost_b[4])).sum()

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                ts.end('epoch')
                ts.reset('epoch')

                l_summary = ""
                if train_little_net or select_top_active:
                    ts_l.reset('train')
                    accum_l = accum_l / validation_frequency
                    l_summary = ("minibatch_avg_cost_l: %f, time: %f"
                                 % (accum_l, ts_l.accumed['train'][-1][1]))
                    accum_l = 0
                    train_time_accum_l = 0

                b_summary = ""
                if train_big_net:
                    ts_b.reset('train')
                    accum_b = accum_b / validation_frequency
                    b_summary = ("minibatch_avg_cost_b: %f, time: %f"
                                 % (accum_b, ts_b.accumed['train'][-1][1]))
                    accum_b = 0

                print "%s %s" % (l_summary, b_summary)

                # compute zero-one loss on validation set
                summary = ('epoch %i, minibatch %i/%i'
                           % (epoch, minibatch_index + 1, n_train_batches))

                l_summary = ""
                if train_little_net or select_top_active:
                    validation_losses_l = [
                        l_validate_model(i) for i in xrange(n_valid_batches)
                    ]
                    this_validation_loss_l = np.mean(validation_losses_l)
                    l_summary = ('little validation error %f %% '
                                 % (this_validation_loss_l * 100.))

                b_summary = ""
                if train_big_net:
                    validation_losses_b = [
                        b_validate_model(i) for i in xrange(n_valid_batches)
                    ]
                    this_validation_loss_b = np.mean(validation_losses_b)
                    #this_validation_loss_b = 0
                    b_summary = ('big validation error %f %% '
                                 % (this_validation_loss_b * 100.))

                print("%s %s %s" % (summary, l_summary, b_summary))
                #ipdb.set_trace()

                # if we got the best validation score until now
                if train_big_net:
                    this_validation_loss = this_validation_loss_b
                elif train_little_net:
                    this_validation_loss = this_validation_loss_l

                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss_l = this_validation_loss_l
                    best_validation_loss_b = this_validation_loss_b
                    if train_big_net:
                        best_validation_loss = best_validation_loss_b
                    elif train_little_net:
                        best_validation_loss = best_validation_loss_l
                    best_iter = iter

                    # test it on the test set
                    l_summary = ""
                    if train_little_net:
                        test_losses_l = [
                            l_test_model(i) for i in xrange(n_test_batches)
                        ]
                        test_score_l = np.mean(test_losses_l)
                        l_summary = 'little: %f' % (test_score_l * 100.)

                    b_summary = ""
                    if train_big_net:
                        test_losses_b = [
                            b_test_model(i) for i in xrange(n_test_batches)
                        ]
                        test_score_b = np.mean(test_losses_b)
                        #test_score_b = 0
                        b_summary = 'big: %f' % (test_score_b * 100.)

                    print('     epoch %i, minibatch %i/%i,'
                          ' test error of best model %s %s'
                          % (epoch, minibatch_index + 1, n_train_batches,
                             l_summary, b_summary))

                learning_rate.update()
                if train_little_net:
                    l_learning_rate.set_value(learning_rate.rate)
                if train_big_net:
                    b_learning_rate.set_value(learning_rate.rate)
                momentum_rate.update()
                momentum.set_value(momentum_rate.rate)
                summarize_rates()

            if patience <= iter:
                done_looping = True
                break
    ts.end()
    print('Optimization complete. Best validation score of %f %% (%f %%) '
          'obtained at iteration %i, with test performance %f %% (%f %%)'
          % (best_validation_loss_l * 100., best_validation_loss_b * 100.,
             best_iter + 1, test_score_l * 100., test_score_b * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %s' % ts)
def test_big_and_little_train_both(rng, batch_size=1, learning_rate=0.01,
                                   n_epochs=1000, L1_reg=0.0, L2_reg=0.0001):
    l_learning_rate = learning_rate
    b_learning_rate = 10 * learning_rate

    index = T.lscalar('index')
    l_x = T.matrix('l_x', dtype=config.floatX)
    b_x = T.tensor3('b_x', dtype=config.floatX)
    y = T.ivector('y')

    print "Loading Data"
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print "Building models"
    print "... Building layers"
    # Create network structure
    x_size = train_set_x.shape[1].eval()
    n_in = x_size
    n_units_per = 32
    n_out = 500
    l_layers = []
    b_layers = []

    l_layers.append(
        HiddenLayer(
            n_in,
            n_out,
            batch_size,
            #k=0.05,
            k=1,
            activation=T.tanh,
            name='l_layer_' + str(len(l_layers))))

    in_idxs_0 = shared(np.zeros((batch_size, 1), dtype='int64'), name='in_idxs_0')

    b_layers.append(
        HiddenBlockLayer((1, x_size), (n_out, n_units_per), in_idxs_0,
                         l_layers[-1].top_active, batch_size, activation=T.tanh,
                         name='b_layer_' + str(len(b_layers))))

    #n_in = n_out
    #n_out = 100
    #k_activations = 0.12
    #l_layers.append(
    #    HiddenLayer(
    #        n_in,
    #        n_out,
    #        k=k_activations,
    #        name='l_layer_' + str(len(l_layers))
    #    )
    #)
    #b_layers.append(HiddenBlockLayer(n_in, n_out, batch_size))

    n_in = n_out
    n_out = 10
    l_layers.append(
        HiddenLayer(n_in, n_out, batch_size, k=1, activation=T.nnet.softmax,
                    name='l_layer_' + str(len(l_layers))))
    l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value())

    # T.nnet.softmax takes a matrix not a tensor so just calculate the linear
    # component in the layer and apply the softmax later
    #out_idxs_n = shared(
    #    np.repeat(
    #        np.arange(n_out, dtype='int64').reshape(1, n_out),
    #        batch_size,
    #        axis=0
    #    ),
    #    name='out_idxs_' + str(len(l_layers))
    #)
    b_layers.append(
        HiddenBlockLayer(
            (n_in, n_units_per),
            (n_out, n_units_per),
            l_layers[-2].top_active,
            l_layers[-1].top_active,
            #out_idxs_n,
            batch_size,
            None,
            name='b_layer_' + str(len(b_layers))))
    #b_layers[-1].W.set_value(0*b_layers[-1].W.get_value())

    print "... Building top active updates"
    top_active = []
    l_activation = l_x
    b_activation = b_x
    b_activations = [b_activation]
    for i in range(len(l_layers)):
        l_activation = l_layers[i].output(l_activation)
        b_activation = b_layers[i].output(b_activation)
        b_activations.append(b_activation)
        top_active.append((l_layers[i].top_active,
                           T.argsort(T.abs_(l_activation))[:, :l_layers[i].k]))

    print "... Building costs and errors"
    l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y),
                                L1_reg, L2_reg)
    l_error = l_layers[-1].error(l_activation, y)

    # T.nnet.softmax takes a matrix not a tensor so we only calculate the
    # linear component at the last layer and here we reshape and then
    # apply the softmax
    #b_activation = T.nnet.softmax(((b_activation*b_activation)**2).sum(axis=2))
    #b_activation = relu_softmax(((b_activation*b_activation)**2).sum(axis=2))
    b_activation = T.nnet.softmax(T.mean(b_activation, axis=2))
    #b_activation = relu_softmax(T.mean(b_activation, axis=2))
    #b_activation = T.nnet.softmax(T.max(b_activation, axis=2))
    #b_activation = relu_softmax(T.max(b_activation, axis=2))
    b_activations.append(b_activation)
    b_cost = add_regularization(b_layers, b_layers[-1].cost(b_activation, y),
                                L1_reg, L2_reg)
    b_error = b_layers[-1].error(b_activation, y)

    print "... Building parameter updates"
    l_grads = []
    l_param_updates = []
    b_grads = []
    b_param_updates = []
    for i in range(len(l_layers)):
        for param in l_layers[i].params:
            gparam = T.grad(l_cost, param)
            l_grads.append(gparam)
            l_param_updates.append((param, param - l_learning_rate * gparam))

        for param in b_layers[i].params:
            gparam = T.grad(
                b_cost, param,
                consider_constant=[b_layers[i].in_idxs, b_layers[i].out_idxs])
            b_grads.append(gparam)
            b_param_updates.append((param, param - b_learning_rate * gparam))

    print "... Compiling little net train function"
    l_train_model = function(
        [index], [l_cost, l_x, y],
        updates=top_active + l_param_updates,
        givens={
            l_x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net train function"
    temp = train_set_x.get_value(borrow=True, return_internal_type=True)
    train_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True, name='train_set_x_b')
    b_train_model = function(
        [index], [b_cost],
        updates=b_param_updates,
        givens={
            b_x: train_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    #theano.printing.debugprint(b_train_model)
    #ipdb.set_trace()

    # verify_layers(batch_size, b_layers, train_set_x_b, train_set_y)
    # temp = verify_cost(
    #     b_cost,
    #     b_layers,
    #     b_x,
    #     y,
    #     batch_size,
    #     train_set_x_b,
    #     train_set_y
    # )
    # T.verify_grad(
    #     temp,
    #     [b_layers[0].W.get_value(), b_layers[1].W.get_value()],
    #     rng=rng
    # )

    print "... Compiling little net test function"
    l_test_model = function(
        [index], l_error,
        givens={
            l_x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net test function"
    temp = test_set_x.get_value(borrow=True, return_internal_type=True)
    test_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                          borrow=True, name='test_set_x_b')
    b_test_model = function(
        [index], b_error,
        givens={
            b_x: test_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net validate function"
    l_validate_model = function(
        [index], l_error,
        givens={
            l_x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net validate function"
    temp = valid_set_x.get_value(borrow=True, return_internal_type=True)
    valid_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True, name='valid_set_x_b')
    b_validate_model = function(
        [index], b_error,
        givens={
            b_x: valid_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "Training"
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 100  # wait this much longer when a new best is
                             # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.

    start_time = time.clock()

    epoch = 0
    done_looping = False
    accum = 0
    accum_b = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = l_train_model(minibatch_index)
            # NOTE: this call originally passed learning_rate.rate as a second argument,
            # but b_train_model is compiled with `index` as its only input (and
            # learning_rate is a plain float here), so only the index is passed.
            minibatch_avg_cost_b = b_train_model(minibatch_index)

            #print "minibatch_avg_cost: " + str(minibatch_avg_cost) + " minibatch_avg_cost_b: " + str(minibatch_avg_cost_b)
            #print l_layers[0].W.get_value().sum(), l_layers[1].W.get_value().sum(), b_layers[0].W.get_value().sum(), b_layers[1].W.get_value().sum()
            #print "A: ", np.max(np.abs(b_layers[0].W.get_value())), np.max(np.abs(b_layers[0].b.get_value())), np.max(np.abs(b_layers[1].W.get_value())), np.max(np.abs(b_layers[1].b.get_value()))
            #print "B: ", np.abs(b_layers[0].W.get_value()).sum(), np.abs(b_layers[0].b.get_value()).sum(), np.abs(b_layers[1].W.get_value()).sum(), np.abs(b_layers[1].b.get_value()).sum()
            #print "C: ", np.abs(np.array(minibatch_avg_cost_b[1])).sum(), np.abs(np.array(minibatch_avg_cost_b[2])).sum(), np.abs(np.array(minibatch_avg_cost_b[3])).sum(), np.abs(np.array(minibatch_avg_cost_b[4])).sum()

            minibatch_avg_cost = minibatch_avg_cost[0]
            minibatch_avg_cost_b = minibatch_avg_cost_b[0]
            accum = accum + minibatch_avg_cost
            accum_b = accum_b + minibatch_avg_cost_b

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                accum = accum / validation_frequency
                accum_b = accum_b / validation_frequency
                print "minibatch_avg_cost: ", accum, \
                    "minibatch_avg_cost_b: ", accum_b
                accum = 0
                accum_b = 0

                # compute zero-one loss on validation set
                validation_losses = [
                    l_validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)
                validation_losses_b = [
                    b_validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss_b = np.mean(validation_losses_b)
                #this_validation_loss_b = 0
                print('epoch %i, minibatch %i/%i, validation error %f %% '
                      '(%f %%)'
                      % (epoch, minibatch_index + 1, n_train_batches,
                         this_validation_loss * 100., this_validation_loss_b * 100.))
                #ipdb.set_trace()

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        l_test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = np.mean(test_losses)
                    test_losses_b = [
                        b_test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score_b = np.mean(test_losses_b)
                    #test_score_b = 0
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %% (%f %%)')
                          % (epoch, minibatch_index + 1, n_train_batches,
                             test_score * 100., test_score_b * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%')
          % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def setup_hidden_layers(self, activation, n_in=0, n_out=0, n_hiddens=0):
    """
    Setup the hidden layers with the specified number of hidden units.
    """
    act_fn = T.tanh
    if activation == NeuralActivations.Rectifier:
        act_fn = self.rectifier_act

    if n_in == 0:
        n_in = self.n_in
    if n_out == 0:
        n_out = self.n_out
    if n_hiddens == 0:
        n_hiddens = self.n_hiddens

    self.rng.seed(1985)

    # Create the hidden layers.
    self.hiddenLayers.append(
        HiddenLayer(rng=self.rng,
                    input=self.input,
                    n_in=n_in,
                    n_out=n_hiddens[0],
                    activation=act_fn))

    for i in xrange(1, self.n_hidden_layers):
        self.rng.seed(2012)
        self.hiddenLayers.append(
            HiddenLayer(rng=self.rng,
                        input=self.hiddenLayers[i - 1].output,
                        n_in=n_hiddens[i - 1],
                        n_out=n_hiddens[i],
                        activation=act_fn))

    # The logistic regression layer gets as input the hidden units
    # of the hidden layer
    self.logRegressionLayer = LogisticRegressionLayer(
        input=self.hiddenLayers[-1].output,
        n_in=n_hiddens[-1],
        n_out=n_out,
        rng=self.rng)

    self.initialize_regularization()

    # negative log likelihood of the MLP is given by the negative
    # log likelihood of the output of the model, computed in the
    # logistic regression layer
    self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood

    # negative log likelihood of the MLP is given by the
    # crossentropy of the output of the model, computed in the
    # logistic regression layer
    self.crossentropy = self.logRegressionLayer.crossentropy
    self.crossentropy_categorical = self.logRegressionLayer.crossentropy_categorical

    # same holds for the function computing the number of errors
    self.errors = self.logRegressionLayer.errors
    self.raw_prediction_errors = \
        self.logRegressionLayer.raw_prediction_errors
    self.p_y_given_x = self.logRegressionLayer.p_y_given_x

    # Class memberships
    hidden_outputs = self.hiddenLayers[0].get_outputs(self.input)
    for i in xrange(1, self.n_hidden_layers):
        hidden_outputs = self.hiddenLayers[i].get_outputs(hidden_outputs)
    self.class_memberships = self.logRegressionLayer.get_class_memberships(hidden_outputs)

    self.initialize_params()
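# Usage sketch for setup_hidden_layers (assumptions: `mlp` is an instance of the
# enclosing class with rng, input, n_hidden_layers, etc. already set up, and the
# concrete sizes are illustrative). Note that n_hiddens is indexed per hidden layer,
# so despite the default of 0 a list is expected when it is passed explicitly.
mlp.setup_hidden_layers(NeuralActivations.Rectifier, n_in=784, n_out=10,
                        n_hiddens=[512, 256])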