     # Updating all the parameters
     grbagger.update_params(accumu_grads, rate)
 train_accuracy = float(correct_count) / train_size
 logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
             % (i, costs, train_accuracy))
 # Append all the numbers
 track_training_cost.append(costs)
 track_training_acc.append(train_accuracy)
 if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
 # Testing
 correct_count = 0
 costs = 0.0
 for j in xrange(test_size):
     test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
     pred = grbagger.predict(test_sent_rep)
     cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
     if pred == senti_test_label[j]: correct_count += 1
     costs += cost
 test_accuracy = float(correct_count) / test_size
 logger.debug('Test accuracy: %f' % test_accuracy)
 # Append all the numbers
 track_test_cost.append(costs)
 track_test_acc.append(test_accuracy)
 if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
 # Sampling to show the weights and experts of training and test instances
 logger.debug('Training Sampling: ')
 for j in xrange(sample_size):
     idx = np.random.randint(train_size)
     weights = grbagger.show_weights(senti_train_set[idx])
     scores = grbagger.show_scores(senti_train_set[idx])
     prob = grbagger.show_prob(senti_train_set[idx])
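
These examples all drive training with the same hand-rolled, AdaGrad-flavored update (visible in full in Examples #2-#4 below): gradients and their squares are accumulated over a mini-batch, the mean gradient is rescaled by 1 / (eps + sqrt(squared-gradient sum)), and update_params then takes a plain SGD step. A minimal NumPy sketch of that step, with hypothetical stand-ins (params, grad_fn) for grbagger's internals:

import numpy as np

def adagrad_batch_update(params, grad_fn, batch, rate=0.02, eps=1e-6):
    """One mini-batch of the AdaGrad-style update used in these examples.

    params  -- list of NumPy arrays, updated in place (stand-in for grbagger.params)
    grad_fn -- callable(x, y) returning one gradient array per parameter
    """
    accumu = [np.zeros_like(p) for p in params]  # summed gradients
    hist = [np.zeros_like(p) for p in params]    # summed squared gradients
    for x, y in batch:
        for a, h, g in zip(accumu, hist, grad_fn(x, y)):
            a += g
            h += np.square(g)
    for p, a, h in zip(params, accumu, hist):
        a /= len(batch)             # mean gradient over the mini-batch
        a /= eps + np.sqrt(h)       # per-element AdaGrad scaling
        p -= rate * a               # plain SGD step on the scaled gradient

# Toy usage: one step of least-squares on a single weight vector
w = np.zeros(3)
grad_fn = lambda x, y: [2.0 * (x.dot(w) - y) * x]
adagrad_batch_update([w], grad_fn, [(np.ones(3), 1.0), (np.eye(3)[0], -1.0)])

Note that the examples re-zero the squared-gradient history at every mini-batch, so this is a per-batch variant rather than classical AdaGrad, which keeps the history across the whole run.
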
Example #2
 def testSentimentFineTune(self):
     '''
      Build a small model and use it on the sentiment analysis task, with
      fine-tuning of the word-embedding matrix.
     '''
     np.random.seed(1991)
     fname = './grCNN.conf'
     configer = GrCNNConfiger(fname)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     senti_train_txt, senti_train_label = [], []
     senti_test_txt, senti_test_label = [], []
     start_time = time.time()
     # Read training data set
     with file(senti_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_train_txt.append(txt)
             senti_train_label.append(int(label))
     # Read test data set
     with file(senti_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_test_txt.append(txt)
             senti_test_label.append(int(label))
     end_time = time.time()
      logger.debug('Time used to load training and test data sets: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
      # Load the pre-trained wiki word embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     blank_index = word_embedding.word2index('</s>')
      logger.debug('Blank token at index %d: %s' % (blank_index, word_embedding.index2word(blank_index)))
     # Word-vector representation
     senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Shuffling for all the instances
     start_time = time.time()
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     # Shuffle label
     senti_train_label = senti_train_label[rindex]
     senti_test_label = senti_test_label[tindex]
     # Shuffle text
     senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
     senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
     end_time = time.time()
     logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time-start_time))
     # Compute word embedding
     senti_train_set = []
     senti_test_set = []
      # Record the index of each word in each sentence, computed only once
     senti_train_word_index = []
     senti_test_word_index = []
      # Build each sentence's sparse input indicator matrix once for fast computation
     senti_train_sparse_select = []
     senti_test_sparse_select = []
     # Embedding for training set
     for i, sent in enumerate(senti_train_txt):
         words = sent.split()
         words = [word.lower() for word in words]
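          # Reserve one extra row at each end for the start/end sentence padding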
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
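          # One-hot indicator: row t selects the dictionary entry of the t-th
          # padded token, so sparse_select.dot(embedding) gathers the sentence
          # matrix in a single sparse-dense product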
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_train_set.append(vectors)
         senti_train_word_index.append(indices)
         senti_train_sparse_select.append(sparse_select)
     # Embedding for test set
     for i, sent in enumerate(senti_test_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_test_set.append(vectors)
         senti_test_word_index.append(indices)
         senti_test_sparse_select.append(sparse_select)
     end_time = time.time()
     logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
      # Training with mini-batch stochastic gradient descent
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     training_threshold_epoch = 30
     try:
         sample_size = 0
         fuedge_factor = 1e-6
          # Accumulated gradient for the word-embedding matrix
          # Squared-gradient history for the word-embedding matrix
         accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             num_batch = train_size / batch_size
             for k in xrange(num_batch):
                  # Clear the per-batch gradient caches
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 if i > training_threshold_epoch:
                     accumu_embedding[:] = 0.0
                     hist_embedding[:] = 0.0
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
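                          # S.T scatters the input gradient back onto the embedding rows this sentence used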
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                  # Normalize the accumulated gradients (AdaGrad-style scaling)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Updating word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 grbagger.update_params(accumu_grads, rate)
              # Clear the gradient caches again for the remainder batch
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if i > training_threshold_epoch:
                 accumu_embedding[:] = 0.0
                 hist_embedding[:] = 0.0
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                  # Normalize the accumulated gradients over the remainder batch
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                  # Normalize and apply the word-embedding gradient
                 if i > training_threshold_epoch:
                     accumu_embedding /= train_size-num_batch*batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 # Updating all the parameters
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                 pred = grbagger.predict(test_sent_rep)
                 cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])    
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
             wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
             logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
     except:
          logger.debug('An error occurred!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('fine-grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
          with file('fine-senti-records.npy', 'wb') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to fine-senti-records.npy...')
         logger.debug('Finished...')
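
Example #2 never indexes the embedding matrix directly while fine-tuning. Each sentence gets a sparse one-hot selection matrix S of shape (len(words)+2, dict_size): S.dot(embedding) gathers the padded sentence representation, and S.T.dot(input_grad) scatters the input gradient back onto exactly the embedding rows the sentence used. A small self-contained sketch of both directions with toy sizes (all names here are illustrative, not part of the model's API):

import numpy as np
from scipy.sparse import lil_matrix, csc_matrix

vocab_size, embed_dim = 10, 4
E = np.random.randn(vocab_size, embed_dim).astype(np.float32)  # embedding matrix
indices = [0, 3, 7, 3, 0]  # </s>, tokens..., </s>, as row indices into E

# Build the one-hot selector once per sentence (lil for construction, csc for dot)
S = lil_matrix((len(indices), vocab_size), dtype=np.float32)
S[list(range(len(indices))), indices] = 1.0
S = csc_matrix(S)

# Gather: one embedding row per (padded) token position
X = S.dot(E)                      # shape (len(indices), embed_dim)
assert np.allclose(X, E[indices])

# Scatter: route a gradient w.r.t. X back to the rows it came from
dX = np.ones_like(X)              # stand-in for compute_input_gradient's output
dE = np.asarray(S.T.dot(dX))      # shape (vocab_size, embed_dim)
E -= 0.02 * dE                    # update only the touched rows

Because S.T sums over positions, a word that occurs twice in a sentence (index 3 above) receives the sum of its positional gradients, which is the correct total derivative.
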
Example #3
 def testActiveAndPassive(self):
     np.random.seed(1991)
     sp_train_filename = '../data/refined_train_sp.txt'
     sp_test_filename = '../data/refined_test_sp.txt'
     sp_train_txt, sp_train_label = [], []
     sp_test_txt, sp_test_label = [], []
     start_time = time.time()
     # Read training data set
     with file(sp_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_train_txt.append(txt)
             sp_train_label.append(int(label))
     # Read test data set
     with file(sp_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_test_txt.append(txt)
              sp_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Finished loading training and test data sets...')
     logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     word_embedding = WordEmbedding(embedding_filename)
     start_time = time.time()
      # Start/end padding token for each sentence
     blank_token = word_embedding.wordvec('</s>')
     # Word-vector representation
     sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
     sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
     train_size = len(sp_train_txt)
     test_size = len(sp_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Sequential modeling for each sentence
     sp_train_set, sp_test_set = [], []
     # Embedding for training set
     for i, sent in enumerate(sp_train_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_train_set.append(vectors)
     # Embedding for test set
     for i, sent in enumerate(sp_test_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_test_set.append(vectors)
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 2e-2
      # Training with mini-batch stochastic gradient descent
     epoch = 200
     batch_size = 10
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     training_acc, training_cost = [], []
     test_acc, test_cost = [], []
     try:
         sample_size = 0
         fuedge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             num_batch = train_size / batch_size
             for k in xrange(num_batch):
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     for r in results:
                         if np.isnan(np.sum(r)):
                             logger.debug('*' * 50)
                              logger.debug('Error: NaN detected in the gradients!')
                              logger.debug('NaN found at training instance %d' % j)
                             logger.debug('*' * 50)
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             ## Adding training accuracy and training cost
             training_acc.append(train_accuracy)
             training_cost.append(costs)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(sp_test_set[j])
                 cost = grbagger.show_cost(sp_test_set[j], sp_test_label[j])
                 if pred == sp_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             ## Adding test accuracy and test cost
             test_acc.append(test_accuracy)
             test_cost.append(costs)
             logger.debug('Test accuracy: %f' % test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(sp_train_set[idx])
                 scores = grbagger.show_scores(sp_train_set[idx])
                 prob = grbagger.show_prob(sp_train_set[idx])
                 label = sp_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(sp_test_set[idx])    
                 scores = grbagger.show_scores(sp_test_set[idx])
                 prob = grbagger.show_prob(sp_test_set[idx])
                 label = sp_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except:
          logger.debug('An error occurred!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('sp-grbagger.model', grbagger)
         # Save all the training and test records
         training_acc = np.asarray(training_acc)
         training_cost = np.asarray(training_cost)
         test_acc = np.asarray(test_acc)
         test_cost = np.asarray(test_cost)
          with file('sp-records.npy', 'wb') as fout:
             np.save(fout, training_acc)
             np.save(fout, training_cost)
             np.save(fout, test_acc)
             np.save(fout, test_cost)
         logger.debug('Training and test records saved to sp-records.npy...')
         logger.debug('Finished...')
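
Each epoch above walks the whole mini-batches first and then repeats almost the same block for the leftover tail when train_size is not a multiple of batch_size. The duplication can be avoided by iterating index slices that include the tail; a sketch, with process_batch as a hypothetical stand-in for the gradient accumulation and update:

def minibatch_slices(n, batch_size):
    """Yield (start, end) pairs covering 0..n, tail batch included."""
    for start in range(0, n, batch_size):
        yield start, min(start + batch_size, n)

assert list(minibatch_slices(7, 3)) == [(0, 3), (3, 6), (6, 7)]

# Each slice normalizes by its own length, as the examples do for the tail:
# for start, end in minibatch_slices(train_size, batch_size):
#     process_batch(range(start, end), norm=end - start)
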
Example #4
 def testSentiment(self):
      '''
      Build a small model and use it on the sentiment analysis task.
      Loads the training and test texts and labels and preprocesses them.
      '''
     np.random.seed(1991)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     senti_train_txt, senti_train_label = [], []
     senti_test_txt, senti_test_label = [], []
     start_time = time.time()
     # Read training data set
     with file(senti_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_train_txt.append(txt)
             senti_train_label.append(int(label))
     # Read test data set
     with file(senti_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_test_txt.append(txt)
             senti_test_label.append(int(label))
     end_time = time.time()
      logger.debug('Time used to load training and test data sets: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
      # Load the pre-trained wiki word embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     # Store original text representation
     self.senti_train_txt = senti_train_txt
     self.senti_test_txt = senti_test_txt
     # Word-vector representation
     self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     assert train_size == self.senti_train_label.shape[0]
     assert test_size == self.senti_test_label.shape[0]
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Compute word embedding
     self.senti_train_set = []
     self.senti_test_set = []
     # Embedding for training set
     for i, sent in enumerate(senti_train_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         self.senti_train_set.append(vectors)
     # Embedding for test set
     for i, sent in enumerate(senti_test_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         self.senti_test_set.append(vectors)
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Store data
     self.train_size = train_size
     self.test_size = test_size
     self.word_embedding = word_embedding
     # Shuffling
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     self.senti_train_set = list(np.asarray(self.senti_train_set)[rindex])
     self.senti_test_set = list(np.asarray(self.senti_test_set)[tindex])
     self.senti_train_label = self.senti_train_label[rindex]
     self.senti_test_label = self.senti_test_label[tindex]
     senti_train_set, senti_test_set = self.senti_train_set, self.senti_test_set
     senti_train_label, senti_test_label = self.senti_train_label, self.senti_test_label   
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
      # Training with mini-batch stochastic gradient descent
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     try:
         sample_size = 0
         fuedge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             num_batch = train_size / batch_size
             for k in xrange(num_batch):
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(senti_test_set[j])
                 cost = grbagger.show_cost(senti_test_set[j], senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])    
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except:
          logger.debug('An error occurred!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
          with file('senti-records.npy', 'wb') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to senti-records.npy...')
         logger.debug('Finished...')
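
All three finally blocks persist the tracking arrays with consecutive np.save calls on a single file handle, so np.load must read them back in the same order. A minimal round-trip sketch (np.save needs the file opened in binary mode, hence the 'wb' above):

import numpy as np

acc = np.array([0.5, 0.7, 0.9])
cost = np.array([3.2, 2.1, 1.4])

with open('records.npy', 'wb') as fout:   # binary mode for np.save
    np.save(fout, acc)
    np.save(fout, cost)

with open('records.npy', 'rb') as fin:    # np.load in the same order as np.save
    assert np.array_equal(np.load(fin), acc)
    assert np.array_equal(np.load(fin), cost)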