def setUp(self):
    '''
    Load and store the snippet data set.
    '''
    np.random.seed(42)
    snippet_train_set_filename = '../data/train_snip.txt'
    snippet_test_set_filename = '../data/test_snip.txt'
    snippet_train_label_filename = '../data/train_label.txt'
    snippet_test_label_filename = '../data/test_label.txt'
    embedding_filename = '../data/wiki_embeddings.txt'
    # Load wiki-embedding
    word_embedding = WordEmbedding(embedding_filename)
    # Load snippet texts and labels
    with file(snippet_train_set_filename, 'rb') as fin:
        snippet_train_txt = fin.readlines()
    with file(snippet_test_set_filename, 'rb') as fin:
        snippet_test_txt = fin.readlines()
    snippet_train_label = np.loadtxt(snippet_train_label_filename, dtype=np.int32)
    snippet_test_label = np.loadtxt(snippet_test_label_filename, dtype=np.int32)
    training_size = len(snippet_train_txt)
    test_size = len(snippet_test_txt)
    # Check size
    pprint('Training size: %d' % training_size)
    pprint('Test size: %d' % test_size)
    assert training_size == snippet_train_label.shape[0]
    assert test_size == snippet_test_label.shape[0]
    # Word embedding: mean-pool the word vectors of each snippet into one row
    snippet_train_set = np.zeros((training_size, word_embedding.embedding_dim()), dtype=floatX)
    snippet_test_set = np.zeros((test_size, word_embedding.embedding_dim()), dtype=floatX)
    for i, snippet in enumerate(snippet_train_txt):
        words = snippet.split()
        vectors = np.asarray([word_embedding.wordvec(word) for word in words], dtype=floatX)
        snippet_train_set[i, :] = np.mean(vectors, axis=0)
    for i, snippet in enumerate(snippet_test_txt):
        words = snippet.split()
        vectors = np.asarray([word_embedding.wordvec(word) for word in words], dtype=floatX)
        snippet_test_set[i, :] = np.mean(vectors, axis=0)
    # Shuffle training and test data set
    train_rand_index = np.random.permutation(training_size)
    test_rand_index = np.random.permutation(test_size)
    snippet_train_set = snippet_train_set[train_rand_index, :]
    snippet_train_label = snippet_train_label[train_rand_index]
    snippet_test_set = snippet_test_set[test_rand_index, :]
    snippet_test_label = snippet_test_label[test_rand_index]
    # Labels in the data files are 1-based; shift them to 0-based
    snippet_train_label -= 1
    snippet_test_label -= 1
    self.snippet_train_set = snippet_train_set
    self.snippet_train_label = snippet_train_label
    self.snippet_test_set = snippet_test_set
    self.snippet_test_label = snippet_test_label
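# A minimal sketch of the averaged-embedding sentence representation built in the setUp
# above: each word is mapped to its wiki embedding and the vectors are mean-pooled into
# one fixed-size row. It assumes only the WordEmbedding API used in this repo
# (wordvec, embedding_dim); the helper name is illustrative, not part of the test suite.
import numpy as np

def mean_sentence_vector(word_embedding, sentence, dtype='float32'):
    '''Return the mean of the word vectors of `sentence` as a 1-D array.'''
    words = sentence.split()
    if not words:
        # Empty line: fall back to a zero vector of the embedding dimension
        return np.zeros(word_embedding.embedding_dim(), dtype=dtype)
    vectors = np.asarray([word_embedding.wordvec(w) for w in words], dtype=dtype)
    return np.mean(vectors, axis=0)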
class TestIO(unittest.TestCase):
    def setUp(self):
        embedding_fname = '../data/wiki_embeddings.txt'
        snip_train_txt = '../data/train_snip.txt'
        snip_test_txt = '../data/test_snip.txt'
        snip_train_label = '../data/train_label.txt'
        snip_test_label = '../data/test_label.txt'
        self.word_embedding = WordEmbedding(embedding_fname)
        self.train_snip_txt = utils.loadtxt(snip_train_txt)
        self.train_snip_label = utils.loadlabel(snip_train_label)
        self.test_snip_txt = utils.loadtxt(snip_test_txt)
        self.test_snip_label = utils.loadlabel(snip_test_label)

    def testEmbedding(self):
        pprint('Size of word vocabulary: %d' % self.word_embedding.dict_size())
        pprint('Dimension of word embedding: %d' % self.word_embedding.embedding_dim())
        self.assertEqual(self.word_embedding.dict_size(), 311467,
                         'Incorrect size of word vocabulary')
        self.assertEqual(self.word_embedding.embedding_dim(), 50,
                         'Incorrect dimension of word embedding')
        pprint("Unknown: ")
        pprint(self.word_embedding.wordvec('unknown'))

    def testSnippetTrain(self):
        self.assertEqual(len(self.train_snip_txt), 10060, 'Training data not complete')
        self.assertEqual(len(self.train_snip_label), 10060, 'Training label not complete')
        num_class = len(set(self.train_snip_label))
        self.assertEqual(num_class, 8, 'Number of classes should be 8')
        for i in xrange(num_class):
            cls_count = np.sum((i + 1) == self.train_snip_label)
            pprint("Number of instances in class %d: %d" % (i + 1, cls_count))

    def testSnippetTest(self):
        self.assertEqual(len(self.test_snip_txt), 2280, 'Test data not complete')
        self.assertEqual(len(self.test_snip_label), 2280, 'Test label not complete')
        num_class = len(set(self.test_snip_label))
        self.assertEqual(num_class, 8, 'Number of classes should be 8')
        for i in xrange(num_class):
            cls_count = np.sum((i + 1) == self.test_snip_label)
            pprint("Number of instances in class %d: %d" % (i + 1, cls_count))
def setUp(self):
    '''
    Load training and test data set, also, loading word-embeddings.
    '''
    np.random.seed(1991)
    sp_train_filename = '../data/refined_train_sp.txt'
    sp_test_filename = '../data/refined_test_sp.txt'
    sp_train_txt, sp_train_label = [], []
    sp_test_txt, sp_test_label = [], []
    start_time = time.time()
    # Read training data set
    with file(sp_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_train_txt.append(txt)
            sp_train_label.append(int(label))
    # Read test data set
    with file(sp_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_test_txt.append(txt)
            sp_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Finished loading training and test data sets...')
    logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    word_embedding = WordEmbedding(embedding_filename)
    start_time = time.time()
    # Starting and ending token for each sentence
    self.blank_token = word_embedding.wordvec('</s>')
    # Store original text representation
    self.sp_train_txt = sp_train_txt
    self.sp_test_txt = sp_test_txt
    # Label representation
    self.sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
    self.sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
    train_size = len(sp_train_txt)
    test_size = len(sp_test_txt)
    # Check size
    assert train_size == self.sp_train_label.shape[0]
    assert test_size == self.sp_test_label.shape[0]
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Sequential modeling for each sentence
    self.sp_train_set, self.sp_test_set = [], []
    sp_train_len, sp_test_len = [], []
    # Embedding for training set; rows 0 and -1 are reserved for the boundary token
    # and are left as zero rows here
    for i, sent in enumerate(sp_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_train_len.append(len(words)+2)
        self.sp_train_set.append(vectors)
    # Embedding for test set
    for i, sent in enumerate(sp_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_test_len.append(len(words)+2)
        self.sp_test_set.append(vectors)
    # Check sequence lengths
    assert sp_train_len == [seq.shape[0] for seq in self.sp_train_set]
    assert sp_test_len == [seq.shape[0] for seq in self.sp_test_set]
    end_time = time.time()
    logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
    # Store data
    self.train_size = train_size
    self.test_size = test_size
    self.word_embedding = word_embedding
    logger.debug('Max sentence length in training set: %d' % max(sp_train_len))
    logger.debug('Max sentence length in test set: %d' % max(sp_test_len))
def testActiveAndPassive(self):
    np.random.seed(1991)
    sp_train_filename = '../data/refined_train_sp.txt'
    sp_test_filename = '../data/refined_test_sp.txt'
    sp_train_txt, sp_train_label = [], []
    sp_test_txt, sp_test_label = [], []
    start_time = time.time()
    # Read training data set
    with file(sp_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_train_txt.append(txt)
            sp_train_label.append(int(label))
    # Read test data set
    with file(sp_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_test_txt.append(txt)
            sp_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Finished loading training and test data sets...')
    logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    word_embedding = WordEmbedding(embedding_filename)
    start_time = time.time()
    # Starting and ending token for each sentence
    blank_token = word_embedding.wordvec('</s>')
    # Label representation
    sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
    sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
    train_size = len(sp_train_txt)
    test_size = len(sp_test_txt)
    # Check size
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Sequential modeling for each sentence
    sp_train_set, sp_test_set = [], []
    # Embedding for training set
    for i, sent in enumerate(sp_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_train_set.append(vectors)
    # Embedding for test set
    for i, sent in enumerate(sp_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_test_set.append(vectors)
    end_time = time.time()
    logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
    # Now, start training
    start_time = time.time()
    grbagger = GrCNNBagger(self.configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
    learn_rate = 2e-2
    # Training with mini-batch gradient descent and AdaGrad-style scaling
    epoch = 200
    batch_size = 10
    start_time = time.time()
    highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
    training_acc, training_cost = [], []
    test_acc, test_cost = [], []
    try:
        sample_size = 0
        fuedge_factor = 1e-6
        for i in xrange(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)
            # rate = learn_rate / (1+i)
            rate = learn_rate
            # Training
            num_batch = train_size / batch_size
            for k in xrange(num_batch):
                accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                                for param in grbagger.params]
                hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                              for param in grbagger.params]
                for j in xrange(k*batch_size, (k+1)*batch_size):
                    results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                    for r in results:
                        if np.isnan(np.sum(r)):
                            logger.debug('*' * 50)
                            logger.debug('Error!!!!!')
                            logger.debug('NaN found at %dth training instance' % j)
                            logger.debug('*' * 50)
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == sp_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= batch_size
                    accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            # Last incomplete mini-batch, if any
            accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                            for param in grbagger.params]
            hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                          for param in grbagger.params]
            if num_batch * batch_size < train_size:
                for j in xrange(num_batch*batch_size, train_size):
                    results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == sp_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= train_size-num_batch*batch_size
                    accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            train_accuracy = float(correct_count) / train_size
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f'
                         % (i, costs, train_accuracy))
            ## Adding training accuracy and training cost
            training_acc.append(train_accuracy)
            training_cost.append(costs)
            if train_accuracy > highest_train_accuracy:
                highest_train_accuracy = train_accuracy
            # Testing
            correct_count = 0
            costs = 0.0
            for j in xrange(test_size):
                pred = grbagger.predict(sp_test_set[j])
                cost = grbagger.show_cost(sp_test_set[j], sp_test_label[j])
                if pred == sp_test_label[j]:
                    correct_count += 1
                costs += cost
            test_accuracy = float(correct_count) / test_size
            ## Adding test accuracy and test cost
            test_acc.append(test_accuracy)
            test_cost.append(costs)
            logger.debug('Test accuracy: %f' % test_accuracy)
            if test_accuracy > highest_test_accuracy:
                highest_test_accuracy = test_accuracy
            # Sampling to show the weights and experts of training and test instances
            logger.debug('Training Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(train_size)
                weights = grbagger.show_weights(sp_train_set[idx])
                scores = grbagger.show_scores(sp_train_set[idx])
                prob = grbagger.show_prob(sp_train_set[idx])
                label = sp_train_label[idx]
                logger.debug('Training idx: {}'.format(idx))
                logger.debug('Training scores: {}'.format(scores))
                logger.debug('Training weights: {}'.format(weights))
                logger.debug('Training probability: {}'.format(prob))
                logger.debug('Training label: {}'.format(label))
                logger.debug('-' * 50)
            logger.debug('Test Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(test_size)
                weights = grbagger.show_weights(sp_test_set[idx])
                scores = grbagger.show_scores(sp_test_set[idx])
                prob = grbagger.show_prob(sp_test_set[idx])
                label = sp_test_label[idx]
                logger.debug('Test idx: {}'.format(idx))
                logger.debug('Test scores: {}'.format(scores))
                logger.debug('Test weights: {}'.format(weights))
                logger.debug('Test probability: {}'.format(prob))
                logger.debug('Test label: {}'.format(label))
                logger.debug('-' * 50)
            # Check norms of the model parameters
            for param in grbagger.params:
                val = param.get_value(borrow=True)
                norm = np.sqrt(np.sum(np.square(val)))
                logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
    except:
        logger.debug('Error appeared!')
        traceback.print_exc(file=sys.stdout)
        logger.debug('-' * 50)
    finally:
        end_time = time.time()
        logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
        logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
        logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
        GrCNNBagger.save('sp-grbagger.model', grbagger)
        # Save all the training and test records
        training_acc = np.asarray(training_acc)
        training_cost = np.asarray(training_cost)
        test_acc = np.asarray(test_acc)
        test_cost = np.asarray(test_cost)
        with file('sp-records.npy', 'w') as fout:
            np.save(fout, training_acc)
            np.save(fout, training_cost)
            np.save(fout, test_acc)
            np.save(fout, test_cost)
        logger.debug('Training and test records saved to sp-records.npy...')
        logger.debug('Finished...')
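# The records above are four arrays appended to one file with successive np.save calls,
# so they have to be read back with the same number of np.load calls, in the same order.
# A small sketch of that (file name as used above; the variable names are illustrative):
import numpy as np

with open('sp-records.npy', 'rb') as fin:
    training_acc = np.load(fin)
    training_cost = np.load(fin)
    test_acc = np.load(fin)
    test_cost = np.load(fin)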
def testSentimentFineTune(self): ''' Build a small model and use it on sentiment analysis task. With fine-tunning the word-embedding matrix. ''' np.random.seed(1991) fname = './grCNN.conf' configer = GrCNNConfiger(fname) senti_train_filename = '../data/sentiment-train.txt' # senti_train_filename = '../data/sentiment-train-phrases.txt' senti_test_filename = '../data/sentiment-test.txt' senti_train_txt, senti_train_label = [], [] senti_test_txt, senti_test_label = [], [] start_time = time.time() # Read training data set with file(senti_train_filename, 'r') as fin: reader = csv.reader(fin, delimiter='|') for txt, label in reader: senti_train_txt.append(txt) senti_train_label.append(int(label)) # Read test data set with file(senti_test_filename, 'r') as fin: reader = csv.reader(fin, delimiter='|') for txt, label in reader: senti_test_txt.append(txt) senti_test_label.append(int(label)) end_time = time.time() logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time)) embedding_filename = '../data/wiki_embeddings.txt' # Load training/test data sets and wiki-embeddings word_embedding = WordEmbedding(embedding_filename) embed_dim = word_embedding.embedding_dim() start_time = time.time() blank_index = word_embedding.word2index('</s>') logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index))) # Word-vector representation senti_train_label = np.asarray(senti_train_label, dtype=np.int32) senti_test_label = np.asarray(senti_test_label, dtype=np.int32) train_size = len(senti_train_txt) test_size = len(senti_test_txt) # Check size logger.debug('Training size: %d' % train_size) logger.debug('Test size: %d' % test_size) # Shuffling for all the instances start_time = time.time() rindex = np.arange(train_size) tindex = np.arange(test_size) np.random.shuffle(rindex) np.random.shuffle(tindex) # Shuffle label senti_train_label = senti_train_label[rindex] senti_test_label = senti_test_label[tindex] # Shuffle text senti_train_txt = list(np.asarray(senti_train_txt)[rindex]) senti_test_txt = list(np.asarray(senti_test_txt)[tindex]) end_time = time.time() logger.debug('Time used to shuffle all the data: %f seconds.' 
% (end_time-start_time)) # Compute word embedding senti_train_set = [] senti_test_set = [] # Record the index of each word in each sentence for only once senti_train_word_index = [] senti_test_word_index = [] # Record the sparse input indicator matrix only once for fast computation senti_train_sparse_select = [] senti_test_sparse_select = [] # Embedding for training set for i, sent in enumerate(senti_train_txt): words = sent.split() words = [word.lower() for word in words] vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32) vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words]) indices = [blank_index] indices += [word_embedding.word2index(word) for word in words] indices += [blank_index] sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX) sparse_select[range(len(words)+2), indices] = 1.0 sparse_select = csc_matrix(sparse_select) senti_train_set.append(vectors) senti_train_word_index.append(indices) senti_train_sparse_select.append(sparse_select) # Embedding for test set for i, sent in enumerate(senti_test_txt): words = sent.split() words = [word.lower() for word in words] vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32) vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words]) indices = [blank_index] indices += [word_embedding.word2index(word) for word in words] indices += [blank_index] sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX) sparse_select[range(len(words)+2), indices] = 1.0 sparse_select = csc_matrix(sparse_select) senti_test_set.append(vectors) senti_test_word_index.append(indices) senti_test_sparse_select.append(sparse_select) end_time = time.time() logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time)) p_count = np.sum(senti_train_label) logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size)) logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size)) p_count = np.sum(senti_test_label) logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size)) logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size)) # Now, start training start_time = time.time() grbagger = GrCNNBagger(configer, verbose=True) end_time = time.time() logger.debug('Time used to build the model: %f seconds.' 
% (end_time-start_time)) learn_rate = 0.02 # Training using stochastic gradient descent algorithm epoch = 200 batch_size = 20 start_time = time.time() highest_train_accuracy, highest_test_accuracy = 0.0, 0.0 track_training_acc, track_training_cost = [], [] track_test_acc, track_test_cost = [], [] training_threshold_epoch = 30 try: sample_size = 0 fuedge_factor = 1e-6 # accumu matrix for word-embedding matrix # hist matrix for word-embedding matrix accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX) hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX) for i in xrange(epoch): costs = 0.0 correct_count = 0 logger.debug('=' * 50) # rate = learn_rate / (1+i) rate = learn_rate # Training num_batch = train_size / batch_size for k in xrange(num_batch): # Clear all the cache accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params] hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params] if i > training_threshold_epoch: accumu_embedding[:] = 0.0 hist_embedding[:] = 0.0 for j in xrange(k*batch_size, (k+1)*batch_size): train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding) results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j]) input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j]) grads, cost, pred = results[:-2], results[-2], results[-1] if pred == senti_train_label[j]: correct_count += 1 costs += cost for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads): accumu_grad += grad hist_grad += np.square(grad) ## Update the word-embedding matrix if i > training_threshold_epoch: tmp = senti_train_sparse_select[j].T.dot(input_grad) accumu_embedding += tmp hist_embedding += np.square(tmp) # Updating model parameters for accumu_grad, hist_grad in zip(accumu_grads, hist_grads): accumu_grad /= batch_size accumu_grad /= fuedge_factor + np.sqrt(hist_grad) # Updating word-embedding matrix if i > training_threshold_epoch: accumu_embedding /= batch_size accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding) word_embedding._embedding -= rate * accumu_embedding grbagger.update_params(accumu_grads, rate) # Clear all the cache again accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params] hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params] if i > training_threshold_epoch: accumu_embedding[:] = 0.0 hist_embedding[:] = 0.0 if num_batch * batch_size < train_size: for j in xrange(num_batch*batch_size, train_size): train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding) results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j]) input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j]) grads, cost, pred = results[:-2], results[-2], results[-1] if pred == senti_train_label[j]: correct_count += 1 costs += cost for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads): accumu_grad += grad hist_grad += np.square(grad) ## Update the word-embedding matrix if i > training_threshold_epoch: tmp = senti_train_sparse_select[j].T.dot(input_grad) accumu_embedding += tmp hist_embedding += np.square(tmp) # Normalizing model parameters for accumu_grad, hist_grad in zip(accumu_grads, hist_grads): accumu_grad /= train_size-num_batch*batch_size accumu_grad /= fuedge_factor + np.sqrt(hist_grad) # Normalizing word-embedding 
matrix if i > training_threshold_epoch: accumu_embedding /= train_size-num_batch*batch_size accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding) word_embedding._embedding -= rate * accumu_embedding # Updating all the parameters grbagger.update_params(accumu_grads, rate) train_accuracy = float(correct_count) / train_size logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' % (i, costs, train_accuracy)) # Append all the numbers track_training_cost.append(costs) track_training_acc.append(train_accuracy) if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy # Testing correct_count = 0 costs = 0.0 for j in xrange(test_size): test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding) pred = grbagger.predict(test_sent_rep) cost = grbagger.show_cost(test_sent_rep, senti_test_label[j]) if pred == senti_test_label[j]: correct_count += 1 costs += cost test_accuracy = float(correct_count) / test_size logger.debug('Test accuracy: %f' % test_accuracy) # Append all the numbers track_test_cost.append(costs) track_test_acc.append(test_accuracy) if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy # Sampling to show the weights and experts of training and test instances logger.debug('Training Sampling: ') for j in xrange(sample_size): idx = np.random.randint(train_size) weights = grbagger.show_weights(senti_train_set[idx]) scores = grbagger.show_scores(senti_train_set[idx]) prob = grbagger.show_prob(senti_train_set[idx]) label = senti_train_label[idx] logger.debug('Training idx: {}'.format(idx)) logger.debug('Training scores: {}'.format(scores)) logger.debug('Training weights: {}'.format(weights)) logger.debug('Training probability: {}'.format(prob)) logger.debug('Training label: {}'.format(label)) logger.debug('-' * 50) logger.debug('Test Sampling: ') for j in xrange(sample_size): idx = np.random.randint(test_size) weights = grbagger.show_weights(senti_test_set[idx]) scores = grbagger.show_scores(senti_test_set[idx]) prob = grbagger.show_prob(senti_test_set[idx]) label = senti_test_label[idx] logger.debug('Test idx: {}'.format(idx)) logger.debug('Test scores: {}'.format(scores)) logger.debug('Test weights: {}'.format(weights)) logger.debug('Test probability: {}'.format(prob)) logger.debug('Test label: {}'.format(label)) logger.debug('-' * 50) # Check norms of the model parameter for param in grbagger.params: val = param.get_value(borrow=True) norm = np.sqrt(np.sum(np.square(val))) logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm)) wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding))) logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm)) except: logger.debug('Error appeared!') traceback.print_exc(file=sys.stdout) logger.debug('-' * 50) finally: end_time = time.time() logger.debug('Time used for training: %f seconds.' 
% (end_time-start_time)) logger.debug('Highest training accuracy: %f' % highest_train_accuracy) logger.debug('Highest test accuracy: %f' % highest_test_accuracy) GrCNNBagger.save('fine-grbagger.model', grbagger) # Save all the tracking numbers track_training_acc = np.asarray(track_training_acc) track_training_cost = np.asarray(track_training_cost) track_test_acc = np.asarray(track_test_acc) track_test_cost = np.asarray(track_test_cost) with file('fine-senti-records.npy', 'w') as fout: np.save(fout, track_training_acc) np.save(fout, track_training_cost) np.save(fout, track_test_acc) np.save(fout, track_test_cost) logger.debug('Training and test records saved to fine-senti-records.npy...') logger.debug('Finished...')
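# testSentimentFineTune above represents each sentence by a sparse selection matrix S of
# shape (sentence length + 2, vocabulary size) with a single 1 per row, so that
# S.dot(E) picks the word vectors out of the embedding matrix E, and S.T.dot(dL/dX)
# scatters the input gradient back onto the rows of E for fine-tuning. A minimal sketch
# of that construction, assuming only numpy/scipy and the word2index/dict_size API used
# above (the function name is illustrative):
import numpy as np
from scipy.sparse import lil_matrix, csc_matrix

def sentence_selection_matrix(word_embedding, words, blank_index):
    '''One-hot row per token, with the boundary token </s> at both ends.'''
    indices = [blank_index] + [word_embedding.word2index(w) for w in words] + [blank_index]
    select = lil_matrix((len(indices), word_embedding.dict_size()), dtype='float32')
    select[range(len(indices)), indices] = 1.0
    return csc_matrix(select)

# Usage: sent_rep = select.dot(embedding_matrix) gives the (len+2, embed_dim) input,
# and select.T.dot(input_grad) accumulates the gradient for the selected embedding rows.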
class TestOnLarge(unittest.TestCase): ''' Test the initial sentence model on Wiki-Data set, which contains 39,746,550 sentences, 782,603,381 words ''' def setUp(self): train_txt_filename = '../data/wiki_sentence.txt' wiki_filename = '../data/wiki_embeddings.txt' start_time = time.time() self.word_embedding = WordEmbedding(wiki_filename) with file(train_txt_filename, 'rb') as fin: self.train_txt = fin.readlines() end_time = time.time() # Since the maximum length in the task of sentiment_analysis is 56, during training # we will set 56 as the maximum length of each sentence self.max_length = 56 self.num_sent = len(self.train_txt) self.batch_size = 2000 self.nepoch = 5 pprint('Time used to load wiki sentences into memory: %f seconds.' % (end_time - start_time)) pprint('Number of sentences in the data set: %d' % len(self.train_txt)) def testTrain(self): ''' Train Auto-Encoder + SoftmaxLayer on batch learning mode. ''' input_dim, hidden_dim = self.max_length * self.word_embedding.embedding_dim( ), 500 # Build AutoEncoder + SoftmaxLayer start_time = time.time() seed = 1991 input_matrix = T.matrix(name='input') num_in, num_out = input_dim, hidden_dim act = Activation('tanh') is_denoising, is_sparse = True, False lambda1, mask = 1e-4, 0.5 rng = RandomStreams(seed) sent_model = SentModel(input_matrix, (num_in, num_out), act, is_denoising, is_sparse, lambda1, mask, rng, verbose=True) end_time = time.time() pprint('Time used to build the model: %f seconds.' % (end_time - start_time)) # Loading training data and start batch training mode num_batch = self.num_sent / self.batch_size learn_rate = 0.1 # Pretraining pprint('Start pretraining...') start_time = time.time() for i in xrange(self.nepoch): # Batch training pprint('Training epoch: %d' % i) for j in xrange(num_batch): train_set = np.zeros( (self.batch_size, self.max_length * self.word_embedding.embedding_dim()), dtype=floatX) train_txt = self.train_txt[j * self.batch_size:(j + 1) * self.batch_size] for k, sent in enumerate(train_txt): words = sent.split() vectors = np.asarray( [self.word_embedding.wordvec(word) for word in words]) vectors = vectors.flatten() train_set[k, :vectors.shape[0]] = vectors rate = learn_rate cost = sent_model.pretrain(train_set, rate) if (j + 1) % 500 == 0: pprint('Training epoch: %d, Number batch: %d, cost = %f' % (i, j, cost)) # Saving temporary pretraining model in .gz with gzip.GzipFile('./large_pretrain.sent.gz', 'wb') as fout: cPickle.dump(sent_model, fout) end_time = time.time() pprint('Time used for pretraining: %f minutes.' % ((end_time - start_time) / 60.0)) # Fine tuning pprint('Start fine-tuning...') start_time = time.time() for i in xrange(self.nepoch): # Batch training pprint('Training epoch: %d' % i) for j in xrange(num_batch): train_set = np.zeros( (self.batch_size, self.max_length * self.word_embedding.embedding_dim()), dtype=floatX) train_txt = self.train_txt[j * self.batch_size:(j + 1) * self.batch_size] for k, sent in enumerate(train_txt): words = sent.split() vectors = np.asarray( [self.word_embedding.wordvec(word) for word in words]) vectors = vectors.flatten() train_set[k, :vectors.shape[0]] = vectors rate = learn_rate cost = sent_model.finetune(train_set, rate) if (j + 1) % 500 == 0: pprint('Training epoch: %d, Number batch: %d, cost = %f' % (i, j, cost)) # Saving temporary fine-tuning model in .gz with gzip.GzipFile('./large_finetune.sent.gz', 'wb') as fout: cPickle.dump(sent_model, fout) end_time = time.time() pprint('Time used for fine-tuning: %f minutes.' % ((end_time - start_time) / 60.0))
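# TestOnLarge above feeds the autoencoder a fixed-size vector per sentence: the word
# vectors are written into a (max_length * embedding_dim) row, left-aligned and
# zero-padded on the right. A minimal sketch of that layout, assuming sentences are no
# longer than max_length words (the original relies on 56 being the maximum sentence
# length; the explicit truncation here is an added safeguard, and the name is illustrative):
import numpy as np

def fixed_length_sentence_vector(word_embedding, sentence, max_length=56, dtype='float32'):
    dim = word_embedding.embedding_dim()
    row = np.zeros(max_length * dim, dtype=dtype)
    words = sentence.split()[:max_length]
    vectors = np.asarray([word_embedding.wordvec(w) for w in words], dtype=dtype).flatten()
    row[:vectors.shape[0]] = vectors
    return row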
assert len(mr_txt) == len(mr_label)
data_size = len(mr_txt)
logger.info('Size of the data sets: %d' % data_size)
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' % (end_time - start_time))
# Load word-embedding
embedding_filename = './wiki_embeddings.txt.zip'
# Load training/test data sets and wiki-embeddings.
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
blank_index = word_embedding.word2index('</s>')
logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Word-vector representation, zero-padding all the sentences to the maximum length.
max_len = 52
mr_insts = np.zeros((data_size, max_len, word_embedding.embedding_dim()), dtype=np.float32)
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = sent.split()
    words = [word.lower() for word in words]
    l = min(len(words), max_len - 2)
    # vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
    mr_insts[i, 1:l + 1, :] = np.asarray([word_embedding.wordvec(word) for word in words[:l]])
def setUp(self):
    '''
    Load training and test texts and labels in sentiment analysis task, preprocessing.
    '''
    np.random.seed(1991)
    senti_train_filename = '../data/sentiment-train.txt'
    senti_test_filename = '../data/sentiment-test.txt'
    senti_train_txt, senti_train_label = [], []
    senti_test_txt, senti_test_label = [], []
    start_time = time.time()
    # Read training data set
    with file(senti_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_train_txt.append(txt)
            senti_train_label.append(int(label))
    # Read test data set
    with file(senti_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_test_txt.append(txt)
            senti_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    # Load wiki-embeddings
    word_embedding = WordEmbedding(embedding_filename)
    self.token = word_embedding.wordvec('</s>')
    # Store the original text representation
    self.senti_train_txt = senti_train_txt
    self.senti_test_txt = senti_test_txt
    # Word-vector representation
    self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
    self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
    self.train_size = len(senti_train_txt)
    self.test_size = len(senti_test_txt)
    logger.debug('Training set size: %d' % self.train_size)
    logger.debug('Test set size: %d' % self.test_size)
    assert self.train_size == self.senti_train_label.shape[0]
    assert self.test_size == self.senti_test_label.shape[0]
    # Build the word-embedding matrix
    start_time = time.time()
    self.senti_train_set, self.senti_test_set = [], []
    for sent in senti_train_txt:
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[0, :] = self.token
        tmp = np.asarray([word_embedding.wordvec(word) for word in words])
        vectors[1:-1, :] = tmp
        vectors[-1, :] = self.token
        self.senti_train_set.append(vectors)
    for sent in senti_test_txt:
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[0, :] = self.token
        tmp = np.asarray([word_embedding.wordvec(word) for word in words])
        vectors[1:-1, :] = tmp
        vectors[-1, :] = self.token
        self.senti_test_set.append(vectors)
    end_time = time.time()
    logger.debug('Time used to build training and test word embedding matrix: %f seconds.' % (end_time-start_time))
    self.word_embedding = word_embedding
def setUp(self):
    '''
    Load training and test data set, also, loading word-embeddings.
    '''
    np.random.seed(42)
    sp_train_filename = '../data/refined_train_sp.txt'
    sp_test_filename = '../data/refined_test_sp.txt'
    sp_train_txt, sp_train_label = [], []
    sp_test_txt, sp_test_label = [], []
    start_time = time.time()
    # Read training data set
    with file(sp_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_train_txt.append(txt)
            sp_train_label.append(int(label))
    with file(sp_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_test_txt.append(txt)
            sp_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Finished loading training and test data set...')
    logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    word_embedding = WordEmbedding(embedding_filename)
    start_time = time.time()
    # Beginning and trailing token for each sentence
    self.blank_token = word_embedding.wordvec('</s>')
    # Store original text representation
    self.sp_train_txt = sp_train_txt
    self.sp_test_txt = sp_test_txt
    # Store original label
    self.sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
    self.sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
    train_size = len(sp_train_txt)
    test_size = len(sp_test_txt)
    # Check size
    assert train_size == self.sp_train_label.shape[0]
    assert test_size == self.sp_test_label.shape[0]
    # Output the information
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Word-vector representation
    self.sp_train_set, self.sp_test_set = [], []
    sp_train_len, sp_test_len = [], []
    # Embedding for training set
    for i, sent in enumerate(sp_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_train_len.append(len(words)+2)
        self.sp_train_set.append(vectors)
    # Embedding for test set
    for i, sent in enumerate(sp_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_test_len.append(len(words)+2)
        self.sp_test_set.append(vectors)
    # Check word-length
    assert sp_train_len == [seq.shape[0] for seq in self.sp_train_set]
    assert sp_test_len == [seq.shape[0] for seq in self.sp_test_set]
    end_time = time.time()
    logger.debug('Time used to build initial training and test matrix: %f seconds' % (end_time-start_time))
    # Store metadata
    self.train_size = train_size
    self.test_size = test_size
    self.word_embedding = word_embedding
    logger.debug('Sentence of maximum length in training set: %d' % max(sp_train_len))
    logger.debug('Sentence of maximum length in test set: %d' % max(sp_test_len))
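# In the setUp above the first and last rows of each sentence matrix are left as zero
# rows, even though the </s> vector is loaded into self.blank_token. The variant used
# elsewhere in these tests fills both boundary rows with that token instead; a small
# sketch of that construction (helper name illustrative, WordEmbedding API as above):
import numpy as np

def sentence_matrix_with_boundaries(word_embedding, words, dtype='float32'):
    blank_token = word_embedding.wordvec('</s>')
    vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=dtype)
    vectors[0, :] = blank_token
    if words:
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(w) for w in words])
    vectors[-1, :] = blank_token
    return vectors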
def setUp(self):
    '''
    Load training and test texts and labels in sentiment analysis task, preprocessing.
    '''
    np.random.seed(42)
    senti_train_filename = '../data/sentiment-train.txt'
    senti_test_filename = '../data/sentiment-test.txt'
    senti_train_txt, senti_train_label = [], []
    senti_test_txt, senti_test_label = [], []
    start_time = time.time()
    # Record id of words for fine-tuning
    senti_train_words_label, senti_test_words_label = [], []
    # Load Word-Embedding
    embedding_filename = '../data/wiki_embeddings.txt'
    # Load training/test data sets and wiki-embeddings
    word_embedding = WordEmbedding(embedding_filename)
    # Starting and Ending token for each sentence
    self.blank_token = word_embedding.wordvec('</s>')
    self.blank_index = word_embedding.word2index('</s>')
    # Read training data set
    with file(senti_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_train_txt.append(txt)
            senti_train_label.append(int(label))
            words = txt.split()
            words = [word.lower() for word in words]
            tmp_indices = np.zeros(len(words)+2, dtype=np.int32)
            tmp_indices[0] = self.blank_index
            tmp_indices[1:-1] = np.asarray([word_embedding.word2index(word) for word in words])
            tmp_indices[-1] = self.blank_index
            senti_train_words_label.append(tmp_indices)
    # Read test data set
    with file(senti_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_test_txt.append(txt)
            senti_test_label.append(int(label))
            words = txt.split()
            words = [word.lower() for word in words]
            tmp_indices = np.zeros(len(words)+2, dtype=np.int32)
            tmp_indices[0] = self.blank_index
            tmp_indices[1:-1] = np.asarray([word_embedding.word2index(word) for word in words])
            tmp_indices[-1] = self.blank_index
            senti_test_words_label.append(tmp_indices)
    end_time = time.time()
    logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
    start_time = time.time()
    # Store original word index representation
    self.senti_train_words_label = senti_train_words_label
    self.senti_test_words_label = senti_test_words_label
    # Store original text representation
    self.senti_train_txt = senti_train_txt
    self.senti_test_txt = senti_test_txt
    # Word-vector representation
    self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
    self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
    train_size = len(senti_train_txt)
    test_size = len(senti_test_txt)
    # Check size
    assert train_size == self.senti_train_label.shape[0]
    assert test_size == self.senti_test_label.shape[0]
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Sequential modeling for each sentence
    self.senti_train_set, self.senti_test_set = [], []
    senti_train_len, senti_test_len = [], []
    # Embedding for training set
    for i, sent in enumerate(senti_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        senti_train_len.append(len(words)+2)
        self.senti_train_set.append(vectors)
    # Embedding for test set
    for i, sent in enumerate(senti_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        senti_test_len.append(len(words)+2)
        self.senti_test_set.append(vectors)
    assert senti_train_len == [seq.shape[0] for seq in self.senti_train_set]
    assert senti_test_len == [seq.shape[0] for seq in self.senti_test_set]
    end_time = time.time()
    logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
    # Store data
    self.train_size = train_size
    self.test_size = test_size
    self.word_embedding = word_embedding
def setUp(self):
    '''
    Load training and test texts and labels in sentiment analysis task, preprocessing.
    '''
    np.random.seed(1991)
    # senti_train_filename = '../data/sentiment-train.txt'
    senti_train_filename = '../data/sentiment-train-phrases.txt'
    senti_test_filename = '../data/sentiment-test.txt'
    senti_train_txt, senti_train_label = [], []
    senti_test_txt, senti_test_label = [], []
    start_time = time.time()
    # Read training data set
    with file(senti_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_train_txt.append(txt)
            senti_train_label.append(int(label))
    # Read test data set
    with file(senti_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_test_txt.append(txt)
            senti_test_label.append(int(label))
    end_time = time.time()
    pprint('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    # Load training/test data sets and wiki-embeddings
    word_embedding = WordEmbedding(embedding_filename)
    start_time = time.time()
    # Store original text representation
    self.senti_train_txt = senti_train_txt
    self.senti_test_txt = senti_test_txt
    # Word-vector representation
    self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
    self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
    train_size = len(senti_train_txt)
    test_size = len(senti_test_txt)
    # Check size
    assert train_size == self.senti_train_label.shape[0]
    assert test_size == self.senti_test_label.shape[0]
    pprint('Training size: %d' % train_size)
    pprint('Test size: %d' % test_size)
    # Compute word embedding
    self.senti_train_set = np.zeros((train_size, word_embedding.embedding_dim()), dtype=floatX)
    self.senti_test_set = np.zeros((test_size, word_embedding.embedding_dim()), dtype=floatX)
    # Embedding for training set
    for i, sent in enumerate(senti_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        # pprint('Training set, Number of words in sentence %d: %d' % (i, len(words)))
        vectors = np.asarray([word_embedding.wordvec(word) for word in words])
        self.senti_train_set[i, :] = np.mean(vectors, axis=0)
    # Embedding for test set
    for i, sent in enumerate(senti_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        # pprint('Test set, Number of words in sentence %d: %d' % (i, len(words)))
        vectors = np.asarray([word_embedding.wordvec(word) for word in words])
        self.senti_test_set[i, :] = np.mean(vectors, axis=0)
    # Shuffle training and test data set
    train_rand_index = np.random.permutation(train_size)
    test_rand_index = np.random.permutation(test_size)
    self.senti_train_txt = list(np.asarray(self.senti_train_txt)[train_rand_index])
    self.senti_test_txt = list(np.asarray(self.senti_test_txt)[test_rand_index])
    self.senti_train_set = self.senti_train_set[train_rand_index, :]
    self.senti_test_set = self.senti_test_set[test_rand_index, :]
    self.senti_train_label = self.senti_train_label[train_rand_index]
    self.senti_test_label = self.senti_test_label[test_rand_index]
    end_time = time.time()
    pprint('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
    # Store data
    self.train_size = train_size
    self.test_size = test_size
    self.word_embedding = word_embedding
assert len(mr_txt) == len(mr_label)
data_size = len(mr_txt)
logger.info('Size of the data sets: %d' % data_size)
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' % (end_time - start_time))
# Load word-embedding
embedding_filename = './wiki_embeddings.txt'
# Load training/test data sets and wiki-embeddings.
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
blank_index = word_embedding.word2index('</s>')
logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Word-vector representation, zero-padding all the sentences to the maximum length.
max_len = 52
mr_insts = np.zeros((data_size, max_len, word_embedding.embedding_dim()), dtype=np.float32)
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = sent.split()
    words = [word.lower() for word in words]
    # Clamp to the padded length so sentences longer than max_len-2 words do not overflow
    l = min(len(words), max_len - 2)
    # vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
    mr_insts[i, 1:l + 1, :] = np.asarray([word_embedding.wordvec(word) for word in words[:l]])
def testSentiment(self): ''' Build a small model and use it on sentiment analysis task. ''' ''' Load training and test texts and labels in sentiment analysis task, preprocessing. ''' np.random.seed(1991) senti_train_filename = '../data/sentiment-train.txt' # senti_train_filename = '../data/sentiment-train-phrases.txt' senti_test_filename = '../data/sentiment-test.txt' senti_train_txt, senti_train_label = [], [] senti_test_txt, senti_test_label = [], [] start_time = time.time() # Read training data set with file(senti_train_filename, 'r') as fin: reader = csv.reader(fin, delimiter='|') for txt, label in reader: senti_train_txt.append(txt) senti_train_label.append(int(label)) # Read test data set with file(senti_test_filename, 'r') as fin: reader = csv.reader(fin, delimiter='|') for txt, label in reader: senti_test_txt.append(txt) senti_test_label.append(int(label)) end_time = time.time() logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time)) embedding_filename = '../data/wiki_embeddings.txt' # Load training/test data sets and wiki-embeddings word_embedding = WordEmbedding(embedding_filename) embed_dim = word_embedding.embedding_dim() start_time = time.time() # Store original text representation self.senti_train_txt = senti_train_txt self.senti_test_txt = senti_test_txt # Word-vector representation self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32) self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32) train_size = len(senti_train_txt) test_size = len(senti_test_txt) # Check size assert train_size == self.senti_train_label.shape[0] assert test_size == self.senti_test_label.shape[0] logger.debug('Training size: %d' % train_size) logger.debug('Test size: %d' % test_size) # Compute word embedding self.senti_train_set = [] self.senti_test_set = [] # Embedding for training set for i, sent in enumerate(senti_train_txt): words = sent.split() words = [word.lower() for word in words] vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32) vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words]) self.senti_train_set.append(vectors) # Embedding for test set for i, sent in enumerate(senti_test_txt): words = sent.split() words = [word.lower() for word in words] vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32) vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words]) self.senti_test_set.append(vectors) end_time = time.time() logger.debug('Time used to build initial training and test matrix: %f seconds.' 
    # Store data
    self.train_size = train_size
    self.test_size = test_size
    self.word_embedding = word_embedding
    # Shuffling
    rindex = np.arange(train_size)
    tindex = np.arange(test_size)
    np.random.shuffle(rindex)
    np.random.shuffle(tindex)
    self.senti_train_set = list(np.asarray(self.senti_train_set)[rindex])
    self.senti_test_set = list(np.asarray(self.senti_test_set)[tindex])
    self.senti_train_label = self.senti_train_label[rindex]
    self.senti_test_label = self.senti_test_label[tindex]
    senti_train_set, senti_test_set = self.senti_train_set, self.senti_test_set
    senti_train_label, senti_test_label = self.senti_train_label, self.senti_test_label
    p_count = np.sum(senti_train_label)
    logger.debug('Default positive percentage in Train: %f'
                 % (float(p_count) / train_size))
    logger.debug('Default negative percentage in Train: %f'
                 % (float(train_size-p_count) / train_size))
    p_count = np.sum(senti_test_label)
    logger.debug('Default positive percentage in Test: %f'
                 % (float(p_count) / test_size))
    logger.debug('Default negative percentage in Test: %f'
                 % (float(test_size-p_count) / test_size))
    # Now, start training
    start_time = time.time()
    grbagger = GrCNNBagger(self.configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
    learn_rate = 0.02
    # Training using stochastic gradient descent algorithm
    epoch = 200
    batch_size = 20
    start_time = time.time()
    highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
    track_training_acc, track_training_cost = [], []
    track_test_acc, track_test_cost = [], []
    try:
        sample_size = 0
        fudge_factor = 1e-6
        for i in xrange(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)
            # rate = learn_rate / (1+i)
            rate = learn_rate
            # Training
            num_batch = train_size / batch_size
            for k in xrange(num_batch):
                accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                                for param in grbagger.params]
                hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                              for param in grbagger.params]
                for j in xrange(k*batch_size, (k+1)*batch_size):
                    results = grbagger.compute_gradient_and_cost(senti_train_set[j],
                                                                 senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                # AdaGrad-style rescaling of the averaged minibatch gradient
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            # Handle the remaining instances that do not fill a full minibatch
            accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                            for param in grbagger.params]
            hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                          for param in grbagger.params]
            if num_batch * batch_size < train_size:
                for j in xrange(num_batch*batch_size, train_size):
                    results = grbagger.compute_gradient_and_cost(senti_train_set[j],
                                                                 senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= train_size - num_batch*batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            train_accuracy = float(correct_count) / train_size
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f'
                         % (i, costs, train_accuracy))
            # Append all the numbers
            track_training_cost.append(costs)
            track_training_acc.append(train_accuracy)
            if train_accuracy > highest_train_accuracy:
                highest_train_accuracy = train_accuracy
            # Testing
            correct_count = 0
            costs = 0.0
            for j in xrange(test_size):
                pred = grbagger.predict(senti_test_set[j])
                cost = grbagger.show_cost(senti_test_set[j], senti_test_label[j])
                if pred == senti_test_label[j]:
                    correct_count += 1
                costs += cost
            test_accuracy = float(correct_count) / test_size
            logger.debug('Test accuracy: %f' % test_accuracy)
            # Append all the numbers
            track_test_cost.append(costs)
            track_test_acc.append(test_accuracy)
            if test_accuracy > highest_test_accuracy:
                highest_test_accuracy = test_accuracy
            # Sampling to show the weights and experts of training and test instances
            logger.debug('Training Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(train_size)
                weights = grbagger.show_weights(senti_train_set[idx])
                scores = grbagger.show_scores(senti_train_set[idx])
                prob = grbagger.show_prob(senti_train_set[idx])
                label = senti_train_label[idx]
                logger.debug('Training idx: {}'.format(idx))
                logger.debug('Training scores: {}'.format(scores))
                logger.debug('Training weights: {}'.format(weights))
                logger.debug('Training probability: {}'.format(prob))
                logger.debug('Training label: {}'.format(label))
                logger.debug('-' * 50)
            logger.debug('Test Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(test_size)
                weights = grbagger.show_weights(senti_test_set[idx])
                scores = grbagger.show_scores(senti_test_set[idx])
                prob = grbagger.show_prob(senti_test_set[idx])
                label = senti_test_label[idx]
                logger.debug('Test idx: {}'.format(idx))
                logger.debug('Test scores: {}'.format(scores))
                logger.debug('Test weights: {}'.format(weights))
                logger.debug('Test probability: {}'.format(prob))
                logger.debug('Test label: {}'.format(label))
                logger.debug('-' * 50)
            # Check norms of the model parameter
            for param in grbagger.params:
                val = param.get_value(borrow=True)
                norm = np.sqrt(np.sum(np.square(val)))
                logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
    except:
        logger.debug('Error appeared!')
        traceback.print_exc(file=sys.stdout)
        logger.debug('-' * 50)
    finally:
        end_time = time.time()
        logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
        logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
        logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
        GrCNNBagger.save('grbagger.model', grbagger)
        # Save all the tracking numbers
        track_training_acc = np.asarray(track_training_acc)
        track_training_cost = np.asarray(track_training_cost)
        track_test_acc = np.asarray(track_test_acc)
        track_test_cost = np.asarray(track_test_cost)
        with file('senti-records.npy', 'wb') as fout:
            np.save(fout, track_training_acc)
            np.save(fout, track_training_cost)
            np.save(fout, track_test_acc)
            np.save(fout, track_test_cost)
        logger.debug('Training and test records saved to senti-records.npy...')
        logger.debug('Finished...')
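# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original tests): the minibatch loop in
# testSentiment averages per-example gradients and rescales them by the square
# root of the accumulated squared gradients (an AdaGrad-style step) before
# calling grbagger.update_params. The standalone function below restates that
# computation on plain NumPy arrays; the name adagrad_scaled_gradients is
# hypothetical and only the module-level numpy import is assumed.
# ------------------------------------------------------------------------------
def adagrad_scaled_gradients(per_example_grads, fudge_factor=1e-6):
    ''' per_example_grads: list with one entry per training example, each entry
    being a list of np.ndarray gradients (one per model parameter). Returns the
    batch-averaged gradients divided by (fudge_factor + sqrt of the summed
    squared per-example gradients), as in the training loop above. '''
    batch_size = len(per_example_grads)
    accumu_grads = [np.zeros_like(grad) for grad in per_example_grads[0]]
    hist_grads = [np.zeros_like(grad) for grad in per_example_grads[0]]
    for grads in per_example_grads:
        for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
            accumu_grad += grad
            hist_grad += np.square(grad)
    for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
        accumu_grad /= batch_size
        accumu_grad /= fudge_factor + np.sqrt(hist_grad)
    return accumu_grads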
    logger.debug('Finished loading training and test data set...')
    logger.debug('Time used to load training and test pairs: %f seconds.'
                 % (end_time - start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    word_embedding = WordEmbedding(embedding_filename)
    start_time = time.time()
    # Beginning and trailing token for each sentence
    blank_token = word_embedding.wordvec('</s>')
    # Store original text representation
    train_size = len(train_pairs_txt)
    test_size = len(test_pairs_txt)
    logger.debug('Size of training pairs: %d' % train_size)
    logger.debug('Size of test pairs: %d' % test_size)
    train_pairs_set, test_pairs_set = [], []
    # Build word embedding for both training and test data sets
    edim = word_embedding.embedding_dim()
    # Build training data set
    for i, (psent, qsent) in enumerate(train_pairs_txt):
        pwords = psent.split()
        pwords = [pword.lower() for pword in pwords]
        pvectors = np.zeros((len(pwords) + 2, edim), dtype=floatX)
        pvectors[0, :], pvectors[-1, :] = blank_token, blank_token
        pvectors[1:-1, :] = np.asarray(
            [word_embedding.wordvec(pword) for pword in pwords], dtype=floatX)
        qwords = qsent.split()
        qwords = [qword.lower() for qword in qwords]
        qvectors = np.zeros((len(qwords) + 2, edim), dtype=floatX)
        qvectors[0, :], qvectors[-1, :] = blank_token, blank_token
        qvectors[1:-1, :] = np.asarray(
            [word_embedding.wordvec(qword) for qword in qwords], dtype=floatX)
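# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the pair-building loop
# above wraps every sentence with the '</s>' blank token as its first and last
# row. The helper below restates that wrapping for a single sentence; the name
# _embed_with_blanks is hypothetical, and it assumes the word_embedding object
# and the floatX/np names already used in this module.
# ------------------------------------------------------------------------------
def _embed_with_blanks(sent, word_embedding, blank_token):
    ''' Lower-case and embed a sentence, placing blank_token in the first and
    last rows of the resulting (len(words) + 2, embedding_dim) matrix. '''
    words = [word.lower() for word in sent.split()]
    vectors = np.zeros((len(words) + 2, word_embedding.embedding_dim()), dtype=floatX)
    vectors[0, :], vectors[-1, :] = blank_token, blank_token
    vectors[1:-1, :] = np.asarray(
        [word_embedding.wordvec(word) for word in words], dtype=floatX)
    return vectors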
    assert len(mr_txt) == len(mr_label)
    data_size = len(mr_txt)
    logger.info('Size of the data sets: %d' % data_size)
    random_index = np.arange(data_size)
    np.random.shuffle(random_index)
    mr_txt = list(np.asarray(mr_txt)[random_index])
    mr_label = list(np.asarray(mr_label)[random_index])
    end_time = time.time()
    # Record timing
    logger.info('Time used to load and shuffle MR dataset: %f seconds.'
                % (end_time - start_time))
    # Load word-embedding
    embedding_filename = './wiki_embeddings.txt'
    # Load training/test data sets and wiki-embeddings.
    word_embedding = WordEmbedding(embedding_filename)
    embed_dim = word_embedding.embedding_dim()
    start_time = time.time()
    blank_index = word_embedding.word2index('</s>')
    logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
    # Word-vector representation, zero-padding all the sentences to the maximum length.
    max_len = 52
    mr_insts = np.zeros((data_size, max_len, word_embedding.embedding_dim()),
                        dtype=np.float32)
    mr_label = np.asarray(mr_label)[:, np.newaxis]
    for i, sent in enumerate(mr_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        l = len(words)
        # Truncate the sentence if it is longer than max_len - 2 words
        if l > max_len - 2:
            words = words[:max_len - 2]
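        # Illustrative sketch (not part of the original code): one plausible way to
        # finish the zero-padding step that this loop performs, restated as a
        # standalone helper and kept commented out so it does not interfere with
        # the surrounding loop. The name _truncate_and_pad and the exact placement
        # of the padding rows are assumptions, not taken from the original code.
        #
        # def _truncate_and_pad(words, word_embedding, max_len):
        #     if len(words) > max_len - 2:
        #         words = words[:max_len - 2]
        #     mat = np.zeros((max_len, word_embedding.embedding_dim()),
        #                    dtype=np.float32)
        #     mat[1:len(words) + 1] = np.asarray(
        #         [word_embedding.wordvec(word) for word in words], dtype=np.float32)
        #     return mat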