def testGroup(self):
    '''
    Build a small synthetic data set and labels for training GrCNNBagger.
    '''
    np.random.seed(42)
    train_size, input_dim = 500, 50
    data = [np.random.rand(np.random.randint(5, 20), input_dim).astype(np.float32)
            for _ in xrange(train_size)]
    p = 0.45
    labels = np.random.binomial(1, p, train_size)
    logger.debug('Building GrCNNBagger...')
    start_time = time.time()
    grbagger = GrCNNBagger(self.configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
    learn_rate = 0.01
    # Training using the stochastic gradient descent algorithm
    epoch = 60
    training_cost, training_acc = [], []
    for i in xrange(epoch):
        costs = 0.0
        correct_count = 0
        logger.debug('=' * 50)
        for j in xrange(train_size):
            results = grbagger.compute_gradient_and_cost(data[j], labels[j])
            prob = grbagger.show_prob(data[j])
            scores = grbagger.show_scores(data[j])
            weights = grbagger.show_weights(data[j])
            grads, cost, pred = results[:-2], results[-2], results[-1]
            if pred == labels[j]:
                correct_count += 1
            if np.isnan(cost):
                logger.error('Error here, in %dth epoch, %dth instance' % (i, j))
                logger.error('Gradients: ')
                logger.error(grads)
            costs += cost
            grbagger.update_params(grads, learn_rate)
        logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' %
                     (i, costs, float(correct_count) / train_size))
        training_cost.append(costs)
        training_acc.append(float(correct_count) / train_size)
    logger.debug('Training finished...')
    logger.debug('Number of parameters in the model: %d' % grbagger.num_params)
    logger.debug('Output the weighting vector for each input instance: ')
    # Record instance scores and instance weights
    instance_scores, instance_weights, instance_labels = [], [], labels
    for j in xrange(train_size):
        weights = grbagger.show_weights(data[j])
        scores = grbagger.show_scores(data[j])
        weights = weights.ravel()
        scores = scores.ravel()
        pred = np.sum(scores * weights) >= 0.5
        logger.debug('Instance %d, true label: %d, predicted label: %d' % (j, labels[j], pred))
        logger.debug('scores = {}'.format(scores))
        logger.debug('weights = {}'.format(weights))
        logger.debug('-' * 50)
        instance_scores.append(scores)
        instance_weights.append(weights)
def testHierarchical(self):
    logger.debug('Inside testHierarchical...')
    grbagger = GrCNNBagger(self.configer, verbose=True)
    sent = np.random.rand(25, 50).astype(np.float32)
    logger.debug('Summarized Hierarchy: ')
    logger.debug(grbagger.show_hierarchy(sent))
    logger.debug('Scores from hierarchical experts: ')
    logger.debug(grbagger.show_scores(sent))
    logger.debug('Weights of hierarchical experts: ')
    logger.debug(grbagger.show_weights(sent))
import pylab as plt
# Set the basic configuration of the logging system
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M')
sys.path.append('../source/')
logger = logging.getLogger(__name__)

from grcnn import GrCNNBagger
from config import GrCNNConfiger
from wordvec import WordEmbedding

model_filename = './grbagger.model'
start_time = time.time()
grbagger = GrCNNBagger.load(model_filename)
end_time = time.time()
logger.debug('Time used to load the model: %f seconds.' % (end_time-start_time))
np.random.seed(1991)
senti_train_filename = '../data/sentiment-train.txt'
senti_test_filename = '../data/sentiment-test.txt'
senti_train_txt, senti_train_label = [], []
senti_test_txt, senti_test_label = [], []
start_time = time.time()
# Read training data set
with open(senti_train_filename, 'r') as fin:
    reader = csv.reader(fin, delimiter='|')
    for txt, label in reader:
        senti_train_txt.append(txt)
        senti_train_label.append(int(label))
    senti_test_word_index.append(indices)
    senti_test_sparse_select.append(sparse_select)
end_time = time.time()
logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
p_count = np.sum(senti_train_label)
logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
p_count = np.sum(senti_test_label)
logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
# If there is a designated model, use it; otherwise start from scratch
start_time = time.time()
if args.model == 'NONE':
    logger.debug('No designated model, training from scratch...')
    configer = GrCNNConfiger(args.config)
    grbagger = GrCNNBagger(configer, verbose=True)
else:
    logger.debug('There is a designated model, loading: {}'.format(args.model))
    grbagger = GrCNNBagger.load(args.model)
end_time = time.time()
logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
logger.debug('Training start...')
# Initialize model training configuration
learn_rate = args.rate
batch_size = args.size
epoch = args.epoch
# Training using AdaGrad
training_threshold_epoch = 30
highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
track_training_acc, track_training_cost = [], []
track_test_acc, track_test_cost = [], []
def testActiveAndPassive(self):
    np.random.seed(1991)
    sp_train_filename = '../data/refined_train_sp.txt'
    sp_test_filename = '../data/refined_test_sp.txt'
    sp_train_txt, sp_train_label = [], []
    sp_test_txt, sp_test_label = [], []
    start_time = time.time()
    # Read training data set
    with open(sp_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_train_txt.append(txt)
            sp_train_label.append(int(label))
    # Read test data set
    with open(sp_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            sp_test_txt.append(txt)
            sp_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Finished loading training and test data sets...')
    logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    word_embedding = WordEmbedding(embedding_filename)
    start_time = time.time()
    # Starting and ending token for each sentence
    blank_token = word_embedding.wordvec('</s>')
    # Word-vector representation
    sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
    sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
    train_size = len(sp_train_txt)
    test_size = len(sp_test_txt)
    # Check size
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Sequential modeling for each sentence
    sp_train_set, sp_test_set = [], []
    # Embedding for training set
    for i, sent in enumerate(sp_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_train_set.append(vectors)
    # Embedding for test set
    for i, sent in enumerate(sp_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
        vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
        sp_test_set.append(vectors)
    end_time = time.time()
    logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
    # Now, start training
    start_time = time.time()
    grbagger = GrCNNBagger(self.configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
    learn_rate = 2e-2
    # Training with minibatch gradient descent and AdaGrad-style scaling
    epoch = 200
    batch_size = 10
    start_time = time.time()
    highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
    training_acc, training_cost = [], []
    test_acc, test_cost = [], []
    try:
        sample_size = 0
        fudge_factor = 1e-6
        for i in xrange(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)
            # rate = learn_rate / (1+i)
            rate = learn_rate
            # Training
            num_batch = train_size / batch_size
            for k in xrange(num_batch):
                accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                                for param in grbagger.params]
                hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                              for param in grbagger.params]
                for j in xrange(k*batch_size, (k+1)*batch_size):
                    results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                    for r in results:
                        if np.isnan(np.sum(r)):
                            logger.debug('*' * 50)
                            logger.debug('Error!!!!!')
                            logger.debug('NaN found at %dth training instance' % j)
                            logger.debug('*' * 50)
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == sp_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            # Handle the leftover instances that do not fill a whole batch
            accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                            for param in grbagger.params]
            hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                          for param in grbagger.params]
            if num_batch * batch_size < train_size:
                for j in xrange(num_batch*batch_size, train_size):
                    results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == sp_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= train_size - num_batch*batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            train_accuracy = float(correct_count) / train_size
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' %
                         (i, costs, train_accuracy))
            # Add training accuracy and training cost
            training_acc.append(train_accuracy)
            training_cost.append(costs)
            if train_accuracy > highest_train_accuracy:
                highest_train_accuracy = train_accuracy
            # Testing
            correct_count = 0
            costs = 0.0
            for j in xrange(test_size):
                pred = grbagger.predict(sp_test_set[j])
                cost = grbagger.show_cost(sp_test_set[j], sp_test_label[j])
                if pred == sp_test_label[j]:
                    correct_count += 1
                costs += cost
            test_accuracy = float(correct_count) / test_size
            # Add test accuracy and test cost
            test_acc.append(test_accuracy)
            test_cost.append(costs)
            logger.debug('Test accuracy: %f' % test_accuracy)
            if test_accuracy > highest_test_accuracy:
                highest_test_accuracy = test_accuracy
            # Sampling to show the weights and experts of training and test instances
            logger.debug('Training Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(train_size)
                weights = grbagger.show_weights(sp_train_set[idx])
                scores = grbagger.show_scores(sp_train_set[idx])
                prob = grbagger.show_prob(sp_train_set[idx])
                label = sp_train_label[idx]
                logger.debug('Training idx: {}'.format(idx))
                logger.debug('Training scores: {}'.format(scores))
                logger.debug('Training weights: {}'.format(weights))
                logger.debug('Training probability: {}'.format(prob))
                logger.debug('Training label: {}'.format(label))
                logger.debug('-' * 50)
            logger.debug('Test Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(test_size)
                weights = grbagger.show_weights(sp_test_set[idx])
                scores = grbagger.show_scores(sp_test_set[idx])
                prob = grbagger.show_prob(sp_test_set[idx])
                label = sp_test_label[idx]
                logger.debug('Test idx: {}'.format(idx))
                logger.debug('Test scores: {}'.format(scores))
                logger.debug('Test weights: {}'.format(weights))
                logger.debug('Test probability: {}'.format(prob))
                logger.debug('Test label: {}'.format(label))
                logger.debug('-' * 50)
            # Check norms of the model parameters
            for param in grbagger.params:
                val = param.get_value(borrow=True)
                norm = np.sqrt(np.sum(np.square(val)))
                logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
    except:
        logger.debug('Error appeared!')
        traceback.print_exc(file=sys.stdout)
        logger.debug('-' * 50)
    finally:
        end_time = time.time()
        logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
        logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
        logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
        GrCNNBagger.save('sp-grbagger.model', grbagger)
        # Save all the training and test records
        training_acc = np.asarray(training_acc)
        training_cost = np.asarray(training_cost)
        test_acc = np.asarray(test_acc)
        test_cost = np.asarray(test_cost)
        with open('sp-records.npy', 'wb') as fout:
            np.save(fout, training_acc)
            np.save(fout, training_cost)
            np.save(fout, test_acc)
            np.save(fout, test_cost)
        logger.debug('Training and test records saved to sp-records.npy...')
        logger.debug('Finished...')
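# A minimal, self-contained sketch (not part of the original tests) of the AdaGrad-style
# minibatch update used in testActiveAndPassive, testSentiment and testSentimentFineTune:
# average the accumulated gradients over the batch, scale each by the square root of the
# accumulated squared gradients plus a small fudge factor, and apply one step.
# `model` is assumed to expose `params` (Theano shared variables), `compute_gradient_and_cost`
# and `update_params`, as GrCNNBagger does; the helper name itself is hypothetical.
def adagrad_minibatch_update(model, instances, labels, rate, fudge_factor=1e-6):
    accumu_grads = [np.zeros(p.get_value().shape, dtype=np.float32) for p in model.params]
    hist_grads = [np.zeros(p.get_value().shape, dtype=np.float32) for p in model.params]
    for x, y in zip(instances, labels):
        # The last two entries of results are cost and prediction; the rest are gradients
        grads = model.compute_gradient_and_cost(x, y)[:-2]
        for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
            accumu_grad += grad
            hist_grad += np.square(grad)
    for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
        accumu_grad /= len(instances)
        accumu_grad /= fudge_factor + np.sqrt(hist_grad)
    model.update_params(accumu_grads, rate)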
def testBuilding(self):
    logger.debug('Inside testBuilding...')
    grbagger = GrCNNBagger(self.configer, verbose=True)
def testSentimentFineTune(self):
    '''
    Build a small model and use it on the sentiment analysis task,
    with fine-tuning of the word-embedding matrix.
    '''
    np.random.seed(1991)
    fname = './grCNN.conf'
    configer = GrCNNConfiger(fname)
    senti_train_filename = '../data/sentiment-train.txt'
    # senti_train_filename = '../data/sentiment-train-phrases.txt'
    senti_test_filename = '../data/sentiment-test.txt'
    senti_train_txt, senti_train_label = [], []
    senti_test_txt, senti_test_label = [], []
    start_time = time.time()
    # Read training data set
    with open(senti_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_train_txt.append(txt)
            senti_train_label.append(int(label))
    # Read test data set
    with open(senti_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_test_txt.append(txt)
            senti_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    # Load training/test data sets and wiki-embeddings
    word_embedding = WordEmbedding(embedding_filename)
    embed_dim = word_embedding.embedding_dim()
    start_time = time.time()
    blank_index = word_embedding.word2index('</s>')
    logger.debug('Blank token: {}'.format(word_embedding.index2word(blank_index)))
    # Word-vector representation
    senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
    senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
    train_size = len(senti_train_txt)
    test_size = len(senti_test_txt)
    # Check size
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Shuffle all the instances
    start_time = time.time()
    rindex = np.arange(train_size)
    tindex = np.arange(test_size)
    np.random.shuffle(rindex)
    np.random.shuffle(tindex)
    # Shuffle labels
    senti_train_label = senti_train_label[rindex]
    senti_test_label = senti_test_label[tindex]
    # Shuffle texts
    senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
    senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
    end_time = time.time()
    logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time-start_time))
    # Compute word embedding
    senti_train_set = []
    senti_test_set = []
    # Record the index of each word in each sentence only once
    senti_train_word_index = []
    senti_test_word_index = []
    # Record the sparse input-indicator matrix only once for fast computation
    senti_train_sparse_select = []
    senti_test_sparse_select = []
    # Embedding for training set
    for i, sent in enumerate(senti_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        indices = [blank_index]
        indices += [word_embedding.word2index(word) for word in words]
        indices += [blank_index]
        sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
        sparse_select[range(len(words)+2), indices] = 1.0
        sparse_select = csc_matrix(sparse_select)
        senti_train_set.append(vectors)
        senti_train_word_index.append(indices)
        senti_train_sparse_select.append(sparse_select)
    # Embedding for test set
    for i, sent in enumerate(senti_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        indices = [blank_index]
        indices += [word_embedding.word2index(word) for word in words]
        indices += [blank_index]
        sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
        sparse_select[range(len(words)+2), indices] = 1.0
        sparse_select = csc_matrix(sparse_select)
        senti_test_set.append(vectors)
        senti_test_word_index.append(indices)
        senti_test_sparse_select.append(sparse_select)
    end_time = time.time()
    logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
    p_count = np.sum(senti_train_label)
    logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
    logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
    p_count = np.sum(senti_test_label)
    logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
    logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
    # Now, start training
    start_time = time.time()
    grbagger = GrCNNBagger(configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
    learn_rate = 0.02
    # Training with minibatch gradient descent and AdaGrad-style scaling
    epoch = 200
    batch_size = 20
    start_time = time.time()
    highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
    track_training_acc, track_training_cost = [], []
    track_test_acc, track_test_cost = [], []
    training_threshold_epoch = 30
    try:
        sample_size = 0
        fudge_factor = 1e-6
        # Accumulated and historical (squared) gradients for the word-embedding matrix
        accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
        hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
        for i in xrange(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)
            # rate = learn_rate / (1+i)
            rate = learn_rate
            # Training
            num_batch = train_size / batch_size
            for k in xrange(num_batch):
                # Clear all the caches
                accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                                for param in grbagger.params]
                hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                              for param in grbagger.params]
                if i > training_threshold_epoch:
                    accumu_embedding[:] = 0.0
                    hist_embedding[:] = 0.0
                for j in xrange(k*batch_size, (k+1)*batch_size):
                    train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                    results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                    input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                    # Accumulate gradients for the word-embedding matrix
                    if i > training_threshold_epoch:
                        tmp = senti_train_sparse_select[j].T.dot(input_grad)
                        accumu_embedding += tmp
                        hist_embedding += np.square(tmp)
                # Updating model parameters
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                # Updating the word-embedding matrix
                if i > training_threshold_epoch:
                    accumu_embedding /= batch_size
                    accumu_embedding /= fudge_factor + np.sqrt(hist_embedding)
                    word_embedding._embedding -= rate * accumu_embedding
                grbagger.update_params(accumu_grads, rate)
            # Clear all the caches again
            accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                            for param in grbagger.params]
            hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                          for param in grbagger.params]
            if i > training_threshold_epoch:
                accumu_embedding[:] = 0.0
                hist_embedding[:] = 0.0
            if num_batch * batch_size < train_size:
                for j in xrange(num_batch*batch_size, train_size):
                    train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                    results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                    input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                    # Accumulate gradients for the word-embedding matrix
                    if i > training_threshold_epoch:
                        tmp = senti_train_sparse_select[j].T.dot(input_grad)
                        accumu_embedding += tmp
                        hist_embedding += np.square(tmp)
                # Normalizing model parameters
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= train_size - num_batch*batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                # Normalizing the word-embedding matrix
                if i > training_threshold_epoch:
                    accumu_embedding /= train_size - num_batch*batch_size
                    accumu_embedding /= fudge_factor + np.sqrt(hist_embedding)
                    word_embedding._embedding -= rate * accumu_embedding
                # Updating all the parameters
                grbagger.update_params(accumu_grads, rate)
            train_accuracy = float(correct_count) / train_size
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' %
                         (i, costs, train_accuracy))
            # Append all the numbers
            track_training_cost.append(costs)
            track_training_acc.append(train_accuracy)
            if train_accuracy > highest_train_accuracy:
                highest_train_accuracy = train_accuracy
            # Testing
            correct_count = 0
            costs = 0.0
            for j in xrange(test_size):
                test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                pred = grbagger.predict(test_sent_rep)
                cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                if pred == senti_test_label[j]:
                    correct_count += 1
                costs += cost
            test_accuracy = float(correct_count) / test_size
            logger.debug('Test accuracy: %f' % test_accuracy)
            # Append all the numbers
            track_test_cost.append(costs)
            track_test_acc.append(test_accuracy)
            if test_accuracy > highest_test_accuracy:
                highest_test_accuracy = test_accuracy
            # Sampling to show the weights and experts of training and test instances
            logger.debug('Training Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(train_size)
                weights = grbagger.show_weights(senti_train_set[idx])
                scores = grbagger.show_scores(senti_train_set[idx])
                prob = grbagger.show_prob(senti_train_set[idx])
                label = senti_train_label[idx]
                logger.debug('Training idx: {}'.format(idx))
                logger.debug('Training scores: {}'.format(scores))
                logger.debug('Training weights: {}'.format(weights))
                logger.debug('Training probability: {}'.format(prob))
                logger.debug('Training label: {}'.format(label))
                logger.debug('-' * 50)
            logger.debug('Test Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(test_size)
                weights = grbagger.show_weights(senti_test_set[idx])
                scores = grbagger.show_scores(senti_test_set[idx])
                prob = grbagger.show_prob(senti_test_set[idx])
                label = senti_test_label[idx]
                logger.debug('Test idx: {}'.format(idx))
                logger.debug('Test scores: {}'.format(scores))
                logger.debug('Test weights: {}'.format(weights))
                logger.debug('Test probability: {}'.format(prob))
                logger.debug('Test label: {}'.format(label))
                logger.debug('-' * 50)
            # Check norms of the model parameters
            for param in grbagger.params:
                val = param.get_value(borrow=True)
                norm = np.sqrt(np.sum(np.square(val)))
                logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
            wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
            logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
    except:
        logger.debug('Error appeared!')
        traceback.print_exc(file=sys.stdout)
        logger.debug('-' * 50)
    finally:
        end_time = time.time()
        logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
        logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
        logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
        GrCNNBagger.save('fine-grbagger.model', grbagger)
        # Save all the tracking numbers
        track_training_acc = np.asarray(track_training_acc)
        track_training_cost = np.asarray(track_training_cost)
        track_test_acc = np.asarray(track_test_acc)
        track_test_cost = np.asarray(track_test_cost)
        with open('fine-senti-records.npy', 'wb') as fout:
            np.save(fout, track_training_acc)
            np.save(fout, track_training_cost)
            np.save(fout, track_test_acc)
            np.save(fout, track_test_cost)
        logger.debug('Training and test records saved to fine-senti-records.npy...')
        logger.debug('Finished...')
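# A small sketch (not in the original file) of how the record files written above can be
# read back. np.save was called four times on one open file handle, so np.load can be
# called four times on the same handle to recover the arrays in the same order. The
# default filename matches the one saved in testSentimentFineTune; the helper name is
# only illustrative.
def load_records(filename='fine-senti-records.npy'):
    with open(filename, 'rb') as fin:
        train_acc = np.load(fin)
        train_cost = np.load(fin)
        test_acc = np.load(fin)
        test_cost = np.load(fin)
    return train_acc, train_cost, test_acc, test_cost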
def testSentiment(self):
    '''
    Build a small model and use it on the sentiment analysis task.
    Load training and test texts and labels for sentiment analysis, with preprocessing.
    '''
    np.random.seed(1991)
    senti_train_filename = '../data/sentiment-train.txt'
    # senti_train_filename = '../data/sentiment-train-phrases.txt'
    senti_test_filename = '../data/sentiment-test.txt'
    senti_train_txt, senti_train_label = [], []
    senti_test_txt, senti_test_label = [], []
    start_time = time.time()
    # Read training data set
    with open(senti_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_train_txt.append(txt)
            senti_train_label.append(int(label))
    # Read test data set
    with open(senti_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_test_txt.append(txt)
            senti_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
    embedding_filename = '../data/wiki_embeddings.txt'
    # Load training/test data sets and wiki-embeddings
    word_embedding = WordEmbedding(embedding_filename)
    embed_dim = word_embedding.embedding_dim()
    start_time = time.time()
    # Store original text representation
    self.senti_train_txt = senti_train_txt
    self.senti_test_txt = senti_test_txt
    # Word-vector representation
    self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
    self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
    train_size = len(senti_train_txt)
    test_size = len(senti_test_txt)
    # Check size
    assert train_size == self.senti_train_label.shape[0]
    assert test_size == self.senti_test_label.shape[0]
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Compute word embedding
    self.senti_train_set = []
    self.senti_test_set = []
    # Embedding for training set
    for i, sent in enumerate(senti_train_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        self.senti_train_set.append(vectors)
    # Embedding for test set
    for i, sent in enumerate(senti_test_txt):
        words = sent.split()
        words = [word.lower() for word in words]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        self.senti_test_set.append(vectors)
    end_time = time.time()
    logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
    # Store data
    self.train_size = train_size
    self.test_size = test_size
    self.word_embedding = word_embedding
    # Shuffling
    rindex = np.arange(train_size)
    tindex = np.arange(test_size)
    np.random.shuffle(rindex)
    np.random.shuffle(tindex)
    self.senti_train_set = list(np.asarray(self.senti_train_set)[rindex])
    self.senti_test_set = list(np.asarray(self.senti_test_set)[tindex])
    self.senti_train_label = self.senti_train_label[rindex]
    self.senti_test_label = self.senti_test_label[tindex]
    senti_train_set, senti_test_set = self.senti_train_set, self.senti_test_set
    senti_train_label, senti_test_label = self.senti_train_label, self.senti_test_label
    p_count = np.sum(senti_train_label)
    logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
    logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
    p_count = np.sum(senti_test_label)
    logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
    logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
    # Now, start training
    start_time = time.time()
    grbagger = GrCNNBagger(self.configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
    learn_rate = 0.02
    # Training with minibatch gradient descent and AdaGrad-style scaling
    epoch = 200
    batch_size = 20
    start_time = time.time()
    highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
    track_training_acc, track_training_cost = [], []
    track_test_acc, track_test_cost = [], []
    try:
        sample_size = 0
        fudge_factor = 1e-6
        for i in xrange(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)
            # rate = learn_rate / (1+i)
            rate = learn_rate
            # Training
            num_batch = train_size / batch_size
            for k in xrange(num_batch):
                accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                                for param in grbagger.params]
                hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                              for param in grbagger.params]
                for j in xrange(k*batch_size, (k+1)*batch_size):
                    results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            # Handle the leftover instances that do not fill a whole batch
            accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                            for param in grbagger.params]
            hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                          for param in grbagger.params]
            if num_batch * batch_size < train_size:
                for j in xrange(num_batch*batch_size, train_size):
                    results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= train_size - num_batch*batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                grbagger.update_params(accumu_grads, rate)
            train_accuracy = float(correct_count) / train_size
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' %
                         (i, costs, train_accuracy))
            # Append all the numbers
            track_training_cost.append(costs)
            track_training_acc.append(train_accuracy)
            if train_accuracy > highest_train_accuracy:
                highest_train_accuracy = train_accuracy
            # Testing
            correct_count = 0
            costs = 0.0
            for j in xrange(test_size):
                pred = grbagger.predict(senti_test_set[j])
                cost = grbagger.show_cost(senti_test_set[j], senti_test_label[j])
                if pred == senti_test_label[j]:
                    correct_count += 1
                costs += cost
            test_accuracy = float(correct_count) / test_size
            logger.debug('Test accuracy: %f' % test_accuracy)
            # Append all the numbers
            track_test_cost.append(costs)
            track_test_acc.append(test_accuracy)
            if test_accuracy > highest_test_accuracy:
                highest_test_accuracy = test_accuracy
            # Sampling to show the weights and experts of training and test instances
            logger.debug('Training Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(train_size)
                weights = grbagger.show_weights(senti_train_set[idx])
                scores = grbagger.show_scores(senti_train_set[idx])
                prob = grbagger.show_prob(senti_train_set[idx])
                label = senti_train_label[idx]
                logger.debug('Training idx: {}'.format(idx))
                logger.debug('Training scores: {}'.format(scores))
                logger.debug('Training weights: {}'.format(weights))
                logger.debug('Training probability: {}'.format(prob))
                logger.debug('Training label: {}'.format(label))
                logger.debug('-' * 50)
            logger.debug('Test Sampling: ')
            for j in xrange(sample_size):
                idx = np.random.randint(test_size)
                weights = grbagger.show_weights(senti_test_set[idx])
                scores = grbagger.show_scores(senti_test_set[idx])
                prob = grbagger.show_prob(senti_test_set[idx])
                label = senti_test_label[idx]
                logger.debug('Test idx: {}'.format(idx))
                logger.debug('Test scores: {}'.format(scores))
                logger.debug('Test weights: {}'.format(weights))
                logger.debug('Test probability: {}'.format(prob))
                logger.debug('Test label: {}'.format(label))
                logger.debug('-' * 50)
            # Check norms of the model parameters
            for param in grbagger.params:
                val = param.get_value(borrow=True)
                norm = np.sqrt(np.sum(np.square(val)))
                logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
    except:
        logger.debug('Error appeared!')
        traceback.print_exc(file=sys.stdout)
        logger.debug('-' * 50)
    finally:
        end_time = time.time()
        logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
        logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
        logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
        GrCNNBagger.save('grbagger.model', grbagger)
        # Save all the tracking numbers
        track_training_acc = np.asarray(track_training_acc)
        track_training_cost = np.asarray(track_training_cost)
        track_test_acc = np.asarray(track_test_acc)
        track_test_cost = np.asarray(track_test_cost)
        with open('senti-records.npy', 'wb') as fout:
            np.save(fout, track_training_acc)
            np.save(fout, track_training_cost)
            np.save(fout, track_test_acc)
            np.save(fout, track_test_cost)
        logger.debug('Training and test records saved to senti-records.npy...')
        logger.debug('Finished...')
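# Usage sketch (an assumption, not part of the original tests): the model saved as
# 'grbagger.model' by testSentiment can be reloaded with GrCNNBagger.load, as the analysis
# script above does, and applied to a single embedded sentence matrix. `sent_matrix` is a
# hypothetical (num_words+2, embed_dim) float32 matrix built the same way as the training
# instances.
# grbagger = GrCNNBagger.load('./grbagger.model')
# pred = grbagger.predict(sent_matrix)
# prob = grbagger.show_prob(sent_matrix)
# logger.debug('Predicted label: {}, probability: {}'.format(pred, prob))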