Ejemplo n.º 1
0
    def testGroup(self):
        '''
        Build a small synthetic data set and labelling for training 
        on GrCNNBagger.
        '''
        np.random.seed(42)
        train_size, input_dim = 500, 50
        data = [np.random.rand(np.random.randint(5, 20), input_dim).astype(np.float32) 
                for _ in xrange(train_size)]
        p = 0.45
        labels = np.random.binomial(1, p, train_size)
        logger.debug('Building GrCNNBagger...')
        start_time = time.time()
        grbagger = GrCNNBagger(self.configer, verbose=True)
        end_time = time.time()
        logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
        learn_rate = 0.01
        # Training using stochastic gradient descent algorithm
        epoch = 60
        training_cost, training_acc = [], []
        for i in xrange(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)

            for j in xrange(train_size):
                results = grbagger.compute_gradient_and_cost(data[j], labels[j])
                prob = grbagger.show_prob(data[j])
                scores = grbagger.show_scores(data[j])
                weights = grbagger.show_weights(data[j])
                grads, cost, pred = results[:-2], results[-2], results[-1]
                if pred == labels[j]: correct_count += 1
                if cost == np.nan:
                    logger.error('Error here, in %dth epoch, %dth instance' % (i, j))
                    logger.error('Gradients: ')
                    logger.error(grads)
                costs += cost
                grbagger.update_params(grads, learn_rate)
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                        % (i, costs, float(correct_count) / train_size))
            training_cost.append(costs)
            training_acc.append(float(correct_count) / train_size)        
        logger.debug('Training finished...')
        logger.debug('Number of parameters in the model: %d' % grbagger.num_params)
        logger.debug('Output the weighting vector for each input instance: ')
        # Record instance scores and instance weights
        instance_scores, instance_weights, instance_labels = [], [], labels
        for j in xrange(train_size):
            weights = grbagger.show_weights(data[j])
            scores = grbagger.show_scores(data[j])
            weights = weights.ravel()
            scores = scores.ravel()
            pred = np.sum(scores * weights) >= 0.5
            logger.debug('Instance %d, true label: %d, predicted label: %d' % (j, labels[j], pred))
            logger.debug('scores = {}'.format(scores))
            logger.debug('weights = {}'.format(weights))
            logger.debug('-' * 50)

            instance_scores.append(scores)
            instance_weights.append(weights)
Ejemplo n.º 2
0
 def testHierarchical(self):
     logger.debug('Inside testHierarchical...')
     grbagger = GrCNNBagger(self.configer, verbose=True)
     sent = np.random.rand(25, 50).astype(np.float32)
     logger.debug('Summarized Hierarchy: ')
     logger.debug(grbagger.show_hierarchy(sent))
     logger.debug('Scores from hierarchical experts: ')
     logger.debug(grbagger.show_scores(sent))
     logger.debug('Weights of hierarchical experts: ')
     logger.debug(grbagger.show_weights(sent))
Ejemplo n.º 3
0
import pylab as plt

# Set the basic configuration of the logging system
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M')
sys.path.append('../source/')
logger = logging.getLogger(__name__)

from grcnn import GrCNNBagger
from config import GrCNNConfiger
from wordvec import WordEmbedding

model_filename = './grbagger.model'
start_time = time.time()
grbagger = GrCNNBagger.load(model_filename)
end_time = time.time()
logger.debug('Time used to load the model: %f seconds.' % (end_time-start_time))

np.random.seed(1991)
senti_train_filename = '../data/sentiment-train.txt'
senti_test_filename = '../data/sentiment-test.txt'
senti_train_txt, senti_train_label = [], []
senti_test_txt, senti_test_label = [], []
start_time = time.time()
# Read training data set
with file(senti_train_filename, 'r') as fin:
    reader = csv.reader(fin, delimiter='|')
    for txt, label in reader:
        senti_train_txt.append(txt)
        senti_train_label.append(int(label))
Ejemplo n.º 4
0
    senti_test_word_index.append(indices)
    senti_test_sparse_select.append(sparse_select)
end_time = time.time()
logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
p_count = np.sum(senti_train_label)
logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
p_count = np.sum(senti_test_label)
logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
# If there is a designated model, using it, else start from scratch
start_time = time.time()
if args.model == 'NONE':
    logger.debug('No designated model, training from scratch...')
    configer = GrCNNConfiger(args.config)
    grbagger = GrCNNBagger(configer, verbose=True)
else:
    logger.debug('There is a designated model, loading: {}'.format(args.model))
    grbagger = GrCNNBagger.load(args.model)
end_time = time.time()
logger.debug('Time used to building the model: %f seconds.' % (end_time-start_time))
logger.debug('Training start...')
# Initialize model training configuration
learn_rate = args.rate
batch_size = args.size
epoch = args.epoch
# Training using AdaGrad
training_threshold_epoch = 30
highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
track_training_acc, track_training_cost = [], []
track_test_acc, track_test_cost = [], []
Ejemplo n.º 5
0
 def testActiveAndPassive(self):
     np.random.seed(1991)
     sp_train_filename = '../data/refined_train_sp.txt'
     sp_test_filename = '../data/refined_test_sp.txt'
     sp_train_txt, sp_train_label = [], []
     sp_test_txt, sp_test_label = [], []
     start_time = time.time()
     # Read training data set
     with file(sp_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_train_txt.append(txt)
             sp_train_label.append(int(label))
     # Read test data set
     with file(sp_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_test_txt.append(txt)
             sp_test_label.append(label)
     end_time = time.time()
     logger.debug('Finished loading training and test data sets...')
     logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     word_embedding = WordEmbedding(embedding_filename)
     start_time = time.time()
     # Starting and Ending token for each sentence
     blank_token = word_embedding.wordvec('</s>')
     # Word-vector representation
     sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
     sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
     train_size = len(sp_train_txt)
     test_size = len(sp_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Sequential modeling for each sentence
     sp_train_set, sp_test_set = [], []
     # Embedding for training set
     for i, sent in enumerate(sp_train_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_train_set.append(vectors)
     # Embedding for test set
     for i, sent in enumerate(sp_test_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_test_set.append(vectors)
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 2e-2
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 10
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     training_acc, training_cost = [], []
     test_acc, test_cost = [], []
     try:
         sample_size = 0
         fuedge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             num_batch = train_size / batch_size
             for k in xrange(num_batch):
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     for r in results:
                         if np.isnan(np.sum(r)):
                             logger.debug('*' * 50)
                             logger.debug('Error!!!!!')
                             logger.debug('NaN found at %dth training instance' % j)
                             logger.debug('*' * 50)
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             ## Adding training accuracy and training cost
             training_acc.append(train_accuracy)
             training_cost.append(costs)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(sp_test_set[j])
                 cost = grbagger.show_cost(sp_test_set[j], sp_test_label[j])
                 if pred == sp_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             ## Adding test accuracy and test cost
             test_acc.append(test_accuracy)
             test_cost.append(costs)
             logger.debug('Test accuracy: %f' % test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(sp_train_set[idx])
                 scores = grbagger.show_scores(sp_train_set[idx])
                 prob = grbagger.show_prob(sp_train_set[idx])
                 label = sp_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(sp_test_set[idx])    
                 scores = grbagger.show_scores(sp_test_set[idx])
                 prob = grbagger.show_prob(sp_test_set[idx])
                 label = sp_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except:
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('sp-grbagger.model', grbagger)
         # Save all the training and test records
         training_acc = np.asarray(training_acc)
         training_cost = np.asarray(training_cost)
         test_acc = np.asarray(test_acc)
         test_cost = np.asarray(test_cost)
         with file('sp-records.npy', 'w') as fout:
             np.save(fout, training_acc)
             np.save(fout, training_cost)
             np.save(fout, test_acc)
             np.save(fout, test_cost)
         logger.debug('Training and test records saved to sp-records.npy...')
         logger.debug('Finished...')
Ejemplo n.º 6
0
 def testBuilding(self):
     logger.debug('Inside testBuilding...')
     grbagger = GrCNNBagger(self.configer, verbose=True)
Ejemplo n.º 7
0
 def testSentimentFineTune(self):
     '''
     Build a small model and use it on sentiment analysis task. With fine-tunning
     the word-embedding matrix.
     '''
     np.random.seed(1991)
     fname = './grCNN.conf'
     configer = GrCNNConfiger(fname)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     senti_train_txt, senti_train_label = [], []
     senti_test_txt, senti_test_label = [], []
     start_time = time.time()
     # Read training data set
     with file(senti_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_train_txt.append(txt)
             senti_train_label.append(int(label))
     # Read test data set
     with file(senti_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_test_txt.append(txt)
             senti_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load training/test data sets and wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     blank_index = word_embedding.word2index('</s>')
     logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
     # Word-vector representation
     senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Shuffling for all the instances
     start_time = time.time()
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     # Shuffle label
     senti_train_label = senti_train_label[rindex]
     senti_test_label = senti_test_label[tindex]
     # Shuffle text
     senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
     senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
     end_time = time.time()
     logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time-start_time))
     # Compute word embedding
     senti_train_set = []
     senti_test_set = []
     # Record the index of each word in each sentence for only once
     senti_train_word_index = []
     senti_test_word_index = []
     # Record the sparse input indicator matrix only once for fast computation
     senti_train_sparse_select = []
     senti_test_sparse_select = []
     # Embedding for training set
     for i, sent in enumerate(senti_train_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_train_set.append(vectors)
         senti_train_word_index.append(indices)
         senti_train_sparse_select.append(sparse_select)
     # Embedding for test set
     for i, sent in enumerate(senti_test_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_test_set.append(vectors)
         senti_test_word_index.append(indices)
         senti_test_sparse_select.append(sparse_select)
     end_time = time.time()
     logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     training_threshold_epoch = 30
     try:
         sample_size = 0
         fuedge_factor = 1e-6
         # accumu matrix for word-embedding matrix
         # hist matrix for word-embedding matrix
         accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             num_batch = train_size / batch_size
             for k in xrange(num_batch):
                 # Clear all the cache        
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 if i > training_threshold_epoch:
                     accumu_embedding[:] = 0.0
                     hist_embedding[:] = 0.0
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Updating model parameters
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Updating word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 grbagger.update_params(accumu_grads, rate)
             # Clear all the cache again
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if i > training_threshold_epoch:
                 accumu_embedding[:] = 0.0
                 hist_embedding[:] = 0.0
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Normalizing model parameters
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Normalizing word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= train_size-num_batch*batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 # Updating all the parameters
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                 pred = grbagger.predict(test_sent_rep)
                 cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])    
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
             wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
             logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
     except:
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('fine-grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         with file('fine-senti-records.npy', 'w') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to fine-senti-records.npy...')
         logger.debug('Finished...')
Ejemplo n.º 8
0
 def testSentiment(self):
     '''
     Build a small model and use it on sentiment analysis task.
     '''
     '''
     Load training and test texts and labels 
     in sentiment analysis task, preprocessing.
     '''
     np.random.seed(1991)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     senti_train_txt, senti_train_label = [], []
     senti_test_txt, senti_test_label = [], []
     start_time = time.time()
     # Read training data set
     with file(senti_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_train_txt.append(txt)
             senti_train_label.append(int(label))
     # Read test data set
     with file(senti_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_test_txt.append(txt)
             senti_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load training/test data sets and wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     # Store original text representation
     self.senti_train_txt = senti_train_txt
     self.senti_test_txt = senti_test_txt
     # Word-vector representation
     self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     assert train_size == self.senti_train_label.shape[0]
     assert test_size == self.senti_test_label.shape[0]
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Compute word embedding
     self.senti_train_set = []
     self.senti_test_set = []
     # Embedding for training set
     for i, sent in enumerate(senti_train_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         self.senti_train_set.append(vectors)
     # Embedding for test set
     for i, sent in enumerate(senti_test_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         self.senti_test_set.append(vectors)
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Store data
     self.train_size = train_size
     self.test_size = test_size
     self.word_embedding = word_embedding
     # Shuffling
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     self.senti_train_set = list(np.asarray(self.senti_train_set)[rindex])
     self.senti_test_set = list(np.asarray(self.senti_test_set)[tindex])
     self.senti_train_label = self.senti_train_label[rindex]
     self.senti_test_label = self.senti_test_label[tindex]
     senti_train_set, senti_test_set = self.senti_train_set, self.senti_test_set
     senti_train_label, senti_test_label = self.senti_train_label, self.senti_test_label   
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     try:
         sample_size = 0
         fuedge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             num_batch = train_size / batch_size
             for k in xrange(num_batch):
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(senti_test_set[j])
                 cost = grbagger.show_cost(senti_test_set[j], senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])    
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except:
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         with file('senti-records.npy', 'w') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to senti-records.npy...')
         logger.debug('Finished...')