Python GrCNNBagger Examples

Programming Language: Python

Namespace/Package Name: grcnn

Class/Type: GrCNNBagger

Examples at hotexamples.com: 8

Python GrCNNBagger - 8 examples found. These are the top-rated real-world Python examples of grcnn.GrCNNBagger extracted from open source projects. You can rate examples to help us improve the quality of the examples.

Frequently Used Methods

Show Hide

GrCNNBagger(7)

show_scores(6)

show_weights(6)

compute_gradient_and_cost(5)

show_prob(5)

update_params(5)

predict(4)

save(4)

show_cost(4)

compute_input_gradient(2)

load(2)

show_hierarchy(1)

Example #1

Show file

    def testGroup(self):
        '''
        Build a small synthetic data set and labelling for training 
        on GrCNNBagger.

        Each of the 500 instances is a random float32 matrix with 5 to 19
        rows and 50 columns; labels are drawn from a Bernoulli(0.45)
        distribution. The model is trained one instance at a time for 60
        epochs, then the per-instance scores and weights are logged.
        '''
        np.random.seed(42)
        train_size, input_dim = 500, 50
        data = [np.random.rand(np.random.randint(5, 20), input_dim).astype(np.float32)
                for _ in xrange(train_size)]
        p = 0.45
        labels = np.random.binomial(1, p, train_size)
        logger.debug('Building GrCNNBagger...')
        start_time = time.time()
        grbagger = GrCNNBagger(self.configer, verbose=True)
        end_time = time.time()
        logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
        learn_rate = 0.01
        # Training using stochastic gradient descent algorithm
        epoch = 60
        training_cost, training_acc = [], []
        for i in xrange(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)

            for j in xrange(train_size):
                results = grbagger.compute_gradient_and_cost(data[j], labels[j])
                # The three results below are not used inside the training loop.
                # NOTE(review): presumably kept as smoke checks of the accessors;
                # confirm before removing the calls.
                prob = grbagger.show_prob(data[j])
                scores = grbagger.show_scores(data[j])
                weights = grbagger.show_weights(data[j])
                # The last two entries are the cost and the prediction; everything
                # before them is the list of gradients.
                grads, cost, pred = results[:-2], results[-2], results[-1]
                if pred == labels[j]: correct_count += 1
                # NaN never compares equal to anything (including itself), so the
                # check has to go through np.isnan rather than `== np.nan`.
                if np.isnan(cost):
                    logger.error('Error here, in %dth epoch, %dth instance' % (i, j))
                    logger.error('Gradients: ')
                    logger.error(grads)
                costs += cost
                grbagger.update_params(grads, learn_rate)
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                        % (i, costs, float(correct_count) / train_size))
            training_cost.append(costs)
            training_acc.append(float(correct_count) / train_size)
        logger.debug('Training finished...')
        logger.debug('Number of parameters in the model: %d' % grbagger.num_params)
        logger.debug('Output the weighting vector for each input instance: ')
        # Record instance scores and instance weights
        instance_scores, instance_weights, instance_labels = [], [], labels
        for j in xrange(train_size):
            weights = grbagger.show_weights(data[j])
            scores = grbagger.show_scores(data[j])
            weights = weights.ravel()
            scores = scores.ravel()
            # Predicted label: weighted sum of the scores thresholded at 0.5.
            pred = np.sum(scores * weights) >= 0.5
            logger.debug('Instance %d, true label: %d, predicted label: %d' % (j, labels[j], pred))
            logger.debug('scores = {}'.format(scores))
            logger.debug('weights = {}'.format(weights))
            logger.debug('-' * 50)

            instance_scores.append(scores)
            instance_weights.append(weights)

Example #2

Show file

 def testHierarchical(self):
     '''
     Log the hierarchy, expert scores and expert weights that a freshly
     built GrCNNBagger produces for one random 25 x 50 float32 input.
     '''
     logger.debug('Inside testHierarchical...')
     bagger = GrCNNBagger(self.configer, verbose=True)
     sentence = np.random.rand(25, 50).astype(np.float32)
     # (log header, name of the model accessor to call), in reporting order
     reports = (
         ('Summarized Hierarchy: ', 'show_hierarchy'),
         ('Scores from hierarchical experts: ', 'show_scores'),
         ('Weights of hierarchical experts: ', 'show_weights'),
     )
     for header, accessor in reports:
         logger.debug(header)
         logger.debug(getattr(bagger, accessor)(sentence))

Example #3

Show file

File: show_plot.py Project: appscluster/sentiment-CNN

import pylab as plt

# Set the basic configuration of the logging system
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M')
# Make the project sources importable; the project-local imports below
# must therefore stay after this line.
sys.path.append('../source/')
logger = logging.getLogger(__name__)

from grcnn import GrCNNBagger
from config import GrCNNConfiger
from wordvec import WordEmbedding

# Load a previously saved model from disk and report how long it took
model_filename = './grbagger.model'
start_time = time.time()
grbagger = GrCNNBagger.load(model_filename)
end_time = time.time()
logger.debug('Time used to load the model: %f seconds.' % (end_time-start_time))

np.random.seed(1991)
senti_train_filename = '../data/sentiment-train.txt'
senti_test_filename = '../data/sentiment-test.txt'
senti_train_txt, senti_train_label = [], []
senti_test_txt, senti_test_label = [], []
start_time = time.time()
# Read training data set: one "text|label" record per row.
# open() is used instead of the file() constructor, which only exists in
# Python 2 and which the Python 2 documentation advises against calling directly.
with open(senti_train_filename, 'r') as fin:
    reader = csv.reader(fin, delimiter='|')
    for txt, label in reader:
        senti_train_txt.append(txt)
        senti_train_label.append(int(label))

Example #4

Show file

File: movie_sentiment.py Project: appscluster/sentiment-CNN

    senti_test_word_index.append(indices)
    senti_test_sparse_select.append(sparse_select)
# Report how long it took to build the input matrices (start_time is set
# earlier in the script, outside this excerpt)
end_time = time.time()
logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
# Label balance of both splits: the sum of the labels is used as the number
# of positive instances.
# NOTE(review): this assumes the labels are 0/1 -- confirm against the data files.
p_count = np.sum(senti_train_label)
logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
p_count = np.sum(senti_test_label)
logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
# If there is a designated model, use it; otherwise start from scratch.
# The literal string 'NONE' in args.model means "no model given".
start_time = time.time()
if args.model == 'NONE':
    logger.debug('No designated model, training from scratch...')
    configer = GrCNNConfiger(args.config)
    grbagger = GrCNNBagger(configer, verbose=True)
else:
    logger.debug('There is a designated model, loading: {}'.format(args.model))
    grbagger = GrCNNBagger.load(args.model)
end_time = time.time()
logger.debug('Time used to building the model: %f seconds.' % (end_time-start_time))
logger.debug('Training start...')
# Initialize model training configuration from the command-line arguments
learn_rate = args.rate
batch_size = args.size
epoch = args.epoch
# Training using AdaGrad
# NOTE(review): training_threshold_epoch is not used within this excerpt;
# presumably it is the epoch after which the word embeddings are fine-tuned -- confirm.
training_threshold_epoch = 30
# Best accuracies seen so far and per-epoch histories of accuracy and cost
highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
track_training_acc, track_training_cost = [], []
track_test_acc, track_test_cost = [], []

Example #5

Show file

 def testActiveAndPassive(self):
     '''
     Train and evaluate a GrCNNBagger on the "sp" train/test text files.

     Both files hold one "text|label" record per row. Every sentence is
     lower-cased, split on whitespace and turned into a matrix of word
     vectors with one all-zero row added at each end. Training runs for a
     fixed number of epochs in mini-batches: the gradients of a batch are
     averaged and divided element-wise by the square root of the summed
     squared per-instance gradients of that same batch before being applied.
     After each epoch the model is scored on the test set.

     The model and the per-epoch accuracy/cost records are written to disk
     in the ``finally`` clause, so they are saved even if training fails.
     '''
     np.random.seed(1991)
     sp_train_filename = '../data/refined_train_sp.txt'
     sp_test_filename = '../data/refined_test_sp.txt'
     sp_train_txt, sp_train_label = [], []
     sp_test_txt, sp_test_label = [], []
     start_time = time.time()
     # Read training data set
     with open(sp_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_train_txt.append(txt)
             sp_train_label.append(int(label))
     # Read test data set (labels parsed with int(), same as the training set)
     with open(sp_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_test_txt.append(txt)
             sp_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Finished loading training and test data sets...')
     logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     word_embedding = WordEmbedding(embedding_filename)
     start_time = time.time()
     # Starting and Ending token for each sentence
     # NOTE(review): blank_token is looked up but never used below; the
     # padding rows are left as zeros. Kept in case the lookup is meant as
     # a sanity check of the embedding file.
     blank_token = word_embedding.wordvec('</s>')
     # Word-vector representation
     sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
     sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
     train_size = len(sp_train_txt)
     test_size = len(sp_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Sequential modeling for each sentence
     sp_train_set, sp_test_set = [], []
     # Embedding for training set: rows 1..-2 hold the word vectors, the
     # first and last rows stay zero
     for sent in sp_train_txt:
         words = [word.lower() for word in sent.split()]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_train_set.append(vectors)
     # Embedding for test set
     for sent in sp_test_txt:
         words = [word.lower() for word in sent.split()]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_test_set.append(vectors)
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 2e-2
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 10
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     training_acc, training_cost = [], []
     test_acc, test_cost = [], []
     try:
         # sample_size = 0 disables the per-epoch sampling logs below
         sample_size = 0
         # Small constant that keeps the gradient normalisation away from a
         # division by zero
         fuedge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             # Number of full batches; explicit floor division so the value
             # is an integer regardless of the division semantics in effect
             num_batch = train_size // batch_size
             for k in xrange(num_batch):
                 # Fresh accumulators for every batch
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     for r in results:
                         if np.isnan(np.sum(r)):
                             logger.debug('*' * 50)
                             logger.debug('Error!!!!!')
                             logger.debug('NaN found at %dth training instance' % j)
                             logger.debug('*' * 50)
                     # Last two entries are cost and prediction, the rest gradients
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 # Average the batch gradient, then rescale it element-wise
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             # Handle the instances left over after the last full batch
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             ## Adding training accuracy and training cost
             training_acc.append(train_accuracy)
             training_cost.append(costs)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(sp_test_set[j])
                 cost = grbagger.show_cost(sp_test_set[j], sp_test_label[j])
                 if pred == sp_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             ## Adding test accuracy and test cost
             test_acc.append(test_accuracy)
             test_cost.append(costs)
             logger.debug('Test accuracy: %f' % test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(sp_train_set[idx])
                 scores = grbagger.show_scores(sp_train_set[idx])
                 prob = grbagger.show_prob(sp_train_set[idx])
                 label = sp_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(sp_test_set[idx])
                 scores = grbagger.show_scores(sp_test_set[idx])
                 prob = grbagger.show_prob(sp_test_set[idx])
                 label = sp_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except:
         # NOTE(review): the bare except also catches KeyboardInterrupt and
         # SystemExit, so interrupting a run is logged here and the results
         # are still saved below. Left unchanged in case that is intended.
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('sp-grbagger.model', grbagger)
         # Save all the training and test records
         training_acc = np.asarray(training_acc)
         training_cost = np.asarray(training_cost)
         test_acc = np.asarray(test_acc)
         test_cost = np.asarray(test_cost)
         # np.save emits binary data, so the file is opened in binary mode.
         # The four arrays are written back to back into the same file and
         # have to be read back with four consecutive np.load calls.
         with open('sp-records.npy', 'wb') as fout:
             np.save(fout, training_acc)
             np.save(fout, training_cost)
             np.save(fout, test_acc)
             np.save(fout, test_cost)
         logger.debug('Training and test records saved to sp-records.npy...')
         logger.debug('Finished...')

Example #6

Show file

 def testBuilding(self):
     '''
     Smoke test: construct a GrCNNBagger from the configuration held by
     the test case, with verbose output switched on.
     '''
     logger.debug('Inside testBuilding...')
     # Only the construction itself is exercised; the instance is not kept.
     GrCNNBagger(self.configer, verbose=True)

Example #7

Show file

 def testSentimentFineTune(self):
     '''
     Build a small model and use it on the sentiment analysis task, with
     fine-tuning of the word-embedding matrix.

     Both data files hold one "text|label" record per row. The instances
     are shuffled, and for every sentence a sparse row-selection matrix is
     built once so that the sentence representation can be recomputed from
     the current embedding matrix with a single sparse product.

     Training runs in mini-batches: the gradients of a batch are averaged
     and divided element-wise by the square root of the summed squared
     per-instance gradients of that batch. Once the epoch index exceeds
     ``training_threshold_epoch`` the same update is also applied to the
     word-embedding matrix, using the gradient with respect to the input.

     The model and the per-epoch accuracy/cost records are written to disk
     in the ``finally`` clause, so they are saved even if training fails.
     '''
     np.random.seed(1991)
     fname = './grCNN.conf'
     configer = GrCNNConfiger(fname)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     senti_train_txt, senti_train_label = [], []
     senti_test_txt, senti_test_label = [], []
     start_time = time.time()
     # Read training data set
     with open(senti_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_train_txt.append(txt)
             senti_train_label.append(int(label))
     # Read test data set
     with open(senti_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_test_txt.append(txt)
             senti_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load training/test data sets and wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     # Index of the token that is placed at both ends of every sentence
     blank_index = word_embedding.word2index('</s>')
     logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
     # Word-vector representation
     senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Shuffling for all the instances
     start_time = time.time()
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     # Shuffle label
     senti_train_label = senti_train_label[rindex]
     senti_test_label = senti_test_label[tindex]
     # Shuffle text
     senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
     senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
     end_time = time.time()
     logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time-start_time))
     # Compute word embedding
     senti_train_set = []
     senti_test_set = []
     # Record the index of each word in each sentence for only once
     senti_train_word_index = []
     senti_test_word_index = []
     # Record the sparse input indicator matrix only once for fast computation
     senti_train_sparse_select = []
     senti_test_sparse_select = []
     # Embedding for training set
     for sent in senti_train_txt:
         words = [word.lower() for word in sent.split()]
         # Dense representation: zero rows at both ends, word vectors in between
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
         # One-hot row per token; built as LIL for cheap assignment, then
         # converted to CSC for the products used during training
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_train_set.append(vectors)
         senti_train_word_index.append(indices)
         senti_train_sparse_select.append(sparse_select)
     # Embedding for test set
     for sent in senti_test_txt:
         words = [word.lower() for word in sent.split()]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_test_set.append(vectors)
         senti_test_word_index.append(indices)
         senti_test_sparse_select.append(sparse_select)
     end_time = time.time()
     # The reported time is measured from the start of the shuffling step,
     # which is where start_time was last reset
     logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     # The embedding matrix is only updated in epochs with an index above this value
     training_threshold_epoch = 30
     try:
         # sample_size = 0 disables the per-epoch sampling logs below
         sample_size = 0
         # Small constant that keeps the gradient normalisation away from a
         # division by zero
         fuedge_factor = 1e-6
         # accumu matrix for word-embedding matrix
         # hist matrix for word-embedding matrix
         accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             # Number of full batches; explicit floor division so the value
             # is an integer regardless of the division semantics in effect
             num_batch = train_size // batch_size
             for k in xrange(num_batch):
                 # Clear all the cache
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 if i > training_threshold_epoch:
                     accumu_embedding[:] = 0.0
                     hist_embedding[:] = 0.0
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     # Sentence representation from the current embedding matrix
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     # Last two entries are cost and prediction, the rest gradients
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         # Scatter the input gradient back onto the rows of
                         # the words that occur in this sentence
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Updating model parameters
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Updating word-embedding matrix
                 # NOTE(review): reads go through `word_embedding.embedding`
                 # while the update writes `word_embedding._embedding`;
                 # presumably the former exposes the latter -- confirm.
                 if i > training_threshold_epoch:
                     accumu_embedding /= batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 grbagger.update_params(accumu_grads, rate)
             # Clear all the cache again
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if i > training_threshold_epoch:
                 accumu_embedding[:] = 0.0
                 hist_embedding[:] = 0.0
             # Handle the instances left over after the last full batch
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Normalizing model parameters
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Normalizing word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= train_size-num_batch*batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 # Updating all the parameters
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                 pred = grbagger.predict(test_sent_rep)
                 cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             # NOTE(review): the sampling below feeds the dense matrices built
             # before training, not representations recomputed from the
             # fine-tuned embedding; harmless while sample_size is 0.
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
             wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
             logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
     except:
         # NOTE(review): the bare except also catches KeyboardInterrupt and
         # SystemExit, so interrupting a run is logged here and the results
         # are still saved below. Left unchanged in case that is intended.
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('fine-grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         # np.save emits binary data, so the file is opened in binary mode.
         # The four arrays are written back to back into the same file and
         # have to be read back with four consecutive np.load calls.
         with open('fine-senti-records.npy', 'wb') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to fine-senti-records.npy...')
         logger.debug('Finished...')

Example #8

Show file

 def testSentiment(self):
     '''
     Build a small model and use it on sentiment analysis task.

     Steps:
       1. Read the '|'-delimited training and test files (one
          ``text|label`` row per instance, integer label).
       2. Turn every sentence into a float32 matrix of word vectors with
          one all-zero row before and one after the words.
       3. Shuffle both sets and train a GrCNNBagger with mini-batch
          updates, evaluating on the whole test set after every epoch.
       4. In the ``finally`` clause, save the model and the per-epoch
          accuracy/cost records, so they are written even when training
          stops early.

     Side effects: stores the texts, labels, embedded data sets, sizes and
     the word embedding on ``self``; writes 'grbagger.model' and
     'senti-records.npy' relative to the current working directory.
     '''
     def load_labelled_text(filename):
         # Every row must have exactly two fields: the text and an int label.
         texts, labels = [], []
         with open(filename, 'r') as fin:
             for txt, label in csv.reader(fin, delimiter='|'):
                 texts.append(txt)
                 labels.append(int(label))
         return texts, labels

     def embed_sentences(sentences, embedding, dim):
         # One matrix per sentence; rows 0 and -1 are left as zeros and the
         # rows in between hold the vectors of the lower-cased words.
         matrices = []
         for sent in sentences:
             words = [word.lower() for word in sent.split()]
             vectors = np.zeros((len(words)+2, dim), dtype=np.float32)
             vectors[1:-1] = np.asarray([embedding.wordvec(word) for word in words])
             matrices.append(vectors)
         return matrices

     np.random.seed(1991)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     start_time = time.time()
     # Read training and test data sets
     senti_train_txt, senti_train_label = load_labelled_text(senti_train_filename)
     senti_test_txt, senti_test_label = load_labelled_text(senti_test_filename)
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     # Store original text representation
     self.senti_train_txt = senti_train_txt
     self.senti_test_txt = senti_test_txt
     # Labels as int32 vectors
     self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     assert train_size == self.senti_train_label.shape[0]
     assert test_size == self.senti_test_label.shape[0]
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Compute word embedding for both sets
     self.senti_train_set = embed_sentences(senti_train_txt, word_embedding, embed_dim)
     self.senti_test_set = embed_sentences(senti_test_txt, word_embedding, embed_dim)
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Store data
     self.train_size = train_size
     self.test_size = test_size
     self.word_embedding = word_embedding
     # Shuffling
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     # Index the Python lists directly: the per-sentence matrices have
     # different numbers of rows, so they must not be packed into a single
     # ndarray just to be reordered.
     self.senti_train_set = [self.senti_train_set[idx] for idx in rindex]
     self.senti_test_set = [self.senti_test_set[idx] for idx in tindex]
     self.senti_train_label = self.senti_train_label[rindex]
     self.senti_test_label = self.senti_test_label[tindex]
     senti_train_set, senti_test_set = self.senti_train_set, self.senti_test_set
     senti_train_label, senti_test_label = self.senti_train_label, self.senti_test_label
     # The label sum is reported as the positive count (assumes 0/1 labels).
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using mini-batch gradient descent
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     try:
         # Number of random instances to dump per epoch; 0 disables the dumps.
         sample_size = 0
         # Keeps the per-element scaling below away from a division by zero.
         fuedge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # Constant learning rate; no per-epoch decay is applied.
             rate = learn_rate
             # Training. The stride loop also yields the final, shorter
             # batch when train_size is not a multiple of batch_size.
             for batch_start in xrange(0, train_size, batch_size):
                 batch_stop = min(batch_start + batch_size, train_size)
                 # Both accumulators are reset for every batch.
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(batch_start, batch_stop):
                     # results = gradients..., cost, prediction
                     results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     # Mean gradient over the batch, then scaled element-wise
                     # by the root of the summed squared gradients.
                     accumu_grad /= batch_stop - batch_start
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(senti_test_set[j])
                 cost = grbagger.show_cost(senti_test_set[j], senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except (Exception, KeyboardInterrupt):
         # Deliberately broad: a failure or a Ctrl-C during the long training
         # run is reported here and the results so far are still saved below.
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         # np.save emits binary data, so the handle must be opened in binary
         # mode. The four arrays are written back-to-back in this order.
         with open('senti-records.npy', 'wb') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to senti-records.npy...')
         logger.debug('Finished...')