コード例 #1
0
    def setUp(self):
        '''
        Load the snippet data set and store it on the test case.

        Every snippet (one line of the text files) is represented by the mean
        of the embedding vectors of its whitespace-separated tokens. The
        training and test sets are shuffled independently under a fixed
        random seed and 1 is subtracted from every label (presumably to make
        the labels zero-based -- TODO confirm). The results are stored as
        ``snippet_train_set``, ``snippet_train_label``, ``snippet_test_set``
        and ``snippet_test_label``.
        '''
        np.random.seed(42)
        snippet_train_set_filename = '../data/train_snip.txt'
        snippet_test_set_filename = '../data/test_snip.txt'
        snippet_train_label_filename = '../data/train_label.txt'
        snippet_test_label_filename = '../data/test_label.txt'
        embedding_filename = '../data/wiki_embeddings.txt'
        # Load wiki-embedding
        word_embedding = WordEmbedding(embedding_filename)
        # Read the raw snippets, one per line. open() is used rather than the
        # file() constructor, which the Python 2 documentation discourages
        # and which no longer exists in Python 3.
        with open(snippet_train_set_filename, 'rb') as fin:
            snippet_train_txt = fin.readlines()
        with open(snippet_test_set_filename, 'rb') as fin:
            snippet_test_txt = fin.readlines()
        snippet_train_label = np.loadtxt(snippet_train_label_filename,
                                         dtype=np.int32)
        snippet_test_label = np.loadtxt(snippet_test_label_filename,
                                        dtype=np.int32)
        training_size = len(snippet_train_txt)
        test_size = len(snippet_test_txt)
        # Check size: there must be exactly one label per snippet
        pprint('Training size: %d' % training_size)
        pprint('Test size: %d' % test_size)
        assert training_size == snippet_train_label.shape[0]
        assert test_size == snippet_test_label.shape[0]
        embedding_dim = word_embedding.embedding_dim()

        def average_embeddings(snippets):
            # One row per snippet: the mean of the word vectors of its tokens.
            averaged = np.zeros((len(snippets), embedding_dim), dtype=floatX)
            for row, snippet in enumerate(snippets):
                vectors = np.asarray(
                    [word_embedding.wordvec(word) for word in snippet.split()],
                    dtype=floatX)
                averaged[row, :] = np.mean(vectors, axis=0)
            return averaged

        # Word embedding
        snippet_train_set = average_embeddings(snippet_train_txt)
        snippet_test_set = average_embeddings(snippet_test_txt)
        # Shuffle training and test data set; the same permutation is applied
        # to the features and to the labels so that they stay aligned
        train_rand_index = np.random.permutation(training_size)
        test_rand_index = np.random.permutation(test_size)
        snippet_train_set = snippet_train_set[train_rand_index, :]
        snippet_train_label = snippet_train_label[train_rand_index]
        snippet_test_set = snippet_test_set[test_rand_index, :]
        snippet_test_label = snippet_test_label[test_rand_index]
        # Decrease 1 from label
        snippet_train_label -= 1
        snippet_test_label -= 1
        self.snippet_train_set = snippet_train_set
        self.snippet_train_label = snippet_train_label
        self.snippet_test_set = snippet_test_set
        self.snippet_test_label = snippet_test_label
コード例 #2
0
	def setUp(self):
		'''
		Load the snippet data set and store it on the test case.

		Every snippet (one line of the text files) is represented by the mean
		of the embedding vectors of its whitespace-separated tokens. The
		training and test sets are shuffled independently under a fixed
		random seed and 1 is subtracted from every label (presumably to make
		the labels zero-based -- TODO confirm). The results are stored as
		``snippet_train_set``, ``snippet_train_label``, ``snippet_test_set``
		and ``snippet_test_label``.
		'''
		np.random.seed(42)
		snippet_train_set_filename = '../data/train_snip.txt'
		snippet_test_set_filename = '../data/test_snip.txt'
		snippet_train_label_filename = '../data/train_label.txt'
		snippet_test_label_filename = '../data/test_label.txt'
		embedding_filename = '../data/wiki_embeddings.txt'
		# Load wiki-embedding
		word_embedding = WordEmbedding(embedding_filename)
		# Read the raw snippets, one per line. open() is used rather than the
		# file() constructor, which the Python 2 documentation discourages
		# and which no longer exists in Python 3.
		with open(snippet_train_set_filename, 'rb') as fin:
			snippet_train_txt = fin.readlines()
		with open(snippet_test_set_filename, 'rb') as fin:
			snippet_test_txt = fin.readlines()
		snippet_train_label = np.loadtxt(snippet_train_label_filename, dtype=np.int32)
		snippet_test_label = np.loadtxt(snippet_test_label_filename, dtype=np.int32)
		training_size = len(snippet_train_txt)
		test_size = len(snippet_test_txt)
		# Check size: there must be exactly one label per snippet
		pprint('Training size: %d' % training_size)
		pprint('Test size: %d' % test_size)
		assert training_size == snippet_train_label.shape[0]
		assert test_size == snippet_test_label.shape[0]
		embedding_dim = word_embedding.embedding_dim()

		def average_embeddings(snippets):
			# One row per snippet: the mean of the word vectors of its tokens.
			averaged = np.zeros((len(snippets), embedding_dim), dtype=floatX)
			for row, snippet in enumerate(snippets):
				vectors = np.asarray([word_embedding.wordvec(word) for word in snippet.split()], dtype=floatX)
				averaged[row, :] = np.mean(vectors, axis=0)
			return averaged

		# Word embedding
		snippet_train_set = average_embeddings(snippet_train_txt)
		snippet_test_set = average_embeddings(snippet_test_txt)
		# Shuffle training and test data set; the same permutation is applied
		# to the features and to the labels so that they stay aligned
		train_rand_index = np.random.permutation(training_size)
		test_rand_index = np.random.permutation(test_size)
		snippet_train_set = snippet_train_set[train_rand_index, :]
		snippet_train_label = snippet_train_label[train_rand_index]
		snippet_test_set = snippet_test_set[test_rand_index, :]
		snippet_test_label = snippet_test_label[test_rand_index]
		# Decrease 1 from label
		snippet_train_label -= 1
		snippet_test_label -= 1
		self.snippet_train_set = snippet_train_set
		self.snippet_train_label = snippet_train_label
		self.snippet_test_set = snippet_test_set
		self.snippet_test_label = snippet_test_label
コード例 #3
0
ファイル: testio.py プロジェクト: appscluster/sentiment-CNN
class TestIO(unittest.TestCase):
    '''
    Sanity checks on the word embedding and on the snippet data files.
    '''

    def setUp(self):
        '''
        Load the word embedding, then the snippet texts and labels.
        '''
        self.word_embedding = WordEmbedding('../data/wiki_embeddings.txt')
        self.train_snip_txt = utils.loadtxt('../data/train_snip.txt')
        self.train_snip_label = utils.loadlabel('../data/train_label.txt')
        self.test_snip_txt = utils.loadtxt('../data/test_snip.txt')
        self.test_snip_label = utils.loadlabel('../data/test_label.txt')

    def _check_snippets(self, texts, labels, expected_size,
                        text_msg, label_msg):
        '''
        Check the size of one data split and print its per-class counts.

        Both ``texts`` and ``labels`` must contain ``expected_size`` items
        and the labels must take exactly 8 distinct values.
        '''
        self.assertEqual(len(texts), expected_size, text_msg)
        self.assertEqual(len(labels), expected_size, label_msg)
        num_class = len(set(labels))
        self.assertEqual(num_class, 8, 'Number of classes should be 8')
        # Classes are counted under the labels 1..num_class.
        for cls in xrange(1, num_class + 1):
            cls_count = np.sum(cls == labels)
            pprint("Number of instances in class %d: %d" % (cls, cls_count))

    def testEmbedding(self):
        '''
        The embedding must have the expected vocabulary size and dimension.
        '''
        vocabulary_size = self.word_embedding.dict_size()
        dimension = self.word_embedding.embedding_dim()
        pprint('Size of word vocabulary: %d' % vocabulary_size)
        pprint('Dimension of word embedding: %d' % dimension)
        self.assertEqual(vocabulary_size, 311467,
                         'Incorrect size of word vocabulary')
        self.assertEqual(dimension, 50,
                         'Incorrect dimension of word embedding')
        pprint("Unknown: ")
        pprint(self.word_embedding.wordvec('unknown'))

    def testSnippetTrain(self):
        '''
        The training split must hold 10060 snippets spread over 8 classes.
        '''
        self._check_snippets(self.train_snip_txt, self.train_snip_label,
                             10060,
                             'Training data not complete',
                             'Training label not complete')

    def testSnippetTest(self):
        '''
        The test split must hold 2280 snippets spread over 8 classes.
        '''
        self._check_snippets(self.test_snip_txt, self.test_snip_label,
                             2280,
                             'Test data not complete',
                             'Test label not complete')
コード例 #4
0
 def setUp(self):
     '''
     Load the training and test data sets and the word embeddings.

     Both data files are read as '|'-delimited rows of (text, label). Each
     sentence is lower-cased, split on whitespace and turned into a matrix
     with one embedding vector per word, framed by one extra row at the
     start and one at the end; those two rows are left as zeros here.

     Stored on the test case: ``sp_train_txt`` / ``sp_test_txt`` (raw text),
     ``sp_train_label`` / ``sp_test_label`` (int32 arrays),
     ``sp_train_set`` / ``sp_test_set`` (lists of per-sentence matrices),
     ``train_size`` / ``test_size``, ``word_embedding`` and ``blank_token``
     (the embedding vector of '</s>').
     '''
     np.random.seed(1991)
     sp_train_filename = '../data/refined_train_sp.txt'
     sp_test_filename = '../data/refined_test_sp.txt'
     sp_train_txt, sp_train_label = [], []
     sp_test_txt, sp_test_label = [], []
     start_time = time.time()
     # Read training data set. open() is used rather than the file()
     # constructor, which Python 2 discourages and Python 3 removed.
     with open(sp_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_train_txt.append(txt)
             sp_train_label.append(int(label))
     # Read test data set
     with open(sp_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_test_txt.append(txt)
             # Parse the label to int, exactly as for the training set, so a
             # malformed label fails here rather than in np.asarray below.
             sp_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Finished loading training and test data sets...')
     logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     word_embedding = WordEmbedding(embedding_filename)
     start_time = time.time()
     # Starting and Ending token for each sentence
     self.blank_token = word_embedding.wordvec('</s>')
     # Store original text representation
     self.sp_train_txt = sp_train_txt
     self.sp_test_txt = sp_test_txt
     # Labels as int32 arrays
     self.sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
     self.sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
     train_size = len(sp_train_txt)
     test_size = len(sp_test_txt)
     # Check size
     assert train_size == self.sp_train_label.shape[0]
     assert test_size == self.sp_test_label.shape[0]
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     embedding_dim = word_embedding.embedding_dim()

     def embed_sentences(sentences):
         # Return (matrices, lengths): one (len(words)+2, embedding_dim)
         # matrix per sentence and the matching list of row counts.
         matrices, lengths = [], []
         for sent in sentences:
             words = [word.lower() for word in sent.split()]
             vectors = np.zeros((len(words)+2, embedding_dim), dtype=floatX)
             # Rows 0 and -1 are not written and therefore stay zero.
             vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
             lengths.append(len(words)+2)
             matrices.append(vectors)
         return matrices, lengths

     # Sequential modeling for each sentence
     self.sp_train_set, sp_train_len = embed_sentences(sp_train_txt)
     self.sp_test_set, sp_test_len = embed_sentences(sp_test_txt)
     assert sp_train_len == [seq.shape[0] for seq in self.sp_train_set]
     assert sp_test_len == [seq.shape[0] for seq in self.sp_test_set]
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Store data
     self.train_size = train_size
     self.test_size = test_size
     self.word_embedding = word_embedding
     logger.debug('Max sentence length in training set: %d' % max(sp_train_len))
     logger.debug('Max sentence length in test set: %d' % max(sp_test_len))
コード例 #5
0
 def testActiveAndPassive(self):
     '''
     Train a GrCNNBagger on the refined "sp" data set and save the results.

     The method reads the '|'-delimited training and test files, turns each
     sentence into a matrix of word vectors framed by one zero row at each
     end, and trains for a fixed number of epochs with mini-batch updates.
     After every epoch it evaluates on the test set and logs the L2-norm of
     every model parameter.

     Whether training finishes or is aborted by an exception, the model is
     saved to 'sp-grbagger.model' and the per-epoch training/test accuracy
     and cost arrays are written, one after another, to 'sp-records.npy'.
     '''
     np.random.seed(1991)
     sp_train_filename = '../data/refined_train_sp.txt'
     sp_test_filename = '../data/refined_test_sp.txt'
     sp_train_txt, sp_train_label = [], []
     sp_test_txt, sp_test_label = [], []
     start_time = time.time()
     # Read training data set. open() is used rather than the file()
     # constructor, which Python 2 discourages and Python 3 removed.
     with open(sp_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_train_txt.append(txt)
             sp_train_label.append(int(label))
     # Read test data set
     with open(sp_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             sp_test_txt.append(txt)
             # Parse the label to int, exactly as for the training set, so a
             # malformed label fails here rather than in np.asarray below.
             sp_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Finished loading training and test data sets...')
     logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     word_embedding = WordEmbedding(embedding_filename)
     start_time = time.time()
     # Starting and Ending token for each sentence.
     # NOTE(review): blank_token is looked up but never used below; the
     # first and last row of every sentence matrix stay zero -- confirm
     # that this is intended.
     blank_token = word_embedding.wordvec('</s>')
     # Labels as int32 arrays
     sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
     sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
     train_size = len(sp_train_txt)
     test_size = len(sp_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Sequential modeling for each sentence
     sp_train_set, sp_test_set = [], []
     # Embedding for training set
     for sent in sp_train_txt:
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_train_set.append(vectors)
     # Embedding for test set
     for sent in sp_test_txt:
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
         vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
         sp_test_set.append(vectors)
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 2e-2
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 10
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     training_acc, training_cost = [], []
     test_acc, test_cost = [], []
     try:
         # With sample_size = 0 the two "Sampling" loops below do not run.
         sample_size = 0
         # Small constant added to the denominator of the gradient scaling
         # so that the division never hits an exact zero.
         fuedge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             # Floor division: num_batch is passed to xrange() and used in
             # index arithmetic, so it must stay an integer even if true
             # division is in effect.
             num_batch = train_size // batch_size
             for k in xrange(num_batch):
                 # Per-batch accumulators: the sum of the gradients and the
                 # sum of the squared gradients, one array per parameter.
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     # A NaN anywhere in the results is only logged; training
                     # continues regardless.
                     for r in results:
                         if np.isnan(np.sum(r)):
                             logger.debug('*' * 50)
                             logger.debug('Error!!!!!')
                             logger.debug('NaN found at %dth training instance' % j)
                             logger.debug('*' * 50)
                     # results holds the gradients first, then the cost, then
                     # the prediction.
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 # Average the gradient over the batch, then scale it
                 # element-wise by the root of the summed squared gradients.
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             # Fresh accumulators for the instances left over after the last
             # full batch (if any).
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     results = grbagger.compute_gradient_and_cost(sp_train_set[j], sp_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == sp_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     # The remainder batch is averaged over its own size.
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             ## Adding training accuracy and training cost
             training_acc.append(train_accuracy)
             training_cost.append(costs)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(sp_test_set[j])
                 cost = grbagger.show_cost(sp_test_set[j], sp_test_label[j])
                 if pred == sp_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             ## Adding test accuracy and test cost
             test_acc.append(test_accuracy)
             test_cost.append(costs)
             logger.debug('Test accuracy: %f' % test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(sp_train_set[idx])
                 scores = grbagger.show_scores(sp_train_set[idx])
                 prob = grbagger.show_prob(sp_train_set[idx])
                 label = sp_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(sp_test_set[idx])
                 scores = grbagger.show_scores(sp_test_set[idx])
                 prob = grbagger.show_prob(sp_test_set[idx])
                 label = sp_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except:
         # NOTE(review): the bare except is kept on purpose. Every exception,
         # KeyboardInterrupt included, is logged and swallowed, so the test
         # does not fail when training aborts -- confirm this is wanted.
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('sp-grbagger.model', grbagger)
         # Save all the training and test records
         training_acc = np.asarray(training_acc)
         training_cost = np.asarray(training_cost)
         test_acc = np.asarray(test_acc)
         test_cost = np.asarray(test_cost)
         # np.save emits binary data, so the file is opened in binary mode.
         # The four arrays are written back to back into the same file.
         with open('sp-records.npy', 'wb') as fout:
             np.save(fout, training_acc)
             np.save(fout, training_cost)
             np.save(fout, test_acc)
             np.save(fout, test_cost)
         logger.debug('Training and test records saved to sp-records.npy...')
         logger.debug('Finished...')
コード例 #6
0
 def testSentimentFineTune(self):
     '''
     Build a small model and use it on sentiment analysis task. With fine-tunning
     the word-embedding matrix.
     '''
     np.random.seed(1991)
     fname = './grCNN.conf'
     configer = GrCNNConfiger(fname)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     senti_train_txt, senti_train_label = [], []
     senti_test_txt, senti_test_label = [], []
     start_time = time.time()
     # Read training data set
     with file(senti_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_train_txt.append(txt)
             senti_train_label.append(int(label))
     # Read test data set
     with file(senti_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_test_txt.append(txt)
             senti_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load training/test data sets and wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     blank_index = word_embedding.word2index('</s>')
     logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
     # Word-vector representation
     senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Shuffling for all the instances
     start_time = time.time()
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     # Shuffle label
     senti_train_label = senti_train_label[rindex]
     senti_test_label = senti_test_label[tindex]
     # Shuffle text
     senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
     senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
     end_time = time.time()
     logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time-start_time))
     # Compute word embedding
     senti_train_set = []
     senti_test_set = []
     # Record the index of each word in each sentence for only once
     senti_train_word_index = []
     senti_test_word_index = []
     # Record the sparse input indicator matrix only once for fast computation
     senti_train_sparse_select = []
     senti_test_sparse_select = []
     # Embedding for training set
     for i, sent in enumerate(senti_train_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_train_set.append(vectors)
         senti_train_word_index.append(indices)
         senti_train_sparse_select.append(sparse_select)
     # Embedding for test set
     for i, sent in enumerate(senti_test_txt):
         words = sent.split()
         words = [word.lower() for word in words]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         indices = [blank_index]
         indices += [word_embedding.word2index(word) for word in words]
         indices += [blank_index]
         sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
         sparse_select[range(len(words)+2), indices] = 1.0
         sparse_select = csc_matrix(sparse_select)
         senti_test_set.append(vectors)
         senti_test_word_index.append(indices)
         senti_test_sparse_select.append(sparse_select)
     end_time = time.time()
     logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     training_threshold_epoch = 30
     try:
         sample_size = 0
         fuedge_factor = 1e-6
         # accumu matrix for word-embedding matrix
         # hist matrix for word-embedding matrix
         accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             num_batch = train_size / batch_size
             for k in xrange(num_batch):
                 # Clear all the cache        
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 if i > training_threshold_epoch:
                     accumu_embedding[:] = 0.0
                     hist_embedding[:] = 0.0
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Updating model parameters
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Updating word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 grbagger.update_params(accumu_grads, rate)
             # Clear all the cache again
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if i > training_threshold_epoch:
                 accumu_embedding[:] = 0.0
                 hist_embedding[:] = 0.0
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Normalizing model parameters
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Normalizing word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= train_size-num_batch*batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 # Updating all the parameters
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' 
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                 pred = grbagger.predict(test_sent_rep)
                 cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])    
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
             wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
             logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
     except:
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('fine-grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         with file('fine-senti-records.npy', 'w') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to fine-senti-records.npy...')
         logger.debug('Finished...')
コード例 #7
0
class TestOnLarge(unittest.TestCase):
    '''
	Test the initial sentence model on Wiki-Data set, which contains 
	39,746,550 sentences,
	782,603,381 words
	'''
    def setUp(self):
        train_txt_filename = '../data/wiki_sentence.txt'
        wiki_filename = '../data/wiki_embeddings.txt'
        start_time = time.time()
        self.word_embedding = WordEmbedding(wiki_filename)
        with file(train_txt_filename, 'rb') as fin:
            self.train_txt = fin.readlines()
        end_time = time.time()
        # Since the maximum length in the task of sentiment_analysis is 56, during training
        # we will set 56 as the maximum length of each sentence
        self.max_length = 56
        self.num_sent = len(self.train_txt)
        self.batch_size = 2000
        self.nepoch = 5
        pprint('Time used to load wiki sentences into memory: %f seconds.' %
               (end_time - start_time))
        pprint('Number of sentences in the data set: %d' % len(self.train_txt))

    def testTrain(self):
        '''
		Train Auto-Encoder + SoftmaxLayer on batch learning mode.
		'''
        input_dim, hidden_dim = self.max_length * self.word_embedding.embedding_dim(
        ), 500
        # Build AutoEncoder + SoftmaxLayer
        start_time = time.time()
        seed = 1991
        input_matrix = T.matrix(name='input')
        num_in, num_out = input_dim, hidden_dim
        act = Activation('tanh')
        is_denoising, is_sparse = True, False
        lambda1, mask = 1e-4, 0.5
        rng = RandomStreams(seed)
        sent_model = SentModel(input_matrix, (num_in, num_out),
                               act,
                               is_denoising,
                               is_sparse,
                               lambda1,
                               mask,
                               rng,
                               verbose=True)
        end_time = time.time()
        pprint('Time used to build the model: %f seconds.' %
               (end_time - start_time))
        # Loading training data and start batch training mode
        num_batch = self.num_sent / self.batch_size
        learn_rate = 0.1
        # Pretraining
        pprint('Start pretraining...')
        start_time = time.time()
        for i in xrange(self.nepoch):
            # Batch training
            pprint('Training epoch: %d' % i)
            for j in xrange(num_batch):
                train_set = np.zeros(
                    (self.batch_size,
                     self.max_length * self.word_embedding.embedding_dim()),
                    dtype=floatX)
                train_txt = self.train_txt[j * self.batch_size:(j + 1) *
                                           self.batch_size]
                for k, sent in enumerate(train_txt):
                    words = sent.split()
                    vectors = np.asarray(
                        [self.word_embedding.wordvec(word) for word in words])
                    vectors = vectors.flatten()
                    train_set[k, :vectors.shape[0]] = vectors
                rate = learn_rate
                cost = sent_model.pretrain(train_set, rate)
                if (j + 1) % 500 == 0:
                    pprint('Training epoch: %d, Number batch: %d, cost = %f' %
                           (i, j, cost))
            # Saving temporary pretraining model in .gz
            with gzip.GzipFile('./large_pretrain.sent.gz', 'wb') as fout:
                cPickle.dump(sent_model, fout)
        end_time = time.time()
        pprint('Time used for pretraining: %f minutes.' %
               ((end_time - start_time) / 60.0))
        # Fine tuning
        pprint('Start fine-tuning...')
        start_time = time.time()
        for i in xrange(self.nepoch):
            # Batch training
            pprint('Training epoch: %d' % i)
            for j in xrange(num_batch):
                train_set = np.zeros(
                    (self.batch_size,
                     self.max_length * self.word_embedding.embedding_dim()),
                    dtype=floatX)
                train_txt = self.train_txt[j * self.batch_size:(j + 1) *
                                           self.batch_size]
                for k, sent in enumerate(train_txt):
                    words = sent.split()
                    vectors = np.asarray(
                        [self.word_embedding.wordvec(word) for word in words])
                    vectors = vectors.flatten()
                    train_set[k, :vectors.shape[0]] = vectors
                rate = learn_rate
                cost = sent_model.finetune(train_set, rate)
                if (j + 1) % 500 == 0:
                    pprint('Training epoch: %d, Number batch: %d, cost = %f' %
                           (i, j, cost))
            # Saving temporary fine-tuning model in .gz
            with gzip.GzipFile('./large_finetune.sent.gz', 'wb') as fout:
                cPickle.dump(sent_model, fout)
        end_time = time.time()
        pprint('Time used for fine-tuning: %f minutes.' %
               ((end_time - start_time) / 60.0))
コード例 #8
0
# Every review text must come with exactly one label.
assert len(mr_txt) == len(mr_label)
data_size = len(mr_txt)
logger.info('Size of the data sets: %d' % data_size)
# Draw one random order and apply it to texts and labels alike so that each
# text stays paired with its label.
random_index = np.arange(data_size)
np.random.shuffle(random_index)
shuffled_txt = np.asarray(mr_txt)[random_index]
shuffled_label = np.asarray(mr_label)[random_index]
mr_txt = list(shuffled_txt)
mr_label = list(shuffled_label)
end_time = time.time()
# Report how long loading + shuffling took.
logger.info('Time used to load and shuffle MR dataset: %f seconds.' %
            (end_time - start_time))
# Load the wiki word embeddings.
embedding_filename = './wiki_embeddings.txt.zip'
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
blank_index = word_embedding.word2index('</s>')
logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Dense representation: one (max_len, embedding_dim) slab per sentence,
# zero-padded; row 0 of every slab is left as padding.
max_len = 52
mr_insts = np.zeros((data_size, max_len, word_embedding.embedding_dim()),
                    dtype=np.float32)
# Labels become a column vector.
mr_label = np.asarray(mr_label)[:, None]
for i, sent in enumerate(mr_txt):
    words = [word.lower() for word in sent.split()]
    # Keep at most max_len - 2 words so the first and last rows stay zero.
    l = min(len(words), max_len - 2)
    mr_insts[i, 1:l + 1, :] = np.asarray(
        [word_embedding.wordvec(word) for word in words[:l]])
コード例 #9
0
 def setUp(self):
     '''
     Load training and test texts and labels in sentiment analysis task, preprocessing.

     Each sentence is lower-cased, split on whitespace and converted into a
     (num_words + 2, embedding_dim) matrix whose first and last rows hold the
     word vector of the sentence-boundary token. The texts, labels, matrices,
     data set sizes and the embedding object are stored on the instance.
     '''
     np.random.seed(1991)
     senti_train_filename = '../data/sentiment-train.txt'
     senti_test_filename = '../data/sentiment-test.txt'

     def read_labelled(filename):
         # Parse a '|'-delimited file of (text, label) rows.
         texts, labels = [], []
         # open() replaces the Python-2-only file() builtin.
         with open(filename, 'r') as fin:
             for txt, label in csv.reader(fin, delimiter='|'):
                 texts.append(txt)
                 labels.append(int(label))
         return texts, labels

     start_time = time.time()
     # Read training data set
     senti_train_txt, senti_train_label = read_labelled(senti_train_filename)
     # Read test data set
     senti_test_txt, senti_test_label = read_labelled(senti_test_filename)
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     self.token = word_embedding.wordvec('</s>')
     # Store the original text representation
     self.senti_train_txt = senti_train_txt
     self.senti_test_txt = senti_test_txt
     # Labels as int32 arrays
     self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     self.train_size = len(senti_train_txt)
     self.test_size = len(senti_test_txt)
     logger.debug('Training set size: %d' % self.train_size)
     logger.debug('Test set size: %d' % self.test_size)
     assert self.train_size == self.senti_train_label.shape[0]
     assert self.test_size == self.senti_test_label.shape[0]

     def embed(sentences):
         # One matrix per sentence, framed by the boundary token.
         matrices = []
         for sent in sentences:
             words = [word.lower() for word in sent.split()]
             vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
             vectors[0, :] = self.token
             # Skip the body for an empty sentence: assigning an empty 1-D
             # array into the (0, dim) slice would raise ValueError.
             if words:
                 vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
             vectors[-1, :] = self.token
             matrices.append(vectors)
         return matrices

     # Build the word-embedding matrix
     start_time = time.time()
     self.senti_train_set = embed(senti_train_txt)
     self.senti_test_set = embed(senti_test_txt)
     end_time = time.time()
     logger.debug('Time used to build training and test word embedding matrix: %f seconds.' % (end_time-start_time))
     self.word_embedding = word_embedding
コード例 #10
0
 def setUp(self):
     '''
     Load training and test data set, also, loading word-embeddings.

     Each sentence is lower-cased, split on whitespace and converted into a
     (num_words + 2, embedding_dim) matrix; the first and last rows are left
     as zeros. Texts, labels, matrices, sizes and the embedding object are
     stored on the instance.
     '''
     np.random.seed(42)
     sp_train_filename = '../data/refined_train_sp.txt'
     sp_test_filename = '../data/refined_test_sp.txt'

     def read_labelled(filename):
         # Parse a '|'-delimited file of (text, label) rows.
         texts, labels = [], []
         # open() replaces the Python-2-only file() builtin.
         with open(filename, 'r') as fin:
             for txt, label in csv.reader(fin, delimiter='|'):
                 texts.append(txt)
                 labels.append(int(label))
         return texts, labels

     start_time = time.time()
     # Read training and test data sets
     sp_train_txt, sp_train_label = read_labelled(sp_train_filename)
     sp_test_txt, sp_test_label = read_labelled(sp_test_filename)
     end_time = time.time()
     logger.debug('Finished loading training and test data set...')
     logger.debug('Time used for loading: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     word_embedding = WordEmbedding(embedding_filename)
     start_time = time.time()
     # Beginning and trailing token for each sentence
     self.blank_token = word_embedding.wordvec('</s>')
     # Store original text representation
     self.sp_train_txt = sp_train_txt
     self.sp_test_txt = sp_test_txt
     # Store original label
     self.sp_train_label = np.asarray(sp_train_label, dtype=np.int32)
     self.sp_test_label = np.asarray(sp_test_label, dtype=np.int32)
     train_size = len(sp_train_txt)
     test_size = len(sp_test_txt)
     # Check size
     assert train_size == self.sp_train_label.shape[0]
     assert test_size == self.sp_test_label.shape[0]
     # Output the information
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)

     def embed(sentences):
         # Return (matrices, lengths): one padded matrix per sentence and
         # the row count each one is expected to have.
         matrices, lengths = [], []
         for sent in sentences:
             words = [word.lower() for word in sent.split()]
             vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
             # Skip the body for an empty sentence: assigning an empty 1-D
             # array into the (0, dim) slice would raise ValueError.
             if words:
                 vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
             lengths.append(len(words)+2)
             matrices.append(vectors)
         return matrices, lengths

     # Word-vector representation
     self.sp_train_set, sp_train_len = embed(sp_train_txt)
     self.sp_test_set, sp_test_len = embed(sp_test_txt)
     # Check word-length
     assert sp_train_len == [seq.shape[0] for seq in self.sp_train_set]
     assert sp_test_len == [seq.shape[0] for seq in self.sp_test_set]
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds' % (end_time-start_time))
     # Store metadata
     self.train_size = train_size
     self.test_size = test_size
     self.word_embedding = word_embedding
     logger.debug('Sentence of maximum length in training set: %d' % max(sp_train_len))
     logger.debug('Sentence of maximum length in test set: %d' % max(sp_test_len))
コード例 #11
0
	def setUp(self):
		'''
		Load training and test texts and labels
		in sentiment analysis task, preprocessing.

		For every sentence two representations are built: an int32 array of
		word indices framed by the blank-token index, and a
		(num_words + 2, embedding_dim) matrix of word vectors whose first and
		last rows are left as zeros. Both, together with the texts, labels,
		sizes and the embedding object, are stored on the instance.
		'''
		np.random.seed(42)
		senti_train_filename = '../data/sentiment-train.txt'
		senti_test_filename = '../data/sentiment-test.txt'
		start_time = time.time()
		# Load training/test data sets and wiki-embeddings
		embedding_filename = '../data/wiki_embeddings.txt'
		word_embedding = WordEmbedding(embedding_filename)
		# Starting and Ending token for each sentence
		self.blank_token = word_embedding.wordvec('</s>')
		self.blank_index = word_embedding.word2index('</s>')

		def read_indexed(filename):
			# Parse a '|'-delimited file of (text, label) rows and record
			# the word ids of every sentence for fine-tuning.
			texts, labels, word_ids = [], [], []
			# open() replaces the Python-2-only file() builtin.
			with open(filename, 'r') as fin:
				for txt, label in csv.reader(fin, delimiter='|'):
					texts.append(txt)
					labels.append(int(label))
					words = [word.lower() for word in txt.split()]
					indices = np.zeros(len(words)+2, dtype=np.int32)
					indices[0] = self.blank_index
					indices[1:-1] = np.asarray([word_embedding.word2index(word) for word in words])
					indices[-1] = self.blank_index
					word_ids.append(indices)
			return texts, labels, word_ids

		def embed(sentences):
			# Return (matrices, lengths): one padded matrix per sentence and
			# the row count each one is expected to have.
			matrices, lengths = [], []
			for sent in sentences:
				words = [word.lower() for word in sent.split()]
				vectors = np.zeros((len(words)+2, word_embedding.embedding_dim()), dtype=floatX)
				# Skip the body for an empty sentence: assigning an empty 1-D
				# array into the (0, dim) slice would raise ValueError.
				if words:
					vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
				lengths.append(len(words)+2)
				matrices.append(vectors)
			return matrices, lengths

		# Read training data set
		senti_train_txt, senti_train_label, senti_train_words_label = read_indexed(senti_train_filename)
		# Read test data set
		senti_test_txt, senti_test_label, senti_test_words_label = read_indexed(senti_test_filename)
		end_time = time.time()
		logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
		start_time = time.time()
		# Store original word index representation
		self.senti_train_words_label = senti_train_words_label
		self.senti_test_words_label = senti_test_words_label
		# Store original text representation
		self.senti_train_txt = senti_train_txt
		self.senti_test_txt = senti_test_txt
		# Labels as int32 arrays
		self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
		self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
		train_size = len(senti_train_txt)
		test_size = len(senti_test_txt)
		# Check size
		assert train_size == self.senti_train_label.shape[0]
		assert test_size == self.senti_test_label.shape[0]
		logger.debug('Training size: %d' % train_size)
		logger.debug('Test size: %d' % test_size)
		# Sequential modeling for each sentence
		self.senti_train_set, senti_train_len = embed(senti_train_txt)
		self.senti_test_set, senti_test_len = embed(senti_test_txt)
		assert senti_train_len == [seq.shape[0] for seq in self.senti_train_set]
		assert senti_test_len == [seq.shape[0] for seq in self.senti_test_set]
		end_time = time.time()
		logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
		# Store data
		self.train_size = train_size
		self.test_size = test_size
		self.word_embedding = word_embedding
コード例 #12
0
	def setUp(self):
		'''
		Load training and test texts and labels
		in sentiment analysis task, preprocessing.

		Every sentence is represented by the mean of the word vectors of its
		lower-cased, whitespace-separated tokens. Texts, features and labels
		of each split are then shuffled with one shared permutation and
		stored on the instance together with the sizes and the embedding
		object.
		'''
		np.random.seed(1991)
		# The phrase-level training file is used instead of
		# '../data/sentiment-train.txt'.
		senti_train_filename = '../data/sentiment-train-phrases.txt'
		senti_test_filename = '../data/sentiment-test.txt'

		def read_labelled(filename):
			# Parse a '|'-delimited file of (text, label) rows.
			texts, labels = [], []
			# open() replaces the Python-2-only file() builtin.
			with open(filename, 'r') as fin:
				for txt, label in csv.reader(fin, delimiter='|'):
					texts.append(txt)
					labels.append(int(label))
			return texts, labels

		start_time = time.time()
		# Read training data set
		senti_train_txt, senti_train_label = read_labelled(senti_train_filename)
		# Read test data set
		senti_test_txt, senti_test_label = read_labelled(senti_test_filename)
		end_time = time.time()
		pprint('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
		embedding_filename = '../data/wiki_embeddings.txt'
		# Load training/test data sets and wiki-embeddings
		word_embedding = WordEmbedding(embedding_filename)
		start_time = time.time()
		# Store original text representation
		self.senti_train_txt = senti_train_txt
		self.senti_test_txt = senti_test_txt
		# Labels as int32 arrays
		self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
		self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
		train_size = len(senti_train_txt)
		test_size = len(senti_test_txt)
		# Check size
		assert train_size == self.senti_train_label.shape[0]
		assert test_size == self.senti_test_label.shape[0]
		pprint('Training size: %d' % train_size)
		pprint('Test size: %d' % test_size)

		def mean_embed(sentences):
			# One row per sentence holding the mean of its word vectors.
			features = np.zeros((len(sentences), word_embedding.embedding_dim()), dtype=floatX)
			for i, sent in enumerate(sentences):
				words = [word.lower() for word in sent.split()]
				if not words:
					# The mean over zero vectors is NaN; keep the zero row
					# so an empty sentence cannot poison the features.
					continue
				vectors = np.asarray([word_embedding.wordvec(word) for word in words])
				features[i, :] = np.mean(vectors, axis=0)
			return features

		# Compute word embedding
		self.senti_train_set = mean_embed(senti_train_txt)
		self.senti_test_set = mean_embed(senti_test_txt)
		# Shuffle training and test data set; the same permutation is applied
		# to texts, features and labels so they stay aligned.
		train_rand_index = np.random.permutation(train_size)
		test_rand_index = np.random.permutation(test_size)
		self.senti_train_txt = list(np.asarray(self.senti_train_txt)[train_rand_index])
		self.senti_test_txt = list(np.asarray(self.senti_test_txt)[test_rand_index])
		self.senti_train_set = self.senti_train_set[train_rand_index, :]
		self.senti_test_set = self.senti_test_set[test_rand_index, :]
		self.senti_train_label = self.senti_train_label[train_rand_index]
		self.senti_test_label = self.senti_test_label[test_rand_index]
		end_time = time.time()
		pprint('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
		# Store data
		self.train_size = train_size
		self.test_size = test_size
		self.word_embedding = word_embedding
コード例 #13
0
# Every review text must come with exactly one label.
assert len(mr_txt) == len(mr_label)
data_size = len(mr_txt)
logger.info('Size of the data sets: %d' % data_size)
# Shuffle texts and labels with one shared random order so pairs stay aligned.
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' %
            (end_time - start_time))
# Load word-embedding
embedding_filename = './wiki_embeddings.txt'
# Load training/test data sets and wiki-embeddings.
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
blank_index = word_embedding.word2index('</s>')
logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Word-vector representation, zero-padding all the sentences to the maximum length.
max_len = 52
mr_insts = np.zeros((data_size, max_len, word_embedding.embedding_dim()),
                    dtype=np.float32)
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = sent.split()
    words = [word.lower() for word in words]
    # Row 0 is padding, so at most max_len - 1 words fit. Without this clamp
    # a longer sentence made the assignment below raise ValueError.
    l = min(len(words), max_len - 1)
    if l == 0:
        # Nothing to copy; an empty 1-D array cannot be broadcast into the
        # (0, embedding_dim) slice, so leave the slab as zeros.
        continue
    mr_insts[i, 1:l + 1, :] = np.asarray(
        [word_embedding.wordvec(word) for word in words[:l]])
コード例 #14
0
    def setUp(self):
        '''
        Load training and test texts and labels
        in sentiment analysis task, preprocessing.

        Every sentence is represented by the mean of the word vectors of its
        lower-cased, whitespace-separated tokens. Texts, features and labels
        of each split are then shuffled with one shared permutation and
        stored on the instance together with the sizes and the embedding
        object.
        '''
        np.random.seed(1991)
        # The phrase-level training file is used instead of
        # '../data/sentiment-train.txt'.
        senti_train_filename = '../data/sentiment-train-phrases.txt'
        senti_test_filename = '../data/sentiment-test.txt'

        def read_labelled(filename):
            # Parse a '|'-delimited file of (text, label) rows.
            texts, labels = [], []
            # open() replaces the Python-2-only file() builtin.
            with open(filename, 'r') as fin:
                for txt, label in csv.reader(fin, delimiter='|'):
                    texts.append(txt)
                    labels.append(int(label))
            return texts, labels

        start_time = time.time()
        # Read training data set
        senti_train_txt, senti_train_label = read_labelled(
            senti_train_filename)
        # Read test data set
        senti_test_txt, senti_test_label = read_labelled(senti_test_filename)
        end_time = time.time()
        pprint('Time used to load training and test data set: %f seconds.' %
               (end_time - start_time))
        embedding_filename = '../data/wiki_embeddings.txt'
        # Load training/test data sets and wiki-embeddings
        word_embedding = WordEmbedding(embedding_filename)
        start_time = time.time()
        # Store original text representation
        self.senti_train_txt = senti_train_txt
        self.senti_test_txt = senti_test_txt
        # Labels as int32 arrays
        self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
        self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
        train_size = len(senti_train_txt)
        test_size = len(senti_test_txt)
        # Check size
        assert train_size == self.senti_train_label.shape[0]
        assert test_size == self.senti_test_label.shape[0]
        pprint('Training size: %d' % train_size)
        pprint('Test size: %d' % test_size)

        def mean_embed(sentences):
            # One row per sentence holding the mean of its word vectors.
            features = np.zeros(
                (len(sentences), word_embedding.embedding_dim()), dtype=floatX)
            for i, sent in enumerate(sentences):
                words = [word.lower() for word in sent.split()]
                if not words:
                    # The mean over zero vectors is NaN; keep the zero row
                    # so an empty sentence cannot poison the features.
                    continue
                vectors = np.asarray(
                    [word_embedding.wordvec(word) for word in words])
                features[i, :] = np.mean(vectors, axis=0)
            return features

        # Compute word embedding
        self.senti_train_set = mean_embed(senti_train_txt)
        self.senti_test_set = mean_embed(senti_test_txt)
        # Shuffle training and test data set; the same permutation is applied
        # to texts, features and labels so they stay aligned.
        train_rand_index = np.random.permutation(train_size)
        test_rand_index = np.random.permutation(test_size)
        self.senti_train_txt = list(
            np.asarray(self.senti_train_txt)[train_rand_index])
        self.senti_test_txt = list(
            np.asarray(self.senti_test_txt)[test_rand_index])
        self.senti_train_set = self.senti_train_set[train_rand_index, :]
        self.senti_test_set = self.senti_test_set[test_rand_index, :]
        self.senti_train_label = self.senti_train_label[train_rand_index]
        self.senti_test_label = self.senti_test_label[test_rand_index]
        end_time = time.time()
        pprint(
            'Time used to build initial training and test matrix: %f seconds.'
            % (end_time - start_time))
        # Store data
        self.train_size = train_size
        self.test_size = test_size
        self.word_embedding = word_embedding
コード例 #15
0
class TestOnLarge(unittest.TestCase):
    '''
    Test the initial sentence model on Wiki-Data set, which contains
    39,746,550 sentences,
    782,603,381 words
    '''
    def setUp(self):
        '''
        Load the word embeddings and every training sentence into memory and
        fix the hyper-parameters (maximum length, batch size, epochs).
        '''
        train_txt_filename = '../data/wiki_sentence.txt'
        wiki_filename = '../data/wiki_embeddings.txt'
        start_time = time.time()
        self.word_embedding = WordEmbedding(wiki_filename)
        with open(train_txt_filename, 'rb') as fin:
            self.train_txt = fin.readlines()
        end_time = time.time()
        # Since the maximum length in the task of sentiment_analysis is 56, during training
        # we will set 56 as the maximum length of each sentence
        self.max_length = 56
        self.num_sent = len(self.train_txt)
        self.batch_size = 2000
        self.nepoch = 5
        pprint('Time used to load wiki sentences into memory: %f seconds.' % (end_time-start_time))
        pprint('Number of sentences in the data set: %d' % len(self.train_txt))

    def _build_batch(self, sentences):
        '''
        Convert raw sentences into one zero-padded input matrix.

        Returns an array of shape (batch_size, max_length * embedding_dim) in
        which row k holds the concatenated word vectors of sentences[k].
        Sentences longer than max_length words are truncated so the row always
        fits; shorter ones keep trailing zeros.
        '''
        row_width = self.max_length * self.word_embedding.embedding_dim()
        batch = np.zeros((self.batch_size, row_width), dtype=floatX)
        for k, sent in enumerate(sentences):
            # Without the truncation a sentence of more than max_length words
            # would not fit into the row and the assignment below would raise.
            words = sent.split()[:self.max_length]
            # assumes wordvec() returns a vector of embedding_dim() entries -- TODO confirm
            vectors = np.asarray([self.word_embedding.wordvec(word) for word in words])
            vectors = vectors.flatten()
            batch[k, :vectors.shape[0]] = vectors
        return batch

    def _run_epochs(self, train_step, learn_rate, sent_model, checkpoint_filename):
        '''
        Run self.nepoch passes of mini-batch training.

        train_step is called as train_step(batch_matrix, learn_rate) and must
        return the cost of that batch. After every epoch sent_model is pickled
        to checkpoint_filename (gzip compressed), overwriting the previous one.
        Sentences beyond the last full batch are not used.
        '''
        # Floor division: only complete batches are trained on.
        num_batch = self.num_sent // self.batch_size
        for i in xrange(self.nepoch):
            pprint('Training epoch: %d' % i)
            for j in xrange(num_batch):
                batch_txt = self.train_txt[j*self.batch_size : (j+1)*self.batch_size]
                train_set = self._build_batch(batch_txt)
                cost = train_step(train_set, learn_rate)
                if (j+1) % 500 == 0:
                    pprint('Training epoch: %d, Number batch: %d, cost = %f' % (i, j, cost))
            # Saving temporary model in .gz
            with gzip.GzipFile(checkpoint_filename, 'wb') as fout:
                cPickle.dump(sent_model, fout)

    def testTrain(self):
        '''
        Train Auto-Encoder + SoftmaxLayer on batch learning mode.
        '''
        input_dim, hidden_dim = self.max_length * self.word_embedding.embedding_dim(), 500
        # Build AutoEncoder + SoftmaxLayer
        start_time = time.time()
        seed = 1991
        input_matrix = T.matrix(name='input')
        num_in, num_out = input_dim, hidden_dim
        act = Activation('tanh')
        is_denoising, is_sparse = True, False
        lambda1, mask = 1e-4, 0.5
        rng = RandomStreams(seed)
        sent_model = SentModel(input_matrix, (num_in, num_out), act,
                               is_denoising, is_sparse, lambda1, mask, rng, verbose=True)
        end_time = time.time()
        pprint('Time used to build the model: %f seconds.' % (end_time-start_time))
        learn_rate = 0.1
        # Pretraining
        pprint('Start pretraining...')
        start_time = time.time()
        self._run_epochs(sent_model.pretrain, learn_rate, sent_model,
                         './large_pretrain.sent.gz')
        end_time = time.time()
        pprint('Time used for pretraining: %f minutes.' % ((end_time-start_time)/60.0))
        # Fine tuning
        pprint('Start fine-tuning...')
        start_time = time.time()
        self._run_epochs(sent_model.finetune, learn_rate, sent_model,
                         './large_finetune.sent.gz')
        end_time = time.time()
        pprint('Time used for fine-tuning: %f minutes.' %((end_time-start_time)/60.0))
コード例 #16
0
    for txt, label in reader:
        senti_train_txt.append(txt)
        senti_train_label.append(int(label))
# Read test data set: every row of the '|'-delimited file is a (text, label) pair.
# NOTE(review): file() is a Python 2 only builtin.
with file(senti_test_filename, 'r') as fin:
    reader = csv.reader(fin, delimiter='|')
    for txt, label in reader:
        senti_test_txt.append(txt)
        # Labels are stored as text in the file; keep them as integers
        senti_test_label.append(int(label))
end_time = time.time()
logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
# Load word-embedding
embedding_filename = '../data/wiki_embeddings.txt'
# Build the word-embedding lookup from the embedding file
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
# Look up the index of the '</s>' token and log the word that index maps back to
blank_index = word_embedding.word2index('</s>')
logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Convert the label lists into int32 arrays
senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
train_size = len(senti_train_txt)
test_size = len(senti_test_txt)
# Check size
logger.debug('Training size: %d' % train_size)
logger.debug('Test size: %d' % test_size)
# Shuffling for all the instances
# NOTE(review): start_time is reset here; the value taken a few lines above
# is not read anywhere in between.
start_time = time.time()
# Identity index arrays 0..size-1; presumably permuted by code that follows -- verify
rindex = np.arange(train_size)
tindex = np.arange(test_size)
コード例 #17
0
 def testSentiment(self):
     '''
     Build a small model and use it on sentiment analysis task.

     Loads the training and test texts and labels of the sentiment analysis
     task, turns every sentence into a matrix of word vectors, trains a
     GrCNNBagger with mini-batch gradient updates, evaluates it on the test
     set after every epoch and finally saves the model together with the
     per-epoch cost/accuracy records.
     '''
     np.random.seed(1991)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'

     def load_labeled_text(filename):
         # Every row of the '|'-delimited file is a (text, label) pair.
         txts, labels = [], []
         with open(filename, 'r') as fin:
             for txt, label in csv.reader(fin, delimiter='|'):
                 txts.append(txt)
                 labels.append(int(label))
         return txts, labels

     start_time = time.time()
     senti_train_txt, senti_train_label = load_labeled_text(senti_train_filename)
     senti_test_txt, senti_test_label = load_labeled_text(senti_test_filename)
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     start_time = time.time()
     # Store original text representation
     self.senti_train_txt = senti_train_txt
     self.senti_test_txt = senti_test_txt
     # Labels as int32 arrays
     self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     assert train_size == self.senti_train_label.shape[0]
     assert test_size == self.senti_test_label.shape[0]
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)

     def embed_sentence(sent):
         # One row per lower-cased word, framed by an all-zero first and last row.
         words = [word.lower() for word in sent.split()]
         vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
         # An empty sentence keeps only the two zero rows; assigning an
         # empty 1-D array into the (0, embed_dim) slice would not broadcast.
         if words:
             vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
         return vectors

     # Compute word embedding
     self.senti_train_set = [embed_sentence(sent) for sent in senti_train_txt]
     self.senti_test_set = [embed_sentence(sent) for sent in senti_test_txt]
     end_time = time.time()
     logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
     # Store data
     self.train_size = train_size
     self.test_size = test_size
     self.word_embedding = word_embedding
     # Shuffling
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     # Index the Python lists directly: the sentence matrices have different
     # numbers of rows, and packing such a ragged list into one ndarray is
     # not reliable.
     self.senti_train_set = [self.senti_train_set[idx] for idx in rindex]
     self.senti_test_set = [self.senti_test_set[idx] for idx in tindex]
     self.senti_train_label = self.senti_train_label[rindex]
     self.senti_test_label = self.senti_test_label[tindex]
     senti_train_set, senti_test_set = self.senti_train_set, self.senti_test_set
     senti_train_label, senti_test_label = self.senti_train_label, self.senti_test_label
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(self.configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     try:
         # Number of random instances to inspect per epoch (0 disables the sampling logs)
         sample_size = 0
         # Small constant that keeps the gradient normalisation away from a division by zero
         fudge_factor = 1e-6
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training: walk over the data in strides of batch_size; the
             # last stride is shorter when train_size is not a multiple of it.
             for start in xrange(0, train_size, batch_size):
                 stop = min(start + batch_size, train_size)
                 # Per-batch sum of gradients and sum of squared gradients
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 for j in xrange(start, stop):
                     results = grbagger.compute_gradient_and_cost(senti_train_set[j], senti_train_label[j])
                     # Layout of results: gradients..., cost, prediction
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     # Average over the instances of this batch, then scale by
                     # the root of the batch's summed squared gradients.
                     accumu_grad /= stop - start
                     accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f'
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 pred = grbagger.predict(senti_test_set[j])
                 cost = grbagger.show_cost(senti_test_set[j], senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
     except:
         # NOTE(review): this catches everything, KeyboardInterrupt included;
         # the error is logged but not re-raised, so an interrupted or failed
         # run still continues into the save step below.
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         # np.save writes binary data, so the file has to be opened in
         # binary mode; the four arrays are stored back to back in this order.
         with open('senti-records.npy', 'wb') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to senti-records.npy...')
         logger.debug('Finished...')
コード例 #18
0
# Report the loading time; start_time and end_time are set before this excerpt.
logger.debug('Finished loading training and test data set...')
logger.debug('Time used to load training and test pairs: %f seconds.' %
             (end_time - start_time))
embedding_filename = '../data/wiki_embeddings.txt'
word_embedding = WordEmbedding(embedding_filename)
start_time = time.time()
# Beginning and trailing token for each sentence
blank_token = word_embedding.wordvec('</s>')
# Number of sentence pairs in each split
train_size = len(train_pairs_txt)
test_size = len(test_pairs_txt)
logger.debug('Size of training pairs: %d' % train_size)
logger.debug('Size of test pairs: %d' % test_size)
train_pairs_set, test_pairs_set = [], []
# Build word embedding for both training and test data sets
edim = word_embedding.embedding_dim()
# Build training data set
for i, (psent, qsent) in enumerate(train_pairs_txt):
    # First sentence of the pair: lower-case the whitespace-separated words
    pwords = psent.split()
    pwords = [pword.lower() for pword in pwords]
    # Two extra rows hold the blank token before and after the words
    pvectors = np.zeros((len(pwords) + 2, edim), dtype=floatX)
    pvectors[0, :], pvectors[-1, :] = blank_token, blank_token
    pvectors[1:-1, :] = np.asarray(
        [word_embedding.wordvec(pword) for pword in pwords], dtype=floatX)

    # Second sentence of the pair, embedded the same way
    qwords = qsent.split()
    qwords = [qword.lower() for qword in qwords]
    qvectors = np.zeros((len(qwords) + 2, edim), dtype=floatX)
    qvectors[0, :], qvectors[-1, :] = blank_token, blank_token
    qvectors[1:-1, :] = np.asarray(
        [word_embedding.wordvec(qword) for qword in qwords], dtype=floatX)
コード例 #19
0
# Texts and labels must line up one-to-one
assert len(mr_txt) == len(mr_label)
data_size = len(mr_txt)
logger.info('Size of the data sets: %d' % data_size)
# Draw one random permutation and apply it to both the texts and the labels,
# so every text keeps its own label.
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' %
            (end_time - start_time))
# Load word-embedding
embedding_filename = './wiki_embeddings.txt'
# Build the word-embedding lookup from the embedding file
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
# Look up the index of the '</s>' token and log the word that index maps back to
blank_index = word_embedding.word2index('</s>')
logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Word-vector representation, zero-padding all the sentences to the maximum length.
max_len = 52
# One (max_len, embedding_dim) matrix per sentence, initialised to zeros
mr_insts = np.zeros((data_size, max_len, word_embedding.embedding_dim()),
                    dtype=np.float32)
# Add a trailing axis to the label array
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = sent.split()
    words = [word.lower() for word in words]
    l = len(words)
    # Truncate the sentence to at most max_len - 2 words
    # (presumably the two spare positions are for boundary tokens -- verify)
    if (l > max_len - 2):
        words = words[:max_len - 2]