def test_move_and_back(self):
    numpy_optimizer = NumpyBigramOptimizer(self.statistics, self.vocabulary)
    theano_optimizer = TheanoBigramOptimizer(self.statistics, self.vocabulary)

    orig_class_counts = numpy.copy(numpy_optimizer._class_counts)
    orig_cc_counts = numpy.copy(numpy_optimizer._cc_counts)
    orig_cw_counts = numpy.copy(numpy_optimizer._cw_counts)
    orig_wc_counts = numpy.copy(numpy_optimizer._wc_counts)

    word_id = self.vocabulary.word_to_id['d']
    orig_class_id = numpy_optimizer.get_word_class(word_id)
    new_class_id = 3 if orig_class_id != 3 else 4
    numpy_optimizer._move(word_id, new_class_id)
    theano_optimizer._move(word_id, new_class_id)
    self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)

    # Moving one word changes exactly two class counts (the source and the
    # target class), and the total mass of every count matrix is preserved.
    self.assertEqual(numpy.count_nonzero(
        numpy_optimizer._class_counts != orig_class_counts), 2)
    self.assertEqual(numpy.sum(numpy_optimizer._class_counts),
                     numpy.sum(orig_class_counts))
    self.assertGreater(numpy.count_nonzero(
        numpy_optimizer._cc_counts != orig_cc_counts), 0)
    self.assertEqual(numpy.sum(numpy_optimizer._cc_counts),
                     numpy.sum(orig_cc_counts))
    self.assertGreater(numpy.count_nonzero(
        numpy_optimizer._cw_counts != orig_cw_counts), 0)
    self.assertEqual(numpy.sum(numpy_optimizer._cw_counts),
                     numpy.sum(orig_cw_counts))
    self.assertGreater(numpy.count_nonzero(
        numpy_optimizer._wc_counts != orig_wc_counts), 0)
    self.assertEqual(numpy.sum(numpy_optimizer._wc_counts),
                     numpy.sum(orig_wc_counts))

    # Moving the word back to its original class must restore the original
    # statistics exactly.
    numpy_optimizer._move(word_id, orig_class_id)
    theano_optimizer._move(word_id, orig_class_id)
    self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
    self.assertTrue(numpy.array_equal(numpy_optimizer._class_counts,
                                      orig_class_counts))
    self.assertTrue(numpy.array_equal(numpy_optimizer._cc_counts,
                                      orig_cc_counts))
    self.assertTrue(numpy.array_equal(numpy_optimizer._cw_counts,
                                      orig_cw_counts))
    self.assertTrue(numpy.array_equal(numpy_optimizer._wc_counts,
                                      orig_wc_counts))

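# Note: the tests in this class call an assert_optimizers_equal helper that
# is not part of this excerpt. Below is a minimal sketch of what it is
# assumed to check. The NumPy optimizer stores its statistics in numpy
# arrays, while the Theano optimizer keeps them in shared variables, hence
# the get_value() calls on the Theano side (as also seen in test_statistics).
def assert_optimizers_equal(self, numpy_optimizer, theano_optimizer):
    self.assertTrue(numpy.array_equal(
        numpy_optimizer._word_to_class,
        theano_optimizer._word_to_class.get_value()))
    self.assertTrue(numpy.array_equal(
        numpy_optimizer._class_counts,
        theano_optimizer._class_counts.get_value()))
    self.assertTrue(numpy.array_equal(
        numpy_optimizer._cc_counts,
        theano_optimizer._cc_counts.get_value()))
    self.assertTrue(numpy.array_equal(
        numpy_optimizer._cw_counts,
        theano_optimizer._cw_counts.get_value()))
    self.assertTrue(numpy.array_equal(
        numpy_optimizer._wc_counts,
        theano_optimizer._wc_counts.get_value()))
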
def test_move_and_recompute(self):
    optimizer1 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
    word_id = self.vocabulary.word_to_id['d']
    orig_class_id = optimizer1.get_word_class(word_id)
    new_class_id = 3 if orig_class_id != 3 else 4

    # Assign the word to the new class directly and recompute all class
    # statistics from scratch.
    optimizer1._word_to_class[word_id] = new_class_id
    counts = optimizer1._compute_class_statistics(optimizer1._word_counts,
                                                  optimizer1._ww_counts,
                                                  optimizer1._word_to_class)

    # Compute the same statistics with a naive reference implementation.
    class_counts = numpy.zeros(optimizer1.num_classes, 'int32')
    cc_counts = numpy.zeros(
        (optimizer1.num_classes, optimizer1.num_classes), dtype='int32')
    cw_counts = numpy.zeros(
        (optimizer1.num_classes, optimizer1.vocabulary_size), dtype='int32')
    wc_counts = numpy.zeros(
        (optimizer1.vocabulary_size, optimizer1.num_classes), dtype='int32')
    for wid, cid in enumerate(optimizer1._word_to_class):
        class_counts[cid] += optimizer1._word_counts[wid]
    for left_wid, right_wid in zip(*optimizer1._ww_counts.nonzero()):
        count = optimizer1._ww_counts[left_wid, right_wid]
        left_cid = optimizer1._word_to_class[left_wid]
        right_cid = optimizer1._word_to_class[right_wid]
        cc_counts[left_cid, right_cid] += count
        cw_counts[left_cid, right_wid] += count
        wc_counts[left_wid, right_cid] += count
    self.assertTrue(numpy.array_equal(class_counts, counts[0]))
    self.assertTrue(numpy.array_equal(cc_counts, counts[1]))
    self.assertTrue(numpy.array_equal(cw_counts, counts[2]))
    self.assertTrue(numpy.array_equal(wc_counts, counts[3]))

    optimizer1._class_counts = counts[0]
    optimizer1._cc_counts = counts[1]
    optimizer1._cw_counts = counts[2]
    optimizer1._wc_counts = counts[3]

    # The incremental update performed by _move() should produce the same
    # statistics as recomputing them from scratch.
    optimizer2 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
    optimizer2._move(word_id, new_class_id)
    self.assertEqual(numpy.count_nonzero(
        optimizer1._class_counts != optimizer2._class_counts), 0)
    self.assertEqual(numpy.count_nonzero(
        optimizer1._cc_counts != optimizer2._cc_counts), 0)
    self.assertEqual(numpy.count_nonzero(
        optimizer1._cw_counts != optimizer2._cw_counts), 0)
    self.assertEqual(numpy.count_nonzero(
        optimizer1._wc_counts != optimizer2._wc_counts), 0)

    # The Theano implementation should agree with the NumPy one.
    optimizer3 = TheanoBigramOptimizer(self.statistics, self.vocabulary)
    optimizer3._move(word_id, new_class_id)
    self.assert_optimizers_equal(optimizer2, optimizer3)

def test_evaluate(self):
    numpy_optimizer = NumpyBigramOptimizer(self.statistics, self.vocabulary)
    theano_optimizer = TheanoBigramOptimizer(self.statistics, self.vocabulary)
    word_id = numpy_optimizer.get_word_id('d')
    orig_class_id = numpy_optimizer.get_word_class(word_id)
    new_class_id = 1 if orig_class_id != 1 else 0

    orig_ll = numpy_optimizer.log_likelihood()
    self.assertTrue(numpy.isclose(orig_ll, theano_optimizer.log_likelihood()))

    # _evaluate() returns the log likelihood difference that the move would
    # cause, without actually performing it.
    ll_diff = numpy_optimizer._evaluate(word_id, new_class_id)
    self.assertTrue(numpy.isclose(
        ll_diff, theano_optimizer._evaluate(word_id, new_class_id)))

    # Actually performing the move should change the log likelihood by
    # exactly the predicted difference.
    numpy_optimizer._move(word_id, new_class_id)
    new_ll = numpy_optimizer.log_likelihood()
    self.assertFalse(numpy.isclose(orig_ll, new_ll))
    self.assertTrue(numpy.isclose(orig_ll + ll_diff, new_ll))

    theano_optimizer._move(word_id, new_class_id)
    self.assertTrue(numpy.isclose(new_ll, theano_optimizer.log_likelihood()))

def test_statistics(self):
    num_words = 8
    theano_optimizer = TheanoBigramOptimizer(self.statistics, self.vocabulary)
    numpy_optimizer = NumpyBigramOptimizer(self.statistics, self.vocabulary)
    self.assertEqual(theano_optimizer.vocabulary_size, num_words)
    self.assertEqual(numpy_optimizer.vocabulary_size, num_words)
    # In addition to the requested classes, there are three extra classes,
    # presumably for the special tokens <s>, </s>, and <unk>.
    self.assertEqual(theano_optimizer.num_classes, self.num_classes + 3)
    self.assertEqual(numpy_optimizer.num_classes, self.num_classes + 3)
    self.assertEqual(len(theano_optimizer._word_to_class.get_value()),
                     num_words)
    self.assertEqual(len(numpy_optimizer._word_to_class), num_words)

    sos_word_id = self.vocabulary.word_to_id['<s>']
    a_word_id = self.vocabulary.word_to_id['a']
    b_word_id = self.vocabulary.word_to_id['b']
    c_word_id = self.vocabulary.word_to_id['c']
    d_word_id = self.vocabulary.word_to_id['d']
    e_word_id = self.vocabulary.word_to_id['e']
    unk_word_id = self.vocabulary.word_to_id['<unk>']
    eos_word_id = self.vocabulary.word_to_id['</s>']

    self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
    self.assertEqual(len(numpy_optimizer._word_counts), num_words)
    self.assertEqual(numpy_optimizer._word_counts[sos_word_id], 11)
    self.assertEqual(numpy_optimizer._word_counts[a_word_id], 13)
    self.assertEqual(numpy_optimizer._word_counts[b_word_id], 8)
    self.assertEqual(numpy_optimizer._word_counts[c_word_id], 8)
    self.assertEqual(numpy_optimizer._word_counts[d_word_id], 11)
    self.assertEqual(numpy_optimizer._word_counts[e_word_id], 15)
    self.assertEqual(numpy_optimizer._word_counts[unk_word_id], 0)
    self.assertEqual(numpy_optimizer._word_counts[eos_word_id], 11)
    self.assertEqual(numpy_optimizer._ww_counts.shape[0], num_words)
    self.assertEqual(numpy_optimizer._ww_counts.shape[1], num_words)
    self.assertEqual(len(numpy_optimizer._class_counts), self.num_classes + 3)
    self.assertEqual(numpy_optimizer._cc_counts.shape[0], self.num_classes + 3)
    self.assertEqual(numpy_optimizer._cc_counts.shape[1], self.num_classes + 3)
    self.assertEqual(numpy_optimizer._cw_counts.shape[0], self.num_classes + 3)
    self.assertEqual(numpy_optimizer._cw_counts.shape[1], num_words)
    self.assertEqual(numpy_optimizer._wc_counts.shape[0], num_words)
    self.assertEqual(numpy_optimizer._wc_counts.shape[1], self.num_classes + 3)

def main():
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set', metavar='FILE', type=TextFileType('r'), nargs='+',
        required=True,
        help='text or .gz files containing training data (one sentence per '
             'line)')
    argument_group.add_argument(
        '--vocabulary', metavar='FILE', type=TextFileType('r'), default=None,
        help='text or .gz file containing a list of words to include in class '
             'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format', metavar='FORMAT', type=str, default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
             '"classes" (word and class ID per line), "srilm-classes" (class '
             'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format', metavar='FORMAT', type=str, default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
             'per line), "srilm-classes" (default; class name, membership '
             'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency', metavar='N', type=int, default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes', metavar='N', type=int, default=2000,
        help='number of classes to form, if vocabulary is not specified '
             '(default 2000)')
    argument_group.add_argument(
        '--method', metavar='NAME', type=str, default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
             '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file', metavar='FILE', type=str, default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level', metavar='LEVEL', type=str, default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
             '(default "info")')
    argument_group.add_argument(
        '--log-interval', metavar='N', type=int, default=1000,
        help='print statistics after every Nth word; quiet if less than one '
             '(default 1000)')

    args = parser.parse_args()

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.vocabulary is None:
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        # Rewind the training files so that the statistics can be read from
        # the beginning.
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

    # Iterate until no word moves to a different class anymore.
    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info(
                    "[%d] (%.1f %%) of iteration %d -- moves = %d, "
                    "cost = %.2f, duration = %.1f ms",
                    num_words,
                    num_words / vocabulary.num_words() * 100,
                    iteration,
                    num_moves,
                    optimizer.log_likelihood(),
                    duration * 1000)  # seconds to milliseconds
            if is_scheduled(num_words, args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)
        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)

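# main() relies on two helpers that are not part of this excerpt: save(),
# which writes the current word classes to the output file in the requested
# format, and is_scheduled(), which decides whether an intermediate save is
# due. Below is a minimal sketch of the assumed scheduling logic, firing
# `frequency` times per full pass over the vocabulary; the actual
# implementation may differ.
def is_scheduled(num_words, frequency, vocabulary_size):
    # Fire whenever the counter crosses one of `frequency` evenly spaced
    # checkpoints within the vocabulary. Summed over a full pass, the
    # floor-division terms telescope, so this triggers exactly `frequency`
    # times per iteration.
    current = num_words * frequency // vocabulary_size
    previous = (num_words - 1) * frequency // vocabulary_size
    return current > previous

if __name__ == '__main__':
    main()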