Example #1
0
    def test_evaluate(self):
        numpy_optimizer = NumpyBigramOptimizer(self.statistics,
                                               self.vocabulary)
        theano_optimizer = TheanoBigramOptimizer(self.statistics,
                                                 self.vocabulary)
        word_id = numpy_optimizer.get_word_id('d')
        orig_class_id = numpy_optimizer.get_word_class(word_id)
        new_class_id = 1 if orig_class_id != 1 else 0

        orig_ll = numpy_optimizer.log_likelihood()
        self.assertTrue(
            numpy.isclose(orig_ll, theano_optimizer.log_likelihood()))

        ll_diff = numpy_optimizer._evaluate(word_id, new_class_id)
        self.assertTrue(
            numpy.isclose(ll_diff,
                          theano_optimizer._evaluate(word_id, new_class_id)))

        numpy_optimizer._move(word_id, new_class_id)
        new_ll = numpy_optimizer.log_likelihood()
        self.assertFalse(numpy.isclose(orig_ll, new_ll))
        self.assertTrue(numpy.isclose(orig_ll + ll_diff, new_ll))

        theano_optimizer._move(word_id, new_class_id)
        self.assertTrue(
            numpy.isclose(new_ll, theano_optimizer.log_likelihood()))
Example #2
0
    def test_move_and_back(self):
        numpy_optimizer = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        theano_optimizer = TheanoBigramOptimizer(self.statistics, self.vocabulary)

        orig_class_counts = numpy.copy(numpy_optimizer._class_counts)
        orig_cc_counts = numpy.copy(numpy_optimizer._cc_counts)
        orig_cw_counts = numpy.copy(numpy_optimizer._cw_counts)
        orig_wc_counts = numpy.copy(numpy_optimizer._wc_counts)

        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = numpy_optimizer.get_word_class(word_id)
        new_class_id = 3 if orig_class_id != 3 else 4
        numpy_optimizer._move(word_id, new_class_id)
        theano_optimizer._move(word_id, new_class_id)

        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
        self.assertEqual(numpy.count_nonzero(numpy_optimizer._class_counts != orig_class_counts), 2)
        self.assertEqual(numpy.sum(numpy_optimizer._class_counts), numpy.sum(orig_class_counts))
        self.assertGreater(numpy.count_nonzero(numpy_optimizer._cc_counts != orig_cc_counts), 0)
        self.assertEqual(numpy.sum(numpy_optimizer._cc_counts), numpy.sum(orig_cc_counts))
        self.assertGreater(numpy.count_nonzero(numpy_optimizer._cw_counts != orig_cw_counts), 0)
        self.assertEqual(numpy.sum(numpy_optimizer._cw_counts), numpy.sum(orig_cw_counts))
        self.assertGreater(numpy.count_nonzero(numpy_optimizer._wc_counts != orig_wc_counts), 0)
        self.assertEqual(numpy.sum(numpy_optimizer._wc_counts), numpy.sum(orig_wc_counts))

        numpy_optimizer._move(word_id, orig_class_id)
        theano_optimizer._move(word_id, orig_class_id)

        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
        self.assertTrue(numpy.array_equal(numpy_optimizer._class_counts, orig_class_counts))
        self.assertTrue(numpy.array_equal(numpy_optimizer._cc_counts, orig_cc_counts))
        self.assertTrue(numpy.array_equal(numpy_optimizer._cw_counts, orig_cw_counts))
        self.assertTrue(numpy.array_equal(numpy_optimizer._wc_counts, orig_wc_counts))
Example #3
0
    def test_move_and_back(self):
        numpy_optimizer = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        theano_optimizer = TheanoBigramOptimizer(self.statistics, self.vocabulary)

        orig_class_counts = numpy.copy(numpy_optimizer._class_counts)
        orig_cc_counts = numpy.copy(numpy_optimizer._cc_counts)
        orig_cw_counts = numpy.copy(numpy_optimizer._cw_counts)
        orig_wc_counts = numpy.copy(numpy_optimizer._wc_counts)

        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = numpy_optimizer.get_word_class(word_id)
        new_class_id = 3 if orig_class_id != 3 else 4
        numpy_optimizer._move(word_id, new_class_id)
        theano_optimizer._move(word_id, new_class_id)

        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
        self.assertEqual(numpy.count_nonzero(numpy_optimizer._class_counts != orig_class_counts), 2)
        self.assertEqual(numpy.sum(numpy_optimizer._class_counts), numpy.sum(orig_class_counts))
        self.assertGreater(numpy.count_nonzero(numpy_optimizer._cc_counts != orig_cc_counts), 0)
        self.assertEqual(numpy.sum(numpy_optimizer._cc_counts), numpy.sum(orig_cc_counts))
        self.assertGreater(numpy.count_nonzero(numpy_optimizer._cw_counts != orig_cw_counts), 0)
        self.assertEqual(numpy.sum(numpy_optimizer._cw_counts), numpy.sum(orig_cw_counts))
        self.assertGreater(numpy.count_nonzero(numpy_optimizer._wc_counts != orig_wc_counts), 0)
        self.assertEqual(numpy.sum(numpy_optimizer._wc_counts), numpy.sum(orig_wc_counts))

        numpy_optimizer._move(word_id, orig_class_id)
        theano_optimizer._move(word_id, orig_class_id)

        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
        self.assertTrue(numpy.array_equal(numpy_optimizer._class_counts, orig_class_counts))
        self.assertTrue(numpy.array_equal(numpy_optimizer._cc_counts, orig_cc_counts))
        self.assertTrue(numpy.array_equal(numpy_optimizer._cw_counts, orig_cw_counts))
        self.assertTrue(numpy.array_equal(numpy_optimizer._wc_counts, orig_wc_counts))
Example #4
0
    def test_statistics(self):
        num_words = 8
        theano_optimizer = TheanoBigramOptimizer(self.statistics,
                                                 self.vocabulary)
        numpy_optimizer = NumpyBigramOptimizer(self.statistics,
                                               self.vocabulary)
        self.assertEqual(theano_optimizer.vocabulary_size, num_words)
        self.assertEqual(numpy_optimizer.vocabulary_size, num_words)
        self.assertEqual(theano_optimizer.num_classes, self.num_classes + 3)
        self.assertEqual(numpy_optimizer.num_classes, self.num_classes + 3)
        self.assertEqual(len(theano_optimizer._word_to_class.get_value()),
                         num_words)
        self.assertEqual(len(numpy_optimizer._word_to_class), num_words)

        sos_word_id = self.vocabulary.word_to_id['<s>']
        a_word_id = self.vocabulary.word_to_id['a']
        b_word_id = self.vocabulary.word_to_id['b']
        c_word_id = self.vocabulary.word_to_id['c']
        d_word_id = self.vocabulary.word_to_id['d']
        e_word_id = self.vocabulary.word_to_id['e']
        unk_word_id = self.vocabulary.word_to_id['<unk>']
        eos_word_id = self.vocabulary.word_to_id['</s>']

        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
        self.assertEqual(len(numpy_optimizer._word_counts), num_words)
        self.assertEqual(numpy_optimizer._word_counts[sos_word_id], 11)
        self.assertEqual(numpy_optimizer._word_counts[a_word_id], 13)
        self.assertEqual(numpy_optimizer._word_counts[b_word_id], 8)
        self.assertEqual(numpy_optimizer._word_counts[c_word_id], 8)
        self.assertEqual(numpy_optimizer._word_counts[d_word_id], 11)
        self.assertEqual(numpy_optimizer._word_counts[e_word_id], 15)
        self.assertEqual(numpy_optimizer._word_counts[unk_word_id], 0)
        self.assertEqual(numpy_optimizer._word_counts[eos_word_id], 11)

        self.assertEqual(numpy_optimizer._ww_counts.shape[0], num_words)
        self.assertEqual(numpy_optimizer._ww_counts.shape[1], num_words)
        self.assertEqual(len(numpy_optimizer._class_counts),
                         self.num_classes + 3)
        self.assertEqual(numpy_optimizer._cc_counts.shape[0],
                         self.num_classes + 3)

        self.assertEqual(numpy_optimizer._cw_counts.shape[0],
                         self.num_classes + 3)
        self.assertEqual(numpy_optimizer._cw_counts.shape[1], num_words)
        self.assertEqual(numpy_optimizer._wc_counts.shape[0], num_words)
        self.assertEqual(numpy_optimizer._wc_counts.shape[1],
                         self.num_classes + 3)
Example #5
0
    def test_evaluate(self):
        numpy_optimizer = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        theano_optimizer = TheanoBigramOptimizer(self.statistics, self.vocabulary)
        word_id = numpy_optimizer.get_word_id('d')
        orig_class_id = numpy_optimizer.get_word_class(word_id)
        new_class_id = 1 if orig_class_id != 1 else 0

        orig_ll = numpy_optimizer.log_likelihood()
        self.assertTrue(numpy.isclose(orig_ll, theano_optimizer.log_likelihood()))

        ll_diff = numpy_optimizer._evaluate(word_id, new_class_id)
        self.assertTrue(numpy.isclose(ll_diff, theano_optimizer._evaluate(word_id, new_class_id)))

        numpy_optimizer._move(word_id, new_class_id)
        new_ll = numpy_optimizer.log_likelihood()
        self.assertFalse(numpy.isclose(orig_ll, new_ll))
        self.assertTrue(numpy.isclose(orig_ll + ll_diff, new_ll))

        theano_optimizer._move(word_id, new_class_id)
        self.assertTrue(numpy.isclose(new_ll, theano_optimizer.log_likelihood()))
Example #6
0
def main():
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set',
        metavar='FILE',
        type=TextFileType('r'),
        nargs='+',
        required=True,
        help='text or .gz files containing training data (one sentence per '
        'line)')
    argument_group.add_argument(
        '--vocabulary',
        metavar='FILE',
        type=TextFileType('r'),
        default=None,
        help='text or .gz file containing a list of words to include in class '
        'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format',
        metavar='FORMAT',
        type=str,
        default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
        '"classes" (word and class ID per line), "srilm-classes" (class '
        'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file',
        metavar='FILE',
        type=TextFileType('w'),
        default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format',
        metavar='FORMAT',
        type=str,
        default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
        'per line), "srilm-classes" (default; class name, membership '
        'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency',
        metavar='N',
        type=int,
        default='1',
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes',
        metavar='N',
        type=int,
        default=2000,
        help='number of classes to form, if vocabulary is not specified '
        '(default 2000)')
    argument_group.add_argument(
        '--method',
        metavar='NAME',
        type=str,
        default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
        '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file',
        metavar='FILE',
        type=str,
        default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level',
        metavar='LEVEL',
        type=str,
        default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
        '(default "info")')
    argument_group.add_argument(
        '--log-interval',
        metavar='N',
        type=int,
        default=1000,
        help='print statistics after every Nth word; quiet if less than one '
        '(default 1000)')

    args = parser.parse_args()

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.vocabulary is None:
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info(
                    "[%d] (%.1f %%) of iteration %d -- moves = %d, cost = %.2f, duration = %.1f ms",
                    num_words, num_words / vocabulary.num_words() * 100,
                    iteration, num_moves, optimizer.log_likelihood(),
                    duration * 100)
            if is_scheduled(num_words, args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)

        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)
Example #7
0
    def test_move_and_recompute(self):
        optimizer1 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = optimizer1.get_word_class(word_id)
        new_class_id = 3 if orig_class_id != 3 else 4
        optimizer1._word_to_class[word_id] = new_class_id
        counts = optimizer1._compute_class_statistics(
            optimizer1._word_counts, optimizer1._ww_counts,
            optimizer1._word_to_class)

        class_counts = numpy.zeros(optimizer1.num_classes, 'int32')
        cc_counts = numpy.zeros(
            (optimizer1.num_classes, optimizer1.num_classes), dtype='int32')
        cw_counts = numpy.zeros(
            (optimizer1.num_classes, optimizer1.vocabulary_size),
            dtype='int32')
        wc_counts = numpy.zeros(
            (optimizer1.vocabulary_size, optimizer1.num_classes),
            dtype='int32')
        for wid, cid in enumerate(optimizer1._word_to_class):
            class_counts[cid] += optimizer1._word_counts[wid]
        for left_wid, right_wid in zip(*optimizer1._ww_counts.nonzero()):
            count = optimizer1._ww_counts[left_wid, right_wid]
            left_cid = optimizer1._word_to_class[left_wid]
            right_cid = optimizer1._word_to_class[right_wid]
            cc_counts[left_cid, right_cid] += count
            cw_counts[left_cid, right_wid] += count
            wc_counts[left_wid, right_cid] += count
        self.assertTrue(numpy.array_equal(class_counts, counts[0]))
        self.assertTrue(numpy.array_equal(cc_counts, counts[1]))
        self.assertTrue(numpy.array_equal(cw_counts, counts[2]))
        self.assertTrue(numpy.array_equal(wc_counts, counts[3]))
        optimizer1._class_counts = counts[0]
        optimizer1._cc_counts = counts[1]
        optimizer1._cw_counts = counts[2]
        optimizer1._wc_counts = counts[3]

        optimizer2 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        orig_class_id = optimizer2.get_word_class(word_id)
        optimizer2._move(word_id, new_class_id)

        self.assertEqual(
            numpy.count_nonzero(
                optimizer1._class_counts != optimizer2._class_counts), 0)
        self.assertEqual(
            numpy.count_nonzero(
                optimizer1._cc_counts != optimizer2._cc_counts), 0)
        self.assertEqual(
            numpy.count_nonzero(
                optimizer1._cw_counts != optimizer2._cw_counts), 0)
        self.assertEqual(
            numpy.count_nonzero(
                optimizer1._wc_counts != optimizer2._wc_counts), 0)

        optimizer3 = TheanoBigramOptimizer(self.statistics, self.vocabulary)
        orig_class_id = optimizer3.get_word_class(word_id)
        optimizer3._move(word_id, new_class_id)

        self.assert_optimizers_equal(optimizer2, optimizer3)
Example #8
0
    def test_move_and_recompute(self):
        optimizer1 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = optimizer1.get_word_class(word_id)
        new_class_id = 3 if orig_class_id != 3 else 4
        optimizer1._word_to_class[word_id] = new_class_id
        counts = optimizer1._compute_class_statistics(optimizer1._word_counts,
                                                      optimizer1._ww_counts,
                                                      optimizer1._word_to_class)

        class_counts = numpy.zeros(optimizer1.num_classes, 'int32')
        cc_counts = numpy.zeros((optimizer1.num_classes, optimizer1.num_classes), dtype='int32')
        cw_counts = numpy.zeros((optimizer1.num_classes, optimizer1.vocabulary_size), dtype='int32')
        wc_counts = numpy.zeros((optimizer1.vocabulary_size, optimizer1.num_classes), dtype='int32')
        for wid, cid in enumerate(optimizer1._word_to_class):
            class_counts[cid] += optimizer1._word_counts[wid]
        for left_wid, right_wid in zip(*optimizer1._ww_counts.nonzero()):
            count = optimizer1._ww_counts[left_wid, right_wid]
            left_cid = optimizer1._word_to_class[left_wid]
            right_cid = optimizer1._word_to_class[right_wid]
            cc_counts[left_cid,right_cid] += count
            cw_counts[left_cid,right_wid] += count
            wc_counts[left_wid,right_cid] += count
        self.assertTrue(numpy.array_equal(class_counts, counts[0]))
        self.assertTrue(numpy.array_equal(cc_counts, counts[1]))
        self.assertTrue(numpy.array_equal(cw_counts, counts[2]))
        self.assertTrue(numpy.array_equal(wc_counts, counts[3]))
        optimizer1._class_counts = counts[0]
        optimizer1._cc_counts = counts[1]
        optimizer1._cw_counts = counts[2]
        optimizer1._wc_counts = counts[3]

        optimizer2 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        orig_class_id = optimizer2.get_word_class(word_id)
        optimizer2._move(word_id, new_class_id)

        self.assertEqual(numpy.count_nonzero(optimizer1._class_counts != optimizer2._class_counts), 0)
        self.assertEqual(numpy.count_nonzero(optimizer1._cc_counts != optimizer2._cc_counts), 0)
        self.assertEqual(numpy.count_nonzero(optimizer1._cw_counts != optimizer2._cw_counts), 0)
        self.assertEqual(numpy.count_nonzero(optimizer1._wc_counts != optimizer2._wc_counts), 0)

        optimizer3 = TheanoBigramOptimizer(self.statistics, self.vocabulary)
        orig_class_id = optimizer3.get_word_class(word_id)
        optimizer3._move(word_id, new_class_id)

        self.assert_optimizers_equal(optimizer2, optimizer3)