コード例 #1
0
    def test_evaluate(self):
        """Verify that the NumPy and Theano bigram optimizers agree.

        Both implementations must report the same total log-likelihood,
        the same likelihood delta for a proposed class move, and the
        same likelihood after the move is applied.
        """
        np_opt = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        th_opt = TheanoBigramOptimizer(self.statistics, self.vocabulary)

        word_id = np_opt.get_word_id('d')
        # Choose any class that differs from the word's current class.
        current_class = np_opt.get_word_class(word_id)
        target_class = 0 if current_class == 1 else 1

        baseline_ll = np_opt.log_likelihood()
        self.assertTrue(numpy.isclose(baseline_ll, th_opt.log_likelihood()))

        delta = np_opt._evaluate(word_id, target_class)
        self.assertTrue(
            numpy.isclose(delta, th_opt._evaluate(word_id, target_class)))

        # After the move, the likelihood must change by exactly the
        # delta that _evaluate predicted.
        np_opt._move(word_id, target_class)
        updated_ll = np_opt.log_likelihood()
        self.assertFalse(numpy.isclose(baseline_ll, updated_ll))
        self.assertTrue(numpy.isclose(baseline_ll + delta, updated_ll))

        th_opt._move(word_id, target_class)
        self.assertTrue(numpy.isclose(updated_ll, th_opt.log_likelihood()))
コード例 #2
0
    def test_evaluate(self):
        """Check NumPy and Theano optimizer agreement on class moves.

        Asserts that both backends compute matching log-likelihoods, a
        matching delta from ``_evaluate``, and that ``_move`` changes the
        likelihood by exactly that delta.
        """
        np_opt = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        th_opt = TheanoBigramOptimizer(self.statistics, self.vocabulary)

        word_id = np_opt.get_word_id('d')
        # Pick a destination class different from the current one.
        current_class = np_opt.get_word_class(word_id)
        target_class = 0 if current_class == 1 else 1

        baseline_ll = np_opt.log_likelihood()
        self.assertTrue(numpy.isclose(baseline_ll, th_opt.log_likelihood()))

        delta = np_opt._evaluate(word_id, target_class)
        self.assertTrue(
            numpy.isclose(delta, th_opt._evaluate(word_id, target_class)))

        # The realized likelihood change must equal the predicted delta.
        np_opt._move(word_id, target_class)
        updated_ll = np_opt.log_likelihood()
        self.assertFalse(numpy.isclose(baseline_ll, updated_ll))
        self.assertTrue(numpy.isclose(baseline_ll + delta, updated_ll))

        th_opt._move(word_id, target_class)
        self.assertTrue(numpy.isclose(updated_ll, th_opt.log_likelihood()))
コード例 #3
0
ファイル: wctool.py プロジェクト: coddinglxf/theanolm
def main():
    """Entry point of wctool: optimize word classes from training data.

    Parses command-line arguments, builds (or loads) a vocabulary,
    collects unigram/bigram statistics, and iteratively moves words to
    their best classes until no move improves the log-likelihood.
    Intermediate and final classes are written to the output file.
    """
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set',
        metavar='FILE',
        type=TextFileType('r'),
        nargs='+',
        required=True,
        help='text or .gz files containing training data (one sentence per '
        'line)')
    argument_group.add_argument(
        '--vocabulary',
        metavar='FILE',
        type=TextFileType('r'),
        default=None,
        help='text or .gz file containing a list of words to include in class '
        'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format',
        metavar='FORMAT',
        type=str,
        default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
        '"classes" (word and class ID per line), "srilm-classes" (class '
        'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file',
        metavar='FILE',
        type=TextFileType('w'),
        default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format',
        metavar='FORMAT',
        type=str,
        default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
        'per line), "srilm-classes" (default; class name, membership '
        'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency',
        metavar='N',
        type=int,
        # Use an int default directly instead of relying on argparse
        # parsing the string '1' through type=int.
        default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes',
        metavar='N',
        type=int,
        default=2000,
        help='number of classes to form, if vocabulary is not specified '
        '(default 2000)')
    argument_group.add_argument(
        '--method',
        metavar='NAME',
        type=str,
        default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
        '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file',
        metavar='FILE',
        type=str,
        default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level',
        metavar='LEVEL',
        type=str,
        default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
        '(default "info")')
    argument_group.add_argument(
        '--log-interval',
        metavar='N',
        type=int,
        default=1000,
        help='print statistics after every Nth word; quiet if less than one '
        '(default 1000)')

    args = parser.parse_args()

    # Configure logging before any other work so all messages are captured.
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.vocabulary is None:
        # No explicit vocabulary: derive one from the training corpus.
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        # Rewind the corpus files so the statistics pass can re-read them.
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

    # Repeat full passes over the vocabulary until an entire pass makes
    # no moves, i.e. the class assignment is a local optimum.
    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info(
                    "[%d] (%.1f %%) of iteration %d -- moves = %d, cost = %.2f, duration = %.1f ms",
                    num_words, num_words / vocabulary.num_words() * 100,
                    iteration, num_moves, optimizer.log_likelihood(),
                    # Bug fix: seconds -> milliseconds is *1000, not *100.
                    duration * 1000)
            if is_scheduled(num_words, args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)

        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)
コード例 #4
0
ファイル: wctool.py プロジェクト: kdjyss/theanolm
def main():
    """Entry point of wctool: optimize word classes from training data.

    Parses command-line arguments, builds (or loads) a vocabulary,
    collects unigram/bigram statistics, and iteratively moves words to
    their best classes until no move improves the log-likelihood.
    Intermediate and final classes are written to the output file.
    """
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set', metavar='FILE', type=TextFileType('r'),
        nargs='+', required=True,
        help='text or .gz files containing training data (one sentence per '
             'line)')
    argument_group.add_argument(
        '--vocabulary', metavar='FILE', type=TextFileType('r'), default=None,
        help='text or .gz file containing a list of words to include in class '
             'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format', metavar='FORMAT', type=str, default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
             '"classes" (word and class ID per line), "srilm-classes" (class '
             'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format', metavar='FORMAT', type=str, default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
             'per line), "srilm-classes" (default; class name, membership '
             'probability, and word per line)')
    argument_group.add_argument(
        # Use an int default directly instead of relying on argparse
        # parsing the string '1' through type=int.
        '--output-frequency', metavar='N', type=int, default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes', metavar='N', type=int, default=2000,
        help='number of classes to form, if vocabulary is not specified '
             '(default 2000)')
    argument_group.add_argument(
        '--method', metavar='NAME', type=str, default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
             '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file', metavar='FILE', type=str, default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level', metavar='LEVEL', type=str, default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
             '(default "info")')
    argument_group.add_argument(
        '--log-interval', metavar='N', type=int, default=1000,
        help='print statistics after every Nth word; quiet if less than one '
             '(default 1000)')

    args = parser.parse_args()

    # Configure logging before any other work so all messages are captured.
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.vocabulary is None:
        # No explicit vocabulary: derive one from the training corpus.
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        # Rewind the corpus files so the statistics pass can re-read them.
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

    # Repeat full passes over the vocabulary until an entire pass makes
    # no moves, i.e. the class assignment is a local optimum.
    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info("[%d] (%.1f %%) of iteration %d -- moves = %d, cost = %.2f, duration = %.1f ms",
                     num_words,
                     num_words / vocabulary.num_words() * 100,
                     iteration,
                     num_moves,
                     optimizer.log_likelihood(),
                     # Bug fix: seconds -> milliseconds is *1000, not *100.
                     duration * 1000)
            if is_scheduled(num_words,
                            args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)

        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)