Beispiel #1
0
    def test_score_sequence(self):
        """Exercise ``TextScorer.score_sequence()`` under three <unk> policies.

        Every expected value is the sum of ``log(word_id / 5)`` over the
        positions that are supposed to contribute to the sequence score.
        """
        # Default scorer: <unk> is scored by the network like any other word,
        # so every position after the first one contributes.
        scorer = TextScorer(self.dummy_network)
        ids = numpy.arange(6)
        classes = numpy.arange(6)
        ones = numpy.ones(6, dtype='float32')
        actual = scorer.score_sequence(ids, classes, ones)
        expected = numpy.log(ids[1:].astype('float32') / 5).sum()
        self.assertAlmostEqual(actual, expected, places=5)

        # ignore_unk=True: the <unk> at position 3 is dropped from the sum.
        scorer = TextScorer(self.dummy_network, ignore_unk=True)
        ids = numpy.arange(6)
        ids[3] = self.vocabulary.word_to_id['<unk>']
        classes = numpy.arange(6)
        ones = numpy.ones(6, dtype='float32')
        actual = scorer.score_sequence(ids, classes, ones)
        known = [1, 2, 4, 5]
        expected = numpy.log(ids[known].astype('float32') / 5).sum()
        self.assertAlmostEqual(actual, expected, places=5)

        # unk_penalty=-5: the <unk> at position 3 contributes exactly -5.
        scorer = TextScorer(self.dummy_network, ignore_unk=False, unk_penalty=-5)
        ids = numpy.arange(6)
        ids[3] = self.vocabulary.word_to_id['<unk>']
        classes = numpy.arange(6)
        ones = numpy.ones(6, dtype='float32')
        actual = scorer.score_sequence(ids, classes, ones)
        known = [1, 2, 4, 5]
        expected = numpy.log(ids[known].astype('float32') / 5).sum() - 5
        self.assertAlmostEqual(actual, expected, places=5)
Beispiel #2
0
def score(args):
    """Run the "theanolm score" command.

    Sets up logging and Theano according to the command line, loads the
    network from ``args.model_path``, builds a ``TextScorer`` and produces the
    kind of output selected by ``args.output``. Calls ``sys.exit(1)`` if the
    logging level or the output format is not recognized.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """

    log_destination = args.log_file
    level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)

    # "-" sends the log to standard output; anything else is a file name.
    logging_options = {
        'format': '%(asctime)s %(funcName)s: %(message)s',
        'level': level,
    }
    if log_destination == '-':
        logging_options['stream'] = sys.stdout
    else:
        logging_options['filename'] = log_destination
    logging.basicConfig(**logging_options)

    if not args.debug:
        theano.config.compute_test_value = 'off'
    else:
        theano.config.compute_test_value = 'warn'
        logging.info("Enabled computing test values for tensor variables.")
        logging.warning("GpuArray backend will fail random number generation!")
    profiling = args.profile
    theano.config.profile = profiling
    theano.config.profile_memory = profiling

    device = get_default_device(args.default_device)
    network = Network.from_file(args.model_path,
                                exclude_unk=args.exclude_unk,
                                default_device=device)

    logging.info("Building text scorer.")
    scorer = TextScorer(network, args.shortlist, args.exclude_unk,
                        args.profile)

    logging.info("Scoring text.")
    output_format = args.output
    if output_format == 'utterance-scores':
        _score_utterances(args.input_file, network.vocabulary, scorer,
                          args.output_file, args.log_base)
    elif output_format in ('perplexity', 'word-scores'):
        # The last argument is True only for the 'word-scores' format.
        _score_text(args.input_file, network.vocabulary, scorer,
                    args.output_file, args.log_base, args.subwords,
                    output_format == 'word-scores')
    else:
        print("Invalid output format requested:", args.output)
        sys.exit(1)
    def test_score_sequence(self):
        """Verify sequence scoring for the three ways of treating <unk>.

        The reference value of each case is the sum of ``log(word_id / 5)``
        over the positions that should be included in the score.
        """

        def make_inputs(unk_at=None):
            # Word IDs 0..5, optionally with <unk> written at one position,
            # plus matching class IDs and unit membership probabilities.
            word_ids = numpy.arange(6)
            if unk_at is not None:
                word_ids[unk_at] = self.vocabulary.word_to_id['<unk>']
            return word_ids, numpy.arange(6), numpy.ones(6, dtype='float32')

        def reference(word_ids, positions):
            # Sum of log(id / 5) over the selected positions.
            probs = word_ids[positions].astype('float32') / 5
            return numpy.log(probs).sum()

        # The network predicts the <unk> probability itself.
        scorer = TextScorer(self.dummy_network)
        word_ids, class_ids, membership_probs = make_inputs()
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        self.assertAlmostEqual(logprob,
                               reference(word_ids, slice(1, None)),
                               places=5)

        # <unk> is left out of the resulting log probability.
        scorer = TextScorer(self.dummy_network, ignore_unk=True)
        word_ids, class_ids, membership_probs = make_inputs(unk_at=3)
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        self.assertAlmostEqual(logprob,
                               reference(word_ids, [1, 2, 4, 5]),
                               places=5)

        # <unk> is given a constant log probability of -5.
        scorer = TextScorer(self.dummy_network, ignore_unk=False, unk_penalty=-5)
        word_ids, class_ids, membership_probs = make_inputs(unk_at=3)
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        self.assertAlmostEqual(logprob,
                               reference(word_ids, [1, 2, 4, 5]) - 5,
                               places=5)
Beispiel #4
0
    def test_score_batch(self):
        """Exercise ``TextScorer.score_batch()`` under three <unk> policies.

        The inputs are 3x2 arrays; entry ``i`` of the result is compared
        against ``log(word_id / 5)`` computed from column ``i`` of the word
        IDs.
        """
        # Default scorer: both columns are scored from the second row on.
        scorer = TextScorer(self.dummy_network)
        ids = numpy.arange(6).reshape((3, 2))
        classes = numpy.arange(6).reshape((3, 2))
        ones = numpy.ones_like(ids).astype('float32')
        mask = numpy.ones_like(ids)
        result = scorer.score_batch(ids, classes, ones, mask)
        for column in (0, 1):
            expected = numpy.log(ids[1:, column].astype('float32') / 5)
            assert_almost_equal(result[column], expected)

        # ignore_unk=True: the <unk> at row 1 of column 1 is dropped, so that
        # column is only compared from row 2 on.
        scorer = TextScorer(self.dummy_network, ignore_unk=True)
        ids = numpy.arange(6).reshape((3, 2))
        ids[1, 1] = self.vocabulary.word_to_id['<unk>']
        classes = numpy.arange(6).reshape((3, 2))
        ones = numpy.ones_like(ids).astype('float32')
        mask = numpy.ones_like(ids)
        result = scorer.score_batch(ids, classes, ones, mask)
        for column, first_row in enumerate((1, 2)):
            expected = numpy.log(ids[first_row:, column].astype('float32') / 5)
            assert_almost_equal(result[column], expected)

        # unk_penalty=-5: the <unk> position is scored as exactly -5.
        scorer = TextScorer(self.dummy_network,
                            ignore_unk=False,
                            unk_penalty=-5)
        ids = numpy.arange(6).reshape((3, 2))
        ids[1, 1] = self.vocabulary.word_to_id['<unk>']
        classes = numpy.arange(6).reshape((3, 2))
        ones = numpy.ones_like(ids).astype('float32')
        mask = numpy.ones_like(ids)
        result = scorer.score_batch(ids, classes, ones, mask)
        expected = numpy.log(ids[1:, 0].astype('float32') / 5)
        assert_almost_equal(result[0], expected)
        second_column = result[1]
        assert_almost_equal(second_column[0], -5)
        expected = numpy.log(ids[2, 1].astype('float32') / 5)
        assert_almost_equal(second_column[1], expected)
    def test_score_sequence(self):
        """Exercise ``TextScorer.score_sequence()`` with and without shortlist.

        Word IDs 0..14 are scored. The reference probability of a word is its
        ID divided by 100, with the last positions adjusted per case as
        spelled out below.
        """
        # No shortlist: the last two positions are both expected to score
        # 12 / 100.
        scorer = TextScorer(self.dummy_network, use_shortlist=False)
        ids = numpy.arange(15)
        classes, _unused = self.vocabulary.get_class_memberships(ids)
        ones = numpy.ones_like(ids).astype('float32')
        actual = scorer.score_sequence(ids, classes, ones)
        probs = ids[1:].astype('float32') / 100.0
        probs[12:14] = 12.0 / 100.0
        self.assertAlmostEqual(actual, numpy.log(probs).sum(), places=4)

        # Shortlist: the <unk> probability is distributed among the
        # out-of-shortlist words according to word frequency (0.3 and 0.7
        # here).
        scorer = TextScorer(self.dummy_network, use_shortlist=True)
        ids = numpy.arange(15)
        classes, _unused = self.vocabulary.get_class_memberships(ids)
        ones = numpy.ones_like(ids).astype('float32')
        actual = scorer.score_sequence(ids, classes, ones)
        probs = ids[1:].astype('float32') / 100.0
        probs[11] = 1.0  # <unk> is ignored
        probs[12] = 12.0 / 100.0 * 0.3
        probs[13] = 12.0 / 100.0 * 0.7
        self.assertAlmostEqual(actual, numpy.log(probs).sum(), places=5)

        # exclude_unk=True: OOV and OOS words are left out, so only the
        # words with IDs 1..11 contribute.
        scorer = TextScorer(self.dummy_network,
                            use_shortlist=False,
                            exclude_unk=True)
        ids = numpy.arange(15)
        classes, _unused = self.vocabulary.get_class_memberships(ids)
        ones = numpy.ones_like(ids).astype('float32')
        actual = scorer.score_sequence(ids, classes, ones)
        probs = ids[1:12].astype('float32') / 100.0
        self.assertAlmostEqual(actual, numpy.log(probs).sum(), places=5)
Beispiel #6
0
def score(args):
    """Score text with a model stored in an HDF5 state file.

    Restores the vocabulary, the architecture and the network parameters from
    ``args.model_path``, builds a ``TextScorer`` and produces the kind of
    output selected by ``args.output`` ('perplexity', 'word-scores' or
    'utterance-scores'). Calls ``sys.exit(1)`` if the output format is not
    one of those.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """

    # Reject an unknown output format before the model is loaded, instead of
    # loading everything and then silently producing no output.
    if args.output not in ('perplexity', 'word-scores', 'utterance-scores'):
        print("Invalid output format requested:", args.output)
        sys.exit(1)

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    print("Building text scorer.")
    sys.stdout.flush()
    # Translate the single --unk-penalty option into the two scorer settings:
    # no value leaves both off, zero means ignore <unk>, and any other value
    # is passed on as the penalty.
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    scorer = TextScorer(network, ignore_unk, unk_penalty)

    print("Scoring text.")
    if args.output == 'perplexity':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, False)
    elif args.output == 'word-scores':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, True)
    elif args.output == 'utterance-scores':
        _score_utterances(args.input_file, vocabulary, scorer,
                          args.output_file, args.log_base)
    def test_score_batch(self):
        """Verify batch scoring for the three ways of treating <unk>.

        The inputs are 3x2 arrays; each entry of the result is checked
        against ``log(word_id / 5)`` taken from the matching column of the
        word IDs.
        """

        def make_inputs(with_unk):
            # 3x2 word IDs (optionally with <unk> at row 1, column 1), class
            # IDs, unit membership probabilities and an all-ones mask.
            word_ids = numpy.arange(6).reshape((3, 2))
            if with_unk:
                word_ids[1, 1] = self.vocabulary.word_to_id['<unk>']
            class_ids = numpy.arange(6).reshape((3, 2))
            membership_probs = numpy.ones_like(word_ids).astype('float32')
            return word_ids, class_ids, membership_probs, \
                numpy.ones_like(word_ids)

        def reference(word_ids, rows, column):
            # log(id / 5) for the selected rows of one column.
            return numpy.log(word_ids[rows, column].astype('float32') / 5)

        # The network predicts the <unk> probability itself.
        scorer = TextScorer(self.dummy_network)
        word_ids, class_ids, membership_probs, mask = make_inputs(False)
        logprobs = scorer.score_batch(word_ids, class_ids, membership_probs,
                                      mask)
        assert_almost_equal(logprobs[0], reference(word_ids, slice(1, None), 0))
        assert_almost_equal(logprobs[1], reference(word_ids, slice(1, None), 1))

        # <unk> is left out of the resulting log probabilities.
        scorer = TextScorer(self.dummy_network, ignore_unk=True)
        word_ids, class_ids, membership_probs, mask = make_inputs(True)
        logprobs = scorer.score_batch(word_ids, class_ids, membership_probs,
                                      mask)
        assert_almost_equal(logprobs[0], reference(word_ids, slice(1, None), 0))
        assert_almost_equal(logprobs[1], reference(word_ids, slice(2, None), 1))

        # <unk> is given a constant log probability of -5.
        scorer = TextScorer(self.dummy_network, ignore_unk=False, unk_penalty=-5)
        word_ids, class_ids, membership_probs, mask = make_inputs(True)
        logprobs = scorer.score_batch(word_ids, class_ids, membership_probs,
                                      mask)
        assert_almost_equal(logprobs[0], reference(word_ids, slice(1, None), 0))
        assert_almost_equal(logprobs[1][0], -5)
        assert_almost_equal(logprobs[1][1], reference(word_ids, 2, 1))
Beispiel #8
0
def train(args):
    """Perform the "theanolm train" command.

    Configures logging and Theano, reads the vocabulary, creates the trainer,
    the network and the optimizer, optionally sets up cross-validation, and
    runs the training. If a validation file was given and the state contains
    a trained model afterwards, the validation set perplexity is printed.

    Calls ``sys.exit(1)`` when the logging level is invalid, when more noise
    samples than word classes are requested, or when more weights or sampling
    coefficients than training files are given.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """

    numpy.random.seed(args.random_seed)

    # Resolve the textual log level (e.g. "debug") to the numeric constant of
    # the logging module; anything that is not such a constant is rejected.
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    # "-" sends the log to standard output; anything else is a file name.
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
        print("Enabled computing test values for tensor variables.")
        print("Warning: GpuArray backend will fail random number generation!")
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    # NOTE(review): the 'core' driver presumably keeps the file image in
    # memory until it is flushed or closed - confirm against the h5py docs.
    with h5py.File(args.model_path, 'a', driver='core') as state:
        vocabulary = _read_vocabulary(args, state)

        if args.num_noise_samples > vocabulary.num_classes():
            print("Number of noise samples ({}) is larger than the number of "
                  "classes. This doesn't make sense and would cause sampling "
                  "to fail.".format(args.num_noise_samples))
            sys.exit(1)

        # One weight per training file. Files without an explicit weight keep
        # the default of 1.
        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        training_options = {
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'cost_function': args.cost,
            'num_noise_samples': args.num_noise_samples,
            'noise_sharing': args.noise_sharing,
            'exclude_unk': args.exclude_unk
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            # Lists are logged element by element inside brackets.
            if isinstance(option_value, list):
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)

        print("Creating trainer.")
        sys.stdout.flush()
        trainer = Trainer(training_options, vocabulary, args.training_set,
                          args.sampling)
        trainer.set_logging(args.log_interval)

        print("Building neural network.")
        sys.stdout.flush()
        # 'lstm300' and 'lstm1500' are loaded with Architecture.from_package();
        # any other value is treated as the path of a description file.
        if args.architecture == 'lstm300' or args.architecture == 'lstm1500':
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)

        network = Network(architecture,
                          vocabulary,
                          trainer.class_prior_probs,
                          args.noise_dampening,
                          default_device=args.default_device,
                          profile=args.profile)

        print("Compiling optimization function.")
        sys.stdout.flush()
        optimizer = create_optimizer(optimization_options,
                                     network,
                                     profile=args.profile)

        if args.print_graph:
            print("Cost function computation graph:")
            theano.printing.debugprint(optimizer.gradient_update_function)

        # NOTE(review): presumably this also restores a previous training
        # state from `state` when one exists - confirm in Trainer.initialize().
        trainer.initialize(network, state, optimizer)
        # XXX Write the model instantly back to disk. Just adds word unigram
        # counts. This is a temporary hack. Remove at some point.
        trainer.get_state(state)
        state.flush()
        # XXX

        if args.validation_file is not None:
            print("Building text scorer for cross-validation.")
            sys.stdout.flush()
            scorer = TextScorer(network,
                                use_shortlist=True,
                                exclude_unk=args.exclude_unk,
                                profile=args.profile)
            print("Validation text:", args.validation_file.name)
            # The validation text is memory-mapped read-only and handed to the
            # batch iterator. The mapping is never closed explicitly here.
            validation_mmap = mmap.mmap(args.validation_file.fileno(),
                                        0,
                                        prot=mmap.PROT_READ)
            validation_iter = \
                LinearBatchIterator(validation_mmap,
                                    vocabulary,
                                    batch_size=args.batch_size,
                                    max_sequence_length=args.sequence_length,
                                    map_oos_to_unk=False)
            trainer.set_validation(validation_iter, scorer)
        else:
            print("Cross-validation will not be performed.")
            validation_iter = None

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        # A state without 'layers' is reported as an untrained model.
        # Otherwise the network is reloaded from the state before the final
        # perplexity is computed; `scorer` is only defined when a validation
        # file was given, which the `validation_iter` check guarantees.
        if 'layers' not in state.keys():
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        elif validation_iter is not None:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Beispiel #9
0
def train(args):
    """Perform the "theanolm train" command.

    Configures logging and Theano, obtains the vocabulary (from an existing
    model state, from the training set, or from a vocabulary file), creates
    the trainer, the network and the optimizer, optionally sets up
    cross-validation, and runs the training. If a validation file was given
    and the state contains a trained model afterwards, the validation set
    perplexity is printed.

    Calls ``sys.exit(1)`` when the logging level is invalid, when more noise
    samples than word classes are requested, or when more weights or sampling
    coefficients than training files are given.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """
    numpy.random.seed(args.random_seed)

    # Resolve the textual log level (e.g. "debug") to the numeric constant of
    # the logging module; anything that is not such a constant is rejected.
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = "%(asctime)s %(funcName)s: %(message)s"
    # "-" sends the log to standard output; anything else is a file name.
    if args.log_file == "-":
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.debug:
        theano.config.compute_test_value = "warn"
        print("Enabled computing test values for tensor variables.")
        print("Warning: GpuArray backend will fail random number generation!")
    else:
        theano.config.compute_test_value = "off"
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, "a", driver="core") as state:
        # A non-empty state file already contains a vocabulary. Otherwise the
        # vocabulary is built from the corpus or read from a file, and is
        # written into the state.
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set, args.num_classes)
            # Rewind the training files so that they can be read again.
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, "rt", encoding="utf-8") as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file, args.vocabulary_format)
                if args.vocabulary_format == "classes":
                    print("Computing class membership probabilities from unigram word counts.")
                    sys.stdout.flush()
                    vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        if args.num_noise_samples > vocabulary.num_classes():
            print(
                "Number of noise samples ({}) is larger than the number of "
                "classes. This doesn't make sense and would cause sampling "
                "to fail.".format(args.num_noise_samples)
            )
            sys.exit(1)

        # Translate the single --unk-penalty option into the two settings
        # used below: no value leaves both off, zero means ignore <unk>, and
        # any other value is passed on as the penalty.
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        # One weight per training file. Files without an explicit weight keep
        # the default of 1.
        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        training_options = {
            "batch_size": args.batch_size,
            "sequence_length": args.sequence_length,
            "validation_frequency": args.validation_frequency,
            "patience": args.patience,
            "stopping_criterion": args.stopping_criterion,
            "max_epochs": args.max_epochs,
            "min_epochs": args.min_epochs,
            "max_annealing_count": args.max_annealing_count,
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        optimization_options = {
            "method": args.optimization_method,
            "epsilon": args.numerical_stability_term,
            "gradient_decay_rate": args.gradient_decay_rate,
            "sqr_gradient_decay_rate": args.sqr_gradient_decay_rate,
            "learning_rate": args.learning_rate,
            "weights": weights,
            "momentum": args.momentum,
            "max_gradient_norm": args.gradient_normalization,
            "cost_function": args.cost,
            "num_noise_samples": args.num_noise_samples,
            "noise_sharing": args.noise_sharing,
            "ignore_unk": ignore_unk,
            "unk_penalty": unk_penalty,
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            # Lists are logged element by element inside brackets.
            if isinstance(option_value, list):
                value_str = ", ".join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training files.")
            sys.exit(1)

        print("Creating trainer.")
        sys.stdout.flush()
        trainer = Trainer(training_options, vocabulary, args.training_set, args.sampling)
        trainer.set_logging(args.log_interval)

        print("Building neural network.")
        sys.stdout.flush()
        # 'lstm300' and 'lstm1500' are loaded with Architecture.from_package();
        # any other value is treated as the path of a description file.
        if args.architecture == "lstm300" or args.architecture == "lstm1500":
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, "rt", encoding="utf-8") as arch_file:
                architecture = Architecture.from_description(arch_file)

        network = Network(
            architecture,
            vocabulary,
            trainer.class_prior_probs,
            args.noise_dampening,
            default_device=args.default_device,
            profile=args.profile,
        )

        print("Compiling optimization function.")
        sys.stdout.flush()
        optimizer = create_optimizer(
            optimization_options,
            network,
            device=args.default_device,
            profile=args.profile,
        )

        if args.print_graph:
            print("Cost function computation graph:")
            theano.printing.debugprint(optimizer.gradient_update_function)

        trainer.initialize(network, state, optimizer)

        if args.validation_file is not None:
            print("Building text scorer for cross-validation.")
            sys.stdout.flush()
            scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)
            print("Validation text:", args.validation_file.name)
            # The validation text is memory-mapped read-only and handed to the
            # batch iterator. The mapping is never closed explicitly here.
            validation_mmap = mmap.mmap(args.validation_file.fileno(), 0, prot=mmap.PROT_READ)
            validation_iter = LinearBatchIterator(
                validation_mmap, vocabulary, batch_size=args.batch_size, max_sequence_length=None
            )
            trainer.set_validation(validation_iter, scorer)
        else:
            print("Cross-validation will not be performed.")
            validation_iter = None

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        # A state without 'layers' is reported as an untrained model.
        # Otherwise the network is reloaded from the state before the final
        # perplexity is computed; `scorer` is only defined when a validation
        # file was given, which the `validation_iter` check guarantees.
        if "layers" not in state.keys():
            print(
                "The model has not been trained. No cross-validations were "
                "performed or training did not improve the model."
            )
        elif validation_iter is not None:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Beispiel #10
0
def train(args):
    """Perform the "theanolm train" command.

    Configures logging and Theano, obtains the vocabulary (from an existing
    model state, from the training set, or from a vocabulary file), builds
    the network, the text scorer, the validation iterator and the trainer,
    and runs the training. If the state contains a trained model afterwards,
    the validation set perplexity is printed.

    Calls ``sys.exit(1)`` when the logging level is invalid or when more
    weights or sampling coefficients than training files are given.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """
    numpy.random.seed(args.random_seed)

    # Resolve the textual log level (e.g. "debug") to the numeric constant of
    # the logging module; anything that is not such a constant is rejected.
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    # "-" sends the log to standard output; anything else is a file name.
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        # A non-empty state file already contains a vocabulary. Otherwise the
        # vocabulary is built from the corpus or read from a file, and is
        # written into the state.
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            # Rewind the training files so that they can be read again.
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
                if args.vocabulary_format == 'classes':
                    print("Computing class membership probabilities from "
                          "unigram word counts.")
                    sys.stdout.flush()
                    vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        # 'lstm300' and 'lstm1500' are loaded with Architecture.from_package();
        # any other value is treated as the path of a description file.
        if args.architecture == 'lstm300' or args.architecture == 'lstm1500':
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)

        sys.stdout.flush()
        # Translate the single --unk-penalty option into the two settings
        # used below: no value leaves both off, zero means ignore <unk>, and
        # any other value is passed on as the penalty.
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        # One weight per training file. Files without an explicit weight keep
        # the default of 1.
        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

        # NOTE(review): args.validation_file is dereferenced unconditionally,
        # so a validation file appears to be mandatory in this version -
        # confirm against the argument parser.
        # The validation text is memory-mapped read-only and handed to the
        # batch iterator. The mapping is never closed explicitly here.
        validation_mmap = mmap.mmap(args.validation_file.fileno(),
                                    0,
                                    prot=mmap.PROT_READ)
        validation_iter = \
            LinearBatchIterator(validation_mmap,
                                vocabulary,
                                batch_size=args.batch_size,
                                max_sequence_length=None)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'cost_function': args.cost,
            'num_noise_samples': args.num_noise_samples,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            # Lists are logged element by element inside brackets.
            if isinstance(option_value, list):
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)
        trainer = create_trainer(
            training_options, optimization_options,
            network, vocabulary, scorer,
            args.training_set, args.sampling, validation_iter,
            state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        # A state without 'layers' is reported as an untrained model.
        # Otherwise the network is reloaded from the state before the final
        # perplexity is computed.
        if 'layers' not in state.keys():
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Beispiel #11
0
def train(args):
    """Train a neural network language model from command line arguments.

    Configures logging and Theano, reads or constructs the vocabulary, builds
    the network, the text scorer and the trainer, and runs the training. When
    the model state contains something afterwards, the state is loaded back
    into the network and the validation set perplexity is printed.

    The process is terminated with ``sys.exit(1)`` if more weights or sampling
    coefficients than training files were given.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments

    :raises ValueError: if ``args.log_level`` does not name a logging level
    """

    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    theano.config.compute_test_value = 'warn' if args.debug else 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    # Validate the per-file arguments before the model file is opened. The
    # file is opened in append mode, so exiting inside the ``with`` block
    # below would leave a model file that contains only the vocabulary.
    num_training_files = len(args.training_set)
    if len(args.weights) > num_training_files:
        print("You specified more weights than training files.")
        sys.exit(1)
    if len(args.sampling) > num_training_files:
        print("You specified more sampling coefficients than training "
              "files.")
        sys.exit(1)

    # Training files without an explicit weight get the weight 1.
    weights = numpy.ones(num_training_files).astype(theano.config.floatX)
    for index, weight in enumerate(args.weights):
        weights[index] = weight

    with h5py.File(args.model_path, 'a', driver='core') as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            # Rewind the training files so that they can be read again.
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
                if args.vocabulary_format == 'classes':
                    print("Computing class membership probabilities from "
                          "unigram word counts.")
                    sys.stdout.flush()
                    vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture in ('lstm300', 'lstm1500'):
            # These names refer to architectures shipped with the package.
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)

        sys.stdout.flush()
        # No penalty given: score <unk> normally. Zero penalty: ignore <unk>.
        # Any other value: pass the value on as the <unk> penalty.
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

        validation_mmap = mmap.mmap(args.validation_file.fileno(),
                                    0,
                                    prot=mmap.PROT_READ)
        validation_iter = LinearBatchIterator(validation_mmap,
                                              vocabulary,
                                              batch_size=32)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if isinstance(option_value, list):
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        trainer = create_trainer(
            training_options, optimization_options,
            network, vocabulary, scorer,
            args.training_set, args.sampling, validation_iter,
            state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.run()

        if not state.keys():
            print("The model has not been trained.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Beispiel #12
0
def train(args):
    """A function that performs the "theanolm train" command.

    Configures logging and Theano, reads the vocabulary, and builds the
    trainer, the network, the cost function and the optimizer before running
    the training. If a validation file was given, a text scorer is built for
    cross-validation and the validation set perplexity is printed after
    training, provided that the model state contains trained layers.

    The process is terminated with ``sys.exit(1)`` if the logging level is
    invalid, if there are more noise samples than classes, if more weights or
    sampling coefficients than training files were given, or if the optimizer
    cannot be created because some parameters are disconnected from the
    network output.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """

    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
        logging.info("Enabled computing test values for tensor variables.")
        logging.warning("GpuArray backend will fail random number generation!")
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        vocabulary = _read_vocabulary(args, state)

        if args.num_noise_samples > vocabulary.num_classes():
            print("Number of noise samples ({}) is larger than the number of "
                  "classes. This doesn't make sense and would cause unigram "
                  "sampling to fail.".format(args.num_noise_samples))
            sys.exit(1)

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        # Training files without an explicit weight get the weight 1.
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight
        if len(args.sampling) > num_training_files:
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)

        training_options = {
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'num_noise_samples': args.num_noise_samples,
            'noise_sharing': args.noise_sharing,
        }

        log_options(training_options, optimization_options, args)

        logging.info("Creating trainer.")
        trainer = Trainer(training_options, vocabulary, args.training_set,
                          args.sampling)
        trainer.set_logging(args.log_interval)

        logging.info("Building neural network.")
        if args.architecture in ('lstm300', 'lstm1500'):
            # These names refer to architectures shipped with the package.
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)

        default_device = get_default_device(args.default_device)
        network = Network(architecture,
                          vocabulary,
                          trainer.class_prior_probs,
                          default_device=default_device,
                          profile=args.profile)

        network.set_sampling(args.noise_distribution, args.noise_dampening,
                             args.noise_sharing)

        logging.info("Building optimizer.")
        exclude_id = (vocabulary.word_to_id['<unk>'] if args.exclude_unk
                      else None)
        epsilon = args.numerical_stability_term
        if args.cost == 'cross-entropy':
            cost_function = CrossEntropyCost(network, exclude_id,
                                             args.l1_regularization,
                                             args.l2_regularization, epsilon)
        elif args.cost == 'nce':
            cost_function = NCECost(network, exclude_id,
                                    args.l1_regularization,
                                    args.l2_regularization, epsilon)
        else:
            assert args.cost == 'blackout'
            cost_function = BlackoutCost(network, exclude_id,
                                         args.l1_regularization,
                                         args.l2_regularization, epsilon)
        try:
            optimizer = create_optimizer(optimization_options,
                                         network,
                                         cost_function,
                                         profile=args.profile)
        except theano.gradient.DisconnectedInputError as e:
            print("Cannot train the neural network because some of the "
                  "parameters are disconnected from the output. Make sure all "
                  "the layers are correctly connected in the network "
                  "architecture. The error message was: `{}´".format(e))
            # Without an optimizer nothing below can work. Falling through
            # would only replace this message with a NameError on the unbound
            # ``optimizer`` variable.
            sys.exit(1)

        if args.print_graph:
            print("Cost function computation graph:")
            theano.printing.debugprint(optimizer.gradient_update_function)

        trainer.initialize(network, state, optimizer, args.load_and_train)

        if args.validation_file is not None:
            logging.info("Building text scorer for cross-validation.")
            scorer = TextScorer(network,
                                use_shortlist=True,
                                exclude_unk=args.exclude_unk,
                                profile=args.profile)
            logging.info("Validation text: %s", args.validation_file.name)
            validation_mmap = mmap.mmap(args.validation_file.fileno(),
                                        0,
                                        prot=mmap.PROT_READ)
            validation_iter = \
                LinearBatchIterator(validation_mmap,
                                    vocabulary,
                                    batch_size=args.batch_size,
                                    max_sequence_length=args.sequence_length,
                                    map_oos_to_unk=False)
            trainer.set_validation(validation_iter, scorer)
        else:
            logging.info("Cross-validation will not be performed.")
            validation_iter = None

        logging.info("Training neural network.")
        trainer.train()

        if 'layers' not in state.keys():
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        elif validation_iter is not None:
            # ``scorer`` is only bound when a validation file was given, which
            # is exactly when ``validation_iter`` is not None.
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Beispiel #13
0
    def test_score_batch(self):
        """Test ``TextScorer.score_batch()`` with three scorer configurations:
        without the shortlist, with the shortlist, and with ``exclude_unk``.

        Every scenario scores the same batch: the integers 0..14 arranged as
        three sequences (columns) of five word IDs each, with a mask of ones.
        """

        # Network predicts <unk> probability. Out-of-shortlist words are mapped
        # to the <unk> class, so they get the same log probability as <unk>.
        scorer = TextScorer(self.dummy_network, use_shortlist=False)
        # Transposing makes each column one sequence: 0..4, 5..9 and 10..14.
        word_ids = numpy.arange(15).reshape((3, 5)).T
        class_ids, _ = self.vocabulary.get_class_memberships(word_ids)
        membership_probs = numpy.ones_like(word_ids).astype('float32')
        mask = numpy.ones_like(word_ids)
        logprobs = scorer.score_batch(word_ids, class_ids, membership_probs,
                                      mask)
        # The expected values skip the first word of each sequence.
        assert_almost_equal(
            logprobs[0], numpy.log(word_ids[1:, 0].astype('float32') / 100.0))
        assert_almost_equal(
            logprobs[1], numpy.log(word_ids[1:, 1].astype('float32') / 100.0))
        self.assertAlmostEqual(logprobs[2][0],
                               numpy.log(11.0 / 100.0),
                               places=5)  # </s>
        self.assertAlmostEqual(logprobs[2][1],
                               numpy.log(12.0 / 100.0),
                               places=5)  # <unk>
        self.assertAlmostEqual(logprobs[2][2],
                               numpy.log(12.0 / 100.0),
                               places=5)
        self.assertAlmostEqual(logprobs[2][3],
                               numpy.log(12.0 / 100.0),
                               places=5)

        # Network predicts <unk> probability. This is distributed for
        # out-of-shortlist words according to word frequency.
        scorer = TextScorer(self.dummy_network, use_shortlist=True)
        word_ids = numpy.arange(15).reshape((3, 5)).T
        class_ids, _ = self.vocabulary.get_class_memberships(word_ids)
        membership_probs = numpy.ones_like(word_ids).astype('float32')
        mask = numpy.ones_like(word_ids)
        logprobs = scorer.score_batch(word_ids, class_ids, membership_probs,
                                      mask)
        assert_almost_equal(
            logprobs[0], numpy.log(word_ids[1:, 0].astype('float32') / 100.0))
        assert_almost_equal(
            logprobs[1], numpy.log(word_ids[1:, 1].astype('float32') / 100.0))
        self.assertAlmostEqual(logprobs[2][0],
                               numpy.log(11.0 / 100.0),
                               places=5)  # </s>
        self.assertIsNone(logprobs[2][1])  # <unk>
        # The last two words are expected to get 0.3 and 0.7 of the <unk>
        # probability.
        self.assertAlmostEqual(logprobs[2][2],
                               numpy.log(12.0 / 100.0 * 0.3),
                               places=5)
        self.assertAlmostEqual(logprobs[2][3],
                               numpy.log(12.0 / 100.0 * 0.7),
                               places=5)

        # OOV and OOS words are replaced with None.
        scorer = TextScorer(self.dummy_network,
                            use_shortlist=False,
                            exclude_unk=True)
        word_ids = numpy.arange(15).reshape((3, 5)).T
        class_ids, _ = self.vocabulary.get_class_memberships(word_ids)
        membership_probs = numpy.ones_like(word_ids).astype('float32')
        mask = numpy.ones_like(word_ids)
        logprobs = scorer.score_batch(word_ids, class_ids, membership_probs,
                                      mask)
        assert_almost_equal(
            logprobs[0], numpy.log(word_ids[1:, 0].astype('float32') / 100.0))
        assert_almost_equal(
            logprobs[1], numpy.log(word_ids[1:, 1].astype('float32') / 100.0))
        self.assertAlmostEqual(logprobs[2][0],
                               numpy.log(11.0 / 100.0),
                               places=5)  # </s>
        self.assertIsNone(logprobs[2][1])  # <unk>
        self.assertIsNone(logprobs[2][2])
        self.assertIsNone(logprobs[2][3])