Ejemplo n.º 1
0
    def __init__(self,
                 n: int,
                 vocab: Vocabulary,
                 transforms: List[Transform] = None,
                 reader: DatasetReader = None,
                 cutoff_count: int = None,
                 interpolation_lambdas: Tuple[float, ...] = None,
                 container: Type[NGramContainer] = DictNGramContainer,
                 cache: PredictionsCache = None):
        """Build an n-gram language model of maximum order ``n``.

        :param n: maximum n-gram order; one container per order 0..n is kept.
        :param vocab: token vocabulary, passed through to the base class.
        :param transforms: optional token transforms for the base class.
        :param reader: dataset reader; defaults to a non-reversed
            LanguageModelingStreamReader.
        :param cutoff_count: count threshold for pruning (None = keep all) —
            consumed elsewhere; only stored here.
        :param interpolation_lambdas: per-order mixture weights; must contain
            exactly n + 1 values. Defaults to all mass on the highest order.
        :param container: factory producing the per-order n-gram storage.
        :param cache: optional cache of previously computed predictions.
        :raises ValueError: if ``interpolation_lambdas`` has wrong length.
        """
        reader = reader or LanguageModelingStreamReader(reverse=False)
        LanguageModel.__init__(self, vocab, transforms, reader)

        # One container per order 0..n, hence n + 1 entries.
        self.n_grams = tuple(
            container() for _ in range(n + 1))  # type: Tuple[NGramContainer, ...]
        self.n = n  # type: int
        self.cutoff_count = cutoff_count  # type: int
        self.cache = cache

        # Normalize to a numpy float vector; the default distribution puts
        # weight 1.0 on the full-order model only.
        if not interpolation_lambdas:
            self.interpolation_lambdas = np.zeros(self.n + 1, dtype=np.float64)
            self.interpolation_lambdas[-1] = 1.0
        else:
            self.interpolation_lambdas = np.array(interpolation_lambdas)
        # Validate explicitly: `assert` would vanish under `python -O`.
        if len(self.interpolation_lambdas) != n + 1:
            raise ValueError(
                "interpolation_lambdas must have exactly n + 1 entries, "
                "got {}".format(len(self.interpolation_lambdas)))
Ejemplo n.º 2
0
    def __init__(self,
                 vocab: Vocabulary,
                 model: Model,
                 transforms: List[Transform] = None,
                 reader: DatasetReader = None):
        """Create a language model backed by ``model``.

        Delegates vocabulary/transform/reader setup to the base class,
        keeps a reference to the underlying model, then logs it.
        """
        LanguageModel.__init__(self, vocab, transforms, reader)
        self.model = model
        self.log_model()
Ejemplo n.º 3
0
 def test_model_from_file(self):
     """Train a model from each params set and check its predictions."""
     for params_set, vocab in zip(self.params_sets, self.vocabularies):
         config = params_set.duplicate()
         training_config = config.pop("train")
         trained = LanguageModel.from_params(config, vocab=vocab)
         trained.train(REMEMBERING_EXAMPLE, training_config)
         self._test_model_predictions(trained)
Ejemplo n.º 4
0
    def get_generator(self, model_path: str, token_vocab_path: str,
                      stress_vocab_dump_path: str) -> Generator:
        """Lazily build and memoize the generator on ``self.generator``.

        Loads the token vocabulary, loads the stress vocabulary from its
        dump (inflating and saving it first if the dump is missing), loads
        the language model with UNK/EOS excluded, and caches the resulting
        Generator so later calls are cheap.
        """
        if self.generator is not None:
            return self.generator

        assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
        vocabulary = Vocabulary.from_files(token_vocab_path)

        stress_vocabulary = StressVocabulary()
        if os.path.isfile(stress_vocab_dump_path):
            stress_vocabulary.load(stress_vocab_dump_path)
        else:
            # First run: derive the stress vocabulary and persist it.
            stress_vocabulary = inflate_stress_vocabulary(
                vocabulary, self.get_stress_predictor())
            stress_vocabulary.save(stress_vocab_dump_path)

        eos_index = vocabulary.get_token_index(END_SYMBOL)
        unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
        exclude_transform = ExcludeTransform((unk_index, eos_index))

        model = LanguageModel.load(
            model_path,
            vocabulary_dir=token_vocab_path,
            transforms=[exclude_transform])
        self.generator = Generator(model, vocabulary, stress_vocabulary,
                                   eos_index)
        return self.generator
Ejemplo n.º 5
0
 def test_from_params(self):
     """Building from the n-gram config yields an NGramLanguageModel."""
     config = Params.from_file(N_GRAM_PARAMS)
     vocab_config = config.pop("vocab")
     instances = self.model.reader.read(TRAIN_EXAMPLE)
     vocab = Vocabulary.from_params(vocab_config, instances=instances)
     built = LanguageModel.from_params(config, vocab=vocab)
     self.assertTrue(isinstance(built, NGramLanguageModel))
Ejemplo n.º 6
0
 def test_reversed_model(self):
     """Models trained with a reversed reader predict in reverse order."""
     for params_set, vocab in zip(self.params_sets, self.vocabularies):
         config = params_set.duplicate()
         training_config = config.pop('train')
         config["reader"]["reverse"] = True
         reversed_model = LanguageModel.from_params(config, vocab=vocab)
         reversed_model.train(REMEMBERING_EXAMPLE, training_config)
         self._test_model_predictions(reversed_model, reverse=True)
Ejemplo n.º 7
0
    def setUpClass(cls):
        """Prepare shared fixtures: params sets, vocabularies, sentences."""
        LanguageModel.set_seed(42)
        configs = (ENCODER_ONLY_MODEL_PARAMS,
                   ENCODER_ONLY_SAMPLED_SOFTMAX_MODEL_PARAMS)
        cls.params_sets = [Params.from_file(path) for path in configs]

        # One vocabulary per params set, built from the example corpus.
        cls.vocabularies = []
        for params_set in cls.params_sets:
            vocab_params = params_set.pop("vocabulary", default=Params({}))
            # Duplicate first so popping "reader" does not mutate params_set.
            reader_params = params_set.duplicate().pop("reader",
                                                       default=Params({}))
            dataset = DatasetReader.from_params(reader_params).read(
                REMEMBERING_EXAMPLE)
            cls.vocabularies.append(
                Vocabulary.from_params(vocab_params, instances=dataset))

        cls.train_vocabulary = Vocabulary.from_files(TRAIN_VOCAB_EXAMPLE)

        with open(REMEMBERING_EXAMPLE, "r", encoding="utf-8") as handle:
            cls.sentences = [line.strip() for line in handle]
Ejemplo n.º 8
0
    def test_save_load(self):
        """A trained neural model round-trips through save/load."""
        for params_set, vocab in zip(self.params_sets, self.vocabularies):
            with TemporaryDirectory() as serialization_dir:
                vocab_dir = os.path.join(serialization_dir, DEFAULT_VOCAB_DIR)
                os.mkdir(vocab_dir)
                vocab.save_to_files(vocab_dir)

                config = params_set.duplicate()
                training_config = config.pop('train')
                original = LanguageModel.from_params(config, vocab=vocab)
                original.train(REMEMBERING_EXAMPLE, training_config,
                               serialization_dir)

                restored = LanguageModel.load(
                    serialization_dir,
                    params_file=ENCODER_ONLY_MODEL_PARAMS,
                    vocabulary_dir=vocab_dir)

                self.assertTrue(isinstance(original, NeuralNetLanguageModel))
                self.assertTrue(isinstance(restored, NeuralNetLanguageModel))
                self._test_model_equality(
                    cast(NeuralNetLanguageModel, original),
                    cast(NeuralNetLanguageModel, restored))
Ejemplo n.º 9
0
    def test_save_load(self):
        """A trained n-gram model round-trips through save/load."""
        with TemporaryDirectory() as serialization_dir:
            config = Params.from_file(N_GRAM_PARAMS)
            vocab_config = config.pop("vocab")
            instances = self.model.reader.read(TRAIN_EXAMPLE)
            vocab = Vocabulary.from_params(vocab_config, instances=instances)

            vocab_dir = os.path.join(serialization_dir, DEFAULT_VOCAB_DIR)
            os.mkdir(vocab_dir)
            vocab.save_to_files(vocab_dir)

            original = LanguageModel.from_params(config, vocab=vocab)
            original.train(TRAIN_EXAMPLE, Params({}),
                           serialization_dir=serialization_dir)

            restored = LanguageModel.load(serialization_dir,
                                          params_file=N_GRAM_PARAMS,
                                          vocabulary_dir=vocab_dir)

            self.assertTrue(isinstance(original, NGramLanguageModel))
            self.assertTrue(isinstance(restored, NGramLanguageModel))
            self._assert_models_equal(cast(NGramLanguageModel, original),
                                      cast(NGramLanguageModel, restored))
Ejemplo n.º 10
0
 def test_valid_ppl(self):
     """Validation loss should equal the log of measured perplexity."""
     for params_set in self.params_sets:
         config = params_set.duplicate()
         training_config = config.pop('train')
         # One short epoch is enough for the loss/perplexity comparison.
         training_config["trainer"]["num_epochs"] = 1
         training_config["iterator"]["batch_size"] = 50
         model = LanguageModel.from_params(config,
                                           vocab=self.train_vocabulary)
         metrics = model.train(TEST_EXAMPLE,
                               training_config,
                               valid_file_name=TEST_EXAMPLE)
         ppl_state = model.measure_perplexity(TEST_EXAMPLE)
         self.assertAlmostEqual(np.log(ppl_state.avg_perplexity),
                                metrics["validation_loss"],
                                places=3)
Ejemplo n.º 11
0
def train(model_path,
          train_path,
          val_path=None,
          vocabulary_path=None,
          config_path=None):
    """Train a language model described by a config file.

    :param model_path: directory holding (by default) the vocabulary and
        ``config.json``; checkpoints are serialized here as well.
    :param train_path: training corpus file.
    :param val_path: optional validation corpus file.
    :param vocabulary_path: vocabulary directory; defaults to
        ``<model_path>/vocabulary``.
    :param config_path: model config; defaults to
        ``<model_path>/config.json``.
    :raises FileNotFoundError: if the vocabulary directory does not exist.
    """
    vocabulary_path = vocabulary_path or os.path.join(model_path, "vocabulary")
    # A plain `assert` disappears under `python -O`; raise explicitly so the
    # precondition is always checked.
    if not os.path.isdir(vocabulary_path):
        raise FileNotFoundError(
            "Can't find vocab, run preprocess.py first: " + vocabulary_path)
    vocabulary = Vocabulary.from_files(vocabulary_path)

    config_path = config_path or os.path.join(model_path, "config.json")
    params = Params.from_file(config_path)
    train_params = params.pop("train", Params({}))
    model = LanguageModel.from_params(params, vocab=vocabulary)
    model.train(train_path,
                train_params,
                serialization_dir=model_path,
                valid_file_name=val_path)
Ejemplo n.º 12
0
def measure_ppl(model_path, val_path):
    """Load the model at ``model_path`` and measure perplexity on ``val_path``."""
    LanguageModel.load(model_path).measure_perplexity(val_path)
Ejemplo n.º 13
0
 def __init__(self, vocab: Vocabulary, transforms: List[Transform] = None):
     """Initialize via the LanguageModel base class with no reader."""
     LanguageModel.__init__(self, vocab, transforms)