Example #1
0
    def test_gpt2_featurize(self):
        """Featurize self.TEST_DATA with a GPT-2 backed Classifier and check the
        returned sequence features against stored reference activations.

        Uses a one-element generator dataset so the estimator sees exactly the
        pre-encoded example, then compares only the non-padded token positions.
        """
        model = Classifier(base_model=GPT2)

        # Encode and array-format the example BEFORE defining the generator so
        # the closure captures an already-bound variable. (The original bound
        # arr_encoded only after estimator.predict() was constructed, relying
        # on the input_fn being evaluated lazily at first next().)
        encoded = model.input_pipeline.text_encoder._encode(self.TEST_DATA)
        encoded = EncodedOutput(token_ids=encoded.token_ids[0])
        arr_encoded = model.input_pipeline._array_format(encoded)

        def dataset_encoded():
            # Single pre-encoded example fed straight into the estimator.
            yield {"tokens": arr_encoded.token_ids, "mask": arr_encoded.mask}

        def get_input_fn():
            types, shapes = model.input_pipeline.feed_shape_type_def()
            tf_dataset = Dataset.from_generator(dataset_encoded, types[0], shapes[0])
            return tf_dataset.batch(1)

        estimator, hooks = model.get_estimator(force_build_lm=False)
        predict = estimator.predict(
            input_fn=get_input_fn, predict_keys=[PredictMode.SEQUENCE], hooks=hooks
        )
        sequence_features = next(predict)[PredictMode.SEQUENCE]

        # Compare only the real (non-padded) token positions to the reference.
        np.testing.assert_allclose(
            sequence_features[:len(encoded.token_ids), :],
            np.load(
                os.path.join(
                    DIRECTORY,
                    'data/test-gpt2-activations.npy'
                )
            ),
            atol=1e-1
        )
Example #2
0
    def test_early_termination_lm(self):
        """Text generation must stop at the first _classify_ (end) token
        instead of emitting all 100 mocked tokens."""
        model = Classifier(verbose=False)

        # A dirty mock to make all model inferences output a hundred
        # _classify_ tokens. get_estimator must return an (estimator, hooks)
        # pair to match the real interface — its other call sites unpack
        # `estimator, hooks = model.get_estimator(...)`.
        fake_estimator = MagicMock()
        model.get_estimator = lambda *args, **kwargs: (fake_estimator, [])
        fake_estimator.predict = MagicMock(
            return_value=iter([{
                "GEN_TEXT": 100 * [ENCODER['_classify_']]
            }]))

        lm_out = model.generate_text()
        # Only the start token plus a single terminating _classify_ survive.
        self.assertEqual(lm_out, '_start__classify_')
Example #3
0
    def test_generate_text_stop_early(self):
        """Generation halts at the first _classify_ token rather than
        emitting the full mocked stream."""
        model = Classifier()

        encoder = model.input_pipeline.text_encoder
        encoder._lazy_init()

        # Crude mock: every inference yields a hundred _classify_ tokens.
        mocked_estimator = MagicMock()
        mocked_estimator.predict = MagicMock(return_value=iter([{
            PredictMode.GENERATE_TEXT:
            [encoder["_classify_"]] * 100
        }]))
        model.get_estimator = lambda *args, **kwargs: (mocked_estimator, [])

        begin_id = encoder.start
        begin_token = encoder.decoder[begin_id]
        generated = model.generate_text(use_extra_toks=True)
        self.assertEqual(generated, "{}_classify_".format(begin_token))