def test_gpt2_featurize(self):
    """GPT-2 sequence activations for TEST_DATA should match the saved reference.

    Encodes the fixture text, feeds it through the estimator's predict path,
    and compares the per-token sequence features against the stored
    ``test-gpt2-activations.npy`` tensor within a loose absolute tolerance.
    """
    model = Classifier(base_model=GPT2)

    encoded = model.input_pipeline.text_encoder._encode(self.TEST_DATA)
    encoded = EncodedOutput(token_ids=encoded.token_ids[0])
    # Compute the array-formatted encoding BEFORE defining the generator that
    # closes over it. The original assigned this after estimator.predict(...),
    # relying on the predict generator's lazy evaluation to avoid a NameError.
    arr_encoded = model.input_pipeline._array_format(encoded)

    def dataset_encoded():
        yield {"tokens": arr_encoded.token_ids, "mask": arr_encoded.mask}

    def get_input_fn():
        types, shapes = model.input_pipeline.feed_shape_type_def()
        tf_dataset = Dataset.from_generator(dataset_encoded, types[0], shapes[0])
        return tf_dataset.batch(1)

    estimator, hooks = model.get_estimator(force_build_lm=False)
    predict = estimator.predict(
        input_fn=get_input_fn, predict_keys=[PredictMode.SEQUENCE], hooks=hooks
    )
    sequence_features = next(predict)[PredictMode.SEQUENCE]

    np.testing.assert_allclose(
        # Only the real tokens are compared; positions past the input length
        # are padding and excluded from the check.
        sequence_features[:len(encoded.token_ids), :],
        np.load(
            os.path.join(
                DIRECTORY,
                'data/test-gpt2-activations.npy'
            )
        ),
        atol=1e-1
    )
def test_early_termination_lm(self):
    """Language-model generation must stop at the first _classify_ token.

    The estimator is stubbed so every inference yields one hundred
    _classify_ token ids; generation should still terminate after the first.
    """
    model = Classifier(verbose=False)

    # Crude mock: replace get_estimator so predict() emits a single result
    # containing 100 repetitions of the _classify_ token id.
    # NOTE(review): this stub returns a bare estimator, whereas the sibling
    # generate-text test returns an (estimator, hooks) pair — confirm which
    # get_estimator signature this test targets.
    stub_estimator = MagicMock()
    model.get_estimator = lambda *args, **kwargs: stub_estimator
    classify_ids = 100 * [ENCODER['_classify_']]
    stub_estimator.predict = MagicMock(
        return_value=iter([{"GEN_TEXT": classify_ids}])
    )

    generated = model.generate_text()
    self.assertEqual(generated, '_start__classify_')
def test_generate_text_stop_early(self):
    """generate_text should terminate as soon as _classify_ is produced.

    The estimator is stubbed so predict() yields one hundred _classify_
    token ids; the output must be just the start token plus one _classify_.
    """
    model = Classifier()

    # Crude mock: get_estimator returns our stub (with no hooks), and its
    # predict() emits 100 repetitions of the _classify_ token id.
    stub_estimator = MagicMock()
    model.get_estimator = lambda *args, **kwargs: (stub_estimator, [])

    encoder = model.input_pipeline.text_encoder
    encoder._lazy_init()
    classify_ids = 100 * [encoder["_classify_"]]
    stub_estimator.predict = MagicMock(
        return_value=iter([{PredictMode.GENERATE_TEXT: classify_ids}])
    )

    start_token = encoder.decoder[encoder.start]
    generated = model.generate_text(use_extra_toks=True)
    self.assertEqual(generated, "{}_classify_".format(start_token))