from unittest import mock

import numpy as np
import tensorflow as tf
from pytest import approx

# Project-specific imports. The module paths below are assumptions made to
# keep this excerpt self-contained; adjust them to the actual package layout.
# The pytest.mark.parametrize decorators that supply the test parameters
# (id, expected_output, ...) are also omitted from this excerpt.
from vocabulary import GeneralizedVocabulary, MockVocab, SpecialUnit
from language_model import LanguageModel, get_autoregressor_model_fn
from input_pipeline import LmInputDataPipeline


def test_id_to_vector_op(id, expected_output):
    vocab = MockVocab()
    t_vector = vocab.id_to_vector_op()(id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_vector = sess.run(t_vector)
    assert (r_vector == np.array(expected_output)).all()

def test_word_to_id_op(word, expected_output):
    vocab = MockVocab()
    t_id = vocab.word_to_id_op()(word)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_id = sess.run(t_id)
    assert (r_id == expected_output).all()

def test_id_to_vecor_or_default(id, default, expected_output):
    # "vecor" (sic) mirrors the spelling of the method under test.
    vocab = MockVocab()
    t_vector = vocab.id_to_vecor_or_default_op(default=default)(id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_vector = sess.run(t_vector)
    assert r_vector == approx(np.array(expected_output))

def test_id_to_word_op(id, expected_output):
    vocab = MockVocab()
    t_word = vocab.id_to_word_op()(id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_word = sess.run(t_word)
    words = [word.decode() for word in r_word]
    assert words == expected_output

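# The assertions in this module pin down the interface MockVocab must expose.
# The stub below is a sketch of that contract inferred from the tests, not
# the project's actual implementation; only the method names and the
# embedding values visible in test_load_no_batching come from the source.
class _MockVocabContract:
    """A toy vocabulary over the words "a", "b", "c".

    test_load_no_batching implies the embeddings
    a -> [1.5, 2.5, 3.5], b -> [4.5, 5.5, 6.5], c -> [7.5, 8.5, 9.5].
    """

    def word_to_id_op(self):
        """Return a callable mapping string tensors to vocabulary ids."""

    def id_to_word_op(self):
        """Return a callable mapping vocabulary ids to byte-string words."""

    def id_to_vector_op(self):
        """Return a callable mapping vocabulary ids to embedding vectors."""

    def id_to_vecor_or_default_op(self, default=None):
        """As id_to_vector_op, but map unknown ids to `default` (sic name)."""

    def special_unit_to_id(self, special_unit_name):
        """Return the vocabulary-level id of a natively supported special unit."""

    def get_non_id_integer(self):
        """Return the sentinel integer used for ids with no vocabulary entry."""
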
def test_predictions_ids_to_tokens__undefined_id_proof(top_predicted_words_ids,
                                                       expected_tokens):
    mockLanguageModel = mock.Mock()
    mockLanguageModel.predictions_ids_to_tokens = (
        LanguageModel.predictions_ids_to_tokens)
    mockLanguageModel.words_as_text_preview = True
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY,
        SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE,
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)
    mockLanguageModel.vocabulary_generalized = generalized
    t_top_predicted_words_ids = tf.convert_to_tensor(top_predicted_words_ids)
    t_top_predicted_words_tokens = mockLanguageModel.predictions_ids_to_tokens(
        mockLanguageModel, t_top_predicted_words_ids)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_top_predicted_words_tokens = sess.run(t_top_predicted_words_tokens)
    # Only the first column holds real ids; the second is immaterial -- it is
    # not an id at all, and all that matters is that it doesn't cause an error.
    assert (r_top_predicted_words_tokens[:, :, 0] ==
            expected_tokens[:, :, 0]).all()

def test_get_special_unit_id__use_already_supported_ids():
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY,
        SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE,
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)
    gen_oov_id = generalized.get_special_unit_id(SpecialUnit.OUT_OF_VOCABULARY)
    oov_vocab_id = vocab.special_unit_to_id(SpecialUnit.OUT_OF_VOCABULARY)
    t_oov_id_from_vocab_via_generalized = (
        generalized.vocab_id_to_generalized_id()([oov_vocab_id]))
    with tf.Session() as sess:
        r_oov_id_from_vocab_via_generalized = sess.run(
            t_oov_id_from_vocab_via_generalized)
    assert r_oov_id_from_vocab_via_generalized[0] == gen_oov_id

def model_fn(features, labels, mode, params):
    # Nested helper from a larger estimator test; `vocab_size` comes from
    # the enclosing scope.
    vocab_copy = MockVocab()
    input_pipeline_copy = LmInputDataPipeline(vocab_copy)
    return get_autoregressor_model_fn(
        vocab_size,
        input_pipeline_copy.get_id_to_embedding_mapping())(features, labels,
                                                           mode, params)

def input_fn():
    # Nested helper from a larger estimator test; `input_generator` comes
    # from the enclosing scope. Batching is applied explicitly here via
    # padded_batch, so the pipeline itself is created with batch_size=None.
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    corpus = input_pipeline.load_data(input_dataset).repeat()
    corpus = input_pipeline.padded_batch(corpus, 3)
    return corpus

def test_load_no_batching():
    def input_generator():
        yield ["a", "b", "c"]
        yield ["c", "b"]

    expected_output = [
        (
            {
                "inputs": np.array(
                    [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0, 1.5, 2.5, 3.5],
                     [0.0, 0.0, 0.0, 4.5, 5.5, 6.5],
                     [0.0, 0.0, 0.0, 7.5, 8.5, 9.5]],
                    dtype=np.float32),
                "length": np.array(4, dtype=np.int32),
            },
            {"targets": np.array([4, 5, 6, 2], dtype=np.int32)},
        ),
        (
            {
                "inputs": np.array(
                    [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0, 7.5, 8.5, 9.5],
                     [0.0, 0.0, 0.0, 4.5, 5.5, 6.5]],
                    dtype=np.float32),
                "length": np.array(3, dtype=np.int32),
            },
            {"targets": np.array([6, 5, 2], dtype=np.int32)},
        ),
    ]
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_data = input_pipeline.load_data(input_dataset)
    it = input_data.make_initializable_iterator()
    example = it.get_next()
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(it.initializer)
        for expected in expected_output:
            actual = sess.run(example)
            assert actual[0]["inputs"] == approx(expected[0]["inputs"])
            assert actual[0]["length"] == approx(expected[0]["length"])
            assert actual[1]["targets"] == approx(expected[1]["targets"])

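# Reading test_load_no_batching's fixtures: each "inputs" row appears to be a
# 6-dim vector whose first three dims one-hot encode the special units (with
# the start-of-sequence marker as [0, 1, 0]) and whose last three dims carry
# the MockVocab word embedding, so a sequence is the start marker followed by
# the embedded words. "targets" look like the generalized ids of the words
# plus a trailing end-of-sequence id (2). This reading is an inference from
# the fixtures, not documented behaviour of LmInputDataPipeline.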
def test_get_special_unit_id__non_supported_ids_get_non_id():
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY,
        SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE,
    ]
    vocab = MockVocab()
    non_id = vocab.get_non_id_integer()
    generalized = GeneralizedVocabulary(vocab, specials)
    gen_start_id = generalized.get_special_unit_id(
        SpecialUnit.START_OF_SEQUENCE)
    gen_end_id = generalized.get_special_unit_id(SpecialUnit.END_OF_SEQUENCE)
    t_vocab_ids = generalized.generalized_id_to_vocab_id()(
        [gen_start_id, gen_end_id])
    with tf.Session() as sess:
        r_vocab_ids = sess.run(t_vocab_ids)
    assert r_vocab_ids[0] == non_id
    assert r_vocab_ids[1] == non_id

def test_get_special_unit_id__complete_uniq(specials):
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)
    ids = set()
    for special_unit_name in specials:
        id = generalized.get_special_unit_id(special_unit_name)
        assert isinstance(id, int)
        ids.add(id)
    assert len(specials) == len(ids)

def test_encode_features_op(generalized_id, expected_encoded, special_units):
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, special_units)
    generalized_id = tf.convert_to_tensor(generalized_id)
    t_encoded = generalized.encode_features_op()(generalized_id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_encoded = sess.run(t_encoded)
    assert r_encoded == approx(np.array(expected_encoded))

def test_vocab_id_to_generalized_id(vocab_id, expected_generalized_id):
    vocab = MockVocab()
    special_units = [
        SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE,
        SpecialUnit.OUT_OF_VOCABULARY,
    ]
    generalized_vocab = GeneralizedVocabulary(vocab, special_units)
    t_vocab_id = tf.convert_to_tensor(vocab_id)
    t_generalized_id = generalized_vocab.vocab_id_to_generalized_id()(
        t_vocab_id)
    with tf.Session() as sess:
        r_generalized_id = sess.run(t_generalized_id)
    assert expected_generalized_id == approx(r_generalized_id)

def test_encode_features_op_by_special_units_names(special_unit_to_encode,
                                                   supported_special_units,
                                                   expected_encoded):
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, supported_special_units)
    generalized_id = generalized.get_special_unit_id(special_unit_to_encode)
    generalized_id = tf.convert_to_tensor([generalized_id])
    t_encoded = generalized.encode_features_op()(generalized_id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_encoded = sess.run(t_encoded)
    assert r_encoded == approx(np.array([expected_encoded]))

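# Presumably encode_features_op produces the same 6-dim rows (special-unit
# one-hot prefix plus word embedding) seen in test_load_no_batching's
# "inputs"; the expected_encoded fixtures are not shown in this excerpt, so
# this is an inference rather than a documented guarantee.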
def test_generalized_id_to_token():
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY,
        SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE,
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)
    ids = [0, 1, 2, 4, 5, 6]
    expected = [
        b"<<ZERO>>", b"<<START_OF_SEQUENCE>>", b"<<END_OF_SEQUENCE>>",
        b"a", b"b", b"c"
    ]
    tokens = generalized.generalized_id_to_token()(ids)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_tokens = sess.run(tokens)
    assert (r_tokens == expected).all()

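# The fixture above (together with the "???" placeholder in the next test's
# comment) suggests the generalized id layout: 0 = <<ZERO>> (padding),
# 1 = start of sequence, 2 = end of sequence, 3 = presumably out of
# vocabulary (never asserted here), and ids from 4 upward are the vocabulary
# words offset past the special units. This layout is inferred from the
# fixtures, not from documentation.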
def test_predictions_ids_to_tokens(top_predicted_words_ids, expected_tokens):
    # Generalized id -> token: b"<<ZERO>>", b"<<START_OF_SEQUENCE>>",
    # b"<<END_OF_SEQUENCE>>", ???, b"a", b"b", b"c"
    mockLanguageModel = mock.Mock()
    mockLanguageModel.predictions_ids_to_tokens = (
        LanguageModel.predictions_ids_to_tokens)
    mockLanguageModel.words_as_text_preview = True
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY,
        SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE,
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)
    mockLanguageModel.vocabulary_generalized = generalized
    t_top_predicted_words_ids = tf.convert_to_tensor(top_predicted_words_ids)
    t_top_predicted_words_tokens = mockLanguageModel.predictions_ids_to_tokens(
        mockLanguageModel, t_top_predicted_words_ids)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_top_predicted_words_tokens = sess.run(t_top_predicted_words_tokens)
    assert (r_top_predicted_words_tokens == expected_tokens).all()

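# Both predictions tests feed predictions_ids_to_tokens a rank-3 tensor of
# ids; given the [:, :, 0] slice in the __undefined_id_proof variant, the
# axes are presumably (batch, position, candidate rank). Again an inference
# from the fixtures, not a documented contract.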
def input_fn():
    # Nested helper from a larger estimator test; `input_generator` comes
    # from the enclosing scope. Unlike the earlier input_fn, batching is
    # handled by the pipeline itself via batch_size=3.
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=3)
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    return input_pipeline.load_data(input_dataset).repeat()

def test_special_unit_to_id(special_unit_name, expected_output):
    vocab = MockVocab()
    assert vocab.special_unit_to_id(special_unit_name) == expected_output

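# A sketch of how the omitted parameter sets could be supplied to the
# parametrized tests in this module. The decorator below is illustrative
# only and kept commented out: the id value is hypothetical, since the
# excerpt does not show MockVocab's actual special-unit ids.
#
# @pytest.mark.parametrize("special_unit_name, expected_output", [
#     (SpecialUnit.OUT_OF_VOCABULARY, 0),  # hypothetical id
# ])
# def test_special_unit_to_id(special_unit_name, expected_output):
#     ...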