# Shared test dependencies. The project-local names used below (MockVocab,
# GeneralizedVocabulary, SpecialUnit, LanguageModel, LmInputDataPipeline,
# get_autoregressor_model_fn) are assumed to be importable from the package
# under test; their module paths are not shown in this listing.
import numpy as np
import tensorflow as tf
from pytest import approx
from unittest import mock


def test_id_to_vector_op(id, expected_output):
    vocab = MockVocab()
    t_vector = vocab.id_to_vector_op()(id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_vector = sess.run(t_vector)
    assert (r_vector == np.array(expected_output)).all()
def test_word_to_id_op(word, expected_output):
    vocab = MockVocab()
    t_id = vocab.word_to_id_op()(word)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_id = sess.run(t_id)
    assert (r_id == expected_output).all()
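# The parametrized tests in this file omit their @pytest.mark.parametrize
# decorators in this listing. A hypothetical invocation for the test above,
# with word-to-id pairs inferred from the MockVocab fixtures used elsewhere
# in this file (an assumption, not taken from the original), might be:
#
#     @pytest.mark.parametrize("word, expected_output", [
#         (["a", "b", "c"], [1, 2, 3]),
#     ])
#     def test_word_to_id_op(word, expected_output): ...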
def test_id_to_vecor_or_default(id, default, expected_output):
    vocab = MockVocab()
    t_vector = vocab.id_to_vecor_or_default_op(default=default)(id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_vector = sess.run(t_vector)
    assert r_vector == approx(np.array(expected_output))
def test_id_to_word_op(id, expected_output):
    vocab = MockVocab()
    t_word = vocab.id_to_word_op()(id)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_word = sess.run(t_word)
    words = [w.decode() for w in r_word]  # TF1 sessions return strings as bytes
    assert words == expected_output
def test_predictions_ids_to_tokens__undefined_id_proof(top_predicted_words_ids,
                                                       expected_tokens):
    mockLanguageModel = mock.Mock()
    mockLanguageModel.predictions_ids_to_tokens = LanguageModel.predictions_ids_to_tokens
    mockLanguageModel.words_as_text_preview = True

    specials = [
        SpecialUnit.OUT_OF_VOCABULARY, SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)

    mockLanguageModel.vocabulary_generalized = generalized

    t_top_predicted_words_ids = tf.convert_to_tensor(top_predicted_words_ids)
    t_top_predicted_words_tokens = mockLanguageModel.predictions_ids_to_tokens(
        mockLanguageModel, t_top_predicted_words_ids)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_top_predicted_words_tokens = sess.run(t_top_predicted_words_tokens)

    # Only the first entry along the last axis is a real id; the second is
    # immaterial here, it only matters that decoding it doesn't raise.
    assert (r_top_predicted_words_tokens[:, :, 0] ==
            expected_tokens[:, :, 0]).all()
def test_get_special_unit_id__use_already_supported_ids():
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY, SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)

    gen_oov_id = generalized.get_special_unit_id(SpecialUnit.OUT_OF_VOCABULARY)
    oov_vocab_id = vocab.special_unit_to_id(SpecialUnit.OUT_OF_VOCABULARY)
    t_oov_id_from_vocab_via_generalized = generalized.vocab_id_to_generalized_id()(
        [oov_vocab_id])
    with tf.Session() as sess:
        r_oov_id_from_vocab_via_generalized = sess.run(
            t_oov_id_from_vocab_via_generalized)
    assert r_oov_id_from_vocab_via_generalized[0] == gen_oov_id
def model_fn(features, labels, mode, params):
    # vocab_size comes from the enclosing test's scope in the original code.
    vocab_copy = MockVocab()
    input_pipeline_copy = LmInputDataPipeline(vocab_copy)
    return get_autoregressor_model_fn(
        vocab_size,
        input_pipeline_copy.get_id_to_embedding_mapping())(features, labels,
                                                           mode, params)
def input_fn():
    # input_generator is defined in the enclosing test in the original code.
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    corpus = input_pipeline.load_data(input_dataset).repeat()
    corpus = input_pipeline.padded_batch(corpus, 3)
    return corpus
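# A minimal sketch of how the model_fn/input_fn pair above would be wired
# into a TF1 Estimator inside the enclosing test; the step count is an
# illustrative assumption, and vocab_size must already be bound in scope.
def run_estimator_sketch():
    estimator = tf.estimator.Estimator(model_fn=model_fn)
    estimator.train(input_fn=input_fn, steps=1)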
def test_load_no_batching():
    def input_generator():
        yield ["a", "b", "c"]
        yield ["c", "b"]

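    # How to read the expected values below (inferred from the data itself,
    # not from the pipeline's documentation): each "inputs" row is a 6-dim
    # embedding, a 3-dim special-unit prefix followed by the 3-dim MockVocab
    # word vector; the first row one-hot-encodes <<START_OF_SEQUENCE>>, and
    # every "targets" sequence ends with 2, the generalized id of
    # <<END_OF_SEQUENCE>>.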
    expected_output = [
        (
            {
                "inputs":
                np.array([[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                          [0.0, 0.0, 0.0, 1.5, 2.5, 3.5],
                          [0.0, 0.0, 0.0, 4.5, 5.5, 6.5],
                          [0.0, 0.0, 0.0, 7.5, 8.5, 9.5]],
                         dtype=np.float32),
                "length":
                np.array(4, dtype=np.int32),
            },
            {
                "targets": np.array([4, 5, 6, 2], dtype=np.int32)
            },
        ),
        (
            {
                "inputs":
                np.array([
                    [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 7.5, 8.5, 9.5],
                    [0.0, 0.0, 0.0, 4.5, 5.5, 6.5],
                ],
                         dtype=np.float32),
                "length":
                np.array(3, dtype=np.int32),
            },
            {
                "targets": np.array([6, 5, 2], dtype=np.int32)
            },
        ),
    ]

    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)

    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_data = input_pipeline.load_data(input_dataset)

    it = input_data.make_initializable_iterator()
    example = it.get_next()

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(it.initializer)
        for expected in expected_output:
            actual = sess.run(example)
            assert actual[0]["inputs"] == approx(expected[0]["inputs"])
            assert actual[0]["length"] == approx(expected[0]["length"])
            assert actual[1]["targets"] == approx(expected[1]["targets"])
def test_get_special_unit_id__non_supported_ids_get_non_id():
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY, SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE
    ]
    vocab = MockVocab()
    non_id = vocab.get_non_id_integer()
    generalized = GeneralizedVocabulary(vocab, specials)

    gen_start_id = generalized.get_special_unit_id(
        SpecialUnit.START_OF_SEQUENCE)
    gen_end_id = generalized.get_special_unit_id(SpecialUnit.END_OF_SEQUENCE)
    t_vocab_ids = generalized.generalized_id_to_vocab_id()(
        [gen_start_id, gen_end_id])
    with tf.Session() as sess:
        r_vocab_ids = sess.run(t_vocab_ids)
    assert r_vocab_ids[0] == non_id
    assert r_vocab_ids[1] == non_id
def test_get_special_unit_id__complete_uniq(specials):
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)
    ids = set()
    for special_unit_name in specials:
        id = generalized.get_special_unit_id(special_unit_name)
        assert isinstance(id, int)
        ids.add(id)
    assert len(specials) == len(ids)
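# The uniqueness test above is presumably parametrized over different
# `specials` lists; a hypothetical example, using only SpecialUnit names
# that appear in this file:
#
#     @pytest.mark.parametrize("specials", [
#         [SpecialUnit.OUT_OF_VOCABULARY],
#         [SpecialUnit.OUT_OF_VOCABULARY, SpecialUnit.START_OF_SEQUENCE,
#          SpecialUnit.END_OF_SEQUENCE],
#     ])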
def test_encode_features_op(generalized_id, expected_encoded, special_units):
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, special_units)

    generalized_id = tf.convert_to_tensor(generalized_id)

    t_encoded = generalized.encode_features_op()(generalized_id)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_encoded = sess.run(t_encoded)

    assert r_encoded == approx(np.array(expected_encoded))
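# The encoded features are presumably the special-unit prefix visible in
# test_load_no_batching above, where each "inputs" row starts with a 3-dim
# one-hot block before the word vector; this link is an inference from the
# expected values, not something the tests state.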
def test_vocab_id_to_generalized_id(vocab_id, expected_generalized_id):
    vocab = MockVocab()
    special_units = [
        SpecialUnit.START_OF_SEQUENCE, SpecialUnit.END_OF_SEQUENCE,
        SpecialUnit.OUT_OF_VOCABULARY
    ]
    generalized_vocab = GeneralizedVocabulary(vocab, special_units)

    t_vocab_id = tf.convert_to_tensor(vocab_id)
    t_generalized_id = generalized_vocab.vocab_id_to_generalized_id()(
        t_vocab_id)

    with tf.Session() as sess:
        r_generalized_id = sess.run(t_generalized_id)

    assert expected_generalized_id == approx(r_generalized_id)
def test_encode_features_op_by_special_units_names(special_unit_to_encode,
                                                   supported_special_units,
                                                   expected_encoded):
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, supported_special_units)

    generalized_id = generalized.get_special_unit_id(special_unit_to_encode)
    generalized_id = tf.convert_to_tensor([generalized_id])

    t_encoded = generalized.encode_features_op()(generalized_id)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_encoded = sess.run(t_encoded)

    assert r_encoded == approx(np.array([expected_encoded]))
def test_generalized_id_to_token():
    specials = [
        SpecialUnit.OUT_OF_VOCABULARY, SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)

    ids = [0, 1, 2, 4, 5, 6]
    expected = [
        b"<<ZERO>>", b"<<START_OF_SEQUENCE>>", b"<<END_OF_SEQUENCE>>", b"a",
        b"b", b"c"
    ]

    tokens = generalized.generalized_id_to_token()(ids)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_tokens = sess.run(tokens)

    assert (r_tokens == expected).all()
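# Inferred generalized-id layout (the tests never state it outright): 0 is a
# reserved <<ZERO>> unit, 1 and 2 are the start/end specials, 3 is skipped
# above and appears to be OUT_OF_VOCABULARY (the "???" in the next test's
# comment), and vocab words start at 4, i.e. generalized_id = vocab_id + 3.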
def test_predictions_ids_to_tokens(top_predicted_words_ids, expected_tokens):
    # b"<<ZERO>>", b"<<START_OF_SEQUENCE>>", b"<<END_OF_SEQUENCE>>", ???, b"a", b"b", b"c"
    mockLanguageModel = mock.Mock()
    mockLanguageModel.predictions_ids_to_tokens = LanguageModel.predictions_ids_to_tokens
    mockLanguageModel.words_as_text_preview = True

    specials = [
        SpecialUnit.OUT_OF_VOCABULARY, SpecialUnit.START_OF_SEQUENCE,
        SpecialUnit.END_OF_SEQUENCE
    ]
    vocab = MockVocab()
    generalized = GeneralizedVocabulary(vocab, specials)

    mockLanguageModel.vocabulary_generalized = generalized

    t_top_predicted_words_ids = tf.convert_to_tensor(top_predicted_words_ids)
    t_top_predicted_words_tokens = mockLanguageModel.predictions_ids_to_tokens(
        mockLanguageModel, t_top_predicted_words_ids)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        r_top_predicted_words_tokens = sess.run(t_top_predicted_words_tokens)

    assert (r_top_predicted_words_tokens == expected_tokens).all()
def input_fn():
    # input_generator again comes from the enclosing test's scope.
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=3)
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    return input_pipeline.load_data(input_dataset).repeat()
def test_special_unit_to_id(special_unit_name, expected_output):
    vocab = MockVocab()
    assert vocab.special_unit_to_id(special_unit_name) == expected_output