Example #1
def train_texts(texts: List[Text], model_name: Text,
                model_weights: Text) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    transformer = HFTransformersNLP(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.train(td)
    transformer.train(td)
    return messages
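The snippets on this page are shown without their imports. Assuming they come from a Rasa 1.x-era test suite, the names they use would be imported roughly as follows; the module paths are an assumption and moved in later Rasa releases, so verify them against your installed version:

# Assumed imports for the examples on this page (Rasa 1.x-era paths; these
# are a reconstruction, not part of the original snippets).
from typing import List, Text

import numpy as np

from rasa.nlu.constants import (
    TEXT,
    INTENT,
    TOKENS_NAMES,
    NUMBER_OF_SUB_TOKENS,
    DENSE_FEATURE_NAMES,
)
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
from rasa.nlu.training_data import Message, TrainingData

For context, a call such as the following would exercise train_texts; the model name and weights here are illustrative placeholders, not values taken from the example above:

# Hypothetical usage of train_texts; "bert" / "bert-base-uncased" are
# placeholder arguments for illustration only.
trained = train_texts(
    texts=["hello there", "how are you"],
    model_name="bert",
    model_weights="bert-base-uncased",
)
# Each returned Message now carries the tokens and transformer output that
# WhitespaceTokenizer and HFTransformersNLP attached during train().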
Example #2
def test_lm_featurizer_shape_values(model_name, texts, expected_shape,
                                    expected_sequence_vec, expected_cls_vec):
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)

    transformers_nlp.train(td)
    lm_featurizer.train(td)

    for index in range(len(texts)):
        computed_sequence_vec, computed_sentence_vec = messages[
            index].get_dense_features(TEXT, [])
        if computed_sequence_vec:
            computed_sequence_vec = computed_sequence_vec.features
        if computed_sentence_vec:
            computed_sentence_vec = computed_sentence_vec.features

        assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1
        assert computed_sequence_vec.shape[1] == expected_shape[index][1]
        assert computed_sentence_vec.shape[0] == 1
        assert computed_sentence_vec.shape[1] == expected_shape[index][1]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[:len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(computed_sentence_vec[0][:5],
                           expected_cls_vec[index],
                           atol=1e-5)

        intent_sequence_vec, intent_sentence_vec = messages[
            index].get_dense_features(INTENT, [])
        if intent_sequence_vec:
            intent_sequence_vec = intent_sequence_vec.features
        if intent_sentence_vec:
            intent_sentence_vec = intent_sentence_vec.features

        assert intent_sequence_vec is None
        assert intent_sentence_vec is None
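The fixture data driving this test is not shown. Judging from the signature, it is a pytest parametrization shaped roughly like the sketch below; every numeric value here is a placeholder for illustration, not a real expected vector from the original suite:

import pytest

# Hypothetical parametrization for the shape test above. Shapes and vector
# values are placeholders only; the real suite compares against precomputed
# model outputs.
@pytest.mark.parametrize(
    "model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec",
    [
        (
            "bert",
            ["hello", "hello there"],
            [(2, 768), (3, 768)],  # rows = tokens plus one CLS row (placeholder)
            [[0.1], [0.1, 0.2]],   # placeholder first-dimension values
            [[0.1, 0.2, 0.3, 0.4, 0.5]] * 2,  # placeholder first five CLS dims
        ),
    ],
)
def test_lm_featurizer_shape_values(model_name, texts, expected_shape,
                                    expected_sequence_vec, expected_cls_vec):
    ...  # body as in Example #2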
Example #3
def test_lm_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    transformers_config = {"model_name": "bert"}  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    message = Message.build(text=text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])[:-1]
    ] == expected_number_of_sub_tokens
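The [:-1] slice drops the trailing CLS token that Rasa 1.x tokenizers append to the token list. This test is clearly driven by pytest.mark.parametrize; a plausible parametrization would look like the sketch below, where the expected counts are illustrative guesses for a BERT wordpiece vocabulary, not values verified against the original suite:

import pytest

# Hypothetical parametrization; sub-token counts are illustrative guesses
# for a BERT wordpiece tokenizer, not taken from the original suite.
@pytest.mark.parametrize(
    "text, expected_number_of_sub_tokens",
    [
        ("hello", [1]),                   # common word stays one sub-token
        ("sentence embeddings", [1, 4]),  # rarer word splits into wordpieces
    ],
)
def test_lm_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    ...  # body as in Example #3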
Example #4
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    transformers_config = {"model_name": "bert"}  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer(component_config)

    message = Message.build(text=text)
    message.set(INTENT, text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
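A plausible parametrization for this test, sketched below with made-up intent labels: with intent_tokenization_flag enabled and intent_split_symbol "+", a composite intent label should split on that symbol.

import pytest

# Hypothetical parametrization for the test above; the labels are
# illustrative, not taken from the original suite.
@pytest.mark.parametrize(
    "text, expected_tokens",
    [
        ("play+music", ["play", "music"]),  # composite label splits on "+"
        ("greet", ["greet"]),               # plain label stays one token
    ],
)
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    ...  # body as in Example #4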
Example #5
def test_lm_featurizer_shape_values():
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[0]

    # Two transformer components run back to back on the same TrainingData;
    # the "gpt" pass overwrites the language-model attributes the "bert"
    # pass attached, so the featurizer below sees only the GPT output.
    transformers_nlp_bert = HFTransformersNLP({"model_name": "bert"})
    transformers_nlp_gpt = HFTransformersNLP({"model_name": "gpt"})
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)
    show_training_data(td)
    transformers_nlp_bert.train(td)
    show_training_data(td)
    transformers_nlp_gpt.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)
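show_training_data is not defined in these snippets and reads like a local debugging helper. A minimal sketch of what such a helper might do, assuming the Rasa 1.x names from the imports above; this is a stand-in, not the original function:

# Hypothetical stand-in for the undefined show_training_data helper: dump
# each message's text, tokens, and dense-feature shape so the effect of each
# train() call can be inspected between steps.
def show_training_data(td: TrainingData) -> None:
    for message in td.training_examples:
        tokens = message.get(TOKENS_NAMES[TEXT])
        features = message.get(DENSE_FEATURE_NAMES[TEXT])
        print(
            message.text,
            [t.text for t in tokens] if tokens else None,
            features.shape if features is not None else None,
        )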
Example #6
def test_lm_featurizer_shape_values():
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[3]
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)
    show_training_data(td)
    transformers_nlp.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)

    for index in range(len(texts)):
        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT])
        computed_sequence_vec, computed_sentence_vec = (
            computed_feature_vec[:-1],
            computed_feature_vec[-1],
        )

        assert computed_feature_vec.shape == expected_shape[index]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(
            computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
        )

        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT])

        assert intent_vec is None
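The slicing in this last example reflects the Rasa 1.x feature layout: the dense feature matrix holds one row per token plus a final sentence-level (CLS) row, which is also why Example #2 checks the sequence shape against expected_shape[index][0] - 1. A minimal numpy illustration of that split, with a toy matrix standing in for real features:

import numpy as np

# Toy stand-in for a dense feature matrix: 3 token rows plus a final
# sentence-level (CLS) row, each a 4-dimensional feature vector.
feature_vec = np.arange(16.0).reshape(4, 4)

sequence_vec, sentence_vec = feature_vec[:-1], feature_vec[-1]
assert sequence_vec.shape == (3, 4)  # one row per token
assert sentence_vec.shape == (4,)    # single CLS vector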