# Imports assumed by these tests; module paths follow the Rasa 2.x layout.
from typing import List, Text, Tuple

import numpy as np
from _pytest.monkeypatch import MonkeyPatch

from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS
from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData

# RESTRICTED_ACCESS_URL is assumed to be defined elsewhere in this test module.


def test_convert_featurizer_process(monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()

    # Skip the model URL validation that would otherwise hit the network.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    component_config = {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = ConveRTFeaturizer(component_config)

    sentence = "Hey how are you today ?"
    message = Message.build(text=sentence)
    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    featurizer.process(message, tf_hub_module=featurizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    # One sequence vector per token; spot-check the first five dimensions of
    # the first sequence vector and the last sentence vector.
    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
def test_convert_featurizer_tokens_to_text(
    sentence: Text, expected_text: Text, monkeypatch: MonkeyPatch
):
    # `sentence` and `expected_text` are supplied by a @pytest.mark.parametrize
    # decorator that is not shown in this excerpt.
    tokenizer = WhitespaceTokenizer()
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    component_config = {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = ConveRTFeaturizer(component_config)

    message = Message.build(text=sentence)
    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    # Reconstructing text from the sub-tokens should yield the expected form.
    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]
    assert expected_text == actual_text
def test_convert_featurizer_number_of_sub_tokens(
    text: Text, expected_number_of_sub_tokens: List[int], monkeypatch: MonkeyPatch
):
    # `text` and `expected_number_of_sub_tokens` are supplied by a
    # @pytest.mark.parametrize decorator that is not shown in this excerpt.
    tokenizer = WhitespaceTokenizer()
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    component_config = {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = ConveRTFeaturizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    # Each whitespace token records how many ConveRT sub-tokens it splits into.
    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in tokens
    ] == expected_number_of_sub_tokens
def test_convert_featurizer_token_edge_cases(
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int]],
    monkeypatch: MonkeyPatch,
):
    # `text`, `expected_tokens`, and `expected_indices` are supplied by a
    # @pytest.mark.parametrize decorator that is not shown in this excerpt.
    tokenizer = WhitespaceTokenizer()
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    component_config = {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = ConveRTFeaturizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    # Token texts and their character offsets must both match the fixtures.
    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
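# All four tests above repeat the same setup: patch the model-URL validation,
# build a ConveRTFeaturizer, and tokenize a single-message TrainingData. A
# minimal sketch of a helper that could factor this out; the name
# `_tokenized_message` is illustrative and not part of the original module.
def _tokenized_message(text: Text, monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    featurizer = ConveRTFeaturizer(
        {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    )
    message = Message.build(text=text)
    tokenizer.train(TrainingData([message]))
    tokens = featurizer.tokenize(message, attribute=TEXT)
    return featurizer, message, tokens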