import numpy as np
import pytest
import tensorflow as tf

from aspect_based_sentiment_analysis import (
    Example,
    Pipeline,
    PredictedExample,
    Review,
    Sentiment,
)


def test_predict(nlp: Pipeline):
    text_1 = ("We are great fans of Slack, but we wish the subscriptions "
              "were more accessible to small startups.")
    text_2 = "We are great fans of Slack"
    aspect = "Slack"
    examples = [Example(text_1, aspect), Example(text_2, aspect)]
    tokenized_examples = nlp.tokenize(examples)
    input_batch = nlp.encode(tokenized_examples)
    output_batch = nlp.predict(input_batch)
    # Two examples, three sentiment classes.
    assert output_batch.scores.shape == [2, 3]
    # The internals follow a BERT-base layout: 12 layers (plus the
    # embedding output), 12 heads, sequences padded to 25 tokens,
    # hidden size 768.
    assert output_batch.hidden_states.shape == [2, 13, 25, 768]
    assert output_batch.attentions.shape == [2, 12, 12, 25, 25]
    assert output_batch.attention_grads.shape == [2, 12, 12, 25, 25]
    # Both texts speak of Slack favorably, so both examples
    # should resolve to class 2.
    scores = output_batch.scores.numpy()
    assert np.argmax(scores, axis=-1).tolist() == [2, 2]
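

# A minimal sketch of turning the raw scores above into sentiment labels.
# It relies on class 2 being positive (consistent with `test_review` below);
# the full neutral/negative/positive ordering of the Sentiment enum is an
# assumption here, not something `test_predict` asserts.
def sentiments_from_scores(scores: np.ndarray) -> list:
    # Pick the highest-scoring class for each example in the batch,
    # e.g. [2, 2] -> [Sentiment.positive, Sentiment.positive].
    return [Sentiment(int(index)) for index in np.argmax(scores, axis=-1)]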
def test_review(nlp: Pipeline):
    text_1 = ("We are great fans of Slack, but we wish the subscriptions "
              "were more accessible to small startups.")
    text_2 = "The Slack often has bugs."
    aspect = "slack"
    examples = [Example(text_1, aspect), Example(text_2, aspect)]
    tokenized_examples = nlp.tokenize(examples)
    input_batch = nlp.encode(tokenized_examples)
    output_batch = nlp.predict(input_batch)
    # `review` turns raw model outputs back into labeled examples.
    predictions = nlp.review(tokenized_examples, output_batch)
    predictions = list(predictions)
    labeled_1, labeled_2 = predictions
    assert labeled_1.sentiment == Sentiment.positive
    assert labeled_2.sentiment == Sentiment.negative
    assert isinstance(labeled_1, PredictedExample)
    assert isinstance(labeled_1.scores, list)
    assert isinstance(labeled_1.review, Review)
    # A plain prediction: not a reference review, and no patterns attached.
    assert not labeled_1.review.is_reference
    assert not labeled_1.review.patterns
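

# The tests in this module receive `nlp` as a pytest fixture. A minimal
# sketch of such a fixture, assuming the package's top-level `load` helper;
# the fixture name here is illustrative so it does not shadow the real one
# (typically defined in conftest.py).
@pytest.fixture(scope='module')
def nlp_sketch() -> Pipeline:
    import aspect_based_sentiment_analysis as absa
    # Load the default pre-trained pipeline once per test module.
    return absa.load()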
def test_encode(nlp: Pipeline):
    text_1 = ("We are great fans of Slack, but we wish the subscriptions "
              "were more accessible to small startups.")
    text_2 = "We are great fans of Slack"
    aspect = "Slack"
    examples = [Example(text_1, aspect), Example(text_2, aspect)]
    tokenized_examples = nlp.tokenize(examples)
    input_batch = nlp.encode(tokenized_examples)
    assert isinstance(input_batch.token_ids, tf.Tensor)
    # 101 is the [CLS] token, 102 the [SEP] token.
    token_ids = input_batch.token_ids.numpy()
    values = [101, 2057, 2024, 2307, 4599, 1997, 19840, 102, 19840, 102]
    assert token_ids[1, :10].tolist() == values
    assert token_ids[0, :7].tolist() == values[:7]
    # The second sequence is shorter, so it should be zero-padded
    # and masked out beyond its last token.
    assert np.allclose(token_ids[1, 10:], 0)
    attention_mask = input_batch.attention_mask.numpy()
    assert np.allclose(attention_mask[1, 10:], 0)
    # Check how the tokenizer marks the segments: the trailing
    # `aspect [SEP]` pair belongs to the second segment (type 1),
    # everything before it to the first (type 0).
    token_type_ids = input_batch.token_type_ids.numpy()
    assert token_type_ids[0, -2:].tolist() == [1, 1]
    assert np.allclose(token_type_ids[0, :-2], 0)
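

# A minimal sketch of the pair layout the assertions above rely on:
# `[CLS] text ... [SEP] aspect [SEP]`. It reads the layout off the ids
# alone (101/102 as [CLS]/[SEP]), so it assumes nothing beyond what the
# test checks; the helper name is illustrative, not part of the API.
def split_pair_segments(token_ids: np.ndarray) -> tuple:
    # The two [SEP] markers close the text and aspect segments in turn.
    first_sep, second_sep = np.where(token_ids == 102)[0][:2]
    text_ids = token_ids[1:first_sep]                 # after [CLS]
    aspect_ids = token_ids[first_sep + 1:second_sep]  # the aspect segment
    return text_ids, aspect_ids


# For the second example above, `split_pair_segments(token_ids[1])` yields
# the six text tokens and the single aspect token 19840.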