def test_tagged_corpus_statistics_multi_label():
    """Multi-label corpus statistics: per-class label counts and per-sentence token counts."""
    # NOTE(review): this function name is redefined later in the file, so only the
    # last definition is collected by pytest — confirm whether the duplicates are intentional.
    sent_train = Sentence("I love Berlin.", use_tokenizer=True).add_label('label', 'class_1')
    sent_dev = Sentence("The sun is shining.", use_tokenizer=True).add_label('label', 'class_2')
    sent_test = Sentence("Berlin is sunny.", use_tokenizer=True)
    sent_test.add_label('label', 'class_1')
    sent_test.add_label('label', 'class_2')

    all_sentences = [sent_train, sent_dev, sent_test]

    # Each class is attached to exactly two of the three sentences.
    label_counts = Corpus._count_sentence_labels(all_sentences)
    assert "class_1" in label_counts
    assert "class_2" in label_counts
    assert label_counts["class_1"] == 2
    assert label_counts["class_2"] == 2

    # Expected token counts per sentence (the asserted values include the final
    # punctuation as its own token — presumably what the tokenizer produces).
    sentence_lengths = Corpus._get_tokens_per_sentence(all_sentences)
    assert len(sentence_lengths) == 3
    assert sentence_lengths[0] == 4
    assert sentence_lengths[1] == 5
    assert sentence_lengths[2] == 4
def test_tagged_corpus_statistics_multi_label():
    """Multi-label corpus statistics using the ``labels=`` constructor argument and segtok tokenization."""
    # NOTE(review): this function name is defined multiple times in the file, so
    # pytest only collects the final definition — verify the duplication is intended.
    sentences = [
        Sentence("I love Berlin.", labels=["class_1"], use_tokenizer=segtok_tokenizer),
        Sentence("The sun is shining.", labels=["class_2"], use_tokenizer=segtok_tokenizer),
        Sentence(
            "Berlin is sunny.",
            labels=["class_1", "class_2"],
            use_tokenizer=segtok_tokenizer,
        ),
    ]

    # Each class label is carried by exactly two of the three sentences.
    label_counts = Corpus._get_class_to_count(sentences)
    assert "class_1" in label_counts
    assert "class_2" in label_counts
    assert label_counts["class_1"] == 2
    assert label_counts["class_2"] == 2

    # Expected token counts per sentence (the asserted values include the final
    # punctuation as its own token — presumably what segtok produces).
    sentence_lengths = Corpus._get_tokens_per_sentence(sentences)
    assert len(sentence_lengths) == 3
    assert sentence_lengths[0] == 4
    assert sentence_lengths[1] == 5
    assert sentence_lengths[2] == 4
def test_tagged_corpus_statistics_multi_label():
    """Multi-label corpus statistics using the ``labels=`` constructor argument and the default tokenizer."""
    # NOTE(review): earlier definitions of this same function name exist in the
    # file and are shadowed by this one — confirm whether that is deliberate.
    sentences = [
        Sentence('I love Berlin.', labels=['class_1'], use_tokenizer=True),
        Sentence('The sun is shining.', labels=['class_2'], use_tokenizer=True),
        Sentence('Berlin is sunny.', labels=['class_1', 'class_2'], use_tokenizer=True),
    ]

    # Each class label is carried by exactly two of the three sentences.
    label_counts = Corpus._get_class_to_count(sentences)
    assert 'class_1' in label_counts
    assert 'class_2' in label_counts
    assert label_counts['class_1'] == 2
    assert label_counts['class_2'] == 2

    # Expected token counts per sentence (the asserted values include the final
    # punctuation as its own token — presumably what the tokenizer produces).
    sentence_lengths = Corpus._get_tokens_per_sentence(sentences)
    assert len(sentence_lengths) == 3
    assert sentence_lengths[0] == 4
    assert sentence_lengths[1] == 5
    assert sentence_lengths[2] == 4