def test_tagged_corpus_get_tag_statistic():
    """Check the per-tag counts returned by ``TaggedCorpus._get_tag_to_count``.

    Three sentences are built. Tokens of the first two receive hand-picked
    ``ner`` tags by index; the third gets no tags. All three are passed to
    ``TaggedCorpus._get_tag_to_count`` together with the tag type ``ner``, and
    the resulting mapping is expected to hold: ``S-ORG`` 1, ``S-LOC`` 1,
    ``B-ORG`` 2, ``E-ORG`` 2, ``I-ORG`` 1.
    """
    # NOTE(review): a function with this exact name is defined again later in
    # this file; if both live at module level the later one replaces this one,
    # so this variant would never be collected -- confirm and deduplicate.
    train_sentence = Sentence(u'Zalando Research is located in Berlin .')
    for position, label in ((0, u'B-ORG'), (1, u'E-ORG'), (5, u'S-LOC')):
        train_sentence[position].add_tag(u'ner', label)

    dev_sentence = Sentence(
        u'Facebook, Inc. is a company, and Google is one as well.',
        use_tokenizer=True)
    dev_labels = ((0, u'B-ORG'), (1, u'I-ORG'), (2, u'E-ORG'), (8, u'S-ORG'))
    for position, label in dev_labels:
        dev_sentence[position].add_tag(u'ner', label)

    # Deliberately left without any tags.
    test_sentence = Sentence(u'Nothing to do with companies.')

    tag_counts = TaggedCorpus._get_tag_to_count(
        [train_sentence, dev_sentence, test_sentence], u'ner')

    # (tag, expected number of occurrences across the three sentences)
    expectations = (
        (u'S-ORG', 1),
        (u'S-LOC', 1),
        (u'B-ORG', 2),
        (u'E-ORG', 2),
        (u'I-ORG', 1),
    )
    for label, occurrences in expectations:
        assert (occurrences == tag_counts[label])
def test_tagged_corpus_get_tag_statistic():
    """Verify the tag frequencies computed by ``TaggedCorpus._get_tag_to_count``.

    Two sentences are annotated with ``ner`` tags on selected token indices
    and a third is left unannotated. The counts returned for the three
    sentences must be 1 for ``S-ORG``, ``S-LOC`` and ``I-ORG`` and 2 for
    ``B-ORG`` and ``E-ORG``.
    """
    # NOTE(review): an earlier function in this file carries the same name
    # (it differs only in using u'' literals); at module level this later
    # definition would replace it -- confirm and drop the duplicate.

    def annotate(sentence, labels_by_index):
        # Attach each "ner" label to the token at the given index.
        for index, label in labels_by_index:
            sentence[index].add_tag("ner", label)
        return sentence

    train_sentence = annotate(
        Sentence("Zalando Research is located in Berlin ."),
        [(0, "B-ORG"), (1, "E-ORG"), (5, "S-LOC")],
    )
    dev_sentence = annotate(
        Sentence(
            "Facebook, Inc. is a company, and Google is one as well.",
            use_tokenizer=True),
        [(0, "B-ORG"), (1, "I-ORG"), (2, "E-ORG"), (8, "S-ORG")],
    )
    # No tags are added to this sentence.
    test_sentence = Sentence("Nothing to do with companies.")

    corpus_sentences = [train_sentence, dev_sentence, test_sentence]
    tag_to_count_dict = TaggedCorpus._get_tag_to_count(corpus_sentences, "ner")

    # Checked in a fixed order: (expected count, tag).
    for expected_count, tag in [
        (1, "S-ORG"),
        (1, "S-LOC"),
        (2, "B-ORG"),
        (2, "E-ORG"),
        (1, "I-ORG"),
    ]:
        assert expected_count == tag_to_count_dict[tag]