def __init__(
    self,
    rouge_types,
    use_stemmer=False,
    split_summaries=False,
    tokenizer=None,
):
    """Initializes a new RougeScorer.

    Valid rouge types that can be computed are:
      rougen (e.g. rouge1, rouge2): n-gram based scoring.
      rougeL: Longest common subsequence based scoring.

    Args:
      rouge_types: A list of rouge types to calculate.
      use_stemmer: Bool indicating whether Porter stemmer should be used to
        strip word suffixes to improve matching. It is only forwarded to the
        DefaultTokenizer created when no `tokenizer` is supplied; a
        caller-supplied tokenizer is stored untouched and may or may not
        stem on its own.
      split_summaries: Whether to add newlines between sentences for
        rougeLsum.
      tokenizer: Tokenizer object which has a tokenize() method. When None
        (the default), a tokenizers.DefaultTokenizer is created instead.
    """
    self.rouge_types = rouge_types

    # Compare against None instead of relying on truthiness: a perfectly
    # valid tokenizer object may evaluate as falsy (e.g. it defines __len__
    # and is currently empty) and must not be silently replaced by the
    # default tokenizer.
    if tokenizer is not None:
        self._tokenizer = tokenizer
    else:
        self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
        logging.info("Using default tokenizer.")

    self._split_summaries = split_summaries
def testRougeTokenizerInit(self):
    """Scores a text against itself using an explicitly supplied tokenizer.

    Builds a RougeScorer for "rouge1" with a DefaultTokenizer passed in via
    the `tokenizer` argument and checks that identical target and prediction
    yield an f-measure of 1.0.
    """
    explicit_tokenizer = tokenizers.DefaultTokenizer()
    scorer = rouge_scorer.RougeScorer(["rouge1"], tokenizer=explicit_tokenizer)

    # Target and prediction are the very same string.
    text = "this is a test"
    scores = scorer.score(text, text)

    self.assertEqual(1.0, scores["rouge1"].fmeasure)
def test_default_tokenizer_with_stemmer_init(self):
    """Checks DefaultTokenizer output when constructed with use_stemmer=True.

    The tokenizer must be a tokenizers.Tokenizer instance, and tokenizing the
    sample sentence must produce the expected stemmed tokens.
    """
    stemming_tokenizer = tokenizers.DefaultTokenizer(use_stemmer=True)
    self.assertIsInstance(stemming_tokenizer, tokenizers.Tokenizer)

    expected_tokens = ["the", "friend", "had", "a", "meet"]
    actual_tokens = stemming_tokenizer.tokenize("the friends had a meeting")
    self.assertListEqual(expected_tokens, actual_tokens)
def test_default_tokenizer_no_stemmer_init(self):
    """Checks DefaultTokenizer output when constructed with use_stemmer=False.

    The tokenizer must be a tokenizers.Tokenizer instance, and tokenizing the
    sample sentence must produce the expected list of tokens.
    """
    plain_tokenizer = tokenizers.DefaultTokenizer(use_stemmer=False)
    self.assertIsInstance(plain_tokenizer, tokenizers.Tokenizer)

    expected_tokens = ["this", "is", "a", "test"]
    actual_tokens = plain_tokenizer.tokenize("this is a test")
    self.assertListEqual(expected_tokens, actual_tokens)