Example 1
# Imports needed to run this snippet standalone (scoring is part of the rouge_score package).
import numpy as np

from rouge_score import scoring


def _score_lcs(target_tokens, prediction_tokens):
    """Computes LCS (Longest Common Subsequence) rouge scores.

  Args:
    target_tokens: Tokens from the target text.
    prediction_tokens: Tokens from the predicted text.
  Returns:
    A Score object containing computed scores.
  """

    if not target_tokens or not prediction_tokens:
        return scoring.Score(precision=0, recall=0, fmeasure=0)

    # Compute length of LCS from the bottom up in a table (DP approach).
    cols = len(prediction_tokens) + 1
    rows = len(target_tokens) + 1
    lcs_table = np.zeros((rows, cols))
    for i in range(1, rows):
        for j in range(1, cols):
            if target_tokens[i - 1] == prediction_tokens[j - 1]:
                lcs_table[i, j] = lcs_table[i - 1, j - 1] + 1
            else:
                lcs_table[i, j] = max(lcs_table[i - 1, j], lcs_table[i, j - 1])
    lcs_length = lcs_table[-1, -1]

    precision = lcs_length / len(prediction_tokens)
    recall = lcs_length / len(target_tokens)
    fmeasure = scoring.fmeasure(precision, recall)

    return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
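A minimal usage sketch for the snippet above, assuming the rouge_score package is installed so that scoring.Score and scoring.fmeasure are available; the token lists and the expected numbers are illustrative, not taken from the library's tests:

# Illustrative input, whitespace-tokenized.
target_tokens = "the quick brown fox jumps".split()
prediction_tokens = "the quick fox jumps".split()
score = _score_lcs(target_tokens, prediction_tokens)
# The LCS is "the quick fox jumps" (length 4), so
# precision = 4 / 4 = 1.0 and recall = 4 / 5 = 0.8.
print(score)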
Example 2
    def testConsistentPercentiles(self):
        aggregator = scoring.BootstrapAggregator(confidence_interval=0.9)
        aggregator.add_scores({
            "rouge1":
            scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2)
        })
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=0, recall=0, fmeasure=0)})
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=1, recall=1, fmeasure=1)})
        result = aggregator.aggregate()

        self.assertSimilarAggregates((1 / 3, 2 / 3, 3 / 3),
                                     (1 / 9, 4 / 9, 7 / 9),
                                     (1 / 6, 3 / 6, 5 / 6),
                                     result["rouge1"],
                                     delta=1e-8)
Example 3
    def testLargeConfidence(self):
        aggregator = scoring.BootstrapAggregator(confidence_interval=0.0)
        aggregator.add_scores({
            "rouge1":
            scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2)
        })
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=0, recall=0, fmeasure=0)})
        aggregator.add_scores(
            {"rouge1": scoring.Score(precision=1, recall=1, fmeasure=1)})
        result = aggregator.aggregate()

        self.assertSimilarAggregates((2 / 3, 2 / 3, 2 / 3),
                                     (4 / 9, 4 / 9, 4 / 9),
                                     (3 / 6, 3 / 6, 3 / 6),
                                     result["rouge1"],
                                     delta=1e-8)
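Both test methods above feed hand-built Score objects into scoring.BootstrapAggregator and check the low/mid/high percentiles it reports. A hedged end-to-end sketch of how the aggregator is normally paired with RougeScorer follows; the example texts are made up, and the API calls are assumed to match the public rouge_score package:

from rouge_score import rouge_scorer, scoring

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
aggregator = scoring.BootstrapAggregator(confidence_interval=0.95)

pairs = [
    ("the cat sat on the mat", "a cat was sitting on the mat"),
    ("dogs bark at strangers", "the dog barked at a stranger"),
]
for target, prediction in pairs:
    # score() returns a dict mapping rouge type -> Score(precision, recall, fmeasure).
    aggregator.add_scores(scorer.score(target, prediction))

result = aggregator.aggregate()
# Each entry holds low/mid/high Score tuples for the bootstrap confidence interval.
print(result["rouge1"].mid.fmeasure)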
Example 4
# Imports needed by this snippet; _union_lcs is an internal helper (see the sketch after the function).
import collections

from rouge_score import scoring


def _summary_level_lcs(ref_sent, can_sent):
    """ROUGE: Summary-level LCS, section 3.2 in ROUGE paper.

  Args:
    ref_sent: list of tokenized reference sentences
    can_sent: list of tokenized candidate sentences

  Returns:
    summary level ROUGE score
  """
    if not ref_sent or not can_sent:
        return scoring.Score(precision=0, recall=0, fmeasure=0)

    m = sum(map(len, ref_sent))
    n = sum(map(len, can_sent))
    if not n or not m:
        return scoring.Score(precision=0, recall=0, fmeasure=0)

    # get token counts to prevent double counting
    token_cnts_r = collections.Counter()
    token_cnts_c = collections.Counter()
    for s in ref_sent:
        # s is a list of tokens
        token_cnts_r.update(s)
    for s in can_sent:
        token_cnts_c.update(s)

    hits = 0
    for r in ref_sent:
        lcs = _union_lcs(r, can_sent)
        # Prevent double-counting:
        # The paper describes just computing hits += len(_union_lcs()),
        # but the implementation prevents double counting. We also
        # implement this as in version 1.5.5.
        for t in lcs:
            if token_cnts_c[t] > 0 and token_cnts_r[t] > 0:
                hits += 1
                token_cnts_c[t] -= 1
                token_cnts_r[t] -= 1

    recall = hits / m
    precision = hits / n
    fmeasure = scoring.fmeasure(precision, recall)
    return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
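The snippet relies on a _union_lcs helper that is not shown. Below is a minimal sketch of it, assuming it follows the union-LCS construction from section 3.2 of the ROUGE paper: for one reference sentence, take the union of the reference positions matched by an LCS against every candidate sentence. The helper name _lcs_ref_indices is made up for this sketch, and the library's actual implementation may differ in details.

def _lcs_ref_indices(ref, can):
    # Illustrative sketch, not the library's code.
    # DP table as in Example 1, then backtrack to find which reference
    # positions take part in one longest common subsequence.
    rows, cols = len(ref) + 1, len(can) + 1
    table = [[0] * cols for _ in range(rows)]
    for i in range(1, rows):
        for j in range(1, cols):
            if ref[i - 1] == can[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    i, j, indices = rows - 1, cols - 1, set()
    while i > 0 and j > 0:
        if ref[i - 1] == can[j - 1]:
            indices.add(i - 1)
            i, j = i - 1, j - 1
        elif table[i - 1][j] >= table[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return indices


def _union_lcs(ref, can_sents):
    # Union over all candidate sentences of the reference positions matched
    # by an LCS, returned as the corresponding reference tokens.
    union = set()
    for can in can_sents:
        union |= _lcs_ref_indices(ref, can)
    return [ref[i] for i in sorted(union)]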
Example 5
# Import needed by this snippet; _lcs_table is an internal helper (see the sketch after the function).
from rouge_score import scoring


def _score_lcs(target_tokens, prediction_tokens):
  """Computes LCS (Longest Common Subsequence) rouge scores.
  Args:
    target_tokens: Tokens from the target text.
    prediction_tokens: Tokens from the predicted text.
  Returns:
    A Score object containing computed scores.
  """

  if not target_tokens or not prediction_tokens:
    return scoring.Score(precision=0, recall=0, fmeasure=0)

  # Compute length of LCS from the bottom up in a table (DP approach).
  lcs_table = _lcs_table(target_tokens, prediction_tokens)
  lcs_length = lcs_table[-1][-1]

  precision = lcs_length / len(prediction_tokens)
  recall = lcs_length / len(target_tokens)
  fmeasure = scoring.fmeasure(precision, recall)

  return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
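This version delegates the table construction to a _lcs_table helper that is not reproduced here. A sketch grounded in the DP loop shown in Example 1 follows; the helper's exact signature in the library is assumed:

def _lcs_table(ref, can):
    # Sketch based on Example 1's DP loop.
    # Bottom-up LCS length table; cell [i][j] holds the LCS length of
    # ref[:i] and can[:j], so [-1][-1] is the full LCS length.
    rows, cols = len(ref) + 1, len(can) + 1
    table = [[0] * cols for _ in range(rows)]
    for i in range(1, rows):
        for j in range(1, cols):
            if ref[i - 1] == can[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    return table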
Example 6
# Imports needed to run this snippet standalone.
import six

from rouge_score import scoring


def _score_ngrams(target_ngrams, prediction_ngrams):
  """Compute n-gram based rouge scores.
  Args:
    target_ngrams: A Counter object mapping each ngram to number of
      occurrences for the target text.
    prediction_ngrams: A Counter object mapping each ngram to number of
      occurrences for the prediction text.
  Returns:
    A Score object containing computed scores.
  """

  intersection_ngrams_count = 0
  for ngram in six.iterkeys(target_ngrams):
    intersection_ngrams_count += min(target_ngrams[ngram],
                                     prediction_ngrams[ngram])
  target_ngrams_count = sum(target_ngrams.values())
  prediction_ngrams_count = sum(prediction_ngrams.values())

  precision = intersection_ngrams_count / max(prediction_ngrams_count, 1)
  recall = intersection_ngrams_count / max(target_ngrams_count, 1)
  fmeasure = scoring.fmeasure(precision, recall)

  return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
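A small usage sketch, assuming a helper that builds n-gram Counters from a token list; the _ngrams name and the texts are made up for illustration, and the package has its own internal equivalent:

import collections

def _ngrams(tokens, n):
    # Hypothetical helper: count n-grams as tuples so they can serve as Counter keys.
    return collections.Counter(
        tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

target = "the cat sat on the mat".split()
prediction = "the cat lay on the mat".split()
score = _score_ngrams(_ngrams(target, 2), _ngrams(prediction, 2))
# Three of the five target bigrams ("the cat", "on the", "the mat") also occur
# in the prediction, so precision = recall = 3 / 5 = 0.6.
print(score)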