Ejemplo n.º 1
0
    def output_spec(self) -> Spec:
        """Describe the fields this model emits for each example.

        Always declares the joint "tokens" field, one token field per text
        segment, a prediction field ("score" when ``self.is_regression`` is
        set, otherwise "probas") and "cls_emb". Per-segment token gradients
        are declared only when ``self.config.compute_grads`` is set, and one
        attention field is declared per hidden layer.

        Returns:
          A dict mapping output field names to ``lit_types`` instances.
        """
        cfg = self.config

        # text_a is always present; text_b only when it is configured.
        segments = [cfg.text_a_name]
        if cfg.text_b_name:
            segments.append(cfg.text_b_name)

        spec = {"tokens": lit_types.Tokens()}
        for name in segments:
            spec["tokens_" + name] = lit_types.Tokens()

        if self.is_regression:
            spec["score"] = lit_types.RegressionScore(parent=cfg.label_name)
        else:
            spec["probas"] = lit_types.MulticlassPreds(
                parent=cfg.label_name,
                vocab=cfg.labels,
                null_idx=cfg.null_label_idx,
            )
        spec["cls_emb"] = lit_types.Embeddings()

        # Gradients, if requested; each one is aligned to its own segment.
        if cfg.compute_grads:
            for name in segments:
                spec["token_grad_" + name] = lit_types.TokenGradients(
                    align="tokens_" + name)

        # Attention heads, one field for each layer.
        spec.update({
            f"layer_{i}/attention": lit_types.AttentionHeads(
                align=("tokens", "tokens"))
            for i in range(self.model.config.num_hidden_layers)
        })
        return spec
Ejemplo n.º 2
0
Archivo: t5.py Proyecto: oceanfly/lit
    def output_spec(self):
        """Extend the parent output spec with token, embedding and attention fields.

        "output_text" is set to a candidates field when
        ``self.config.num_to_generate`` is greater than one. Per-layer encoder
        and decoder attention fields are added only when
        ``self.config.output_attention`` is set.

        Returns:
          The spec obtained from the parent class, with the extra fields added.
        """
        spec = super().output_spec()  # has 'output_text'
        spec["input_tokens"] = lit_types.Tokens(parent="input_text")
        spec["encoder_final_embedding"] = lit_types.Embeddings()
        # If target text is given, the following will also be populated.
        spec["target_tokens"] = lit_types.Tokens(parent="target_text")
        spec["pred_tokens"] = lit_types.TokenTopKPreds(align="target_tokens")

        wants_candidates = self.config.num_to_generate > 1
        if wants_candidates:
            spec["output_text"] = lit_types.GeneratedTextCandidates(
                parent="target_text")

        if not self.config.output_attention:
            return spec

        # Add attention for each layer: encoder over the input tokens,
        # decoder over the target tokens.
        for i in range(self.num_layers):
            encoder_key = f"encoder_layer_{i:d}_attention"
            decoder_key = f"decoder_layer_{i:d}_attention"
            spec[encoder_key] = lit_types.AttentionHeads(
                align_in="input_tokens", align_out="input_tokens")
            spec[decoder_key] = lit_types.AttentionHeads(
                align_in="target_tokens", align_out="target_tokens")
        return spec
Ejemplo n.º 3
0
  def output_spec(self) -> Spec:
    """Describe the fields this model emits for each example.

    Declares token fields (joint and per segment), the prediction field
    ("score" if ``self.is_regression`` else "probas"), "cls_emb" together
    with its "cls_grad" gradient, one averaged embedding per layer (including
    the embedding layer), per-segment input embeddings, and one attention
    field per hidden layer. "grad_class" and the per-segment token gradients
    are declared only when ``self.config.compute_grads`` is set.

    Returns:
      A dict mapping output field names to ``lit_types`` instances.
    """
    cfg = self.config

    # text_a is always present; text_b only when it is configured.
    segments = [cfg.text_a_name]
    if cfg.text_b_name:
      segments.append(cfg.text_b_name)

    spec = {"tokens": lit_types.Tokens()}
    for name in segments:
      spec["tokens_" + name] = lit_types.Tokens(parent=name)

    if self.is_regression:
      spec["score"] = lit_types.RegressionScore(parent=cfg.label_name)
    else:
      spec["probas"] = lit_types.MulticlassPreds(
          parent=cfg.label_name,
          vocab=cfg.labels,
          null_idx=cfg.null_label_idx)
    spec["cls_emb"] = lit_types.Embeddings()

    # Average embeddings, one per layer including embeddings.
    num_layers = self.model.config.num_hidden_layers
    for i in range(1 + num_layers):
      spec[f"layer_{i}/avg_emb"] = lit_types.Embeddings()

    spec["cls_grad"] = lit_types.Gradients(
        grad_for="cls_emb", grad_target_field_key="grad_class")

    # The input_embs_ and grad_class fields are used for Integrated Gradients.
    for name in segments:
      spec["input_embs_" + name] = lit_types.TokenEmbeddings(
          align="tokens_" + name)

    # Gradients, if requested.
    if cfg.compute_grads:
      spec["grad_class"] = lit_types.CategoryLabel(
          required=False, vocab=cfg.labels)
      for name in segments:
        spec["token_grad_" + name] = lit_types.TokenGradients(
            align="tokens_" + name,
            grad_for="input_embs_" + name,
            grad_target_field_key="grad_class")

    # Attention heads, one field for each layer; keys are numbered from 1.
    for i in range(num_layers):
      spec[f"layer_{i+1}/attention"] = lit_types.AttentionHeads(
          align_in="tokens", align_out="tokens")
    return spec
Ejemplo n.º 4
0
 def spec(self):
     """Return the field spec for this dataset.

     Declares the text, its tokens, coreference edges aligned to those
     tokens, several categorical metadata fields and the "pf_bls" scalar.
     """
     fields = {}
     fields["text"] = lit_types.TextSegment()
     fields["tokens"] = lit_types.Tokens(parent="text")
     fields["coref"] = lit_types.EdgeLabels(align="tokens")

     # Metadata fields for filtering and analysis.
     fields["occupation"] = lit_types.CategoryLabel()
     fields["participant"] = lit_types.CategoryLabel()
     fields["answer"] = lit_types.CategoryLabel(vocab=ANSWER_VOCAB)
     fields["someone"] = lit_types.CategoryLabel(vocab=["True", "False"])
     fields["pronouns"] = lit_types.CategoryLabel(
         vocab=[*PRONOUNS_BY_GENDER.values()])
     fields["pronoun_type"] = lit_types.CategoryLabel(
         vocab=["NOM", "POSS", "ACC"])
     fields["gender"] = lit_types.CategoryLabel(
         vocab=[member.name for member in Gender])
     fields["pf_bls"] = lit_types.Scalar()
     return fields
Ejemplo n.º 5
0
 def test_find_spec_keys(self):
   """Check utils.find_spec_keys for single types, tuples and base classes."""
   spec = {
       "score": types.RegressionScore(),
       "scalar_foo": types.Scalar(),
       "text": types.TextSegment(),
       "emb_0": types.Embeddings(),
       "emb_1": types.Embeddings(),
       "tokens": types.Tokens(),
       "generated_text": types.GeneratedText(),
   }

   # Exact type queries, including a tuple of types and a type with no match.
   regression_keys = utils.find_spec_keys(spec, types.RegressionScore)
   self.assertEqual(["score"], regression_keys)

   text_or_token_keys = utils.find_spec_keys(
       spec, (types.TextSegment, types.Tokens))
   self.assertEqual(["text", "tokens", "generated_text"], text_or_token_keys)

   embedding_keys = utils.find_spec_keys(spec, types.Embeddings)
   self.assertEqual(["emb_0", "emb_1"], embedding_keys)

   attention_keys = utils.find_spec_keys(spec, types.AttentionHeads)
   self.assertEqual([], attention_keys)

   # Check subclasses
   all_keys = utils.find_spec_keys(spec, types.LitType)
   self.assertEqual(list(spec.keys()), all_keys)

   text_keys = utils.find_spec_keys(spec, types.TextSegment)
   self.assertEqual(["text", "generated_text"], text_keys)

   scalar_keys = utils.find_spec_keys(spec, types.Scalar)
   self.assertEqual(["score", "scalar_foo"], scalar_keys)
Ejemplo n.º 6
0
 def output_spec(self) -> lit_types.Spec:
     """Return the output spec: tokens, class probabilities and an embedding."""
     spec = {}
     spec["tokens"] = lit_types.Tokens()
     # Predictions are declared with the "label" field as their parent.
     spec["probas"] = lit_types.MulticlassPreds(
         parent="label", vocab=self.LABELS)
     spec["cls_emb"] = lit_types.Embeddings()
     return spec
Ejemplo n.º 7
0
    def __init__(self, model, tasks):
        """Initialize with Stanza model and a dictionary of tasks.

        Args:
          model: A Stanza model.
          tasks: A dictionary of tasks, grouped by task type.
            Keys are the grouping, which should be one of:
              ('sequence', 'span', 'edge').
            Values are a list of stanza task names as strings.
            A grouping that is absent is treated as having no tasks.
        """
        self.model = model
        # Store lists of task name strings by grouping. A missing grouping
        # falls back to an empty list rather than raising KeyError, so callers
        # only need to list the groupings they actually use.
        self.sequence_tasks = tasks.get("sequence", [])
        self.span_tasks = tasks.get("span", [])
        self.edge_tasks = tasks.get("edge", [])

        self._input_spec = {
            "sentence": lit_types.TextSegment(),
        }

        self._output_spec = {
            "tokens": lit_types.Tokens(),
        }

        # Output spec based on specified tasks; every task field is aligned
        # to the "tokens" output field.
        for task in self.sequence_tasks:
            self._output_spec[task] = lit_types.SequenceTags(align="tokens")
        for task in self.span_tasks:
            self._output_spec[task] = lit_types.SpanLabels(align="tokens")
        for task in self.edge_tasks:
            self._output_spec[task] = lit_types.EdgeLabels(align="tokens")
Ejemplo n.º 8
0
 def output_spec(self) -> lit_types.Spec:
     """Return the output spec: tokens, class probabilities, embedding, grads."""
     spec = {}
     spec["tokens"] = lit_types.Tokens()
     spec["probas"] = lit_types.MulticlassPreds(
         parent="label", vocab=self._labels)
     spec["cls_emb"] = lit_types.Embeddings()
     # Token gradients are aligned to the "tokens" field declared above.
     spec["token_grad_sentence"] = lit_types.TokenGradients(align="tokens")
     return spec
Ejemplo n.º 9
0
 def output_spec(self) -> lit_types.Spec:
     """Return the output spec: tokens, regression score, embedding, grads."""
     spec = {}
     spec["tokens"] = lit_types.Tokens()
     spec["logits"] = lit_types.RegressionScore()
     spec["cls_emb"] = lit_types.Embeddings()
     # Token gradients are aligned to the "tokens" field declared above.
     spec["token_grad_sentence"] = lit_types.TokenGradients(align="tokens")
     return spec
Ejemplo n.º 10
0
 def output_spec(self):
     """Return the output spec: token embeddings, tokens and subword offsets."""
     spec = {}
     spec['top_layer_embs'] = lit_types.TokenEmbeddings()
     spec['wpm_tokens'] = lit_types.Tokens()
     # Offsets are declared from the 'tokens' field to 'wpm_tokens'.
     spec['offsets'] = lit_types.SubwordOffsets(
         align_in='tokens', align_out='wpm_tokens')
     return spec
Ejemplo n.º 11
0
Archivo: t5.py Proyecto: smesaric/lit
 def output_spec(self):
   """Return the output spec for generation, tokens, embeddings and attention.

   Per-layer encoder and decoder attention fields are added only when
   ``self.config.output_attention`` is set.
   """
   spec = {}
   spec["input_tokens"] = lit_types.Tokens(parent="input_text")
   spec["generation"] = lit_types.GeneratedText(parent="target_text")
   spec["encoder_final_embedding"] = lit_types.Embeddings()
   # If target text is given, the following will also be populated.
   spec["target_tokens"] = lit_types.Tokens(parent="target_text")
   spec["pred_tokens"] = lit_types.TokenTopKPreds(align="target_tokens")
   spec["rougeL"] = lit_types.Scalar()

   if not self.config.output_attention:
     return spec

   # Add attention for each layer: encoder over the input tokens, decoder
   # over the target tokens.
   for i in range(self.num_layers):
     encoder_key = f"encoder_layer_{i:d}_attention"
     decoder_key = f"decoder_layer_{i:d}_attention"
     spec[encoder_key] = lit_types.AttentionHeads(
         align=("input_tokens", "input_tokens"))
     spec[decoder_key] = lit_types.AttentionHeads(
         align=("target_tokens", "target_tokens"))
   return spec
Ejemplo n.º 12
0
 def output_spec(self) -> lit_types.Spec:
     """Return the output spec: tokens, generated text, attention and scores."""
     spec = {}
     spec["src_tokens"] = lit_types.Tokens(parent="src_text")
     spec["trg_text"] = lit_types.GeneratedText(parent="ref_text")
     spec["trg_tokens"] = lit_types.Tokens(parent="trg_text")
     # Attention is declared from the source tokens to the target tokens.
     spec["attention"] = lit_types.AttentionHeads(
         align_in="src_tokens", align_out="trg_tokens")
     spec["pred_tokens"] = lit_types.TokenTopKPreds(
         align="trg_tokens", parent="trg_text")
     spec["encoder_final_embedding"] = lit_types.Embeddings()
     # Scalar score fields.
     spec["ter"] = lit_types.Scalar()
     spec["chrf3"] = lit_types.Scalar()
     return spec
Ejemplo n.º 13
0
 def spec(self) -> lit_types.Spec:
     """Return the dataset spec; it should match MLM's input_spec()."""
     fields = {}
     fields['input_text'] = lit_types.TextSegment()
     fields['target_text'] = lit_types.TextSegment()
     # The two fields below are declared as optional.
     fields['input_tokens'] = lit_types.Tokens(required=False)
     fields['gece_tags'] = lit_types.SequenceTags(
         align='input_tokens', required=False)
     return fields
Ejemplo n.º 14
0
 def output_spec(self):
     """Return the output spec: predictions, tokens and per-layer fields.

     One attention field and one averaged-embedding field are declared for
     each of the ``self.num_layers`` layers.
     """
     spec = {}
     spec["pred_tokens"] = lit_types.TokenTopKPreds(align="tokens")
     # the "parent" keyword tells LIT which field in the input spec we should
     # compare this to when computing metrics.
     spec["tokens"] = lit_types.Tokens(parent="text")  # all tokens

     # Add attention and embeddings from each layer.
     for i in range(self.num_layers):
         attention_key = f"layer_{i:d}_attention"
         embedding_key = f"layer_{i:d}_avg_embedding"
         spec[attention_key] = lit_types.AttentionHeads(
             align_in="tokens", align_out="tokens")
         spec[embedding_key] = lit_types.Embeddings()
     return spec
Ejemplo n.º 15
0
 def output_spec(self):
   """Return the output spec: probabilities, input embeddings and gradients."""
   spec = {}
   spec['probas'] = lit_types.MulticlassPreds(
       parent='label', vocab=['0', '1'], null_idx=0)
   spec['input_embs'] = lit_types.TokenEmbeddings(align='tokens')
   # Gradients are declared with respect to 'input_embs', with 'grad_class'
   # as the gradient target.
   spec['input_embs_grad'] = lit_types.TokenGradients(
       align='tokens', grad_for='input_embs', grad_target='grad_class')
   spec['tokens'] = lit_types.Tokens()
   spec['grad_class'] = lit_types.CategoryLabel(vocab=['0', '1'])
   return spec
Ejemplo n.º 16
0
 def output_spec(self):
     """Return the output spec: tokens, coref edges and the predicted answer."""
     # TODO(lit-dev): also return the embeddings for each span on datasets
     # with a fixed number of targets; for Winogender this would be
     # {occupation, other participant, pronoun}
     spec = {}
     spec['tokens'] = lit_types.Tokens(parent='text')
     spec['coref'] = lit_types.EdgeLabels(align='tokens')
     spec['pred_answer'] = lit_types.MulticlassPreds(
         vocab=winogender.ANSWER_VOCAB, parent='answer')
     return spec
Ejemplo n.º 17
0
 def input_spec(self) -> Spec:
   """Describe the fields this model accepts for each example.

   Each configured text segment gets a text field and an optional token
   field. The label field is a regression score when ``self.is_regression``
   is set and a category label otherwise. Optional per-segment input
   embeddings and "grad_class" are declared as well.

   Returns:
     A dict mapping input field names to ``lit_types`` instances.
   """
   cfg = self.config

   # text_a is always present; text_b only when it is configured.
   segments = [cfg.text_a_name]
   if cfg.text_b_name:
     segments.append(cfg.text_b_name)

   spec = {}
   for name in segments:
     spec[name] = lit_types.TextSegment()
     spec["tokens_" + name] = lit_types.Tokens(parent=name, required=False)

   if self.is_regression:
     label_field = lit_types.RegressionScore(required=False)
   else:
     label_field = lit_types.CategoryLabel(required=False, vocab=cfg.labels)
   spec[cfg.label_name] = label_field

   # The input_embs_ and grad_class fields are used for Integrated Gradients.
   for name in segments:
     spec["input_embs_" + name] = lit_types.TokenEmbeddings(
         align="tokens", required=False)
   spec["grad_class"] = lit_types.CategoryLabel(
       required=False, vocab=cfg.labels)
   return spec
Ejemplo n.º 18
0
    def output_spec(self) -> lit_types.Spec:
        """Return the output spec: tokens, generated text and attention fields.

        Besides the 'layer_average' attention field, one attention field is
        declared for each of the ``self.ATTENTION_LAYERS`` layers.
        """
        spec = {}
        spec["input_tokens"] = lit_types.Tokens(parent="input_text")
        spec["predicted"] = lit_types.GeneratedText(parent='target_text')
        spec['layer_average'] = lit_types.AttentionHeads(
            align=('input_tokens', 'input_tokens'))

        # Per-layer attention, keyed 'layer0', 'layer1', ...
        spec.update({
            'layer{}'.format(layer): lit_types.AttentionHeads(
                align=('input_tokens', 'input_tokens'))
            for layer in range(self.ATTENTION_LAYERS)
        })
        return spec
Ejemplo n.º 19
0
 def test_compatibility_optionals(self):
     """Test with optionals in the model spec."""
     # The model declares "tokens" and "label" as optional inputs.
     model_inputs = {
         "text": types.TextSegment(),
         "tokens": types.Tokens(parent="text", required=False),
         "label": types.CategoryLabel(vocab=["0", "1"], required=False),
     }
     mspec = model.ModelSpec(input=model_inputs, output={})

     # The dataset has no "tokens" field and a label with the same vocab.
     dspec = {}
     dspec["text"] = types.TextSegment()
     dspec["label"] = types.CategoryLabel(vocab=["0", "1"])

     compatible = mspec.is_compatible_with_dataset(dspec)
     self.assertTrue(compatible)
 def output_spec(self) -> lit_types.Spec:
     """Give the output specifications.

     Token gradients are declared only when ``self.compute_grads`` is set;
     one attention field is declared per hidden layer.
     """
     spec = {}
     spec["tokens"] = lit_types.Tokens()
     spec["probas"] = lit_types.MulticlassPreds(
         parent="label", vocab=self.LABELS)
     spec["cls_emb"] = lit_types.Embeddings()

     # Gradients, if requested.
     if self.compute_grads:
         spec["token_grad_sentence"] = lit_types.TokenGradients(
             align="tokens")

     # Attention heads, one field for each layer.
     spec.update({
         f"layer_{i}/attention": lit_types.AttentionHeads(
             align=("tokens", "tokens"))
         for i in range(self.model.config.num_hidden_layers)
     })
     return spec
Ejemplo n.º 21
0
 def test_compatibility_optionals_mismatch(self):
     """Test with optionals that don't match metadata."""
     # The model declares "tokens" and "label" as optional inputs.
     model_inputs = {
         "text": types.TextSegment(),
         "tokens": types.Tokens(parent="text", required=False),
         "label": types.CategoryLabel(vocab=["0", "1"], required=False),
     }
     mspec = model.ModelSpec(input=model_inputs, output={})

     dspec = {}
     dspec["text"] = types.TextSegment()
     # This label field doesn't match the one the model expects.
     dspec["label"] = types.CategoryLabel(vocab=["foo", "bar"])

     compatible = mspec.is_compatible_with_dataset(dspec)
     self.assertFalse(compatible)
Ejemplo n.º 22
0
 def output_spec(self):
     """Return the output spec: tokens, coref edges, answer and 'pf_bls'."""
     # TODO(lit-dev): also return the embeddings for each span on datasets
     # with a fixed number of targets; for Winogender this would be
     # {occupation, other participant, pronoun}
     spec = {}
     spec['tokens'] = lit_types.Tokens(parent='text')
     spec['coref'] = lit_types.EdgeLabels(align='tokens')
     spec['pred_answer'] = lit_types.MulticlassPreds(
         vocab=winogender.ANSWER_VOCAB, parent='answer')
     # TODO(b/172975096): allow plotting of scalars from input data,
     # so we don't need to add this to the predictions.
     spec['pf_bls'] = lit_types.Scalar()
     return spec
Ejemplo n.º 23
0
 def input_spec(self):
     """Return the input spec: text, tokens, coref edges and optional extras."""
     spec = {}
     spec['text'] = lit_types.TextSegment()
     spec['tokens'] = lit_types.Tokens(parent='text')
     spec['coref'] = lit_types.EdgeLabels(align='tokens')
     # Index of predicted (single) edge for Winogender
     spec['answer'] = lit_types.CategoryLabel(
         vocab=winogender.ANSWER_VOCAB, required=False)
     # TODO(b/172975096): allow plotting of scalars from input data,
     # so we don't need to add this to the predictions.
     spec['pf_bls'] = lit_types.Scalar(required=False)
     return spec
Ejemplo n.º 24
0
 def output_spec(self) -> lit_types.Spec:
     """Return the output spec: tokens, tags, gradients, probabilities, attention.

     One attention field is declared per hidden layer of ``self.model``.
     """
     spec = {}
     spec["tokens"] = lit_types.Tokens()
     # Both tag fields and the gradients are aligned to "tokens".
     spec["bio_tags"] = lit_types.SequenceTags(align="tokens")
     spec["token_ids"] = lit_types.SequenceTags(align="tokens")
     spec["grads"] = lit_types.TokenGradients(align="tokens")
     spec["probas"] = lit_types.MulticlassPreds(
         parent="bio_tags", vocab=self.LABELS)

     spec.update({
         f'layer_{i}/attention': lit_types.AttentionHeads(
             align=("tokens", "tokens"))
         for i in range(self.model.config.num_hidden_layers)
     })
     return spec
Ejemplo n.º 25
0
 def config_spec(self) -> types.Spec:
     """Return the spec of configuration options, keyed by the module constants.

     Numeric defaults are passed as strings to text-segment fields.
     """
     spec = {}
     spec[NUM_EXAMPLES_KEY] = types.TextSegment(
         default=str(NUM_EXAMPLES_DEFAULT))
     spec[MAX_FLIPS_KEY] = types.TextSegment(default=str(MAX_FLIPS_DEFAULT))
     spec[TOKENS_TO_IGNORE_KEY] = types.Tokens(
         default=TOKENS_TO_IGNORE_DEFAULT)
     # Matches a single output field of one of the listed prediction types.
     spec[PREDICTION_KEY] = types.FieldMatcher(
         spec="output", types=["MulticlassPreds", "RegressionScore"])
     spec[REGRESSION_THRESH_KEY] = types.TextSegment(
         default=str(REGRESSION_THRESH_DEFAULT))
     # Matches input token fields; all of them are selected by default.
     spec[FIELDS_TO_HOTFLIP_KEY] = types.MultiFieldMatcher(
         spec="input", types=["Tokens"], select_all=True)
     return spec
Ejemplo n.º 26
0
 def input_spec(self):
     """Return the input spec: a text field and an optional tokens field."""
     spec = {}
     spec["text"] = lit_types.TextSegment()
     spec["tokens"] = lit_types.Tokens(required=False)
     return spec
Ejemplo n.º 27
0
 def input_spec(self):
     """Return the input spec, which consists of a single 'tokens' field."""
     spec = {}
     spec['tokens'] = lit_types.Tokens()
     return spec
Ejemplo n.º 28
0
 def input_spec(self):
     """Return the input spec: text plus optional tokens with a mask token."""
     spec = {}
     spec["text"] = lit_types.TextSegment()
     # Tokens are optional and declare "[MASK]" as their mask token.
     spec["tokens"] = lit_types.Tokens(mask_token="[MASK]", required=False)
     return spec
Ejemplo n.º 29
0
 def output_spec(self):
     """Return the output spec: tokens, top-k token predictions, an embedding."""
     spec = {}
     spec["tokens"] = lit_types.Tokens(parent="text")
     spec["pred_tokens"] = lit_types.TokenTopKPreds(align="tokens")
     spec["cls_emb"] = lit_types.Embeddings()
     return spec
Ejemplo n.º 30
0
 def spec(self):
     """Return the dataset spec: text, its tokens and coreference edges."""
     fields = {}
     fields['text'] = lit_types.TextSegment()
     fields['tokens'] = lit_types.Tokens(parent='text')
     fields['coref'] = lit_types.EdgeLabels(align='tokens')
     return fields