import pytest

from transformers import AlbertTokenizer, RobertaTokenizer, XLMTokenizer

# Assumed import path for the tokenization-normalization helpers exercised in these tests.
import jiant.utils.tokenization_normalization as tn


def test_normalize_empty_tokenizations():
    text = ""
    # "".split(" ") yields [""] while tokenize("") yields [], so normalization should fail.
    space_tokenized = text.split(" ")
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    target_tokenized = tokenizer.tokenize(text)
    with pytest.raises(ValueError):
        tn.normalize_tokenizations(space_tokenized, target_tokenized, tokenizer)


def test_space_tokenization_tokenization_normalization(hf_tokenizer, hf_model):
    text = "Jeff Immelt chose to focus on the incomprehensibility of accounting rules ."
    space_tokenized = text.split(" ")
    tokenizer = hf_tokenizer.from_pretrained(hf_model)
    target_tokenized = tokenizer.tokenize(text)
    normed_space_tokenized, normed_target_tokenized = tn.normalize_tokenizations(
        space_tokenized, target_tokenized, tokenizer)
    assert "".join(normed_space_tokenized) == "".join(normed_target_tokenized)
Example 3
def test_space_tokenization_and_xlm_uncased_tokenization_normalization():
    text = "Jeff Immelt chose to focus on the incomprehensibility of accounting rules ."
    space_tokenized = text.split(" ")
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    target_tokenized = tokenizer.tokenize(text)
    normed_space_tokenized, normed_target_tokenized = tn.normalize_tokenizations(
        space_tokenized, target_tokenized, tokenizer)
    assert "".join(normed_space_tokenized) == "".join(normed_target_tokenized)
def test_space_tokenization_and_unusual_roberta_tokenization_normalization():
    text = (
        "As a practitioner of ethnic humor from the old days on the Borscht Belt , live "
        "television and the nightclub circuit , Mr. Mason instinctively reached for the "
        "vernacular .")
    space_tokenized = text.split(" ")
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    target_tokenized = tokenizer.tokenize(text)
    # note 1: the target tokenization is asserted explicitly below to highlight an unusual case:
    # " vernacular" -> 'Ġ', 'vern', 'acular' (the usual pattern would suggest 'Ġvern', 'acular')
    assert target_tokenized == [
        "As",
        "Ġa",
        "Ġpractitioner",
        "Ġof",
        "Ġethnic",
        "Ġhumor",
        "Ġfrom",
        "Ġthe",
        "Ġold",
        "Ġdays",
        "Ġon",
        "Ġthe",
        "ĠB",
        "ors",
        "cht",
        "ĠBelt",
        "Ġ,",
        "Ġlive",
        "Ġtelevision",
        "Ġand",
        "Ġthe",
        "Ġnightclub",
        "Ġcircuit",
        "Ġ,",
        "ĠMr",
        ".",
        "ĠMason",
        "Ġinstinctively",
        "Ġreached",
        "Ġfor",
        "Ġthe",
        "Ġ",
        "vern",
        "acular",
        "Ġ.",
    ]
    normed_space_tokenized, normed_target_tokenized = tn.normalize_tokenizations(
        space_tokenized, target_tokenized, tokenizer)
    # note 2: the assert below shows that even with the unusual tokenization (see note 1 above),
    # the space tokenization and the target tokenization match after normalization.
    assert "".join(normed_space_tokenized) == "".join(normed_target_tokenized)
Example 5
    def tokenize(self, tokenizer):
        space_tokenization = self.text.split()
        target_tokenization = tokenizer.tokenize(self.text)
        # Normalize both tokenizations so they can be aligned against each other.
        normed_space_tokenization, normed_target_tokenization = normalize_tokenizations(
            space_tokenization, target_tokenization, tokenizer)
        aligner = retokenize.TokenAligner(normed_space_tokenization,
                                          normed_target_tokenization)
        # Project the span (given over space tokens) onto the target tokenization.
        target_span = aligner.project_span(self.span[0], self.span[1])
        return TokenizedExample(
            guid=self.guid,
            tokens=target_tokenization,
            span=target_span,
            span_text=" ".join(target_tokenization[target_span[0]:target_span[1]]),
            label_ids=[self.task.LABEL_TO_ID[label] for label in self.labels],
            label_num=len(self.task.LABELS),
        )
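A hedged sketch of the span projection used above, assuming jiant.utils.retokenize is the module providing TokenAligner and that project_span maps an exclusive span over the first tokenization to an exclusive span over the second (the token lists are illustrative):

from jiant.utils import retokenize  # assumed import path

# Both lists are already "normalized": their concatenations are the same string.
normed_space = ["incomprehensibility", "of", "accounting"]
normed_target = ["in", "comprehensib", "ility", "of", "account", "ing"]
aligner = retokenize.TokenAligner(normed_space, normed_target)
# Project the space-token span [0, 1) ("incomprehensibility") onto the target tokens;
# expected to be roughly (0, 3), i.e. the pieces "in", "comprehensib", "ility".
print(aligner.project_span(0, 1))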
Example 6
File: wic.py Project: zphang/jiant
        def tokenize_span(tokenizer, sentence: str, char_span: ExclusiveSpan):
            """Tokenizes sentence and projects char_span to token span.

            Args:
                tokenizer (transformers.PreTrainedTokenizer): Tokenizer used
                sentence (str): Sentence to be tokenized
                char_span (ExclusiveSpan): character indexed span for sentence

            Returns:
                sentence_target_tokenization (List[str]): tokenized sentence
                target_span (ExclusiveSpan): token span for sentence
            """
            span_start_idx = len(sentence[:char_span.start].split())
            # If the word containing the span starts with punctuation (e.g. "'takeaway"),
            # slicing the sentence at char_span.start leaves that punctuation at the end of
            # the prefix, so the prefix .split() counts one token too many
            # (e.g. "'takeaway" -> "'" + "takeaway").
            # For span alignment, start the token index at the punctuation instead.
            if span_start_idx != 0 and sentence[:char_span.start][-1] in string.punctuation:
                span_start_idx = span_start_idx - 1
            span_text = sentence[char_span.start:char_span.end]

            sentence_space_tokenization = sentence.split()
            sentence_target_tokenization = tokenizer.tokenize(sentence)
            (
                sentence_normed_space_tokenization,
                sentence_normed_target_tokenization,
            ) = normalize_tokenizations(sentence_space_tokenization,
                                        sentence_target_tokenization,
                                        tokenizer)
            span_start_char = len(" ".join(
                sentence_normed_space_tokenization[:span_start_idx]))
            span_text_char = len(span_text)
            aligner = retokenize.TokenAligner(
                sentence_normed_space_tokenization,
                sentence_normed_target_tokenization)
            target_span = ExclusiveSpan(*aligner.project_char_to_token_span(
                span_start_char, span_start_char + span_text_char))
            return sentence_target_tokenization, target_span
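A hedged usage sketch for tokenize_span (in wic.py it is a nested helper, so this is illustrative only); it assumes ExclusiveSpan is a (start, end) named tuple of character offsets and uses roberta-base purely as an example tokenizer:

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")  # illustrative choice
sentence = "The board approved the merger ."
char_span = ExclusiveSpan(start=4, end=9)  # assumed named-tuple construction; covers "board"
tokens, token_span = tokenize_span(tokenizer, sentence, char_span)
# The projected token span should cover the piece(s) of "board" in the model tokenization.
print(tokens[token_span.start:token_span.end])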
Example 7
    def tokenize(self, tokenizer):
        space_tokenization = self.text.split()
        target_tokenization = tokenizer.tokenize(self.text)
        # Normalize both tokenizations so they can be aligned against each other.
        normed_space_tokenization, normed_target_tokenization = normalize_tokenizations(
            space_tokenization, target_tokenization, tokenizer)
        aligner = retokenize.TokenAligner(normed_space_tokenization,
                                          normed_target_tokenization)
        # The span indices refer to space tokens; project each span onto the target tokenization.
        span1_token_count = len(self.span1_text.split())
        span2_token_count = len(self.span2_text.split())
        target_span1 = ExclusiveSpan(
            *aligner.project_token_span(self.span1_idx, self.span1_idx + span1_token_count))
        target_span2 = ExclusiveSpan(
            *aligner.project_token_span(self.span2_idx, self.span2_idx + span2_token_count))
        return TokenizedExample(
            guid=self.guid,
            tokens=target_tokenization,
            span1_span=target_span1,
            span2_span=target_span2,
            span1_text=self.span1_text,
            span2_text=self.span2_text,
            label_id=WSCTask.LABEL_TO_ID[self.label],
        )