def tokenize(self, tokenizer):
    # Tokenize with whitespace and with the target tokenizer, then normalize
    # both tokenizations so the aligner can map between them.
    space_tokenization = self.text.split()
    target_tokenization = tokenizer.tokenize(self.text)
    normed_space_tokenization, normed_target_tokenization = normalize_tokenizations(
        space_tokenization, target_tokenization, tokenizer
    )
    aligner = retokenize.TokenAligner(normed_space_tokenization, normed_target_tokenization)
    # Project the word-indexed span onto the target tokenization.
    target_span = aligner.project_span(self.span[0], self.span[1])
    return TokenizedExample(
        guid=self.guid,
        tokens=target_tokenization,
        span=target_span,
        span_text=" ".join(target_tokenization[target_span[0]:target_span[1]]),
        label_ids=[self.task.LABEL_TO_ID[label] for label in self.labels],
        label_num=len(self.task.LABELS),
    )
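To see what the span projection is doing, here is a minimal sketch of mapping a word-level span to subword indices, assuming BERT-style WordPiece where continuation pieces start with "##" (jiant's TokenAligner handles the general case; this helper is illustrative only):

# Minimal sketch of word-to-subword span projection, assuming BERT-style
# WordPiece ("##" marks continuation pieces). Not jiant's implementation.
from typing import List, Tuple

def project_span(space_tokens: List[str], subword_tokens: List[str],
                 start: int, end: int) -> Tuple[int, int]:
    """Map an exclusive [start, end) span over space tokens to subword indices."""
    starts = []  # subword index where each space token begins
    for i, piece in enumerate(subword_tokens):
        if not piece.startswith("##"):
            starts.append(i)
    starts.append(len(subword_tokens))  # sentinel: end of the last word
    assert len(starts) - 1 == len(space_tokens)  # one start per word
    return starts[start], starts[end]

subwords = ["un", "##aff", "##able", "speech"]
print(project_span("unaffable speech".split(), subwords, 0, 1))  # (0, 3)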
Example 2
File: wic.py  Project: zphang/jiant
def tokenize_span(tokenizer, sentence: str, char_span: ExclusiveSpan):
    """Tokenizes sentence and projects char_span to a token span.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): tokenizer to use
        sentence (str): sentence to be tokenized
        char_span (ExclusiveSpan): character-indexed span within sentence

    Returns:
        sentence_target_tokenization (List[str]): tokenized sentence
        target_span (ExclusiveSpan): token span for sentence
    """
    span_start_idx = len(sentence[:char_span.start].split())
    # If the span's first word starts with punctuation, .split() on the prefix
    # erroneously counts that punctuation as its own string,
    # e.g. 'takeaway' -> ["'", "takeaway"], overshooting the word index by one.
    # For span alignment, we start the list index at the punctuation.
    if (span_start_idx != 0) and (sentence[:char_span.start][-1] in string.punctuation):
        span_start_idx = span_start_idx - 1
    span_text = sentence[char_span.start:char_span.end]

    sentence_space_tokenization = sentence.split()
    sentence_target_tokenization = tokenizer.tokenize(sentence)
    (
        sentence_normed_space_tokenization,
        sentence_normed_target_tokenization,
    ) = normalize_tokenizations(
        sentence_space_tokenization, sentence_target_tokenization, tokenizer
    )
    # Recover character offsets within the normalized space tokenization.
    span_start_char = len(" ".join(sentence_normed_space_tokenization[:span_start_idx]))
    span_text_char = len(span_text)
    aligner = retokenize.TokenAligner(
        sentence_normed_space_tokenization, sentence_normed_target_tokenization
    )
    target_span = ExclusiveSpan(
        *aligner.project_char_to_token_span(span_start_char, span_start_char + span_text_char)
    )
    return sentence_target_tokenization, target_span
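The punctuation correction above is easiest to see in isolation. A self-contained sketch of just that prefix-counting step (the sentence and span are made up for illustration):

import string

sentence = "He said 'takeaway' twice."
char_start = sentence.index("takeaway")  # span starts inside 'takeaway'

# Count space tokens before the span; the leading "'" is counted as its own
# string, so the index overshoots by one and must be walked back.
span_start_idx = len(sentence[:char_start].split())  # 3: ["He", "said", "'"]
if span_start_idx != 0 and sentence[:char_start][-1] in string.punctuation:
    span_start_idx -= 1
print(sentence.split()[span_start_idx])  # "'takeaway'"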
Example 3
def tokenize(self, tokenizer):
    # Tokenize with whitespace and with the target tokenizer, then normalize
    # both tokenizations so the aligner can map between them.
    space_tokenization = self.text.split()
    target_tokenization = tokenizer.tokenize(self.text)
    normed_space_tokenization, normed_target_tokenization = normalize_tokenizations(
        space_tokenization, target_tokenization, tokenizer
    )
    aligner = retokenize.TokenAligner(normed_space_tokenization, normed_target_tokenization)
    # Each span covers as many space tokens as its surface text has words.
    span1_token_count = len(self.span1_text.split())
    span2_token_count = len(self.span2_text.split())
    target_span1 = ExclusiveSpan(
        *aligner.project_token_span(self.span1_idx, self.span1_idx + span1_token_count)
    )
    target_span2 = ExclusiveSpan(
        *aligner.project_token_span(self.span2_idx, self.span2_idx + span2_token_count)
    )
    return TokenizedExample(
        guid=self.guid,
        tokens=target_tokenization,
        span1_span=target_span1,
        span2_span=target_span2,
        span1_text=self.span1_text,
        span2_text=self.span2_text,
        label_id=WSCTask.LABEL_TO_ID[self.label],
    )
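For context, span1_idx and span2_idx are word indices into the whitespace tokenization, and the exclusive spans are built from the word counts of the surface texts before being projected to subwords. A toy illustration with made-up WSC-style field values:

# Toy WSC-style example (field values are illustrative, not from the dataset):
text = "Mark told Pete many lies about himself ."
span1_idx, span1_text = 0, "Mark"
span2_idx, span2_text = 6, "himself"

# Exclusive word spans: start index plus the number of words in the span text.
span1 = (span1_idx, span1_idx + len(span1_text.split()))  # (0, 1)
span2 = (span2_idx, span2_idx + len(span2_text.split()))  # (6, 7)
words = text.split()
print(words[span1[0]:span1[1]], words[span2[0]:span2[1]])  # ['Mark'] ['himself']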