def featurize(self, tokenizer, feat_spec):
    special_tokens_count = 2  # CLS, SEP

    (tokens,) = truncate_sequences(
        tokens_ls=[self.tokens],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )

    unpadded_tokens = tokens + [tokenizer.sep_token]
    unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)
    unpadded_inputs = add_cls_token(
        unpadded_tokens=unpadded_tokens,
        unpadded_segment_ids=unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    # exclusive spans are converted to inclusive spans for use with SelfAttentiveSpanExtractor
    span1_span = ExclusiveSpan(
        start=self.span1_span[0] + unpadded_inputs.cls_offset,
        end=self.span1_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()
    span2_span = ExclusiveSpan(
        start=self.span2_span[0] + unpadded_inputs.cls_offset,
        end=self.span2_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()

    assert span1_span.end <= len(
        tokens
    ), "Span 1 spans beyond max_seq_len, consider raising max_seq_len"
    assert span2_span.end <= len(
        tokens
    ), "Span 2 spans beyond max_seq_len, consider raising max_seq_len"

    binary_label_ids = np.zeros((self.label_num,), dtype=int)
    for label_id in self.label_ids:
        binary_label_ids[label_id] = 1

    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        spans=np.array([span1_span, span2_span]),
        label_ids=binary_label_ids,
        tokens=unpadded_inputs.unpadded_tokens,
        span1_text=self.span1_text,
        span2_text=self.span2_text,
    )
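
# Hedged illustration (values are made up, not from any dataset) of the
# multi-label target built above: with self.label_num == 5 and
# self.label_ids == [0, 3], binary_label_ids would be np.array([1, 0, 0, 1, 0]).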
def get_token_span(sentence, span: ExclusiveSpan, tokenizer):
    tokenized = tokenizer.tokenize(sentence)
    tokenized_start1 = tokenizer.tokenize(sentence[: span.start])
    tokenized_start2 = tokenizer.tokenize(sentence[: span.end])
    assert starts_with(tokenized, tokenized_start1)
    # assert starts_with(tokenized, tokenized_start2)  # <- fails because of "does" in "doesn't"
    word = sentence[span.to_slice()]
    assert word.lower().replace(" ", "") in delegate_flat_strip(
        tokenized_start2[len(tokenized_start1):],
        tokenizer=tokenizer,
    )
    token_span = ExclusiveSpan(start=len(tokenized_start1), end=len(tokenized_start2))
    return tokenized, token_span
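
# Hedged usage sketch (not part of the original module): how get_token_span
# might be called. The tokenizer name is an assumption -- any HuggingFace
# tokenizer with a .tokenize() method should work.
#
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     sentence = "Approach a task."
#     char_span = ExclusiveSpan(start=0, end=8)  # character span of "Approach"
#     tokens, token_span = get_token_span(sentence, char_span, tokenizer)
#     # `token_span` indexes into `tokens` (exclusive end), so
#     # tokens[token_span.to_slice()] recovers the sub-tokens of "Approach".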
def featurize(self, tokenizer, feat_spec):
    special_tokens_count = 2  # CLS, SEP

    (tokens,) = truncate_sequences(
        tokens_ls=[self.tokens],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )

    unpadded_tokens = tokens + [tokenizer.sep_token]
    # use the truncated token list so segment ids stay aligned with unpadded_tokens
    unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)
    unpadded_inputs = add_cls_token(
        unpadded_tokens=unpadded_tokens,
        unpadded_segment_ids=unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    span1_span = ExclusiveSpan(
        start=self.span1_span[0] + unpadded_inputs.cls_offset,
        end=self.span1_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()
    span2_span = ExclusiveSpan(
        start=self.span2_span[0] + unpadded_inputs.cls_offset,
        end=self.span2_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()

    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        spans=np.array([span1_span, span2_span]),
        label_id=self.label_id,
        tokens=unpadded_inputs.unpadded_tokens,
        span1_text=self.span1_text,
        span2_text=self.span2_text,
    )
def _create_examples(cls, path, set_type):
    for i, row in enumerate(py_io.read_jsonl(path)):
        yield Example(
            guid="%s-%s" % (set_type, i),
            tokenized_text=row["tokenized_text"],
            masked_spans=[ExclusiveSpan(start, end) for start, end in row["masked_spans"]],
        )
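
# Hedged sketch of the jsonl row shape that _create_examples above reads via
# py_io.read_jsonl: "masked_spans" must be an iterable of [start, end] pairs;
# treating "tokenized_text" as a token list is an assumption, and the values
# below are purely illustrative.
#
#     {"tokenized_text": ["The", "quick", "brown", "fox"],
#      "masked_spans": [[1, 3]]}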
def _create_examples(cls, lines, set_type):
    examples = []
    for line in lines:
        span1 = ExclusiveSpan(int(line["start1"]), int(line["end1"]))
        span2 = ExclusiveSpan(int(line["start2"]), int(line["end2"]))
        # Note: the chosen word may be different (e.g. different tenses) in sent1 and sent2,
        # hence we don't do an assert here.
        examples.append(
            Example(
                guid="%s-%s" % (set_type, line["idx"]),
                sentence1=line["sentence1"],
                sentence2=line["sentence2"],
                word=line["word"],
                span1=span1,
                span2=span2,
                label=line["label"] if set_type != "test" else cls.LABELS[-1],
            )
        )
    return examples
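
# Hedged sketch of a single `line` as consumed by _create_examples above.
# The field names mirror the keys accessed in the loop; the values are
# illustrative, not taken from the dataset.
#
#     {"word": "approach",
#      "sentence1": "Approach a task.",
#      "sentence2": "To approach the city.",
#      "start1": 0, "end1": 8,
#      "start2": 3, "end2": 11,
#      "idx": 0, "label": False}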
def extract_char_span(full_text, span_text, space_index):
    space_tokens = full_text.split()
    extracted_span_text = space_tokens[space_index]
    assert extracted_span_text.lower() in full_text.lower()
    span_length = len(span_text)
    if space_index == 0:
        start = 0
    else:
        start = len(" ".join(space_tokens[:space_index])) + 1
    # exclusive span
    return ExclusiveSpan(start=start, end=start + span_length)
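
# Worked example for extract_char_span (a minimal sketch; the sentence is
# illustrative). "task" is the third space-delimited token (space_index=2),
# and its characters start after "Approach a " (10 characters plus 1 space):
#
#     span = extract_char_span("Approach a task.", "task", space_index=2)
#     # span == ExclusiveSpan(start=11, end=15)
#     # "Approach a task."[span.to_slice()] == "task"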
def tokenize(self, tokenizer):
    space_tokenization = self.text.split()
    target_tokenization = tokenizer.tokenize(self.text)
    normed_space_tokenization, normed_target_tokenization = normalize_tokenizations(
        space_tokenization, target_tokenization, tokenizer
    )
    aligner = retokenize.TokenAligner(normed_space_tokenization, normed_target_tokenization)
    span1_token_count = len(self.span1_text.split())
    span2_token_count = len(self.span2_text.split())
    target_span1 = ExclusiveSpan(
        *aligner.project_token_span(self.span1_idx, self.span1_idx + span1_token_count)
    )
    target_span2 = ExclusiveSpan(
        *aligner.project_token_span(self.span2_idx, self.span2_idx + span2_token_count)
    )
    return TokenizedExample(
        guid=self.guid,
        tokens=target_tokenization,
        span1_span=target_span1,
        span2_span=target_span2,
        span1_text=self.span1_text,
        span2_text=self.span2_text,
        label_id=WSCTask.LABEL_TO_ID[self.label],
    )
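
# Hedged note on the projection above (our reading of the code, not original
# documentation): span1_idx / span2_idx index into the whitespace tokenization,
# and TokenAligner.project_token_span maps the half-open space-token range
# [idx, idx + token_count) onto the corresponding range in the target
# tokenizer's tokens, which ExclusiveSpan then wraps.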
def tokenize_span(tokenizer, sentence: str, char_span: ExclusiveSpan):
    """Tokenizes sentence and projects char_span to a token span.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use
        sentence (str): Sentence to be tokenized
        char_span (ExclusiveSpan): Character-indexed span for sentence

    Returns:
        sentence_target_tokenization (List[str]): Tokenized sentence
        target_span (ExclusiveSpan): Token span for sentence
    """
    span_start_idx = len(sentence[: char_span.start].split())
    # If the first word in a span starts with punctuation, the first word will
    # erroneously be split into two strings by .split(),
    # e.g. "'takeaway" -> ["'", "takeaway"].
    # For span alignment, we start the list index at the punctuation.
    if (span_start_idx != 0) and (sentence[: char_span.start][-1] in string.punctuation):
        span_start_idx = span_start_idx - 1
    span_text = sentence[char_span.start : char_span.end]

    sentence_space_tokenization = sentence.split()
    sentence_target_tokenization = tokenizer.tokenize(sentence)
    (
        sentence_normed_space_tokenization,
        sentence_normed_target_tokenization,
    ) = normalize_tokenizations(
        sentence_space_tokenization, sentence_target_tokenization, tokenizer
    )
    span_start_char = len(" ".join(sentence_normed_space_tokenization[:span_start_idx]))
    span_text_char = len(span_text)

    aligner = retokenize.TokenAligner(
        sentence_normed_space_tokenization, sentence_normed_target_tokenization
    )
    target_span = ExclusiveSpan(
        *aligner.project_char_to_token_span(span_start_char, span_start_char + span_text_char)
    )
    return sentence_target_tokenization, target_span
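
# Hedged usage sketch for tokenize_span (assumes a HuggingFace tokenizer; the
# exact subword boundaries depend on the model's vocabulary):
#
#     sentence = "If you're hungry, there's a takeaway just around the corner."
#     char_span = ExclusiveSpan(start=28, end=36)  # characters of "takeaway"
#     tokens, token_span = tokenize_span(tokenizer, sentence, char_span)
#     # `token_span` is an ExclusiveSpan into `tokens`, i.e.
#     # tokens[token_span.to_slice()] covers the sub-tokens of "takeaway".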
def featurize(self, tokenizer, feat_spec):
    if feat_spec.sep_token_extra:
        maybe_extra_sep = [tokenizer.sep_token]
        maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
        special_tokens_count = 6  # CLS, SEP-SEP, SEP-SEP, SEP
    else:
        maybe_extra_sep = []
        maybe_extra_sep_segment_id = []
        special_tokens_count = 4  # CLS, SEP, SEP, SEP

    sentence1_tokens, sentence2_tokens = truncate_sequences(
        tokens_ls=[self.sentence1_tokens, self.sentence2_tokens],
        max_length=feat_spec.max_seq_length - len(self.word) - special_tokens_count,
    )

    unpadded_tokens = (
        self.word
        + [tokenizer.sep_token]
        + maybe_extra_sep
        + sentence1_tokens
        + [tokenizer.sep_token]
        + maybe_extra_sep
        + sentence2_tokens
        + [tokenizer.sep_token]
    )
    # Don't have a choice here -- just leave words as part of sent1
    unpadded_segment_ids = (
        [feat_spec.sequence_a_segment_id] * (len(self.word) + 1)
        + maybe_extra_sep_segment_id
        + [feat_spec.sequence_a_segment_id] * (len(sentence1_tokens) + 1)
        + maybe_extra_sep_segment_id
        + [feat_spec.sequence_b_segment_id] * (len(sentence2_tokens) + 1)
    )
    unpadded_inputs = add_cls_token(
        unpadded_tokens=unpadded_tokens,
        unpadded_segment_ids=unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    word_sep_offset = 2 if feat_spec.sep_token_extra else 1
    sent1_sep_offset = 2 if feat_spec.sep_token_extra else 1

    # Both should be inclusive spans at the end
    sentence1_span = ExclusiveSpan(
        start=self.sentence1_span[0]
        + unpadded_inputs.cls_offset
        + word_sep_offset
        + len(self.word),
        end=self.sentence1_span[1]
        + unpadded_inputs.cls_offset
        + word_sep_offset
        + len(self.word),
    ).to_inclusive()
    sentence2_span = ExclusiveSpan(
        start=self.sentence2_span[0]
        + unpadded_inputs.cls_offset
        + word_sep_offset
        + sent1_sep_offset
        + len(self.word)
        + len(sentence1_tokens),
        end=self.sentence2_span[1]
        + unpadded_inputs.cls_offset
        + word_sep_offset
        + sent1_sep_offset
        + len(self.word)
        + len(sentence1_tokens),
    ).to_inclusive()

    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        spans=np.array([sentence1_span, sentence2_span]),
        label_id=self.label_id,
        tokens=unpadded_inputs.unpadded_tokens,
        word=self.word,
    )
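
# Hedged illustration (our reading of featurize above, without sep_token_extra)
# of the sequence layout it builds:
#
#     [CLS] <word tokens> [SEP] <sentence1 tokens> [SEP] <sentence2 tokens> [SEP]
#
# which is why sentence1_span is shifted by cls_offset + 1 SEP + len(word),
# and sentence2_span additionally by 1 more SEP + len(sentence1_tokens).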
from collections import Counter

from jiant.tasks.utils import ExclusiveSpan
from jiant.tasks.lib.wic import Example, TokenizedExample
from jiant.utils.testing.tokenizer import SimpleSpaceTokenizer


EXAMPLES = [
    Example(
        guid="train-1",
        sentence1="Approach a task.",
        sentence2="To approach the city.",
        word="approach",
        span1=ExclusiveSpan(start=0, end=8),
        span2=ExclusiveSpan(start=3, end=11),
        label=False,
    ),
    Example(
        guid="train-2",
        sentence1="In England they call takeout food 'takeaway'.",
        sentence2="If you're hungry, there's a takeaway just around the corner.",
        word="takeaway",
        span1=ExclusiveSpan(start=35, end=43),
        span2=ExclusiveSpan(start=28, end=36),
        label=True,
    ),
]

TOKENIZED_EXAMPLES = [
    TokenizedExample(
        guid="train-1",