# NOTE: assumes a `retokenize` module providing `get_aligner_fn` is importable
# from the surrounding package (e.g., `from jiant.utils import retokenize` in
# jiant); adjust the import to your project layout.


def retokenize_record(record, tokenizer_name):
    """Retokenize an edge probing example. Modifies in-place."""
    text = record['text']
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    ta, new_tokens = aligner_fn(text)
    record['text'] = " ".join(new_tokens)
    # Project each target span from source-token indices to new-token indices.
    for target in record['targets']:
        if 'span1' in target:
            target['span1'] = list(map(int, ta.project_span(*target['span1'])))
        if 'span2' in target:
            target['span2'] = list(map(int, ta.project_span(*target['span2'])))
    return record
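# Usage sketch (illustrative): the record below is a hypothetical edge probing
# example, and "bert-base-uncased" is assumed to be a tokenizer name that
# retokenize.get_aligner_fn accepts.
#
#   >>> record = {
#   ...     'text': "Members of the committee applauded .",
#   ...     'targets': [{'label': 'arg', 'span1': [0, 1], 'span2': [4, 5]}],
#   ... }
#   >>> retokenize_record(record, "bert-base-uncased")
#   # record['text'] now holds the retokenized string, and span1/span2 are
#   # projected onto indices over the new tokens.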
def get_tags(text, current_tags, tokenizer_name, tag_dict):
    """Project tags from source tokens onto retokenized tokens.

    Args:
        text: list of source tokens.
        current_tags: list of tags, parallel to `text`.
        tokenizer_name: name understood by retokenize.get_aligner_fn.
        tag_dict: mapping from tag string to integer id.

    Returns:
        Space-joined string of integer tag ids over the retokenized text.
    """
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    assert len(text) == len(current_tags)
    res_tags = []
    # Reserve one extra id for sub-tokens introduced by the tokenizer.
    introduced_tokenizer_tag = len(tag_dict)
    for token, tag in zip(text, current_tags):
        _, new_toks = aligner_fn(token)
        res_tags.append(tag_dict[tag])
        # Based on the BERT paper's WordPiece convention, only the first
        # sub-token of a word keeps its tag; continuation sub-tokens get the
        # introduced tag.
        res_tags.extend([introduced_tokenizer_tag] * (len(new_toks) - 1))
    _, aligned_text = aligner_fn(" ".join(text))
    assert len(aligned_text) == len(res_tags)
    return " ".join(str(tag) for tag in res_tags)
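# Usage sketch (hypothetical inputs): tag ids 0-2 come from `tag_dict`; the id
# 3 (== len(tag_dict)) marks tokenizer-introduced pieces. The exact output
# depends on how the tokenizer splits each word.
#
#   >>> tag_dict = {"O": 0, "B-PER": 1, "I-PER": 2}
#   >>> get_tags(["John", "Johanson", "ran"], ["B-PER", "I-PER", "O"],
#   ...          "bert-base-uncased", tag_dict)
#   '1 2 3 0'  # if "Johanson" splits into two wordpieces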
def realign_spans(record, tokenizer_name):
    """Build the index alignment while tokenizing the input piece by piece.

    Parameters
    ----------
    record : dict with the below fields
        text : str
        targets : list of dictionaries, each with
            label : bool
            span1_index : int, start index of first span
            span1_text : str, text of first span
            span2_index : int, start index of second span
            span2_text : str, text of second span
    tokenizer_name : str

    Returns
    -------
    record : dict with the below fields
        text : str in tokenized form
        targets : list of dictionaries, each with
            label : bool
            span1 : (int, int) of token indices
            span1_text : str, the string
            span2 : (int, int) of token indices
            span2_text : str, the string
    """
    # Find span indices and text.
    text = record["text"].split()
    span1 = record["targets"][0]["span1_index"]
    span1_text = record["targets"][0]["span1_text"]
    span2 = record["targets"][0]["span2_index"]
    span2_text = record["targets"][0]["span2_text"]

    # Construct end indices from the space-tokenized length of each span's text.
    span1 = [span1, span1 + len(span1_text.strip().split())]
    span2 = [span2, span2 + len(span2_text.strip().split())]
    indices = [span1, span2]

    sorted_indices = sorted(indices, key=lambda x: x[0])
    current_tokenization = []
    span_mapping = {}

    # Align the first (leftmost) span to the tokenized text.
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    _, new_tokens = aligner_fn(" ".join(text[: sorted_indices[0][0]]))
    current_tokenization.extend(new_tokens)
    new_span1start = len(current_tokenization)
    _, span_tokens = aligner_fn(" ".join(text[sorted_indices[0][0] : sorted_indices[0][1]]))
    current_tokenization.extend(span_tokens)
    new_span1end = len(current_tokenization)
    span_mapping[sorted_indices[0][0]] = [new_span1start, new_span1end]

    # Re-index the second span.
    _, new_tokens = aligner_fn(" ".join(text[sorted_indices[0][1] : sorted_indices[1][0]]))
    current_tokenization.extend(new_tokens)
    new_span2start = len(current_tokenization)
    _, span_tokens = aligner_fn(" ".join(text[sorted_indices[1][0] : sorted_indices[1][1]]))
    current_tokenization.extend(span_tokens)
    new_span2end = len(current_tokenization)
    span_mapping[sorted_indices[1][0]] = [new_span2start, new_span2end]

    # Save back into the record.
    _, all_text = aligner_fn(" ".join(text))
    record["targets"][0]["span1"] = span_mapping[record["targets"][0]["span1_index"]]
    record["targets"][0]["span2"] = span_mapping[record["targets"][0]["span2_index"]]
    record["text"] = " ".join(all_text)
    return record
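# Usage sketch (hypothetical Winograd-style record; field names follow the
# docstring above):
#
#   >>> record = {
#   ...     "text": "The trophy does not fit in the suitcase",
#   ...     "targets": [{"label": True,
#   ...                  "span1_index": 1, "span1_text": "trophy",
#   ...                  "span2_index": 7, "span2_text": "suitcase"}],
#   ... }
#   >>> realign_spans(record, "bert-base-uncased")
#   # record["targets"][0]["span1"] and ["span2"] now hold [start, end) token
#   # indices into the retokenized record["text"].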