import collections
from collections import OrderedDict
from typing import List, Tuple


def encode_three_inputs(max_seq_length, tokenizer, inst: PayloadAsTokens) -> collections.OrderedDict:
    tokens_1_1: List[str] = inst.text1
    tokens_1_2: List[str] = inst.text2
    tokens_2_1: List[str] = tokens_1_2
    tokens_2_2 = inst.passage[:max_seq_length]

    def combine(tokens1, tokens2):
        # Reserve three slots for [CLS] and the two [SEP] tokens.
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = effective_length // 2 + 1
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    def fill(tokens1, seg_id):
        # Single-segment encoding: [CLS] tokens1 [SEP], one segment id throughout.
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
        segment_ids = [seg_id] * (len(tokens1) + 2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    tokens_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    tokens_B, segment_ids_B = fill(tokens_2_1, 0)
    tokens_C, segment_ids_C = fill(tokens_2_2, 1)

    features = collections.OrderedDict()
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_A, segment_ids_A)
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_B, segment_ids_B)
    features["input_ids1"] = create_int_feature(input_ids)
    features["input_mask1"] = create_int_feature(input_mask)
    features["segment_ids1"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_C, segment_ids_C)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
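# --- Reference sketch (assumption) -------------------------------------------
# `create_int_feature` and `get_basic_input_feature_as_list` are imported from
# shared modules in this repo; they are not defined in this file. The sketches
# below show the behavior the call sites assume (standard BERT-style feature
# building); the real implementations may differ.
import tensorflow as tf


def create_int_feature_sketch(values):
    # Wrap a list of ints as a tf.train.Feature, as in the original BERT code.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def get_basic_input_feature_as_list_sketch(tokenizer, max_seq_length, tokens, segment_ids):
    # Convert tokens to ids, build the attention mask, and zero-pad all three
    # lists out to max_seq_length.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(segment_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    return input_ids, input_mask, segment_ids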
def combine_features_B(tokens, segment_ids, tokens2, segment_ids2,
                       tokenizer, max_seq_length) -> collections.OrderedDict:
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens, segment_ids)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens2, segment_ids2)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)
    return features
def encode(inst: Payload) -> OrderedDict:
    # Note: `tokenizer`, `max_seq_length`, `d_max_seq_length`, `num_windows`
    # and `combine_and_pad` are closed-over variables from the enclosing scope.
    tokens_1_1: List[str] = tokenizer.tokenize(inst.text1)
    tokens_1_2: List[str] = tokenizer.tokenize(inst.text2)

    def tokenize_from_tokens_fn(tokens):
        return tokenize_from_tokens(tokenizer, tokens)

    tokens_2_list: List[List[str]] = lmap(tokenize_from_tokens_fn, inst.passage_list)

    tokens, segment_ids = combine_with_sep_cls(max_seq_length, tokens_1_1, tokens_1_2)
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens, segment_ids)

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    def iterate_over(tokens1, tokens2_list) -> Tuple[List[str], List[int]]:
        # Pair `tokens1` with each passage window, then pad with all-[PAD]
        # windows so every instance has exactly `num_windows` segments.
        dummy_tokens = ["[PAD]"] * max_seq_length
        dummy_segment_ids = [0] * max_seq_length

        def make_for_each_window(tokens2):
            tokens, segment_ids = combine_and_pad(tokens1, tokens2)
            return tokens, segment_ids

        tokens_and_segment_ids_list: List[Tuple[List[str], List[int]]] = \
            lmap(make_for_each_window, tokens2_list[:num_windows])
        pad_len = num_windows - len(tokens_and_segment_ids_list)
        tokens_and_segment_ids_list += [(dummy_tokens, dummy_segment_ids)] * pad_len
        tokens_list, segment_ids_list = zip(*tokens_and_segment_ids_list)
        return lflatten(tokens_list), lflatten(segment_ids_list)

    def get_second_feature_parts(tokens1, tokens2_list):
        tokens, segment_ids = iterate_over(tokens1, tokens2_list)
        return get_basic_input_feature_as_list(tokenizer, d_max_seq_length,
                                               tokens, segment_ids)

    input_ids, input_mask, segment_ids = get_second_feature_parts(
        tokens_1_2, tokens_2_list)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_second_feature_parts(
        tokens_1_1, tokens_2_list)
    features["input_ids3"] = create_int_feature(input_ids)
    features["input_mask3"] = create_int_feature(input_mask)
    features["segment_ids3"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
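# --- Reference sketch (assumption) -------------------------------------------
# `lmap`, `lflatten` and `combine_and_pad` come from shared utility modules and
# are not defined here. Sketches of the behavior assumed above; the in-repo
# `combine_and_pad` presumably closes over max_seq_length instead of taking it
# as an argument, and the actual implementations may differ.

def lmap_sketch(fn, items):
    return list(map(fn, items))


def lflatten_sketch(list_of_lists):
    return [x for sub in list_of_lists for x in sub]


def combine_and_pad_sketch(tokens1, tokens2, max_seq_length):
    # [CLS] tokens1 [SEP] tokens2 [SEP], truncated and [PAD]-padded to a fixed
    # max_seq_length so that per-window segments can be concatenated.
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    pad_len = max_seq_length - len(tokens)
    return tokens + ["[PAD]"] * pad_len, segment_ids + [0] * pad_len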
def encode_single(tokenizer, tokens, max_seq_length):
    # Reserve two slots for [CLS] and [SEP].
    effective_length = max_seq_length - 2
    tokens = tokens[:effective_length]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # `tokens` already includes [CLS]/[SEP] here, so the segment ids must
    # match its length exactly.
    segment_ids = [0] * len(tokens)
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens, segment_ids)
    return input_ids, input_mask, segment_ids
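# Illustrative usage (assumes a BERT-style tokenizer exposing `tokenize` and
# `convert_tokens_to_ids`):
#
#   tokens = tokenizer.tokenize("what is the capital of France")
#   input_ids, input_mask, segment_ids = encode_single(tokenizer, tokens, 128)
#   assert len(input_ids) == len(input_mask) == len(segment_ids) == 128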
def encode_query_doc_instance(tokenizer, doc_token_length, inst: QueryDocInstance) -> OrderedDict:
    doc_segment_ids = [1] * len(inst.doc_tokens)
    doc_input_ids, doc_input_mask, doc_segment_ids \
        = get_basic_input_feature_as_list(tokenizer, doc_token_length,
                                          inst.doc_tokens, doc_segment_ids)

    feature = collections.OrderedDict()
    feature['query'] = create_int_feature(
        tokenizer.convert_tokens_to_ids(inst.query_tokens))
    feature['doc'] = create_int_feature(doc_input_ids)
    feature['doc_mask'] = create_int_feature(doc_input_mask)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature
def write_instance(self, instances, output_path):
    writer = RecordWriterWrap(output_path)
    for inst_index, instance in enumerate(instances):
        new_features = collections.OrderedDict()
        feature, contexts = instance
        for key in feature:
            v = take(feature[key])
            new_features[key] = create_int_feature(v[:self.max_seq_length])

        context_input_ids = []
        context_input_mask = []
        context_segment_ids = []
        for tokens in contexts:
            segment_ids = [0] * len(tokens)
            input_ids, input_mask, segment_ids = \
                get_basic_input_feature_as_list(self.tokenizer, self.max_context_len,
                                                tokens, segment_ids)
            context_input_ids.extend(input_ids)
            context_input_mask.extend(input_mask)
            context_segment_ids.extend(segment_ids)

        # Pad with empty contexts so every instance carries exactly
        # self.max_context fixed-length context windows.
        dummy_len = self.max_context - len(contexts)
        for _ in range(dummy_len):
            input_ids, input_mask, segment_ids = \
                get_basic_input_feature_as_list(self.tokenizer, self.max_context_len,
                                                [], [])
            context_input_ids.extend(input_ids)
            context_input_mask.extend(input_mask)
            context_segment_ids.extend(segment_ids)

        new_features["context_input_ids"] = create_int_feature(context_input_ids)
        new_features["context_input_mask"] = create_int_feature(context_input_mask)
        new_features["context_segment_ids"] = create_int_feature(context_segment_ids)
        writer.write_feature(new_features)
        if inst_index < 20:
            log_print_feature(new_features)

    writer.close()
def encode_inner(max_seq_length, tokenizer, inst: PayloadAsTokens) -> OrderedDict:
    tokens_1: List[str] = inst.text1
    tokens_2: List[str] = inst.text2
    tokens_3: List[str] = inst.passage

    def combine(tokens1, tokens2):
        return combine_with_sep_cls(max_seq_length, tokens1, tokens2)

    features = collections.OrderedDict()
    # Encode all three pairings: (text1, text2), (text2, passage), (text1, passage).
    for tokens_a, tokens_b, postfix in [(tokens_1, tokens_2, ""),
                                        (tokens_2, tokens_3, "2"),
                                        (tokens_1, tokens_3, "3")]:
        tokens, segment_ids = combine(tokens_a, tokens_b)
        input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
            tokenizer, max_seq_length, tokens, segment_ids)
        features["input_ids" + postfix] = create_int_feature(input_ids)
        features["input_mask" + postfix] = create_int_feature(input_mask)
        features["segment_ids" + postfix] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
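# --- Reference sketch (assumption) -------------------------------------------
# `combine_with_sep_cls`, used by encode() and encode_inner(), is imported from
# a shared module. Judging from the call sites, it builds a standard BERT pair
# encoding truncated to max_seq_length; the real implementation may differ.

def combine_with_sep_cls_sketch(max_seq_length, tokens1, tokens2):
    effective_length = max_seq_length - 3  # room for [CLS] and two [SEP]
    if len(tokens1) + len(tokens2) > effective_length:
        half = effective_length // 2 + 1
        tokens1 = tokens1[:half]
        tokens2 = tokens2[:effective_length - len(tokens1)]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    return tokens[:max_seq_length], segment_ids[:max_seq_length]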
def get_second_feature_parts(tokens1, tokens2_list):
    # `iterate_over`, `tokenizer` and `d_max_seq_length` are free variables
    # resolved in the enclosing scope.
    tokens, segment_ids = iterate_over(tokens1, tokens2_list)
    return get_basic_input_feature_as_list(tokenizer, d_max_seq_length,
                                           tokens, segment_ids)