Example #1
def gen_tf_record():
    sequence_length = 300
    data_loader = get_biobert_nli_data_loader(sequence_length)
    todo = [("train", [data_loader.train_file]),
            ("dev", [data_loader.dev_file])]
    batch_size = 32
    dir_path = os.path.join(output_path,
                            "biobert_mnli_{}".format(sequence_length))
    exist_or_mkdir(dir_path)

    for name, files in todo[::-1]:
        output_file = os.path.join(dir_path, name)
        writer = RecordWriterWrap(output_file)
        for file in files:
            for e in data_loader.example_generator(file):
                f = entry_to_feature_dict(e)
                f["is_real_example"] = create_int_feature([1])
                writer.write_feature(f)

        if name == "dev":
            while writer.total_written % batch_size != 0:
                f["is_real_example"] = create_int_feature([0])
                writer.write_feature(f)

        writer.close()

        print("Wrote %d total instances" % writer.total_written)
Example #2
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0

    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']

        for ranked_entry in ranked_list[query_id_str]:
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)
            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer,
                                               max_seq_length,
                                               tokens,
                                               segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)

    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
Example #3
def encode_two_inputs(max_seq_length, tokenizer,
                      inst: PayloadAsTokens) -> OrderedDict:
    tokens_1_1: List[str] = inst.text1
    tokens_1_2: List[str] = inst.text2
    tokens_2_1: List[str] = tokens_1_2

    max_seg2_len = max_seq_length - 3 - len(tokens_2_1)

    tokens_2_2 = inst.passage[:max_seg2_len]

    def combine(tokens1, tokens2):
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = int(effective_length / 2 + 1)
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    tokens_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    tokens_B, segment_ids_B = combine(tokens_2_1, tokens_2_2)

    features = combine_features_B(tokens_A, segment_ids_A, tokens_B,
                                  segment_ids_B, tokenizer, max_seq_length)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
Example #4
def encode_classification_instance_w_data_id(
        tokenizer, max_seq_length,
        inst: ClassificationInstanceWDataID) -> OrderedDict:
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length,
                                                   inst.tokens, inst.seg_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature
Example #5
def entry_to_feature_dict(e):
    input_ids, input_mask, segment_ids, label = e
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([label])
    return features
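Note: records produced by entry_to_feature_dict can be read back with a matching feature spec. A minimal sketch, assuming sequence_length matches the value used at write time; the file path is hypothetical.

import tensorflow as tf

sequence_length = 300  # assumption: must match the writer's setting
name_to_features = {
    "input_ids": tf.io.FixedLenFeature([sequence_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([sequence_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([sequence_length], tf.int64),
    "label_ids": tf.io.FixedLenFeature([], tf.int64),
}

def decode_record(record):
    return tf.io.parse_single_example(record, name_to_features)

dataset = tf.data.TFRecordDataset("biobert_mnli_300/train").map(decode_record)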
Example #6
def pairwise_entry_to_feature_dict(pair):
    features = collections.OrderedDict()
    for idx, e in enumerate(pair):
        input_ids, input_mask, segment_ids, label = e
        features["input_ids" + str(idx + 1)] = create_int_feature(input_ids)
        features["input_mask" + str(idx + 1)] = create_int_feature(input_mask)
        features["segment_ids" +
                 str(idx + 1)] = create_int_feature(segment_ids)
        features["label_ids" + str(idx + 1)] = create_int_feature([label])
    return features
Example #7
def encode_inst_as_input_ids(max_seq_length,
                             inst: InstAsInputIds) -> OrderedDict:
    # this pads input_ids
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        inst.input_ids, inst.seg_ids, max_seq_length)
    feature = ordered_dict_from_input_segment_mask_ids(input_ids, input_mask,
                                                       segment_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature
Example #8
def encode_w_data_id(tokenizer, max_seq_length, t: Tuple[str, bool, int]):
    text, is_correct, data_id = t
    tokens1: List[str] = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([int(is_correct)])
    features['data_id'] = create_int_feature([int(data_id)])
    return features
Example #9
    def write(self, insts: List[Instance], out_path):
        writer = RecordWriterWrap(out_path)
        for inst in insts:
            feature = get_basic_input_feature(self.tokenizer,
                                              self.max_seq_length, inst.tokens,
                                              inst.seg_ids)
            feature["data_id"] = create_int_feature([int(inst.data_id)])
            feature["label_ids"] = create_int_feature([int(inst.label)])
            writer.write_feature(feature)

        writer.close()
Example #10
    def encode(inst: Instance) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.text1)
        max_seg2_len = max_seq_length - 3 - len(tokens1)
        tokens2 = tokenizer.tokenize(inst.text2)[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([inst.label])
        features['data_id'] = create_int_feature([inst.data_id])
        return features
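Note: a tiny worked example of the pair encoding used above, with illustrative values only. For tokens1 = ["what", "is", "bm25"] and tokens2 = ["a", "ranking", "function"], the result before padding is:

tokens = ["[CLS]", "what", "is", "bm25", "[SEP]", "a", "ranking", "function", "[SEP]"]
segment_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1]  # len(tokens1) + 2 zeros, len(tokens2) + 1 ones
assert len(tokens) == len(segment_ids)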
Example #11
def encode_query_doc_instance(tokenizer, doc_token_length,
                              inst: QueryDocInstance) -> OrderedDict:
    doc_segment_ids = [1] * len(inst.doc_tokens)
    doc_input_ids, doc_input_mask, doc_segment_ids \
        = get_basic_input_feature_as_list(tokenizer, doc_token_length, inst.doc_tokens, doc_segment_ids)

    feature = collections.OrderedDict()
    feature['query'] = create_int_feature(
        tokenizer.convert_tokens_to_ids(inst.query_tokens))
    feature['doc'] = create_int_feature(doc_input_ids)
    feature['doc_mask'] = create_int_feature(doc_input_mask)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature
Example #12
 def encode(inst: Payload) -> OrderedDict:
     tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
     max_seg2_len = max_seq_length - 3 - len(tokens1)
     tokens2 = tokenize_from_tokens(inst.passage)[:max_seg2_len]
     tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
     segment_ids = [0] * (len(tokens1) + 2) \
                   + [1] * (len(tokens2) + 1)
     tokens = tokens[:max_seq_length]
     segment_ids = segment_ids[:max_seq_length]
     features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                        segment_ids)
     features['label_ids'] = create_int_feature([inst.is_correct])
     features['data_id'] = create_int_feature([inst.data_id])
     return features
Example #13
    def encode_fn(self, inst: QKInstance) -> OrderedDict:
        max_seq_length = self.max_seq_length
        tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
        max_seg2_len = self.max_seq_length - 3 - len(tokens1)

        tokens2 = inst.doc_tokens[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features
Example #14
    def encode_fn(self, inst: TokenScoringInstance) -> OrderedDict:
        max_seq_length = self.max_seq_length
        tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
        max_seg2_len = self.max_seq_length - 3 - len(tokens1)

        tokens2, scores = self.tokenize_from_tokens_w_scores(
            inst.doc_tokens, inst.score)
        tokens2 = tokens2[:max_seg2_len]
        scores: ScoreVector = scores[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(self.tokenizer, max_seq_length,
                                           tokens, segment_ids)

        score_vector = pad_score_vector(scores, max_seq_length, len(tokens1))
        if len(score_vector) != max_seq_length:
            print(score_vector)
            print(len(score_vector))
            print(max_seq_length)
            print(len(scores))
            print(scores)
        assert len(score_vector) == max_seq_length
        features['label_ids'] = score_vector_to_feature(score_vector)
        features['data_id'] = create_int_feature([inst.data_id])
        return features
Example #15
 def encode(inst: TextInstance) -> OrderedDict:
     tokens = tokenizer.tokenize(inst.text)
     max_len = max_seq_length - 2
     if len(tokens) > max_len:
         nonlocal long_count
         long_count = long_count + 1
         if long_count > 10 and long_warning:
             print("long text count", long_count)
     tokens = tokens[:max_len]
     tokens = ["[CLS]"] + tokens + ["[SEP]"]
     seg_ids = [0] * len(tokens)
     feature: OrderedDict = get_basic_input_feature(tokenizer,
                                                    max_seq_length, tokens,
                                                    seg_ids)
     feature['label_ids'] = create_int_feature([inst.label])
     feature['data_id'] = create_int_feature([inst.data_id])
     return feature
Example #16
def encode_three_inputs(max_seq_length_list: List[int], tokenizer,
                        inst: PayloadAsTokens) -> OrderedDict:
    tokens1: List[str] = inst.text1
    tokens2: List[str] = inst.text2
    tokens3: List[str] = inst.passage

    tokens_list = [tokens1, tokens2, tokens3]
    features = collections.OrderedDict()
    for i in range(3):
        input_ids, input_mask, segment_ids = encode_single(
            tokenizer, tokens_list[i], max_seq_length_list[i])
        features["input_ids{}".format(i)] = input_ids
        features["input_mask{}".format(i)] = input_mask
        features["segment_ids{}".format(i)] = segment_ids

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
Example #17
 def encode(self, inst: Instance) -> OrderedDict:
     if not self.reverse:
         tokens1 = self.get_p_tokens(inst.pid)
         tokens2 = inst.sent
     else:
         tokens1 = inst.sent
         tokens2 = self.get_p_tokens(inst.pid)
     tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
     segment_ids = [0] * (len(tokens1) + 2) \
                   + [1] * (len(tokens2) + 1)
     max_seq_length = self.max_seq_length
     tokens = tokens[:max_seq_length]
     segment_ids = segment_ids[:max_seq_length]
     features = get_basic_input_feature(self.tokenizer, max_seq_length,
                                        tokens, segment_ids)
     features['label_ids'] = create_int_feature([0])
     features['data_ids'] = create_int_feature([inst.data_id])
     return features
Example #18
def gen_mismatched():
    sequence_length = 300
    data_loader = get_modified_nli_data_loader(sequence_length)
    dir_path = os.path.join(output_path,
                            "nli_tfrecord_cls_{}".format(sequence_length))
    name = "dev_mis"
    output_file = os.path.join(dir_path, name)
    batch_size = 32
    writer = RecordWriterWrap(output_file)
    for e in data_loader.example_generator(data_loader.dev_file2):
        f = entry_to_feature_dict(e)
        f["is_real_example"] = create_int_feature([1])
        writer.write_feature(f)
    while writer.total_written % batch_size != 0:
        f["is_real_example"] = create_int_feature([0])
        writer.write_feature(f)
    writer.close()
    print("Wrote %d total instances" % writer.total_written)
Example #19
def write_tfrecord(ranked_list_d: RankedListDict, queries: List[Query],
                   q_rels: Dict[str, List[str]], save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0

    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)

        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc,
                                                    doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List, List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
                    label = doc_entry.doc_id in q_rels[query.qid]

                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer,
                                                      max_seq_length,
                                                      input_tokens,
                                                      segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)

                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError as e:
                print("doc {} not found".format(doc_entry.doc_id))

    writer.close()
    return data_info
Example #20
def enc_to_feature(tokenizer, max_seq_length,
                   pc: PerspectiveCandidate) -> OrderedDict:
    seg1 = tokenizer.tokenize(pc.claim_text)
    seg2 = tokenizer.tokenize(pc.p_text)

    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)

    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens,
                                      segment_ids)
    feature["label_ids"] = create_int_feature([int(pc.label)])
    return feature
Example #21
    def encode(inst: PairedInstance) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
        max_seg2_len = max_seq_length - 3 - len(tokens1)

        def concat_tokens(raw_tokens: List[str]):
            tokens2 = tokenize_from_tokens(raw_tokens)[:max_seg2_len]
            tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

            segment_ids = [0] * (len(tokens1) + 2) \
                          + [1] * (len(tokens2) + 1)
            tokens = tokens[:max_seq_length]
            segment_ids = segment_ids[:max_seq_length]
            return tokens, segment_ids

        out_tokens1, seg1 = concat_tokens(inst.passage_good)
        out_tokens2, seg2 = concat_tokens(inst.passage_worse)
        features = combine_features(out_tokens1, seg1, out_tokens2, seg2,
                                    tokenizer, max_seq_length)
        features['strict_good'] = create_int_feature([inst.strict_good])
        features['strict_bad'] = create_int_feature([inst.strict_bad])
        return features
Example #22
def encode_classification_feature(
        max_seq_length, data: Iterable[Tuple[str, str,
                                             int]]) -> Iterable[OrderedDict]:
    tokenizer = get_tokenizer()
    encoder = FirstSegmentAsDoc(max_seq_length)
    for query, text, label in data:
        q_tokens = tokenizer.tokenize(query)
        text_tokens = tokenizer.tokenize(text)
        input_tokens, segment_ids = encoder.encode(q_tokens, text_tokens)[0]
        feature = get_basic_input_feature(tokenizer, max_seq_length,
                                          input_tokens, segment_ids)
        feature['label_ids'] = create_int_feature([label])
        yield feature
Example #23
def encode_inner(max_seq_length, tokenizer,
                 inst: PayloadAsTokens) -> OrderedDict:
    tokens_1: List[str] = inst.text1
    tokens_2: List[str] = inst.text2
    tokens_3: List[str] = inst.passage

    def combine(tokens1, tokens2):
        return combine_with_sep_cls(max_seq_length, tokens1, tokens2)

    features = collections.OrderedDict()
    for tokens_a, tokens_b, postfix in [(tokens_1, tokens_2, ""),
                                        (tokens_2, tokens_3, "2"),
                                        (tokens_1, tokens_3, "3")]:
        tokens, segment_ids = combine(tokens_a, tokens_b)
        input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
            tokenizer, max_seq_length, tokens, segment_ids)

        features["input_ids" + postfix] = create_int_feature(input_ids)
        features["input_mask" + postfix] = create_int_feature(input_mask)
        features["segment_ids" + postfix] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
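Note: combine_with_sep_cls is not shown here. Judging from the combine() closures in Examples #3, #24 and #27, it presumably builds a [CLS]/[SEP] pair with balanced truncation; a sketch under that assumption (the real helper may truncate differently):

from typing import List, Tuple

def combine_with_sep_cls(max_seq_length: int,
                         tokens1: List[str],
                         tokens2: List[str]) -> Tuple[List[str], List[int]]:
    effective_length = max_seq_length - 3  # room for [CLS] and two [SEP]s
    if len(tokens1) + len(tokens2) > effective_length:
        half = int(effective_length / 2 + 1)
        tokens1 = tokens1[:half]
        tokens2 = tokens2[:effective_length - len(tokens1)]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    return tokens[:max_seq_length], segment_ids[:max_seq_length]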
Example #24
def encode_two_input_ids(max_seq_length, tokenizer,
                         inst: PayloadAsIds) -> OrderedDict:
    tokens_1_1: List[int] = inst.text1
    tokens_1_2: List[int] = inst.text2
    tokens_2_1: List[int] = tokens_1_2

    cls_id = tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
    sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
    max_seg2_len = max_seq_length - 3 - len(tokens_2_1)

    tokens_2_2 = inst.passage[:max_seg2_len]

    def combine(tokens1, tokens2):
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = int(effective_length / 2 + 1)
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        input_ids = [cls_id] + tokens1 + [sep_id] + tokens2 + [sep_id]
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        input_ids = input_ids[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return input_ids, segment_ids

    input_ids_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    input_ids_B, segment_ids_B = combine(tokens_2_1, tokens_2_2)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        input_ids_A, segment_ids_A, max_seq_length)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        input_ids_B, segment_ids_B, max_seq_length)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
Example #25
 def encode(inst: Tuple[str, int]) -> OrderedDict:
     text, label = inst
     tokens = tokenizer.tokenize(text)
     max_len = max_seq_length - 2
     if len(tokens) > max_len:
         nonlocal long_count
         long_count = long_count + 1
         if long_count > 10:
             print("long text count", long_count)
     tokens = tokens[:max_len]
     tokens = ["[CLS]"] + tokens + ["[SEP]"]
     seg_ids = [0] * len(tokens)
     feature: OrderedDict = get_basic_input_feature(tokenizer,
                                                    max_seq_length, tokens,
                                                    seg_ids)
     feature['label_ids'] = create_int_feature([label])
     return feature
Example #26
    def enc_to_feature(pc: PerspectiveCandidate) -> OrderedDict:
        emb_model = get_aux_embedding_fn(pc.cid)

        seg1 = tokenizer.tokenize(pc.claim_text)
        seg2 = tokenizer.tokenize(pc.p_text)

        input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]

        aux_emb = get_word_embedding(emb_model, input_tokens, dims)
        aux_emb += (max_seq_length - len(aux_emb)) * [zero_vector]
        aux_emb = np.array(aux_emb)
        flat_aux_emb = np.reshape(aux_emb, [-1])

        segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)

        feature = get_basic_input_feature(tokenizer, max_seq_length,
                                          input_tokens, segment_ids)
        feature["label_ids"] = create_int_feature([int(pc.label)])
        feature["aux_emb"] = create_float_feature(flat_aux_emb)
        return feature
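Note: create_float_feature is the float counterpart of create_int_feature; a minimal sketch, assuming the standard BERT-style definition:

import tensorflow as tf

def create_float_feature(values):
    # Wrap a flat list/array of floats as a float Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))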
Example #27
def encode_three_inputs(max_seq_length, tokenizer,
                        inst: PayloadAsTokens) -> collections.OrderedDict:
    tokens_1_1: List[str] = inst.text1
    tokens_1_2: List[str] = inst.text2
    tokens_2_1: List[str] = tokens_1_2
    tokens_2_2 = inst.passage[:max_seq_length]

    def combine(tokens1, tokens2):
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = int(effective_length / 2 + 1)
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    def fill(tokens1, seg_id):
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
        segment_ids = [seg_id] * (len(tokens1) + 2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    tokens_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    tokens_B, segment_ids_B = fill(tokens_2_1, 0)
    tokens_C, segment_ids_C = fill(tokens_2_2, 1)

    features = collections.OrderedDict()
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_A, segment_ids_A)
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_B, segment_ids_B)
    features["input_ids1"] = create_int_feature(input_ids)
    features["input_mask1"] = create_int_feature(input_mask)
    features["segment_ids1"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_C, segment_ids_C)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features