Code example #1
File: qck_dl_genenerator.py  Project: clover3/Chair
def encode_three_inputs(max_seq_length, tokenizer,
                        inst: PayloadAsTokens) -> collections.OrderedDict:
    # Input A pairs text1 with text2; input B repeats text2 alone as
    # segment 0, and input C holds the passage alone as segment 1.
    tokens_1_1: List[str] = inst.text1
    tokens_1_2: List[str] = inst.text2
    tokens_2_1: List[str] = tokens_1_2
    tokens_2_2 = inst.passage[:max_seq_length]

    def combine(tokens1, tokens2):
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = int(effective_length / 2 + 1)
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    def fill(tokens1, seg_id):
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
        segment_ids = [seg_id] * (len(tokens1) + 2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    tokens_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    tokens_B, segment_ids_B = fill(tokens_2_1, 0)
    tokens_C, segment_ids_C = fill(tokens_2_2, 1)

    features = collections.OrderedDict()
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_A, segment_ids_A)
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_B, segment_ids_B)
    features["input_ids1"] = create_int_feature(input_ids)
    features["input_mask1"] = create_int_feature(input_mask)
    features["segment_ids1"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_C, segment_ids_C)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
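All of these examples depend on helpers from the Chair project whose definitions are not shown. The sketch below is a minimal reconstruction inferred only from the call sites, so the bodies (and the PayloadAsTokens fields) are assumptions rather than the project's actual code: get_basic_input_feature_as_list appears to convert tokens to vocabulary ids and pad ids, mask, and segment ids out to max_seq_length, and create_int_feature appears to wrap an int list as a tf.train.Feature, following the convention of BERT's run_classifier.py.

import tensorflow as tf
from typing import List, NamedTuple


class PayloadAsTokens(NamedTuple):
    # Fields inferred from how inst is accessed in examples #1 and #7.
    text1: List[str]
    text2: List[str]
    passage: List[str]
    is_correct: int
    data_id: int


def get_basic_input_feature_as_list(tokenizer, max_seq_length, tokens, segment_ids):
    # Convert tokens to vocabulary ids, then zero-pad ids, mask, and
    # segment ids up to max_seq_length.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(segment_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    return input_ids, input_mask, segment_ids


def create_int_feature(values):
    # Wrap an int list as a tf.train.Feature for TFRecord serialization.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))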
Code example #2
File: pairwise_common.py  Project: clover3/Chair
def combine_features_B(tokens, segment_ids, tokens2, segment_ids2, tokenizer,
                       max_seq_length) -> collections.OrderedDict:
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens, segment_ids)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens2, segment_ids2)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)
    return features
Code example #3
File: multi_evidence.py  Project: clover3/Chair
    def encode(inst: Payload) -> OrderedDict:
        # tokenizer, max_seq_length, d_max_seq_length, and num_windows are
        # captured from the enclosing function in the original source.
        tokens_1_1: List[str] = tokenizer.tokenize(inst.text1)
        tokens_1_2: List[str] = tokenizer.tokenize(inst.text2)

        def tokenize_from_tokens_fn(tokens):
            return tokenize_from_tokens(tokenizer, tokens)

        tokens_2_list: List[List[str]] = lmap(tokenize_from_tokens_fn,
                                              inst.passage_list)

        tokens, segment_ids = combine_with_sep_cls(max_seq_length, tokens_1_1,
                                                   tokens_1_2)
        input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
            tokenizer, max_seq_length, tokens, segment_ids)
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)

        def iterate_over(tokens1, tokens2_list) -> Tuple[List[str], List[int]]:
            dummy_tokens = ["[PAD]"] * max_seq_length
            dummy_segment_ids = [0] * max_seq_length

            def make_for_each_window(tokens2):
                tokens, segment_ids = combine_and_pad(tokens1, tokens2)
                return tokens, segment_ids

            tokens_and_segment_ids_list: List[Tuple[List[str], List[int]]] = \
                lmap(make_for_each_window, tokens2_list[:num_windows])

            pad_len = num_windows - len(tokens_and_segment_ids_list)
            tokens_and_segment_ids_list += [(dummy_tokens, dummy_segment_ids)
                                            ] * pad_len
            tokens_list, segment_ids_list = zip(*tokens_and_segment_ids_list)
            return lflatten(tokens_list), lflatten(segment_ids_list)

        def get_second_feature_parts(tokens1, tokens2_list):
            tokens, segment_ids = iterate_over(tokens1, tokens2_list)
            return get_basic_input_feature_as_list(tokenizer, d_max_seq_length,
                                                   tokens, segment_ids)

        input_ids, input_mask, segment_ids = get_second_feature_parts(
            tokens_1_2, tokens_2_list)
        features["input_ids2"] = create_int_feature(input_ids)
        features["input_mask2"] = create_int_feature(input_mask)
        features["segment_ids2"] = create_int_feature(segment_ids)

        input_ids, input_mask, segment_ids = get_second_feature_parts(
            tokens_1_1, tokens_2_list)
        features["input_ids3"] = create_int_feature(input_ids)
        features["input_mask3"] = create_int_feature(input_mask)
        features["segment_ids3"] = create_int_feature(segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features
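Example #3 also relies on small functional helpers whose definitions are not in these excerpts. The semantics below are guessed from usage alone (a list-returning map and a one-level flatten), so treat these as assumptions:

from typing import Callable, Iterable, List, TypeVar

A = TypeVar("A")
B = TypeVar("B")


def lmap(fn: Callable[[A], B], xs: Iterable[A]) -> List[B]:
    # map() that materializes its result as a list
    return list(map(fn, xs))


def lflatten(xss: Iterable[List[A]]) -> List[A]:
    # flatten one level of nesting into a single list
    return [x for xs in xss for x in xs]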
Code example #4
def encode_single(tokenizer, tokens, max_seq_length):
    effective_length = max_seq_length - 2
    tokens = tokens[:effective_length]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # tokens already includes [CLS] and [SEP] at this point, so the
    # segment ids must match its length exactly.
    segment_ids = [0] * len(tokens)
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens, segment_ids)

    return input_ids, input_mask, segment_ids
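A hypothetical usage sketch for encode_single; the BERT FullTokenizer construction and the vocab path are illustrative assumptions, not taken from the project:

# Hypothetical usage; FullTokenizer and "vocab.txt" are illustrative.
tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
tokens = tokenizer.tokenize("a short example sentence")
input_ids, input_mask, segment_ids = encode_single(tokenizer, tokens, 128)
assert len(input_ids) == len(input_mask) == len(segment_ids) == 128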
Code example #5
def encode_query_doc_instance(tokenizer, doc_token_length,
                              inst: QueryDocInstance) -> OrderedDict:
    doc_segment_ids = [1] * len(inst.doc_tokens)
    doc_input_ids, doc_input_mask, doc_segment_ids \
        = get_basic_input_feature_as_list(tokenizer, doc_token_length, inst.doc_tokens, doc_segment_ids)

    feature = collections.OrderedDict()
    feature['query'] = create_int_feature(
        tokenizer.convert_tokens_to_ids(inst.query_tokens))
    feature['doc'] = create_int_feature(doc_input_ids)
    feature['doc_mask'] = create_int_feature(doc_input_mask)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature
Code example #6
    def write_instance(self, instances, output_path):
        writer = RecordWriterWrap(output_path)
        for (inst_index, instance) in enumerate(instances):
            new_features = collections.OrderedDict()
            feature, contexts = instance
            for key in feature:
                # take() is a project helper; by its usage here it unwraps
                # a tf.train.Feature back into a plain int list.
                v = take(feature[key])
                new_features[key] = create_int_feature(v[:self.max_seq_length])

            context_input_ids = []
            context_input_mask = []
            context_segment_ids = []

            for tokens in contexts:
                segment_ids = [0] * len(tokens)
                input_ids, input_mask, segment_ids = \
                    get_basic_input_feature_as_list(self.tokenizer, self.max_context_len, tokens, segment_ids)
                context_input_ids.extend(input_ids)
                context_input_mask.extend(input_mask)
                context_segment_ids.extend(segment_ids)

            dummy_len = self.max_context - len(contexts)
            for _ in range(dummy_len):
                input_ids, input_mask, segment_ids = \
                    get_basic_input_feature_as_list(self.tokenizer, self.max_context_len, [], [])
                context_input_ids.extend(input_ids)
                context_input_mask.extend(input_mask)
                context_segment_ids.extend(segment_ids)

            new_features["context_input_ids"] = create_int_feature(context_input_ids)
            new_features["context_input_mask"] = create_int_feature(context_input_mask)
            new_features["context_segment_ids"] = create_int_feature(context_segment_ids)
            writer.write_feature(new_features)
            if inst_index < 20:
                log_print_feature(new_features)
        writer.close()
Code example #7
File: cppnc_triple_datagen.py  Project: clover3/Chair
def encode_inner(max_seq_length, tokenizer,
                 inst: PayloadAsTokens) -> OrderedDict:
    tokens_1: List[str] = inst.text1
    tokens_2: List[str] = inst.text2
    tokens_3: List[str] = inst.passage

    def combine(tokens1, tokens2):
        return combine_with_sep_cls(max_seq_length, tokens1, tokens2)

    features = collections.OrderedDict()
    for tokens_a, tokens_b, postfix in [(tokens_1, tokens_2, ""),
                                        (tokens_2, tokens_3, "2"),
                                        (tokens_1, tokens_3, "3")]:
        tokens, segment_ids = combine(tokens_a, tokens_b)
        input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
            tokenizer, max_seq_length, tokens, segment_ids)

        features["input_ids" + postfix] = create_int_feature(input_ids)
        features["input_mask" + postfix] = create_int_feature(input_mask)
        features["segment_ids" + postfix] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
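combine_with_sep_cls (used in examples #3 and #7) is not defined in these excerpts. Judging from the inline combine() in example #1, it likely truncates the token pair to fit the length budget, then adds [CLS]/[SEP] markers and the matching segment ids; the following is a minimal sketch under that assumption:

from typing import List, Tuple


def combine_with_sep_cls(max_seq_length: int,
                         tokens1: List[str],
                         tokens2: List[str]) -> Tuple[List[str], List[int]]:
    # Reserve three positions for [CLS] and the two [SEP] markers.
    effective_length = max_seq_length - 3
    if len(tokens1) + len(tokens2) > effective_length:
        half = effective_length // 2 + 1
        tokens1 = tokens1[:half]
        tokens2 = tokens2[:effective_length - len(tokens1)]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    return tokens[:max_seq_length], segment_ids[:max_seq_length]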