Example 1
def encode_classification_instance(
        tokenizer, max_seq_length,
        inst: ClassificationInstance) -> OrderedDict:
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length,
                                                   inst.tokens, inst.seg_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    return feature
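Every example in this listing funnels through the same two helpers, get_basic_input_feature and create_int_feature, whose implementations are not shown here. The sketch below is an assumption inferred from how they are called, following the standard BERT feature-building pattern: convert tokens to ids, pad to max_seq_length, and wrap each list as a tf.train.Feature.

import collections
import tensorflow as tf


def create_int_feature(values):
    # Wrap a list of ints as an int64 tf.train.Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids):
    # Assumed behavior: convert tokens to ids, build the attention mask,
    # zero-pad everything to max_seq_length, and return the three standard
    # BERT inputs as an OrderedDict of features.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(segment_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    return features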
Example 2
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0

    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']

        for ranked_entry in ranked_list[query_id_str]:
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)
            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer,
                                               max_seq_length,
                                               tokens,
                                               segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)

    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
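RecordWriterWrap is only seen through its usage pattern: writer = RecordWriterWrap(path), writer.write_feature(features), writer.close(), plus a total_written counter in Examples 13 and 14. A minimal sketch consistent with that usage, assuming it is a thin wrapper around tf.io.TFRecordWriter:

import tensorflow as tf


class RecordWriterWrap:
    def __init__(self, output_path):
        self.writer = tf.io.TFRecordWriter(output_path)
        self.total_written = 0

    def write_feature(self, features):
        # Serialize an OrderedDict of tf.train.Feature values as one tf.train.Example.
        example = tf.train.Example(features=tf.train.Features(feature=features))
        self.writer.write(example.SerializeToString())
        self.total_written += 1

    def close(self):
        self.writer.close()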
Example 3
    def encode_fn(self, inst: TokenScoringInstance) -> OrderedDict:
        max_seq_length = self.max_seq_length
        tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
        max_seg2_len = self.max_seq_length - 3 - len(tokens1)

        tokens2, scores = self.tokenize_from_tokens_w_scores(
            inst.doc_tokens, inst.score)
        tokens2 = tokens2[:max_seg2_len]
        scores: ScoreVector = scores[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(self.tokenizer, max_seq_length,
                                           tokens, segment_ids)

        score_vector = pad_score_vector(scores, max_seq_length, len(tokens1))
        if len(score_vector) != max_seq_length:
            print(score_vector)
            print(len(score_vector))
            print(max_seq_length)
            print(len(scores))
            print(scores)
        assert len(score_vector) == max_seq_length
        features['label_ids'] = score_vector_to_feature(score_vector)
        features['data_id'] = create_int_feature([inst.data_id])
        return features
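pad_score_vector and score_vector_to_feature are also not defined in this listing. A plausible sketch, assuming the per-token scores are aligned with the "[CLS] query [SEP] doc [SEP]" layout (the same alignment Example 23 builds by hand) and zero-padded to max_seq_length:

import tensorflow as tf


def pad_score_vector(scores, max_seq_length, query_len):
    # Assumed: zeros for [CLS] + query tokens + [SEP], then the document token
    # scores, truncated or zero-padded to exactly max_seq_length entries.
    vector = [0.0] * (query_len + 2) + list(scores)
    vector = vector[:max_seq_length]
    vector += [0.0] * (max_seq_length - len(vector))
    return vector


def score_vector_to_feature(score_vector):
    # Wrap the padded scores as a float tf.train.Feature (cf. create_float_feature).
    return tf.train.Feature(float_list=tf.train.FloatList(value=score_vector))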
Example 4
def encode(tokenizer, max_seq_length, t: Tuple[str, bool]):
    text, is_correct = t
    tokens1: List[str] = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([int(is_correct)])
    return features
Example 5
def encode_w_data_id(tokenizer, max_seq_length, t: Instance):
    tokens = ["[CLS]"] + t.tokens + ["[SEP]"]
    segment_ids = [0] * (len(t.tokens) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                       segment_ids)
    features['label_ids'] = create_int_feature([int(t.label)])
    features['data_id'] = create_int_feature([int(t.data_id)])
    return features
Example 6
    def write(self, insts: List[Instance], out_path):
        writer = RecordWriterWrap(out_path)
        for inst in insts:
            feature = get_basic_input_feature(self.tokenizer,
                                              self.max_seq_length, inst.tokens,
                                              inst.seg_ids)
            feature["data_id"] = create_int_feature([int(inst.data_id)])
            feature["label_ids"] = create_int_feature([int(inst.label)])
            writer.write_feature(feature)

        writer.close()
Example 7
def encode(tokenizer, get_tokens, max_seq_length, inst: Instance) -> OrderedDict:
    tokens1 = get_tokens(inst.pid1)
    tokens2 = get_tokens(inst.pid2)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    return features
Example 8
def enc_to_feature2(tokenizer, max_seq_length, inst: QCInstanceTokenized) -> OrderedDict:
    seg1 = inst.query_text
    seg2 = inst.candidate_text

    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)

    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["data_id"] = create_int_feature([int(inst.data_id)])
    feature["label_ids"] = create_int_feature([int(inst.is_correct)])
    return feature
Example 9
def enc_to_feature(tokenizer, max_seq_length,
                   pc: PerspectiveCandidate) -> OrderedDict:
    seg1 = tokenizer.tokenize(pc.claim_text)
    seg2 = tokenizer.tokenize(pc.p_text)

    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)

    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens,
                                      segment_ids)
    feature["label_ids"] = create_int_feature([int(pc.label)])
    return feature
Example 10
    def encode(inst: Instance) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.text1)
        max_seg2_len = max_seq_length - 3 - len(tokens1)
        tokens2 = tokenizer.tokenize(inst.text2)[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([inst.label])
        features['data_id'] = create_int_feature([inst.data_id])
        return features
Example 11
    def encode(score_paragraph: ScoreParagraph) -> OrderedDict:
        para_tokens: List[Subword] = score_paragraph.paragraph.subword_tokens

        tokens = tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"] \
                 + para_tokens + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 1) \
                      + [1] * (len(tokens2) + 1) \
                      + [2] * (len(para_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([label])
        return features
Example 12
def encode_classification_feature(
        max_seq_length,
        data: Iterable[Tuple[str, str, int]]) -> Iterable[OrderedDict]:
    tokenizer = get_tokenizer()
    encoder = FirstSegmentAsDoc(max_seq_length)
    for query, text, label in data:
        q_tokens = tokenizer.tokenize(query)
        text_tokens = tokenizer.tokenize(text)
        input_tokens, segment_ids = encoder.encode(q_tokens, text_tokens)[0]
        feature = get_basic_input_feature(tokenizer, max_seq_length,
                                          input_tokens, segment_ids)
        feature['label_ids'] = create_int_feature([label])
        yield feature
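FirstSegmentAsDoc is only seen through the call encoder.encode(q_tokens, text_tokens)[0]. Below is a minimal sketch under the assumption that it keeps just the leading document window that fits alongside the query in a single sequence (compare AllSegmentAsDoc in Example 25, which appears to keep every window):

from typing import List, Tuple


class FirstSegmentAsDoc:
    def __init__(self, max_seq_length: int):
        self.max_seq_length = max_seq_length

    def encode(self, q_tokens: List[str],
               doc_tokens: List[str]) -> List[Tuple[List[str], List[int]]]:
        # Truncate the document so [CLS] query [SEP] doc [SEP] fits in one sequence.
        max_doc_len = self.max_seq_length - 3 - len(q_tokens)
        doc_tokens = doc_tokens[:max_doc_len]
        tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + doc_tokens + ["[SEP]"]
        segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(doc_tokens) + 1)
        return [(tokens, segment_ids)]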
Example 13
    def _write_instances(self, insts, output_file):
        writer = RecordWriterWrap(output_file)

        for instance in insts:
            word_tokens, def_tokens, segment_ids = instance
            word_tokens_ids = self.tokenizer.convert_tokens_to_ids(word_tokens)
            features = get_basic_input_feature(self.tokenizer, self.max_seq_length, def_tokens, segment_ids)
            while len(word_tokens_ids) < self.max_word_tokens:
                word_tokens_ids.append(0)
            features["word"] = create_int_feature(word_tokens_ids)
            writer.write_feature(features)
        writer.close()
        tf_logging.info("Wrote %d total instances", writer.total_written)
Example 14
    def write_instances(self, new_inst_list, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []

        for (inst_index, instance) in enumerate(new_inst_list):
            tokens, segment_ids = instance
            features = get_basic_input_feature(self.tokenizer,
                                               self.target_seq_length, tokens,
                                               segment_ids)
            features["use_context"] = create_int_feature([1])
            writer.write_feature(features)
        writer.close()
        tf_logging.info("Wrote %d total instances", writer.total_written)
        return example_numbers
Example 15
    def get_feature(tokens1, tokens2, info):
        data_id = data_id_gen.new_id()
        info_list[data_id] = info
        tokens = tokens1 + tokens2
        segment_ids = [0] * len(tokens1) + [1] * len(tokens2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer,
                                           max_seq_length,
                                           tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([0])
        features['data_id'] = create_int_feature([data_id])
        return features
Example 16
    def encode(inst: Payload) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
        max_seg2_len = max_seq_length - 3 - len(tokens1)
        tokens2 = tokenize_from_tokens(inst.passage)[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features
Example 17
        def encode_fn(inst: Instance):
            tokens1 = inst.tokens1
            max_seg2_len = self.max_seq_length - 3 - len(tokens1)

            tokens2 = inst.tokens2[:max_seg2_len]
            tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

            segment_ids = [0] * (len(tokens1) + 2) \
                          + [1] * (len(tokens2) + 1)
            tokens = tokens[:self.max_seq_length]
            segment_ids = segment_ids[:self.max_seq_length]
            features = get_basic_input_feature(self.tokenizer, self.max_seq_length, tokens, segment_ids)
            features['label_ids'] = create_int_feature([inst.label])
            features['data_id'] = create_int_feature([inst.data_id])
            return features
Example 18
    def write(self, insts, out_path):
        writer = RecordWriterWrap(out_path)
        f = open(out_path + ".info", "wb")
        doc_id_list = []
        for inst in insts:
            tokens, segment_ids, doc_id = inst
            feature = get_basic_input_feature(self.tokenizer,
                                              self.max_seq_length, tokens,
                                              segment_ids)
            doc_id_list.append(doc_id)

            writer.write_feature(feature)

        pickle.dump(doc_id_list, f)
        f.close()
        writer.close()
Example 19
    def write_instances(self, new_inst_list, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []

        for (inst_index, instance) in enumerate(new_inst_list):
            features = get_basic_input_feature(self.tokenizer,
                                               self.max_seq_length,
                                               instance.tokens,
                                               instance.segment_ids)

            writer.write_feature(features)
            if inst_index < 20:
                log_print_inst(instance, features)
        writer.close()

        return example_numbers
Example 20
    def encode_fn(self, inst: QKInstance) -> OrderedDict:
        max_seq_length = self.max_seq_length
        tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
        max_seg2_len = self.max_seq_length - 3 - len(tokens1)

        tokens2 = inst.doc_tokens[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features
Example 21
    def encode(inst: TextInstance) -> OrderedDict:
        tokens = tokenizer.tokenize(inst.text)
        max_len = max_seq_length - 2
        if len(tokens) > max_len:
            nonlocal long_count
            long_count = long_count + 1
            if long_count > 10 and long_warning:
                print("long text count", long_count)
        tokens = tokens[:max_len]
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        seg_ids = [0] * len(tokens)
        feature: OrderedDict = get_basic_input_feature(tokenizer,
                                                       max_seq_length, tokens,
                                                       seg_ids)
        feature['label_ids'] = create_int_feature([inst.label])
        feature['data_id'] = create_int_feature([inst.data_id])
        return feature
Example 22
    def encode(inst: Tuple[str, int]) -> OrderedDict:
        text, label = inst
        tokens = tokenizer.tokenize(text)
        max_len = max_seq_length - 2
        if len(tokens) > max_len:
            nonlocal long_count
            long_count = long_count + 1
            if long_count > 10:
                print("long text count", long_count)
        tokens = tokens[:max_len]
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        seg_ids = [0] * len(tokens)
        feature: OrderedDict = get_basic_input_feature(tokenizer,
                                                       max_seq_length, tokens,
                                                       seg_ids)
        feature['label_ids'] = create_int_feature([label])
        return feature
Example 23
    def encode(record: Record) -> OrderedDict:
        tokens = ["[CLS]"] + record.claim_tokens + [
            "[SEP]"
        ] + record.doc_tokens + ["[SEP]"]
        segment_ids = [0] * (len(record.claim_tokens) + 2) \
                      + [1] * (len(record.doc_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)

        labels = [0.] * (len(record.claim_tokens) + 2) + record.scores
        labels += (max_seq_length - len(labels)) * [0.]
        label_mask = [0] * (len(record.claim_tokens) + 2) + record.valid_mask
        label_mask += (max_seq_length - len(label_mask)) * [0]
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features
Example 24
    def encode(self, inst: Instance) -> OrderedDict:
        if not self.reverse:
            tokens1 = self.get_p_tokens(inst.pid)
            tokens2 = inst.sent
        else:
            tokens1 = inst.sent
            tokens2 = self.get_p_tokens(inst.pid)
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        max_seq_length = self.max_seq_length
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(self.tokenizer, max_seq_length,
                                           tokens, segment_ids)
        features['label_ids'] = create_int_feature([0])
        features['data_ids'] = create_int_feature([inst.data_id])
        return features
Example 25
def write_tfrecord(ranked_list_d: RankedListDict, queries: List[Query],
                   q_rels: Dict[str, List[str]], save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0

    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)

        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc,
                                                    doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List, List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
                    label = doc_entry.doc_id in q_rels[query.qid]

                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer,
                                                      max_seq_length,
                                                      input_tokens,
                                                      segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)

                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError as e:
                print("doc {} not found".format(doc_entry.doc_id))

    writer.close()
    return data_info
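AllSegmentAsDoc appears to emit one (input_tokens, segment_ids) pair per document window, so a long document yields several records in the inner loop above (unlike FirstSegmentAsDoc in Example 12, which keeps only the first window). A hedged sketch consistent with that usage:

from typing import List, Tuple


class AllSegmentAsDoc:
    def __init__(self, max_seq_length: int):
        self.max_seq_length = max_seq_length

    def encode(self, q_tokens: List[str],
               doc_tokens: List[str]) -> List[Tuple[List[str], List[int]]]:
        # Slide a non-overlapping window over the document; every window is
        # paired with the full query, mirroring the single-window case.
        window = max(1, self.max_seq_length - 3 - len(q_tokens))
        out = []
        for start in range(0, len(doc_tokens), window):
            seg = doc_tokens[start:start + window]
            tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + seg + ["[SEP]"]
            segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(seg) + 1)
            out.append((tokens, segment_ids))
        return out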
Example 26
    def enc_to_feature(pc: PerspectiveCandidate) -> OrderedDict:
        emb_model = get_aux_embedding_fn(pc.cid)

        seg1 = tokenizer.tokenize(pc.claim_text)
        seg2 = tokenizer.tokenize(pc.p_text)

        input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]

        aux_emb = get_word_embedding(emb_model, input_tokens, dims)
        aux_emb += (max_seq_length - len(aux_emb)) * [zero_vector]
        aux_emb = np.array(aux_emb)
        flat_aux_emb = np.reshape(aux_emb, [-1])

        segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)

        feature = get_basic_input_feature(tokenizer, max_seq_length,
                                          input_tokens, segment_ids)
        feature["label_ids"] = create_int_feature([int(pc.label)])
        feature["aux_emb"] = create_float_feature(flat_aux_emb)
        return feature