Example 1
def write_records(records: List[Record], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(record: Record) -> OrderedDict:
        tokens = ["[CLS]"] + record.claim_tokens + [
            "[SEP]"
        ] + record.doc_tokens + ["[SEP]"]
        segment_ids = [0] * (len(record.claim_tokens) + 2) \
                      + [1] * (len(record.doc_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)

        labels = [0.] * (len(record.claim_tokens) + 2) + record.scores
        labels += (max_seq_length - len(labels)) * [0.]
        label_mask = [0] * (len(record.claim_tokens) + 2) + record.valid_mask
        label_mask += (max_seq_length - len(label_mask)) * [0]
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
Example 2
def write_records(records: List[Payload], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def tokenize_from_tokens(tokens: List[str]) -> List[str]:
        output = []
        for t in tokens:
            ts = tokenizer.tokenize(t)
            output.extend(ts)
        return output

    def encode(inst: Payload) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
        max_seg2_len = max_seq_length - 3 - len(tokens1)
        tokens2 = tokenize_from_tokens(inst.passage)[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
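Note: every example here relies on a small set of shared helpers (RecordWriterWrap, create_int_feature, create_float_feature) from the surrounding project. The sketch below shows one plausible implementation, assuming RecordWriterWrap is a thin counting wrapper around tf.io.TFRecordWriter; the project's actual helpers may differ.
import collections
import tensorflow as tf


def create_int_feature(values):
    # Pack a list of ints into a tf.train.Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
    # Pack a list of floats into a tf.train.Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))


class RecordWriterWrap:
    # Thin wrapper over TFRecordWriter that also counts written examples.
    def __init__(self, output_path):
        self.writer = tf.io.TFRecordWriter(output_path)
        self.total_written = 0

    def write_feature(self, features: collections.OrderedDict):
        example = tf.train.Example(features=tf.train.Features(feature=features))
        self.writer.write(example.SerializeToString())
        self.total_written += 1

    def close(self):
        self.writer.close()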
Example 3
    def work(self, job_id):
        file_path = os.path.join(self.lm_dir, str(job_id))
        out_path = os.path.join(self.working_dir, str(job_id))
        lm_itr = load_record_v2(file_path)
        random.shuffle(self.tt_entries)
        idx = 0
        writer = RecordWriterWrap(out_path)
        for lm_entry in lm_itr:
            nli_entry = self.tt_entries[idx]
            idx = (idx + 1) % len(self.tt_entries)  # cycle through the shuffled NLI entries
            new_features = combine_feature(lm_entry, nli_entry)
            writer.write_feature(new_features)
        writer.close()
Example 4
def main(dir_path):
    output_path = os.path.join(dir_path, "all")
    writer = RecordWriterWrap(output_path)
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                writer.write_feature(new_features)
    writer.close()
Example 5
    def work(self, job_id):
        features: List[ParagraphClaimPersFeature] = pickle.load(
            open(os.path.join(self.input_dir, str(job_id)), "rb"))

        writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
        for f in features:
            f2: ParagraphFeature = to_paragraph_feature(f)
            encoded_list: List[OrderedDict] = format_paragraph_features(
                self.tokenizer, self.max_seq_length, f2)
            foreach(writer.write_feature, encoded_list)
        writer.close()
Example 6
def gen_tf_record():
    sequence_length = 300
    data_loader = get_biobert_nli_data_loader(sequence_length)
    todo = [("train", [data_loader.train_file]),
            ("dev", [data_loader.dev_file])]
    batch_size = 32
    dir_path = os.path.join(output_path,
                            "biobert_mnli_{}".format(sequence_length))
    exist_or_mkdir(dir_path)

    for name, files in todo[::-1]:
        output_file = os.path.join(dir_path, name)
        writer = RecordWriterWrap(output_file)
        for file in files:
            for e in data_loader.example_generator(file):
                f = entry_to_feature_dict(e)
                f["is_real_example"] = create_int_feature([1])
                writer.write_feature(f)

        if name == "dev":
            while writer.total_written % batch_size != 0:
                f["is_real_example"] = create_int_feature([0])
                writer.write_feature(f)

        writer.close()

        print("Wrote %d total instances" % writer.total_written)
Example 7
def main(dir_path):
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    all_insts = [neg_insts, pos_insts]

    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))

                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)

    random.shuffle(pos_insts)
    random.shuffle(neg_insts)

    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))

    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
    writer.close()
Example 8
def write_tfrecord(ranked_list_d: RankedListDict, queries: List[Query],
                   q_rels: Dict[str, List[str]], save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0

    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)

        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc,
                                                    doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List,
                                  List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
                    label = doc_entry.doc_id in q_rels[query.qid]

                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer,
                                                      max_seq_length,
                                                      input_tokens,
                                                      segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)

                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError:
                print("doc {} not found".format(doc_entry.doc_id))

    writer.close()
    return data_info
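The encoder.encode call above splits a long document into fixed-length query/segment pairs. A minimal sketch of that windowing idea, using illustrative names rather than the project's AllSegmentAsDoc API:
from typing import List, Tuple


def window_segments(q_tokens: List[str], doc_tokens: List[str],
                    max_seq_length: int) -> List[Tuple[List[str], List[int]]]:
    # Reserve room for [CLS], [SEP], [SEP]; each window pairs the full query
    # with one slice of the document, mirroring the segment layout used above.
    seg_len = max(1, max_seq_length - 3 - len(q_tokens))
    insts = []
    for start in range(0, len(doc_tokens), seg_len):
        seg = doc_tokens[start:start + seg_len]
        tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + seg + ["[SEP]"]
        segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(seg) + 1)
        insts.append((tokens, segment_ids))
    return insts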
Example 9
    def work(self, job_id):
        features: List[ParagraphClaimPersFeature] = pickle.load(
            open(os.path.join(self.input_dir, str(job_id)), "rb"))

        info_d_all = {}
        data_id_base = job_id * 100000
        data_id_gen = DataIDGen(data_id_base)
        writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
        for f in features:
            pair = to_retrieval_format(self.tokenizer, self.max_seq_length,
                                       data_id_gen, f)
            info_d: Dict = pair[0]
            f2: List[OrderedDict] = pair[1]

            info_d_all.update(info_d)
            foreach(writer.write_feature, f2)
        writer.close()

        pickle.dump(info_d_all,
                    open(os.path.join(self.info_out_dir, str(job_id)), "wb"))
Example 10
    def work(self, job_id):
        qid_to_max_seg_idx: Dict[Tuple[str, str], int] = self.best_seg_collector.get_best_seg_info(job_id)
        qids = self.query_group[job_id]
        output_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(output_path)
        for qid in qids:
            sr_per_qid = self.seg_resource_loader.load_for_qid(qid)
            for sr_per_doc in sr_per_qid.sr_per_query_doc:
                if len(sr_per_doc.segs) == 1:
                    continue
                qdid = qid, sr_per_doc.doc_id
                max_seg_idx = qid_to_max_seg_idx[qdid]
                label_id = sr_per_doc.label
                try:
                    seg = sr_per_doc.segs[max_seg_idx]
                    feature = encode_sr(seg,
                                        self.max_seq_length,
                                        label_id,
                                        )
                    writer.write_feature(feature)
                except IndexError:
                    print('qid={} doc_id={}'.format(qid, sr_per_doc.doc_id))
                    print("max_seg_idx={} but len(segs)={}".format(max_seg_idx, len(sr_per_doc.segs)))
                    raise

        writer.close()
Example 11
    def work(self, job_id):
        qid_to_max_seg_idx: Dict[str, Dict[str, int]] = self.best_seg_collector.get_best_seg_info_2d(job_id)
        qids = self.query_group[job_id]
        output_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(output_path)
        for qid in qids:
            sr_per_qid = self.seg_resource_loader.load_for_qid(qid)
            doc_ids = list(qid_to_max_seg_idx[qid].keys())
            max_seg_idx_d = qid_to_max_seg_idx[qid]
            pos_doc, neg_doc = self.pool_pos_neg_doc(doc_ids, sr_per_qid)

            def get_max_seg(sr_per_doc: SRPerQueryDoc) -> SegmentRepresentation:
                max_seg_idx = max_seg_idx_d[sr_per_doc.doc_id]
                try:
                    seg = sr_per_doc.segs[max_seg_idx]
                except IndexError:
                    print('qid={} doc_id={}'.format(qid, sr_per_doc.doc_id))
                    print("max_seg_idx={} but len(segs)={}".format(max_seg_idx, len(sr_per_doc.segs)))
                    raise
                return seg

            pos_seg = get_max_seg(pos_doc)
            neg_seg = get_max_seg(neg_doc)
            feature = encode_sr_pair(pos_seg,
                                     neg_seg,
                                     self.max_seq_length,
                                     )
            writer.write_feature(feature)
        writer.close()
Example 12
def convert_to_unpaired(source_path, output_path):
    def feature_transformer(feature):
        new_features_1 = collections.OrderedDict()
        new_features_2 = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        new_features_1["input_ids"] = put("input_ids1")
        new_features_1["input_mask"] = put("input_mask1")
        new_features_1["segment_ids"] = put("segment_ids1")
        new_features_1["label_ids"] = create_int_feature([1])

        new_features_2["input_ids"] = put("input_ids2")
        new_features_2["input_mask"] = put("input_mask2")
        new_features_2["segment_ids"] = put("segment_ids2")
        new_features_2["label_ids"] = create_int_feature([0])

        return new_features_1, new_features_2

    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features_1, new_features_2 = feature_transformer(feature)
        writer.write_feature(new_features_1)
        writer.write_feature(new_features_2)
    writer.close()
Example 13
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0

    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']

        for ranked_entry in ranked_list[query_id_str]:
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)
            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer,
                                               max_seq_length,
                                               tokens,
                                               segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)

    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
Example 14
def tfrecord_convertor(source_path: FilePath, output_path: FilePath,
                       feature_transformer):
    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features = feature_transformer(feature)
        writer.write_feature(new_features)
    writer.close()
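As a usage sketch (with hypothetical paths and a hypothetical transformer), tfrecord_convertor can express small one-off conversions such as overwriting labels; feature_to_ordered_dict and create_int_feature are the same helpers used in the other examples.
def relabel_to_zero(feature):
    # Copy every stored field and overwrite the label.
    new_features = feature_to_ordered_dict(feature)
    new_features["label_ids"] = create_int_feature([0])
    return new_features


tfrecord_convertor("train.tfrecord", "train_relabeled.tfrecord", relabel_to_zero)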
Example 15
def augment_topic_ids(records, topic_id, save_path):
    writer = RecordWriterWrap(save_path)
    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)

    writer.close()
Example 16
def encode2(itr_lm, itr_nli, out_path):
    writer = RecordWriterWrap(out_path)
    for nli_entry in itr_nli:
        lm_entry = next(itr_lm)
        new_features = combine_feature(lm_entry, nli_entry)
        writer.write_feature(new_features)
    print("Wrote {} items".format(writer.total_written))
    writer.close()
Example 17
def do_filtering(file_path, out_path, condition_fn, debug_call_back=None):
    writer = RecordWriterWrap(out_path)
    for item in load_record(file_path):
        features = feature_to_ordered_dict(item)
        if condition_fn(features):
            if debug_call_back is not None:
                debug_call_back(features)
            writer.write_feature(features)
    writer.close()
Example 18
def baseline_bert_gen_unbal_resplit(outpath, split):
    tokenizer = get_tokenizer()
    data: List[PerspectiveCandidate] = load_data_point_50_train_val(split)
    max_seq_length = 512

    writer = RecordWriterWrap(outpath)
    for entry in data:
        writer.write_feature(enc_to_feature(tokenizer, max_seq_length, entry))
    writer.close()
Example 19
def write_records(records: List[PairedInstance], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def tokenize_from_tokens(tokens: List[str]) -> List[str]:
        output = []
        for t in tokens:
            ts = tokenizer.tokenize(t)
            output.extend(ts)
        return output

    def encode(inst: PairedInstance) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
        max_seg2_len = max_seq_length - 3 - len(tokens1)

        def concat_tokens(raw_tokens: List[str]):
            tokens2 = tokenize_from_tokens(raw_tokens)[:max_seg2_len]
            tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

            segment_ids = [0] * (len(tokens1) + 2) \
                          + [1] * (len(tokens2) + 1)
            tokens = tokens[:max_seq_length]
            segment_ids = segment_ids[:max_seq_length]
            return tokens, segment_ids

        out_tokens1, seg1 = concat_tokens(inst.passage_good)
        out_tokens2, seg2 = concat_tokens(inst.passage_worse)
        features = combine_features(out_tokens1, seg1, out_tokens2, seg2,
                                    tokenizer, max_seq_length)
        features['strict_good'] = create_int_feature([inst.strict_good])
        features['strict_bad'] = create_int_feature([inst.strict_bad])
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
Example 20
def write_pairwise_record(tokenizer, max_seq_length, insts, out_path):
    writer = RecordWriterWrap(out_path)
    for inst in insts:
        (tokens, segment_ids), (tokens2, segment_ids2) = inst

        features = combine_features(tokens, segment_ids, tokens2, segment_ids2,
                                    tokenizer, max_seq_length)

        writer.write_feature(features)
    writer.close()
Example 21
    def work(self, job_id):
        tfrecord_path = os.path.join(self.input_dir, str(job_id))
        features = load_record(tfrecord_path)
        save_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(save_path)
        for f in collect_passages(features, self.relevance_scores,
                                  self.cpid_to_label, self.num_max_para,
                                  self.window_size):
            writer.write_feature(f)
        writer.close()
Example 22
    def work(self, job_id):
        tfrecord_path = os.path.join(self.input_dir, str(job_id))
        features = load_record(tfrecord_path)

        save_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(save_path)
        for f in rel_filter(features, self.relevance_scores,
                            self.cpid_to_label):
            writer.write_feature(f)
        writer.close()
Example 23
def gen_mismatched():
    sequence_length = 300
    data_loader = get_modified_nli_data_loader(sequence_length)
    dir_path = os.path.join(output_path,
                            "nli_tfrecord_cls_{}".format(sequence_length))
    name = "dev_mis"
    output_file = os.path.join(dir_path, name)
    batch_size = 32
    writer = RecordWriterWrap(output_file)
    for e in data_loader.example_generator(data_loader.dev_file2):
        f = entry_to_feature_dict(e)
        f["is_real_example"] = create_int_feature([1])
        writer.write_feature(f)
    while writer.total_written % batch_size != 0:
        f["is_real_example"] = create_int_feature([0])
        writer.write_feature(f)
    writer.close()
    print("Wrote %d total instances" % writer.total_written)
Example 24
def do(data_id):
    working_dir = os.environ["TF_WORKING_DIR"]
    tokenizer = get_tokenizer()
    name1 = os.path.join(working_dir, "bert_loss", "{}.pickle".format(data_id))
    name2 = os.path.join(working_dir, "bfn_loss", "{}.pickle".format(data_id))

    tf_logging.debug("Loading " + name1)
    output1 = PredictionOutput(name1)
    tf_logging.debug("Loading " + name2)
    output2 = PredictionOutput(name2)

    assert len(output1.input_ids) == len(output2.input_ids)

    out_path = os.path.join(working_dir,
                            "loss_pred_train_data/{}".format(data_id))
    record_writer = RecordWriterWrap(out_path)
    n_inst = len(output1.input_ids)
    sep_id = tokenizer.vocab["[SEP]"]
    tf_logging.debug("Iterating")
    ticker = TimeEstimator(n_inst, "", 1000)
    for i in range(n_inst):
        if i % 1000 == 0:
            assert_input_equal(output1.input_ids[i], output2.input_ids[i])
        try:
            features = get_segment_and_mask(output1.input_ids[i], sep_id)
        except:
            try:
                sep_indice = get_sep_considering_masking(
                    output1.input_ids[i], sep_id, output1.masked_lm_ids[i],
                    output1.masked_lm_positions[i])
                features = get_segment_and_mask_inner(output1.input_ids[i],
                                                      sep_indice)
            except:
                tokens = tokenizer.convert_ids_to_tokens(output1.input_ids[i])
                print(tokenization.pretty_tokens(tokens))
                print(output1.masked_lm_ids[i])
                print(output1.masked_lm_positions[i])
                raise

        features["next_sentence_labels"] = create_int_feature([0])
        features["masked_lm_positions"] = create_int_feature(
            output1.masked_lm_positions[i])
        features["masked_lm_ids"] = create_int_feature(
            output1.masked_lm_ids[i])
        features["masked_lm_weights"] = create_float_feature(
            output1.masked_lm_weights[i])
        features["loss_base"] = create_float_feature(
            output1.masked_lm_example_loss[i])
        features["loss_target"] = create_float_feature(
            output2.masked_lm_example_loss[i])
        record_writer.write_feature(features)
        ticker.tick()

    record_writer.close()
Example 25
    def write(self, insts: List[Instance], out_path):
        writer = RecordWriterWrap(out_path)
        for inst in insts:
            feature = get_basic_input_feature(self.tokenizer,
                                              self.max_seq_length, inst.tokens,
                                              inst.seg_ids)
            feature["data_id"] = create_int_feature([int(inst.data_id)])
            feature["label_ids"] = create_int_feature([int(inst.label)])
            writer.write_feature(feature)

        writer.close()
Example 26
    def _write_instances(self, insts, output_file):
        writer = RecordWriterWrap(output_file)

        for instance in insts:
            word_tokens, def_tokens, segment_ids = instance
            word_tokens_ids = self.tokenizer.convert_tokens_to_ids(word_tokens)
            features = get_basic_input_feature(self.tokenizer, self.max_seq_length, def_tokens, segment_ids)
            while len(word_tokens_ids) < self.max_word_tokens:
                word_tokens_ids.append(0)
            features["word"] = create_int_feature(word_tokens_ids)
            writer.write_feature(features)
        writer.close()
        tf_logging.info("Wrote %d total instances", writer.total_written)
Example 27
def augment_topic_ids(records, save_path):
    writer = RecordWriterWrap(save_path)

    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        input_ids = first_inst["input_ids"].int64_list.value
        token_ids = input_ids[1]
        topic = token_ids_to_topic[token_ids]
        topic_id = data_generator.argmining.ukp_header.all_topics.index(topic)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)

    writer.close()
Example 28
def work(job_id):
    outfile = os.path.join(working_dir, "BLC_data", "{}".format(job_id))
    if os.path.exists(outfile):
        return "Skip"
    tf_logging.debug("Loading data")
    data = load(job_id)
    tf_logging.debug("Done")
    if data is None:
        return "No Input"

    writer = RecordWriterWrap(outfile)

    batch_size, seq_length = data[0]['input_ids'].shape
    keys = list(data[0].keys())

    vectors = flatten_batches(data)
    basic_keys = "input_ids", "input_mask", "segment_ids"
    any_key = keys[0]
    data_len = len(vectors[any_key])
    num_predictions = len(vectors["grouped_positions"][0][0])

    for i in range(data_len):
        mask_valid = [0] * seq_length
        loss1_arr = [0] * seq_length
        loss2_arr = [0] * seq_length
        positions = vectors["grouped_positions"][i]
        num_trials = len(positions)
        for t_i in range(num_trials):
            for p_i in range(num_predictions):
                loc = vectors["grouped_positions"][i][t_i][p_i]
                loss1 = vectors["grouped_loss1"][i][t_i][p_i]
                loss2 = vectors["grouped_loss2"][i][t_i][p_i]

                loss1_arr[loc] = loss1
                loss2_arr[loc] = loss2
                assert mask_valid[loc] == 0
                mask_valid[loc] = 1

        features = collections.OrderedDict()
        for key in basic_keys:
            features[key] = create_int_feature(vectors[key][i])

        features["loss_valid"] = create_int_feature(mask_valid)
        features["loss1"] = create_float_feature(loss1_arr)
        features["loss2"] = create_float_feature(loss2_arr)
        features["next_sentence_labels"] = create_int_feature([0])
        writer.write_feature(features)
        #if i < 20:
        #    log_print_feature(features)
    writer.close()
    return "Done"
Example 29
    def write_instances(self, new_inst_list, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []

        for (inst_index, instance) in enumerate(new_inst_list):
            tokens, segment_ids = instance
            features = get_basic_input_feature(self.tokenizer,
                                               self.target_seq_length, tokens,
                                               segment_ids)
            features["use_context"] = create_int_feature([1])
            writer.write_feature(features)
        writer.close()
        tf_logging.info("Wrote %d total instances", writer.total_written)
        return example_numbers
Example 30
    def write(self, insts, out_path):
        writer = RecordWriterWrap(out_path)
        f = open(out_path + ".info", "wb")
        doc_id_list = []
        for inst in insts:
            tokens, segment_ids, doc_id = inst
            feature = get_basic_input_feature(self.tokenizer,
                                              self.max_seq_length, tokens,
                                              segment_ids)
            doc_id_list.append(doc_id)

            writer.write_feature(feature)

        pickle.dump(doc_id_list, f)
        f.close()
        writer.close()
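Finally, a short sketch of how records written this way can be read back with plain TensorFlow, using a placeholder path. This is an illustration, not the project's load_record / load_record_v2 helpers.
import tensorflow as tf


def iterate_examples(path):
    # Parse each serialized tf.train.Example and yield its feature map.
    for raw in tf.data.TFRecordDataset(path):
        example = tf.train.Example()
        example.ParseFromString(raw.numpy())
        yield example.features.feature


for feature in iterate_examples("some_output_file"):
    input_ids = list(feature["input_ids"].int64_list.value)
    label_ids = list(feature["label_ids"].int64_list.value)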