Example #1
def gen_tf_record():
    sequence_length = 300
    data_loader = get_biobert_nli_data_loader(sequence_length)
    todo = [("train", [data_loader.train_file]),
            ("dev", [data_loader.dev_file])]
    batch_size = 32
    dir_path = os.path.join(output_path,
                            "biobert_mnli_{}".format(sequence_length))
    exist_or_mkdir(dir_path)

    for name, files in todo[::-1]:
        output_file = os.path.join(dir_path, name)
        writer = RecordWriterWrap(output_file)
        for file in files:
            for e in data_loader.example_generator(file):
                f = entry_to_feature_dict(e)
                f["is_real_example"] = create_int_feature([1])
                writer.write_feature(f)

        if name == "dev":
            # Pad the dev split to a multiple of batch_size by repeating the
            # last feature, flagged as not a real example.
            while writer.total_written % batch_size != 0:
                f["is_real_example"] = create_int_feature([0])
                writer.write_feature(f)

        writer.close()

        print("Wrote %d total instances" % writer.total_written)
Example #2
def convert_to_unpaired(source_path, output_path):
    def feature_transformer(feature):
        new_features_1 = collections.OrderedDict()
        new_features_2 = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        new_features_1["input_ids"] = put("input_ids1")
        new_features_1["input_mask"] = put("input_mask1")
        new_features_1["segment_ids"] = put("segment_ids1")
        new_features_1["label_ids"] = create_int_feature([1])

        new_features_2["input_ids"] = put("input_ids2")
        new_features_2["input_mask"] = put("input_mask2")
        new_features_2["segment_ids"] = put("segment_ids2")
        new_features_2["label_ids"] = create_int_feature([0])

        return new_features_1, new_features_2

    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features_1, new_features_2 = feature_transformer(feature)
        writer.write_feature(new_features_1)
        writer.write_feature(new_features_2)
    writer.close()
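The create_int_feature and create_float_feature helpers used throughout follow the standard pattern from BERT's create_pretraining_data.py; if they are not imported from there, they could look like this:

import tensorflow as tf


def create_int_feature(values):
    # Wrap a list of integers as an int64_list Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
    # Wrap a list of floats as a float_list Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))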
Example #3
def write_records(records: List[Record], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(record: Record) -> OrderedDict:
        tokens = ["[CLS]"] + record.claim_tokens + [
            "[SEP]"
        ] + record.doc_tokens + ["[SEP]"]
        segment_ids = [0] * (len(record.claim_tokens) + 2) \
                      + [1] * (len(record.doc_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)

        labels = [0.] * (len(record.claim_tokens) + 2) + record.scores
        labels += (max_seq_length - len(labels)) * [0.]
        label_mask = [0] * (len(record.claim_tokens) + 2) + record.valid_mask
        label_mask += (max_seq_length - len(label_mask)) * [0]
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
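Many examples call get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids). Its implementation is not shown here; a plausible sketch, assuming the usual BERT convention of converting tokens to ids and zero-padding input_ids, input_mask, and segment_ids to max_seq_length:

import collections


def get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids):
    # Sketch (assumption): BERT-style fixed-length padding of the three inputs.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(segment_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    return features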
Example #4
def write_records(records: List[Payload], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def tokenize_from_tokens(tokens: List[str]) -> List[str]:
        output = []
        for t in tokens:
            ts = tokenizer.tokenize(t)
            output.extend(ts)
        return output

    def encode(inst: Payload) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
        max_seg2_len = max_seq_length - 3 - len(tokens1)
        tokens2 = tokenize_from_tokens(inst.passage)[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
Example #5
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0

    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']

        for ranked_entry in ranked_list[query_id_str]:
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)
            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer,
                                               max_seq_length,
                                               tokens,
                                               segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)

    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
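To consume files written this way, the records can be parsed back with the standard tf.data pipeline. For instance, for the classification records of Example #5 (fixed-length input_ids / input_mask / segment_ids plus scalar label_ids and data_id), a reader could look like:

import tensorflow as tf


def load_dataset(tfrecord_path, max_seq_length):
    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([1], tf.int64),
        "data_id": tf.io.FixedLenFeature([1], tf.int64),
    }
    dataset = tf.data.TFRecordDataset(tfrecord_path)
    return dataset.map(
        lambda record: tf.io.parse_single_example(record, name_to_features))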
Example #6
    def work(self, job_id):
        qid_to_max_seg_idx: Dict[str, Dict[str, int]] = self.best_seg_collector.get_best_seg_info_2d(job_id)
        qids = self.query_group[job_id]
        output_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(output_path)
        for qid in qids:
            sr_per_qid = self.seg_resource_loader.load_for_qid(qid)
            doc_ids = list(qid_to_max_seg_idx[qid].keys())
            max_seg_idx_d = qid_to_max_seg_idx[qid]
            pos_doc, neg_doc = self.pool_pos_neg_doc(doc_ids, sr_per_qid)

            def get_max_seg(sr_per_doc: SRPerQueryDoc) -> SegmentRepresentation:
                max_seg_idx = max_seg_idx_d[sr_per_doc.doc_id]
                try:
                    seg = sr_per_doc.segs[max_seg_idx]
                except IndexError:
                    print('qid={} doc_id={}'.format(qid, sr_per_doc.doc_id))
                    print("max_seg_idx={} but len(segs)={}".format(max_seg_idx, len(sr_per_doc.segs)))
                    raise
                return seg

            pos_seg = get_max_seg(pos_doc)
            neg_seg = get_max_seg(neg_doc)
            feature = encode_sr_pair(pos_seg,
                                     neg_seg,
                                     self.max_seq_length,
                                     )
            writer.write_feature(feature)
        writer.close()
Example #7
    def work(self, job_id):
        qid_to_max_seg_idx: Dict[Tuple[str, str], int] = self.best_seg_collector.get_best_seg_info(job_id)
        qids = self.query_group[job_id]
        output_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(output_path)
        for qid in qids:
            sr_per_qid = self.seg_resource_loader.load_for_qid(qid)
            for sr_per_doc in sr_per_qid.sr_per_query_doc:
                if len(sr_per_doc.segs) == 1:
                    continue
                qdid = qid, sr_per_doc.doc_id
                max_seg_idx = qid_to_max_seg_idx[qdid]
                label_id = sr_per_doc.label
                try:
                    seg = sr_per_doc.segs[max_seg_idx]
                    feature = encode_sr(seg,
                                        self.max_seq_length,
                                        label_id,
                                        )
                    writer.write_feature(feature)
                except IndexError:
                    print('qid={} doc_id={}'.format(qid, sr_per_doc.doc_id))
                    print("max_seg_idx={} but len(segs)={}".format(max_seg_idx, len(sr_per_doc.segs)))
                    raise

        writer.close()
Example #8
def augment_topic_ids(records, topic_id, save_path):
    writer = RecordWriterWrap(save_path)
    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)

    writer.close()
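feature_to_ordered_dict is assumed to copy a parsed record's feature map into a mutable OrderedDict so that extra entries (here topic_ids) can be appended before re-writing; a minimal sketch under that assumption:

import collections


def feature_to_ordered_dict(feature):
    # Sketch (assumption): preserve the existing features in insertion order
    # so callers can append new ones and pass the result to write_feature.
    new_features = collections.OrderedDict()
    for key in feature:
        new_features[key] = feature[key]
    return new_features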
Example #9
def tfrecord_convertor(source_path: FilePath, output_path: FilePath,
                       feature_transformer):
    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features = feature_transformer(feature)
        writer.write_feature(new_features)
    writer.close()
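load_record_v2 is assumed to iterate a TFRecord file and yield, for each serialized tf.train.Example, its feature map (name -> tf.train.Feature), since Example #2's transformer indexes the result by feature name. A minimal sketch under that assumption:

import tensorflow as tf


def load_record_v2(tfrecord_path):
    # Sketch (assumption): yield the feature map of each record so that the
    # caller can index it, e.g. feature["input_ids1"].
    for record in tf.data.TFRecordDataset(tfrecord_path):
        example = tf.train.Example()
        example.ParseFromString(record.numpy())
        yield example.features.feature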
Example #10
def encode2(itr_lm, itr_nli, out_path):
    writer = RecordWriterWrap(out_path)
    for nli_entry in itr_nli:
        lm_entry = itr_lm.__next__()
        new_features = combine_feature(lm_entry, nli_entry)
        writer.write_feature(new_features)
    print("Wrote {} items".format(writer.total_written))
    writer.close()
Example #11
def do_filtering(file_path, out_path, condition_fn, debug_call_back=None):
    writer = RecordWriterWrap(out_path)
    for item in load_record(file_path):
        features = feature_to_ordered_dict(item)
        if condition_fn(features):
            if debug_call_back is not None:
                debug_call_back(features)
            writer.write_feature(features)
    writer.close()
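A hypothetical call to do_filtering, keeping only records whose label_ids feature equals 1 (the file names and the predicate are illustrative, not from the repository):

def is_positive(features):
    # features maps names to tf.train.Feature; label_ids holds a single int64.
    return features["label_ids"].int64_list.value[0] == 1


do_filtering("train_all.tfrecord", "train_positive.tfrecord", is_positive)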
Example #12
def baseline_bert_gen_unbal_resplit(outpath, split):
    tokenizer = get_tokenizer()
    data: List[PerspectiveCandidate] = load_data_point_50_train_val(split)
    max_seq_length = 512

    writer = RecordWriterWrap(outpath)
    for entry in data:
        writer.write_feature(enc_to_feature(tokenizer, max_seq_length, entry))
    writer.close()
Example #13
def write_pairwise_record(tokenizer, max_seq_length, insts, out_path):
    writer = RecordWriterWrap(out_path)
    for inst in insts:
        (tokens, segment_ids), (tokens2, segment_ids2) = inst

        features = combine_features(tokens, segment_ids, tokens2, segment_ids2,
                                    tokenizer, max_seq_length)

        writer.write_feature(features)
    writer.close()
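combine_features is not shown. Given that Example #2 unpacks paired records via the suffixed names input_ids1 / input_ids2 and so on, one plausible sketch is to encode each side with get_basic_input_feature and merge the two dicts under those suffixes (an assumption, not necessarily the repository's actual layout):

import collections


def combine_features(tokens, segment_ids, tokens2, segment_ids2,
                     tokenizer, max_seq_length):
    # Sketch (assumption): encode both sides and store them under suffixed
    # names, matching the paired layout seen in Example #2.
    features = collections.OrderedDict()
    for suffix, (t, s) in [("1", (tokens, segment_ids)),
                           ("2", (tokens2, segment_ids2))]:
        side = get_basic_input_feature(tokenizer, max_seq_length, t, s)
        for key, value in side.items():
            features[key + suffix] = value
    return features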
Example #14
    def work(self, job_id):
        tfrecord_path = os.path.join(self.input_dir, str(job_id))
        features = load_record(tfrecord_path)

        save_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(save_path)
        for f in rel_filter(features, self.relevance_scores,
                            self.cpid_to_label):
            writer.write_feature(f)
        writer.close()
Example #15
    def work(self, job_id):
        tfrecord_path = os.path.join(self.input_dir, str(job_id))
        features = load_record(tfrecord_path)
        save_path = os.path.join(self.out_dir, str(job_id))
        writer = RecordWriterWrap(save_path)
        for f in collect_passages(features, self.relevance_scores,
                                  self.cpid_to_label, self.num_max_para,
                                  self.window_size):
            writer.write_feature(f)
        writer.close()
Example #16
    def work(self, job_id):
        features: List[ParagraphClaimPersFeature] = pickle.load(
            open(os.path.join(self.input_dir, str(job_id)), "rb"))

        writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
        for f in features:
            f2: ParagraphFeature = to_paragraph_feature(f)
            encoded_list: List[OrderedDict] = format_paragraph_features(
                self.tokenizer, self.max_seq_length, f2)
            foreach(writer.write_feature, encoded_list)
        writer.close()
Example #17
    def write(self, insts: List[Instance], out_path):
        writer = RecordWriterWrap(out_path)
        for inst in insts:
            feature = get_basic_input_feature(self.tokenizer,
                                              self.max_seq_length, inst.tokens,
                                              inst.seg_ids)
            feature["data_id"] = create_int_feature([int(inst.data_id)])
            feature["label_ids"] = create_int_feature([int(inst.label)])
            writer.write_feature(feature)

        writer.close()
Example #18
def do(data_id):
    working_dir = os.environ["TF_WORKING_DIR"]
    tokenizer = get_tokenizer()
    name1 = os.path.join(working_dir, "bert_loss", "{}.pickle".format(data_id))
    name2 = os.path.join(working_dir, "bfn_loss", "{}.pickle".format(data_id))

    tf_logging.debug("Loading " + name1)
    output1 = PredictionOutput(name1)
    tf_logging.debug("Loading " + name2)
    output2 = PredictionOutput(name2)

    assert len(output1.input_ids) == len(output2.input_ids)

    out_path = os.path.join(working_dir,
                            "loss_pred_train_data/{}".format(data_id))
    record_writer = RecordWriterWrap(out_path)
    n_inst = len(output1.input_ids)
    sep_id = tokenizer.vocab["[SEP]"]
    tf_logging.debug("Iterating")
    ticker = TimeEstimator(n_inst, "", 1000)
    for i in range(n_inst):
        if i % 1000 == 0:
            assert_input_equal(output1.input_ids[i], output2.input_ids[i])
        try:
            features = get_segment_and_mask(output1.input_ids[i], sep_id)
        except:
            try:
                sep_indice = get_sep_considering_masking(
                    output1.input_ids[i], sep_id, output1.masked_lm_ids[i],
                    output1.masked_lm_positions[i])
                features = get_segment_and_mask_inner(output1.input_ids[i],
                                                      sep_indice)
            except:
                tokens = tokenizer.convert_ids_to_tokens(output1.input_ids[i])
                print(tokenization.pretty_tokens(tokens))
                print(output1.masked_lm_ids[i])
                print(output1.masked_lm_positions[i])
                raise

        features["next_sentence_labels"] = create_int_feature([0])
        features["masked_lm_positions"] = create_int_feature(
            output1.masked_lm_positions[i])
        features["masked_lm_ids"] = create_int_feature(
            output1.masked_lm_ids[i])
        features["masked_lm_weights"] = create_float_feature(
            output1.masked_lm_weights[i])
        features["loss_base"] = create_float_feature(
            output1.masked_lm_example_loss[i])
        features["loss_target"] = create_float_feature(
            output2.masked_lm_example_loss[i])
        record_writer.write_feature(features)
        ticker.tick()

    record_writer.close()
Example #19
def augment_topic_ids(records, save_path):
    writer = RecordWriterWrap(save_path)

    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        input_ids = first_inst["input_ids"].int64_list.value
        # The token id at position 1 determines the topic.
        token_id = input_ids[1]
        topic = token_ids_to_topic[token_id]
        topic_id = data_generator.argmining.ukp_header.all_topics.index(topic)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)

    writer.close()
Example #20
    def _write_instances(self, insts, output_file):
        writer = RecordWriterWrap(output_file)

        for instance in insts:
            word_tokens, def_tokens, segment_ids = instance
            word_tokens_ids = self.tokenizer.convert_tokens_to_ids(word_tokens)
            features = get_basic_input_feature(self.tokenizer, self.max_seq_length, def_tokens, segment_ids)
            while len(word_tokens_ids) < self.max_word_tokens:
                word_tokens_ids.append(0)
            features["word"] = create_int_feature(word_tokens_ids)
            writer.write_feature(features)
        writer.close()
        tf_logging.info("Wrote %d total instances", writer.total_written)
Example #21
def work(job_id):
    outfile = os.path.join(working_dir, "BLC_data", "{}".format(job_id))
    if os.path.exists(outfile):
        return "Skip"
    tf_logging.debug("Loading data")
    data = load(job_id)
    tf_logging.debug("Done")
    if data is None:
        return "No Input"

    writer = RecordWriterWrap(outfile)

    batch_size, seq_length = data[0]['input_ids'].shape
    keys = list(data[0].keys())

    vectors = flatten_batches(data)
    basic_keys = "input_ids", "input_mask", "segment_ids"
    any_key = keys[0]
    data_len = len(vectors[any_key])
    num_predictions = len(vectors["grouped_positions"][0][0])

    for i in range(data_len):
        mask_valid = [0] * seq_length
        loss1_arr = [0] * seq_length
        loss2_arr = [0] * seq_length
        positions = vectors["grouped_positions"][i]
        num_trials = len(positions)
        for t_i in range(num_trials):
            for p_i in range(num_predictions):
                loc = vectors["grouped_positions"][i][t_i][p_i]
                loss1 = vectors["grouped_loss1"][i][t_i][p_i]
                loss2 = vectors["grouped_loss2"][i][t_i][p_i]

                loss1_arr[loc] = loss1
                loss2_arr[loc] = loss2
                assert mask_valid[loc] == 0
                mask_valid[loc] = 1

        features = collections.OrderedDict()
        for key in basic_keys:
            features[key] = create_int_feature(vectors[key][i])

        features["loss_valid"] = create_int_feature(mask_valid)
        features["loss1"] = create_float_feature(loss1_arr)
        features["loss2"] = create_float_feature(loss2_arr)
        features["next_sentence_labels"] = create_int_feature([0])
        writer.write_feature(features)
        #if i < 20:
        #    log_print_feature(features)
    writer.close()
    return "Done"
Example #22
    def write_instances(self, new_inst_list, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []

        for (inst_index, instance) in enumerate(new_inst_list):
            tokens, segment_ids = instance
            features = get_basic_input_feature(self.tokenizer,
                                               self.target_seq_length, tokens,
                                               segment_ids)
            features["use_context"] = create_int_feature([1])
            writer.write_feature(features)
        writer.close()
        tf_logging.info("Wrote %d total instances", writer.total_written)
        return example_numbers
Example #23
    def write(self, insts, out_path):
        writer = RecordWriterWrap(out_path)
        f = open(out_path + ".info", "wb")
        doc_id_list = []
        for inst in insts:
            tokens, segment_ids, doc_id = inst
            feature = get_basic_input_feature(self.tokenizer,
                                              self.max_seq_length, tokens,
                                              segment_ids)
            doc_id_list.append(doc_id)

            writer.write_feature(feature)

        pickle.dump(doc_id_list, f)
        f.close()
        writer.close()
Example #24
    def write_instances(self, new_inst_list, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []
        feature_formatter = MLMFeaturizer(self.tokenizer, self.max_seq_length,
                                          self.max_predictions_per_seq)

        for (inst_index, instance) in enumerate(new_inst_list):
            features = feature_formatter.instance_to_features(instance)
            writer.write_feature(features)
            if inst_index < 20:
                log_print_inst(instance, features)
        writer.close()

        tf_logging.info("Wrote %d total instances", writer.total_written)

        return example_numbers
Example #25
    def write_instances(self, new_inst_list, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []

        for (inst_index, instance) in enumerate(new_inst_list):
            features = get_basic_input_feature(self.tokenizer,
                                               self.max_seq_length,
                                               instance.tokens,
                                               instance.segment_ids)

            writer.write_feature(features)
            if inst_index < 20:
                log_print_inst(instance, features)
        writer.close()

        return example_numbers
Example #26
    def write(self, insts, out_path):
        writer = RecordWriterWrap(out_path)

        def tokens_to_int_feature(tokens):
            return create_int_feature(
                self.tokenizer.convert_tokens_to_ids(tokens))

        for inst in insts:
            query_tokens, content_tokens, label = inst
            feature = collections.OrderedDict()
            feature['query'] = tokens_to_int_feature(
                query_tokens[:self.max_query_len])
            feature['content'] = tokens_to_int_feature(
                content_tokens[:self.max_seq_length])
            feature['label_ids'] = create_int_feature([label])
            writer.write_feature(feature)
        writer.close()
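Unlike the other examples, query and content here are truncated but not padded to a fixed length, so reading them back requires variable-length parsing, for example (file name illustrative):

import tensorflow as tf

name_to_features = {
    "query": tf.io.VarLenFeature(tf.int64),
    "content": tf.io.VarLenFeature(tf.int64),
    "label_ids": tf.io.FixedLenFeature([1], tf.int64),
}
dataset = tf.data.TFRecordDataset("records.tfrecord").map(
    lambda record: tf.io.parse_single_example(record, name_to_features))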
Example #27
    def write_instances(self, instances, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []
        get_basic_input_features_fn = partial(get_basic_input_feature,
                                              self.tokenizer,
                                              self.max_seq_length)
        for (inst_index, instance) in enumerate(instances):
            features = get_basic_input_features_fn(instance.tokens,
                                                   instance.segment_ids)
            features["use_context"] = create_int_feature([1])
            writer.write_feature(features)
            if inst_index < 20:
                log_print_inst(instance, features)
        writer.close()

        tf_logging.info("Wrote %d total instances", writer.total_written)
        return example_numbers
Example #28
def gen_mismatched():
    sequence_length = 300
    data_loader = get_modified_nli_data_loader(sequence_length)
    dir_path = os.path.join(output_path,
                            "nli_tfrecord_cls_{}".format(sequence_length))
    name = "dev_mis"
    output_file = os.path.join(dir_path, name)
    batch_size = 32
    writer = RecordWriterWrap(output_file)
    for e in data_loader.example_generator(data_loader.dev_file2):
        f = entry_to_feature_dict(e)
        f["is_real_example"] = create_int_feature([1])
        writer.write_feature(f)
    # Pad to a multiple of batch_size by repeating the last feature,
    # flagged as not a real example.
    while writer.total_written % batch_size != 0:
        f["is_real_example"] = create_int_feature([0])
        writer.write_feature(f)
    writer.close()
    print("Wrote %d total instances" % writer.total_written)
Example #29
def gen_with_aux_emb(outpath, aux_embedding_d, split, dims):
    tokenizer = get_tokenizer()
    data: List[PerspectiveCandidate] = load_data_point(split)
    max_seq_length = 512
    zero_vector = [0.] * dims

    not_found = set()

    def get_aux_embedding_fn(cid):
        cid = int(cid)
        if cid in aux_embedding_d:
            return aux_embedding_d[cid]
        else:
            if cid not in not_found:
                not_found.add(cid)
                print("Aux embedding not found", cid)
            return {}

    def enc_to_feature(pc: PerspectiveCandidate) -> OrderedDict:
        emb_model = get_aux_embedding_fn(pc.cid)

        seg1 = tokenizer.tokenize(pc.claim_text)
        seg2 = tokenizer.tokenize(pc.p_text)

        input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]

        aux_emb = get_word_embedding(emb_model, input_tokens, dims)
        aux_emb += (max_seq_length - len(aux_emb)) * [zero_vector]
        aux_emb = np.array(aux_emb)
        flat_aux_emb = np.reshape(aux_emb, [-1])

        segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)

        feature = get_basic_input_feature(tokenizer, max_seq_length,
                                          input_tokens, segment_ids)
        feature["label_ids"] = create_int_feature([int(pc.label)])
        feature["aux_emb"] = create_float_feature(flat_aux_emb)
        return feature

    writer = RecordWriterWrap(outpath)
    for entry in data:
        writer.write_feature(enc_to_feature(entry))
    writer.close()
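get_word_embedding is assumed to look each input token up in the per-claim embedding dictionary and fall back to a zero vector when the token is missing (the caller pads the result out to max_seq_length); a sketch under that assumption:

def get_word_embedding(emb_model, input_tokens, dims):
    # Sketch (assumption): one dims-sized vector per token, zeros if unknown.
    zero_vector = [0.] * dims
    return [list(emb_model[t]) if t in emb_model else zero_vector
            for t in input_tokens]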
Example #30
    def write_instances(self, new_inst_list, outfile):
        writer = RecordWriterWrap(outfile)
        example_numbers = []

        for (inst_index, instance) in enumerate(new_inst_list):
            features = get_basic_input_feature(self.tokenizer,
                                               self.max_seq_length,
                                               instance.tokens,
                                               instance.segment_ids)
            features["next_sentence_labels"] = btd.create_int_feature([0])

            writer.write_feature(features)
            if inst_index < 20:
                log_print_inst(instance, features)
        writer.close()

        tf_logging.info("Wrote %d total instances", writer.total_written)

        return example_numbers