Example #1
    def process_feature(self, feature):
        """Write a InputFeature to the TFRecordWriter as a tf.train.Example."""
        self.num_features += 1

        features = collections.OrderedDict()
        features["input_ids_a"] = tf_data_utils.create_int_feature(
            feature.input_ids_a)
        features["input_mask_a"] = tf_data_utils.create_int_feature(
            feature.input_mask_a)
        features["segment_ids_a"] = tf_data_utils.create_int_feature(
            feature.segment_ids_a)
        features["input_ids_b"] = tf_data_utils.create_int_feature(
            feature.input_ids_b)
        features["input_mask_b"] = tf_data_utils.create_int_feature(
            feature.input_mask_b)
        features["segment_ids_b"] = tf_data_utils.create_int_feature(
            feature.segment_ids_b)
        features["label_ids"] = tf_data_utils.create_int_feature(
            [feature.label_ids])
        # qas_id and class_ratio are optional: a missing attribute (or a guid
        # the int64 converter rejects) simply means the key is skipped.
        try:
            features["qas_id"] = tf_data_utils.create_int_feature(
                [feature.guid])
        except (AttributeError, TypeError):
            pass
        try:
            features["class_ratio"] = tf_data_utils.create_float_feature(
                [feature.class_ratio])
        except (AttributeError, TypeError):
            pass

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        self._writer.write(tf_example.SerializeToString())
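
All of these writers lean on two small helpers from `tf_data_utils` that never appear in the excerpts. A minimal sketch of what they are assumed to look like, following the standard BERT data-pipeline convention (the real implementations may differ):

import tensorflow as tf

def create_int_feature(values):
    # Assumed helper: wrap a list of ints as an int64_list Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

def create_float_feature(values):
    # Assumed helper: wrap a list of floats as a float_list Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
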
Example #2
	def _read_write(self, input_file, output_file, tokenizer,
					max_length=64,
					bos='<S>', eos='<T>', **kargs):
		self._writer = tf.python_io.TFRecordWriter(output_file)
		with tf.gfile.Open(input_file, "r") as f:
			for i, line in enumerate(f):
				if not line.strip() or i == 0:
					continue
				content = clean(line.strip())
				word_seq = []

				token_mapping = kargs.get('token_mapping', {})
				for key in token_mapping:
					content = re.sub(key, token_mapping[key], content)

				for word in content.split():
					if CN_CHARACTER_REGEX.findall(word):
						word_seq.extend(list(word))
					else:
						word_seq.append(word)

				if len(word_seq) > max_length:
					word_seq = word_seq[0:max_length]

				word_seq = [bos] + word_seq + [eos]
				word_id_seq = tokenizer.convert_tokens_to_ids(word_seq, max_length+2)
				seq_mask = [1] * len(word_id_seq)
				word_id_seq = tokenizer.padding(word_id_seq, max_length+2, 0)
				seq_mask = tokenizer.padding(seq_mask, max_length+2, 0)

				assert len(word_id_seq) == max_length + 2
				assert len(seq_mask) == max_length + 2

				features = collections.OrderedDict()
				features["input_ids"] = tf_data_utils.create_int_feature(word_id_seq)
				features["input_mask"] = tf_data_utils.create_int_feature(seq_mask)

				if i <= 30:
					tf.logging.info("*** Example ***")
					tf.logging.info("input_ids: %s" % " ".join([str(x) for x in word_id_seq]))
					tf.logging.info("input_ids_ori: %s" % " ".join(word_seq))
					tf.logging.info("input_ids_length: %s" % (len(input_ids)))

				tf_example = tf.train.Example(features=tf.train.Features(feature=features))
				self._writer.write(tf_example.SerializeToString())

		self._writer.close()
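
Records written by `_read_write` are padded to a fixed `max_length + 2` (BOS and EOS included), so a reader has to mirror that length in its parse spec. A sketch of the round trip (the file path and dataset wiring are illustrative, not from the source):

import tensorflow as tf

def parse_fn(serialized, max_length=64):
    # Both fields were padded to max_length + 2 before serialization.
    name_to_features = {
        "input_ids": tf.FixedLenFeature([max_length + 2], tf.int64),
        "input_mask": tf.FixedLenFeature([max_length + 2], tf.int64),
    }
    return tf.parse_single_example(serialized, name_to_features)

dataset = tf.data.TFRecordDataset("train.tfrecord").map(parse_fn)
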
Example #3
    def process_feature(self, feature):
        self.num_features += 1
        features = collections.OrderedDict()

        features["input_ids"] = tf_data_utils.create_int_feature(
            feature.input_ids)
        features["input_mask"] = tf_data_utils.create_int_feature(
            feature.input_mask)
        features["segment_ids"] = tf_data_utils.create_int_feature(
            feature.segment_ids)
        features["masked_lm_positions"] = tf_data_utils.create_int_feature(
            feature.masked_lm_positions)
        features["masked_lm_ids"] = tf_data_utils.create_int_feature(
            feature.masked_lm_ids)
        features["masked_lm_weights"] = tf_data_utils.create_float_feature(
            feature.masked_lm_weights)
        features["label_ids"] = tf_data_utils.create_int_feature(
            [feature.label_ids])

        # qas_id is optional; the example is written either way, so attach
        # the guid only when it exists and is an integer.
        try:
            features["qas_id"] = tf_data_utils.create_int_feature(
                [feature.guid])
        except (AttributeError, TypeError):
            pass

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        self._writer.write(tf_example.SerializeToString())
Example #4
    def process_feature(self, feature):
        self.num_features += 1
        features = collections.OrderedDict()

        features["input_ids_a"] = tf_data_utils.create_int_feature(
            feature.input_ids_a)
        features["label_ids"] = tf_data_utils.create_int_feature(
            [feature.label_ids])

        # All remaining fields are optional: a missing attribute (or a value
        # the int64/float converters reject) just means the key is skipped.
        try:
            features["input_char_ids_a"] = tf_data_utils.create_int_feature(
                feature.input_char_ids_a)
        except (AttributeError, TypeError):
            pass
        try:
            features["input_ids_b"] = tf_data_utils.create_int_feature(
                feature.input_ids_b)
        except (AttributeError, TypeError):
            pass
        try:
            features["input_char_ids_b"] = tf_data_utils.create_int_feature(
                feature.input_char_ids_b)
        except (AttributeError, TypeError):
            pass
        try:
            features["label_probs"] = tf_data_utils.create_float_feature(
                feature.label_probs)
        except (AttributeError, TypeError):
            pass
        try:
            features["label_ratio"] = tf_data_utils.create_float_feature(
                [feature.label_ratio])
        except (AttributeError, TypeError):
            pass
        try:
            features[
                "distillation_ratio"] = tf_data_utils.create_float_feature(
                    [feature.distillation_ratio])
        except (AttributeError, TypeError):
            pass
        try:
            features[
                "distillation_feature"] = tf_data_utils.create_float_feature(
                    feature.feature)
        except (AttributeError, TypeError):
            pass
        try:
            features["adv_ids"] = tf_data_utils.create_int_feature(
                [feature.adv_ids])
        except (AttributeError, TypeError):
            pass

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        self._writer.write(tf_example.SerializeToString())
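
Because this writer only emits the optional keys when the corresponding attribute exists, a single TFRecord file can hold heterogeneous examples. One tolerant way to parse such files is to give the optional scalars defaults (a sketch; the sequence length and default values are assumptions, not from the source):

import tensorflow as tf

def parse_optional(serialized, seq_length=128):
    name_to_features = {
        "input_ids_a": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
        # Optional scalars: a default keeps parsing from failing when absent.
        "label_ratio": tf.FixedLenFeature([], tf.float32, default_value=1.0),
        "distillation_ratio": tf.FixedLenFeature(
            [], tf.float32, default_value=1.0),
    }
    return tf.parse_single_example(serialized, name_to_features)
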
Example #5
    def process_feature(self, feature, **kargs):
        self.num_features += 1
        features = collections.OrderedDict()


        features["input_ids_a"] = tf_data_utils.create_int_feature(
            feature.input_ids_a)
        if kargs.get("label_type", "multi_class") == "multi_class":
            features["label_ids"] = tf_data_utils.create_int_feature(
                [feature.label_ids])
        else:
            features["label_ids"] = tf_data_utils.create_int_feature(
                feature.label_ids)

        # The remaining fields are optional: a missing attribute (or a value
        # the int64/float converters reject) just means the key is skipped.
        try:
            features["input_char_ids_a"] = tf_data_utils.create_int_feature(
                feature.input_char_ids_a)
        except (AttributeError, TypeError):
            pass
        try:
            features["input_ids_b"] = tf_data_utils.create_int_feature(
                feature.input_ids_b)
        except (AttributeError, TypeError):
            pass
        try:
            features["input_char_ids_b"] = tf_data_utils.create_int_feature(
                feature.input_char_ids_b)
        except (AttributeError, TypeError):
            pass
        try:
            features["label_probs"] = tf_data_utils.create_float_feature(
                feature.label_probs)
        except (AttributeError, TypeError):
            pass
        try:
            features["label_ratio"] = tf_data_utils.create_float_feature(
                [feature.label_ratio])
        except (AttributeError, TypeError):
            pass
        try:
            features[
                "distillation_ratio"] = tf_data_utils.create_float_feature(
                    [feature.distillation_ratio])
        except (AttributeError, TypeError):
            pass
        try:
            features[
                "distillation_feature"] = tf_data_utils.create_float_feature(
                    feature.feature)
        except (AttributeError, TypeError):
            pass

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        self._writer.write(tf_example.SerializeToString())
Example #6
    def process_feature(self, feature):
        """Write a InputFeature to the TFRecordWriter as a tf.train.Example."""
        self.num_features += 1

        features = collections.OrderedDict()
        features["input_ids"] = tf_data_utils.create_int_feature(
            feature.input_ids)
        features["input_mask"] = tf_data_utils.create_int_feature(
            feature.input_mask)
        features["segment_ids"] = tf_data_utils.create_int_feature(
            feature.segment_ids)
        features["label_ids"] = tf_data_utils.create_int_feature(
            [feature.choice])

        # qas_id is optional; the example is written either way.
        try:
            features["qas_id"] = tf_data_utils.create_int_feature(
                [feature.unique_id])
        except (AttributeError, TypeError):
            pass

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        self._writer.write(tf_example.SerializeToString())
Example #7
    def process_feature(self, feature):
        """Write a InputFeature to the TFRecordWriter as a tf.train.Example."""
        self.num_features += 1

        features = collections.OrderedDict()
        features["unique_ids"] = tf_data_utils.create_int_feature(
            [feature.unique_id])
        features["input_ids"] = tf_data_utils.create_int_feature(
            feature.input_ids)
        features["input_mask"] = tf_data_utils.create_int_feature(
            feature.input_mask)
        features["segment_ids"] = tf_data_utils.create_int_feature(
            feature.segment_ids)

        if self.is_training:
            features["start_positions"] = tf_data_utils.create_int_feature(
                [feature.start_position])
            features["end_positions"] = tf_data_utils.create_int_feature(
                [feature.end_position])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        self._writer.write(tf_example.SerializeToString())
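
Since `start_positions` and `end_positions` are only serialized when `self.is_training` is true, the reader needs a parse spec that branches the same way. A sketch (the sequence length of 384 is an assumed MRC default, not taken from the source):

import tensorflow as tf

def mrc_name_to_features(is_training, seq_length=384):
    # Mirror the fields written by process_feature above.
    name_to_features = {
        "unique_ids": tf.FixedLenFeature([], tf.int64),
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
    }
    if is_training:
        name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64)
        name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64)
    return name_to_features
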
Example #8
def main(_):

    print(FLAGS)
    print(tf.__version__, "==tensorflow version==")

    init_checkpoint = os.path.join(FLAGS.buckets, FLAGS.init_checkpoint)
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)
    checkpoint_dir = os.path.join(FLAGS.buckets, FLAGS.model_output)

    print(init_checkpoint, train_file, dev_file, checkpoint_dir)

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=True)

    cluster = {'chief': ['localhost:2221'], 'worker': ['localhost:2222']}
    os.environ['TF_CONFIG'] = json.dumps({
        'cluster': cluster,
        'task': {
            'type': 'evaluator',
            'index': 0
        }
    })

    run_config = tf.estimator.RunConfig(keep_checkpoint_max=5,
                                        model_dir=checkpoint_dir,
                                        session_config=sess_config,
                                        save_checkpoints_secs=None,
                                        save_checkpoints_steps=None,
                                        log_step_count_steps=100)

    task_index = run_config.task_id
    is_chief = run_config.is_chief
    worker_count = 1

    print("==worker_count==", worker_count, "==local_rank==", task_index,
          "==is is_chief==", is_chief)
    target = ""

    if FLAGS.mode == "single_task":
        train_eval_api = train_eval
    elif FLAGS.mode == "multi_task":
        train_eval_api = multitask_train_eval

    if FLAGS.run_type == "estimator":
        train_eval_api.monitored_estimator(FLAGS=FLAGS,
                                           worker_count=worker_count,
                                           task_index=task_index,
                                           cluster=cluster,
                                           is_chief=is_chief,
                                           target=target,
                                           init_checkpoint=init_checkpoint,
                                           train_file=train_file,
                                           dev_file=dev_file,
                                           checkpoint_dir=checkpoint_dir,
                                           run_config=run_config,
                                           profiler=FLAGS.profiler,
                                           parse_type=FLAGS.parse_type,
                                           rule_model=FLAGS.rule_model,
                                           train_op=FLAGS.train_op,
                                           running_type="eval",
                                           input_target=FLAGS.input_target,
                                           ues_token_type=FLAGS.ues_token_type,
                                           attention_type=FLAGS.attention_type)
    elif FLAGS.run_type == "sess":
        result_dict = train_eval_api.monitored_sess(
            FLAGS=FLAGS,
            worker_count=worker_count,
            task_index=task_index,
            cluster=cluster,
            is_chief=is_chief,
            target=target,
            init_checkpoint=init_checkpoint,
            train_file=train_file,
            dev_file=dev_file,
            checkpoint_dir=checkpoint_dir,
            run_config=run_config,
            profiler=FLAGS.profiler,
            parse_type=FLAGS.parse_type,
            rule_model=FLAGS.rule_model,
            train_op=FLAGS.train_op,
            running_type="eval",
            input_target=FLAGS.input_target,
            ues_token_type=FLAGS.ues_token_type,
            attention_type=FLAGS.attention_type)

        result_log_file = os.path.join(checkpoint_dir, FLAGS.feature_output)
        print(result_log_file, "==result log path==")
        writer = tf.python_io.TFRecordWriter(result_log_file)
        try:
            for label_id, feature, prob in zip(result_dict["label_ids"],
                                               result_dict["feature"],
                                               result_dict["prob"]):
                features = {}
                features["label_id"] = tf_data_utils.create_int_feature(
                    [label_id])
                features["feature"] = tf_data_utils.create_float_feature(
                    feature)
                features["prob"] = tf_data_utils.create_float_feature(prob)

                tf_example = tf.train.Example(features=tf.train.Features(
                    feature=features))
                writer.write(tf_example.SerializeToString())
        except (KeyError, TypeError):
            # result_dict may lack these keys or hold non-numeric values.
            print("===not legal output for writer===")
        finally:
            # Close the writer even when serialization fails part-way.
            writer.close()
Example #9
def write2tfrecords():
    multi_task_config = Bunch(json.load(tf.gfile.Open(
        FLAGS.multi_task_config)))
    generator = create_generator(FLAGS, multi_task_config, "train",
                                 FLAGS.epoch)

    _writer = tf.python_io.TFRecordWriter(
        os.path.join(FLAGS.buckets, FLAGS.model_output))
    problem_config = multi_task_config[FLAGS.multi_task_type.split(",")[0]]

    cnt = 0
    for idx, item in enumerate(tqdm(generator)):
        features = {}
        features["input_ids"] = tf_data_utils.create_int_feature(
            item["input_ids"])
        features["input_mask"] = tf_data_utils.create_int_feature(
            item["input_mask"])
        features["segment_ids"] = tf_data_utils.create_int_feature(
            item["segment_ids"])

        if problem_config["lm_augumentation"]:
            features["masked_lm_positions"] = tf_data_utils.create_int_feature(
                item["masked_lm_positions"])
            features["masked_lm_ids"] = tf_data_utils.create_int_feature(
                item["masked_lm_ids"])
            features["masked_lm_weights"] = tf_data_utils.create_int_feature(
                item["masked_lm_weights"])

        for problem in FLAGS.multi_task_type.split(","):
            problem_type = multi_task_config[problem]["task_type"]

            features["{}_loss_multiplier".format(
                problem)] = tf_data_utils.create_int_feature(
                    [item["{}_loss_multiplier".format(problem)]])
            if problem_type in ['cls_task']:
                features["{}_label_ids".format(
                    problem)] = tf_data_utils.create_int_feature(
                        [item["{}_label_ids".format(problem)]])
            elif problem_type in ['seq2seq_tag_task', 'seq2seq_text_task']:
                features["{}_label_ids".format(
                    problem)] = tf_data_utils.create_int_feature(
                        item["{}_label_ids".format(problem)])

        # task_id is identical for every problem, so write it once.
        features["task_id"] = tf_data_utils.create_int_feature(
            [item["task_id"]])

        # idx comes from enumerate and is always an int, so no guard needed.
        features["guid"] = tf_data_utils.create_int_feature([idx])
        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        _writer.write(tf_example.SerializeToString())
        cnt += 1
    print("==total sample==", cnt)
Example #10
    def process_feature(self, feature, task_type, task_type_dict):
        """Write a InputFeature to the TFRecordWriter as a tf.train.Example."""
        self.num_features += 1

        features = collections.OrderedDict()

        features["input_ids"] = tf_data_utils.create_int_feature(
            feature.input_ids)
        features["input_mask"] = tf_data_utils.create_int_feature(
            feature.input_mask)
        features["segment_ids"] = tf_data_utils.create_int_feature(
            feature.segment_ids)

        for task_index, task in enumerate(task_type_dict):
            if task == task_type:
                features["{}_loss_multiplier".format(
                    task)] = tf_data_utils.create_int_feature([1])
                if task_type_dict[task]["task_type"] == "cls_task":
                    features["{}_label_ids".format(
                        task)] = tf_data_utils.create_int_feature(
                            [feature.label_ids])
                elif task_type_dict[task]["task_type"] == "seq2tag":
                    features["{}_label_ids".format(
                        task)] = tf_data_utils.create_int_feature(
                            feature.label_ids)
                elif task_type_dict[task]["task_type"] == "mrc":
                    features["{}_label_ids".format(
                        task)] = tf_data_utils.create_int_feature(
                            feature.label_ids)
                features["task_id"] = tf_data_utils.create_int_feature(
                    [task_index])
            else:
                features["{}_loss_multiplier".format(
                    task)] = tf_data_utils.create_int_feature([0])
                if task_type_dict[task]["task_type"] == "cls_task":
                    features["{}_label_ids".format(
                        task)] = tf_data_utils.create_int_feature([0])
                elif task_type_dict[task]["task_type"] == "seq2tag":
                    features["{}_label_ids".format(
                        task)] = tf_data_utils.create_int_feature(
                            [0] * len(feature.label_ids))
                elif task_type_dict[task]["task_type"] == "mrc":
                    features["{}_label_ids".format(
                        task)] = tf_data_utils.create_int_feature(
                            [0] * len(feature.label_ids))

        if self.num_features == 10:
            # One-off debug print of the serialized feature keys.
            print(features.keys())
        # guid is optional; the example is written either way.
        try:
            features["guid"] = tf_data_utils.create_int_feature([feature.guid])
        except (AttributeError, TypeError):
            pass

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        self._writer.write(tf_example.SerializeToString())
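
A quick way to sanity-check the output of any of these writers is to decode the raw protos directly, without building a graph (a sketch; the path is illustrative):

import tensorflow as tf

def inspect_tfrecord(path, limit=3):
    # Print the feature keys of the first few serialized tf.train.Examples.
    for i, record in enumerate(tf.python_io.tf_record_iterator(path)):
        if i >= limit:
            break
        example = tf.train.Example.FromString(record)
        print(sorted(example.features.feature.keys()))

inspect_tfrecord("train.tfrecord")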