Example 1
# Requires TF 1.x; FLAGS and convert_single_example are defined elsewhere
# in the source module.
import collections

import tensorflow as tf


def file_based_convert_examples_to_features(examples, tokenize_fn, output_file):
  """Convert a set of examples to a TFRecord file."""
  # Skip if the output already exists and overwriting is not requested.
  if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
    return

  tf.logging.info("Start writing tfrecord %s.", output_file)
  writer = tf.python_io.TFRecordWriter(output_file)

  for ex_index, example in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(example, tokenize_fn)

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    def create_float_feature(values):
      f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
      return f

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_float_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()
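A quick way to sanity-check what this function wrote is to iterate the file and deserialize the protos directly. A minimal sketch, assuming `output_file` points at a record file already produced by the function above (the path here is hypothetical):

import tensorflow as tf

output_file = "train.tfrecord"  # hypothetical path written above

# Peek at the first serialized tf.train.Example in the file.
for record in tf.python_io.tf_record_iterator(output_file):
    example = tf.train.Example.FromString(record)
    input_ids = example.features.feature["input_ids"].int64_list.value
    label_ids = example.features.feature["label_ids"].int64_list.value
    print("first 10 input_ids:", list(input_ids)[:10], "label:", list(label_ids))
    break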
Example 2
# Requires TF 1.x; FLAGS and convert_single_example are defined elsewhere
# in the source module.
import collections

import numpy as np
import tensorflow as tf


def file_based_convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenize_fn,
                                            output_file,
                                            num_passes=1):
    """Convert a set of `InputExample`s to a TFRecord file."""

    # do not create duplicated records
    if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
        tf.logging.info(
            "Tfrecord {} already exists; not overwriting.".format(output_file))
        return

    tf.logging.info("Create new tfrecord {}.".format(output_file))

    writer = tf.python_io.TFRecordWriter(output_file)

    # Shuffle once up front; when num_passes > 1 the shuffled list is
    # simply repeated, so every pass sees the same order.
    np.random.shuffle(examples)
    if num_passes > 1:
        examples *= num_passes

    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example {} of {}".format(
                ex_index, len(examples)))

        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenize_fn)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        def create_float_feature(values):
            f = tf.train.Feature(float_list=tf.train.FloatList(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_float_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        if label_list is not None:
            features["label_ids"] = create_int_feature([feature.label_id])
        else:
            features["label_ids"] = create_float_feature(
                [float(feature.label_id)])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
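One subtlety in this variant: the shuffle happens once, before duplication, so every pass iterates the examples in the identical order. A small self-contained illustration of that list semantics:

import numpy as np

examples = ["a", "b", "c"]
np.random.shuffle(examples)   # e.g. -> ["c", "a", "b"]
examples *= 2                 # -> ["c", "a", "b", "c", "a", "b"]
# Both passes repeat the same post-shuffle order; re-shuffling per pass
# (or shuffling in the input pipeline) would be needed for distinct orders.
print(examples)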
Example 3
    def process(self, input_examples):

        features = []
        tokenize_fn = self._tokenize_wrapper()
        # Give tqdm an explicit total, since enumerate() hides the length.
        for ex_index, example in tqdm(enumerate(input_examples),
                                      total=len(input_examples)):

            feature = convert_single_example(ex_index, example,
                                             self.label_list,
                                             self.max_seq_length, tokenize_fn)

            features.append(feature)

        return features
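Unlike the other variants, this one keeps the features in memory rather than serializing them to disk. A minimal sketch of consuming that output, where InputFeatures is only a stand-in for whatever convert_single_example actually returns in this project:

import collections

import numpy as np

# Stand-in for the feature object convert_single_example returns.
InputFeatures = collections.namedtuple(
    "InputFeatures", ["input_ids", "input_mask", "segment_ids", "label_id"])

features = [
    InputFeatures([3, 7, 0, 0], [1.0, 1.0, 0.0, 0.0], [0, 0, 0, 0], 1),
    InputFeatures([5, 2, 9, 0], [1.0, 1.0, 1.0, 0.0], [0, 0, 0, 0], 0),
]

# Stack into dense arrays, e.g. for tf.estimator.inputs.numpy_input_fn.
input_ids = np.array([f.input_ids for f in features], dtype=np.int64)
labels = np.array([f.label_id for f in features], dtype=np.int64)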
Example 4
# Requires TF 1.x; cutil is the project's utility module providing
# convert_single_example.
import collections

import tensorflow as tf


def tfrecord_example2feature(examples, label_list, max_seq_length, tokenizer, output_file):

    if tf.gfile.Exists(output_file):
        tf.logging.info('File {} exists, skip.'.format(output_file))
        # Bug fix: return here, otherwise the existing file is rewritten anyway.
        return

    tf.logging.info('Create new tfrecord {}.'.format(output_file))

    writer = tf.python_io.TFRecordWriter(output_file)

    for ex_idx, example in enumerate(examples):
        if ex_idx % 10000 == 0:
            tf.logging.info('Writing example {} of {}'.format(ex_idx, len(examples)))

        feature = cutil.convert_single_example(
            ex_index=ex_idx,
            example=example,
            label_list=label_list,
            max_seq_length=max_seq_length,
            tokenize_fn=tokenizer
        )

        def create_int_feature(values):
            return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

        def create_float_feature(values):
            return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))

        features = collections.OrderedDict()
        features['input_ids'] = create_int_feature(feature.input_ids)
        features['input_mask'] = create_float_feature(feature.input_mask)
        features['segment_ids'] = create_int_feature(feature.segment_ids)
        if label_list is not None:
            features['label_ids'] = create_int_feature([feature.label_id])
        else:
            features['label_ids'] = create_float_feature([float(feature.label_id)])
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())

    writer.close()
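Example 5 below calls tfrecord_input_fn_builder, whose definition is not shown in this listing. A plausible minimal sketch (an assumption, not the project's actual code) of such a builder, parsing the fields written above into a tf.data pipeline for a TF 1.x Estimator:

import tensorflow as tf

def tfrecord_input_fn_builder(input_file, seq_length, batch_size, is_training):
    """Hypothetical Estimator input_fn over the records written above."""
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.float32),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
    }

    def input_fn(params=None):
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            # Repeat and shuffle only for training.
            d = d.shuffle(buffer_size=1024).repeat()
        d = d.map(lambda record: tf.parse_single_example(record, name_to_features))
        return d.batch(batch_size, drop_remainder=is_training)

    return input_fn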
Example 5
# Requires TF 1.x. ImdbProcessor, tfrecord_example2feature,
# tfrecord_input_fn_builder, model_fn_builder, and the cutil/putil helper
# modules are defined elsewhere in the source project.
import collections
import os
import random

import numpy as np
import sentencepiece as spm
import tensorflow as tf


def main(FLAG):

    tf.logging.set_verbosity(tf.logging.INFO)

    tf.gfile.MakeDirs(FLAG.output_dir)

    imdbp = ImdbProcessor()
    label_list = imdbp.get_labels()

    sp = spm.SentencePieceProcessor()
    sp.load(FLAG.spiece_model_file)

    def tokenize_fn(text):
        text = putil.preprocess_text(text, lower=FLAG.uncased)
        return putil.encode_ids(sp, text)

    n_train_step = None
    if FLAG.do_train:
        train_record_path = os.path.join(FLAG.output_dir, 'train.tfrecord')
        train_examples = imdbp.get_train_examples(FLAG.data_dir)
        random.shuffle(train_examples)
        tfrecord_example2feature(
            examples=train_examples,
            label_list=label_list,
            max_seq_length=FLAG.max_seq_length,
            tokenizer=tokenize_fn,
            output_file=train_record_path
        )
        n_train_step = int(len(train_examples)/FLAG.train_batch_size*FLAG.n_train_epoch)
        train_input_fn = tfrecord_input_fn_builder(
            input_file=train_record_path,
            seq_length=FLAG.max_seq_length,
            batch_size=FLAG.train_batch_size,
            is_training=True
        )
    if FLAG.do_valid:
        dev_record_path = os.path.join(FLAG.output_dir, 'dev.tfrecord')
        dev_examples = imdbp.get_dev_examples(FLAG.data_dir)
        tfrecord_example2feature(
            examples=dev_examples,
            label_list=label_list,
            max_seq_length=FLAG.max_seq_length,
            tokenizer=tokenize_fn,
            output_file=dev_record_path
        )
        dev_input_fn = tfrecord_input_fn_builder(
            input_file=dev_record_path,
            seq_length=FLAG.max_seq_length,
            batch_size=FLAG.dev_batch_size,
            is_training=False
        )
    if FLAG.do_test:
        test_record_path = os.path.join(FLAG.output_dir, 'test.tfrecord')
        # NOTE: reuses train_examples and dev_examples, so do_test only
        # works when do_train and do_valid are also enabled.
        test_examples = train_examples + dev_examples
        tfrecord_example2feature(
            examples=test_examples,
            label_list=label_list,
            max_seq_length=FLAG.max_seq_length,
            tokenizer=tokenize_fn,
            output_file=test_record_path
        )
        test_input_fn = tfrecord_input_fn_builder(
            input_file=test_record_path,
            seq_length=FLAG.max_seq_length,
            batch_size=FLAG.test_batch_size,
            is_training=False
        )
    
    model_fn = model_fn_builder(FLAG)
    run_config = tf.estimator.RunConfig(
        model_dir=FLAG.output_dir,
        save_summary_steps=FLAG.save_summary_steps,
        save_checkpoints_steps=FLAG.save_checkpoints_steps,
        keep_checkpoint_max=FLAG.keep_checkpoint_max,
        log_step_count_steps=FLAG.log_step_count_steps
    )
    warm_config = tf.estimator.WarmStartSettings(
        ckpt_to_initialize_from=FLAG.ckpt_path,
        vars_to_warm_start='model/transformer/*'
    ) if FLAG.ckpt_path and FLAG.do_train else None
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        warm_start_from=warm_config
    )

    if FLAG.do_train and FLAG.do_valid:
        tf.logging.info('*******************************************')
        tf.logging.info('***** Running Training and Validation *****')
        tf.logging.info('*******************************************')
        tf.logging.info('  Train num examples = {}'.format(len(train_examples)))
        tf.logging.info('  Eval num examples = {}'.format(len(dev_examples)))
        tf.logging.info('  Train batch size = {}'.format(FLAG.train_batch_size))
        tf.logging.info('  Eval batch size = {}'.format(FLAG.dev_batch_size))
        tf.logging.info('  Num steps = {}'.format(n_train_step))
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=n_train_step)
        eval_spec = tf.estimator.EvalSpec(input_fn=dev_input_fn, start_delay_secs=0, throttle_secs=0)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    
    if FLAG.do_test:
        tf.logging.info('***************************')
        tf.logging.info('***** Running Testing *****')
        tf.logging.info('***************************')
        tf.logging.info('  Num examples = {}'.format(len(test_examples)))
        tf.logging.info('  Batch size = {}'.format(FLAG.test_batch_size))
        tf.gfile.MakeDirs(FLAG.result_dir)
        embed_tb_path = os.path.join(FLAG.result_dir, 'embed_table.npy')
        if tf.gfile.Exists(embed_tb_path):
            tf.logging.info('File {} exists, skip.'.format(embed_tb_path))
        else:
            tb = estimator.get_variable_value('model/transformer/word_embedding/lookup_table')
            np.save(embed_tb_path, tb)
        train_out_path = os.path.join(FLAG.result_dir, 'train_res.tfrecord')
        valid_out_path = os.path.join(FLAG.result_dir, 'valid_res.tfrecord')
        if tf.gfile.Exists(train_out_path) and tf.gfile.Exists(valid_out_path):
            tf.logging.info('File {} and {} exists, skip.'.format(train_out_path, valid_out_path))
            return
        train_writer = tf.python_io.TFRecordWriter(train_out_path)
        valid_writer = tf.python_io.TFRecordWriter(valid_out_path)
        result = estimator.predict(input_fn=test_input_fn, checkpoint_path=FLAG.ckpt_path)
        for i, pred in enumerate(result):
            if i % 10000 == 0:
                tf.logging.info('Writing result [{} / {}]'.format(i, len(test_examples)))
            feature = cutil.convert_single_example(
                ex_index=10,  # constant index; likely just mutes per-example debug logging
                example=test_examples[i],
                label_list=label_list,
                max_seq_length=FLAG.max_seq_length,
                tokenize_fn=tokenize_fn
            )
            def create_int_feature(values):
                return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
            def create_float_feature(values):
                return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
            features = collections.OrderedDict()
            features['input_ids'] = create_int_feature(feature.input_ids)
            features['logit'] = create_float_feature(pred['logit'])
            if label_list is not None:
                features['label_ids'] = create_int_feature([feature.label_id])
            else:
                features['label_ids'] = create_float_feature([float(feature.label_id)])
            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            if i < len(train_examples):
                train_writer.write(tf_example.SerializeToString())
            else:
                valid_writer.write(tf_example.SerializeToString())
        train_writer.close()
        valid_writer.close()
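To consume what the test phase produces, the embedding table and the per-example logits can be read back directly. A minimal sketch, where "results" is a hypothetical value of FLAG.result_dir and the record fields match those written above:

import numpy as np
import tensorflow as tf

# Load the embedding table saved by Example 5.
embed_table = np.load("results/embed_table.npy")

# Collect logits and labels from the training-split result records.
logits, labels = [], []
for record in tf.python_io.tf_record_iterator("results/train_res.tfrecord"):
    ex = tf.train.Example.FromString(record)
    logits.append(list(ex.features.feature["logit"].float_list.value))
    labels.append(ex.features.feature["label_ids"].int64_list.value[0])

print("embedding table:", embed_table.shape, "records:", len(logits))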