Example #1
def file_based_convert_examples_to_features(examples, tokenize_fn, output_file):
    if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
        return

    logger.info("Start writing tfrecord %s.", output_file)
    writer = tf.python_io.TFRecordWriter(output_file)

    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        feature = convert_single_example(example, tokenize_fn)

        def create_int_feature(values):
            f = tf.train.Feature(
                int64_list=tf.train.Int64List(value=list(values)))
            return f

        def create_float_feature(values):
            f = tf.train.Feature(
                float_list=tf.train.FloatList(value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_float_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature([feature.label_id])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])

        tf_example = tf.train.Example(
            features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
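A quick way to sanity-check the output of a writer like this is to decode the first few records back. The sketch below is not part of the original example (the helper name inspect_tfrecord is ours) and uses the same TF 1.x APIs as the snippets:

import tensorflow as tf

def inspect_tfrecord(path, limit=3):
    # iterate the raw records and decode them back into tf.train.Example
    for i, record in enumerate(tf.python_io.tf_record_iterator(path)):
        if i >= limit:
            break
        example = tf.train.Example.FromString(record)
        print(example.features.feature["input_ids"].int64_list.value[:10])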
Example #2
def do_inference(model_path, vocab_file, data):
    tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=True)
    interpreter = tf.lite.Interpreter(model_path=model_path)

    predictions = []
    probabilities = []

    for text in data:  # renamed from `input` to avoid shadowing the builtin
        guid = "predict_1"
        text_a = tokenization.convert_to_unicode(text)
        example = classifier_utils.InputExample(guid=guid, text_a=text_a, text_b=None, label="1")
        feature = classifier_utils.convert_single_example(0, example, ["0", "1"], 128, tokenizer)
        
        interpreter.allocate_tensors()

        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        
        interpreter.set_tensor(
            input_details[0]['index'],
            [to_int32(feature.input_ids)
             if input_details[0]['dtype'] == np.int32 else feature.input_ids])
        interpreter.set_tensor(
            input_details[1]['index'],
            [to_int32(feature.input_mask)
             if input_details[1]['dtype'] == np.int32 else feature.input_mask])
        interpreter.set_tensor(
            input_details[3]['index'],
            [to_int32(feature.segment_ids)
             if input_details[3]['dtype'] == np.int32 else feature.segment_ids])
        
        interpreter.invoke()

        if len(output_details) > 1:
            output_data = interpreter.get_tensor(output_details[0]['index'])
            predictions.append(output_data[0])
            output_data = interpreter.get_tensor(output_details[1]['index'])
            probabilities.append(output_data[0])
        else:
            # compatible with single-output models
            output_data = interpreter.get_tensor(output_details[0]['index'])
            probabilities.append(output_data[0])
    return probabilities, (predictions if len(probabilities) == len(predictions) else None)
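Note that the hard-coded input indices (0, 1, 3) depend on how the model was converted: the order returned by get_input_details() is not guaranteed. A more robust sketch, assuming the exported tensor names contain the feature keys (the helper set_inputs_by_name is hypothetical), binds each input by name:

import numpy as np

def set_inputs_by_name(interpreter, feature):
    values = {
        "input_ids": feature.input_ids,
        "input_mask": feature.input_mask,
        "segment_ids": feature.segment_ids,
    }
    for detail in interpreter.get_input_details():
        for key, value in values.items():
            if key in detail["name"]:
                # cast to whatever dtype the converted model expects
                interpreter.set_tensor(detail["index"],
                                       np.array([value], dtype=detail["dtype"]))
                break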
Example #3
def file_based_convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenize_fn,
                                            output_file,
                                            num_passes=1):
    """Convert a set of `InputExample`s to a TFRecord file."""
    # do not create duplicated records
    if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
        tf.logging.info(
            "Not overwriting existing tfrecord {}.".format(output_file))
        return

    tf.logging.info("Create new tfrecord {}.".format(output_file))

    writer = tf.python_io.TFRecordWriter(output_file)

    if num_passes > 1:
        examples *= num_passes
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example {} of {}".format(
                ex_index, len(examples)))

        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenize_fn)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        def create_float_feature(values):
            f = tf.train.Feature(float_list=tf.train.FloatList(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_float_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        if label_list is not None:
            features["label_ids"] = create_int_feature([feature.label_id])
        else:
            features["label_ids"] = create_float_feature(
                [float(feature.label_id)])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        #print("*" * 100)
        #fout.write("{}\t{}\n".format(example.text_a, example.label))
        #w print("{}\t{}\n".format(example.text_a, example.label))
        writer.write(tf_example.SerializeToString())
    writer.close()
Example #4
def file_based_convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenize_fn,
                                            output_file,
                                            num_passes=1,
                                            shuffle=True):
    """Convert a set of `InputExample`s to a TFRecord file."""

    # do not create duplicated records
    if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
        tf.logging.info(
            "Not overwriting existing tfrecord {}.".format(output_file))
        return

    tf.logging.info("Create new tfrecord {}.".format(output_file))

    writer = tf.python_io.TFRecordWriter(output_file)

    if shuffle:
        np.random.shuffle(examples)
    if num_passes > 1:
        examples *= num_passes

    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example {} of {}".format(
                ex_index, len(examples)))

        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenize_fn)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        def create_float_feature(values):
            f = tf.train.Feature(float_list=tf.train.FloatList(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_float_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        if label_list is not None:
            features["label_ids"] = create_int_feature([feature.label_id])
        else:
            features["label_ids"] = create_float_feature(feature.label_id)
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])
        features["weight"] = create_float_feature([feature.weight])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
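Because this variant writes input_mask as float features and adds a per-example weight, the reading side needs a matching feature spec. A sketch of the name_to_features dict that the corresponding file_based_input_fn_builder would have to use (assuming classification, i.e. label_list is not None, so label_ids is int64):

name_to_features = {
    "input_ids": tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([FLAGS.max_seq_length], tf.float32),
    "segment_ids": tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
    "is_real_example": tf.FixedLenFeature([], tf.int64),
    "weight": tf.FixedLenFeature([], tf.float32),
}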
Example #5
 def get_embed(self, txt):
     example = InputExample(guid="unused_id", text_a=txt)
     feature = convert_single_example(0, example, None,
                                      FLAGS.max_seq_length,
                                      self.tokenize_fn)
     _predict_input_fn = predict_input_fn(feature)
     for pred_cnt, result in enumerate(
             self.estimator.predict(input_fn=_predict_input_fn,
                                    yield_single_examples=False)):
         logits = [float(x) for x in result["logits"].flat]
         summary = result["summary"]
         return summary
Example #6
 def instance_reader():
     label_list = self.get_labels() if not is_regression else None
     for epoch_index in range(epoch):
         if shuffle:
             np.random.shuffle(examples)
         if phase == 'train':
             self.current_train_epoch = epoch_index
         for (index, example) in enumerate(examples):
             if phase == 'train':
                 self.current_train_example = index + 1
             feature = convert_single_example(index, example,
                                              label_list,
                                              self.max_seq_length,
                                              self.tokenize_fn)
             instance = [
                 feature.input_ids, feature.input_mask,
                 feature.segment_ids, feature.label_id,
                 feature.is_real_example
             ]
             yield instance
Example #7
 def run_step(self, txt):
     example = InputExample(guid="unused_id", text_a=txt)
     feature = convert_single_example(10, example, None,
                                      FLAGS.max_seq_length,
                                      self.tokenize_fn)
     logging.info("text: %s" % (example.text_a))
     logging.info("input_ids: %s" %
                  " ".join([str(x) for x in feature.input_ids]))
     logging.info("input_mask: %s" %
                  " ".join([str(x) for x in feature.input_mask]))
     logging.info("segment_ids: %s" %
                  " ".join([str(x) for x in feature.segment_ids]))
     feed_dict = {
         self.input_ids: [feature.input_ids],
         self.segment_ids: [feature.segment_ids],
         self.input_mask: [feature.input_mask]
     }
     fetch = self.sess.run([self.summary, self.input_ids], feed_dict)
     summary = fetch[0].tolist()
     for i in range(len(summary)):
         for j in range(len(summary[0])):
             summary[i][j] = round(summary[i][j], 3)
     return summary
Example #8
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "sentence_pair": classifier_utils.SentencePairClassificationProcessor,
        "lcqmc_pair": classifier_utils.LCQMCPairClassificationProcessor,
        "lcqmc": classifier_utils.LCQMCPairClassificationProcessor,
        "spam": classifier_utils.SpamClassificationProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not (FLAGS.do_train or FLAGS.do_eval or FLAGS.do_predict
            or FLAGS.do_predict_raw or FLAGS.export_dir):
        raise ValueError(
            "At least one of `do_train`, `do_eval`, `do_predict`, `do_predict_raw` or `export_dir` "
            "must be True.")

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if FLAGS.do_predict_raw:
        print("***** Running single prediction*****")
        texts = FLAGS.text
        # workaround: the text data gets duplicated when FLAGS is read before the tf app runs
        texts = texts[0:int(len(texts) / 2)]
        print("text data: ", texts)
        # estimator.export_saved_model(FLAGS.output_dir, create_serving_input_receiver_fn(FLAGS.max_seq_length))
        from tensorflow.contrib import predictor
        import time
        start = time.process_time()
        predict_fn = predictor.from_saved_model(FLAGS.saved_model_dir)
        print("it took", time.process_time() - start, "to load model")
        start = time.process_time()
        input_ids_data, input_mask_data, segment_ids_data = [], [], []
        for i, t in enumerate(texts):
            guid = "predict_{}".format(i + 1)
            label = tokenization.convert_to_unicode("1")
            text_a = tokenization.convert_to_unicode(t)
            text_b = None
            example = classifier_utils.InputExample(guid=guid,
                                                    text_a=text_a,
                                                    text_b=text_b,
                                                    label=label)
            feature = classifier_utils.convert_single_example(
                0, example, label_list, FLAGS.max_seq_length, tokenizer)
            input_ids_data.append(feature.input_ids)
            input_mask_data.append(feature.input_mask)
            segment_ids_data.append(feature.segment_ids)

        features = collections.OrderedDict()
        features["input_ids"] = input_ids_data
        features["input_mask"] = input_mask_data
        features["segment_ids"] = segment_ids_data
        results = predict_fn(features)
        print(results)
        print("it took", time.process_time() - start, "to do prediction")
        return

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu.
    print("###tpu_cluster_resolver:", tpu_cluster_resolver)
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)  # TODO
        print("###length of total train_examples:", len(train_examples))
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = classifier_utils.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        train_file_exists = os.path.exists(train_file)
        print("###train_file_exists:", train_file_exists, " ;train_file:",
              train_file)
        if not train_file_exists:  # if the tf_record file does not exist, convert from the raw text file. # TODO
            classifier_utils.file_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(classifier_utils.PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
            eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = FLAGS.use_tpu
        eval_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt")

        def _best_trial_info():
            """Returns information about which checkpoints have been evaled so far."""
            if tf.gfile.Exists(best_trial_info_file):
                with tf.gfile.GFile(best_trial_info_file, "r") as best_info:
                    global_step, best_metric_global_step, metric_value = (
                        best_info.read().split(":"))
                    global_step = int(global_step)
                    best_metric_global_step = int(best_metric_global_step)
                    metric_value = float(metric_value)
            else:
                metric_value = -1
                best_metric_global_step = -1
                global_step = -1
            tf.logging.info(
                "Best trial info: Step: %s, Best Value Step: %s, "
                "Best Value: %s", global_step, best_metric_global_step,
                metric_value)
            return global_step, best_metric_global_step, metric_value

        def _remove_checkpoint(checkpoint_path):
            for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")

        if task_name == "sts-b":
            key_name = "pearson"
        elif task_name == "cola":
            key_name = "matthew_corr"
        else:
            key_name = "eval_accuracy"

        global_step, best_perf_global_step, best_perf = _best_trial_info()
        writer = tf.gfile.GFile(output_eval_file, "w")
        while global_step < num_train_steps:
            steps_and_files = {}
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                    if cur_filename.split("-")[-1] == "best":
                        continue
                    gstep = int(cur_filename.split("-")[-1])
                    if gstep not in steps_and_files:
                        tf.logging.info(
                            "Add {} to eval list.".format(cur_filename))
                        steps_and_files[gstep] = cur_filename
            tf.logging.info("found {} files.".format(len(steps_and_files)))
            if not steps_and_files:
                tf.logging.info(
                    "found 0 file, global step: {}. Sleeping.".format(
                        global_step))
                time.sleep(60)
            else:
                for checkpoint in sorted(steps_and_files.items()):
                    step, checkpoint_path = checkpoint
                    if global_step >= step:
                        if (best_perf_global_step != step
                                and len(_find_valid_cands(step)) > 1):
                            _remove_checkpoint(checkpoint_path)
                        continue
                    result = estimator.evaluate(
                        input_fn=eval_input_fn,
                        steps=eval_steps,
                        checkpoint_path=checkpoint_path)
                    global_step = result["global_step"]
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write("best = {}\n".format(best_perf))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        best_perf_global_step = global_step
                    elif len(_find_valid_cands(global_step)) > 1:
                        _remove_checkpoint(checkpoint_path)
                    writer.write("=" * 50 + "\n")
                    writer.flush()
                    with tf.gfile.GFile(best_trial_info_file,
                                        "w") as best_info:
                        best_info.write("{}:{}:{}".format(
                            global_step, best_perf_global_step, best_perf))
        writer.close()

        for ext in ["meta", "data-00000-of-00001", "index"]:
            src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext)
            tgt_ckpt = "model.ckpt-best.{}".format(ext)
            tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
            tf.io.gfile.rename(os.path.join(FLAGS.output_dir, src_ckpt),
                               os.path.join(FLAGS.output_dir, tgt_ckpt),
                               overwrite=True)

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(classifier_utils.PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = FLAGS.use_tpu
        predict_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result = estimator.predict(input_fn=predict_input_fn,
                                   checkpoint_path=checkpoint_path)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir,
                                          "submit_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\
            tf.gfile.GFile(output_submit_file, "w") as sub_writer:
            sub_writer.write("index" + "\t" + "prediction\n")
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, (example, prediction)) in\
                enumerate(zip(predict_examples, result)):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                pred_writer.write(output_line)

                if task_name != "sts-b":
                    actual_label = label_list[int(prediction["predictions"])]
                else:
                    actual_label = str(prediction["predictions"])
                sub_writer.write(example.guid + "\t" + actual_label + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

    if FLAGS.export_dir:
        tf.gfile.MakeDirs(FLAGS.export_dir)
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        tf.logging.info("Starting to export model.")
        subfolder = estimator.export_saved_model(
            export_dir_base=FLAGS.export_dir,
            serving_input_receiver_fn=_serving_input_receiver_fn,
            checkpoint_path=checkpoint_path)
        tf.logging.info("Model exported to %s.", subfolder)

        # convert the exported model as tflite model
        converter = tf.lite.TFLiteConverter.from_saved_model(
            subfolder)  # path to the SavedModel directory
        tflite_model = converter.convert()

        tflite_model_file = os.path.join(FLAGS.export_dir, "model.tflite")
        with tf.gfile.GFile(tflite_model_file, "w") as writer:
            writer.write(tflite_model)
        tf.logging.info("Convert exported model to %s.", tflite_model_file)
Example #9
def convert_examples_to_features(examples, all_labels, tokenize_fn):
    features = []
    for example in examples:
        feature = convert_single_example(example, tokenize_fn, all_labels)
        features.append(feature)
    return features
Example #10
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenize_fn, output_file,
    num_passes=1):
  """Convert a set of `InputExample`s to a TFRecord file."""

  # do not create duplicated records
  if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
    tf.logging.info("Do not overwrite tfrecord {} exists.".format(output_file))
    return

  tf.logging.info("Create new tfrecord {}.".format(output_file))

  writer = tf.python_io.TFRecordWriter(output_file)

  if num_passes > 1:
    examples *= num_passes

  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example {} of {}".format(ex_index,
                                                        len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenize_fn)

    def create_int_feature_alt(values):
      # handles plain lists as well as nested [list] values
      if isinstance(values, list) and isinstance(values[0], list):
        values = values[0]
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    def create_float_feature(values):
      # handles plain lists as well as nested [list] values
      if isinstance(values, list) and isinstance(values[0], list):
        values = values[0]
      f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
      return f

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_float_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    if label_list is not None:
      features["label_ids"] = create_int_feature(feature.label_ids)
    else:
      # regression labels are not supported in this variant
      raise NotImplementedError

    # added: normalize label_ids to a flat list of 100 ids
    if isinstance(feature.label_ids, list):
      if len(feature.label_ids) == 100:
        label_ids = feature.label_ids
      elif len(feature.label_ids[0]) == 100:
        label_ids = feature.label_ids[0]
      else:
        raise ValueError(
            "unexpected label_ids length: {}".format(len(feature.label_ids)))
    else:
      raise NotImplementedError
    features["label_ids"] = create_int_feature(label_ids)
    # end added

    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()
Example #11
 def convert_example(self, index, example, labels, max_seq_length,
                     tokenize_fn):
     """Converts a single `InputExample` into a single `InputFeatures`."""
     feature = convert_single_example(index, example, labels,
                                      max_seq_length, tokenize_fn)
     return feature