def read_data(dataset_path, from_list_to_examples, class_labels, max_seq_length, use_tpu, tokenizer):
    """
    Read dataset file and prepare feature list acceptable to estimators
    """
    
    df_src = pd.read_csv(dataset_path, sep='\t')
    
    input_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8', suffix='.tf_record')
    
    file_based_convert_examples_to_features(
            from_list_to_examples([list(df_src)] + df_src.values.tolist()),
            class_labels,
            max_seq_length, 
            tokenizer,
            input_file.name
        )
    
    input_fn = file_based_input_fn_builder(
            input_file=input_file.name,
            seq_length=max_seq_length,
            is_training=False,
            drop_remainder=True if use_tpu else False,
        )
    
    # The caller must keep a reference to the input_file object for the whole
    # prediction run, otherwise the temporary TFRecord file gets deleted.
    return df_src, input_file, input_fn
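A minimal usage sketch for read_data; every name here except read_data itself (the converter, label list, tokenizer, and estimator) is a placeholder assumed to exist in the caller's code. It illustrates why input_file must stay referenced until prediction finishes:

# Hypothetical caller; from_list_to_examples, label_list, tokenizer and
# estimator are placeholders for objects built elsewhere in the project.
df_src, input_file, input_fn = read_data(
    dataset_path='data/test.tsv',
    from_list_to_examples=from_list_to_examples,
    class_labels=label_list,
    max_seq_length=128,
    use_tpu=False,
    tokenizer=tokenizer)

# Keep input_file referenced here: once it is garbage-collected the
# temporary TFRecord is deleted, so close it only after predict() is done.
predictions = list(estimator.predict(input_fn=input_fn))
input_file.close()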
Example #2
def create_tfrecord_All(kmer):

    tsv_root = "DatasetAll/asTSV/" + str(kmer) + "kmer_tsv_data/"
    tfrecord_root = "DatasetAll/asTF_Record/" + str(kmer) + "kmer_tfrecord/"

    vocab_file = "vocab/vocab_" + str(kmer) + "kmer.txt"
    processor = ColaProcessor()
    label_list = processor.get_labels()

    examples = processor.fatma_get_train_examples_All(tsv_root)
    train_file = tfrecord_root + "train_All.tf_record"
    tokenizer = tokenization.FullTokenizer(
          vocab_file=vocab_file, do_lower_case=True)
    file_based_convert_examples_to_features(
            examples, label_list, 128, tokenizer, train_file)
Example #3
    def predict(self, dir_in, filename, dir_out):
        predict_examples = self.processor.get_examples_from(
            os.path.join(dir_in, filename))
        num_actual_predict_examples = len(predict_examples)
        if self.config.use_tpu:
            while len(predict_examples) % self.config.batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(
            dir_out, "{0}.tf_record".format(
                self.get_filename_without_extension(filename)))
        file_based_convert_examples_to_features(predict_examples,
                                                self.label_list,
                                                self.config.max_seq_length,
                                                self.tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", self.config.batch_size)

        predict_drop_remainder = True if self.config.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=self.config.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = self.estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(
            dir_out, "{0}_result.tsv".format(
                self.get_filename_without_extension(filename)))
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
Example #4
    def predict(self, X, y=None):
        predict_examples = X
        num_actual_predict_examples = len(predict_examples)
        if self.config.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % self.config.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(self.config.output_dir,
                                    "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, y,
                                                self.config.max_seq_length,
                                                self.tokenizer, predict_file)

        print("***** Running prediction*****")
        print("  Num examples = %d (%d actual, %d padding)",
              len(predict_examples), num_actual_predict_examples,
              len(predict_examples) - num_actual_predict_examples)
        print("  Batch size = %d", self.config.predict_batch_size)

        predict_drop_remainder = True if self.config.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=self.config.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = self.estimator.predict(input_fn=predict_input_fn)
        fulldata = []
        for (i, prediction) in enumerate(result):
            if i >= num_actual_predict_examples:
                break
            probs = [prob for prob in prediction["probabilities"]]
            data = []
            data.append(X[i].label)
            data.append(y[numpy.argsort(probs)[::-1][0]])
            data.append(X[i].text_a)
            data.extend(probs)
            fulldata.append(data)

        cols = ["true", "pred", "text"]
        cols.extend(y)
        df = pandas.DataFrame(data=fulldata, columns=cols)
        return df
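A hedged usage sketch of this scikit-learn-style wrapper; `clf` stands for an already trained instance of the surrounding class, the example texts and labels are made up for illustration, and InputExample is BERT's run_classifier class:

# Hypothetical usage; clf is assumed to be a trained instance of the class above.
examples = [
    InputExample(guid='ex-0', text_a='great movie', text_b=None, label='1'),
    InputExample(guid='ex-1', text_a='terrible plot', text_b=None, label='0'),
]
label_list = ['0', '1']
df_pred = clf.predict(examples, label_list)
print(df_pred[['true', 'pred', 'text']])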
Example #5
 def train(self, X, y):
     ## X is training examples, y is label list
     train_file = os.path.join(self.config.output_dir, "train.tf_record")
     file_based_convert_examples_to_features(X, y,
                                             self.config.max_seq_length,
                                             self.tokenizer, train_file)
     print("***** Running training *****")
     print("  Num examples = %d", len(X))
     print("  Batch size = %d", self.config.train_batch_size)
     print("  Num steps = %d", self.num_train_steps)
     train_input_fn = file_based_input_fn_builder(
         input_file=train_file,
         seq_length=self.config.max_seq_length,
         is_training=True,
         drop_remainder=True)
     self.estimator.train(input_fn=train_input_fn,
                          max_steps=self.num_train_steps)
Example #6
def fasta2record(input_file, output_file, vocab_file, step=1):
    # Takes a .fasta input_file, checks that it is well formed, splits each
    # sequence into k-mers of length `step`, writes the examples to a TFRecord,
    # and returns the number of sequences it contains.
    with open(input_file) as f:
        lines = f.readlines()
    print(lines)
    for index, line in enumerate(lines):
        print(line)
        if index % 2 == 0:
            if line[0] != ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
        else:
            if line[0] == ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
    seq_num = int(len(lines) / 2)
    with open("temp.tsv", "w") as f:
        for line in lines:
            if line[0] != ">":
                seq = ""
                line = line.strip()
                length = len(line)
                # step = 1
                for i in range(0, length, step):
                    if length - i >= step:
                        seq += line[i:i + step] + " "
                    else:
                        seq += line[i:] + " "
                seq += "\n"
                f.write("train\t1\t\t" + seq)
    processor = ColaProcessor()
    label_list = processor.get_labels()
    examples = processor.ljy_get_dev_examples("temp.tsv")
    train_file = "predict.tf_record"
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    file_based_convert_examples_to_features(examples, label_list, 128,
                                            tokenizer, train_file)
    return seq_num
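A hedged call example for the function above; the file names and the 3-mer vocabulary path are assumptions, and step=3 splits each sequence into 3-mers before conversion:

# Hypothetical usage: split each sequence into 3-mers and write the TFRecord.
num_sequences = fasta2record(
    input_file='sample.fasta',            # assumed FASTA path
    output_file='predict.tf_record',
    vocab_file='vocab/vocab_3kmer.txt',   # assumed 3-mer vocabulary
    step=3)
print('Converted %d sequences' % num_sequences)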
Example #7
    def evaluate(self, X, y):
        eval_examples = X
        num_actual_eval_examples = len(eval_examples)
        if self.config.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % self.config.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(self.config.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, y,
                                                self.config.max_seq_length,
                                                self.tokenizer, eval_file)

        print("***** Running evaluation *****")
        print("  Num examples = %d (%d actual, %d padding)",
              len(eval_examples), num_actual_eval_examples,
              len(eval_examples) - num_actual_eval_examples)
        print("  Batch size = %d", self.config.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if self.config.use_tpu:
            assert len(eval_examples) % self.config.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // self.config.eval_batch_size)

        eval_drop_remainder = True if self.config.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=self.config.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = self.estimator.evaluate(input_fn=eval_input_fn,
                                         steps=eval_steps)
        return result
Example #8
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                       do_lower_case=DO_LOWER_CASE)


# Converting training examples to features
print("################  Processing Training Data #####################")
TRAIN_TF_RECORD = os.path.join(OUTPUT_DIR, "train.tf_record")
train_examples = processor.get_train_examples(TASK_DATA_DIR)
num_train_examples = len(train_examples)
num_train_steps = int(num_train_examples / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
run_classifier.file_based_convert_examples_to_features(train_examples,
                                                       label_list,
                                                       MAX_SEQ_LENGTH,
                                                       tokenizer,
                                                       TRAIN_TF_RECORD)

# ## Creating Classification Model



def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    # BERT model instance
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
Example #9
def main():
    # bert
    bert_config_file = tempfile.NamedTemporaryFile(mode='w+t',
                                                   encoding='utf-8',
                                                   suffix='.json')
    bert_config_file.write(
        json.dumps(
            {k: str_to_value(v)
             for k, v in config['BERT-CONFIG'].items()}))
    bert_config_file.seek(0)  # [Note] rewind so the file is read from the beginning
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)
    latest_ckpt = latest_ckpt_model()
    # prefix shared by the model.ckpt-11052.index / model.ckpt-11052.meta files
    finetuned_model_path = latest_ckpt.split('.data-00000-of-00001')[0]
    flags = FLAGS(finetuned_model_path)
    processor = LivedoorProcessor()
    label_list = processor.get_labels()

    # sentencepiece
    tokenizer = tokenization.FullTokenizer(model_file=flags.model_file,
                                           vocab_file=flags.vocab_file,
                                           do_lower_case=flags.do_lower_case)

    # TPU is not used
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    # config
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=flags.master,
        model_dir=flags.output_dir,
        save_checkpoints_steps=flags.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=flags.iterations_per_loop,
            num_shards=flags.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=flags.init_checkpoint,
                                learning_rate=flags.learning_rate,
                                num_train_steps=flags.num_train_steps,
                                num_warmup_steps=flags.num_warmup_steps,
                                use_tpu=flags.use_tpu,
                                use_one_hot_embeddings=flags.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=flags.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=flags.train_batch_size,
        eval_batch_size=flags.eval_batch_size,
        predict_batch_size=flags.predict_batch_size)

    # fetch the test example collection
    predict_examples = processor.get_test_examples(flags.data_dir)
    predict_file = tempfile.NamedTemporaryFile(mode='w+t',
                                               encoding='utf-8',
                                               suffix='.tf_record')
    """Convert a set of `InputExample`s to a TFRecord file."""
    """出力: predict_file.name """
    # https://github.com/yoheikikuta/bert-japanese/blob/master/src/run_classifier.py#L371-L380
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            flags.max_seq_length, tokenizer,
                                            predict_file.name)
    predict_drop_remainder = True if flags.use_tpu else False

    # build the input_fn closure passed to the TPUEstimator
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file.name,
        seq_length=flags.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)
    # run inference
    result = estimator.predict(input_fn=predict_input_fn)
    result = list(result)

    # compute accuracy
    accracy(result, label_list)
Example #10
def main():
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    processor = MyProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=None,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=8,  # this parameter has no effect when no TPU is actually used
            per_host_input_for_training=is_per_host))

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    # train_examples has already been produced by the processor above
    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(eval_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            eval_file)
    eval_steps = None
    eval_drop_remainder = False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    for i in range(FLAGS.watch_times):
        num_train_steps = int((i + 1) * len(train_examples) /
                              FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        model_fn = model_fn_builder(bert_config=bert_config,
                                    num_labels=len(label_list),
                                    init_checkpoint=FLAGS.init_checkpoint,
                                    learning_rate=FLAGS.learning_rate,
                                    num_train_steps=num_train_steps,
                                    num_warmup_steps=num_warmup_steps,
                                    use_one_hot_embeddings=False)

        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config,
                                           params={"batch_size": 8})

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "a") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                writer.write("%s" % "The" + str(i) + "-th eval\n")
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write("\n")
Example #11
    def test_model(self, local_dir,
                       nb_epoch,
                       batch_size,
                       bucket_name):
        """
        Use the BERT Uncased language model to train on
        new data
        """
        tf.logging.set_verbosity(tf.logging.INFO)
        logging.info("*:BERT MODEL PATH:%s",BERT_MODEL_PATH)
        logging.info("*:Local Dir%s",local_dir)


        mod_name = self.model_name
        BERT_MODEL = 'uncased_L-12_H-768_A-12'
        BERT_PRETRAINED_DIR = BERT_MODEL_PATH
        OUTPUT_DIR = os.path.join(local_dir,'output_bert')
        DATA_DIR = os.path.join(local_dir,'data')
        logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
        logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
        logging.info('***** DATA directory: %s *****',DATA_DIR)
        TRAIN_BATCH_SIZE = 32
        EVAL_BATCH_SIZE = 8
        PREDICT_BATCH_SIZE = 32
        LEARNING_RATE = 2e-5
        NUM_TRAIN_EPOCHS = 3.0
        WARMUP_PROPORTION = 0.1
        MAX_SEQ_LENGTH = 128
        # Model configs
        # if you wish to fine-tune a model on a larger dataset, use a larger interval
        SAVE_CHECKPOINTS_STEPS = 1000
        # each checkpoint weighs about 1.5 GB
        ITERATIONS_PER_LOOP = 1000
        NUM_TPU_CORES = 8

        VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
        BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')
        with open(os.path.join(OUTPUT_DIR,'final_ckpt.txt')) as f:
            content = f.readlines()
            logging.info("***Final_cktp->%s\n",content)
        test_ckpt = content[0].split('/')[-1]
        INIT_CHECKPOINT = os.path.join(OUTPUT_DIR, test_ckpt)
        DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

        logging.info("Found VOCAB File:%s",VOCAB_FILE)
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
        tf.gfile.MakeDirs(OUTPUT_DIR)
        processor = run_classifier.ColaProcessor()
        label_list = processor.get_labels()
        tokenizer = tokenization.FullTokenizer(
            vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

        # Since prediction will run on CPU/GPU, we won't need a cluster resolver
        tpu_cluster_resolver = None
        # TPUEstimator also supports training on CPU and GPU. You don't need to define a separate tf.estimator.Estimator.
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=OUTPUT_DIR,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=ITERATIONS_PER_LOOP,
                num_shards=NUM_TPU_CORES,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

        train_examples = None
        num_train_steps = None
        num_warmup_steps = None
        

        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=INIT_CHECKPOINT,
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=False,  # If False training will fall on CPU or GPU, depending on what is available
            use_one_hot_embeddings=False) #Try with True

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,  # If False training will fall on CPU or GPU, depending on what is available
            model_fn=model_fn,
            config=run_config,
            train_batch_size=TRAIN_BATCH_SIZE,
            eval_batch_size=EVAL_BATCH_SIZE,
            predict_batch_size=PREDICT_BATCH_SIZE)
        
        predict_examples = processor.get_test_examples(DATA_DIR)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
        run_classifier.file_based_convert_examples_to_features(predict_examples, label_list,
                                                MAX_SEQ_LENGTH, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", batch_size)

        predict_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(OUTPUT_DIR, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
        s3 = boto3.resource('s3')
        tf.logging.info("Done with prediction uploading results to S3")
        try:
            s3.Bucket(bucket_name).upload_file(output_predict_file, output_predict_file)
        except Exception as err:
            logging.info("Unable to upload to S3")
            logging.info(err)


        return 1
Example #12
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": run_classifier.ColaProcessor,
        "mnli": run_classifier.MnliProcessor,
        "mrpc": run_classifier.MrpcProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        num_labels=len(label_list),
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        bert_hub_module_handle=FLAGS.bert_hub_module_handle)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_features = run_classifier.convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = run_classifier.input_fn_builder(
            features=train_features,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_features = run_classifier.convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = run_classifier.input_fn_builder(
            features=eval_features,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        if FLAGS.use_tpu:
            # Discard batch remainder if running on TPU
            n = len(predict_examples)
            predict_examples = predict_examples[:(
                n - n % FLAGS.predict_batch_size)]

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=FLAGS.use_tpu)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            for prediction in result:
                probabilities = prediction["probabilities"]
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
Example #13
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)

    processors = {
        "mana169": ManaProcessor169,
    }

    tokenization.validate_case_matches_checkpoint(DO_LOWER_CASE, INIT_CKPT)

    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)

    if MAX_SEQ_LENGTH > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (MAX_SEQ_LENGTH, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(OUTPUT_DIR)

    task_name = 'mana169'

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)

    tpu_cluster_resolver = None

    hooks = []
    # create a logging tensor hook because this takes forever on cpu
    logger = tf.train.LoggingTensorHook({"Input": "IteratorGetNext:0"},
                                        every_n_iter=1)
    hooks.append(logger)
    # debug_hook = tfdbg.LocalCLIDebugHook()
    # hooks.append(debug_hook)

    run_config = tf.contrib.tpu.RunConfig(cluster=tpu_cluster_resolver,
                                          model_dir=OUTPUT_DIR,
                                          save_checkpoints_steps=1)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=INIT_CKPT,
                                learning_rate=5e-5,
                                num_train_steps=None,
                                num_warmup_steps=None,
                                use_tpu=False,
                                use_one_hot_embeddings=False)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=PREDICT_BATCH_SIZE)

    input_file = sys.argv[1]

    predict_examples = read_input_examples(input_file)
    num_actual_predict_examples = len(predict_examples)
    # if FLAGS.use_tpu:
    #     # TPU requires a fixed batch size for all batches, therefore the number
    #     # of examples must be a multiple of the batch size, or else examples
    #     # will get dropped. So we pad with fake examples which are ignored
    #     # later on.
    #     while len(predict_examples) % FLAGS.predict_batch_size != 0:
    #         predict_examples.append(PaddingInputExample())

    predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            MAX_SEQ_LENGTH, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", PREDICT_BATCH_SIZE)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result = estimator.predict(input_fn=predict_input_fn, hooks=hooks)

    output_predict_file = os.path.join(OUTPUT_DIR, sys.argv[2])

    scores_list = []

    num_written_lines = 0
    tf.logging.info("***** Predict results *****")
    for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        if i >= num_actual_predict_examples:
            break
        # writer.write(output_line)
        scores_list.append(probabilities)
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples

    scores_array = np.array(scores_list)
    # write the scores in a useful form
    top3_scores = []
    all_topics = processor.get_labels()
    for i, row in enumerate(scores_array):
        top3_indices = row.argsort()[::-1][:3]
        # index 1, score 1, index 2, score 2, etc
        l = []
        # all columns except the original input text
        l += [input_df.values[i][j] for j in range(input_df.shape[1] - 1)]
        # take the original input text and strip newlines
        l.append(str(input_df.values[i][-1]).replace('\n', ''))
        for v in top3_indices:
            l.append(all_topics[v])
            l.append(row[v])
        top3_scores.append(l)

    score_df = pd.DataFrame(
        top3_scores,
        columns=list(input_df.columns.values) +
        ["Class 1", "Score 1", "Class 2", "Score 2", "Class 3", "Score 3"])
    score_df.to_csv(output_predict_file, index=None)
Example #14
def fasta2record(en_file, pr_file, output_train_file, vocab_file):
    # Takes two .fasta input files (en_file and pr_file), checks that both are
    # well formed, writes the paired examples to output_train_file as a
    # TFRecord, and returns the number of sequence pairs.
    with open(en_file) as f:
        lines = f.readlines()

    print(lines)

    for index, line in enumerate(lines):
        print(line)
        if index % 2 == 0:
            if line[0] != ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
        else:
            if line[0] == ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()

    seq_num = int(len(lines) / 2)

    with open("en_temp.tsv", "w") as f:
        for line in lines:
            if line[0] != ">":
                seq = ""
                length = len(line.strip())
                for i in range(length):
                    seq += line[i] + " "
                seq += "\n"
                f.write("train\t1\t\t" + seq)

    with open(pr_file) as f:
        lines = f.readlines()

    print(lines)

    for index, line in enumerate(lines):
        print(line)
        if index % 2 == 0:
            if line[0] != ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
        else:
            if line[0] == ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()

    with open("pr_temp.tsv", "w") as f:
        for line in lines:
            if line[0] != ">":
                seq = ""
                length = len(line.strip())
                for i in range(length):
                    seq += line[i] + " "
                seq += "\n"
                f.write("train\t1\t\t" + seq)

    processor = ColaProcessor()
    label_list = processor.get_labels()
    examples = processor.fatma_get_dev_examples_predict("en_temp.tsv", "pr_temp.tsv")
    #train_file = "predict.tf_record"
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)
    file_based_convert_examples_to_features(
        examples, label_list, 128, tokenizer, output_train_file)
    return seq_num
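A hedged call example for the paired variant above; the file names are assumptions, and the en_/pr_ prefixes simply denote the two paired FASTA inputs:

# Hypothetical usage: pair the two FASTA files into a single TFRecord.
num_pairs = fasta2record(
    en_file='en_sequences.fasta',          # assumed first FASTA file
    pr_file='pr_sequences.fasta',          # assumed second FASTA file
    output_train_file='predict.tf_record',
    vocab_file='vocab/vocab_1kmer.txt')    # assumed 1-mer vocabulary
print('%d sequence pairs written' % num_pairs)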
Example #15
    def traineval_model(self, local_dir,
                       nb_epoch,
                       batch_size):
        """
        Use the BERT Uncased language model to train on
        new data
        """
        tf.logging.set_verbosity(tf.logging.INFO)
        logging.info("*:BERT MODEL PATH:%s",BERT_MODEL_PATH)
        logging.info("*:Local Dir%s",local_dir)


        mod_name = self.model_name
        BERT_MODEL = 'uncased_L-12_H-768_A-12'
        BERT_PRETRAINED_DIR = BERT_MODEL_PATH
        OUTPUT_DIR = os.path.join(local_dir,'output_bert')
        DATA_DIR = os.path.join(local_dir,'data')
        logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
        logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
        logging.info('***** DATA directory: %s *****',DATA_DIR)
        TRAIN_BATCH_SIZE = 32
        EVAL_BATCH_SIZE = 8
        LEARNING_RATE = 2e-5
        NUM_TRAIN_EPOCHS = 3.0
        WARMUP_PROPORTION = 0.1
        MAX_SEQ_LENGTH = 128
        # Model configs
        # if you wish to fine-tune a model on a larger dataset, use a larger interval
        SAVE_CHECKPOINTS_STEPS = 1000
        # each checkpoint weighs about 1.5 GB
        ITERATIONS_PER_LOOP = 1000
        NUM_TPU_CORES = 8

        VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
        BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')
        INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
        DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

        logging.info("Found VOCAB File:%s",VOCAB_FILE)
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
        tf.gfile.MakeDirs(OUTPUT_DIR)
        processor = run_classifier.ColaProcessor()
        label_list = processor.get_labels()
        tokenizer = tokenization.FullTokenizer(
            vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

        # Since training will happen on GPU, we won't need a cluster resolver
        tpu_cluster_resolver = None
        # TPUEstimator also supports training on CPU and GPU. You don't need to define a separate tf.estimator.Estimator.
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=OUTPUT_DIR,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=ITERATIONS_PER_LOOP,
                num_shards=NUM_TPU_CORES,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

        train_examples = None
        num_train_steps = None
        num_warmup_steps = None
        train_examples = processor.get_train_examples(DATA_DIR)
        num_train_steps = int(
            len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=INIT_CHECKPOINT,
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=False,  # If False training will fall on CPU or GPU, depending on what is available
            use_one_hot_embeddings=False) #Try with True

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,  # If False training will fall on CPU or GPU, depending on what is available
            model_fn=model_fn,
            config=run_config,
            train_batch_size=TRAIN_BATCH_SIZE,
            eval_batch_size=EVAL_BATCH_SIZE)

        # Train the model.
        logging.info('Starting Training...')
        train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
        tf.logging.info('***** Started training at {} *****'.format(datetime.datetime.now()))
        tf.logging.info('  Num examples = {}'.format(len(train_examples)))
        tf.logging.info('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        final_ckpt = estimator.latest_checkpoint()
        print('***** Finished training at {} *****'.format(datetime.datetime.now()))
        logging.info("*****Final Checkpoint*****%s",final_ckpt)
        final_ckpt_file = os.path.join(OUTPUT_DIR, "final_ckpt.txt")
        with tf.gfile.GFile(final_ckpt_file, "w") as writer:
            writer.write("%s" % final_ckpt)


        # Do Eval
        logging.info('Starting Eval..')
        eval_examples = processor.get_dev_examples(DATA_DIR)
        num_actual_eval_examples = len(eval_examples)
        eval_file = os.path.join(OUTPUT_DIR, "eval.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", TRAIN_BATCH_SIZE)
        eval_steps = None

        eval_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=False)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
          tf.logging.info("***** Eval results *****")
          for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
        
        return result
Example #16
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    num_iter = 1
    jit_xla = tf.OptimizerOptions.ON_1 if FLAGS.xla else 0

    processors = {
        "cola": rc.ColaProcessor,
        "mnli": rc.MnliProcessor,
        "mrpc": rc.MrpcProcessor,
        "xnli": rc.XnliProcessor,
    }

    # sanity check
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    bert_config = my_modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    tf.gfile.MakeDirs(FLAGS.output_dir)
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    # prepare data
    processor = processors[task_name]()
    label_list = processor.get_labels()
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    rc.file_based_convert_examples_to_features(predict_examples, label_list,
                                               FLAGS.max_seq_length, tokenizer,
                                               predict_file)

    # get model function and input function
    # drop_remainder option should be turned on for fast transformer inference
    drop_remainder = True
    predict_input_fn = rc.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=drop_remainder)

    def graph_fn():
        model_fn = model_fn_builder(bert_config=bert_config)
        dataset = predict_input_fn({'batch_size': FLAGS.predict_batch_size})
        next_item = dataset.make_one_shot_iterator().get_next()
        output_var = model_fn(next_item)
        return output_var, next_item

    if FLAGS.tf_profile:
        tf.logging.info("***** Running tensorflow transformer*****")
        p1 = profile_util.Profiler(os.path.join(
            FLAGS.output_dir, 'prof/bert_origin'))
        t1, r1 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, p1, init_checkpoint=FLAGS.init_checkpoint)
        tf.reset_default_graph()

        my_modeling.transformer_model = fiu.fast_transformer_model_trans
        tf.logging.info("***** Running fast transformer*****")
        p2 = profile_util.Profiler(os.path.join(
            FLAGS.output_dir, 'prof/bert_fastinfer'))
        t2, r2 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, p2, init_checkpoint=FLAGS.init_checkpoint)

    else:
        tf.logging.info("***** Running tensorflow transformer*****")
        t1, r1 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, check_result=False, init_checkpoint=FLAGS.init_checkpoint, 
            export_path='./export_default_{}/{}/model.savedmodel/'.format(FLAGS.max_seq_length, FLAGS.predict_batch_size))
        tf.reset_default_graph()

        my_modeling.transformer_model = fiu.fast_transformer_model_trans
        tf.logging.info("***** Running fast transformer*****")
        t2, r2 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, check_result=False, init_checkpoint=FLAGS.init_checkpoint, 
            export_path='./export_ft_{}/{}/model.savedmodel/'.format(FLAGS.max_seq_length, FLAGS.predict_batch_size))

    print('average time (seconds) elapsed, original tensorflow:', t1)
    print('average time (seconds) elapsed, fast transformer:', t2)
    if len(r1) + len(r2) > 0:
        check_res = np.asarray([np.allclose(
            r1[i], r2[i], atol=1e-4, rtol=0) for i in range(num_iter)])
        if check_res.all():
            print('Pass')
            print(np.mean(r1))
            print(np.mean(r2))
        else:
            for i in np.where(np.logical_not(check_res))[0]:
                diff = np.fabs(r1[i] - r2[i])
                idx = np.unravel_index(diff.argmax(), diff.shape)
                print('Failed iter:', i, "max diff:",
                      diff[idx], idx, r1[i][idx], r2[i][idx])
Example #17
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                       do_lower_case=DO_LOWER_CASE)


def convert_input(x):
    return run_classifier.InputExample(guid=x["id"],
                                       text_a=x["comment_text"],
                                       text_b=None,
                                       label=0)


test_InputExamples = data.apply(convert_input, axis=1)

run_classifier.file_based_convert_examples_to_features(test_InputExamples,
                                                       [0, 1], MAX_SEQ_LENGTH,
                                                       tokenizer, TEST_FILE)

NUM_TRAIN_STEPS = 0  # int(len(test_InputExamples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
NUM_WARMUP_STEPS = 0  # int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)

test_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=TEST_FILE,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)

tpu_config = tf.contrib.tpu.TPUConfig(iterations_per_loop=ITERATIONS_PER_LOOP,
                                      num_shards=NUM_TPU_CORES,
Example #18
# coding:utf-8

from run_classifier import ColaProcessor
import tokenization
from run_classifier import file_based_convert_examples_to_features

data_name = "Legionellapneumophilatmp"
input_root = "./dataset/1kmer_tsv_data/"
output_root = "./dataset/1kmer_tfrecord/"
vocab_file = "./vocab/vocab_1kmer.txt"

processor = ColaProcessor()
label_list = processor.get_labels()
examples = processor.get_dev_examples(input_root + data_name + "/")
train_file = output_root + data_name + "/dev.tf_record"
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=True)
file_based_convert_examples_to_features(examples, label_list, 128, tokenizer,
                                        train_file)

examples = processor.get_train_examples(input_root + data_name + "/")
train_file = output_root + data_name + "/train.tf_record"
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=True)
file_based_convert_examples_to_features(examples, label_list, 128, tokenizer,
                                        train_file)
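The dev and train conversions above repeat the same steps with only the split name changing; a small refactoring sketch (the helper name convert_split is an assumption) that writes the same two TFRecord files while reusing the processor, label_list and tokenizer already defined:

def convert_split(split):
    # split is either 'dev' or 'train'; reuses processor, label_list and
    # tokenizer from the script above.
    get_examples = (processor.get_dev_examples if split == 'dev'
                    else processor.get_train_examples)
    examples = get_examples(input_root + data_name + "/")
    record_file = output_root + data_name + "/" + split + ".tf_record"
    file_based_convert_examples_to_features(examples, label_list, 128,
                                            tokenizer, record_file)


for split in ("dev", "train"):
    convert_split(split)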
Example #19
def mrpc_classifier(sent_list1, sent_list2, args):
    TUNED_MODEL_DIR = args.tuned_model_dir

    config = {
        "task_name": 'MRPC',
        "do_predict": True,
        "vocab_file": f"{TUNED_MODEL_DIR}/vocab.txt",
        "bert_config_file": f"{TUNED_MODEL_DIR}/bert_config.json",
        "init_checkpoint": f"{TUNED_MODEL_DIR}",
        "max_seq_length": 128,
        "output_dir": f"{TUNED_MODEL_DIR}",
        "do_lower_case": True,
        "predict_batch_size": 8
    }

    bert_config = modeling.BertConfig.from_json_file(
        config["bert_config_file"])
    processor = run_classifier.MrpcProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config["vocab_file"], do_lower_case=config["do_lower_case"])

    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=None,
        model_dir=config["output_dir"],
        save_checkpoints_steps=1000,
    )

    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=config["init_checkpoint"],
        learning_rate=5e-5,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,
        use_one_hot_embeddings=False)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=config["predict_batch_size"])

    predict_examples = get_predict_examples(sent_list1, sent_list2)
    num_actual_predict_examples = len(predict_examples)

    predict_file = os.path.join(config["output_dir"], "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, config["max_seq_length"], tokenizer,
        predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", config["predict_batch_size"])

    predict_drop_remainder = False
    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=config["max_seq_length"],
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result = estimator.predict(input_fn=predict_input_fn)

    # probabilities of the positive class (label "1") for each sentence pair
    probabilities = [
        prediction["probabilities"][1] for prediction in result
    ]
    return probabilities
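A hedged usage example; the args namespace and the model directory are hypothetical, and the returned list holds one positive-class probability per sentence pair:

from argparse import Namespace

# Hypothetical invocation; tuned_model_dir must contain vocab.txt,
# bert_config.json and a checkpoint fine-tuned on MRPC (path is an assumption).
args = Namespace(tuned_model_dir='./mrpc_finetuned')
probs = mrpc_classifier(
    ['The company posted record profits.'],
    ['Record profits were reported by the company.'],
    args)
print(probs)  # one probability per pair, e.g. [0.93]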