Code Example #1
File: md_tasks.py  Project: erayyildiz/electra
def test_MdExample():
    tokenizer = tokenization.FullTokenizer(vocab_file=DATA_DIR + 'vocab.txt',
                                           do_lower_case=True)

    words = ['Refah', 'da', 'Türkçe', 'için', 'görüş', 'istedi']
    analyzes = [['Refah+Noun+Prop+A3sg+Pnon+Nom', 'refah+Noun+A3sg+Pnon+Nom'],
                ['da+Conj'],
                [
                    'Türkçe+Noun+Prop+A3sg+Pnon+Nom', 'türkçe+Adj',
                    'türk+Noun+A3sg+Pnon+Equ', 'türk+Adj^DB+Adverb+Ly',
                    'türk+Adj^DB+Adj+AsIf'
                ],
                [
                    'için+Postp+PCNom', 'iç+Noun+A3sg+P2sg+Nom',
                    'iç+Noun+A3sg+Pnon+Gen', 'iç+Verb+Pos+Imp+A2pl'
                ],
                [
                    'görüş+Noun+A3sg+Pnon+Nom', 'gör+Verb+Recip+Pos+Imp+A2sg',
                    'gör+Verb+Pos^DB+Noun+Inf3+A3sg+Pnon+Nom',
                    'görüş+Verb+Pos+Imp+A2sg'
                ], ['iste+Verb+Pos+Past+A3sg']]

    example = MdExample(0, 'md', words, analyzes, tokenizer)
    print('candidate_roots', example.candidate_roots)
    print('candidate_tags', example.candidate_tags)
    print('word_ids', example.word_ids)
    print('sub_tokens', example.sub_tokens)
    print('words', example.words)
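
Each entry in analyzes is a '+'-separated morphological analysis whose first field is the candidate root and the remaining fields are tags. A rough, hypothetical illustration of that split (the real parsing inside MdExample may differ):

# Hypothetical illustration only; MdExample's actual parsing may differ.
analysis = 'Refah+Noun+Prop+A3sg+Pnon+Nom'
root, *tags = analysis.split('+')
print(root)   # Refah
print(tags)   # ['Noun', 'Prop', 'A3sg', 'Pnon', 'Nom']
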
Code Example #2
File: task_builder.py  Project: MonsoonNLP/electra
def get_tasks(config: configure_finetuning.FinetuningConfig):
    tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
                                           do_lower_case=config.do_lower_case)
    return [
        get_task(config, task_name, tokenizer)
        for task_name in config.task_names
    ]
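
A minimal invocation sketch (hypothetical; it assumes an ELECTRA-style FinetuningConfig whose constructor takes a model name and data directory and which exposes vocab_file, do_lower_case, and task_names as used above):

# Hypothetical usage sketch; the FinetuningConfig arguments are assumptions.
config = configure_finetuning.FinetuningConfig(
    "electra_base", "data/", task_names=["cola"])
tasks = get_tasks(config)
print([type(t).__name__ for t in tasks])
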
Code Example #3
def build_token_synonym(text, vocab_file, do_lower_case):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    tokens = tokenizer.tokenize(text)

    token_synonym = {}
    for vocab in tokenizer.vocab.keys():
        token_synonym[vocab] = []
    src = True
    for token in tokens:
        if src:
            src_token = token
            src = False
        elif token == '_':
            continue
        elif token == '/':
            src = True
            continue
        else:
            # append synonym token to corresponding src_token
            if token not in token_synonym[src_token]:
                token_synonym[src_token].append(token)
            # reverse-wise appending
            if src_token not in token_synonym[token]:
                token_synonym[token].append(src_token)

    return token_synonym
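
The loop above assumes the tokenized text alternates a source token, an '_' separator, its synonyms, and a '/' terminator (src _ syn syn ... / src _ syn / ...). A self-contained sketch of just the pairing logic on a hypothetical, already-tokenized stream:

# Hypothetical input; no vocab file or tokenizer needed for the illustration.
tokens = ['happy', '_', 'glad', 'joyful', '/', 'sad', '_', 'unhappy', '/']
token_synonym = {}
src = True
for token in tokens:
    if src:
        src_token = token
        src = False
    elif token == '_':
        continue
    elif token == '/':
        src = True
        continue
    else:
        token_synonym.setdefault(src_token, []).append(token)
        token_synonym.setdefault(token, []).append(src_token)
print(token_synonym)
# {'happy': ['glad', 'joyful'], 'glad': ['happy'], 'joyful': ['happy'],
#  'sad': ['unhappy'], 'unhappy': ['sad']}
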
Code Example #4
def get_vocab(config):
  """Memoized load of the vocab file."""
  if config.vocab_file not in VOCAB_MAPPING:
    vocab = tokenization.FullTokenizer(
        config.vocab_file, do_lower_case=True).vocab
    VOCAB_MAPPING[config.vocab_file] = vocab
  return VOCAB_MAPPING[config.vocab_file]
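
The same memoized-load pattern in isolation (hypothetical names; VOCAB_MAPPING above is assumed to be a module-level dict):

# Minimal sketch of memoization, independent of the tokenizer.
_CACHE = {}

def load_once(path):
    if path not in _CACHE:
        print('loading', path)                    # expensive work runs only once
        _CACHE[path] = {'[PAD]': 0, '[UNK]': 1}   # stand-in for the real vocab
    return _CACHE[path]

load_once('vocab.txt')   # loads
load_once('vocab.txt')   # served from the cache
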
Code Example #5
 def __init__(
     self,
     job_id,
     vocab_file,
     output_dir,
     max_seq_length,
     num_jobs,
     blanks_separate_docs,
     do_lower_case,
     num_out_files=1000,
 ):
     self._blanks_separate_docs = blanks_separate_docs
     tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                            do_lower_case=do_lower_case)
     self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
     self._writers = []
     for i in range(num_out_files):
         if i % num_jobs == job_id:
             output_fname = os.path.join(
                 output_dir,
                 'pretrain_data.tfrecord-{:}-of-{:}'.format(
                     i, num_out_files),
             )
             self._writers.append(tf.io.TFRecordWriter(output_fname))
     self.n_written = 0
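
The i % num_jobs == job_id test shards the num_out_files output files across parallel jobs so each job writes a disjoint subset. A quick check with hypothetical numbers:

# Hypothetical sharding check: job 3 of 8 jobs, 1000 output files.
job_id, num_jobs, num_out_files = 3, 8, 1000
owned = [i for i in range(num_out_files) if i % num_jobs == job_id]
print(len(owned), owned[:5])   # 125 [3, 11, 19, 27, 35]
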
Code Example #6
    def write_classification_outputs(
            self, tasks, trial, split,
            config: configure_finetuning.FinetuningConfig):
        """Write classification predictions to disk."""
        utils.log("Writing out predictions for", tasks, split)
        predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
        results = self._estimator.predict(input_fn=predict_input_fn,
                                          yield_single_examples=True)
        # task name -> eid -> model-logits
        logits = collections.defaultdict(dict)
        for r in results:
            if r["task_id"] != len(self._tasks):
                r = utils.nest_dict(r, self._config.task_names)
                task_name = self._config.task_names[r["task_id"]]
                logits[task_name][r[task_name]["eid"]] = (
                    r[task_name]["eid"],
                    r[task_name]["input_ids"],
                    r[task_name]["input_mask"],
                    r[task_name]["token_type_ids"],
                    r[task_name]["logits"]
                    if "logits" in r[task_name] else None,
                    r[task_name]["predictions"],
                    r[task_name]["label_ids"] if "label_ids" in r[task_name]
                    else r[task_name]['targets'],
                )

        print('[RESULT]')

        tokenizer = tokenization.FullTokenizer(
            vocab_file=config.vocab_file, do_lower_case=config.do_lower_case)

        for task_name in logits:
            utils.log(
                "Saving Dev Error Analysis for {:} {:} examples ({:})".format(
                    len(logits[task_name]), task_name, split))
            if trial <= self._config.n_writes_test:
                print('Write to: ' +
                      self._config.dev_analysis(task_name, split, trial))
                with open(self._config.dev_analysis(task_name, split, trial),
                          'w',
                          encoding='utf-8') as fout:
                    fout.write('ID\tINPUT\tLOGITS\tPREDICTION\tLABEL\n')
                    for eid in logits[task_name]:
                        print('=>' + str(eid))
                        (_, input_id, input_mask, token_type_id, logit,
                         prediction, label_id) = logits[task_name][eid]
                        input_tokens = tokenizer.convert_ids_to_tokens(
                            input_id)
                        input_tokens = filter(lambda x: x != '[PAD]',
                                              input_tokens)
                        input_tokens = ' '.join(input_tokens)

                        fout.write(
                            str(eid) + '\t' + str(input_tokens) + '\t' +
                            str(logit) + '\t' + str(prediction) + '\t' +
                            str(label_id) + '\n')
                        print('Inputs: ' + str(input_tokens) + ', Logits: ' +
                              str(logit) + ', Predictions: ' +
                              str(prediction) + ', Labels: ' + str(label_id))
Code Example #7
 def __init__(self, batch_size=args.batch_size):
     self.mode = None
     self.max_seq_length = args.max_seq_len
     self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
     self.batch_size = batch_size
     self.estimator = None
     self.processor = SimProcessor()
     tf.logging.set_verbosity(tf.logging.INFO)
Code Example #8
    def __init__(self, config: configure_pretraining.PretrainingConfig,
                 features, ratio, is_training):
        # Set up model config
        self._config = config
        self._bert_config = training_utils.get_bert_config(config)

        embedding_size = (self._bert_config.hidden_size
                          if config.embedding_size is None else
                          config.embedding_size)

        tokenizer = tokenization.FullTokenizer(
            config.vocab_file, do_lower_case=config.do_lower_case)
        self._vocab = tokenizer.vocab
        self._inv_vocab = tokenizer.inv_vocab

        # Mask the input
        inputs = pretrain_data.features_to_inputs(features)
        # Load ratio
        with tf.variable_scope("rw_masking"):
            with tf.variable_scope("ratio"):
                self.ratios = tf.constant(ratio)
                action_prob = tf.nn.embedding_lookup(self.ratios,
                                                     inputs.input_ids)

        log_q, masked_inputs = self._sample_masking_subset(inputs, action_prob)

        # BERT model
        model = self._build_transformer(masked_inputs,
                                        is_training,
                                        reuse=tf.AUTO_REUSE,
                                        embedding_size=embedding_size)
        mlm_output = self._get_masked_lm_output(masked_inputs, model)
        self.total_loss = mlm_output.loss

        # Evaluation
        eval_fn_inputs = {
            "input_ids": masked_inputs.input_ids,
            "masked_lm_preds": mlm_output.preds,
            "mlm_loss": mlm_output.per_example_loss,
            "masked_lm_ids": masked_inputs.masked_lm_ids,
            "masked_lm_weights": masked_inputs.masked_lm_weights,
            "input_mask": masked_inputs.input_mask
        }
        eval_fn_keys = eval_fn_inputs.keys()
        eval_fn_values = [eval_fn_inputs[k] for k in eval_fn_keys]
        """Computes the loss and accuracy of the model."""
        d = {k: arg for k, arg in zip(eval_fn_keys, eval_fn_values)}
        metrics = dict()
        metrics["masked_lm_accuracy"] = tf.metrics.accuracy(
            labels=tf.reshape(d["masked_lm_ids"], [-1]),
            predictions=tf.reshape(d["masked_lm_preds"], [-1]),
            weights=tf.reshape(d["masked_lm_weights"], [-1]))
        metrics["masked_lm_loss"] = tf.metrics.mean(
            values=tf.reshape(d["mlm_loss"], [-1]),
            weights=tf.reshape(d["masked_lm_weights"], [-1]))
        self.eval_metrics = metrics
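
The tf.nn.embedding_lookup(self.ratios, inputs.input_ids) call gathers a per-vocabulary-id masking score for every token position. A NumPy sketch of the same gather with a hypothetical ratio table:

import numpy as np

ratios = np.array([0.0, 0.1, 0.5, 0.9, 0.2])   # hypothetical score per vocab id
input_ids = np.array([[1, 3, 4],
                      [2, 2, 0]])
action_prob = ratios[input_ids]                # same gather as embedding_lookup
print(action_prob)
# [[0.1 0.9 0.2]
#  [0.5 0.5 0. ]]
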
Code Example #9
 def __init__(self, input_fname, vocab_file, output_dir, max_seq_length,
              blanks_separate_docs, do_lower_case):
   self._blanks_separate_docs = blanks_separate_docs
   tokenizer = tokenization.FullTokenizer(
       vocab_file=vocab_file,
       do_lower_case=do_lower_case)
   self._example_builder = build_pretraining_dataset.ExampleBuilder(tokenizer, max_seq_length)
   output_fname = os.path.join(output_dir, "{}.tfrecord".format(input_fname.split("/")[-1]))
   self._writer = tf.io.TFRecordWriter(output_fname)
   self.n_written = 0
Code Example #10
def prepare_pretraining_data(input_file,
                             output_file,
                             vocab_file,
                             do_lower_case=True,
                             random_seed=42,
                             max_seq_length=128,
                             max_predictions_per_seq=20,
                             short_seq_prob=0.1,
                             masked_lm_prob=0.15,
                             dupe_factor=5):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    input_files = []
    for input_pattern in input_file.split(','):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info('*** Reading from input files ***')
    for input_file in input_files:
        tf.logging.info('  %s', input_file)

    rng = random.Random(random_seed)
    instances = create_training_instances(
        input_files,
        tokenizer,
        max_seq_length,
        dupe_factor,
        short_seq_prob,
        masked_lm_prob,
        max_predictions_per_seq,
        rng,
    )

    output_files = output_file.split(',')
    tf.logging.info('*** Writing to output files ***')
    for output_file in output_files:
        tf.logging.info('  %s', output_file)

    write_instance_to_example_files(
        instances,
        tokenizer,
        max_seq_length,
        max_predictions_per_seq,
        output_files,
    )
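
A minimal call sketch for the function above (hypothetical paths; it assumes create_training_instances and write_instance_to_example_files are importable from the same module):

# Hypothetical invocation; file paths are placeholders.
prepare_pretraining_data(
    input_file='corpus/*.txt',
    output_file='pretrain.tfrecord',
    vocab_file='vocab.txt',
    max_seq_length=128,
    dupe_factor=5)
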
Code Example #11
    def test_full_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing", ","
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            if six.PY2:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
            else:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                            ]).encode("utf-8"))

            vocab_file = vocab_writer.name

        tokenizer = tokenization.FullTokenizer(vocab_file)
        os.unlink(vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertAllEqual(tokens,
                            ["un", "##want", "##ed", ",", "runn", "##ing"])

        self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                            [7, 4, 5, 10, 8, 9])
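
The expected ids are simply each token's index in vocab_tokens, since FullTokenizer assigns ids in vocab-file order; a quick check:

vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
                "runn", "##ing", ","]
tokens = ["un", "##want", "##ed", ",", "runn", "##ing"]
print([vocab_tokens.index(t) for t in tokens])   # [7, 4, 5, 10, 8, 9]
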
Code Example #12
    def __init__(
        self,
        job_id,
        vocab_file,
        output_dir,
        max_seq_length,
        num_jobs,
        blanks_separate_docs,
        do_lower_case,
        tokenizer_type,
        num_out_files=500,
    ):
        self._blanks_separate_docs = blanks_separate_docs

        if tokenizer_type == "mecab_wordpiece":
            tokenizer = KoNLPyBertTokenizer(
                konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(),
                                                          use_tag=False),
                vocab_file=vocab_file,
                do_lower_case=do_lower_case,
            )
        elif tokenizer_type == "wordpiece":
            tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                   do_lower_case=do_lower_case)
        else:
            raise ValueError("Unsupported tokenizer_type: " + str(tokenizer_type))
        self._example_builder = ExampleBuilder(tokenizer, max_seq_length,
                                               tokenizer_type)
        self._writers = []
        for i in range(num_out_files):
            if i % num_jobs == job_id:
                output_fname = os.path.join(
                    output_dir,
                    "pretrain_data.tfrecord-{:}-of-{:}".format(
                        i, num_out_files),
                )
                self._writers.append(tf.io.TFRecordWriter(output_fname))
        self.n_written = 0
Code Example #13
    def __init__(self, config: configure_pretraining.PretrainingConfig,
                 features, is_training):
        # Set up model config
        self._config = config
        self._bert_config = training_utils.get_bert_config(config)
        self._teacher_config = training_utils.get_teacher_config(config)

        embedding_size = (self._bert_config.hidden_size
                          if config.embedding_size is None else
                          config.embedding_size)

        tokenizer = tokenization.FullTokenizer(
            config.vocab_file, do_lower_case=config.do_lower_case)
        self._vocab = tokenizer.vocab
        self._inv_vocab = tokenizer.inv_vocab

        # Mask the input
        inputs = pretrain_data.features_to_inputs(features)
        old_model = self._build_transformer(inputs,
                                            is_training,
                                            embedding_size=embedding_size)
        input_states = old_model.get_sequence_output()
        input_states = tf.stop_gradient(input_states)

        teacher_output = self._build_teacher(input_states,
                                             inputs,
                                             is_training,
                                             embedding_size=embedding_size)
        # calculate the proposal distribution

        action_prob = teacher_output.action_probs  #pi(x_i)

        coin_toss = tf.random.uniform([])
        log_q, masked_inputs = self._sample_masking_subset(inputs, action_prob)
        if config.masking_strategy == pretrain_helpers.MIX_ADV_STRATEGY:
            random_masked_input = pretrain_helpers.mask(
                config, pretrain_data.features_to_inputs(features),
                config.mask_prob)
            B, L = modeling.get_shape_list(inputs.input_ids)
            N = config.max_predictions_per_seq
            strategy_prob = tf.random.uniform([B])
            strategy_prob = tf.expand_dims(
                tf.cast(tf.greater(strategy_prob, 0.5), tf.int32), 1)
            l_strategy_prob = tf.tile(strategy_prob, [1, L])
            n_strategy_prob = tf.tile(strategy_prob, [1, N])
            mix_input_ids = masked_inputs.input_ids * l_strategy_prob + random_masked_input.input_ids * (
                1 - l_strategy_prob)
            mix_masked_lm_positions = masked_inputs.masked_lm_positions * n_strategy_prob + random_masked_input.masked_lm_positions * (
                1 - n_strategy_prob)
            mix_masked_lm_ids = masked_inputs.masked_lm_ids * n_strategy_prob + random_masked_input.masked_lm_ids * (
                1 - n_strategy_prob)
            n_strategy_prob = tf.cast(n_strategy_prob, tf.float32)
            mix_masked_lm_weights = masked_inputs.masked_lm_weights * n_strategy_prob + random_masked_input.masked_lm_weights * (
                1 - n_strategy_prob)
            mix_masked_inputs = pretrain_data.get_updated_inputs(
                inputs,
                input_ids=tf.stop_gradient(mix_input_ids),
                masked_lm_positions=mix_masked_lm_positions,
                masked_lm_ids=mix_masked_lm_ids,
                masked_lm_weights=mix_masked_lm_weights,
                tag_ids=inputs.tag_ids)
            masked_inputs = mix_masked_inputs

        # BERT model
        model = self._build_transformer(masked_inputs,
                                        is_training,
                                        reuse=tf.AUTO_REUSE,
                                        embedding_size=embedding_size)
        mlm_output = self._get_masked_lm_output(masked_inputs, model)
        self.total_loss = mlm_output.loss

        # Teacher reward is the -log p(x_S|x;B)
        reward = tf.stop_gradient(
            tf.reduce_mean(mlm_output.per_example_loss, 1))
        self._baseline = tf.reduce_mean(reward, -1)
        self._std = tf.math.reduce_std(reward, -1)

        # Calculate teacher loss
        def compute_teacher_loss(log_q, reward, baseline, std):
            advantage = tf.abs((reward - baseline) / std)
            advantage = tf.stop_gradient(advantage)
            log_q = tf.Print(log_q, [log_q], "log_q: ")
            teacher_loss = tf.reduce_mean(-log_q * advantage)
            return teacher_loss

        teacher_loss = tf.cond(
            coin_toss < 0.1, lambda: compute_teacher_loss(
                log_q, reward, self._baseline, self._std),
            lambda: tf.constant(0.0))
        self.total_loss = mlm_output.loss + teacher_loss
        self.teacher_loss = teacher_loss
        self.mlm_loss = mlm_output.loss

        # Evaluation
        eval_fn_inputs = {
            "input_ids": masked_inputs.input_ids,
            "masked_lm_preds": mlm_output.preds,
            "mlm_loss": mlm_output.per_example_loss,
            "masked_lm_ids": masked_inputs.masked_lm_ids,
            "masked_lm_weights": masked_inputs.masked_lm_weights,
            "input_mask": masked_inputs.input_mask
        }
        eval_fn_keys = eval_fn_inputs.keys()
        eval_fn_values = [eval_fn_inputs[k] for k in eval_fn_keys]
        """Computes the loss and accuracy of the model."""
        d = {k: arg for k, arg in zip(eval_fn_keys, eval_fn_values)}
        metrics = dict()
        metrics["masked_lm_accuracy"] = tf.metrics.accuracy(
            labels=tf.reshape(d["masked_lm_ids"], [-1]),
            predictions=tf.reshape(d["masked_lm_preds"], [-1]),
            weights=tf.reshape(d["masked_lm_weights"], [-1]))
        metrics["masked_lm_loss"] = tf.metrics.mean(
            values=tf.reshape(d["mlm_loss"], [-1]),
            weights=tf.reshape(d["masked_lm_weights"], [-1]))
        self.eval_metrics = metrics
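
In the MIX_ADV_STRATEGY branch, each example in the batch is routed entirely to either the teacher-guided masking or the random masking by tiling a per-example coin flip across the sequence and prediction dimensions. A NumPy sketch of that per-example blend with hypothetical shapes:

import numpy as np

B, L = 3, 4                         # hypothetical batch size and sequence length
adv_ids = np.full((B, L), 1)        # stand-in for the teacher-masked input_ids
rand_ids = np.full((B, L), 2)       # stand-in for the randomly-masked input_ids
coin = (np.random.uniform(size=(B, 1)) > 0.5).astype(int)
blend = adv_ids * np.tile(coin, (1, L)) + rand_ids * (1 - np.tile(coin, (1, L)))
print(blend)   # each row is all 1s or all 2s: one strategy per example
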
Code Example #14
def mask(config: configure_pretraining.PretrainingConfig,
         inputs: pretrain_data.Inputs, mask_prob, proposal_distribution=1.0,
         disallow_from_mask=None, already_masked=None):
  """Implementation of dynamic masking. The optional arguments aren't needed for
  BERT/ELECTRA and are from early experiments in "strategically" masking out
  tokens instead of uniformly at random.

  Args:
    config: configure_pretraining.PretrainingConfig
    inputs: pretrain_data.Inputs containing input input_ids/input_mask
    mask_prob: percent of tokens to mask
    proposal_distribution: for non-uniform masking can be a [B, L] tensor
                           of scores for masking each position.
    disallow_from_mask: a boolean tensor of [B, L] of positions that should
                        not be masked out
    already_masked: a boolean tensor of [B, N] of already masked-out tokens
                    for multiple rounds of masking
  Returns: a pretrain_data.Inputs with masking added
  """
  # Get the batch size, sequence length, and max masked-out tokens
  N = config.max_predictions_per_seq
  B, L = modeling.get_shape_list(inputs.input_ids)

  # Find indices where masking out a token is allowed
  vocab = tokenization.FullTokenizer(
      config.vocab_file, do_lower_case=config.do_lower_case).vocab
  candidates_mask = _get_candidates_mask(inputs, vocab, disallow_from_mask)

  # Set the number of tokens to mask out per example
  num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32)
  num_to_predict = tf.maximum(1, tf.minimum(
      N, tf.cast(tf.round(num_tokens * mask_prob), tf.int32)))
  masked_lm_weights = tf.cast(tf.sequence_mask(num_to_predict, N), tf.float32)
  if already_masked is not None:
    masked_lm_weights *= (1 - already_masked)

  # Get a probability of masking each position in the sequence
  candidate_mask_float = tf.cast(candidates_mask, tf.float32)
  sample_prob = (proposal_distribution * candidate_mask_float)
  sample_prob /= tf.reduce_sum(sample_prob, axis=-1, keepdims=True)

  # Sample the positions to mask out
  sample_prob = tf.stop_gradient(sample_prob)
  sample_logits = tf.log(sample_prob)
  masked_lm_positions = tf.random.categorical(
      sample_logits, N, dtype=tf.int32)
  masked_lm_positions *= tf.cast(masked_lm_weights, tf.int32)

  # Get the ids of the masked-out tokens
  shift = tf.expand_dims(L * tf.range(B), -1)
  flat_positions = tf.reshape(masked_lm_positions + shift, [-1, 1])
  masked_lm_ids = tf.gather_nd(tf.reshape(inputs.input_ids, [-1]),
                               flat_positions)
  masked_lm_ids = tf.reshape(masked_lm_ids, [B, -1])
  masked_lm_ids *= tf.cast(masked_lm_weights, tf.int32)

  # Update the input ids
  replace_with_mask_positions = masked_lm_positions * tf.cast(
      tf.less(tf.random.uniform([B, N]), 0.85), tf.int32)
  inputs_ids, _ = scatter_update(
      inputs.input_ids, tf.fill([B, N], vocab["[MASK]"]),
      replace_with_mask_positions)

  return pretrain_data.get_updated_inputs(
      inputs,
      input_ids=tf.stop_gradient(inputs_ids),
      masked_lm_positions=masked_lm_positions,
      masked_lm_ids=masked_lm_ids,
      masked_lm_weights=masked_lm_weights
  )
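
The shift/flat_positions arithmetic converts per-example positions into indices into the flattened [B * L] view of input_ids so a single gather can pull out the ids at the masked positions. A NumPy sketch with tiny hypothetical shapes:

import numpy as np

B, L, N = 2, 5, 2                                  # hypothetical tiny shapes
input_ids = np.arange(B * L).reshape(B, L)         # fake token ids 0..9
masked_lm_positions = np.array([[1, 3], [0, 4]])   # chosen positions per example
shift = (L * np.arange(B))[:, None]                # row offsets in the flat view
flat_positions = (masked_lm_positions + shift).reshape(-1)
masked_lm_ids = input_ids.reshape(-1)[flat_positions].reshape(B, N)
print(masked_lm_ids)   # [[1 3] [5 9]]: the ids that sat at the masked positions
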
Code Example #15
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--corpus", help="corpus file", required=True)
    parser.add_argument("--output_file",
                        help="output model file",
                        required=True)
    parser.add_argument("--vocab_file", help="vocab file", required=True)
    args = parser.parse_args()
    corpus_file = args.corpus
    output_file = args.output_file
    vocab_file = args.vocab_file
    print(corpus_file)
    counter = collections.Counter()
    total_count = 0

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=False)
    with tqdm(open(corpus_file, "r"), desc=f"loading {corpus_file}") as f:
        for line in f:
            line = line.strip()
            if line:
                line = "[CLS] {} [SEP]".format(line)
                tokens = line.split()
                pieces = tokenize_and_align(tokenizer, tokens)
                total_count += len(pieces)
                counter.update(list(pieces))

    with open(output_file, "w") as fout:
        for key in tokenizer.vocab:
            if key in counter:
                p = float(counter[key]) / total_count
            else:
Code Example #16
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = SentenceEmbeddingProcessor()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                is_training=FLAGS.do_train,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        convert_examples_to_features(train_examples, label_list,
                                     FLAGS.max_seq_length, tokenizer,
                                     train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = input_fn_builder(input_file=train_file,
                                          seq_length=FLAGS.max_seq_length,
                                          is_training=True,
                                          batch_size=FLAGS.train_batch_size,
                                          drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict:
        predict_examples = processor.get_dev_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        convert_examples_to_features(predict_examples, label_list,
                                     FLAGS.max_seq_length, tokenizer,
                                     predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        predict_input_fn = input_fn_builder(input_file=predict_file,
                                            seq_length=FLAGS.max_seq_length,
                                            is_training=False,
                                            batch_size=FLAGS.eval_batch_size,
                                            drop_remainder=True)

        output_predict_file = os.path.join(FLAGS.output_dir, "predict.txt")
        with tf.gfile.Open(output_predict_file, 'w') as file:
            for result in estimator.predict(predict_input_fn):
                file.write('%d\n' % result['predictions'])

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        convert_examples_to_features(eval_examples, label_list,
                                     FLAGS.max_seq_length, tokenizer,
                                     eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = input_fn_builder(input_file=eval_file,
                                         seq_length=FLAGS.max_seq_length,
                                         is_training=False,
                                         batch_size=FLAGS.eval_batch_size,
                                         drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Code Example #17
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: ")
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list")
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--vocab_file", default=None, type=str)
    parser.add_argument("--spm_model_file",
                        default=None,
                        required=True,
                        type=str)
    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_predict",
        action='store_true',
        help="Whether to run the model in inference mode on the test set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--share_type',
                        default='all',
                        type=str,
                        choices=['all', 'attention', 'ffn', 'None'])

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for,E.g., 0.1 = 10% of training."
    )

    parser.add_argument('--logging_steps',
                        type=int,
                        default=10,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=1000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    init_logger(log_file=args.output_dir +
                '/{}-{}.log'.format(args.model_type, args.task_name))
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    seed_everything(args.seed)
    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        share_type=args.share_type)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case,
                                           spm_model_file=args.spm_model_file)
    model = AlbertForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                data_type='train')
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    # Evaluation
    results = []
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenization.FullTokenizer(
            vocab_file=args.vocab_file,
            do_lower_case=args.do_lower_case,
            spm_model_file=args.spm_model_file)
        checkpoints = [(0, args.output_dir)]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
            checkpoints = [(int(checkpoint.split('-')[-1]), checkpoint)
                           for checkpoint in checkpoints
                           if checkpoint.find('checkpoint') != -1]
            checkpoints = sorted(checkpoints, key=lambda x: x[0])
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for _, checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            model = AlbertForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            results.extend([(k + '_{}'.format(global_step), v)
                            for k, v in result.items()])
        output_eval_file = os.path.join(args.output_dir,
                                        "checkpoint_eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key, value in results:
                writer.write("%s = %s\n" % (key, str(value)))
Code Example #18
File: pretrain_helpers.py  Project: trangvu/mlm4uda
def mask(config: configure_pretraining.PretrainingConfig,
         inputs: pretrain_data.Inputs, mask_prob, proposal_distribution=1.0,
         disallow_from_mask=None, already_masked=None):
  """Implementation of dynamic masking. The optional arguments aren't needed for
  BERT/ELECTRA and are from early experiments in "strategically" masking out
  tokens instead of uniformly at random.

  Args:
    config: configure_pretraining.PretrainingConfig
    inputs: pretrain_data.Inputs containing input input_ids/input_mask
    mask_prob: percent of tokens to mask
    proposal_distribution: for non-uniform masking can be a [B, L] tensor
                           of scores for masking each position.
    disallow_from_mask: a boolean tensor of [B, L] of positions that should
                        not be masked out
    already_masked: a boolean tensor of [B, N] of already masked-out tokens
                    for multiple rounds of masking
  Returns: a pretrain_data.Inputs with masking added
  """
  # Get the batch size, sequence length, and max masked-out tokens
  N = config.max_predictions_per_seq
  B, L = modeling.get_shape_list(inputs.input_ids)

  # Find indices where masking out a token is allowed
  tokenizer = tokenization.FullTokenizer(
      config.vocab_file, do_lower_case=config.do_lower_case)
  vocab = tokenizer.vocab
  inv_vocab = tokenizer.inv_vocab
  candidates_mask = _get_candidates_mask(inputs, vocab, disallow_from_mask)

  # Set the number of tokens to mask out per example
  num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32)
  num_to_predict = tf.maximum(1, tf.minimum(
      N, tf.cast(tf.round(num_tokens * mask_prob), tf.int32)))
  masked_lm_weights = tf.cast(tf.sequence_mask(num_to_predict, N), tf.float32)
  if already_masked is not None:
    masked_lm_weights *= (1 - already_masked)

  # Get a probability of masking each position in the sequence
  candidate_mask_float = tf.cast(candidates_mask, tf.float32)

  if config.masking_strategy == RAND_STRATEGY or config.masking_strategy == MIX_ADV_STRATEGY:
    sample_prob = (proposal_distribution * candidate_mask_float)
  elif config.masking_strategy == POS_STRATEGY:
    unfavor_pos_mask = _get_unfavor_pos_mask(inputs)
    unfavor_pos_mask_float = tf.cast(unfavor_pos_mask, tf.float32)
    prefer_pos_mask_float = 1 - unfavor_pos_mask_float

    # preferred positions get sampling weight 1.0 (0.95 + 0.05); unfavored ones get 0.05
    # proposal_distribution = prefer_pos_mask_float
    proposal_distribution = 0.95 * prefer_pos_mask_float + 0.05
    sample_prob = (proposal_distribution * candidate_mask_float)
  elif config.masking_strategy == ENTROPY_STRATEGY:
    sample_prob = (proposal_distribution * candidate_mask_float)
  elif config.masking_strategy == MIX_POS_STRATEGY:
    rand_sample_prob = (proposal_distribution * candidate_mask_float)
    unfavor_pos_mask = _get_unfavor_pos_mask(inputs)
    unfavor_pos_mask_float = tf.cast(unfavor_pos_mask, tf.float32)
    prefer_pos_mask_float = 1 - unfavor_pos_mask_float

    # preferred positions get sampling weight 1.0 (0.95 + 0.05); unfavored ones get 0.05
    # proposal_distribution = prefer_pos_mask_float
    proposal_distribution = 0.95 * prefer_pos_mask_float + 0.05
    pos_sample_prob = (proposal_distribution * candidate_mask_float)

    strategy_prob = tf.random.uniform([B])
    strategy_prob = tf.expand_dims(
        tf.cast(tf.greater(strategy_prob, 0.5), tf.float32), 1)
    strategy_prob = tf.tile(strategy_prob, [1, L])
    sample_prob = rand_sample_prob * strategy_prob + pos_sample_prob * (1 - strategy_prob)
  elif config.masking_strategy == MIX_ENTROPY_STRATEGY:
    rand_sample_prob = (proposal_distribution * candidate_mask_float)
    entropy_sample_prob = (proposal_distribution * candidate_mask_float)
    strategy_prob = tf.random.uniform([B])
    strategy_prob = tf.expand_dims(
        tf.cast(tf.greater(strategy_prob, 0.5), tf.float32), 1)
    strategy_prob = tf.tile(strategy_prob, [1, L])
    sample_prob = rand_sample_prob * strategy_prob + entropy_sample_prob * (1 - strategy_prob)
  else:
    raise ValueError("{} strategy is not supported".format(config.masking_strategy))
  sample_prob /= tf.reduce_sum(sample_prob, axis=-1, keepdims=True)

  # Sample the positions to mask out
  sample_prob = tf.stop_gradient(sample_prob)
  sample_logits = tf.log(sample_prob)
  masked_lm_positions = tf.random.categorical(
      sample_logits, N, dtype=tf.int32)
  masked_lm_positions *= tf.cast(masked_lm_weights, tf.int32)

  # Get the ids of the masked-out tokens
  shift = tf.expand_dims(L * tf.range(B), -1)
  flat_positions = tf.reshape(masked_lm_positions + shift, [-1, 1])
  masked_lm_ids = tf.gather_nd(tf.reshape(inputs.input_ids, [-1]),
                               flat_positions)
  masked_lm_ids = tf.reshape(masked_lm_ids, [B, -1])
  masked_lm_ids *= tf.cast(masked_lm_weights, tf.int32)

  # Update the input ids
  replace_prob = tf.random.uniform([B, N])
  replace_with_mask_positions = masked_lm_positions * tf.cast(
      tf.less(replace_prob, 0.85), tf.int32)
  inputs_ids, _ = scatter_update(
      inputs.input_ids, tf.fill([B, N], vocab["[MASK]"]),
      replace_with_mask_positions)

  # Replace with random tokens
  replace_with_random_positions = masked_lm_positions * tf.cast(
    tf.greater(replace_prob, 0.925), tf.int32)
  random_tokens = tf.random.uniform([B,N], minval=0, maxval=len(vocab), dtype=tf.int32)
  inputs_ids, _ = scatter_update(
    inputs_ids, random_tokens,
    replace_with_random_positions)

  if config.debug:
    def pretty_print(inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights, tag_ids):
      debug_inputs = Inputs(
      input_ids=inputs_ids,
      input_mask=None,
      segment_ids=None,
      masked_lm_positions=masked_lm_positions,
      masked_lm_ids=masked_lm_ids,
      masked_lm_weights=masked_lm_weights,
      tag_ids = tag_ids)
      pretrain_data.print_tokens(debug_inputs, inv_vocab)

      ## TODO: save to the mask choice
      return inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights

    mask_shape = masked_lm_ids.get_shape()
    inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights = \
      tf.py_func(pretty_print,[inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights, inputs.tag_ids],
                 (tf.int32, tf.int32, tf.int32, tf.float32))
    inputs_ids.set_shape(inputs.input_ids.get_shape())
    masked_lm_ids.set_shape(mask_shape)
    masked_lm_positions.set_shape(mask_shape)
    masked_lm_weights.set_shape(mask_shape)

  return pretrain_data.get_updated_inputs(
      inputs,
      input_ids=tf.stop_gradient(inputs_ids),
      masked_lm_positions=masked_lm_positions,
      masked_lm_ids=masked_lm_ids,
      masked_lm_weights=masked_lm_weights,
      tag_ids = inputs.tag_ids
    )
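
The two thresholds on replace_prob split the selected positions into roughly 85% replaced by [MASK], 7.5% replaced by a random token, and 7.5% left unchanged (assuming a uniform draw); a quick arithmetic check:

# Split implied by tf.less(replace_prob, 0.85) and tf.greater(replace_prob, 0.925).
mask_frac = 0.85
random_frac = 1 - 0.925
keep_frac = 1 - mask_frac - random_frac
print(mask_frac, round(random_frac, 3), round(keep_frac, 3))   # 0.85 0.075 0.075
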
Code Example #19
def extract(input_texts, vocab_file, bert_config_file, init_checkpoint,
            layers=LAYERS, do_lower_case=True, max_seq_length=MAX_SEQ_LENGTH,
            master=None, num_tpu_cores=8, use_tpu=False, batch_size=BATCH_SIZE,
            use_one_hot_embeddings=False, to_json=False, output_file=None):
  tf.logging.set_verbosity(tf.logging.INFO)

  layer_indexes = [int(x) for x in layers.split(",")]

  bert_config = modeling.BertConfig.from_json_file(bert_config_file)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.estimator.tpu.RunConfig(
      master=master,
      tpu_config=tf.estimator.tpu.TPUConfig(
          num_shards=num_tpu_cores,
          per_host_input_for_training=is_per_host))

  examples = read_examples(input_texts)

  features = convert_examples_to_features(
      examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)

  unique_id_to_feature = {}
  for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=init_checkpoint,
      layer_indexes=layer_indexes,
      use_tpu=use_tpu,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=use_tpu,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=batch_size)

  input_fn = input_fn_builder(
      features=features, seq_length=max_seq_length)

  if to_json and output_file is not None:
    with codecs.getwriter("utf-8")(tf.io.gfile.GFile(output_file,
                                                "w")) as writer:
      list_output_json = []
      for result in estimator.predict(input_fn, yield_single_examples=True):
        unique_id = int(result["unique_id"])
        feature = unique_id_to_feature[unique_id]
        output_json = collections.OrderedDict()
        output_json["linex_index"] = unique_id
        all_features = []
        for (i, token) in enumerate(feature.tokens):
          all_layers = []
          for (j, layer_index) in enumerate(layer_indexes):
            layer_output = result["layer_output_%d" % j]
            layers = collections.OrderedDict()
            layers["index"] = layer_index
            layers["values"] = [
                round(float(x), 6) for x in layer_output[i:(i + 1)].flat
            ]
            all_layers.append(layers)
          features = collections.OrderedDict()
          features["token"] = token
          features["layers"] = all_layers
          all_features.append(features)
        output_json["features"] = all_features
        list_output_json.append(json.dumps(output_json))
        writer.write(json.dumps(output_json) + "\n")
  else:
    list_output_json = []
    for result in estimator.predict(input_fn, yield_single_examples=True):
      unique_id = int(result["unique_id"])
      feature = unique_id_to_feature[unique_id]
      output_json = collections.OrderedDict()
      output_json["linex_index"] = unique_id
      all_features = []
      for (i, token) in enumerate(feature.tokens):
        all_layers = []
        for (j, layer_index) in enumerate(layer_indexes):
          layer_output = result["layer_output_%d" % j]
          layers = collections.OrderedDict()
          layers["index"] = layer_index
          layers["values"] = [
              round(float(x), 6) for x in layer_output[i:(i + 1)].flat
          ]
          all_layers.append(layers)
        features = collections.OrderedDict()
        features["token"] = token
        features["layers"] = all_layers
        all_features.append(features)
      output_json["features"] = all_features
      list_output_json.append(json.dumps(output_json))

  return list_output_json