def test_MdExample():
    tokenizer = tokenization.FullTokenizer(vocab_file=DATA_DIR + 'vocab.txt', do_lower_case=True)
    words = ['Refah', 'da', 'Türkçe', 'için', 'görüş', 'istedi']
    analyzes = [
        ['Refah+Noun+Prop+A3sg+Pnon+Nom', 'refah+Noun+A3sg+Pnon+Nom'],
        ['da+Conj'],
        ['Türkçe+Noun+Prop+A3sg+Pnon+Nom', 'türkçe+Adj', 'türk+Noun+A3sg+Pnon+Equ',
         'türk+Adj^DB+Adverb+Ly', 'türk+Adj^DB+Adj+AsIf'],
        ['için+Postp+PCNom', 'iç+Noun+A3sg+P2sg+Nom', 'iç+Noun+A3sg+Pnon+Gen',
         'iç+Verb+Pos+Imp+A2pl'],
        ['görüş+Noun+A3sg+Pnon+Nom', 'gör+Verb+Recip+Pos+Imp+A2sg',
         'gör+Verb+Pos^DB+Noun+Inf3+A3sg+Pnon+Nom', 'görüş+Verb+Pos+Imp+A2sg'],
        ['iste+Verb+Pos+Past+A3sg'],
    ]
    example = MdExample(0, 'md', words, analyzes, tokenizer)
    print('candidate_roots', example.candidate_roots)
    print('candidate_tags', example.candidate_tags)
    print('word_ids', example.word_ids)
    print('sub_tokens', example.sub_tokens)
    print('words', example.words)
def get_tasks(config: configure_finetuning.FinetuningConfig):
    tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
                                           do_lower_case=config.do_lower_case)
    return [get_task(config, task_name, tokenizer) for task_name in config.task_names]
def build_token_synonym(text, vocab_file, do_lower_case):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    tokens = tokenizer.tokenize(text)
    token_synonym = {}
    for vocab in tokenizer.vocab.keys():
        token_synonym[vocab] = []
    src = True
    for token in tokens:
        if src:
            src_token = token
            src = False
        elif token == '_':
            continue
        elif token == '/':
            src = True
            continue
        else:
            # Append the synonym token to its source token ...
            if token not in token_synonym[src_token]:
                token_synonym[src_token].append(token)
            # ... and the reverse mapping as well.
            if src_token not in token_synonym[token]:
                token_synonym[token].append(src_token)
    return token_synonym
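# Minimal usage sketch for build_token_synonym. Assumptions (not from the
# original source): the input text separates synonym groups with '/' and uses
# '_' as a filler between synonyms, as the parsing loop above implies; the
# vocab path is hypothetical and the example words are assumed to be whole
# vocab entries (otherwise WordPiece would split them into '##' pieces).
synonyms = build_token_synonym('happy glad _ joyful / sad unhappy',
                               vocab_file='vocab.txt', do_lower_case=True)
print(synonyms.get('happy', []))  # e.g. ['glad', 'joyful']
print(synonyms.get('glad', []))   # e.g. ['happy'] - the mapping is symmetric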
def get_vocab(config):
    """Memoized load of the vocab file."""
    if config.vocab_file not in VOCAB_MAPPING:
        vocab = tokenization.FullTokenizer(config.vocab_file, do_lower_case=True).vocab
        VOCAB_MAPPING[config.vocab_file] = vocab
    return VOCAB_MAPPING[config.vocab_file]
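# Illustrative sketch of the memoization above. VOCAB_MAPPING and _Config are
# stand-ins for the module-level cache and the config object get_vocab expects,
# and the vocab path is hypothetical (the file must exist for the first call).
VOCAB_MAPPING = {}

class _Config:
    vocab_file = 'vocab.txt'

vocab_a = get_vocab(_Config())  # first call: reads the file and caches the dict
vocab_b = get_vocab(_Config())  # second call: served from VOCAB_MAPPING
assert vocab_a is vocab_b       # the exact same dict object is returned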
def __init__(
    self,
    job_id,
    vocab_file,
    output_dir,
    max_seq_length,
    num_jobs,
    blanks_separate_docs,
    do_lower_case,
    num_out_files=1000,
):
    self._blanks_separate_docs = blanks_separate_docs
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
    self._writers = []
    for i in range(num_out_files):
        if i % num_jobs == job_id:
            output_fname = os.path.join(
                output_dir,
                'pretrain_data.tfrecord-{:}-of-{:}'.format(i, num_out_files))
            self._writers.append(tf.io.TFRecordWriter(output_fname))
    self.n_written = 0
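# Sketch of the output-file sharding used above: each job only opens the
# tfrecord shards whose index is congruent to its job_id modulo num_jobs.
# The numbers below are illustrative, not from the original configuration.
num_out_files, num_jobs, job_id = 10, 4, 1
shards_for_this_job = [i for i in range(num_out_files) if i % num_jobs == job_id]
print(shards_for_this_job)  # [1, 5, 9] -> this worker writes 3 of the 10 shards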
def write_classification_outputs(self, tasks, trial, split,
                                 config: configure_finetuning.FinetuningConfig):
    """Write classification predictions to disk."""
    utils.log("Writing out predictions for", tasks, split)
    predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
    results = self._estimator.predict(input_fn=predict_input_fn, yield_single_examples=True)
    # task name -> eid -> model outputs
    logits = collections.defaultdict(dict)
    for r in results:
        if r["task_id"] != len(self._tasks):
            r = utils.nest_dict(r, self._config.task_names)
            task_name = self._config.task_names[r["task_id"]]
            logits[task_name][r[task_name]["eid"]] = (
                r[task_name]["eid"],
                r[task_name]["input_ids"],
                r[task_name]["input_mask"],
                r[task_name]["token_type_ids"],
                r[task_name]["logits"] if "logits" in r[task_name] else None,
                r[task_name]["predictions"],
                r[task_name]["label_ids"] if "label_ids" in r[task_name]
                else r[task_name]["targets"],
            )
    print('[RESULT]')
    tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
                                           do_lower_case=config.do_lower_case)
    for task_name in logits:
        utils.log("Saving Dev Error Analysis for {:} {:} examples ({:})".format(
            len(logits[task_name]), task_name, split))
        if trial <= self._config.n_writes_test:
            print('Write to: ' + self._config.dev_analysis(task_name, split, trial))
            with open(self._config.dev_analysis(task_name, split, trial), 'w',
                      encoding='utf-8') as fout:
                fout.write('ID\tINPUT\tLOGITS\tPREDICTION\tLABEL\n')
                for eid in logits[task_name]:
                    print('=>' + str(eid))
                    (_, input_id, input_mask, token_type_id, logit, prediction,
                     label_id) = logits[task_name][eid]
                    input_tokens = tokenizer.convert_ids_to_tokens(input_id)
                    input_tokens = filter(lambda x: x != '[PAD]', input_tokens)
                    input_tokens = ' '.join(input_tokens)
                    fout.write(str(eid) + '\t' + str(input_tokens) + '\t' + str(logit) +
                               '\t' + str(prediction) + '\t' + str(label_id) + '\n')
                    print('Inputs: ' + str(input_tokens) + ', Logits: ' + str(logit) +
                          ', Predictions: ' + str(prediction) + ', Labels: ' + str(label_id))
def __init__(self, batch_size=args.batch_size):
    self.mode = None
    self.max_seq_length = args.max_seq_len
    self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
    self.batch_size = batch_size
    self.estimator = None
    self.processor = SimProcessor()
    tf.logging.set_verbosity(tf.logging.INFO)
def __init__(self, config: configure_pretraining.PretrainingConfig, features, ratio, is_training):
    # Set up model config
    self._config = config
    self._bert_config = training_utils.get_bert_config(config)
    embedding_size = (self._bert_config.hidden_size if config.embedding_size is None
                      else config.embedding_size)
    tokenizer = tokenization.FullTokenizer(config.vocab_file, do_lower_case=config.do_lower_case)
    self._vocab = tokenizer.vocab
    self._inv_vocab = tokenizer.inv_vocab

    # Mask the input
    inputs = pretrain_data.features_to_inputs(features)

    # Load the per-token masking ratio
    with tf.variable_scope("rw_masking"):
        with tf.variable_scope("ratio"):
            self.ratios = tf.constant(ratio)
            action_prob = tf.nn.embedding_lookup(self.ratios, inputs.input_ids)

    log_q, masked_inputs = self._sample_masking_subset(inputs, action_prob)

    # BERT model
    model = self._build_transformer(masked_inputs, is_training,
                                    reuse=tf.AUTO_REUSE, embedding_size=embedding_size)
    mlm_output = self._get_masked_lm_output(masked_inputs, model)
    self.total_loss = mlm_output.loss

    # Evaluation: compute the loss and accuracy of the model
    eval_fn_inputs = {
        "input_ids": masked_inputs.input_ids,
        "masked_lm_preds": mlm_output.preds,
        "mlm_loss": mlm_output.per_example_loss,
        "masked_lm_ids": masked_inputs.masked_lm_ids,
        "masked_lm_weights": masked_inputs.masked_lm_weights,
        "input_mask": masked_inputs.input_mask,
    }
    eval_fn_keys = eval_fn_inputs.keys()
    eval_fn_values = [eval_fn_inputs[k] for k in eval_fn_keys]
    d = {k: arg for k, arg in zip(eval_fn_keys, eval_fn_values)}
    metrics = dict()
    metrics["masked_lm_accuracy"] = tf.metrics.accuracy(
        labels=tf.reshape(d["masked_lm_ids"], [-1]),
        predictions=tf.reshape(d["masked_lm_preds"], [-1]),
        weights=tf.reshape(d["masked_lm_weights"], [-1]))
    metrics["masked_lm_loss"] = tf.metrics.mean(
        values=tf.reshape(d["mlm_loss"], [-1]),
        weights=tf.reshape(d["masked_lm_weights"], [-1]))
    self.eval_metrics = metrics
def __init__(self, input_fname, vocab_file, output_dir, max_seq_length,
             blanks_separate_docs, do_lower_case):
    self._blanks_separate_docs = blanks_separate_docs
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    self._example_builder = build_pretraining_dataset.ExampleBuilder(tokenizer, max_seq_length)
    output_fname = os.path.join(output_dir, "{}.tfrecord".format(input_fname.split("/")[-1]))
    self._writer = tf.io.TFRecordWriter(output_fname)
    self.n_written = 0
def prepare_pretraining_data(input_file, output_file, vocab_file, do_lower_case=True,
                             random_seed=42, max_seq_length=128, max_predictions_per_seq=20,
                             short_seq_prob=0.1, masked_lm_prob=0.15, dupe_factor=5):
    tf.logging.set_verbosity(tf.logging.INFO)
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    input_files = []
    for input_pattern in input_file.split(','):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info('*** Reading from input files ***')
    for input_file in input_files:
        tf.logging.info('  %s', input_file)

    rng = random.Random(random_seed)
    instances = create_training_instances(
        input_files, tokenizer, max_seq_length, dupe_factor,
        short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng)

    output_files = output_file.split(',')
    tf.logging.info('*** Writing to output files ***')
    for output_file in output_files:
        tf.logging.info('  %s', output_file)

    write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files)
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
    self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
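# Worked check of the id assertion above: FullTokenizer assigns ids by line
# order of the vocab file, so the expected ids can be recomputed directly from
# the vocab_tokens list (pure-Python sketch, no tokenizer needed).
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ","]
index = {tok: i for i, tok in enumerate(vocab_tokens)}
print([index[t] for t in ["un", "##want", "##ed", ",", "runn", "##ing"]])  # [7, 4, 5, 10, 8, 9]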
def __init__(
    self,
    job_id,
    vocab_file,
    output_dir,
    max_seq_length,
    num_jobs,
    blanks_separate_docs,
    do_lower_case,
    tokenizer_type,
    num_out_files=500,
):
    self._blanks_separate_docs = blanks_separate_docs
    if tokenizer_type == "mecab_wordpiece":
        tokenizer = KoNLPyBertTokenizer(
            konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(), use_tag=False),
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
        )
    elif tokenizer_type == "wordpiece":
        tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    self._example_builder = ExampleBuilder(tokenizer, max_seq_length, tokenizer_type)
    self._writers = []
    for i in range(num_out_files):
        if i % num_jobs == job_id:
            output_fname = os.path.join(
                output_dir,
                "pretrain_data.tfrecord-{:}-of-{:}".format(i, num_out_files))
            self._writers.append(tf.io.TFRecordWriter(output_fname))
    self.n_written = 0
def __init__(self, config: configure_pretraining.PretrainingConfig, features, is_training):
    # Set up model config
    self._config = config
    self._bert_config = training_utils.get_bert_config(config)
    self._teacher_config = training_utils.get_teacher_config(config)
    embedding_size = (self._bert_config.hidden_size if config.embedding_size is None
                      else config.embedding_size)
    tokenizer = tokenization.FullTokenizer(config.vocab_file, do_lower_case=config.do_lower_case)
    self._vocab = tokenizer.vocab
    self._inv_vocab = tokenizer.inv_vocab

    # Mask the input
    inputs = pretrain_data.features_to_inputs(features)
    old_model = self._build_transformer(inputs, is_training, embedding_size=embedding_size)
    input_states = old_model.get_sequence_output()
    input_states = tf.stop_gradient(input_states)

    teacher_output = self._build_teacher(input_states, inputs, is_training,
                                         embedding_size=embedding_size)
    # Calculate the proposal distribution pi(x_i)
    action_prob = teacher_output.action_probs

    coin_toss = tf.random.uniform([])
    log_q, masked_inputs = self._sample_masking_subset(inputs, action_prob)
    if config.masking_strategy == pretrain_helpers.MIX_ADV_STRATEGY:
        random_masked_input = pretrain_helpers.mask(
            config, pretrain_data.features_to_inputs(features), config.mask_prob)
        B, L = modeling.get_shape_list(inputs.input_ids)
        N = config.max_predictions_per_seq
        strategy_prob = tf.random.uniform([B])
        strategy_prob = tf.expand_dims(tf.cast(tf.greater(strategy_prob, 0.5), tf.int32), 1)
        l_strategy_prob = tf.tile(strategy_prob, [1, L])
        n_strategy_prob = tf.tile(strategy_prob, [1, N])
        mix_input_ids = (masked_inputs.input_ids * l_strategy_prob +
                         random_masked_input.input_ids * (1 - l_strategy_prob))
        mix_masked_lm_positions = (masked_inputs.masked_lm_positions * n_strategy_prob +
                                   random_masked_input.masked_lm_positions * (1 - n_strategy_prob))
        mix_masked_lm_ids = (masked_inputs.masked_lm_ids * n_strategy_prob +
                             random_masked_input.masked_lm_ids * (1 - n_strategy_prob))
        n_strategy_prob = tf.cast(n_strategy_prob, tf.float32)
        mix_masked_lm_weights = (masked_inputs.masked_lm_weights * n_strategy_prob +
                                 random_masked_input.masked_lm_weights * (1 - n_strategy_prob))
        mix_masked_inputs = pretrain_data.get_updated_inputs(
            inputs,
            input_ids=tf.stop_gradient(mix_input_ids),
            masked_lm_positions=mix_masked_lm_positions,
            masked_lm_ids=mix_masked_lm_ids,
            masked_lm_weights=mix_masked_lm_weights,
            tag_ids=inputs.tag_ids)
        masked_inputs = mix_masked_inputs

    # BERT model
    model = self._build_transformer(masked_inputs, is_training,
                                    reuse=tf.AUTO_REUSE, embedding_size=embedding_size)
    mlm_output = self._get_masked_lm_output(masked_inputs, model)
    self.total_loss = mlm_output.loss

    # Teacher reward is -log p(x_S | x; B)
    reward = tf.stop_gradient(tf.reduce_mean(mlm_output.per_example_loss, 1))
    self._baseline = tf.reduce_mean(reward, -1)
    self._std = tf.math.reduce_std(reward, -1)

    # Calculate teacher loss
    def compute_teacher_loss(log_q, reward, baseline, std):
        advantage = tf.abs((reward - baseline) / std)
        advantage = tf.stop_gradient(advantage)
        log_q = tf.Print(log_q, [log_q], "log_q: ")
        teacher_loss = tf.reduce_mean(-log_q * advantage)
        return teacher_loss

    teacher_loss = tf.cond(
        coin_toss < 0.1,
        lambda: compute_teacher_loss(log_q, reward, self._baseline, self._std),
        lambda: tf.constant(0.0))
    self.total_loss = mlm_output.loss + teacher_loss
    self.teacher_loss = teacher_loss
    self.mlm_loss = mlm_output.loss

    # Evaluation: compute the loss and accuracy of the model
    eval_fn_inputs = {
        "input_ids": masked_inputs.input_ids,
        "masked_lm_preds": mlm_output.preds,
        "mlm_loss": mlm_output.per_example_loss,
        "masked_lm_ids": masked_inputs.masked_lm_ids,
        "masked_lm_weights": masked_inputs.masked_lm_weights,
        "input_mask": masked_inputs.input_mask,
    }
    eval_fn_keys = eval_fn_inputs.keys()
    eval_fn_values = [eval_fn_inputs[k] for k in eval_fn_keys]
    d = {k: arg for k, arg in zip(eval_fn_keys, eval_fn_values)}
    metrics = dict()
    metrics["masked_lm_accuracy"] = tf.metrics.accuracy(
        labels=tf.reshape(d["masked_lm_ids"], [-1]),
        predictions=tf.reshape(d["masked_lm_preds"], [-1]),
        weights=tf.reshape(d["masked_lm_weights"], [-1]))
    metrics["masked_lm_loss"] = tf.metrics.mean(
        values=tf.reshape(d["mlm_loss"], [-1]),
        weights=tf.reshape(d["masked_lm_weights"], [-1]))
    self.eval_metrics = metrics
def mask(config: configure_pretraining.PretrainingConfig,
         inputs: pretrain_data.Inputs, mask_prob, proposal_distribution=1.0,
         disallow_from_mask=None, already_masked=None):
    """Implementation of dynamic masking. The optional arguments aren't needed for
    BERT/ELECTRA and are from early experiments in "strategically" masking out
    tokens instead of uniformly at random.

    Args:
      config: configure_pretraining.PretrainingConfig
      inputs: pretrain_data.Inputs containing input input_ids/input_mask
      mask_prob: percent of tokens to mask
      proposal_distribution: for non-uniform masking can be a [B, L] tensor of
        scores for masking each position.
      disallow_from_mask: a boolean tensor of [B, L] of positions that should not
        be masked out
      already_masked: a boolean tensor of [B, N] of already masked-out tokens for
        multiple rounds of masking

    Returns: a pretrain_data.Inputs with masking added
    """
    # Get the batch size, sequence length, and max masked-out tokens
    N = config.max_predictions_per_seq
    B, L = modeling.get_shape_list(inputs.input_ids)

    # Find indices where masking out a token is allowed
    vocab = tokenization.FullTokenizer(
        config.vocab_file, do_lower_case=config.do_lower_case).vocab
    candidates_mask = _get_candidates_mask(inputs, vocab, disallow_from_mask)

    # Set the number of tokens to mask out per example
    num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32)
    num_to_predict = tf.maximum(1, tf.minimum(
        N, tf.cast(tf.round(num_tokens * mask_prob), tf.int32)))
    masked_lm_weights = tf.cast(tf.sequence_mask(num_to_predict, N), tf.float32)
    if already_masked is not None:
        masked_lm_weights *= (1 - already_masked)

    # Get a probability of masking each position in the sequence
    candidate_mask_float = tf.cast(candidates_mask, tf.float32)
    sample_prob = (proposal_distribution * candidate_mask_float)
    sample_prob /= tf.reduce_sum(sample_prob, axis=-1, keepdims=True)

    # Sample the positions to mask out
    sample_prob = tf.stop_gradient(sample_prob)
    sample_logits = tf.log(sample_prob)
    masked_lm_positions = tf.random.categorical(sample_logits, N, dtype=tf.int32)
    masked_lm_positions *= tf.cast(masked_lm_weights, tf.int32)

    # Get the ids of the masked-out tokens
    shift = tf.expand_dims(L * tf.range(B), -1)
    flat_positions = tf.reshape(masked_lm_positions + shift, [-1, 1])
    masked_lm_ids = tf.gather_nd(tf.reshape(inputs.input_ids, [-1]), flat_positions)
    masked_lm_ids = tf.reshape(masked_lm_ids, [B, -1])
    masked_lm_ids *= tf.cast(masked_lm_weights, tf.int32)

    # Update the input ids
    replace_with_mask_positions = masked_lm_positions * tf.cast(
        tf.less(tf.random.uniform([B, N]), 0.85), tf.int32)
    inputs_ids, _ = scatter_update(
        inputs.input_ids, tf.fill([B, N], vocab["[MASK]"]),
        replace_with_mask_positions)

    return pretrain_data.get_updated_inputs(
        inputs,
        input_ids=tf.stop_gradient(inputs_ids),
        masked_lm_positions=masked_lm_positions,
        masked_lm_ids=masked_lm_ids,
        masked_lm_weights=masked_lm_weights)
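# Small numpy sketch of the index-shift trick used above to look up the ids of
# the masked positions: adding L * range(B) to the per-example positions turns
# them into indices into the flattened [B * L] id tensor. All values below are
# illustrative, not real model inputs.
import numpy as np

B, L = 2, 5
input_ids = np.arange(100, 100 + B * L).reshape(B, L)       # [[100..104], [105..109]]
masked_lm_positions = np.array([[1, 3], [0, 4]])            # positions per example
shift = (L * np.arange(B))[:, None]                         # [[0], [5]]
flat_positions = (masked_lm_positions + shift).reshape(-1)  # [1, 3, 5, 9]
masked_lm_ids = input_ids.reshape(-1)[flat_positions].reshape(B, -1)
print(masked_lm_ids)  # [[101 103] [105 109]]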
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--corpus", help="corpus file", required=True)
    parser.add_argument("--output_file", help="output model file", required=True)
    parser.add_argument("--vocab_file", help="vocab file", required=True)
    args = parser.parse_args()

    corpus_file = args.corpus
    output_file = args.output_file
    vocab_file = args.vocab_file
    print(corpus_file)

    counter = collections.Counter()
    total_count = 0
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    with tqdm(open(corpus_file, "r"), desc=f"loading {corpus_file}") as f:
        for line in f:
            line = line.strip()
            if line:
                line = "[CLS] {} [SEP]".format(line)
                tokens = line.split()
                pieces = tokenize_and_align(tokenizer, tokens)
                total_count += len(pieces)
                counter.update(list(pieces))

    with open(output_file, "w") as fout:
        for key in tokenizer.vocab:
            if key in counter:
                p = float(counter[key]) / total_count
            else:
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = SentenceEmbeddingProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                is_training=FLAGS.do_train,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu,
                                            model_fn=model_fn,
                                            config=run_config,
                                            train_batch_size=FLAGS.train_batch_size,
                                            eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        convert_examples_to_features(train_examples, label_list,
                                     FLAGS.max_seq_length, tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = input_fn_builder(input_file=train_file,
                                          seq_length=FLAGS.max_seq_length,
                                          is_training=True,
                                          batch_size=FLAGS.train_batch_size,
                                          drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict:
        predict_examples = processor.get_dev_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        convert_examples_to_features(predict_examples, label_list,
                                     FLAGS.max_seq_length, tokenizer, predict_file)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        predict_input_fn = input_fn_builder(input_file=predict_file,
                                            seq_length=FLAGS.max_seq_length,
                                            is_training=False,
                                            batch_size=FLAGS.eval_batch_size,
                                            drop_remainder=True)
        output_predict_file = os.path.join(FLAGS.output_dir, "predict.txt")
        with tf.gfile.Open(output_predict_file, 'w') as file:
            for result in estimator.predict(predict_input_fn):
                file.write('%d\n' % result['predictions'])

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        convert_examples_to_features(eval_examples, label_list,
                                     FLAGS.max_seq_length, tokenizer, eval_file)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = input_fn_builder(input_file=eval_file,
                                         seq_length=FLAGS.max_seq_length,
                                         is_training=False,
                                         batch_size=FLAGS.eval_batch_size,
                                         drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: ")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--vocab_file", default=None, type=str)
    parser.add_argument("--spm_model_file", default=None, required=True, type=str)

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action='store_true',
                        help="Whether to run the model in inference mode on the test set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--share_type', default='all', type=str,
                        choices=['all', 'attention', 'ffn', 'None'])
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for, "
                             "e.g., 0.1 = 10%% of training.")
    parser.add_argument('--logging_steps', type=int, default=10, help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=1000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    init_logger(log_file=args.output_dir + '/{}-{}.log'.format(args.model_type, args.task_name))
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train \
            and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    seed_everything(args.seed)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    args.model_type = args.model_type.lower()
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels, finetuning_task=args.task_name, share_type=args.share_type)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case,
                                           spm_model_file=args.spm_model_file)
    model = AlbertForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='train')
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    # Evaluation
    results = []
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                               do_lower_case=args.do_lower_case,
                                               spm_model_file=args.spm_model_file)
        checkpoints = [(0, args.output_dir)]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
            checkpoints = [(int(checkpoint.split('-')[-1]), checkpoint)
                           for checkpoint in checkpoints if checkpoint.find('checkpoint') != -1]
            checkpoints = sorted(checkpoints, key=lambda x: x[0])
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for _, checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = AlbertForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            results.extend([(k + '_{}'.format(global_step), v) for k, v in result.items()])
        output_eval_file = os.path.join(args.output_dir, "checkpoint_eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key, value in results:
                writer.write("%s = %s\n" % (key, str(value)))
def mask(config: configure_pretraining.PretrainingConfig,
         inputs: pretrain_data.Inputs, mask_prob, proposal_distribution=1.0,
         disallow_from_mask=None, already_masked=None):
    """Implementation of dynamic masking. The optional arguments aren't needed for
    BERT/ELECTRA and are from early experiments in "strategically" masking out
    tokens instead of uniformly at random.

    Args:
      config: configure_pretraining.PretrainingConfig
      inputs: pretrain_data.Inputs containing input input_ids/input_mask
      mask_prob: percent of tokens to mask
      proposal_distribution: for non-uniform masking can be a [B, L] tensor of
        scores for masking each position.
      disallow_from_mask: a boolean tensor of [B, L] of positions that should not
        be masked out
      already_masked: a boolean tensor of [B, N] of already masked-out tokens for
        multiple rounds of masking

    Returns: a pretrain_data.Inputs with masking added
    """
    # Get the batch size, sequence length, and max masked-out tokens
    N = config.max_predictions_per_seq
    B, L = modeling.get_shape_list(inputs.input_ids)

    # Find indices where masking out a token is allowed
    tokenizer = tokenization.FullTokenizer(config.vocab_file, do_lower_case=config.do_lower_case)
    vocab = tokenizer.vocab
    inv_vocab = tokenizer.inv_vocab
    candidates_mask = _get_candidates_mask(inputs, vocab, disallow_from_mask)

    # Set the number of tokens to mask out per example
    num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32)
    num_to_predict = tf.maximum(1, tf.minimum(
        N, tf.cast(tf.round(num_tokens * mask_prob), tf.int32)))
    masked_lm_weights = tf.cast(tf.sequence_mask(num_to_predict, N), tf.float32)
    if already_masked is not None:
        masked_lm_weights *= (1 - already_masked)

    # Get a probability of masking each position in the sequence
    candidate_mask_float = tf.cast(candidates_mask, tf.float32)
    if config.masking_strategy == RAND_STRATEGY or config.masking_strategy == MIX_ADV_STRATEGY:
        sample_prob = (proposal_distribution * candidate_mask_float)
    elif config.masking_strategy == POS_STRATEGY:
        unfavor_pos_mask = _get_unfavor_pos_mask(inputs)
        unfavor_pos_mask_float = tf.cast(unfavor_pos_mask, tf.float32)
        prefer_pos_mask_float = 1 - unfavor_pos_mask_float
        # Preferred positions get a 0.95 share of the proposal mass; unfavored
        # ones keep a 0.05 floor so they can still be sampled.
        # proposal_distribution = prefer_pos_mask_float
        proposal_distribution = 0.95 * prefer_pos_mask_float + 0.05
        sample_prob = (proposal_distribution * candidate_mask_float)
    elif config.masking_strategy == ENTROPY_STRATEGY:
        sample_prob = (proposal_distribution * candidate_mask_float)
    elif config.masking_strategy == MIX_POS_STRATEGY:
        rand_sample_prob = (proposal_distribution * candidate_mask_float)
        unfavor_pos_mask = _get_unfavor_pos_mask(inputs)
        unfavor_pos_mask_float = tf.cast(unfavor_pos_mask, tf.float32)
        prefer_pos_mask_float = 1 - unfavor_pos_mask_float
        # Preferred positions get a 0.95 share of the proposal mass; unfavored
        # ones keep a 0.05 floor so they can still be sampled.
        proposal_distribution = 0.95 * prefer_pos_mask_float + 0.05
        pos_sample_prob = (proposal_distribution * candidate_mask_float)
        # Per example, flip a coin to choose between the random proposal and
        # the POS-based proposal.
        strategy_prob = tf.random.uniform([B])
        strategy_prob = tf.expand_dims(tf.cast(tf.greater(strategy_prob, 0.5), tf.float32), 1)
        strategy_prob = tf.tile(strategy_prob, [1, L])
        sample_prob = rand_sample_prob * strategy_prob + pos_sample_prob * (1 - strategy_prob)
    elif config.masking_strategy == MIX_ENTROPY_STRATEGY:
        rand_sample_prob = (proposal_distribution * candidate_mask_float)
        entropy_sample_prob = (proposal_distribution * candidate_mask_float)
        strategy_prob = tf.random.uniform([B])
        strategy_prob = tf.expand_dims(tf.cast(tf.greater(strategy_prob, 0.5), tf.float32), 1)
        strategy_prob = tf.tile(strategy_prob, [1, L])
        sample_prob = rand_sample_prob * strategy_prob + entropy_sample_prob * (1 - strategy_prob)
    else:
        raise ValueError("{} strategy is not supported".format(config.masking_strategy))
    sample_prob /= tf.reduce_sum(sample_prob, axis=-1, keepdims=True)

    # Sample the positions to mask out
    sample_prob = tf.stop_gradient(sample_prob)
    sample_logits = tf.log(sample_prob)
    masked_lm_positions = tf.random.categorical(sample_logits, N, dtype=tf.int32)
    masked_lm_positions *= tf.cast(masked_lm_weights, tf.int32)

    # Get the ids of the masked-out tokens
    shift = tf.expand_dims(L * tf.range(B), -1)
    flat_positions = tf.reshape(masked_lm_positions + shift, [-1, 1])
    masked_lm_ids = tf.gather_nd(tf.reshape(inputs.input_ids, [-1]), flat_positions)
    masked_lm_ids = tf.reshape(masked_lm_ids, [B, -1])
    masked_lm_ids *= tf.cast(masked_lm_weights, tf.int32)

    # Update the input ids: replace positions with replace_prob < 0.85 with [MASK]
    replace_prob = tf.random.uniform([B, N])
    replace_with_mask_positions = masked_lm_positions * tf.cast(
        tf.less(replace_prob, 0.85), tf.int32)
    inputs_ids, _ = scatter_update(
        inputs.input_ids, tf.fill([B, N], vocab["[MASK]"]),
        replace_with_mask_positions)

    # Replace positions with replace_prob > 0.925 with random tokens
    replace_with_random_positions = masked_lm_positions * tf.cast(
        tf.greater(replace_prob, 0.925), tf.int32)
    random_tokens = tf.random.uniform([B, N], minval=0, maxval=len(vocab), dtype=tf.int32)
    inputs_ids, _ = scatter_update(inputs_ids, random_tokens, replace_with_random_positions)

    if config.debug:
        def pretty_print(inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights, tag_ids):
            debug_inputs = Inputs(
                input_ids=inputs_ids,
                input_mask=None,
                segment_ids=None,
                masked_lm_positions=masked_lm_positions,
                masked_lm_ids=masked_lm_ids,
                masked_lm_weights=masked_lm_weights,
                tag_ids=tag_ids)
            pretrain_data.print_tokens(debug_inputs, inv_vocab)
            ## TODO: save the mask choice
            return inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights

        mask_shape = masked_lm_ids.get_shape()
        inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights = tf.py_func(
            pretty_print,
            [inputs_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights, inputs.tag_ids],
            (tf.int32, tf.int32, tf.int32, tf.float32))
        inputs_ids.set_shape(inputs.input_ids.get_shape())
        masked_lm_ids.set_shape(mask_shape)
        masked_lm_positions.set_shape(mask_shape)
        masked_lm_weights.set_shape(mask_shape)

    return pretrain_data.get_updated_inputs(
        inputs,
        input_ids=tf.stop_gradient(inputs_ids),
        masked_lm_positions=masked_lm_positions,
        masked_lm_ids=masked_lm_ids,
        masked_lm_weights=masked_lm_weights,
        tag_ids=inputs.tag_ids)
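# Quick numeric check of the replacement split implied by replace_prob above:
# positions with replace_prob < 0.85 get [MASK], positions with
# replace_prob > 0.925 get a random token, and the remaining slice
# (0.85 <= replace_prob <= 0.925) keeps its original token, i.e. roughly an
# 85% / 7.5% / 7.5% split over the selected positions.
p_mask = 0.85
p_random = 1.0 - 0.925
p_keep = 1.0 - p_mask - p_random
print(p_mask, p_random, p_keep)  # 0.85 0.075 0.075 (up to float rounding)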
def extract(input_texts, vocab_file, bert_config_file, init_checkpoint, layers=LAYERS,
            do_lower_case=True, max_seq_length=MAX_SEQ_LENGTH, master=None, num_tpu_cores=8,
            use_tpu=False, batch_size=BATCH_SIZE, use_one_hot_embeddings=False,
            to_json=False, output_file=None):
    tf.logging.set_verbosity(tf.logging.INFO)
    layer_indexes = [int(x) for x in LAYERS.split(",")]
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.estimator.tpu.RunConfig(
        master=master,
        tpu_config=tf.estimator.tpu.TPUConfig(
            num_shards=num_tpu_cores, per_host_input_for_training=is_per_host))

    examples = read_examples(input_texts)
    features = convert_examples_to_features(
        examples=examples, seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)
    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=use_tpu,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=use_tpu, model_fn=model_fn, config=run_config, predict_batch_size=BATCH_SIZE)

    input_fn = input_fn_builder(features=features, seq_length=MAX_SEQ_LENGTH)

    if to_json and output_file is not None:
        with codecs.getwriter("utf-8")(tf.io.gfile.GFile(output_file, "w")) as writer:
            list_output_json = []
            for result in estimator.predict(input_fn, yield_single_examples=True):
                unique_id = int(result["unique_id"])
                feature = unique_id_to_feature[unique_id]
                output_json = collections.OrderedDict()
                output_json["linex_index"] = unique_id
                all_features = []
                for (i, token) in enumerate(feature.tokens):
                    all_layers = []
                    for (j, layer_index) in enumerate(layer_indexes):
                        layer_output = result["layer_output_%d" % j]
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [round(float(x), 6) for x in layer_output[i:(i + 1)].flat]
                        all_layers.append(layers)
                    features = collections.OrderedDict()
                    features["token"] = token
                    features["layers"] = all_layers
                    all_features.append(features)
                output_json["features"] = all_features
                list_output_json.append(json.dumps(output_json))
                writer.write(json.dumps(output_json) + "\n")
    else:
        list_output_json = []
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            all_features = []
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [round(float(x), 6) for x in layer_output[i:(i + 1)].flat]
                    all_layers.append(layers)
                features = collections.OrderedDict()
                features["token"] = token
                features["layers"] = all_layers
                all_features.append(features)
            output_json["features"] = all_features
            list_output_json.append(json.dumps(output_json))
    return list_output_json
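# Sketch of how the return value of extract() can be consumed: each element of
# list_output_json is a JSON string with the structure built above. The sample
# string here is illustrative, not real model output.
import json

sample = ('{"linex_index": 0, "features": [{"token": "[CLS]", '
          '"layers": [{"index": -1, "values": [0.1, -0.2]}]}]}')
record = json.loads(sample)
for feat in record["features"]:
    vec = feat["layers"][0]["values"]  # embedding from the first requested layer
    print(feat["token"], len(vec))     # [CLS] 2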