def main():
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    def _get_text_file(text_dir):
        import glob
        # file_list = glob.glob(f'{text_dir}/**/*')
        # seq length 512
        # file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/all']
        # seq length 128
        file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/all_seq128']
        # debug
        # file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/AA/wiki_00']
        files = ",".join(file_list)
        return files

    input_files = _get_text_file(FLAGS.input_file).split(',')

    # model_fn = model_fn_builder(
    #     bert_config=bert_config,
    #     init_checkpoint=FLAGS.init_checkpoint,
    #     learning_rate=FLAGS.learning_rate,
    #     num_train_steps=FLAGS.num_train_steps,
    #     num_warmup_steps=FLAGS.num_warmup_steps,
    #     use_tpu=FLAGS.use_tpu,
    #     use_one_hot_embeddings=FLAGS.use_tpu)

    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertPretrainer(bert)
    if FLAGS.init_checkpoint:
        serializers.load_npz(FLAGS.init_checkpoint, model)
        model = modeling.BertPretrainer(model.bert)
    if FLAGS.gpu >= 0:
        pass
        # chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        # model.to_gpu()

    if FLAGS.do_train:
        """Pretraining in Chainer: build a counterpart of BertClassifier that
        reshapes the BERT outputs into the same quantities that model_fn
        returns in the TensorFlow version."""
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha; instead, use eta as the actual lr
            eps=1e-6, weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        """
        ConcatenatedDataset is held in memory, so it cannot handle pickles of
        huge datasets.
        input_files = sorted(input_files)[:len(input_files) // 2]
        input_files = sorted(input_files)[:200]
        import concurrent.futures
        train_examples = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for train_example in executor.map(_load_data_using_dataset_api,
                                              input_files):
                train_examples.append(train_example)
        train_examples = ConcatenatedDataset(*train_examples)
        """
        train_examples = _load_data_using_dataset_api(input_files[0])
        train_iter = chainer.iterators.SerialIterator(
            train_examples, FLAGS.train_batch_size)
        converter = Converter()
        if False:
            updater = training.updaters.StandardUpdater(
                train_iter, optimizer,
                converter=converter,
                device=FLAGS.gpu)
        else:
            updater = training.updaters.ParallelUpdater(
                iterator=train_iter,
                optimizer=optimizer,
                converter=converter,
                # The device named 'main' is used as the "master", while the
                # others are used as slaves. Names other than 'main' are arbitrary.
                devices={'main': 0, '1': 1, '2': 2, '3': 3,
                         '4': 4, '5': 5, '6': 6, '7': 7},
            )

        # learning rate (eta) scheduling in Adam
        num_warmup_steps = FLAGS.num_warmup_steps
        num_train_steps = FLAGS.num_train_steps
        trainer = training.Trainer(
            updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir)
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(extensions.LinearShift(  # decay
            'eta', (lr_decay_init, 0.), (num_warmup_steps, num_train_steps)))
        trainer.extend(extensions.WarmupShift(  # warmup
            'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta',
            lambda trainer: trainer.updater.get_optimizer('main').eta),
            trigger=(50, 'iteration'))

        # logging
        trainer.extend(extensions.snapshot_object(
            model, 'seq_128_model_snapshot_iter_{.updater.iteration}.npz'),
            trigger=(1000, 'iteration'))
        trainer.extend(extensions.LogReport(trigger=(1, 'iteration')))
        # trainer.extend(extensions.PlotReport(
        #     [
        #         'main/next_sentence_loss',
        #         'main/next_sentence_accuracy',
        #     ], (3, 'iteration'), file_name='next_sentence.png'))
        # trainer.extend(extensions.PlotReport(
        #     [
        #         'main/masked_lm_loss',
        #         'main/masked_lm_accuracy',
        #     ], (3, 'iteration'), file_name='masked_lm.png'))
        trainer.extend(extensions.PlotReport(
            y_keys=[
                'main/loss',
                'main/next_sentence_loss',
                'main/next_sentence_accuracy',
                'main/masked_lm_loss',
                'main/masked_lm_accuracy',
            ],
            x_key='iteration',
            trigger=(100, 'iteration'), file_name='loss.png'))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'main/loss',
             'main/masked_lm_loss', 'main/masked_lm_accuracy',
             'main/next_sentence_loss', 'main/next_sentence_accuracy',
             'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=20))

        trainer.run()

    if FLAGS.do_eval:
        # NOTE: this evaluation path is still the TensorFlow estimator code
        # from the original implementation and has not been ported to Chainer.
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
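# Editor's sketch (not part of the original script): the WarmupShift and
# LinearShift extensions registered above are set up to produce BERT's
# learning-rate schedule on the Adam `eta` attribute -- linear warmup from 0
# to FLAGS.learning_rate over num_warmup_steps iterations, then linear decay
# that reaches 0 at num_train_steps. A reference implementation of that
# intended schedule:
def eta_at(step, learning_rate, num_warmup_steps, num_train_steps):
    if step < num_warmup_steps:
        # warmup phase: 0 -> learning_rate
        return learning_rate * step / num_warmup_steps
    # decay phase: equivalent to interpolating from lr_decay_init (the value
    # at the end of warmup) down to 0 at num_train_steps
    return max(0., learning_rate * (num_train_steps - step) / num_train_steps)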
def main():
    if not FLAGS.do_train and not FLAGS.do_predict and not FLAGS.do_print_test:
        raise ValueError(
            "At least one of `do_train`, `do_predict` or `do_print_test` "
            "must be True.")

    if FLAGS.do_train:
        if not FLAGS.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if FLAGS.do_predict:
        if not FLAGS.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = read_squad_examples(
            input_file=FLAGS.train_file, is_training=True)
        train_features = convert_examples_to_features(
            train_examples, tokenizer, FLAGS.max_seq_length,
            FLAGS.doc_stride, FLAGS.max_query_length, is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertSQuAD(bert)
    if FLAGS.do_train:
        # If training, load BERT parameters only.
        ignore_names = ['output/W', 'output/b']
    else:
        # If only do_predict, load all parameters.
        ignore_names = None
    chainer.serializers.load_npz(
        FLAGS.init_checkpoint, model, ignore_names=ignore_names)

    if FLAGS.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        model.to_gpu()

    if FLAGS.do_train:
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha; instead, use eta as the actual lr
            eps=1e-6, weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        train_iter = chainer.iterators.SerialIterator(
            train_features, FLAGS.train_batch_size)
        converter = Converter(is_training=True)
        updater = training.updaters.StandardUpdater(
            train_iter, optimizer,
            converter=converter,
            device=FLAGS.gpu,
            loss_func=model.compute_loss)
        trainer = training.Trainer(
            updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir)

        # learning rate (eta) scheduling in Adam
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(extensions.LinearShift(  # decay
            'eta', (lr_decay_init, 0.), (num_warmup_steps, num_train_steps)))
        trainer.extend(extensions.WarmupShift(  # warmup
            'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta',
            lambda trainer: trainer.updater.get_optimizer('main').eta),
            trigger=(100, 'iteration'))

        # logging
        trainer.extend(extensions.snapshot_object(
            model, 'model_snapshot_iter_{.updater.iteration}.npz'),
            trigger=(num_train_steps // 2, 'iteration'))  # TODO
        trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'main/loss', 'main/accuracy', 'elapsed_time', 'eta']))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.run()

    if FLAGS.do_predict:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            eval_examples, tokenizer, FLAGS.max_seq_length,
            FLAGS.doc_stride, FLAGS.max_query_length, is_training=False)

        test_iter = chainer.iterators.SerialIterator(
            eval_features, FLAGS.predict_batch_size,
            repeat=False, shuffle=False)
        converter = Converter(is_training=False)
        print('Evaluating ...')
        evaluate(eval_examples, test_iter, model,
                 converter=converter, device=FLAGS.gpu,
                 predict_func=model.predict)
        print('Finished.')
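# Worked example (editor's addition, hypothetical numbers): how the step counts
# above are derived. With roughly 88,000 SQuAD training examples,
# train_batch_size=32, num_train_epochs=2.0 and warmup_proportion=0.1:
#   num_train_steps  = int(88000 / 32 * 2.0) = 5500
#   num_warmup_steps = int(5500 * 0.1)       = 550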
def main():
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "livedoor": LivedoorProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_print_test:
        raise ValueError("At least one of `do_train` or `do_eval` "
                         "or `do_print_test` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        model_file=FLAGS.model_file, vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # TODO: use special Adam from "optimization.py"
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    bert = modeling.BertModel(config=bert_config)
    pretrained = modeling.BertPretrainer(bert)
    chainer.serializers.load_npz(FLAGS.init_checkpoint, pretrained)
    model = modeling.BertClassifier(pretrained.bert, num_labels=len(label_list))

    if FLAGS.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        model.to_gpu()

    if FLAGS.do_train:
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha; instead, use eta as the actual lr
            eps=1e-6, weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        train_iter = chainer.iterators.SerialIterator(
            train_examples, FLAGS.train_batch_size)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        updater = training.updaters.StandardUpdater(
            train_iter, optimizer,
            converter=converter, device=FLAGS.gpu)
        trainer = training.Trainer(
            updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir)

        # learning rate (eta) scheduling in Adam
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(extensions.LinearShift(  # decay
            'eta', (lr_decay_init, 0.), (num_warmup_steps, num_train_steps)))
        trainer.extend(extensions.WarmupShift(  # warmup
            'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta',
            lambda trainer: trainer.updater.get_optimizer('main').eta),
            trigger=(50, 'iteration'))

        # logging
        trainer.extend(extensions.snapshot_object(
            model, 'model_snapshot_iter_{.updater.iteration}.npz'),
            trigger=(num_train_steps, 'iteration'))
        trainer.extend(extensions.LogReport(trigger=(50, 'iteration')))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'main/loss', 'main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.run()

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        test_iter = chainer.iterators.SerialIterator(
            eval_examples, FLAGS.train_batch_size * 2,
            repeat=False, shuffle=False)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        evaluator = extensions.Evaluator(
            test_iter, model, converter=converter, device=FLAGS.gpu)
        results = evaluator()
        print(results)

    # if you want to inspect some output arrays for debugging
    if FLAGS.do_print_test:
        short_eval_examples = processor.get_dev_examples(FLAGS.data_dir)[:3]
        short_eval_examples = short_eval_examples[:FLAGS.eval_batch_size]
        short_test_iter = chainer.iterators.SerialIterator(
            short_eval_examples, FLAGS.eval_batch_size,
            repeat=False, shuffle=False)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        evaluator = extensions.Evaluator(
            short_test_iter, model, converter=converter, device=FLAGS.gpu)

        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                data = next(short_test_iter)
                out = model.bert.get_pooled_output(
                    *converter(data, FLAGS.gpu)[:-1])
                print(out)
                print(out.shape)
                print(converter(data, -1))
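# Editor's sketch (not part of the original scripts): the fine-tuned classifier
# snapshot saved by snapshot_object above (e.g.
# FLAGS.output_dir + '/model_snapshot_iter_<num_train_steps>.npz') can later be
# restored for inference like this; names follow the code above.
def load_finetuned_classifier(bert_config, label_list, snapshot_path):
    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertClassifier(bert, num_labels=len(label_list))
    chainer.serializers.load_npz(snapshot_path, model)
    return model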