def load(self, model_dir, is_checkpoint=False):
    """
    Load persistables or parameters.

    Args:
        model_dir: Directory passed through to the selected loader.
        is_checkpoint: When truthy the load goes through
            ``init_checkpoint``; otherwise it goes through
            ``init_pretraining_params``.
    """
    # TODO: support dygraph.
    # Pick the loader once; both take (executor, directory, program).
    restore = init_checkpoint if is_checkpoint else init_pretraining_params
    restore(self.exe, model_dir, self.program)
def main(args):
    """
    Run the model over the 'dev' phase and collect one score per example.

    Args:
        args: Parsed options. This function reads ``task_name``,
            ``data_dir``, ``vocab_path``, ``max_seq_len``, ``do_lower_case``,
            ``batch_size``, ``epoch``, ``use_cuda`` and ``init_checkpoint``.

    Returns:
        list of str: ``str(elem[1])`` for every row of the fetched ``probs``
        output, in generator order.

    Raises:
        KeyError: If the lower-cased ``args.task_name`` is not a known task.
    """
    task = args.task_name.lower()
    # Only the 'match' task is registered; any other name raises KeyError.
    processor_cls = {
        'match': reader.MatchProcessor,
    }[task]
    processor = processor_cls(
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case)
    num_labels = len(processor.get_labels())

    infer_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='dev',
        epoch=args.epoch,
        shuffle=False)

    main_program = fluid.default_main_program()
    feed_order, loss, probs, accuracy, num_seqs = create_model(
        args, num_labels=num_labels)

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(framework.default_startup_program())

    if args.init_checkpoint:
        init_pretraining_params(exe, args.init_checkpoint, main_program)

    # Resolve the feed variable names to variables of the main program.
    block = main_program.global_block()
    feeder = fluid.DataFeeder([block.var(name) for name in feed_order], place)

    label_list = []
    for batch in infer_data_generator():
        fetched = exe.run(
            fetch_list=[probs], feed=feeder.feed(batch), return_numpy=True)
        # Column 1 of each probability row is kept, as a string.
        label_list.extend(str(row[1]) for row in fetched[0])
    return label_list
def _build_programs(self):
    """
    Build programs.

    Build train_program, eval_program and inference_program. Only use in
    static graph mode.

    When ``self.run_infer`` is truthy only the inference program is built;
    otherwise the train and eval programs are built. In both cases the
    startup program is executed afterwards and, for each of
    ``self.init_pretraining_params`` / ``self.init_checkpoint`` that is not
    None, the matching loader is called on the program that was just built.
    """
    if self.run_infer:
        self.startup_program = fluid.Program()
        # build infer program
        self.infer_program = fluid.Program()
        with fluid.program_guard(self.infer_program, self.startup_program), \
                fluid.unique_name.guard():
            feeds = self._get_feed_dict(is_infer=True)
            self.infer_feed_dict = feeds
            forward_out = self.forward(feeds, is_infer=True)
            self.infer_fetch_dict = self.infer(feeds, forward_out)
        self.infer_program = self.infer_program.clone(for_test=True)
        target_program = self.infer_program
    else:
        self.startup_program = fluid.Program()
        # build train program
        self.train_program = fluid.Program()
        with fluid.program_guard(self.train_program, self.startup_program), \
                fluid.unique_name.guard():
            feeds = self._get_feed_dict()
            self.feed_dict = feeds
            forward_out = self.forward(feeds)
            metrics, statistics = self.get_metrics_and_statistics(
                feeds, forward_out)

            # build eval program
            # The clone is taken before ``optimize`` is called on the
            # train program.
            self.eval_program = self.train_program.clone(for_test=True)
            self.eval_fetch_dict = {**metrics, **statistics}

            self.optimize(metrics)
            self.train_fetch_dict = metrics
        target_program = self.train_program

    # Shared tail: initialise variables, then optionally restore them.
    self.exe.run(self.startup_program)
    if self.init_pretraining_params is not None:
        init_pretraining_params(
            self.exe, self.init_pretraining_params, target_program)
    if self.init_checkpoint is not None:
        init_checkpoint(self.exe, self.init_checkpoint, target_program)
def main(args):
    """
    Run the model over the 'test' phase and print one score per example.

    Args:
        args: Parsed options. This function reads ``task_name``,
            ``data_dir``, ``vocab_path``, ``max_seq_len``, ``do_lower_case``,
            ``batch_size``, ``use_cuda`` and ``init_checkpoint``.

    For every row of the fetched ``probs`` output, element 1 is printed.
    """
    task = args.task_name.lower()
    processor = reader.MatchProcessor(
        data_dir=args.data_dir,
        task_name=task,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case)
    num_labels = len(processor.get_labels())

    # Single pass over the test data, in order.
    infer_data_generator = processor.data_generator(
        batch_size=args.batch_size, phase='test', epoch=1, shuffle=False)
    # Not used below; the call is kept because it may have side effects.
    num_test_examples = processor.get_num_examples(phase='test')

    main_program = fluid.default_main_program()
    feed_order, loss, probs, accuracy, num_seqs = create_model(
        args, num_labels=num_labels, is_prediction=True)

    # ``dev_count`` is computed as in the original but not used afterwards.
    if not args.use_cuda:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    exe = fluid.Executor(place)
    exe.run(framework.default_startup_program())

    if args.init_checkpoint:
        init_pretraining_params(exe, args.init_checkpoint, main_program)

    # Resolve the feed variable names to variables of the main program.
    block = main_program.global_block()
    feeder = fluid.DataFeeder([block.var(name) for name in feed_order], place)

    for batch in infer_data_generator():
        fetched = exe.run(
            fetch_list=[probs], feed=feeder.feed(batch), return_numpy=True)
        for row in fetched[0]:
            print(row[1])
def main(args):
    """
    Main Function

    Build the train / test / infer programs selected by ``args.do_train``,
    ``args.do_test`` and ``args.do_infer``, run the shared startup program,
    restore parameters, then run the selected phases in that order.

    Args:
        args: Replaced on entry by ``parser.parse_args()``, so the value
            passed in is not used.
            NOTE(review): the rebinding is kept for backward compatibility;
            confirm whether callers expect their argument to be honoured.

    Raises:
        ValueError: If none of ``do_train`` / ``do_test`` / ``do_infer`` is
            set, or if training is off, test or infer is on, and
            ``args.init_checkpoint`` is not set.
    """
    args = parser.parse_args()
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=False,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_test or args.do_infer):
        # The message names the three flags that are actually checked
        # (the old text mentioned a `do_val` flag that is not consulted).
        raise ValueError("For args `do_train`, `do_test` and `do_infer`, at "
                         "least one of them must be True.")

    # One startup program is shared by every phase program built below.
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        num_train_examples = reader.get_num_examples(args.train_set)
        max_train_steps = (args.epoch * num_train_examples // args.batch_size
                           // dev_count)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                # create ernie_pyreader
                train_pyreader, ernie_inputs, words, labels = ernie_pyreader(
                    args, pyreader_name='train_reader')
                train_pyreader.decorate_tensor_provider(
                    reader.data_generator(
                        args.train_set,
                        args.batch_size,
                        args.epoch,
                        shuffle=True,
                        phase="train"))

                # get ernie_embeddings
                embeddings = ernie_encoder(
                    ernie_inputs, ernie_config=ernie_config)

                # user defined model based on ernie embeddings
                train_ret = create_model(
                    args, embeddings, labels=labels, is_prediction=False)

                optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
                fluid.clip.set_gradient_clip(
                    clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
                optimizer.minimize(train_ret["loss"])

        lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
            program=train_program, batch_size=args.batch_size)
        print("Theoretical memory usage in training: %.3f - %.3f %s" %
              (lower_mem, upper_mem, unit))

    if args.do_test:
        test_program = fluid.Program()
        with fluid.program_guard(test_program, startup_prog):
            with fluid.unique_name.guard():
                # create ernie_pyreader
                test_pyreader, ernie_inputs, words, labels = ernie_pyreader(
                    args, pyreader_name='test_reader')
                test_pyreader.decorate_tensor_provider(
                    reader.data_generator(
                        args.test_set,
                        args.batch_size,
                        phase='test',
                        epoch=1,
                        shuffle=False))

                # get ernie_embeddings
                embeddings = ernie_encoder(
                    ernie_inputs, ernie_config=ernie_config)

                # user defined model based on ernie embeddings
                test_ret = create_model(
                    args, embeddings, labels=labels, is_prediction=False)

        test_program = test_program.clone(for_test=True)

    if args.do_infer:
        infer_program = fluid.Program()
        with fluid.program_guard(infer_program, startup_prog):
            with fluid.unique_name.guard():
                # create ernie_pyreader
                infer_pyreader, ernie_inputs, words, labels = ernie_pyreader(
                    args, pyreader_name='infer_reader')
                infer_pyreader.decorate_tensor_provider(
                    reader.data_generator(
                        args.infer_set,
                        args.batch_size,
                        phase='infer',
                        epoch=1,
                        shuffle=False))

                # get ernie_embeddings
                embeddings = ernie_encoder(
                    ernie_inputs, ernie_config=ernie_config)

                # user defined model based on ernie embeddings
                infer_ret = create_model(
                    args, embeddings, labels=labels, is_prediction=True)
                # Keep the input words so they can be fetched with the
                # decoded labels.
                infer_ret["words"] = words

        infer_program = infer_program.clone(for_test=True)

    exe.run(startup_prog)

    # load checkpoints
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)
        elif args.init_pretraining_params:
            utils.init_pretraining_params(exe, args.init_pretraining_params,
                                          startup_prog)
    elif args.do_test or args.do_infer:
        if not args.init_checkpoint:
            raise ValueError(
                "args 'init_checkpoint' should be set if only doing test or infer!"
            )
        utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)

    if args.do_train:
        train_pyreader.start()
        steps = 0

        while True:
            try:
                steps += 1
                # Metrics are fetched only on every `skip_steps`-th step.
                if steps % args.skip_steps == 0:
                    fetch_list = [
                        train_ret["loss"],
                        train_ret["num_infer_chunks"],
                        train_ret["num_label_chunks"],
                        train_ret["num_correct_chunks"],
                    ]
                else:
                    fetch_list = []

                start_time = time.time()
                outputs = exe.run(program=train_program, fetch_list=fetch_list)
                end_time = time.time()

                if steps % args.skip_steps == 0:
                    loss, nums_infer, nums_label, nums_correct = outputs
                    # Reset first so P/R/F1 reflect this batch only.
                    train_ret["chunk_evaluator"].reset()
                    train_ret["chunk_evaluator"].update(
                        nums_infer, nums_label, nums_correct)
                    precision, recall, f1_score = train_ret[
                        "chunk_evaluator"].eval()
                    print(
                        "[train] batch_id = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time %.5f, "
                        "pyreader queue_size: %d " %
                        (steps, loss, precision, recall, f1_score,
                         end_time - start_time, train_pyreader.queue.size()))

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    print("\tsaving model as %s" % (save_path))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate test set
                    if args.do_test:
                        evaluate(exe, test_program, test_pyreader, test_ret)

            except fluid.core.EOFException:
                # The reader is exhausted: save a final checkpoint, reset
                # the reader and leave the loop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on test set
    if args.do_test:
        evaluate(exe, test_program, test_pyreader, test_ret)

    if args.do_infer:
        # create dict
        # Invert the reader's vocab / label maps, keyed by the string form
        # of the id.
        id2word_dict = {
            str(word_id): word
            for word, word_id in reader.vocab.items()
        }
        id2label_dict = {
            str(label_id): label
            for label, label_id in reader.label_map.items()
        }
        Dataset = namedtuple("Dataset", ["id2word_dict", "id2label_dict"])
        dataset = Dataset(id2word_dict, id2label_dict)

        infer_pyreader.start()
        while True:
            try:
                (words, crf_decode) = exe.run(
                    infer_program,
                    fetch_list=[infer_ret["words"], infer_ret["crf_decode"]],
                    return_numpy=False)
                # Note: words are clipped when the input is longer than
                # args.max_seq_len.
                results = utils.parse_result(words, crf_decode, dataset)
                for result in results:
                    print(result)
            except fluid.core.EOFException:
                infer_pyreader.reset()
                break
def _build_programs(self):
    """
    Build programs.

    Build train_program, eval_program and inference_program. Only use in
    static graph mode.

    ``self.program`` is set to whichever program was built (inference or
    train). After the startup program has run, variables are restored into
    ``self.program`` from ``self.init_pretraining_params`` when that is not
    the empty string, otherwise from ``self.init_checkpoint`` when that is
    not the empty string.
    """
    if self.run_infer:
        self.startup_program = fluid.Program()
        # build infer program
        self.infer_program = fluid.Program()
        with fluid.program_guard(self.infer_program, self.startup_program), \
                fluid.unique_name.guard():
            feeds = self._get_feed_dict(is_infer=True)
            self.infer_feed_dict = feeds
            forward_out = self.forward(feeds, is_infer=True)
            self.infer_fetch_dict = self.infer(feeds, forward_out)
        self.infer_program = self.infer_program.clone(for_test=True)

        self.program = self.infer_program
    else:
        if self.is_distributed:
            # Executor settings used by the distributed strategy.
            executor_cfg = fluid.ExecutionStrategy()
            executor_cfg.use_experimental_executor = True
            executor_cfg.num_threads = 4
            executor_cfg.num_iteration_per_drop_scope = 1

            strategy = DistributedStrategy()
            strategy.exec_strategy = executor_cfg
            strategy.nccl_comm_num = 1
            strategy.fuse_all_reduce_ops = True
            if self.use_recompute:
                strategy.forward_recompute = True
                strategy.enable_sequential_execution = True
            if self.use_amp:
                strategy.use_amp = True
                strategy.amp_loss_scaling = self.amp_loss_scaling
            self.dist_strategy = strategy

        self.startup_program = fluid.Program()
        # build train program
        self.train_program = fluid.Program()
        with fluid.program_guard(self.train_program, self.startup_program), \
                fluid.unique_name.guard():
            feeds = self._get_feed_dict()
            self.feed_dict = feeds
            forward_out = self.forward(feeds)
            if self.is_distributed and self.use_recompute:
                self.dist_strategy.recompute_checkpoints = forward_out[
                    "checkpoints"]
            metrics, statistics = self.get_metrics_and_statistics(
                feeds, forward_out)

            # build eval program
            # The clone is taken before ``optimize`` is called on the
            # train program.
            self.eval_program = self.train_program.clone(for_test=True)
            self.eval_fetch_dict = {**metrics, **statistics}

            # ``scheduled_lr`` is added only after the eval fetch dict was
            # built, so it shows up in the train fetches only.
            metrics["scheduled_lr"] = self.optimize(metrics)
            self.train_fetch_dict = metrics

        # ``self.program`` keeps the locally built train program, while
        # ``self.train_program`` is swapped for fleet's main program when
        # running distributed.
        self.program = self.train_program
        if self.is_distributed:
            self.train_program = fleet.main_program

    self.exe.run(self.startup_program)
    # The pretraining-params path wins over the checkpoint path when both
    # are non-empty.
    if self.init_pretraining_params != "":
        init_pretraining_params(self.exe, self.init_pretraining_params,
                                self.program)
        return
    if self.init_checkpoint != "":
        init_checkpoint(self.exe, self.init_checkpoint, self.program)
def do_train(args):
    """
    Main Function

    Build the ERNIE-based train program and its for-test clone, restore
    parameters if requested, then train for ``args.epoch`` epochs. Metrics
    are printed every ``args.print_steps`` steps, a checkpoint is saved
    every ``args.save_steps`` steps, and the test program is evaluated
    every ``args.validation_steps`` steps. A final checkpoint is saved once
    training finishes.

    Args:
        args: Parsed options. This function reads ``ernie_config_path``,
            ``use_cuda``, ``cpu_num``, ``random_seed``, ``train_data``,
            ``test_data``, ``base_learning_rate``, ``batch_size``,
            ``init_checkpoint``, ``init_pretraining_params``, ``epoch``,
            ``print_steps``, ``save_steps``, ``validation_steps`` and
            ``model_save_dir``.
    """

    def _checkpoint_path(step):
        # Checkpoint prefix used for both periodic and final saves.
        return os.path.join(args.model_save_dir, "step_" + str(step),
                            "checkpoint")

    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = 1
    else:
        # Never ask for more CPU devices than the machine has.
        dev_count = min(multiprocessing.cpu_count(), args.cpu_num)
        if dev_count < args.cpu_num:
            print(
                "WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. "
                "Change the cpu_num from %d to %d" %
                (dev_count, args.cpu_num, dev_count))
        os.environ['CPU_NUM'] = str(dev_count)
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    train_program = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            # user defined model based on ernie embeddings
            train_ret = creator.create_ernie_model(args, ernie_config)

            # ernie pyreader
            train_pyreader = creator.create_pyreader(
                args,
                file_name=args.train_data,
                feed_list=train_ret['feed_list'],
                model="ernie",
                place=place)

            # The for-test clone is taken before the optimizer is applied
            # to the train program.
            test_program = train_program.clone(for_test=True)
            test_pyreader = creator.create_pyreader(
                args,
                file_name=args.test_data,
                feed_list=train_ret['feed_list'],
                model="ernie",
                place=place)

            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
            optimizer = fluid.optimizer.Adam(
                learning_rate=args.base_learning_rate, grad_clip=clip)
            optimizer.minimize(train_ret["avg_cost"])

    lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
        program=train_program, batch_size=args.batch_size)
    print("Theoretical memory usage in training: %.3f - %.3f %s" %
          (lower_mem, upper_mem, unit))
    print("Device count: %d" % dev_count)

    exe.run(startup_prog)
    # load checkpoints
    if args.init_checkpoint and args.init_pretraining_params:
        print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
              "both are set! Only arg 'init_checkpoint' is made valid.")
    if args.init_checkpoint:
        utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)
    elif args.init_pretraining_params:
        utils.init_pretraining_params(exe, args.init_pretraining_params,
                                      startup_prog)

    if dev_count > 1 and not args.use_cuda:
        # This branch is only reached on CPU, so the device label is fixed
        # (the former GPU alternative could never be selected here).
        device = "CPU"
        print("%d %s are used to train model" % (dev_count, device))
        # multi cpu/gpu config
        exec_strategy = fluid.ExecutionStrategy()
        build_strategy = fluid.BuildStrategy()
        compiled_prog = fluid.compiler.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=train_ret['avg_cost'].name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        compiled_prog = fluid.compiler.CompiledProgram(train_program)

    # start training
    steps = 0
    for epoch_id in range(args.epoch):
        for data in train_pyreader():
            steps += 1
            # Metrics are fetched only on every `print_steps`-th step.
            if steps % args.print_steps == 0:
                fetch_list = [
                    train_ret["avg_cost"],
                    train_ret["precision"],
                    train_ret["recall"],
                    train_ret["f1_score"],
                ]
            else:
                fetch_list = []

            start_time = time.time()
            outputs = exe.run(program=compiled_prog,
                              feed=data[0],
                              fetch_list=fetch_list)
            end_time = time.time()

            if steps % args.print_steps == 0:
                # Each fetched output is reduced to its mean before printing.
                loss, precision, recall, f1_score = [
                    np.mean(x) for x in outputs
                ]
                print(
                    "[train] batch_id = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time %.5f, "
                    "pyreader queue_size: %d " %
                    (steps, loss, precision, recall, f1_score,
                     end_time - start_time, train_pyreader.queue.size()))

            if steps % args.save_steps == 0:
                save_path = _checkpoint_path(steps)
                print("\tsaving model as %s" % (save_path))
                fluid.save(train_program, save_path)

            if steps % args.validation_steps == 0:
                evaluate(exe, test_program, test_pyreader, train_ret)

    # Final checkpoint, named after the last step that was run.
    fluid.save(train_program, _checkpoint_path(steps))