def backward(self, loss):
    """Pass ``loss`` to ``optimization`` and log the returned learning rate.

    The warm-up length is ``run_config.max_steps * config.warmup_proportion``
    truncated to an int. The first value returned by ``optimization`` is
    written as the summary scalar ``'lr'``; the second is discarded.

    Args:
        loss: the loss handed to ``optimization`` unchanged.
    """
    total_steps = self.run_config.max_steps
    n_warmup = int(total_steps * self.config.warmup_proportion)

    lr_var, _ = optimization(
        loss=loss,
        warmup_steps=n_warmup,
        num_train_steps=total_steps,
        learning_rate=self.config.learning_rate,
        train_program=F.default_main_program(),
        startup_prog=F.default_startup_program(),
        weight_decay=self.config.weight_decay,
        scheduler="linear_warmup_decay",
    )
    propeller.summary.scalar('lr', lr_var)
def dataRateOptm(numOfRecPacStat,numOfRecChanStat,threshold,nodes,i):
    """Return the packet interval for ``nodes[i]``, re-optimizing on drift.

    The current interval is returned unchanged when the node reports fewer
    than one packet in its statistics, or when neither channel indicator
    moved by more than ``threshold`` since the stored "old" pair. Otherwise
    ``optimization`` is called with the indicators, the packet success
    ratio, the current interval and the constant 3, and its result is
    stored on the node and returned.

    Args:
        numOfRecPacStat: forwarded to ``getPacStat``.
        numOfRecChanStat: forwarded to ``getChannelIndicators``.
        threshold: minimum absolute change of either indicator that
            triggers a re-optimization.
        nodes: indexable collection of node objects.
        i: index of the node to process.

    Returns:
        The (possibly updated) packet interval of the node.
    """
    node = nodes[i]
    interval = node.getPacInterval()
    x, y = node.getChannelIndicators(numOfRecChanStat)
    sucPac, allPac = node.getPacStat(numOfRecPacStat)

    # No packets recorded: there is no success ratio to optimize against.
    if allPac < 1:
        return interval

    oldX, oldY = node.getOldXY()
    drifted = abs(oldX - x) > threshold or abs(oldY - y) > threshold
    if not drifted:
        return interval

    success_ratio = sucPac / float(allPac)
    interval = optimization(x, y, success_ratio, node.getPacInterval(), 3)
    node.setPacInterval(interval)
    # NOTE(review): the reference point is refreshed only when a
    # re-optimization happened -- confirm against the original layout.
    node.setOldXY(x, y)
    return interval
def backward(self, loss):
    """Pass ``loss`` to ``optimization`` using values from ``self.hparam``.

    Warm-up length is ``run_config.max_steps * hparam['warmup_proportion']``
    truncated to an int. ``use_fp16`` and ``layer_decay_rate`` fall back to
    ``0`` and ``0.`` when absent from ``hparam``. The first value returned by
    ``optimization`` is written as the summary scalar ``'lr'``; the second is
    discarded.

    Args:
        loss: the loss handed to ``optimization`` unchanged.
    """
    hp = self.hparam
    max_steps = self.run_config.max_steps

    # Keys are listed in the order the keyword arguments are evaluated.
    optimizer_args = dict(
        loss=loss,
        warmup_steps=int(max_steps * hp['warmup_proportion']),
        num_train_steps=max_steps,
        learning_rate=hp['learning_rate'],
        train_program=F.default_main_program(),
        startup_prog=F.default_startup_program(),
        weight_decay=hp['weight_decay'],
        scheduler="linear_warmup_decay",
        use_fp16=hp.get('use_fp16', 0),
        use_dynamic_loss_scaling=True,
        layer_decay_rate=hp.get("layer_decay_rate", 0.),
        n_layers=hp.ernie_config["num_hidden_layers"],
    )
    lr_var, _ = optimization(**optimizer_args)
    propeller.summary.scalar('lr', lr_var)
'base': 0.0005, 'interval': 5e3, 'optimizer': 'rmsprop', 'rho': 0.9, 'eps': 1E-6, #'lr_type': 'inv', 'base': 0.5, 'gamma': 0.0001, 'power': 0.75, #'lr_type': 'episodic', 'base': 0.005, 'interval': 10e3, #'lr_type': 'fixed', 'base': 0.003, #'optimizer': 'sgd', #'optimizer': 'rmsprop', 'rho': 0.9, 'eps': 1E-6, #'optimizer': 'rmsprop_graves', 'aleph': 0.95, 'beit': 0.9, 'gimmel': 0.0001, 'dalet': 0.0001, #'optimizer': 'adadelta', 'rho': 0.9, 'eps': 1.E-6, #'optimizer': 'adagrad', 'rho': 0.9, 'eps': 1.E-6, 'momentum': 0.9, 'n_train_iters': 100000, 'test_interval': 1000, # 'grad_clip_val': 5, 'l1_weight_decay': 0.0001, } snapshots_dir = '/homes/nirb/work/buffe/Applications/roundabout-learn/snapshots/' trained_model = '' # trained_model = '/homes/nirb/work/buffe/Applications/roundabout-learn/snapshots/2016-01-06-08-22-012000.sn' optimization(game_params=game_params, arch_params=arch_params, solver_params=solver_params, trained_model=trained_model, sn_dir=snapshots_dir)
"""
Run ``optimization`` with the "perfect" training data and the "noisy" test
data, then print summary statistics of the resulting test errors.

Created on Tue Feb 24 09:38:09 2015

@author: wirkert
"""

import numpy as np
import matplotlib.pyplot as plt

from setup import data
from optimization import optimization

#%% load data

dataFolder = "data/output/"

# Only the training half of the perfect set is used ...
trainingParameters, trainingReflectances, _, _ = data.perfect(dataFolder)
# ... and only the test half of the noisy set.
_, _, testParameters, testReflectances = data.noisy(dataFolder)

testingErrors, r2Score = optimization(
    trainingParameters,
    trainingReflectances,
    testParameters,
    testReflectances,
)

#%% test

print("error distribution BVF, Volume fraction")

errorSummary = (
    ("median: ", np.median(testingErrors, axis=0)),
    ("lower quartile: ", np.percentile(testingErrors, 25, axis=0)),
    ("higher quartile: ", np.percentile(testingErrors, 75, axis=0)),
)
for label, value in errorSummary:
    print(label + str(value))

print("r2Score", str(r2Score))
def net(self, args=None):
    """
    Build the BERT training program and return its loss.

    Args:
        args: ignored. The value passed in is immediately replaced by the
            result of ``p_args()``.

    Returns:
        ``self.loss`` as produced by ``create_model``.

    Side effects visible here: sets ``self.train_data_generator``,
    ``self.warmup_steps``, ``self.train_pyreader`` and ``self.loss``; runs the
    startup program; writes the files ``__model__`` and ``debug_program`` in
    the current working directory.
    """
    # The incoming ``args`` is discarded; configuration always comes from p_args().
    args = p_args()
    bert_config = BertConfig(DATA_DIR + "uncased_L-24_H-1024_A-16/bert_config.json")
    bert_config.print_config()
    place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    exe = fluid.Executor(place)
    # init program
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    # A seed of 0 leaves both programs with their default random seed.
    if args.random_seed != 0:
        print("set program random seed as: ", args.random_seed)
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed
    task_name = args.task_name.lower()
    # Map of lower-cased task name -> processor class.
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())
    dev_count = 1
    self.train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=dev_count,
        dev_idx=0,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed)
    # NOTE(review): num_train_examples is not used afterwards in this function.
    num_train_examples = processor.get_num_examples(phase='train')
    # Step counts are hard-coded rather than derived from the data set size.
    max_train_steps = 5
    # NOTE(review): warmup_steps is a float (0.5) here -- confirm that
    # ``optimization`` accepts a non-integer value.
    self.warmup_steps = 0.5
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
    dist_strategy = DistributedStrategy()
    # run_params arrives as a JSON string and is replaced in place by the
    # decoded object; the keys read below must be present.
    args.run_params = json.loads(args.run_params)
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params[
        'fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.use_hierarchical_allreduce = False
    # Model and optimizer are built inside the train/startup program pair.
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                args, bert_config=bert_config, num_labels=num_labels)
            scheduled_lr = optimization(loss=self.loss,
                                        warmup_steps=self.warmup_steps,
                                        num_train_steps=max_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=False,
                                        loss_scaling=args.loss_scaling,
                                        dist_strategy=dist_strategy)
    exe.run(startup_prog)
    # Dump fleet's original program: serialized desc (binary) and its str() form.
    with open("__model__", "wb") as f:
        f.write(fleet._origin_program.desc.serialize_to_string())
    with open("debug_program", "w") as f:
        f.write(str(fleet._origin_program))
    return self.loss
max_seq_len=args.max_seq_len, token_mode=args.token_mode, batch_size=args.batch_size) num_train_examples = len(train_ds) max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count warmup_steps = int(max_train_steps * args.warmup_proportion) scheduled_lr, loss_scaling = optimization( loss=graph_ernie.loss, warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_prog, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=False, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) test_prog = F.Program() with F.program_guard(test_prog, startup_prog): with F.unique_name.guard(): if args.sage_mode == "gcn": _graph_ernie = GCNGraphErnie(args, task="listwise") elif args.sage_mode == "gat": _graph_ernie = GATGraphErnie(args, task="listwise")
def main(args):
    """Train, evaluate and/or predict with an XLNet classifier.

    Which phases run is selected by ``args.do_train``, ``args.do_eval`` and
    ``args.do_predict``; at least one must be true. The training program, the
    evaluation program and the prediction program are each built from
    ``create_model`` and share one startup program.

    ``num_trainers`` is read but not assigned in this function, so it must be
    provided by the enclosing module.

    Args:
        args: parsed run configuration (attribute access only).

    Raises:
        ValueError: if no phase is selected, if prediction is requested
            without ``predict_dir``, or if evaluation/prediction runs without
            training and without ``init_checkpoint``.
    """
    if not (args.do_train or args.do_eval or args.do_predict):
        raise ValueError("For args `do_train`, `do_eval` and `do_predict`, at "
                         "least one of them must be True.")
    if args.do_predict and not args.predict_dir:
        raise ValueError("args 'predict_dir' should be given when doing predict")

    # predict_dir may legitimately be unset for train/eval-only runs; calling
    # os.path.exists(None) would raise TypeError, so only touch it when given.
    if args.predict_dir and not os.path.exists(args.predict_dir):
        os.makedirs(args.predict_dir)

    xlnet_config = XLNetConfig(args.model_config_path)
    xlnet_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        "mnli_matched": reader.MnliMatchedProcessor,
        "mnli_mismatched": reader.MnliMismatchedProcessor,
        'sts-b': reader.StsbProcessor,
        'imdb': reader.ImdbProcessor,
        "yelp5": reader.Yelp5Processor
    }

    processor = processors[task_name](args)

    # Regression tasks have no label list, hence no class count.
    label_list = processor.get_labels() if not args.is_regression else None
    num_labels = len(label_list) if label_list is not None else None
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.train_batch_size,
            is_regression=args.is_regression,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle)

        num_train_examples = processor.get_num_examples(phase='train')
        print("Device count: %d" % dev_count)
        print("Max num of epoches: %d" % args.epoch)
        print("Num of train examples: %d" % num_train_examples)
        print("Num of train steps: %d" % args.train_steps)
        print("Num of warmup steps: %d" % args.warmup_steps)

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args,
                    xlnet_config=xlnet_config,
                    n_class=num_labels)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=args.warmup_steps,
                    num_train_steps=args.train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    lr_layer_decay_rate=args.lr_layer_decay_rate,
                    scheduler=args.lr_scheduler)

    if args.do_eval:
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args,
                    xlnet_config=xlnet_config,
                    n_class=num_labels)

        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.eval_batch_size,
                is_regression=args.is_regression,
                phase=args.eval_split,
                epoch=1,
                dev_count=1,
                shuffle=False), place)

    if args.do_predict:
        predict_prog = fluid.Program()
        with fluid.program_guard(predict_prog, startup_prog):
            with fluid.unique_name.guard():
                predict_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args,
                    xlnet_config=xlnet_config,
                    n_class=num_labels)

        predict_prog = predict_prog.clone(for_test=True)
        predict_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.predict_batch_size,
                is_regression=args.is_regression,
                phase=args.eval_split,
                epoch=1,
                dev_count=1,
                shuffle=False), place)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog)
    elif args.do_eval or args.do_predict:
        if not args.init_checkpoint:
            # The two literals previously concatenated to "...set ifonly...".
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        build_strategy = fluid.BuildStrategy()

        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy, train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_num_seqs, total_time = [], [], 0.0
        throughput = []
        ce_info = []
        while steps < args.train_steps:
            try:
                time_begin = time.time()
                steps += 1
                # Values are fetched only on every skip_steps-th step.
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program, fetch_list=fetch_list)

                time_end = time.time()
                used_time = time_end - time_begin
                total_time += used_time

                if steps % args.skip_steps == 0:
                    np_loss, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f" % np_lr[0]
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress(
                    )

                    avg_loss = np.sum(total_cost) / np.sum(total_num_seqs)
                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}".format(
                        current_epoch, current_example, num_train_examples,
                        steps, avg_loss)
                    ce_info.append([avg_loss, used_time])

                    # steps is at least 1 here, so speed is always reported.
                    speed = args.skip_steps / total_time
                    throughput.append(speed)
                    log_record = log_record + ", speed: %f steps/s" % speed
                    print(log_record)

                    # Start a fresh accumulation window.
                    total_cost, total_num_seqs, total_time = [], [], 0.0

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    print("Average throughtput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_eval:
                        evaluate(exe, dev_prog, dev_data_loader,
                                 [loss.name, num_seqs.name, logits.name, label_ids.name],
                                 args.eval_split, processor.get_num_examples(phase=args.eval_split))
            except fluid.core.EOFException:
                # Data exhausted before train_steps: checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break

        if args.enable_ce:
            card_num = get_cards()
            ce_cost = 0
            ce_time = 0
            try:
                ce_cost = ce_info[-2][0]
                ce_time = ce_info[-2][1]
            except IndexError:
                # Fewer than two log records were collected.
                print("ce info error")
            print("kpis\ttrain_duration_%s_card%s\t%s" %
                  (args.task_name.replace("-", "_"), card_num, ce_time))
            print("kpis\ttrain_cost_%s_card%s\t%f" %
                  (args.task_name.replace("-", "_"), card_num, ce_cost))

    # final eval on dev set
    if args.do_eval:
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, num_seqs.name, logits.name, label_ids.name],
                 args.eval_split, processor.get_num_examples(phase=args.eval_split))

    # final eval on test set
    if args.do_predict:
        predict(exe, predict_prog, predict_data_loader, task_name, label_list, [logits.name])
def main(args):
    """Train an ERNIE classifier with accuracy-gated evaluation and saving.

    Builds the training program (when ``args.do_train``) and the test program
    (when ``args.do_val`` or ``args.do_test``), initializes parameters from a
    checkpoint or pretraining params, then runs the training loop. Every
    ``args.skip_steps`` steps the training batch is evaluated; when its
    accuracy exceeds the previous best training accuracy (initially 0.90) or
    0.95, the dev set is evaluated, and when that accuracy exceeds the
    previous best (initially 0.80) the model is saved and predictions are
    written. Finally, an optional diagnostic prediction pass is run.

    Args:
        args: parsed run configuration (attribute access only).

    Raises:
        ValueError: if none of ``do_train``/``do_val``/``do_test`` is set, or
            if validation/testing runs without training and without
            ``init_checkpoint``.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                        label_map_config=args.label_map_config,
                                        max_seq_len=args.max_seq_len,
                                        do_lower_case=args.do_lower_case,
                                        in_tokens=args.in_tokens,
                                        random_seed=args.random_seed,
                                        tokenizer=args.tokenizer,
                                        is_classify=args.is_classify,
                                        is_regression=args.is_regression,
                                        for_cn=args.for_cn,
                                        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    # Prediction batch size defaults to the training batch size.
    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size
    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # With in_tokens the batch size is divided by max_seq_len before it
        # is used to derive the step count.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        # The bare string below is disabled code kept as a no-op expression.
        """ if args.random_seed is not None and args.enable_ce: train_program.random_seed = args.random_seed """
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # Rebinds graph_vars to the test graph's variables.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)

        test_prog = test_prog.clone(for_test=True)
    # Single-trainer values passed to ParallelExecutor below.
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # NOTE(review): the two literals concatenate to "...set ifonly
            # doing..." -- a space is missing in the message.
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    # Evaluation uses the plain executor unless multi-GPU testing is requested.
    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                              main_program=test_prog,
                                              share_vars_from=train_exe)

    # Placeholder values used when no training takes place.
    steps = 10000
    current_epoch = 1
    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        # Accuracy thresholds that must be beaten before the first
        # evaluation / checkpoint is triggered.
        previous_eval_acc = 0.80
        previous_train_acc = 0.90
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric,
                                       is_classify=args.is_classify,
                                       is_regression=args.is_regression)
                    acc = outputs["accuracy"]
                    # NOTE(review): the nesting of this branch was
                    # reconstructed from collapsed text -- confirm.
                    # NOTE(review): test_prog/test_pyreader exist only when
                    # do_val or do_test is set; this branch appears to need
                    # one of them alongside do_train -- confirm.
                    if acc > previous_train_acc or acc > 0.95:
                        print(
                            "previous train accuracy is %f and current train accuracy is %f "
                            % (previous_train_acc, acc))
                        previous_train_acc = acc
                        eval_acc = evaluate_wrapper(args, reader, exe,
                                                    test_prog, test_pyreader,
                                                    graph_vars, current_epoch,
                                                    steps)
                        print(
                            "previous evaluate accuracy is %f and current evaluate accuracy is %f "
                            % (previous_eval_acc, eval_acc))
                        if eval_acc > previous_eval_acc:
                            previous_eval_acc = eval_acc
                            # The checkpoint name is built from the text
                            # after the '.' in str(eval_acc).
                            save_path = os.path.join(
                                args.checkpoints,
                                "evalacc_" + str(eval_acc).split('.')[1])
                            fluid.io.save_persistables(exe, save_path,
                                                       train_program)
                            predict_wrapper(args,
                                            reader,
                                            exe,
                                            test_prog,
                                            test_pyreader,
                                            graph_vars,
                                            current_epoch,
                                            steps="evalacc_" +
                                            str(eval_acc).split('.')[1])
                            print(
                                "predict and save model!!!!!!!!!!!!!!!!!!!!!!!!!! in %s"
                                % (save_path))

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, outputs["loss"], outputs["accuracy"],
                         args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])
                    time_begin = time.time()

                # Periodic saving/evaluation is disabled; saving is driven by
                # the accuracy gate above instead.
                # if steps % args.save_steps == 0:
                #     save_path = os.path.join(args.checkpoints,
                #                              "step_" + str(steps))
                #     fluid.io.save_persistables(exe, save_path, train_program)

                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                #     # evaluate dev set
                #     if args.do_val:
                #         ret=evaluate_wrapper(args, reader, exe, test_prog,
                #                          test_pyreader, graph_vars,
                #                          current_epoch, steps)

                #     if args.do_test:
                #         predict_wrapper(args, reader, exe,
                #                 test_prog, test_pyreader, graph_vars,
                #                 current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                # Reader exhausted: save a final checkpoint and leave the loop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    # if args.do_val:
    #     evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
    #                      graph_vars, current_epoch, steps)

    # final eval on test set
    # NOTE(review): steps is reset here but not read again below.
    steps = 0
    # if args.do_test:
    #     current_epoch = 0
    #     predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
    #                     current_epoch, steps)

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        print("Final diagnostic")
        qids, preds, probs = predict(test_exe,
                                     test_prog,
                                     test_pyreader,
                                     graph_vars,
                                     is_classify=args.is_classify,
                                     is_regression=args.is_regression)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        # One tab-separated line per example: id, prediction, probabilities.
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        print("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
def DayAheadMarket():
    """Run ``results.optimization`` and export per-generator revenue data.

    The revenue/production frame returned by the optimization is extended in
    place with formatted totals, the start-up count and a set of generator
    attributes, then written to ``revenue_cost_gen.csv``.

    Returns:
        tuple: ``(df_lineflow, df_price, df_windprod, df_solarprod,
        df_windsolarload)`` exactly as returned by ``results.optimization``.
    """
    (df_price, _genprod, df_lineflow, _loadshed, df_windsolarload,
     df_revenueprod, _network, _times, generators, startup_number_df,
     _zonalconsumption, df_windprod, df_solarprod) = results.optimization()

    # Same object as df_revenueprod: the columns below are added in place.
    report = df_revenueprod
    two_decimals = '{:.2f}'.format

    formatted_totals = (
        ('TotalRevenue', 'Total Revenue'),
        ('TotalProduction', 'Total Production'),
    )
    for target, source in formatted_totals:
        report[target] = report[source].map(two_decimals)

    report['NumberofS/U'] = startup_number_df['Total Start-Ups']

    # (output column, attribute on ``generators``), in output column order.
    generator_columns = (
        ('Capacity', 'capacity'),
        ('MarginalCost', 'lincost'),
        ('S/Ucost', 'cyclecost'),
        ('FixedO&MCost', 'fixedomcost'),
        ('VarO&MCost', 'varomcost'),
        ('LevelizedCapitalCost', 'levcapcost'),
        ('PrimaryFuel', 'primaryfuel'),
    )
    for column, attribute in generator_columns:
        report[column] = getattr(generators, attribute)

    report.to_csv('revenue_cost_gen.csv')
    return df_lineflow, df_price, df_windprod, df_solarprod, df_windsolarload
def net(self):
    """Build the BERT training program under fleet and return its loss.

    Reads configuration from ``self.p_args()``. When ``args.do_train`` is set,
    the trainer id and endpoints are taken from ``dist_env()`` and fleet is
    initialized with a user-defined collective role.

    Returns:
        ``self.loss`` as produced by ``create_model``.

    Side effects visible here: sets ``self.train_data_generator``,
    ``self.warmup_steps``, ``self.train_pyreader`` and ``self.loss``; runs the
    startup program; writes the files ``__model__`` and ``debug_program`` in
    the current working directory.
    """
    args = self.p_args()
    bert_config = BertConfig("uncased_L-24_H-1024_A-16/bert_config.json")
    bert_config.print_config()
    place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    dev_count = 1
    # NOTE(review): the extent of this branch was reconstructed from
    # collapsed text. If it ends after fleet.init(role) as shown,
    # worker_endpoints is unbound below when do_train is false -- confirm.
    if args.do_train:
        my_dist_env = dist_env()
        worker_endpoints_env = my_dist_env["trainer_endpoints"]
        # Endpoints arrive as one comma-separated string.
        worker_endpoints = worker_endpoints_env.split(",")
        current_endpoint = my_dist_env["current_endpoint"]
        # The trainer id is this process's position in the endpoint list.
        trainer_id = worker_endpoints.index(current_endpoint)
        # new rolemaker here
        print("current_id: ", trainer_id)
        print("worker_endpoints: ", worker_endpoints)
        role = role_maker.UserDefinedCollectiveRoleMaker(
            current_id=trainer_id, worker_endpoints=worker_endpoints)
        # Fleet get role of each worker
        fleet.init(role)
    exe = fluid.Executor(place)
    # init program
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    # A seed of 0 leaves both programs with their default random seed.
    if args.random_seed != 0:
        print("set program random seed as: ", args.random_seed)
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed
    task_name = args.task_name.lower()
    # Map of lower-cased task name -> processor class.
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())
    # Replaces the initial value of 1 with the number of worker endpoints.
    dev_count = len(worker_endpoints)
    # we need to keep every trainer of fleet the same shuffle_seed
    print("shuffle_seed: ", args.shuffle_seed)
    self.train_data_generator = processor.data_generator(
        batch_size=args.batch_size,
        phase='train',
        epoch=args.epoch,
        dev_count=dev_count,
        dev_idx=0,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed)
    # NOTE(review): num_train_examples is not used afterwards in this function.
    num_train_examples = processor.get_num_examples(phase='train')
    # Step counts are hard-coded rather than derived from the data set size.
    max_train_steps = 5
    # int(5 * 0.1) evaluates to 0, i.e. zero warm-up steps are requested.
    self.warmup_steps = int(5 * 0.1)
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    dist_strategy.use_hierarchical_allreduce = True
    #dist_strategy.mode = "collective"
    #dist_strategy.collective_mode = "grad_allreduce"
    # Model and optimizer are built inside the train/startup program pair.
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                args, bert_config=bert_config, num_labels=num_labels)
            scheduled_lr = optimization(loss=self.loss,
                                        warmup_steps=self.warmup_steps,
                                        num_train_steps=max_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=False,
                                        loss_scaling=args.loss_scaling,
                                        dist_strategy=dist_strategy)
    exe.run(startup_prog)
    # Dump fleet's original program: serialized desc (binary) and its str() form.
    with open("__model__", "wb") as f:
        f.write(fleet._origin_program.desc.serialize_to_string())
    with open("debug_program", "w") as f:
        f.write(str(fleet._origin_program))
    return self.loss
# -*- coding: utf-8 -*-
"""
Run ``optimization`` with the "perfect" training data and the "noisy" test
data, then print summary statistics of the resulting test errors.

Created on Tue Feb 24 09:38:09 2015

@author: wirkert
"""

import numpy as np
import matplotlib.pyplot as plt

from setup import data
from optimization import optimization

#%% load data

dataFolder = "data/output/"

# Only the training half of the perfect set is used ...
trainingParameters, trainingReflectances, _, _ = data.perfect(dataFolder)
# ... and only the test half of the noisy set.
_, _, testParameters, testReflectances = data.noisy(dataFolder)

testingErrors, r2Score = optimization(
    trainingParameters,
    trainingReflectances,
    testParameters,
    testReflectances,
)

#%% test

print("error distribution BVF, Volume fraction")

errorSummary = (
    ("median: ", np.median(testingErrors, axis=0)),
    ("lower quartile: ", np.percentile(testingErrors, 25, axis=0)),
    ("higher quartile: ", np.percentile(testingErrors, 75, axis=0)),
)
for label, value in errorSummary:
    print(label + str(value))

print("r2Score", str(r2Score))
'w_mines': 5.5, 'w_step_size': 0.05, } solver_params = { 'controler_0' : controler_0_solver_params, 'controler_1' : controler_1_solver_params, # 'lr_type': 'inv', 'base': 0.5, 'gamma': 0.0001, 'power': 0.75, # 'lr_type': 'episodic', 'base': 0.005, 'interval': 10e3, # 'lr_type': 'fixed', 'base': 0.003, # 'optimizer': 'sgd', #'optimizer': 'rmsprop', 'rho': 0.9, 'eps': 1E-6, #'optimizer': 'rmsprop_graves', 'aleph': 0.95, 'beit': 0.9, 'gimmel': 0.0001, 'dalet': 0.0001, #'optimizer': 'adadelta', 'rho': 0.9, 'eps': 1.E-6, #'optimizer': 'adagrad', 'rho': 0.9, 'eps': 1.E-6, 'momentum': 0.9, 'n_train_iters': 1E6, 'test_interval': 1000, 'switch_interval': 100000, 'trnsprnt_interval': 0, } snapshots_dir = os.getcwd() + '/snapshots/' trained_model=['',''] # trained_model = '' optimization(game_params = game_params, arch_params=arch_params, solver_params=solver_params, trained_model= trained_model, sn_dir = snapshots_dir)
def main(args):
    """Train and/or evaluate a sequence-labeling model configured by ErnieConfig.

    Depending on ``args.do_train``, ``args.do_val`` and ``args.do_test`` this
    builds the training and/or test programs, initializes parameters from a
    checkpoint or from pretraining params, runs the training loop (periodic
    logging, checkpointing and evaluation) and finally evaluates on the dev
    and test sets.

    Args:
        args: Options object; every setting is read as an attribute
            (``args.use_cuda``, ``args.batch_size``, ``args.epoch``, ...).

    Raises:
        ValueError: If none of ``do_train``/``do_val``/``do_test`` is set, or
            if only evaluation is requested and ``args.init_checkpoint`` is
            empty.
        FileExistsError: If logging is enabled and ``args.log_path`` already
            exists.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    # Pick the device and the number of devices used to split the work.
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # With in_tokens the batch size is divided by max_seq_len before
        # computing the number of steps; either way the total is split
        # across dev_count devices.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                # Variables that are fetched later are excluded from memory
                # optimization.
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name,
                        graph_vars["labels"].name,
                        graph_vars["infers"].name,
                        graph_vars["seq_lens"].name
                    ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: this rebinds graph_vars; the training code below uses
                # the rebound mapping as well.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Parameter initialization: init_checkpoint wins over
    # init_pretraining_params when both are given.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # The two literals are concatenated, so the first one needs the
            # trailing space to keep the words apart.
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # NOTE(review): test_exe is created here, but the evaluate()/predict()
        # calls below are given `exe` -- confirm that is intended.
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        if args.save_log and args.log_path:
            # Refuse to overwrite an existing log file.
            if os.path.exists(args.log_path):
                raise FileExistsError("Logging file already exists!")
            with open(args.log_path, 'w') as logfile:
                logfile.write('%s\n' % time.asctime())
            print('Writing logs into %s' % args.log_path)

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    # Plain training step, nothing fetched.
                    train_exe.run(fetch_list=[])
                else:
                    # Every skip_steps steps: run through evaluate() to get
                    # metrics for logging.
                    outputs = evaluate(train_exe, train_program,
                                       train_pyreader, graph_vars,
                                       args.num_labels, "train", dev_count)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["lr"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                          % (current_epoch, current_example,
                             num_train_examples, steps, outputs["loss"],
                             outputs["f1"], outputs["precision"],
                             outputs["recall"], args.skip_steps / used_time))
                    if args.save_log and args.log_path:
                        with open(args.log_path, 'a') as logfile:
                            logfile.write("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                          "f1: %f, precision: %f, recall: %f\n" % (
                                              current_epoch, current_example,
                                              num_train_examples, steps,
                                              outputs["loss"], outputs["f1"],
                                              outputs["precision"],
                                              outputs["recall"]))
                    # Restart the timer for the next logging window.
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.dev_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.test_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "test")

            except fluid.core.EOFException:
                # The reader is exhausted: save a final checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "dev")
        if args.do_predict:
            print("Saving predicted results...")
            # NOTE(review): the label passed here is "test" although this is
            # the dev branch -- confirm against predict().
            predict(exe, test_prog, test_pyreader, graph_vars,
                    args.label_map_config, "test",
                    output_dir="./predicted_results")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.test_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "test")
        if args.do_predict:
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars,
                    args.label_map_config, "test",
                    output_dir="./predicted_results")
def main(args):
    """Train and/or evaluate a classifier configured by BertConfig.

    Selects a data processor by ``args.task_name``, builds the training
    and/or test programs, initializes parameters, runs the training loop
    (periodic logging, checkpointing and evaluation) and finally evaluates on
    the dev and test sets as requested.

    Args:
        args: Options object; every setting is read as an attribute
            (``args.use_cuda``, ``args.task_name``, ``args.batch_size``, ...).

    Raises:
        KeyError: If ``args.task_name`` (lower-cased) is not one of the
            registered processors.
        ValueError: If none of ``do_train``/``do_val``/``do_test`` is set, or
            if only evaluation is requested and ``args.init_checkpoint`` is
            empty.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    # Pick the device and the number of devices used to split the work.
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)

        num_train_examples = processor.get_num_examples(phase='train')

        # With in_tokens the batch size is divided by max_seq_len before
        # computing the number of steps; either way the total is split
        # across dev_count devices.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)
                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                # Variables that are fetched later are excluded from memory
                # optimization.
                fluid.memory_optimize(input_program=train_program,
                                      skip_opt_set=[
                                          loss.name, probs.name,
                                          accuracy.name, num_seqs.name
                                      ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: this rebinds loss/probs/accuracy/num_seqs; the
                # training loop below reads their .name from the rebound
                # variables.
                test_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Parameter initialization: init_checkpoint wins over
    # init_pretraining_params when both are given.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # The two literals are concatenated, so the first one needs the
            # trailing space to keep the words apart.
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # NOTE(review): test_exe is created here, but the evaluate() calls
        # below are given `exe` -- confirm that is intended.
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        # Per-window accumulators, reset after every log line.
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Fetch metrics only every skip_steps steps; the learning
                # rate is fetched only when warmup is in use.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, accuracy.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_acc, np_num_seqs = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    # Weight loss/accuracy by the number of sequences so the
                    # printed values are per-sequence averages.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (np_lr[0] if warmup_steps > 0
                                                          else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, np.sum(total_cost) / np.sum(total_num_seqs),
                         np.sum(total_acc) / np.sum(total_num_seqs),
                         args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='dev',
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='test',
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                # The reader is exhausted: save a final checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
def train(args):
    """Run multi-task pretraining, feeding batches to the executor directly.

    Builds the train and test programs from the task group described in
    ``args.task_group_json``, optionally loads initial parameters, then
    loops over batches produced by ``ErnieDataReader``: each batch is turned
    into a feed dict, metrics are fetched and printed every
    ``args.skip_steps`` steps, persistables are saved every
    ``args.save_steps`` steps and ``predict()`` is run every
    ``args.validation_steps`` steps. The loop ends when the batch iterator is
    exhausted.

    Args:
        args: Options object; every setting is read as an attribute
            (``args.ernie_config_path``, ``args.task_group_json``,
            ``args.use_amp``, ``args.skip_steps``, ...).

    Raises:
        ValueError: If ``args.use_recompute`` is set; no distributed strategy
            object is constructed in this function to attach the recompute
            checkpoints to.
    """
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    with open(args.task_group_json) as f:
        task_group = json.load(f)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 4 if args.use_amp else 2
    exec_strategy.num_iteration_per_drop_scope = min(1, args.skip_steps)

    node_nums = 1  #int(os.getenv("PADDLE_NODES_NUM"))
    print("args.is_distributed:", args.is_distributed)
    num_trainers = 1
    trainer_id = 0
    dist_strategy = None

    gpu_id = 0
    gpus = 1  #fluid.core.get_cuda_device_count()
    print(gpus)
    if args.is_distributed:
        gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_id = int(gpus[0])

    # The executor is always placed on CPU in this function.
    place = fluid.CPUPlace()
    dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d, gpu_id:%d" % (dev_count, gpu_id))

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            fetch_vars, train_data_names = create_model(
                pyreader_name='train_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            checkpoints = fetch_vars["checkpoints"]
            # The total loss is the last entry of graph_vars.
            total_loss = graph_vars[-1]
            if args.use_recompute:
                # dist_strategy is still None at this point; fail with an
                # explicit message instead of an AttributeError on None.
                if dist_strategy is None:
                    raise ValueError(
                        "args.use_recompute requires a distributed strategy, "
                        "but none is configured in train()")
                dist_strategy.recompute_checkpoints = checkpoints
            fetch_list_ascend = [var for var in graph_vars]
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_amp,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio,
                fetch_list=fetch_list_ascend,
                dist_strategy=dist_strategy)

    origin_train_program = train_program

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            # NOTE: this rebinds graph_vars/total_loss; the training loop
            # below builds its fetch list from the rebound graph_vars.
            fetch_vars, test_data_names = create_model(
                pyreader_name='test_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            total_loss = graph_vars[-1]

    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        #init_checkpoint(exe, args.init_checkpoint, origin_train_program, args.use_amp)
        init_pretraining_params(exe, args.init_checkpoint,
                                origin_train_program, args.use_amp)

    data_reader = ErnieDataReader(task_group,
                                  False,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample,
                                  hack_old_trainset=args.hack_old_data)

    #only fleet
    train_exe = exe

    predict = predict_wrapper(args,
                              exe,
                              ernie_config,
                              task_group,
                              test_prog=test_prog,
                              data_names=test_data_names,
                              fetch_list=[var for var in graph_vars])

    #train_pyreader.set_batch_generator(data_reader.data_generator())
    #train_pyreader.start()
    train_data_generator = data_reader.data_generator()
    # Create the batch iterator ONCE. Calling train_data_generator() inside
    # the loop would build a fresh iterator on every step and restart the
    # data stream each time.
    batch_iter = train_data_generator()
    steps = 0
    time_begin = time.time()
    feed_list = {}
    while True:  #steps < args.num_train_steps:
        try:
            steps += 1  #node_nums
            skip_steps = args.skip_steps  # * node_nums
            input_list = next(batch_iter, None)
            if input_list is None:
                # Data exhausted: stop training.
                break
            # Pair each batch field with its feed name by position.
            for index, field in enumerate(input_list):
                feed_list[train_data_names[index]] = field

            # Fetch metrics only every skip_steps steps on trainer 0.
            fetch_list = []
            if trainer_id == 0 and steps % skip_steps == 0:
                fetch_list = [var for var in graph_vars] + [scheduled_lr.name]
                if args.use_amp:
                    fetch_list.append(loss_scaling.name)

            outputs = train_exe.run(feed=feed_list,
                                    fetch_list=fetch_list,
                                    program=train_program)
            time_end = time.time()
            used_time = time_end - time_begin

            if outputs:
                # Layout of outputs, as established by the indexing below:
                # [mask_lm_cost, lm_w, (task_acc, task_w) per task, ...,
                #  constract_loss, total_cost, lr(, loss_scaling)].
                each_mask_lm_cost, lm_w = outputs[:2]
                if args.use_amp:
                    each_total_constract_loss, each_total_cost, np_lr, l_scaling = outputs[
                        -4:]
                else:
                    each_total_constract_loss, each_total_cost, np_lr = outputs[
                        -3:]
                acc_list = []
                index = 2
                for task in task_group:
                    each_task_acc = outputs[index]
                    task_w = outputs[index + 1]
                    # Weighted accuracy for this task.
                    acc = np.sum(each_task_acc * task_w) / np.sum(task_w)
                    acc_list.append("%s acc: %f" % (task["task_name"], acc))
                    index += 2

                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                if args.use_amp:
                    print("current learning_rate:%f, loss scaling:%f" %
                          (np_lr[0], l_scaling[0]))
                else:
                    print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, constract_loss: %f, loss: %f, "
                    "ppl: %f, %s, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(each_total_constract_loss),
                       np.mean(each_total_cost),
                       np.exp(
                           np.sum(each_mask_lm_cost * lm_w) / np.sum(lm_w)),
                       ", ".join(acc_list), skip_steps / used_time,
                       current_file, mask_type))
                time_begin = time.time()
            elif steps % skip_steps == 0:
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("epoch: %d, progress: %d/%d, step: %d, "
                      "speed: %f steps/s, file: %s, mask_type: %s" %
                      (epoch, current_file_index, total_file, steps,
                       skip_steps / used_time, current_file, mask_type))
                time_begin = time.time()

            # Only trainer 0 saves checkpoints and runs validation.
            if not trainer_id == 0:
                continue

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           origin_train_program)

            if steps % args.validation_steps == 0:
                # NOTE(review): `epoch` is only assigned in the logging
                # branches above; confirm validation_steps is a multiple of
                # skip_steps.
                valid_list = predict()
                print("[validation_set] epoch: %d, step: %d, %s" % \
                      (epoch, steps, ", ".join(valid_list)))
        except fluid.core.EOFException:
            # No pyreader exists in this function (its setup is commented
            # out above), so there is nothing to reset; just stop.
            break
def train(args):
    """Train an XLNet-configured model and/or run prediction with it.

    When ``args.do_train`` is set, builds the training program and runs up
    to ``args.train_steps`` steps with periodic logging and checkpointing.
    When ``args.do_predict`` is set, builds the test program and runs
    ``predict()`` over ``args.predict_file``.

    Args:
        args: Options object; every setting is read as an attribute
            (``args.model_config_path``, ``args.train_steps``,
            ``args.skip_steps``, ...).

    Raises:
        ValueError: If neither ``do_train`` nor ``do_predict`` is set, or if
            only prediction is requested and ``args.init_checkpoint`` is
            empty.
    """
    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    xlnet_config = XLNetConfig(args.model_config_path)
    xlnet_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(spiece_model_file=args.spiece_model_file,
                              uncased=args.uncased,
                              max_seq_length=args.max_seq_length,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.train_batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        print("Device count: %d" % dev_count)
        print("Max num of epoches: %d" % args.epoch)
        print("Num of train examples: %d" % num_train_examples)
        print("Num of train steps: %d" % args.train_steps)
        print("Num of warmup steps: %d" % args.warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss = create_model(
                    xlnet_config=xlnet_config, is_training=True)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=args.warmup_steps,
                    num_train_steps=args.train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    lr_layer_decay_rate=args.lr_layer_decay_rate,
                    scheduler=args.lr_scheduler)

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, predictions = create_model(
                    xlnet_config=xlnet_config, is_training=False)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Parameter initialization: init_checkpoint wins over
    # init_pretraining_params when both are given.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_predict:
        if not args.init_checkpoint:
            # The two literals are concatenated, so the first one needs the
            # trailing space to keep the words apart.
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
        build_strategy = fluid.BuildStrategy()
        # These two flags must be set in this model for correctness
        build_strategy.fuse_all_optimizer_ops = True
        build_strategy.enable_inplace = False

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           build_strategy=build_strategy,
                                           main_program=train_program)

        train_data_loader.set_batch_generator(train_data_generator, place)

        train_data_loader.start()
        steps = 0
        # Losses collected since the last log line.
        total_cost = []
        time_begin = time.time()
        print("Begin to train model ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        while steps < args.train_steps:
            try:
                steps += 1
                # Fetch loss and learning rate only every skip_steps steps.
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, scheduled_lr.name]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    np_loss, np_lr = outputs
                    total_cost.extend(np_loss)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f " % np_lr[0]
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.mean(total_cost), args.skip_steps / used_time))
                    total_cost = []
                    time_begin = time.time()

                # Checkpoint periodically and always on the last step.
                if steps % args.save_steps == 0 or steps == args.train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

            except fluid.core.EOFException:
                # Data ran out before train_steps was reached: save a
                # checkpoint with a "_final" suffix and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break
        print("Finish model training ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    if args.do_predict:
        print("Begin to do prediction ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        test_data_loader.set_batch_generator(
            processor.data_generator(data_path=args.predict_file,
                                     batch_size=args.predict_batch_size,
                                     phase='predict',
                                     shuffle=False,
                                     dev_count=1,
                                     epoch=1), place)

        predict(exe,
                test_prog,
                test_data_loader, [
                    predictions['unique_ids'].name,
                    predictions['start_top_log_probs'].name,
                    predictions['start_top_index'].name,
                    predictions['end_top_log_probs'].name,
                    predictions['end_top_index'].name,
                    predictions['cls_logits'].name
                ],
                processor,
                name='')

        print("Finish prediction ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def train(args):
    """Run pretraining with a pyreader-fed ParallelExecutor.

    Builds the train and test programs, optionally transpiles the train
    program for NCCL2 multi-trainer mode (``args.is_distributed``), loads an
    initial checkpoint if given, then trains until ``args.num_train_steps``
    is reached or the reader raises EOF. Trainer 0 logs every
    ``args.skip_steps`` (scaled by the trainer count), saves persistables
    every ``args.save_steps`` and, when ``args.valid_filelist`` is set,
    validates every ``args.validation_steps``.

    Args:
        args: Options object; every setting is read as an attribute
            (``args.ernie_config_path``, ``args.use_cuda``,
            ``args.is_distributed``, ``args.skip_steps``, ...).
    """
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config)
            scheduled_lr = optimization(loss=total_loss,
                                        warmup_steps=args.warmup_steps,
                                        num_train_steps=args.num_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=args.use_fp16,
                                        loss_scaling=args.loss_scaling)

            # Variables that are fetched later are excluded from memory
            # optimization.
            fluid.memory_optimize(input_program=train_program,
                                  skip_opt_set=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            # NOTE: this rebinds next_sent_acc/mask_lm_loss/total_loss; the
            # code below reads their .name from the rebound variables.
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)
    print("theoretical memory usage: ")
    if args.in_tokens:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size //
                                       args.max_seq_len))
    else:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size))

    # Single-trainer defaults; overwritten below in distributed mode.
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # Endpoints come from the environment; this trainer's id is its
        # position in the comma-separated endpoint list.
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = ErnieDataReader(filelist=args.train_filelist,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample,
                                  in_tokens=args.in_tokens,
                                  is_bidirection=args.is_bidirection)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)

    build_strategy = fluid.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=total_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy,
                                       main_program=train_program,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

    # predict is only defined when a validation file list is given; the
    # validation branch in the loop checks the same flag before calling it.
    if args.valid_filelist and args.valid_filelist != "":
        predict = predict_wrapper(args,
                                  exe,
                                  ernie_config,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    # Metrics collected since the last log line.
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            # The step counter and the logging interval both scale with the
            # number of trainers.
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers

            # Trainers other than 0 only run training steps; they never log,
            # save or validate.
            if nccl2_trainer_id != 0:
                train_exe.run(fetch_list=[])
                continue

            if steps % skip_steps != 0:
                train_exe.run(fetch_list=[])
            else:
                each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                    fetch_list=[
                        next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                        scheduled_lr.name
                    ])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(np.array(cost)),
                       np.mean(np.exp(
                           np.array(lm_cost))), np.mean(np.array(acc)),
                       skip_steps / used_time, current_file, mask_type))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)

            if args.valid_filelist and steps % args.validation_steps == 0:
                # NOTE(review): `epoch` is only assigned in the logging
                # branch above; confirm validation_steps is a multiple of
                # the logging interval.
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            # The reader is exhausted: reset it and stop training.
            train_pyreader.reset()
            break
def main(args):
    """Fine-tune and/or evaluate a BERT model on a dialogue-understanding task.

    Depending on ``args.do_train``, ``args.do_val`` and ``args.do_test`` this
    builds the train/test programs, initialises parameters from a checkpoint
    or pre-trained weights, runs the training loop with periodic logging,
    checkpointing and evaluation, and finally evaluates on the dev/test sets.

    Args:
        args: parsed command-line namespace. Fields read here include
            ``bert_config_path``, ``task_name``, ``data_dir``, ``vocab_path``,
            ``batch_size``, ``epoch``, ``skip_steps``, ``save_steps``,
            ``validation_steps``, ``checkpoints`` and the ``do_*`` switches.

    Raises:
        ValueError: if none of ``do_train``/``do_val``/``do_test`` is set, or
            if evaluation-only mode is requested without ``init_checkpoint``.
        KeyError: if ``args.task_name`` is not one of the supported tasks.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    paradigm_inst = define_paradigm.Paradigm(task_name)

    processors = {
        'udc': reader.UDCProcessor,
        'swda': reader.SWDAProcessor,
        'mrda': reader.MRDAProcessor,
        'atis_slot': reader.ATISSlotProcessor,
        'atis_intent': reader.ATISIntentProcessor,
        'dstc2': reader.DSTC2Processor,
    }
    # Per-task switch: when True, batch_size is divided by max_seq_len below.
    in_tokens = {
        'udc': True,
        'swda': True,
        'mrda': True,
        'atis_slot': False,
        'atis_intent': True,
        'dstc2': True,
    }

    processor = processors[task_name](
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=in_tokens[task_name],
        task_name=task_name,
        random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)
        num_train_examples = processor.get_num_examples(phase='train')

        if in_tokens[task_name]:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None:
            train_program.random_seed = args.random_seed
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                results = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels,
                    paradigm_inst=paradigm_inst)
                train_pyreader = results.get("pyreader", None)
                loss = results.get("loss", None)
                probs = results.get("probs", None)
                accuracy = results.get("accuracy", None)
                num_seqs = results.get("num_seqs", None)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                # Variables that are fetched later must survive memory reuse.
                if accuracy is not None:
                    skip_opt_set = [loss.name, probs.name, accuracy.name, num_seqs.name]
                else:
                    skip_opt_set = [loss.name, probs.name, num_seqs.name]
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=skip_opt_set)

        if args.verbose:
            if in_tokens[task_name]:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_results = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels,
                    paradigm_inst=paradigm_inst)
                test_pyreader = test_results.get("pyreader", None)
                loss = test_results.get("loss", None)
                probs = test_results.get("probs", None)
                accuracy = test_results.get("accuracy", None)
                num_seqs = test_results.get("num_seqs", None)
        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    # Fetch targets for evaluation. Built here rather than inside the training
    # loop so that evaluation-only runs (do_train=False) can use it as well;
    # previously the final evaluation raised NameError in that mode.
    if accuracy is not None:
        fetch_test_list = [loss.name, accuracy.name, num_seqs.name]
    else:
        fetch_test_list = [loss.name, num_seqs.name]

    if args.do_train:
        # Fetch targets for a logging step, in the order they are unpacked
        # below: loss, [accuracy], [learning rate], num_seqs. The learning
        # rate is only fetched when a warmup schedule is in effect.
        train_fetch_list = [loss.name]
        if accuracy is not None:
            train_fetch_list.append(accuracy.name)
        if warmup_steps > 0:
            train_fetch_list.append(scheduled_lr.name)
        train_fetch_list.append(num_seqs.name)

        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        ce_info = []
        while True:
            try:
                steps += 1
                is_log_step = steps % args.skip_steps == 0
                fetch_list = train_fetch_list if is_log_step else []
                outputs = train_exe.run(fetch_list=fetch_list)

                if is_log_step:
                    if warmup_steps <= 0:
                        if accuracy is not None:
                            np_loss, np_acc, np_num_seqs = outputs
                        else:
                            np_loss, np_num_seqs = outputs
                    else:
                        if accuracy is not None:
                            np_loss, np_acc, np_lr, np_num_seqs = outputs
                        else:
                            np_loss, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    if accuracy is not None:
                        total_acc.extend(np_acc * np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0] if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))
                    ave_loss = np.sum(total_cost) / np.sum(total_num_seqs)
                    speed = args.skip_steps / used_time
                    if accuracy is not None:
                        ave_acc = np.sum(total_acc) / np.sum(total_num_seqs)
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (current_time, current_epoch, current_example,
                               num_train_examples, steps, ave_loss, ave_acc,
                               speed))
                        ce_info.append([ave_loss, ave_acc, speed])
                    else:
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                              "speed: %f steps/s" %
                              (current_time, current_epoch, current_example,
                               num_train_examples, steps, ave_loss, speed))
                        ce_info.append([ave_loss, speed])
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='dev',
                                epoch=1,
                                shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader,
                                 fetch_test_list, "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='test',
                                epoch=1,
                                shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader,
                                 fetch_test_list, "test")
            except fluid.core.EOFException:
                # The reader is exhausted: save a last checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_train and args.enable_ce:
        card_num = get_cards()
        print("zytest_card_num", card_num)
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            # Uses the second-to-last logged record. Records have only two
            # fields when the model has no accuracy output, in which case
            # index 2 raises IndexError as well.
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except IndexError:
            print("ce info error")
        print("kpis\teach_step_duration_%s_card%s\t%s" %
              (task_name, card_num, ce_time))
        print("kpis\ttrain_loss_%s_card%s\t%f" %
              (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" %
              (task_name, card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                batch_size=args.batch_size,
                phase='dev',
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "dev")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                batch_size=args.batch_size,
                phase='test',
                epoch=1,
                shuffle=False))
        print("Final test result:")
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "test")
def main(args):
    """Fine-tune and/or evaluate a BERT sequence classifier.

    Builds the train/dev/test programs requested through ``args.do_train``,
    ``args.do_val`` and ``args.do_test``, initialises parameters, runs the
    data-parallel training loop with periodic logging, checkpointing and
    evaluation, and finishes with a final dev/test evaluation.

    The name ``num_trainers`` is read but not assigned here; it is expected
    to be defined outside this function.

    Args:
        args: parsed command-line namespace (``bert_config_path``,
            ``task_name``, ``data_dir``, ``batch_size``, ``epoch``,
            ``skip_steps``, ``save_steps``, ``validation_steps``,
            ``checkpoints``, ``max_iter``, ``is_profiler``, ``enable_ce``,
            the ``do_*`` switches, ...).

    Raises:
        ValueError: if none of ``do_train``/``do_val``/``do_test`` is set, or
            if evaluation-only mode is requested without ``init_checkpoint``.
        KeyError: if ``args.task_name`` is not a supported task.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle,
            shuffle_seed=shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_val:
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.batch_size,
                phase='dev',
                epoch=1,
                dev_count=1,
                shuffle=False), place)

    if args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)
        test_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.batch_size,
                phase='test',
                epoch=1,
                dev_count=1,
                shuffle=False), place)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
        build_strategy = fluid.BuildStrategy()

        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        # exec_strategy is passed explicitly; it was previously configured
        # above but never handed to the compiled program, so the executor
        # settings from the command line had no effect.
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        throughput = []
        ce_info = []

        total_batch_num = 0  # used for benchmark

        while True:
            try:
                steps += 1

                total_batch_num += 1  # used for benchmark
                if args.max_iter and total_batch_num == args.max_iter:  # used for benchmark
                    return

                is_log_step = steps % args.skip_steps == 0
                if is_log_step:
                    fetch_list = [
                        loss.name, accuracy.name, scheduled_lr.name,
                        num_seqs.name
                    ]
                    if args.use_fp16:
                        fetch_list.append(loss_scaling.name)
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if is_log_step:
                    if args.use_fp16:
                        np_loss, np_acc, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size()
                        verbose += "learning rate: %f" % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    # profiler tools: profile the window between the first
                    # and the second logging step, then exit.
                    if args.is_profiler and current_epoch == 0 and steps == args.skip_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and current_epoch == 0 and steps == args.skip_steps * 2:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

                    ave_loss = np.sum(total_cost) / np.sum(total_num_seqs)
                    ave_acc = np.sum(total_acc) / np.sum(total_num_seqs)
                    speed = args.skip_steps / used_time

                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}, ave acc: {}".format(
                        current_epoch, current_example, num_train_examples,
                        steps, ave_loss, ave_acc)
                    ce_info.append([ave_loss, ave_acc, used_time])
                    # steps is at least 1 here, so the speed is always logged.
                    throughput.append(speed)
                    log_record = log_record + ", speed: %f steps/s" % speed
                    print(log_record)
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.save(program=train_program, model_path=save_path)

                if steps % args.validation_steps == 0:
                    print("Average throughtput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_val:
                        evaluate(exe, dev_prog, dev_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        evaluate(exe, test_prog, test_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                # The data loader is exhausted: save a last checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)
                train_data_loader.reset()
                break

    # Training KPIs only exist when training actually ran; ce_info is not
    # defined otherwise.
    if args.do_train and args.enable_ce:
        card_num = get_cards()
        ce_cost = 0
        ce_acc = 0
        ce_time = 0
        try:
            # Uses the second-to-last logged record; fewer than two records
            # raises IndexError.
            ce_cost = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except IndexError:
            print("ce info error")
        print("kpis\ttrain_duration_%s_card%s\t%s" %
              (args.task_name, card_num, ce_time))
        print("kpis\ttrain_cost_%s_card%s\t%f" %
              (args.task_name, card_num, ce_cost))
        print("kpis\ttrain_acc_%s_card%s\t%f" %
              (args.task_name, card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        evaluate(exe, test_prog, test_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
def train(args):
    """Run ERNIE multi-task pre-training, optionally distributed via fleet.

    Loads the model config and the task group description, configures the
    execution / distributed strategy, builds the train and test programs,
    optionally loads initial parameters, and then runs the training loop.
    Trainer 0 periodically logs metrics, saves checkpoints and runs
    validation; the loop ends when the data reader is exhausted.

    Args:
        args: parsed command-line namespace (``ernie_config_path``,
            ``task_group_json``, ``is_distributed``, ``use_cuda``,
            ``use_amp``, ``use_recompute``, ``skip_steps``, ``save_steps``,
            ``validation_steps``, ``checkpoints``, ...).
    """
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    with open(args.task_group_json) as f:
        task_group = json.load(f)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 4 if args.use_amp else 2
    exec_strategy.num_iteration_per_drop_scope = min(1, args.skip_steps)

    print("args.is_distributed:", args.is_distributed)
    trainer_id = 0

    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        trainer_id = fleet.worker_index()
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = fleet.worker_endpoints()
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
              .format(worker_endpoints, trainers_num, current_endpoint, trainer_id))

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.remove_unnecessary_lock = False  # not useful
        dist_strategy.fuse_all_reduce_ops = bool(args.use_fuse)
        dist_strategy.nccl_comm_num = args.nccl_comm_num

        if args.use_hierarchical_allreduce \
                and trainers_num > args.hierarchical_allreduce_inter_nranks:
            dist_strategy.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            dist_strategy.hierarchical_allreduce_inter_nranks = \
                args.hierarchical_allreduce_inter_nranks
            # The group size, not the on/off flag, has to exceed one. The
            # original asserted on the flag itself, which cannot hold for a
            # plain True.
            assert dist_strategy.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % dist_strategy.hierarchical_allreduce_inter_nranks == 0
            # Floor division keeps the rank count an int; it is exact
            # because divisibility was asserted just above.
            dist_strategy.hierarchical_allreduce_exter_nranks = \
                trainers_num // dist_strategy.hierarchical_allreduce_inter_nranks

        if args.use_amp:
            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = args.init_loss_scaling

        if args.use_recompute:
            dist_strategy.forward_recompute = True
            dist_strategy.enable_sequential_execution = True
    else:
        dist_strategy = None

    gpu_id = 0
    gpu_count = fluid.core.get_cuda_device_count()
    if args.is_distributed:
        selected_gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_id = int(selected_gpus[0])
        gpu_count = len(selected_gpus)

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        # gpu_count is always an int here; the original called len() on the
        # integer device count when not running distributed.
        dev_count = gpu_count
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d, gpu_id:%d" % (dev_count, gpu_id))

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, fetch_vars = create_model(
                pyreader_name='train_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            checkpoints = fetch_vars["checkpoints"]
            total_loss = graph_vars[-1]
            # NOTE(review): dist_strategy is None when not distributed, so
            # use_recompute without is_distributed fails here - confirm that
            # recompute is meant to be fleet-only.
            if args.use_recompute:
                dist_strategy.recompute_checkpoints = checkpoints
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_amp,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio,
                dist_strategy=dist_strategy)

    origin_train_program = train_program
    if args.is_distributed:
        # The program was wrapped by fleet: run fleet's rewritten program,
        # but keep the original one for loading and saving parameters.
        train_program = fleet.main_program
        origin_train_program = fleet._origin_program

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, fetch_vars = create_model(
                pyreader_name='test_reader',
                ernie_config=ernie_config,
                task_group=task_group)
            graph_vars = fetch_vars["graph_vars"]
            total_loss = graph_vars[-1]

    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_pretraining_params(exe, args.init_checkpoint,
                                origin_train_program, args.use_amp)

    data_reader = ErnieDataReader(
        task_group,
        False,
        batch_size=args.batch_size,
        vocab_path=args.vocab_path,
        voc_size=ernie_config['vocab_size'],
        epoch=args.epoch,
        max_seq_len=args.max_seq_len,
        generate_neg_sample=args.generate_neg_sample,
        hack_old_trainset=args.hack_old_data)

    # only fleet
    train_exe = exe

    predict = predict_wrapper(
        args,
        exe,
        ernie_config,
        task_group,
        test_prog=test_prog,
        pyreader=test_pyreader,
        fetch_list=[var.name for var in graph_vars])

    train_pyreader.set_batch_generator(data_reader.data_generator())
    train_pyreader.start()
    # NOTE(review): the step counter starts at a hard-coded 112000, which
    # looks like a leftover from resuming a specific run - confirm.
    steps = 112000
    time_begin = time.time()
    skip_steps = args.skip_steps
    while True:
        try:
            steps += 1

            # Only trainer 0 fetches metrics, and only on logging steps.
            fetch_list = []
            if trainer_id == 0 and steps % skip_steps == 0:
                fetch_list = [var.name for var in graph_vars] + [scheduled_lr.name]
                if args.use_amp:
                    fetch_list.append(loss_scaling.name)

            outputs = train_exe.run(fetch_list=fetch_list, program=train_program)
            time_end = time.time()
            used_time = time_end - time_begin

            if outputs:
                # Layout of outputs as indexed below: [mask_lm_cost, lm_w,
                # (acc, weight) per task ..., constract_loss, total_cost,
                # lr, (loss scaling when AMP is on)].
                each_mask_lm_cost, lm_w = outputs[:2]
                if args.use_amp:
                    each_total_constract_loss, each_total_cost, np_lr, l_scaling = outputs[-4:]
                else:
                    each_total_constract_loss, each_total_cost, np_lr = outputs[-3:]
                acc_list = []
                index = 2
                for task in task_group:
                    each_task_acc = outputs[index]
                    task_w = outputs[index + 1]
                    acc = np.sum(each_task_acc * task_w) / np.sum(task_w)
                    acc_list.append("%s acc: %f" % (task["task_name"], acc))
                    index += 2

                print("feed_queue size", train_pyreader.queue.size())
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress()
                if args.use_amp:
                    print("current learning_rate:%f, loss scaling:%f" % (np_lr[0], l_scaling[0]))
                else:
                    print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, constract_loss: %f, loss: %f, "
                    "ppl: %f, %s, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(each_total_constract_loss), np.mean(each_total_cost),
                       np.exp(np.sum(each_mask_lm_cost * lm_w) / np.sum(lm_w)),
                       ", ".join(acc_list), skip_steps / used_time,
                       current_file, mask_type))
                time_begin = time.time()
            elif steps % skip_steps == 0:
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress()
                print("feed_queue size", train_pyreader.queue.size())
                print("epoch: %d, progress: %d/%d, step: %d, "
                      "speed: %f steps/s, file: %s, mask_type: %s" %
                      (epoch, current_file_index, total_file, steps,
                       skip_steps / used_time, current_file, mask_type))
                time_begin = time.time()

            # Checkpointing and validation are done by trainer 0 only.
            if trainer_id != 0:
                continue

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, origin_train_program)

            if steps % args.validation_steps == 0:
                valid_list = predict()
                # Query the progress here so that `epoch` is defined even
                # when no logging step has happened yet.
                epoch = data_reader.get_progress()[0]
                print("[validation_set] epoch: %d, step: %d, %s" %
                      (epoch, steps, ", ".join(valid_list)))

        except fluid.core.EOFException:
            train_pyreader.reset()
            break
def main(args):
    """Fine-tune and/or evaluate an ERNIE model with the misspelling reader.

    Builds the train/test programs requested through ``args.do_train``,
    ``args.do_val`` and ``args.do_test``, optionally transpiles for NCCL2
    multi-trainer execution, initialises parameters, runs the training loop
    with periodic logging, checkpointing and evaluation, and finishes with a
    final evaluation on trainer 0.

    Args:
        args: parsed command-line namespace (``ernie_config_path``,
            ``vocab_path``, ``train_set``, ``batch_size``, ``epoch``,
            ``in_tokens``, ``is_distributed``, ``skip_steps``,
            ``save_steps``, ``validation_steps``, ``checkpoints``, the
            ``do_*`` switches, ...).

    Raises:
        ValueError: if none of ``do_train``/``do_val``/``do_test`` is set, if
            ``in_tokens`` is on and ``batch_size < max_seq_len``, or if
            evaluation-only mode is requested without ``init_checkpoint``.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = task_reader.MisspellingReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        tokenizer=args.tokenizer,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            # batch_size // max_seq_len is used as a divisor below, so it
            # must not be zero.
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        # Implicit string concatenation instead of a backslash continuation,
        # so that source indentation does not end up inside the message.
        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
                 "trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        log.info(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                        % (current_epoch, current_example, num_train_examples,
                           steps, loss, f1, precision, recall,
                           args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0 and steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    # Always save the current copy and cover with the latest copy
                    latest_path = os.path.join(args.checkpoints, "latest")
                    fluid.io.save_persistables(exe, save_path, train_program)
                    fluid.io.save_persistables(exe, latest_path, train_program)

                if nccl2_trainer_id == 0 and steps % args.validation_steps == 0:
                    # Query the progress here: current_epoch is otherwise
                    # only assigned on logging steps and could be unbound
                    # (or stale) when a validation step comes first.
                    _, current_epoch = reader.get_train_progress()
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(reader, exe, test_prog, test_pyreader,
                                         graph_vars, current_epoch, steps)
                    # evaluate test set
                    if args.do_test:
                        predict_wrapper(reader, exe, test_prog, test_pyreader,
                                        graph_vars, current_epoch, steps)

            except fluid.core.EOFException:
                # The reader is exhausted: save a last checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')

    # final eval on test set
    if nccl2_trainer_id == 0 and args.do_test:
        current_example, current_epoch = reader.get_train_progress()
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, 'final')
def train(args):
    """Fine-tune and/or evaluate the concept-augmented BERT reader.

    Depending on ``args.do_train``, ``args.do_val`` and ``args.do_predict``
    this builds a training program and/or an inference program, initialises
    parameters from a checkpoint or from pre-training parameters, runs the
    training loop (with periodic checkpointing and optional validation) and
    finally runs prediction on ``args.predict_file``.

    Args:
        args: parsed command-line namespace; every option is read as an
            attribute (``args.batch_size``, ``args.use_cuda``, ...).

    Raises:
        ValueError: if none of ``do_train`` / ``do_predict`` / ``do_val`` is
            set, or if only evaluation is requested without
            ``init_checkpoint``.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("For args `do_train`, `do_predict` and `do_val`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
        args.wn_concept_embedding_path)
    nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
        args.nell_concept_embedding_path)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    # Explicit sentinel: the EMA helper is created in the training branch when
    # training is enabled, otherwise in the evaluation branch.
    ema = None

    if args.do_train:
        train_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id': wn_concept2id,
            'nell_concept2id': nell_concept2id,
            'use_wordnet': args.use_wordnet,
            'retrieved_synset_path': args.retrieved_synset_path,
            'use_nell': args.use_nell,
            'retrieved_nell_concept_path':
            args.train_retrieved_nell_concept_path,
        }
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch,
            **train_concept_settings)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            # batch_size is counted in tokens, so convert it to sequences.
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        logger.info("Device count: %d" % dev_count)
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Max train steps: %d" % max_train_steps)
        logger.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.train_wn_max_concept_length,
                    max_nell_concept_length=processor.train_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=True,
                    freeze=args.freeze)

                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                if args.use_ema:
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)
                    ema.update()

                fluid.memory_optimize(train_program,
                                      skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            logger.info(
                "Theoretical memory usage in training: %.3f - %.3f %s" %
                (lower_mem, upper_mem, unit))

    if args.do_predict or args.do_val:
        eval_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id': wn_concept2id,
            'nell_concept2id': nell_concept2id,
            'use_wordnet': args.use_wordnet,
            'retrieved_synset_path': args.retrieved_synset_path,
            'use_nell': args.use_nell,
            'retrieved_nell_concept_path':
            args.dev_retrieved_nell_concept_path,
        }
        eval_data_generator = processor.data_generator(
            data_path=args.predict_file,
            batch_size=args.batch_size,
            phase='predict',
            shuffle=False,
            dev_count=1,
            epoch=1,
            **eval_concept_settings)

        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: this rebinds ``num_seqs`` to the inference program's
                # variable; the training loop below fetches ``num_seqs.name``.
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.predict_wn_max_concept_length,
                    max_nell_concept_length=processor.predict_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=False)

                if args.use_ema and ema is None:
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)

                fluid.memory_optimize(test_prog,
                                      skip_opt_set=[
                                          unique_ids.name, start_logits.name,
                                          end_logits.name, num_seqs.name
                                      ])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        logger.info('load pretrained concept embedding')
        fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(
            wn_concept_embedding_mat, place)
        fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(
            nell_concept_embedding_mat, place)

        if args.init_checkpoint and args.init_pretraining_params:
            logger.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
        train_pyreader.start()

        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while steps < max_train_steps:
            try:
                steps += 1
                log_this_step = steps % args.skip_steps == 0

                # Only fetch tensors on logging steps; the learning rate is
                # fetched only when a warmup schedule is active.
                if log_this_step:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if log_this_step:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (np_lr[0] if
                                                          warmup_steps > 0 else
                                                          args.learning_rate)
                        logger.info(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    logger.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "speed: %f steps/s" %
                        (epoch, current_example, num_train_examples, steps,
                         np.sum(total_cost) / np.sum(total_num_seqs),
                         args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0 or steps == max_train_steps:
                    if args.do_val:
                        # A fresh generator is built for every validation pass.
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1,
                                **eval_concept_settings))
                        val_performance = predict(
                            exe, test_prog, test_pyreader, [
                                unique_ids.name, start_logits.name,
                                end_logits.name, num_seqs.name
                            ], processor, eval_concept_settings,
                            'validate_result_step_{}.json'.format(steps))
                        logger.info(
                            "Validation performance after step {}:\n* Exact_match: {}\n* F1: {}"
                            .format(steps, val_performance['exact_match'],
                                    val_performance['f1']))

            except fluid.core.EOFException:
                # The reader ran out of data before max_train_steps.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_pyreader.decorate_tensor_provider(eval_data_generator)
        predict_fetch_list = [
            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
        ]

        if args.use_ema:
            with ema.apply(exe):
                eval_performance = predict(exe, test_prog, test_pyreader,
                                           predict_fetch_list, processor,
                                           eval_concept_settings)
        else:
            eval_performance = predict(exe, test_prog, test_pyreader,
                                       predict_fetch_list, processor,
                                       eval_concept_settings)

        logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(
            eval_performance['exact_match'], eval_performance['f1']))
def train(args):
    """Fine-tune an ERNIE machine-reading-comprehension model and/or predict.

    When ``args.do_train`` is set, a training program is built and run until
    the data loader raises ``EOFException``; checkpoints are written every
    ``args.save_steps`` steps. When ``args.do_predict`` is set, every file
    matching the glob patterns in ``args.predict_file`` is run through the
    inference program.

    Args:
        args: parsed command-line namespace; every option is read as an
            attribute.

    Raises:
        ValueError: if neither ``do_train`` nor ``do_predict`` is set, or if
            only prediction is requested without ``init_checkpoint``.
        AssertionError: if no file matches ``args.predict_file``.
    """
    ernie_config = ErnieConfig(args.ernie_config)
    ernie_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            # batch_size is counted in tokens, so convert it to sequences.
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, num_seqs = create_model(
                    ernie_config=ernie_config, is_training=True)

                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: this rebinds ``num_seqs`` to the inference program's
                # variable; the training loop below fetches ``num_seqs.name``.
                test_data_loader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    ernie_config=ernie_config, is_training=False)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              exec_strategy=exec_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)
        train_data_loader.start()

        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        # The loop ends when the data loader is exhausted (EOFException).
        while True:
            try:
                steps += 1
                log_this_step = steps % args.skip_steps == 0

                # Only fetch tensors on logging steps.
                if log_this_step:
                    fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                    if args.use_fp16:
                        fetch_list.append(loss_scaling.name)
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if log_this_step:
                    if args.use_fp16:
                        np_loss, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f " % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break

    if args.do_predict:
        input_files = []
        for input_pattern in args.predict_file:
            input_files.extend(glob.glob(input_pattern))
        assert len(input_files) > 0, 'Can not find predict_file {}'.format(
            args.predict_file)

        for input_file in input_files:
            print('Run prediction on {}'.format(input_file))
            # Strip the literal ".json" from the file name. The previous
            # regex used an unescaped ".", which also removed sequences such
            # as "_json" from the middle of a name.
            prefix = os.path.basename(input_file).replace('.json', '')

            test_data_loader.set_batch_generator(
                processor.data_generator(data_path=input_file,
                                         batch_size=args.batch_size,
                                         phase='predict',
                                         shuffle=False,
                                         dev_count=1,
                                         epoch=1), place)

            predict(exe,
                    test_prog,
                    test_data_loader, [
                        unique_ids.name, start_logits.name, end_logits.name,
                        num_seqs.name
                    ],
                    processor,
                    prefix=prefix)
def main(args):
    """Train and/or evaluate an ERNIE classifier using the fleet collective API.

    When ``args.do_train`` is set, fleet is initialised, a training program is
    built and run until the reader raises ``EOFException``; only worker 0 logs,
    checkpoints and runs the periodic dev/test evaluation. After training (or
    directly, when training is disabled) a final dev/test evaluation is run,
    followed by the optional diagnostic-set prediction.

    Args:
        args: parsed command-line namespace; every option is read as an
            attribute. ``args.predict_batch_size`` is filled in from
            ``args.batch_size`` when it is ``None``.

    Raises:
        ValueError: if none of ``do_train`` / ``do_val`` / ``do_test`` is set,
            or if only evaluation is requested without ``init_checkpoint``.
        AssertionError: if ``do_test`` is set without ``test_save``, or if the
            diagnostic prediction returns mismatched id/prediction counts.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = reader_ce.ClassifyReader(vocab_path=args.vocab_path,
                                      label_map_config=args.label_map_config,
                                      max_seq_len=args.max_seq_len,
                                      total_num=args.train_data_size,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed,
                                      tokenizer=args.tokenizer,
                                      for_cn=args.for_cn,
                                      task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        # From here on the device count is the number of fleet workers.
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            # batch_size is counted in tokens, so convert it to sequences.
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
        dist_strategy.use_hierarchical_allreduce = True

        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: this rebinds ``graph_vars`` to the test program's
                # variables; the training loop below uses the rebound dict.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_prediction=True)

        test_prog = test_prog.clone(for_test=True)

    # NOTE(review): read unconditionally, although fleet.init() is only called
    # when do_train is set -- confirm this is valid for evaluation-only runs.
    train_program = fleet.main_program

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe

    current_epoch = 0
    steps = 0
    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        # The loop ends when the reader is exhausted (EOFException).
        while True:
            try:
                steps += 1

                # Workers other than 0 only run the step: no logging,
                # checkpointing or evaluation.
                if fleet.worker_index() != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                    continue

                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example * dev_count,
                         num_train_examples, steps, outputs["loss"],
                         outputs["accuracy"], args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])

                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path,
                                               fleet._origin_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(args, reader, exe, test_prog,
                                         test_pyreader, graph_vars,
                                         current_epoch, steps)

                    if args.do_test:
                        predict_wrapper(args, reader, exe, test_prog,
                                        test_pyreader, graph_vars,
                                        current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           fleet._origin_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        # NOTE(review): test_prog / test_pyreader only exist when do_val or
        # do_test is set -- confirm diagnostic is never used without them.
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe, test_prog, test_pyreader,
                                     graph_vars)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for qid, pred, prob in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(qid, pred, prob))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
def DayAheadMarket():
    """Run the day-ahead market optimisation for 365 days and collect results.

    For every day the 24-hour slices of load, wind and solar data are written
    to ``consumptionopt.csv``, ``windprodopt.csv`` and ``solarprodopt.csv``,
    ``results.optimization()`` is called, and its hourly results are copied
    into year-long frames. Afterwards per-generator revenue, production and
    start-up counts are computed and written to ``revenue_cost_gen.csv``.

    Returns:
        tuple: ``(lineflowopt, priceopt, windopt, solaropt)`` -- the year-long
        line-flow, price, wind-production and solar-production frames.
    """
    consumption = data.load()
    windproduction = data.wind()
    solarproduction = data.solar()
    generators = pd.read_csv('generators.csv', sep=',',
                             encoding='latin-1').set_index('ID')
    country = generators['country'].unique().tolist()
    transmissioncapacity = pd.read_csv('transmissioncapacity.csv', sep=',',
                                       encoding='latin-1').set_index('line')

    def _zero_frame(columns):
        # Zero-filled frame with one row per hour label 0..8759, index 'time'.
        frame = pd.DataFrame(index=list(range(0, 8760)), columns=columns)
        frame.fillna(0, inplace=True)
        frame.index.rename('time', inplace=True)
        return frame

    priceopt = _zero_frame(country)
    windopt = _zero_frame(country)
    solaropt = _zero_frame(country)
    loadshedopt = _zero_frame(country)
    zonalconsumptionopt = _zero_frame(country)
    lineflowopt = _zero_frame(transmissioncapacity.index)
    genprodopt = _zero_frame(generators.index)

    hours_of_day = list(range(1, 25))
    for t in range(1, 366):
        first_hour = 24 * t - 24 + 1
        last_hour = 24 * t

        # Slice out day t and relabel its rows 1..24 in a 'time' index.
        consumptionopt = consumption.loc[first_hour:last_hour].reset_index()
        windprodopt = windproduction.loc[first_hour:last_hour].reset_index()
        solarprodopt = solarproduction.loc[first_hour:last_hour].reset_index()

        consumptionopt['time'] = hours_of_day
        windprodopt['time'] = hours_of_day
        solarprodopt['time'] = hours_of_day

        consumptionopt = consumptionopt.set_index('time')
        windprodopt = windprodopt.set_index('time')
        solarprodopt = solarprodopt.set_index('time')

        # Written to disk before results.optimization() is called.
        # presumably results.optimization() reads these files -- verify.
        consumptionopt.to_csv('consumptionopt.csv')
        windprodopt.to_csv('windprodopt.csv')
        solarprodopt.to_csv('solarprodopt.csv')

        # NOTE: ``generators`` is rebound to the frame returned here.
        (df_zonalconsumption, df_price, df_genprod, df_lineflow, df_loadshed,
         df_windprod, df_solarprod, generators) = results.optimization()

        # Copy the day's hourly rows into the year-long frames. ``.loc`` is
        # the label-based replacement for ``.ix``, which pandas 1.0 removed.
        # NOTE(review): if the result index is 1..24, row label 0 is never
        # written and label 8760 is appended -- confirm the intended offset.
        day_offset = 24 * t - 24
        for hour in df_lineflow.index:
            target = day_offset + hour
            lineflowopt.loc[target] = df_lineflow.loc[hour]
            priceopt.loc[target] = df_price.loc[hour]
            windopt.loc[target] = df_windprod.loc[hour]
            solaropt.loc[target] = df_solarprod.loc[hour]
            genprodopt.loc[target] = df_genprod.loc[hour]
            loadshedopt.loc[target] = df_loadshed.loc[hour]
            zonalconsumptionopt.loc[target] = df_zonalconsumption.loc[hour]

    times = priceopt.index
    zones = priceopt.columns

    # Total consumption in the system minus load shedding
    Consumption = zonalconsumptionopt.set_index(
        np.arange(0, len(zonalconsumptionopt)))
    Consumption = (Consumption.sum(axis=1)) - (loadshedopt.sum(axis=1))

    # Calculating the wind penetration level
    WindPenetration = (windopt.sum(axis=1) / Consumption) * 100
    # Calculating the solar penetration level
    SolarPenetration = (solaropt.sum(axis=1) / Consumption) * 100

    # NOTE: this frame is built but not written or returned by this function.
    WindLoad_df = pd.DataFrame({
        'Time': WindPenetration.index,
        'WindProduction[MW]': windopt.sum(axis=1).values,
        'SolarProduction[MW]': solaropt.sum(axis=1).values,
        'TotalConsumption[MW]': Consumption.values,
        'WindPenetration[%]': WindPenetration.values,
        'SolarPenetration[%]': SolarPenetration.values,
    }).set_index('Time')

    # Assigning each zone to a generator
    # NOTE(review): keyed by the 'name' column but looked up below with
    # generators.index values -- confirm that names and index labels coincide.
    zone_generator = generators[['name', 'country']].values.tolist()
    zone_for_gens = defaultdict(list)
    for generator, zone in zone_generator:
        zone_for_gens[generator].append(zone)

    # Creating a dictionary to contain the market prices
    pricedict = {}
    for t in times:
        row_label = priceopt.index[t]
        for zone in zones:
            pricedict[zone, t] = priceopt.loc[row_label, zone]

    # Creating a dictionary to contain the generator production
    genproddict = {}
    for t in times:
        row_label = genprodopt.index[t]
        for g in range(len(generators.index)):
            gen_label = genprodopt.columns[g]
            genproddict[gen_label, t] = genprodopt.loc[row_label, gen_label]

    # Calculating the revenue for each generator
    revenue = {}
    for t in times:
        for g in generators.index:
            for z in zone_for_gens[g]:
                revenue[g, t] = pricedict[z, t] * genproddict[g, t]

    # Summing the revenues for all hours
    revenue_total = {}
    for g in generators.index:
        revenue_total[g] = sum(revenue[g, t] for t in times)

    df_revenue = pd.DataFrame(
        [[key, value] for key, value in revenue_total.items()],
        columns=["Generator", "TotalRevenue"]).set_index('Generator')

    # Catching the start-up number of each generator: a start-up is an hour
    # with production > 0 directly after an hour with production == 0.
    startup_number = {}
    for t in times[1:]:
        for g in generators.index:
            startup_number[g, t] = 0
            if genproddict[g, t] > 0 and genproddict[g, t - 1] == 0:
                startup_number[g, t] = 1

    # Summing total number of start-ups for all generators
    startup_number_total = {}
    for g in generators.index:
        startup_number_total[g] = sum(startup_number[g, t] for t in times[1:])

    startup_number_df = pd.DataFrame(
        [[key, value] for key, value in startup_number_total.items()],
        columns=["Generator", "TotalStart-Ups"]).set_index('Generator')

    # A dataframe is written to CSV for further work in Excel
    Gen_dataframe = df_revenue
    Gen_dataframe['TotalRevenue'] = Gen_dataframe['TotalRevenue'].map(
        '{:.2f}'.format)
    Gen_dataframe['TotalProduction'] = genprodopt.sum(axis=0)
    Gen_dataframe['TotalProduction'] = Gen_dataframe['TotalProduction'].map(
        '{:.2f}'.format)
    Gen_dataframe['NumberofS/U'] = startup_number_df['TotalStart-Ups']
    Gen_dataframe['Capacity'] = generators.capacity
    Gen_dataframe['MarginalCost'] = generators.lincost
    Gen_dataframe['S/Ucost'] = generators.cyclecost
    Gen_dataframe['FixedO&MCost'] = generators.fixedomcost
    Gen_dataframe['VarO&MCost'] = generators.varomcost
    Gen_dataframe['LevelizedCapitalCost'] = generators.levcapcost
    Gen_dataframe['PrimaryFuel'] = generators.primaryfuel
    Gen_dataframe['Country'] = generators.country
    Gen_dataframe.to_csv('revenue_cost_gen.csv')

    return lineflowopt, priceopt, windopt, solaropt
def main(args):
    """Fine-tune and/or evaluate an ERNIE machine-reading-comprehension model.

    Depending on ``args.do_train``, ``args.do_val`` and ``args.do_test`` this
    builds the train and/or test programs, restores parameters, runs the
    training loop (periodic logging, checkpointing and evaluation) and then
    evaluates one last time on the dev and/or test set.

    Args:
        args: Parsed command-line namespace. ``args.predict_batch_size`` is
            overwritten with ``args.batch_size`` when it is ``None``.

    Raises:
        ValueError: If none of ``do_train`` / ``do_val`` / ``do_test`` is set,
            if ``in_tokens`` is set with ``batch_size < max_seq_len``, or if
            only evaluation is requested without ``init_checkpoint``.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.MRCReader(vocab_path=args.vocab_path,
                                   label_map_config=args.label_map_config,
                                   max_seq_len=args.max_seq_len,
                                   do_lower_case=args.do_lower_case,
                                   in_tokens=args.in_tokens,
                                   random_seed=args.random_seed,
                                   tokenizer=args.tokenizer,
                                   is_classify=args.is_classify,
                                   is_regression=args.is_regression,
                                   for_cn=args.for_cn,
                                   task_id=args.task_id,
                                   doc_stride=args.doc_stride,
                                   max_query_length=args.max_query_length)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples("train")

        if args.in_tokens:
            # batch_size is divided by max_seq_len below; fail with a clear
            # message instead of a ZeroDivisionError when it would be zero.
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    "if in_tokens=True, batch_size should not be less than "
                    "max_seq_len, got batch_size:%d max_seq_len:%d" %
                    (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_training=True)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_training=False)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    def _eval_split(input_file, phase, tag):
        # Feed one pass over `input_file` into the test reader and evaluate it.
        # Only called when do_val/do_test is set, i.e. when the test program
        # and its reader exist.
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(input_file,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False,
                                  phase=phase))
        evaluate(exe,
                 test_prog,
                 test_pyreader,
                 test_graph_vars,
                 tag,
                 examples=reader.get_examples(phase),
                 features=reader.get_features(phase),
                 args=args)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    # Non-logging step: run without fetching anything.
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program,
                                       train_pyreader, graph_vars, "train")

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    if args.do_val:
                        _eval_split(args.dev_set, "dev", str(steps) + "_dev")
                    if args.do_test:
                        _eval_split(args.test_set, "test",
                                    str(steps) + "_test")
            except fluid.core.EOFException:
                # The reader is exhausted: save a last checkpoint and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        _eval_split(args.dev_set, "dev", "dev")

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        _eval_split(args.test_set, "test", "test")
def main(args):
    """Train, validate and/or predict with a role sequence-labelling model.

    Reads ``labels_map``, ``ernie_config``, ``dev_count`` and ``place`` from
    the enclosing module scope; they are not defined in this function.

    Args:
        args: Parsed command-line namespace controlling data paths, batch
            size, optimisation hyper-parameters and which of the
            train / val / test phases to run.

    Raises:
        ValueError: If none of ``do_train`` / ``do_val`` / ``do_test`` is set,
            if ``in_tokens`` is set with ``batch_size < max_seq_len``, or if
            only evaluation is requested without ``init_checkpoint``.
    """
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE(review): this rebinds `graph_vars`, so the training
                # loop below reads variable names from the test graph.
                # Presumably both graphs produce identical names under
                # `unique_name.guard()` -- confirm.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # Constructed here but not referenced again in this function; the
        # evaluation calls below are handed `exe`.
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    # Non-logging step: run without fetching anything.
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]
                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        print(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        u"【train】epoch: {}, step: {}, loss: {:.6f}, "
                        "f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, speed: {:.3f} steps/s"
                        .format(current_epoch, steps, float(loss), float(f1),
                                float(precision), float(recall),
                                args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # `current_epoch` is otherwise bound only on logging
                    # steps; fetch it here so a validation step that is not
                    # also a logging step neither raises UnboundLocalError
                    # nor reports a stale epoch.
                    _, current_epoch = reader.get_train_progress()

                    # evaluate dev set
                    if args.do_val:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))
                    # evaluate test set
                    if args.do_test:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【test】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))

            except fluid.core.EOFException:
                # The reader is exhausted: save the final model and stop.
                save_path = os.path.join(args.checkpoints, "final_model")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        precision, recall, f1 = evaluate_wrapper(reader, exe, test_prog,
                                                 test_pyreader, graph_vars, 1,
                                                 'final')
        print(u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}".format(
            float(precision), float(recall), float(f1)))

    # final prediction on the test set, written out line by line
    if args.do_test:
        test_ret = predict_wrapper(reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)
def train(args):
    """Fine-tune a BERT reading-comprehension model and/or run prediction.

    With ``args.do_train`` the training program is built, parameters are
    restored and the training loop runs until ``max_train_steps`` is reached
    or an exception ends it; a final checkpoint is saved in both cases.
    With ``args.do_predict`` the test program is run over
    ``args.predict_file``, inside ``ema.apply`` when ``args.use_ema`` is set.

    Args:
        args: Parsed command-line namespace.

    Raises:
        ValueError: If neither ``do_train`` nor ``do_predict`` is set, if
            ``in_tokens`` is set with ``batch_size < max_seq_len``, or if only
            prediction is requested without ``init_checkpoint``.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    # Bound by the training branch; the prediction branch creates its own
    # instance only when training did not run.
    ema = None

    if args.do_train:
        build_strategy = fluid.BuildStrategy()
        print("estimating runtime number of examples...")
        num_train_examples = processor.estimate_runtime_examples(
            args.train_file, sample_rate=args.sample_rate)
        print("runtime number of examples:")
        print(num_train_examples)

        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            max_len=args.max_seq_len,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            with_negative=args.with_negative,
            epoch=args.epoch)

        if args.in_tokens:
            # batch_size is divided by max_seq_len below; fail with a clear
            # message instead of a ZeroDivisionError when it would be zero.
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    "if in_tokens=True, batch_size should not be less than "
                    "max_seq_len, got batch_size:%d max_seq_len:%d" %
                    (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)

                train_pyreader.decorate_tensor_provider(train_data_generator)

                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                loss.persistable = True
                num_seqs.persistable = True

                ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
                ema.update()

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        build_strategy = fluid.BuildStrategy()
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE(review): this rebinds `num_seqs`, so the training loop
                # below fetches by the test graph's variable name.
                # Presumably both graphs produce identical names under
                # `unique_name.guard()` -- confirm.
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    is_training=False)

                if ema is None:
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)

                unique_ids.persistable = True
                start_logits.persistable = True
                end_logits.persistable = True
                num_seqs.persistable = True

        test_prog = test_prog.clone(for_test=True)
        test_compiled_program = fluid.CompiledProgram(
            test_prog).with_data_parallel(build_strategy=build_strategy)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        train_pyreader.start()

        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                is_log_step = steps % args.skip_steps == 0
                if is_log_step:
                    # The learning rate is fetched only when warm-up is used.
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if is_log_step:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (np_lr[0]
                                                           if warmup_steps > 0
                                                           else
                                                           args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps) + "_final")
                    fluid.io.save_persistables(exe, save_path, train_program)
                    break
            except Exception as err:
                # Best effort: whatever ended the loop, keep the progress made
                # so far. An exhausted reader (EOFException) is the expected
                # way to end; any other error is reported rather than
                # silently discarded.
                if not isinstance(err, fluid.core.EOFException):
                    print("WARNING: training stopped at step %d by an "
                          "unexpected error: %r" % (steps, err))
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(data_path=args.predict_file,
                                     batch_size=args.batch_size,
                                     max_len=args.max_seq_len,
                                     phase='predict',
                                     shuffle=False,
                                     dev_count=dev_count,
                                     epoch=1))

        predict_fetch_list = [
            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
        ]
        if args.use_ema:
            with ema.apply(exe):
                predict(exe,
                        test_compiled_program,
                        test_pyreader,
                        predict_fetch_list,
                        processor,
                        prefix='ema_')
        else:
            predict(exe, test_compiled_program, test_pyreader,
                    predict_fetch_list, processor)
def train(args):
    """Run BERT pre-training with optional distributed (nccl2) execution.

    Builds the train and test programs, optionally transpiles the train
    program for nccl2 when ``args.is_distributed`` is set, restores an
    optional checkpoint, then runs up to ``args.num_train_steps`` steps with
    periodic logging, checkpointing and validation. Only trainer 0 logs,
    saves and validates; other trainers just run the step.

    Args:
        args: Parsed command-line namespace.

    Raises:
        ValueError: If distributed training is requested on Windows.
    """
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            # NOTE(review): this rebinds next_sent_acc / mask_lm_loss /
            # total_loss to the test graph's variables; the training loop
            # fetches by their names. Presumably both graphs produce
            # identical names under `unique_name.guard()` -- confirm.
            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        # Built from adjacent literals so that no source indentation ends up
        # inside the logged message.
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
              "trainer_id:{}".format(worker_endpoints, trainers_num,
                                     current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint:
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = DataReader(data_dir=args.data_dir,
                             batch_size=args.batch_size,
                             in_tokens=args.in_tokens,
                             vocab_path=args.vocab_path,
                             voc_size=bert_config['vocab_size'],
                             epoch=args.epoch,
                             max_seq_len=args.max_seq_len,
                             generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    build_strategy = fluid.BuildStrategy()
    if sys.platform != "win32":
        build_strategy.num_trainers = nccl2_num_trainers
    elif nccl2_num_trainers > 1:
        raise ValueError(
            "Windows platform doesn't support distributed training!")
    build_strategy.trainer_id = nccl2_trainer_id

    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if use_ngraph:
        # With nGraph the plain program is executed directly.
        run_program = train_program
    else:
        run_program = fluid.CompiledProgram(train_program).with_data_parallel(
            loss_name=total_loss.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

    if args.validation_set_dir:
        predict = predict_wrapper(args,
                                  exe,
                                  bert_config,
                                  test_prog=test_prog,
                                  data_loader=test_data_loader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_data_loader.set_batch_generator(data_reader.data_generator())
    train_data_loader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    # Used only for the reported speed; does not change inside the loop.
    skip_steps = args.skip_steps * nccl2_num_trainers
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += 1

            if nccl2_trainer_id != 0:
                # Trainers other than 0 only execute the step.
                exe.run(fetch_list=[], program=run_program)
                continue

            if steps % args.skip_steps != 0:
                exe.run(fetch_list=[], program=run_program)
            else:
                fetch_list = [
                    next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                    scheduled_lr.name
                ]
                if args.use_fp16:
                    fetch_list.append(loss_scaling.name)

                outputs = exe.run(fetch_list=fetch_list, program=run_program)

                if args.use_fp16:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs
                else:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs

                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = data_reader.get_progress(
                )
                if args.verbose:
                    verbose = "feed_queue size: %d, " % train_data_loader.queue.size(
                    )
                    verbose += "current learning_rate: %f, " % np_lr[0]
                    if args.use_fp16:
                        verbose += "loss scaling: %f" % np_scaling[0]
                    print(verbose)

                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(np.array(cost)),
                       np.mean(np.exp(np.array(lm_cost))),
                       np.mean(np.array(acc)), skip_steps / used_time,
                       current_file))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)

            if args.validation_set_dir and steps % args.validation_steps == 0:
                # `epoch` is otherwise bound only on logging steps; fetch it
                # here so a validation step that is not also a logging step
                # neither raises UnboundLocalError nor prints a stale epoch.
                epoch = data_reader.get_progress()[0]
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            # The data loader is exhausted before num_train_steps was reached.
            train_data_loader.reset()
            break
def main():
    """Run the hard-coded 20-node network traffic calculation and print results.

    All inputs (distance matrix ``D``, matrix ``S``, path matrix ``P``,
    subscriber counts ``N`` and the scalar parameters) are literals defined
    below. Each step calls a helper from ``C``, prints a caption and dumps the
    result through ``OUT``; the final step hands the computed capacities to
    ``O.optimization``.
    """
    # --- Input data:
    # Distance matrix
    # None - no path
    D = [[
        0, 24.33963418, None, None, 57.96404481,
        63.28330636, 92.04159379, 97.41273522, 34.13305879, 8.501392603,
        None, None, 43.92023683, 13.11889291, None,
        94.92898583, 40.45057893, 30.26080728, 84.720999, None
    ], [
        24.33963418, 0, None, 73.85107875, None,
        72.56256938, 92.0316875, None, 14.01893497, 65.85347056,
        67.89705157, 54.96413112, None, None, 85.38839221,
        91.94399714, None, None, None, None
    ], [
        None, None, 0, 49.04459119, 40.96282125,
        None, 53.19741368, None, 39.55920339, None,
        75.43881536, None, 61.22687459, None, 22.31400609,
        None, None, 19.07089353, 96.94798589, 68.55219007
    ], [
        None, 73.85107875, 49.04459119, 0, 61.08250022,
        None, 4.982894659, 29.04109359, 78.52806449, None,
        60.23363471, 43.33114028, None, 39.18160796, 87.18847632,
        None, None, None, None, 48.74228835
    ], [
        57.96404481, None, 40.96282125, 61.08250022, 0,
        10.12077928, 32.69773126, None, 61.20582223, None,
        None, 61.52334809, 57.69343972, 47.02911973, 13.95421624,
        None, 76.08047128, None, None, None
    ], [
        63.28330636, 72.56256938, None, None, 10.12077928,
        0, 86.94083095, 94.44400668, 42.9277122, None,
        None, 99.59403872, 97.64439464, None, 90.90544581,
        19.80754733, None, 81.85216784, None, 33.2095325
    ], [
        92.04159379, 92.0316875, 53.19741368, 4.982894659, 32.69773126,
        86.94083095, 0, 0.494223833, None, None,
        2.063733339, None, 36.42662168, None, 58.00264478,
        None, 79.68242764, 42.0504868, None, 42.74333119
    ], [
        97.41273522, None, None, 29.04109359, None,
        94.44400668, 0.494223833, 0, None, 1.947158575,
        None, None, 48.47936034, None, 82.70263076,
        None, None, 28.87930274, None, 81.40396476
    ], [
        34.13305879, 14.01893497, 39.55920339, 78.52806449, 61.20582223,
        42.9277122, None, None, 0, 81.9386065,
        None, None, 74.06826615, None, 42.4292624,
        None, 44.19479966, 65.73415399, 48.50550294, None
    ], [
        8.501392603, 65.85347056, None, None, None,
        None, None, 1.947158575, 81.9386065, 0,
        15.85895419, 57.43348002, None, None, None,
        None, 88.36366534, None, 95.32405734, None
    ], [
        None, 67.89705157, 75.43881536, 60.23363471, None,
        None, 2.063733339, None, None, 15.85895419,
        0, None, 3.607875109, None, None,
        71.37306333, None, None, 38.88342977, 79.96090055
    ], [
        None, 54.96413112, None, 43.33114028, 61.52334809,
        99.59403872, None, None, None, 57.43348002,
        None, 0, None, None, 10.83549857,
        77.0598352, None, None, None, None
    ], [
        43.92023683, None, 61.22687459, None, 57.69343972,
        97.64439464, 36.42662168, 48.47936034, 74.06826615, None,
        3.607875109, None, 0, 18.74819398, 5.097728968,
        None, 5.193954706, None, 54.12440896, 19.87610459
    ], [
        13.11889291, None, None, 39.18160796, 47.02911973,
        None, None, None, None, None,
        None, None, 18.74819398, 0, 16.07987285,
        None, 78.85996699, None, 43.64811778, None
    ], [
        None, 85.38839221, 22.31400609, 87.18847632, 13.95421624,
        90.90544581, 58.00264478, 82.70263076, 42.4292624, None,
        None, 10.83549857, 5.097728968, 16.07987285, 0,
        None, None, None, None, 96.07403874
    ], [
        94.92898583, 91.94399714, None, None, None,
        19.80754733, None, None, None, None,
        71.37306333, 77.0598352, None, None, None,
        0, 28.11259627, 96.32515311, 31.03093505, 38.13814521
    ], [
        40.45057893, None, None, None, 76.08047128,
        None, 79.68242764, None, 44.19479966, 88.36366534,
        None, None, 5.193954706, 78.85996699, None,
        28.11259627, 0, None, None, None
    ], [
        30.26080728, None, 19.07089353, None, None,
        81.85216784, 42.0504868, 28.87930274, 65.73415399, None,
        None, None, None, None, None,
        96.32515311, None, 0, 51.32429004, None
    ], [
        84.720999, None, 96.94798589, None, None,
        None, None, None, 48.50550294, 95.32405734,
        38.88342977, None, 54.12440896, 43.64811778, None,
        31.03093505, None, 51.32429004, 0, None
    ], [
        None, None, 68.55219007, 48.74228835, None,
        33.2095325, 42.74333119, 81.40396476, None, None,
        79.96090055, None, 19.87610459, None, 96.07403874,
        38.13814521, None, None, None, 0
    ]]
    # Hard-code the adjacency matrix
    S = [[
        48.71573972, 55.90371142, 44.89405271, 42.82012594, 53.94055819,
        30.13655601, 55.56523673, 58.36534556, 44.64173521, 15.25597761,
        20.82542486, 26.89950711, 61.49162093, 45.03559667, 36.63527015,
        10.13577859, 33.57053564, 44.92482314, 25.51483791, 36.32756589
    ], [
        55.90371142, 64.15226308, 51.51813729, 49.13820415, 61.89944803,
        34.58318277, 63.76384669, 66.97710954, 51.22859053, 17.50698593,
        23.89820104, 30.86850968, 70.56466454, 51.68056596, 42.04077742,
        11.63130529, 38.52384358, 51.55344787, 29.27953355, 41.68767162
    ], [
        44.89405271, 51.51813729, 41.37217212, 39.46094224, 49.7089909,
        27.77238203, 51.2062155, 53.78665941, 41.1396486, 14.05916582,
        19.19169712, 24.78927544, 56.6676825, 41.50261214, 33.76128042,
        9.34063982, 30.93697038, 41.40052865, 23.51323175, 33.47771515
    ], [
        42.82012594, 49.13820415, 39.46094224, 37.63800358, 47.41263313,
        26.48940838, 48.84069191, 51.30192957, 39.23916038, 13.40968825,
        18.305117, 23.64410946, 54.04986084, 39.58535645, 32.20164347,
        8.909139392, 29.50780533, 39.48798881, 22.42701392, 31.9311778
    ], [
        53.94055819, 61.89944803, 49.7089909, 47.41263313, 59.72574438,
        33.36873591, 61.52467154, 64.62509523, 49.42961207, 16.89219855,
        23.05897536, 29.78450983, 68.08666718, 49.86571562, 40.56444453,
        11.22285237, 37.17101376, 49.74306149, 28.25133328, 40.22373863
    ], [
        30.13655601, 34.58318277, 27.77238203, 26.48940838, 33.36873591,
        18.64309181, 34.37379538, 36.10600016, 27.61629324, 9.437660732,
        12.88303506, 16.64058695, 38.03997823, 27.85994403, 22.66332971,
        6.270200575, 20.76742206, 27.79141724, 15.78400218, 22.47297753
    ], [
        55.56523673, 63.76384669, 51.2062155, 48.84069191, 61.52467154,
        34.37379538, 63.37778201, 66.57158983, 50.91842183, 17.40098811,
        23.75350696, 30.68161315, 70.13742362, 51.36766073, 41.78623727,
        11.56088238, 38.29059706, 51.24131229, 29.10225764, 41.43526938
    ], [
        58.36534556, 66.97710954, 53.78665941, 51.30192957, 64.62509523,
        36.10600016, 66.57158983, 69.92634378, 53.4843629, 18.27787919,
        24.95052165, 32.22775713, 73.67187126, 53.95624038, 43.89197854,
        12.14347197, 40.22018192, 53.82352484, 30.56881287, 43.52332426
    ], [
        44.64173521, 51.22859053, 41.1396486, 39.23916038, 49.42961207,
        27.61629324, 50.91842183, 53.4843629, 40.90843194, 13.98014927,
        19.08383425, 24.64995258, 56.34919381, 41.26935552, 33.5715323,
        9.288142735, 30.7630957, 41.16784576, 23.38108062, 33.28956076
    ], [
        15.25597761, 17.50698593, 14.05916582, 13.40968825, 16.89219855,
        9.437660732, 17.40098811, 18.27787919, 13.98014927, 4.777610977,
        6.521756977, 8.423936096, 19.25691363, 14.10349219, 11.47281894,
        3.174152997, 10.51305683, 14.06880199, 7.990308637, 11.37645728
    ], [
        20.82542486, 23.89820104, 19.19169712, 18.305117, 23.05897536,
        12.88303506, 23.75350696, 24.95052165, 19.08383425, 6.521756977,
        8.902632356, 11.49923346, 26.28696883, 19.25220555, 15.66116147,
        4.33293011, 14.35102231, 19.20485112, 10.90730312, 15.52962139
    ], [
        26.89950711, 30.86850968, 24.78927544, 23.64410946, 29.78450983,
        16.64058695, 30.68161315, 32.22775713, 24.64995258, 8.423936096,
        11.49923346, 14.85317655, 33.9540014, 24.86743217, 20.22900023,
        5.596701392, 18.53673715, 24.80626603, 14.08859986, 20.0590943
    ], [
        61.49162093, 70.56466454, 56.6676825, 54.04986084, 68.08666718,
        38.03997823, 70.13742362, 73.67187126, 56.34919381, 19.25691363,
        26.28696883, 33.9540014, 77.61802379, 56.84634689, 46.24300396,
        12.79392366, 42.3745316, 56.70652258, 32.20619762, 45.85460313
    ], [
        45.03559667, 51.68056596, 41.50261214, 39.58535645, 49.86571562,
        27.85994403, 51.36766073, 53.95624038, 41.26935552, 14.10349219,
        19.25220555, 24.86743217, 56.84634689, 41.63346342, 33.86772448,
        9.370089404, 31.03450983, 41.53105807, 23.58736531, 33.58326518
    ], [
        36.63527015, 42.04077742, 33.76128042, 32.20164347, 40.56444453,
        22.66332971, 41.78623727, 43.89197854, 33.5715323, 11.47281894,
        15.66116147, 20.22900023, 46.24300396, 33.86772448, 27.55050066,
        7.62232061, 25.24575527, 33.78442043, 19.1876996, 27.31910052
    ], [
        10.13577859, 11.63130529, 9.34063982, 8.909139392, 11.22285237,
        6.270200575, 11.56088238, 12.14347197, 9.288142735, 3.174152997,
        4.33293011, 5.596701392, 12.79392366, 9.370089404, 7.62232061,
        2.108846303, 6.984673093, 9.347041903, 5.308607634, 7.558299775
    ], [
        33.57053564, 38.52384358, 30.93697038, 29.50780533, 37.17101376,
        20.76742206, 38.29059706, 40.22018192, 30.7630957, 10.51305683,
        14.35102231, 18.53673715, 42.3745316, 31.03450983, 25.24575527,
        6.984673093, 23.13381404, 30.95817461, 17.58254684, 25.03371298
    ], [
        44.92482314, 51.55344787, 41.40052865, 39.48798881, 49.74306149,
        27.79141724, 51.24131229, 53.82352484, 41.16784576, 14.06880199,
        19.20485112, 24.80626603, 56.70652258, 41.53105807, 33.78442043,
        9.347041903, 30.95817461, 41.42890461, 23.52934774, 33.50066081
    ], [
        25.51483791, 29.27953355, 23.51323175, 22.42701392, 28.25133328,
        15.78400218, 29.10225764, 30.56881287, 23.38108062, 7.990308637,
        10.90730312, 14.08859986, 32.20619762, 23.58736531, 19.1876996,
        5.308607634, 17.58254684, 23.52934774, 13.36338024, 19.02653969
    ], [
        36.32756589, 41.68767162, 33.47771515, 31.9311778, 40.22373863,
        22.47297753, 41.43526938, 43.52332426, 33.28956076, 11.37645728,
        15.52962139, 20.0590943, 45.85460313, 33.58326518, 27.31910052,
        7.558299775, 25.03371298, 33.50066081, 19.02653969, 27.08964394
    ]]
    # Path matrix, computed in C++
    P = [
        [1, 1, 15, 7, 15, 5, 8, 10, 1, 1, 7, 15, 11, 1, 13, 17, 13, 1, 11, 13],
        [2, 2, 9, 7, 15, 9, 8, 10, 2, 1, 7, 2, 11, 1, 13, 17, 13, 1, 9, 13],
        [
            10, 9, 3, 7, 15, 5, 11, 7, 3, 8, 13, 15, 15, 15, 3, 17, 13, 3, 11,
            13
        ],
        [10, 1, 15, 4, 15, 5, 4, 7, 1, 8, 7, 15, 11, 1, 13, 17, 13, 8, 11, 13],
        [10, 1, 15, 7, 5, 5, 11, 7, 6, 8, 13, 15, 15, 15, 5, 6, 13, 8, 16, 13],
        [10, 9, 15, 7, 6, 6, 11, 7, 6, 8, 13, 15, 15, 15, 5, 6, 13, 8, 16, 6],
        [10, 1, 15, 7, 15, 5, 7, 7, 1, 8, 7, 15, 11, 1, 13, 17, 13, 8, 11, 13],
        [10, 1, 15, 7, 15, 5, 8, 8, 1, 8, 7, 15, 11, 1, 13, 17, 13, 8, 11, 13],
        [9, 9, 9, 7, 6, 9, 8, 10, 9, 1, 7, 15, 15, 1, 9, 6, 9, 3, 9, 13],
        [
            10, 1, 15, 7, 15, 5, 8, 10, 1, 10, 7, 15, 11, 1, 13, 17, 13, 8, 11,
            13
        ],
        [
            10, 1, 15, 7, 15, 5, 11, 7, 1, 8, 11, 15, 11, 13, 13, 17, 13, 8, 11,
            13
        ],
        [
            10, 12, 15, 7, 15, 5, 11, 7, 15, 8, 13, 12, 15, 15, 12, 17, 13, 8,
            11, 13
        ],
        [
            10, 1, 15, 7, 15, 5, 11, 7, 15, 8, 13, 15, 13, 13, 13, 17, 13, 8,
            11, 13
        ],
        [
            14, 1, 15, 7, 15, 5, 8, 10, 1, 1, 13, 15, 14, 14, 14, 17, 13, 1, 14,
            13
        ],
        [
            10, 1, 15, 7, 15, 5, 11, 7, 15, 8, 13, 15, 15, 15, 15, 17, 13, 8,
            11, 13
        ],
        [
            10, 1, 15, 7, 6, 16, 11, 7, 6, 8, 13, 15, 17, 13, 13, 16, 16, 8, 16,
            16
        ],
        [
            10, 1, 15, 7, 15, 5, 11, 7, 17, 8, 13, 15, 17, 13, 13, 17, 17, 8,
            11, 13
        ],
        [
            18, 1, 18, 7, 15, 5, 8, 18, 3, 8, 7, 15, 11, 1, 13, 17, 13, 18, 18,
            13
        ],
        [
            10, 9, 15, 7, 6, 16, 11, 7, 19, 8, 19, 15, 11, 19, 13, 19, 13, 19,
            19, 13
        ],
        [
            10, 1, 15, 7, 15, 20, 11, 7, 15, 8, 13, 15, 20, 13, 13, 20, 13, 8,
            11, 20
        ]
    ]
    # The adjacency matrix is Y for some reason
    # Pass it and D into the path reconstruction
    # Verify that the Dijkstra algorithm was ported accurately
    # Compute the matrix in Excel
    # Number of communication nodes
    n = 20
    # Packet length
    L = 200  # bytes
    # Codec
    Codec = 'G.711'
    # Per-subscriber traffic intensity
    y0 = 0.1  # Erlang
    # --- Quality-of-service requirements:
    # Initial requirement on the delay T0
    T0 = 0.1  # 100ms = 0.1s
    # Share of calls served with guaranteed quality
    q = 98  # 98%
    # Distribution of subscribers across the communication nodes (counts)
    N = [
        7916, 9084, 7295, 6958, 8765, 4897, 9029, 9484, 7254, 2479, 3384, 4371,
        9992, 7318, 5953, 1647, 5455, 7300, 4146, 5903
    ]
    # --- Calculations:
    y = C.calcTrafficNodeIntensity(N, y0, n)
    print(
        "1. Интенсивность исходящего трафика от каждого из узлов сети была вычислена."
    )
    OUT.printArr(y)
    k = C.calcTrafficRatio(y, n)
    print(
        "2. Коэффициенты распределения трафика по направлениям связи были вычислены."
    )
    OUT.printMatrix(k, n)
    Y = C.calcTrafficMatrixIntensity(k, y, n)
    print(
        "3. Матрица интенсивностей трафика в направлениях связи была вычислена:"
    )
    OUT.printMatrix(Y, n)
    # NOTE(review): R is never read afterwards and step 4 prints D, not R —
    # presumably calcByFloydsAlgorithm updates D in place; confirm.
    R = C.calcByFloydsAlgorithm(D, n)
    print(
        "4. Матрица кратчайших маршрутов между вершинами графа была вычислена:"
    )
    OUT.printMatrix(D, n)
    # NOTE(review): the hard-coded S is passed here rather than the computed Y.
    _next = C.calcPath(P, S, n)
    OUT.printMatrix(_next, n)
    # Transpose of _next (rows become columns).
    TNEXT = [list(x) for x in zip(*_next)]
    OUT.printMatrix(TNEXT, n)
    # Matrix of load intensities on the communication lines:
    print("5. Матрица интенсивностей нагрузок на линии связи была вычислена:")
    Ytilda = [[
        48.72, 646.77, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 262.62, 1375.73,
        0.00, 0.00, 0.00, 338.53, 0.00, 0.00, 0.00, 138.01, 0.00, 0.00
    ], [
        646.77, 64.15, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 166.61, 0.00,
        0.00, 30.87, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
    ], [
        0.00, 0.00, 41.37, 0.00, 0.00, 0.00, 0.00, 0.00, 133.83, 0.00,
        0.00, 0.00, 0.00, 0.00, 554.07, 0.00, 0.00, 82.57, 0.00, 0.00
    ], [
        0.00, 0.00, 0.00, 37.64, 0.00, 0.00, 658.16, 0.00, 0.00, 0.00,
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 59.73, 453.23, 0.00, 0.00, 0.00, 0.00,
        0.00, 0.00, 0.00, 0.00, 1025.46, 0.00, 0.00, 0.00, 0.00, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 453.23, 18.64, 0.00, 0.00, 120.92, 0.00,
        0.00, 0.00, 0.00, 0.00, 0.00, 70.82, 0.00, 0.00, 0.00, 22.47
    ], [
        0.00, 0.00, 0.00, 658.16, 0.00, 0.00, 63.38, 2182.19, 0.00, 0.00,
        2318.57, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 2182.19, 69.93, 0.00, 1497.16,
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 444.46, 0.00, 0.00
    ], [
        262.62, 166.61, 133.83, 0.00, 0.00, 120.92, 0.00, 0.00, 40.91, 0.00,
        0.00, 0.00, 0.00, 0.00, 147.86, 0.00, 30.76, 0.00, 52.66, 0.00
    ], [
        1375.73, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 1497.16, 0.00, 4.78,
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 2318.57, 0.00, 0.00, 0.00,
        8.90, 0.00, 2334.07, 0.00, 0.00, 0.00, 0.00, 0.00, 252.12, 0.00
    ], [
        0.00, 30.87, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
        0.00, 14.85, 0.00, 0.00, 391.38, 0.00, 0.00, 0.00, 0.00, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
        2334.07, 0.00, 77.62, 150.09, 1860.29, 0.00, 600.58, 0.00, 0.00, 533.18
    ], [
        338.53, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
        0.00, 0.00, 150.09, 41.63, 177.96, 0.00, 0.00, 0.00, 23.59, 0.00
    ], [
        0.00, 0.00, 554.07, 0.00, 1025.46, 0.00, 0.00, 0.00, 147.86, 0.00,
        0.00, 391.38, 1860.29, 177.96, 27.55, 0.00, 0.00, 0.00, 0.00, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 0.00, 70.82, 0.00, 0.00, 0.00, 0.00,
        0.00, 0.00, 0.00, 0.00, 0.00, 2.11, 122.94, 0.00, 49.34, 7.56
    ], [
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 30.76, 0.00,
        0.00, 0.00, 600.58, 0.00, 0.00, 122.94, 23.13, 0.00, 0.00, 0.00
    ], [
        138.01, 0.00, 82.57, 0.00, 0.00, 0.00, 0.00, 444.46, 0.00, 0.00,
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 41.43, 23.53, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 52.66, 0.00,
        252.12, 0.00, 0.00, 23.59, 0.00, 49.34, 0.00, 23.53, 13.36, 0.00
    ], [
        0.00, 0.00, 0.00, 0.00, 0.00, 22.47, 0.00, 0.00, 0.00, 0.00,
        0.00, 0.00, 533.18, 0.00, 0.00, 7.56, 0.00, 0.00, 0.00, 27.09
    ]]
    OUT.printMatrix(Ytilda, n)
    # Stream matrix:
    print("6. Матрица потоков была вычислена:")
    V = C.calcStreamMatrix(Ytilda, q, n)
    OUT.printMatrix(V, n)
    # Data-transmission traffic intensity on the communication lines:
    print("7. Интенсивность трафика ПД в линиях связи была вычислена:")
    A = C.calcTrafficLineIntensity(V, Codec, n)
    OUT.printMatrix(A, n)
    # Capacity of the communication lines:
    print("8. Пропускная способность линий связи была вычислена:")
    B = C.calcLinesCapacity(A, L, T0, n)
    OUT.printMatrix(B, n)
    # Total of all n x n entries of B.
    summa = 0
    for i in range(n):
        for j in range(n):
            summa += B[i][j]
    print("Сумма: " + str(summa))
    # --- Optimization:
    print("9. Оптимизация пропускной способности линий связи:")
    O.optimization(B, A, L, TNEXT, n)
def main(args):
    """Fine-tune and/or evaluate an ERNIE model as selected by ``args``.

    The function builds the train and test programs, optionally transpiles
    them for NCCL2 distributed execution, restores parameters, runs the
    training loop (periodic logging, checkpointing and evaluation) and then
    performs the final dev / test / diagnostic evaluations that were requested.

    Args:
        args: Parsed command-line namespace. At least one of ``do_train``,
            ``do_val`` and ``do_test`` must be truthy.

    Raises:
        ValueError: If none of ``do_train`` / ``do_val`` / ``do_test`` is set,
            if ``in_tokens`` is used with ``batch_size < max_seq_len``, or if
            evaluation-only mode is requested without ``init_checkpoint``.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.RankReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        is_classify=args.is_classify,
        is_regression=args.is_regression,
        for_cn=args.for_cn,
        task_id=args.task_id,
    )

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError(
            "For args `do_train`, `do_val` and `do_test`, at "
            "least one of them must be True.",
        )

    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    # Defined up front so the final evaluations below also work when training
    # is skipped (they pass both values to the evaluate/predict wrappers).
    steps = 0
    current_epoch = 0

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train",
        )

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                # Adjacent literals instead of a backslash continuation inside
                # the string, which would embed source indentation in the text.
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, '
                    'got batch_size:%d seqlen:%d' %
                    (args.batch_size, args.max_seq_len))
            # In token mode batch_size counts tokens, so convert it to a
            # number of sequences before computing the step count.
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression,
                )
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                )

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len,
                )
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size,
                )
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression,
                )

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info(
            "worker_endpoints:{} trainers_num:{} current_endpoint:{} "
            "trainer_id:{}".format(worker_endpoints, trainers_num,
                                   current_endpoint, trainer_id))

        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog,
        )
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.",
            )
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16,
            )
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16,
            )
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # Trailing space added: the two literals used to join as "ifonly".
            raise ValueError(
                "args 'init_checkpoint' should be set if "
                "only doing validation or testing!",
            )
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16,
        )

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id,
        )

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe,
            )

    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        # Rows of [loss, acc, elapsed seconds], one per logged classify step.
        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(
                        train_exe,
                        train_program,
                        train_pyreader,
                        graph_vars,
                        "train",
                        metric=args.metric,
                        is_classify=args.is_classify,
                        is_regression=args.is_regression,
                    )

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" % (
                                current_epoch,
                                current_example,
                                num_train_examples,
                                steps,
                                outputs["loss"],
                                outputs['acc'],
                                args.skip_steps / used_time,
                            ),
                        )
                        # Store accuracy too: the CE report below reads the
                        # row as (loss, acc, time).
                        ce_info.append(
                            [outputs["loss"], outputs['acc'], used_time])
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            " speed: %f steps/s" % (
                                current_epoch,
                                current_example,
                                num_train_examples,
                                steps,
                                outputs["loss"],
                                args.skip_steps / used_time,
                            ),
                        )
                    time_begin = time.time()

                # Only the first trainer writes checkpoints and evaluates.
                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(
                            args.checkpoints,
                            "step_" + str(steps),
                        )
                        fluid.io.save_persistables(
                            exe,
                            save_path,
                            train_program,
                        )

                    if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                        # evaluate dev set
                        if args.do_val:
                            evaluate_wrapper(
                                args,
                                reader,
                                exe,
                                test_prog,
                                test_pyreader,
                                graph_vars,
                                current_epoch,
                                steps,
                            )

                        if args.do_test:
                            predict_wrapper(
                                args,
                                reader,
                                exe,
                                test_prog,
                                test_pyreader,
                                graph_vars,
                                current_epoch,
                                steps,
                            )

                    if last_epoch != current_epoch:
                        last_epoch = current_epoch

            except fluid.core.EOFException:
                # Reader exhausted: save a final checkpoint and stop.
                save_path = os.path.join(
                    args.checkpoints,
                    "step_" + str(steps),
                )
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

        if args.enable_ce:
            card_num = get_cards()
            ce_loss = 0
            ce_acc = 0
            ce_time = 0
            try:
                # Second-to-last logged row, as in the original report.
                ce_loss, ce_acc, ce_time = ce_info[-2][:3]
            except (IndexError, ValueError):
                # Fewer than two rows were logged (e.g. non-classify runs).
                log.info("ce info error")
            log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
            log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
            log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(
            args,
            reader,
            exe,
            test_prog,
            test_pyreader,
            graph_vars,
            current_epoch,
            steps,
        )

    # final eval on test set
    if args.do_test:
        predict_wrapper(
            args,
            reader,
            exe,
            test_prog,
            test_pyreader,
            graph_vars,
            current_epoch,
            steps,
        )

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.diagnostic,
                batch_size=args.batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False,
            ),
        )

        log.info("Final diagnostic")
        qids, preds, probs = predict(
            test_exe,
            test_prog,
            test_pyreader,
            graph_vars,
            is_classify=args.is_classify,
            is_regression=args.is_regression,
        )
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids),
            len(preds),
        )
        with open(args.diagnostic_save, 'w') as f:
            for qid, pred, prob in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(qid, pred, prob))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save,
        ))
def PLRmarket():
    """Solve the PLR market, reprice activated plants and collect the results.

    Steps, in order:
      1. Build and optimise a ``PLRclass`` model.
      2. For the ``'Swedish'`` model, rewrite the ``lincost`` column of
         ``generators.csv`` from the procured capacities, then save the file.
      3. Run ``results.optimization()`` and derive wind/solar costs and mean
         penetration levels from its output.
      4. Write ``revenue_cost_gen_PLR.csv`` and fetch the missing-money table
         from ``plrmodel.missingmoneyPLR``.

    Returns:
        A 15-tuple ``(df_gencapacity, df_zonaldemand, df_price, zones,
        df_generators, df_zonalconsumption, Gen_dataframe, gens_for_zones,
        reservemargin, timeperiod, df_cost, windpenlevel, totalwindcost,
        solarpenlevel, totalsolarcost)``, where ``Gen_dataframe`` is the value
        returned by ``plrmodel.missingmoneyPLR(timeperiod)``.
    """
    market = PLRclass()
    market.optimize()

    df_generators = pd.read_csv('generators.csv').set_index('ID')
    df_cost = market.data.cost
    zones = market.data.zones
    df_gencapacity = pd.DataFrame({
        'capacity': {g: market.variables.gcap[g].x for g in df_generators.index}
    })
    df_zonaldemand = pd.DataFrame({
        'demand': {z: market.variables.demand[z].x for z in zones}
    })
    # The market's own zonal-consumption frame is not built here: it was
    # overwritten by the results.optimization() output before any use.
    gens_for_zones = market.data.gens_for_country
    reservemargin = market.data.reservemargin
    timeperiod = market.data.timeperiod
    model = market.data.model

    ActivationPrice = df_generators['lincostold'].max() + 1
    # ActivationPrice = 1000

    # The model name does not change per generator, so test it once.
    if model == 'Swedish':
        for g in df_generators.index:
            capacity = df_gencapacity.loc[g, 'capacity']
            old_cost = df_generators.loc[g, 'lincostold']
            # .loc writes into the frame itself; chained indexing
            # (df['lincost'][g] = ...) may only update a temporary copy.
            if capacity > 0:
                df_generators.loc[g, 'lincost'] = ActivationPrice + (old_cost * 0.05)
            elif capacity <= 0:
                df_generators.loc[g, 'lincost'] = old_cost
    df_generators.to_csv('generators.csv')

    (df_price, df_genprod, df_lineflow, df_loadshed, df_windsolarload,
     df_revenueprod, network, times, generators, startup_number_df,
     df_zonalconsumption, df_windprod, df_solarprod) = results.optimization()

    # Cost of wind
    totalwindcost = sum(
        df_windprod[z][t] * df_price[z][t]
        for z in df_price.columns
        for t in df_price.index
    )

    # Cost of solar
    totalsolarcost = sum(
        df_solarprod[z][t] * df_price[z][t]
        for z in df_price.columns
        for t in df_price.index
    )

    windpenlevel = df_windsolarload['WindPenetration[%]'].mean()
    solarpenlevel = df_windsolarload['SolarPenetration[%]'].mean()

    # A dataframe is returned to Excel for further work
    # (same object as df_revenueprod; the columns are added in place).
    Gen_dataframe = df_revenueprod
    Gen_dataframe['TotalRevenue'] = Gen_dataframe['Total Revenue'].map('{:.2f}'.format)
    Gen_dataframe['TotalProduction'] = Gen_dataframe['Total Production'].map('{:.2f}'.format)
    Gen_dataframe['NumberofS/U'] = startup_number_df['Total Start-Ups']
    Gen_dataframe['Capacity'] = generators.capacity
    Gen_dataframe['MarginalCost'] = generators.lincost
    Gen_dataframe['S/Ucost'] = generators.cyclecost
    Gen_dataframe['FixedO&MCost'] = generators.fixedomcost
    Gen_dataframe['VarO&MCost'] = generators.varomcost
    Gen_dataframe['LevelizedCapitalCost'] = generators.levcapcost
    Gen_dataframe['PrimaryFuel'] = generators.primaryfuel
    Gen_dataframe['PLRplants'] = df_gencapacity['capacity']

    # Bid of every procured plant; 0 for plants with exactly zero capacity.
    PLRbid = {}
    for g in df_generators.index:
        capacity = df_gencapacity.loc[g, 'capacity']
        if capacity > 0:
            PLRbid[g] = df_cost['PLRbid'][g]
        elif capacity == 0:
            PLRbid[g] = 0

    PLRbid_df = pd.DataFrame(
        [[key, value] for key, value in PLRbid.items()],
        columns=["Generator", "PLRbid"],
    ).set_index('Generator')
    Gen_dataframe['PLRbid'] = PLRbid_df['PLRbid'].map('{:.2f}'.format)

    Gen_dataframe.to_csv('revenue_cost_gen_PLR.csv')

    # The returned frame is the missing-money table, not the one saved above.
    Gen_dataframe = plrmodel.missingmoneyPLR(timeperiod)

    return (df_gencapacity, df_zonaldemand, df_price, zones, df_generators,
            df_zonalconsumption, Gen_dataframe, gens_for_zones, reservemargin,
            timeperiod, df_cost, windpenlevel, totalwindcost, solarpenlevel,
            totalsolarcost)
import os
from optimization import optimization


def _learn_parameters():
    """Create the optimizer object and run its parameter-learning routine."""
    learner = optimization()
    learner.run_optimization()


if __name__ == '__main__':
    # now, we are going to learn the parameters
    _learn_parameters()
def train(args):
    """Build the train and/or predict programs described by ``args`` and run them.

    Depending on ``args.do_train`` / ``args.do_predict`` this function builds a
    training program, a test program, or both, initialises parameters from a
    checkpoint or pretraining parameters, runs the training loop with periodic
    evaluation and checkpointing, and (in predict-only mode) runs one
    prediction pass.

    Args:
        args: options object; attributes read here include
            ``bert_config_path``, ``do_train``, ``do_predict``, ``use_cuda``,
            the data/vocabulary paths, batch/epoch/learning-rate settings,
            ``skip_steps``, ``save_steps``, ``checkpoints``,
            ``init_checkpoint`` and ``init_pretraining_params``.

    Raises:
        ValueError: if neither ``do_train`` nor ``do_predict`` is set, or if
            only prediction is requested and ``init_checkpoint`` is empty.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    # Pick the device; dev_count later divides the number of training steps.
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(
        vocab_path=args.vocab_path,
        do_lower_case=args.do_lower_case,
        max_seq_length=args.max_seq_len,
        in_tokens=args.in_tokens,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        adv_text_path=args.adv_text_path)

    # One startup program is shared by the train and test programs.
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        # With in_tokens set, batch_size is divided by max_seq_len before it
        # is used as the per-step example count.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)

                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                # loss and num_seqs are fetched in the training loop, so they
                # are excluded from memory optimisation.
                fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: this rebinds num_seqs, which the training branch above
                # also assigned.
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    is_training=False)

                fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name, start_logits.name, end_logits.name, num_seqs.name])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Parameter initialisation: init_checkpoint takes precedence over
    # init_pretraining_params when training; it is mandatory for predict-only.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        # Best evaluation score seen so far; -1 means "none yet".
        best_f1 = -1
        while steps < max_train_steps:
            try:
                steps += 1
                # Only every skip_steps-th step fetches values; the learning
                # rate is fetched only when warmup is enabled.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    # Unpacking must mirror the fetch_list built above.
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        # np_lr exists only when warmup_steps > 0.
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    # Reset the running statistics for the next logging window.
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                # Evaluate/checkpoint on save steps (or the final step), but
                # only after the first third of training.
                # NOTE(review): the nesting of this section was reconstructed
                # from a whitespace-collapsed source; confirm which statements
                # belong under `if adv_f1 > best_f1` against the original file.
                if (steps % args.save_steps == 0 or steps == max_train_steps) and steps > int(max_train_steps/3.0):
                #if (steps % args.save_steps == 0 or steps == max_train_steps):
                    if args.do_predict:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1))
                        adv_f1 = predict(exe, test_prog, test_pyreader, [
                            unique_ids.name, start_logits.name, end_logits.name,
                            num_seqs.name
                        ], processor)
                        # print(adv_f1)
                        # continue
                    # if steps != max_train_steps:
                        if adv_f1 > best_f1:
                            best_f1 = adv_f1
                            save_path = os.path.join(args.checkpoints, "step_best")
                            print("best adv model saved")
                        # else:
                        #     save_path = os.path.join(args.checkpoints,
                        #                              "step_last")
                            fluid.io.save_persistables(exe, save_path, train_program)
                            # Second evaluation on the file obtained by
                            # replacing "dev" with "test" in predict_file.
                            test_pyreader.decorate_tensor_provider(
                                processor.data_generator(
                                    data_path=args.predict_file.replace("dev", "test"),
                                    batch_size=args.batch_size,
                                    phase='predict',
                                    shuffle=False,
                                    dev_count=1,
                                    epoch=1))
                            test_f1 = predict(exe, test_prog, test_pyreader, [
                                unique_ids.name, start_logits.name, end_logits.name,
                                num_seqs.name
                            ], processor, args.predict_file.replace("dev", "test"))
                            print("This is the test score.")
            except fluid.core.EOFException:
                # The reader ran out of data before max_train_steps: save a
                # final checkpoint, reset the reader and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # Predict-only mode: a single prediction pass over predict_file.
    if args.do_predict and not args.do_train:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                data_path=args.predict_file,
                batch_size=args.batch_size,
                phase='predict',
                shuffle=False,
                dev_count=1,
                epoch=1))

        predict(exe, test_prog, test_pyreader, [
            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
        ], processor)