def default_exe_params(is_distributed, use_cuda, thread_num):
    """
    Set the default executor parameters.
    """
    gpu_id = 0
    trainer_num = 1
    trainer_id = 0
    dist_strategy = None
    places = None
    if is_distributed:
        if use_cuda:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
            trainer_num = fleet.worker_num()
            trainer_id = fleet.worker_index()
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_experimental_executor = True
            exec_strategy.num_threads = 4
            exec_strategy.num_iteration_per_drop_scope = 1
            dist_strategy = DistributedStrategy()
            dist_strategy.exec_strategy = exec_strategy
            dist_strategy.nccl_comm_num = 2
            dist_strategy.fuse_all_reduce_ops = True
            dist_strategy.forward_recompute = True
            dist_strategy.use_amp = True
            dist_strategy.amp_loss_scaling = 12800.0
            places = fluid.cuda_places()
        else:
            print('Only GPU is supported for distributed mode at present.')
            exit(-1)
    else:
        if use_cuda:
            places = fluid.cuda_places()
        else:
            places = fluid.cpu_places(thread_num)
            os.environ['CPU_NUM'] = str(thread_num)

    if use_cuda:
        exe = fluid.Executor(fluid.CUDAPlace(gpu_id))
    else:
        exe = fluid.Executor(fluid.CPUPlace())

    return {
        'exe': exe,
        'trainer_num': trainer_num,
        'trainer_id': trainer_id,
        'gpu_id': gpu_id,
        'dist_strategy': dist_strategy,
        'places': places
    }
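# A minimal usage sketch (illustrative, not from the original source), assuming
# the os / fluid / fleet imports used by `default_exe_params` are in scope:
# single-process CPU execution, where `dist_strategy` comes back as None.
#
#     params = default_exe_params(is_distributed=False, use_cuda=False, thread_num=4)
#     exe, places = params['exe'], params['places']
#     # `exe` runs the program; `places` can feed a DataLoader / CompiledProgram.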
def _wrapper():
    '''
    Sample according to the worker index so that the samples are split
    uniformly across workers.
    '''
    rank = fleet.worker_index()
    nranks = fleet.worker_num()
    for idx, sample in enumerate(generator()):
        if idx % nranks == rank:
            yield sample
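# A standalone sketch (not from the original source) of the round-robin sharding
# used by `_wrapper` above: with nranks workers, worker `rank` keeps every
# nranks-th sample starting at offset `rank`, so the shards are disjoint and
# together cover the stream exactly once.
def _shard(samples, rank, nranks):
    for idx, sample in enumerate(samples):
        if idx % nranks == rank:
            yield sample

assert list(_shard(range(6), rank=0, nranks=2)) == [0, 2, 4]
assert list(_shard(range(6), rank=1, nranks=2)) == [1, 3, 5]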
def train(args): """ Train main function. """ if args.is_distributed: role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) dev_count = fluid.core.get_cuda_device_count() gpu_id = int(os.getenv("FLAGS_selected_gpus")) trainers_num = fleet.worker_num() trainer_id = fleet.worker_index() else: dev_count = 1 gpu_id = 0 trainers_num = 1 trainer_id = 0 place = fluid.CUDAPlace(gpu_id) task = tasks.create_task(args) model = models.create_model(args, place) train_generator = task.reader.data_generator(input_file=args.train_file, num_epochs=args.num_epochs, num_part=trainers_num, part_id=trainer_id, phase="train") valid_generator = task.reader.data_generator( input_file=args.valid_file, num_part=dev_count, part_id=gpu_id, phase="distributed_valid" if args.is_distributed else "valid") # run training model_timer = Timer() for step, data in enumerate(train_generator(), 1): model_timer.start() metrics = task.train_step(model, data) model_timer.pause() if step % args.log_steps == 0: time_cost = model_timer.pass_time current_epoch, current_file_index, total_file = task.reader.get_train_progress( ) print( f"[train][{current_epoch}] progress: {current_file_index}/{total_file} " f"step: {step}, time: {time_cost:.3f}, " f"speed: {args.log_steps / time_cost:.3f} steps/s") print("\tcurrent lr:", metrics.pop('scheduled_lr')) print("\t" + task.show_metrics(metrics)) model_timer.reset() if step % args.validation_steps == 0: evaluate(task, model, valid_generator, args, dev_count, gpu_id) if step % args.save_steps == 0: save_path = f"{args.save_path}/step_{step}" model.save(save_path, is_checkpoint=True)
def main(args): """tbd""" model_config = json.load(open(args.model_config, 'r')) model_config['context_pooling'] = args.context_pooling ### build model train_prog = fluid.Program() test_prog = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = PreGNNContextpredModel(model_config) model.forward() opt = fluid.optimizer.Adam(learning_rate=args.lr) if args.distributed: opt = get_distributed_optimizer(opt) opt.minimize(model.loss) with fluid.program_guard(test_prog, fluid.Program()): with fluid.unique_name.guard(): model = PreGNNContextpredModel(model_config) model.forward(is_test=True) place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \ if args.use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) if not args.init_model is None and not args.init_model == "": load_partial_params(exe, args.init_model, train_prog) ### load data k = model_config['layer_num'] l1 = k - 1 l2 = l1 + args.context_size featurizer = PreGNNContextPredFeaturizer( model.substruct_graph_wrapper, model.context_graph_wrapper, k, l1, l2) dataset = load_zinc_dataset(args.data_path, featurizer=featurizer) splitter = RandomSplitter() train_dataset, _, test_dataset = splitter.split( dataset, frac_train=0.9, frac_valid=0, frac_test=0.1) if args.distributed: indices = list(range(fleet.worker_num(), len(train_dataset), fleet.worker_index())) train_dataset = train_dataset[indices] print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset))) ### start train list_test_loss = [] for epoch_id in range(args.max_epoch): train_loss = train(args, exe, train_prog, model, train_dataset, featurizer) test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer) if not args.distributed or fleet.worker_index() == 0: fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog) list_test_loss.append(test_loss) print("epoch:%d train/loss:%s" % (epoch_id, train_loss)) print("epoch:%d test/loss:%s" % (epoch_id, test_loss)) if not args.distributed or fleet.worker_index() == 0: best_epoch_id = np.argmax(list_test_loss) fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog) fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog) return list_test_loss[best_epoch_id]
def main(args): """ Call the configuration function of the model, build the model and load data, then start training. model_config: a json file with the model configurations,such as dropout rate ,learning rate,num tasks and so on; task_num: It means the number of chembl filtered task; PreGNNSupervisedModel: It means the PretrainGNNModel for supervised strategy. Graph-level multi-task supervised pre-training to jointly predict a diverse set of supervised labels of individual graphs. """ model_config = json.load(open(args.model_config, 'r')) if not args.dropout_rate is None: model_config['dropout_rate'] = args.dropout_rate task_num = get_chembl_filtered_task_num() model_config['task_num'] = task_num ### build model train_prog = fluid.Program() test_prog = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = PreGNNSupervisedModel(model_config) model.forward() opt = fluid.optimizer.Adam(learning_rate=args.lr) if args.distributed: opt = get_distributed_optimizer(opt) opt.minimize(model.loss) with fluid.program_guard(test_prog, fluid.Program()): with fluid.unique_name.guard(): model = PreGNNSupervisedModel(model_config) model.forward(is_test=True) """ Use CUDAPlace for GPU training, or use CPUPlace for CPU training. """ place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \ if args.use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) if not args.init_model is None and not args.init_model == "": load_partial_params(exe, args.init_model, train_prog) ### load data """ PreGNNSupervisedFeaturizer: It is used along with `PreGNNSupervised`. It inherits from the super class `Featurizer` which is used for feature extractions. The `Featurizer` has two functions: `gen_features` for converting from a single raw smiles to a single graph data, `collate_fn` for aggregating a sublist of graph data into a big batch. splitter: split type of the dataset:random,scaffold,random with scaffold. Here is randomsplit. `ScaffoldSplitter` will firstly order the compounds according to Bemis-Murcko scaffold, then take the first `frac_train` proportion as the train set, the next `frac_valid` proportion as the valid set and the rest as the test set. `ScaffoldSplitter` can better evaluate the generalization ability of the model on out-of-distribution samples. Note that other splitters like `RandomSplitter`, `RandomScaffoldSplitter` and `IndexSplitter` is also available." """ featurizer = PreGNNSupervisedFeaturizer(model.graph_wrapper) dataset = load_chembl_filtered_dataset(args.data_path, featurizer=featurizer) splitter = RandomSplitter() train_dataset, _, test_dataset = splitter.split(dataset, frac_train=0.9, frac_valid=0, frac_test=0.1) if args.distributed: indices = list( range(fleet.worker_index(), len(train_dataset), fleet.worker_num())) train_dataset = train_dataset[indices] print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset))) ### start train """ Load the train function and calculate the train loss and test loss in each epoch. Here we set the epoch is in range of max epoch,you can change it if you want. Then we will calculate the train loss ,test loss and print them. Finally we save the best epoch to the model according to the dataset. 
""" list_test_loss = [] for epoch_id in range(args.max_epoch): train_loss = train(args, exe, train_prog, model, train_dataset, featurizer) test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer) if not args.distributed or fleet.worker_index() == 0: fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog) list_test_loss.append(test_loss) print("epoch:%d train/loss:%s" % (epoch_id, train_loss)) print("epoch:%d test/loss:%s" % (epoch_id, test_loss)) if not args.distributed or fleet.worker_index() == 0: best_epoch_id = np.argmax(list_test_loss) fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog) fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog) return list_test_loss[best_epoch_id]
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')
    if len(args.dataset.split(',')) > 1:
        # For large pretraining datasets (ZINC15 and ChEMBL),
        # directly load the processed npz files.
        train_data_list = []
        for ds in args.dataset.split(','):
            # use processed data.npz
            train_data_list.extend(
                load_data(os.path.join(args.root, ds, 'processed')))
            # To (re)generate the processed npz files:
            # dataset = MoleculeDataset(
            #     args.root, ds,
            #     add_symmetry=False,
            #     add_self_loop=False)
            # data_list = dataset.get_data_list()
            # processed_dir = os.path.join(args.root, ds, 'processed')
            # os.makedirs(processed_dir, exist_ok=True)
            # save_data_list_to_npz(
            #     data_list, os.path.join(processed_dir, 'data.npz'))
            # logging.info('Processed {}'.format(ds))
            # train_data_list.extend(data_list)
    else:
        if args.dataset == 'mutag':
            train_data_list, _ = load_mutag_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        elif args.dataset == 'ptc_mr':
            train_data_list, _ = load_ptc_mr_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        else:
            raise ValueError('Unsupported dataset')

    if args.is_fleet:
        # round-robin sharding across fleet workers
        train_data_list = [
            x for i, x in enumerate(train_data_list)
            if i % fleet.worker_num() == fleet.worker_index()
        ]
    logging.info("Data loaded.")
    logging.info("Train Examples: %s" % len(train_data_list))
    sys.stdout.flush()

    if args.emb_dir is not None:
        os.makedirs(args.emb_dir, exist_ok=True)

    train_prog = F.Program()
    test_prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(train_prog, startup_prog):
        with F.unique_name.guard():
            agent = create_model(args, config)
            test_prog = train_prog.clone(for_test=True)
            opt = F.optimizer.Adam(learning_rate=args.lr)
            if args.is_fleet:
                dist_strategy = DistributedStrategy()
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
            opt.minimize(agent.loss)

    place = F.CUDAPlace(0) if args.use_cuda else F.CPUPlace()
    exe = F.Executor(place)
    exe.run(startup_prog)

    if (not args.dont_save_emb) and \
            (not args.is_fleet or fleet.worker_index() == 0):
        save_embedding(args, exe, test_prog, agent, train_data_list, -1)

    for epoch_id in range(args.max_epoch):
        train(args, exe, train_prog, agent, train_data_list, epoch_id)
        if not args.is_fleet or fleet.worker_index() == 0:
            F.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            if not args.dont_save_emb:
                save_embedding(args, exe, test_prog, agent, train_data_list, epoch_id)
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = reader_ce.ClassifyReader(vocab_path=args.vocab_path,
                                      label_map_config=args.label_map_config,
                                      max_seq_len=args.max_seq_len,
                                      total_num=args.train_data_size,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed,
                                      tokenizer=args.tokenizer,
                                      for_cn=args.for_cn,
                                      task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
            dist_strategy.use_hierarchical_allreduce = True

        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))
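# A quick sanity check (not from the original source) of the step arithmetic
# above for in_tokens=False: with 1000 examples, 3 epochs, batch size 32 and
# dev_count=2, max_train_steps is 3 * 1000 // 32 // 2 = 46 (floor division),
# and warmup_proportion=0.1 gives int(46 * 0.1) = 4 warmup steps.
assert 3 * 1000 // 32 // 2 == 46
assert int(46 * 0.1) == 4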
- %.3f %s" % (lower_mem, upper_mem, unit)) if args.do_val or args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, graph_vars = create_model( args, pyreader_name='test_reader', ernie_config=ernie_config, is_prediction=True) test_prog = test_prog.clone(for_test=True) train_program = fleet.main_program exe = fluid.Executor(place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: log.warning( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=startup_prog) elif args.do_val or args.do_test: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog) if args.do_train: train_exe = exe train_pyreader.decorate_tensor_provider(train_data_generator) else: train_exe = None test_exe = exe # if args.do_val or args.do_test: # if args.use_multi_gpu_test: # test_exe = fluid.ParallelExecutor( # use_cuda=args.use_cuda, # main_program=test_prog, # share_vars_from=train_exe) current_epoch = 0 steps = 0 if args.do_train: train_pyreader.start() if warmup_steps > 0: graph_vars["learning_rate"] = scheduled_lr ce_info = [] time_begin = time.time() last_epoch = 0 while True: try: steps += 1 # log.info("step: %d" % steps) if fleet.worker_index() != 0: train_exe.run(fetch_list=[], program=train_program) continue if steps % args.skip_steps != 0: train_exe.run(fetch_list=[], program=train_program) else: outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, "train", metric=args.metric) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %f" % ( outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate) log.info(verbose) current_example, current_epoch = reader.get_train_progress( ) time_end = time.time() used_time = time_end - time_begin log.info( "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " "ave acc: %f, speed: %f steps/s" % (current_epoch, current_example * dev_count, num_train_examples, steps, outputs["loss"], outputs["accuracy"], args.skip_steps / used_time)) ce_info.append( [outputs["loss"], outputs["accuracy"], used_time]) time_begin = time.time() if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, fleet._origin_program) # if steps % args.validation_steps == 0 or last_epoch != current_epoch: if steps % args.validation_steps == 0: # evaluate dev set if args.do_val: evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) if args.do_test: predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) if last_epoch != current_epoch: last_epoch = current_epoch except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, fleet._origin_program) train_pyreader.reset() break # final eval on dev set if args.do_val: evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, current_epoch, steps) # final eval on test set if args.do_test: 
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on diagnostic; hack for GLUE-AX
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars)
        assert len(qids) == len(preds), '{} v.s. {}'.format(len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for qid, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(qid, s, p))
        log.info("Done final diagnostic, saving to {}".format(args.diagnostic_save))
def train(args): """ Train main function. """ if args.is_distributed: role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) dev_count = fluid.core.get_cuda_device_count() gpu_id = int(os.getenv("FLAGS_selected_gpus")) trainers_num = fleet.worker_num() trainer_id = fleet.worker_index() else: dev_count = 1 gpu_id = 0 trainers_num = 1 trainer_id = 0 place = fluid.CUDAPlace(gpu_id) task = tasks.create_task(args) model = models.create_model(args, place) train_generator = task.get_data_loader(model, input_file=args.train_file, num_epochs=args.num_epochs, num_part=trainers_num, part_id=trainer_id, phase="train") valid_generator = task.get_data_loader( model, input_file=args.valid_file, num_part=dev_count, part_id=gpu_id, phase="distributed_valid" if args.is_distributed else "valid") # run training timer = Timer() timer.start() if args.Model.model == 'NSPModel': best_metrics = 0.0 else: best_metrics = 10000 shuffledatafile() for step, data in enumerate(train_generator(), args.start_step + 1): outputs = task.train_step(model, data) timer.pause() if step % args.log_steps == 0: time_cost = timer.pass_time current_epoch, current_file_index, total_file = task.reader.get_train_progress( ) print( f"[train][{current_epoch}] progress: {current_file_index}/{total_file} " f"step: {step}, time: {time_cost:.3f}, " f"speed: {args.log_steps / time_cost:.3f} steps/s") print(f"\tcurrent lr: {outputs.pop('scheduled_lr'):.7f}") metrics = task.get_metrics(outputs) print("\t" + ", ".join(f"{k}: {v:.4f}" for k, v in metrics.items())) timer.reset() if step % args.validation_steps == 0: # shuffledatafile() metrics = evaluate(task, model, valid_generator, args, dev_count, gpu_id, step) if args.Model.model == 'NSPModel' and metrics[ 'nsp_acc'] > best_metrics: best_metrics = metrics['nsp_acc'] save_path = f"{args.save_path}/step_{step}_{best_metrics}" model.save(save_path, is_checkpoint=True) elif args.Model.model == 'Plato' and metrics['loss'] < best_metrics: best_metrics = metrics['loss'] save_path = f"{args.save_path}/step_{step}_{best_metrics}" model.save(save_path, is_checkpoint=True) # if step % args.save_steps == 0 and trainer_id == 0: # save_path = f"{args.save_path}/step_{step}" # model.save(save_path, is_checkpoint=True) # with open(save_path + ".finish", "w") as f: # pass timer.start()
def do_training(self, fleet, args):
    """
    Begin training.

    Args:
        fleet (Collective): Collective inherited base class Fleet
        args (ArgumentParser): run args to config dist fleet.

    Returns:
        list: the train losses, rounded to 6 decimal places
    """
    args = parse_args()
    logging.info(args)
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1
    exe = fluid.Executor(place)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    args.run_params = json.loads(args.run_params)
    dist_strategy = DistributedStrategy()
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = fluid.optimizer.SGD(0.003)
            if args.run_params["fp16"]:
                optimizer = decorate(optimizer, init_loss_scaling=64.0)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)
    train_program = fleet.main_program

    exe.run(startup_program)
    train_data = prepare_data_generator(
        args,
        is_test=False,
        count=dev_count,
        pyreader=pyreader,
        py_reader_provider_wrapper=py_reader_provider_wrapper)

    loss_normalizer = -(
        (1. - TrainTaskConfig.label_smooth_eps) * np.log(
            (1. - TrainTaskConfig.label_smooth_eps)) +
        TrainTaskConfig.label_smooth_eps *
        np.log(TrainTaskConfig.label_smooth_eps /
               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

    step_idx = 0
    init_flag = True
    result_loss = []
    result_ppl = []
    train_info = []
    for pass_id in six.moves.xrange(args.num_epochs):
        pass_start_time = time.time()
        if args.use_py_reader:
            pyreader.start()
            data_generator = None
        else:
            data_generator = train_data()

        batch_id = 0
        while True:
            try:
                feed_dict_list = prepare_feed_dict_list(data_generator,
                                                        init_flag, dev_count)
                t1 = time.time()
                outs = exe.run(program=train_program,
                               fetch_list=[sum_cost.name, token_num.name]
                               if step_idx % args.fetch_steps == 0 else [],
                               feed=feed_dict_list)

                if step_idx % args.fetch_steps == 0:
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                    total_sum_cost = sum_cost_val.sum()
                    total_token_num = token_num_val.sum()
                    total_avg_cost = total_sum_cost / total_token_num
                    result_loss.append(total_avg_cost - loss_normalizer)
                    result_ppl.append(np.exp([min(total_avg_cost, 100)]).item(0))
                    train_info.append(result_loss)

                init_flag = False
                batch_id += 1
                step_idx += 1
                if batch_id >= 5:
                    break
            except (StopIteration, fluid.core.EOFException):
                if args.use_py_reader:
                    pyreader.reset()
                break

    train_info = [round(i, 6) for i in train_info[0]]
    return train_info
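# A worked check (not from the original source) of `loss_normalizer` above: it is
# the entropy of the label-smoothed target distribution, which puts 1 - eps on
# the true token and eps / (V - 1) on each other token. Since cross entropy =
# entropy + KL divergence, subtracting the normalizer reports the KL divergence
# to the smoothed targets instead of the raw cross entropy. Values below are
# illustrative.
import numpy as np

eps, V = 0.1, 8
normalizer = -((1. - eps) * np.log(1. - eps) + eps * np.log(eps / (V - 1) + 1e-20))
assert normalizer > 0.0  # entropy of a non-degenerate distribution is positive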
exe.run(start_prog)
train_prog = fleet.main_program

x_data = np.ones(shape=[1, 2], dtype=np.float32)
label_data = np.ones(shape=[1, 1], dtype=np.float32)
out = exe.run(train_prog,
              feed={'x': x_data, 'label': label_data},
              fetch_list=[loss.name])

# EXE testing
test_prog = fluid.Program()
with fluid.program_guard(test_prog):
    y = fluid.data(name='y', shape=[-1, 1], dtype='float32')
    v1 = fluid.layers.collective._c_allgather(y, fleet.worker_num(),
                                              use_calc_stream=True)
    v2 = fluid.layers.collective._c_allreduce(y, use_calc_stream=True)
y_data = np.ones(shape=[1, 1], dtype=np.float32)
v1, v2 = exe.run(test_prog, feed={'y': y_data}, fetch_list=[v1.name, v2.name])

if role.worker_index() == 1:
    time.sleep(1)
print("")
print('rank:', role.worker_index())
print('allgather:', v1)  # concatenation of y across all workers
print('allreduce:', v2)  # element-wise sum of y across all workers
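# A numpy sketch (not from the original source) of what the two collectives
# above compute for 2 workers that each feed y = [[1.0]]: allgather concatenates
# along axis 0, while allreduce (default op: sum) adds element-wise.
import numpy as np

y0, y1 = np.ones((1, 1)), np.ones((1, 1))
allgather = np.concatenate([y0, y1], axis=0)  # shape (2, 1)
allreduce = y0 + y1                           # shape (1, 1), value 2.0
assert allgather.shape == (2, 1) and allreduce[0, 0] == 2.0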
import os

import paddle.fluid as fluid
import numpy as np
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy  # new line 1
from paddle.fluid.incubate.fleet.base import role_maker  # new line 2

role = role_maker.PaddleCloudRoleMaker(is_collective=True)  # new line 3
fleet.init(role)  # new line 4

x = fluid.data(name='x', shape=[-1, 2], dtype='float32')
label = fluid.data(name='label', shape=[-1, 1], dtype='float32')
y = fluid.layers.fc(x, size=1, param_attr=fluid.initializer.Constant(1.0))
fluid.layers.Print(y)

# testing code
v1 = fluid.layers.collective._c_allgather(y, fleet.worker_num(),
                                          use_calc_stream=True)
v2 = fluid.layers.collective._c_allreduce(y, use_calc_stream=True)
fluid.layers.Print(v1)
fluid.layers.Print(v2)
# end of testing code

cost = fluid.layers.square_error_cost(y, label)
loss = fluid.layers.reduce_sum(cost)

optimizer = fluid.optimizer.SGD(learning_rate=0.0)
strategy = DistributedStrategy()
strategy.mode = "collective"
strategy.collective_mode = "grad_allreduce"
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)  # new line 5
optimizer.minimize(loss, fluid.default_startup_program())
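# To run the collective demo above on multiple GPUs, the usual entry point in
# Paddle 1.x is the paddle.distributed.launch module, which sets the environment
# variables (FLAGS_selected_gpus, trainer endpoints, ...) that
# PaddleCloudRoleMaker reads. Assuming the script above is saved as
# collective_demo.py (name is hypothetical):
#
#     python -m paddle.distributed.launch --selected_gpus=0,1 collective_demo.py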
def main(args): """ Call the configuration function of the model, build the model and load data, then start training. model_config: a json file with the model configurations,such as dropout rate ,learning rate,num tasks and so on; context_pooling: it means the pooling type of context prediction; PreGNNContextpredModel: It is an unsupervised pretraining model which use subgraphs to predict their surrounding graph structures. Our goal is to pre-train a GNN so that it maps nodes appearing in similar structural contexts to nearby embeddings. """ model_config = json.load(open(args.model_config, 'r')) if not args.dropout_rate is None: model_config['dropout_rate'] = args.dropout_rate model_config['context_pooling'] = args.context_pooling ### build model train_prog = fluid.Program() test_prog = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = PreGNNContextpredModel(model_config) model.forward() opt = fluid.optimizer.Adam(learning_rate=args.lr) if args.distributed: opt = get_distributed_optimizer(opt) opt.minimize(model.loss) with fluid.program_guard(test_prog, fluid.Program()): with fluid.unique_name.guard(): model = PreGNNContextpredModel(model_config) model.forward(is_test=True) # Use CUDAPlace for GPU training, or use CPUPlace for CPU training. place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \ if args.use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) if not args.init_model is None and not args.init_model == "": load_partial_params(exe, args.init_model, train_prog) ### load data # PreGNNContextPredFeaturizer: # It is used along with `PreGNNContextPredModel`. It inherits from the super class `Featurizer` which is used for feature extractions. The `Featurizer` has two functions: `gen_features` for converting from a single raw smiles to a single graph data, `collate_fn` for aggregating a sublist of graph data into a big batch. # k is the number of layer,l1 and l2 are the different size of context,usually l1 < l2. # splitter: # split type of the dataset:random,scaffold,random with scaffold. Here is randomsplit. # `ScaffoldSplitter` will firstly order the compounds according to Bemis-Murcko scaffold, # then take the first `frac_train` proportion as the train set, the next `frac_valid` proportion as the valid set # and the rest as the test set. `ScaffoldSplitter` can better evaluate the generalization ability of the model on # out-of-distribution samples. Note that other splitters like `RandomSplitter`, `RandomScaffoldSplitter` # and `IndexSplitter` is also available." k = model_config['layer_num'] l1 = k - 1 l2 = l1 + args.context_size featurizer = PreGNNContextPredFeaturizer( model.substruct_graph_wrapper, model.context_graph_wrapper, k, l1, l2) dataset = load_zinc_dataset(args.data_path, featurizer=featurizer) splitter = RandomSplitter() train_dataset, _, test_dataset = splitter.split( dataset, frac_train=0.9, frac_valid=0, frac_test=0.1) if args.distributed: indices = list(range(fleet.worker_index(), len(train_dataset), fleet.worker_num())) train_dataset = train_dataset[indices] print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset))) ### start train # Load the train function and calculate the train loss and test loss in each epoch. # Here we set the epoch is in range of max epoch,you can change it if you want. # Then we will calculate the train loss ,test loss and print them. 
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset, featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        # pick the epoch with the lowest test loss
        best_epoch_id = np.argmin(list_test_loss)
        fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % args.model_dir, train_prog)
        return list_test_loss[best_epoch_id]
def train(args): """train start""" logging.info(args) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) dev_count = 1 exe = fluid.Executor(place) train_program = fluid.Program() startup_program = fluid.Program() # For Distributed Training. role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) args.num_trainers = fleet.worker_num() args.trainer_id = fleet.worker_index() dist_strategy = DistributedStrategy() with fluid.program_guard(train_program, startup_program): with fluid.unique_name.guard(): sum_cost, avg_cost, predict, token_num, pyreader = transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps, ModelHyperParams.bos_idx, use_py_reader=args.use_py_reader, is_test=False) optimizer = None if args.sync: lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( ModelHyperParams.d_model, TrainTaskConfig.warmup_steps) with fluid.default_main_program()._lr_schedule_guard(): learning_rate = lr_decay * TrainTaskConfig.learning_rate optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps) else: optimizer = fluid.optimizer.SGD(0.003) if args.use_fp16: optimizer = decorate(optimizer, init_loss_scaling=args.loss_scaling) optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(avg_cost, startup_program) train_program = fleet.main_program orig_train_program = fleet._origin_program train_loop(args, exe, train_program, orig_train_program, startup_program, dev_count, sum_cost, avg_cost, token_num, predict, pyreader)