def train(args):
    """ Train main function. """
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    train_generator = task.reader.data_generator(
        input_file=args.train_file,
        num_epochs=args.num_epochs,
        num_part=trainers_num,
        part_id=trainer_id,
        phase="train")
    valid_generator = task.reader.data_generator(
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # run training
    model_timer = Timer()
    for step, data in enumerate(train_generator(), 1):
        model_timer.start()
        metrics = task.train_step(model, data)
        model_timer.pause()

        if step % args.log_steps == 0:
            time_cost = model_timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress()
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print("\tcurrent lr:", metrics.pop('scheduled_lr'))
            print("\t" + task.show_metrics(metrics))
            model_timer.reset()

        if step % args.validation_steps == 0:
            evaluate(task, model, valid_generator, args, dev_count, gpu_id)

        if step % args.save_steps == 0:
            save_path = f"{args.save_path}/step_{step}"
            model.save(save_path, is_checkpoint=True)
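# NOTE: the Timer used in train() above is defined elsewhere in the project and
# is not shown in this file. The sketch below is only an assumption inferred
# from the call sites (start/pause/reset and a cumulative pass_time); the real
# helper may differ.
import time


class Timer(object):
    """Minimal stopwatch sketch: pass_time accumulates time spent between start() and pause()."""

    def __init__(self):
        self._pass_time = 0.0
        self._start_time = None

    def start(self):
        self._start_time = time.time()

    def pause(self):
        if self._start_time is not None:
            self._pass_time += time.time() - self._start_time
            self._start_time = None

    def reset(self):
        self._pass_time = 0.0
        self._start_time = None

    @property
    def pass_time(self):
        # include the currently running interval, if any
        if self._start_time is None:
            return self._pass_time
        return self._pass_time + (time.time() - self._start_time)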
def train_epoch(ep, model_object, model, train_loader, test_loader=None):
    model.train()
    train_timer = Timer('M')
    model_object.criterion.train()
    model_object.train_metrics.reset()
    print_counter = 0  # counter for printing

    for datum in train_loader:
        model_object.optimizer.zero_grad()
        loss, prob, tgt, _ = predict(ep, model_object, model, datum)
        loss.backward()

        # clip gradients (clip_grad_norm_ returns the total norm before clipping)
        grad = clip_grad_norm_(model.parameters(), 10)
        model_object.train_metrics.collect(y_prob=prob, y_true=tgt)

        # update network
        model_object.optimizer.step()

        print_counter += 1
        if print_counter % model_object.print_freq == 0 and model_object.log_path is not None:
            train_auc = model_object.train_metrics.roc_auc()
            train_ap = model_object.train_metrics.average_precision()
            status = ('[epoch {}], train batch {} - batch loss: {:.5f}, '
                      'running train auc: {:.5f}, running train ap: {:.5f} - '
                      'time taken: {:.2f} mins')
            status = status.format(ep, model_object.criterion.n_batch_train,
                                   model_object.criterion.get_running_loss(),
                                   train_auc, train_ap, train_timer.time())
            if model_object.verbose > 0:
                log_status(model_object.log_path, status, init=False)

    # evaluate on validation set
    if test_loader is not None:
        train_timer.pause()
        valid_epoch(ep, model_object, model, test_loader)
        train_timer.resume()
        model.train()

    # print and log status
    train_loss = model_object.criterion.get_running_loss()
    train_auc = model_object.train_metrics.roc_auc()
    train_acc = model_object.train_metrics.accuracy()
    train_ap = model_object.train_metrics.average_precision()
    if model_object.log_path and model_object.verbose > 0:
        status = ('[epoch {}], train loss: {:.5f}, train acc: {:.5f}, '
                  'train auc: {:.5f}, train ap: {:.5f}, time taken: {:.2f} mins')
        status = status.format(ep, train_loss, train_acc, train_auc, train_ap,
                               train_timer.time_since_start())
        log_status(model_object.log_path, status, init=False)

    # reset running criterion losses and timer
    model_object.criterion.reset_train()
    train_timer.reset()
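# NOTE: log_status() (like predict() and valid_epoch()) used in train_epoch()
# above comes from another module of the project. The helper below is only a
# guess at what the call sites assume: append one status line to the log file
# (truncating it first when init=True) and echo it to stdout. It is not the
# project's actual implementation.
def log_status(log_path, status, init=False):
    mode = "w" if init else "a"
    with open(log_path, mode) as f:
        f.write(status + "\n")
    print(status)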
def train(args):
    """ Train main function. """
    if args.is_distributed:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    train_generator = task.get_data_loader(
        model,
        input_file=args.train_file,
        num_epochs=args.num_epochs,
        num_part=trainers_num,
        part_id=trainer_id,
        phase="train")
    valid_generator = task.get_data_loader(
        model,
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # run training
    timer = Timer()
    timer.start()
    # track the best validation metric: higher nsp_acc for NSPModel,
    # lower loss for Plato
    if args.Model.model == 'NSPModel':
        best_metrics = 0.0
    else:
        best_metrics = 10000
    shuffledatafile()

    for step, data in enumerate(train_generator(), args.start_step + 1):
        outputs = task.train_step(model, data)
        timer.pause()

        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress()
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print(f"\tcurrent lr: {outputs.pop('scheduled_lr'):.7f}")
            metrics = task.get_metrics(outputs)
            print("\t" + ", ".join(f"{k}: {v:.4f}" for k, v in metrics.items()))
            timer.reset()

        if step % args.validation_steps == 0:
            # shuffledatafile()
            metrics = evaluate(task, model, valid_generator, args, dev_count,
                               gpu_id, step)
            if args.Model.model == 'NSPModel' and metrics['nsp_acc'] > best_metrics:
                best_metrics = metrics['nsp_acc']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)
            elif args.Model.model == 'Plato' and metrics['loss'] < best_metrics:
                best_metrics = metrics['loss']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)

        # if step % args.save_steps == 0 and trainer_id == 0:
        #     save_path = f"{args.save_path}/step_{step}"
        #     model.save(save_path, is_checkpoint=True)
        #     with open(save_path + ".finish", "w") as f:
        #         pass

        timer.start()
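# Usage sketch (not part of the source): how the train() above might be driven.
# The nested attribute access (args.Model.model) suggests args comes from a
# structured config; the SimpleNamespace below only mimics that shape, and the
# file paths and hyperparameter values are hypothetical.
from types import SimpleNamespace

if __name__ == "__main__":
    args = SimpleNamespace(
        is_distributed=False,
        train_file="data/train.tsv",      # hypothetical path
        valid_file="data/valid.tsv",      # hypothetical path
        num_epochs=10,
        start_step=0,
        log_steps=100,
        validation_steps=1000,
        save_path="output",
        Model=SimpleNamespace(model="NSPModel"),
    )
    train(args)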