def train_eval(params, embedding_matrix, folds, output_path):
    """Generates, trains and evaluates a model for each fold.

    Args:
        params: dictionary containing training and evaluation parameters
        embedding_matrix: embedding matrix that is used in the model
        folds: data folds that are used for training and validation
        output_path: path to the folder that will contain the training and
            evaluation results

    Returns:
        None
    """
    path = save_load.create_ts_path(params, output_path)
    histories = []
    for i, fold in enumerate(folds):
        x_train, y_train = preprocessing.generateXY(fold[0], params['pad_length'])
        x_valid, y_valid = preprocessing.generateXY(fold[1], params['pad_length'])
        m = model.build_lstm_model(y_train.shape, params, embedding_matrix)
        history, trained_model = train_model(x_train, y_train, x_valid, y_valid,
                                             m, params['batch_size'],
                                             params['epochs'])
        save_load.save_model(trained_model, path, i)
        h = save_load.save_history(path + '/history_fold{}.csv'.format(i),
                                   history.history)
        histories.append(h)
    save_load.save_dictionary(path, params, 'params.json')
    save_load.write_final_results(path, histories)

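# --- Usage sketch (hypothetical, not part of the original module) ---
# Shows how train_eval() might be invoked. The key names in `params` come from
# how the function reads them above; the values, the fold construction, the
# embedding matrix, and the load_labelled_data() helper are placeholder
# assumptions, and the project-local `preprocessing`, `model` and `save_load`
# modules must be importable for this to run.
import numpy as np
from sklearn.model_selection import KFold

params = {'pad_length': 100, 'batch_size': 32, 'epochs': 10}
embedding_matrix = np.random.rand(5000, 300)   # vocab_size x embedding_dim (example values)
data = load_labelled_data()                    # hypothetical helper returning a labelled DataFrame
folds = [(data.iloc[train_idx], data.iloc[valid_idx])
         for train_idx, valid_idx in KFold(n_splits=5, shuffle=True).split(data)]
train_eval(params, embedding_matrix, folds, 'results/')
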
def train(file_path,
          model,
          optimizer,
          dataloader_train,
          dataloader_valid,
          eval_every=100,
          num_epochs=1,
          best_valid_loss=float('inf')):
    # initialize
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training
    model.train()
    for epoch in range(num_epochs):
        for batch in dataloader_train:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            output = model(input_ids,
                           token_type_ids=None,
                           attention_mask=attention_mask,
                           labels=labels)
            output[0].backward()
            optimizer.step()

            running_loss += output[0].item()
            global_step += 1

            # validation
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():
                    for batch in dataloader_valid:
                        input_ids = batch['input_ids'].to(device)
                        attention_mask = batch['attention_mask'].to(device)
                        labels = batch['labels'].to(device)
                        output = model(input_ids,
                                       token_type_ids=None,
                                       attention_mask=attention_mask,
                                       labels=labels)
                        valid_running_loss += output[0].item()

                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(dataloader_valid)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # reinitialize
                running_loss = 0.0
                valid_running_loss = 0.0
                model.train()

                print(
                    'epoch {}/{}, step {}/{}, train loss: {:.4f}, valid loss: {:.4f}'
                    .format(epoch + 1, num_epochs, global_step,
                            num_epochs * len(dataloader_train),
                            average_train_loss, average_valid_loss))

                # save
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_model(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list,
                                 valid_loss_list, global_steps_list)

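# --- Usage sketch (hypothetical, not part of the original module) ---
# A minimal, self-contained way to drive the train() loop above: a toy model
# exposing the same (input_ids, token_type_ids, attention_mask, labels) ->
# (loss, logits) interface the loop expects (in practice this would be e.g. a
# Hugging Face BERT classifier), plus dataloaders yielding dict batches. The
# module-level `device` and the save_model()/save_metrics() helpers called
# inside train() are assumed to be defined alongside it.
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class ToyClassifier(nn.Module):
    def __init__(self, vocab_size=100, hidden=32, num_labels=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden)
        self.fc = nn.Linear(hidden, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        logits = self.fc(self.embed(input_ids).mean(dim=1))
        loss = self.loss_fn(logits, labels)
        return loss, logits  # loss first, so output[0] is the loss


class ToyDataset(Dataset):
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return {'input_ids': torch.randint(0, 100, (16,)),
                'attention_mask': torch.ones(16, dtype=torch.long),
                'labels': torch.randint(0, 2, (1,)).squeeze()}


toy_model = ToyClassifier().to(device)
train('output_dir', toy_model,
      torch.optim.AdamW(toy_model.parameters(), lr=2e-5),
      DataLoader(ToyDataset(), batch_size=8, shuffle=True),
      DataLoader(ToyDataset(), batch_size=8),
      eval_every=4, num_epochs=1)
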
def main(args):
    config = get_config(args.config, overrides=args.override, show=True)
    if config.get("is_distributed", True):
        fleet.init(is_collective=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    # amp related config
    use_amp = config.get('use_amp', False)
    use_pure_fp16 = config.get('use_pure_fp16', False)
    if use_amp or use_pure_fp16:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 4000,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

    use_xpu = config.get("use_xpu", False)
    assert (
        use_gpu and use_xpu
    ) is not True, "gpu and xpu can not be true in the same time in static mode!"

    if use_gpu:
        place = paddle.set_device('gpu')
    elif use_xpu:
        place = paddle.set_device('xpu')
    else:
        place = paddle.set_device('cpu')

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = paddle.static.Program()
    train_prog = paddle.static.Program()

    best_top1_acc = 0.0  # best top1 acc record

    train_fetchs, lr_scheduler, train_feeds = program.build(
        config,
        train_prog,
        startup_prog,
        is_train=True,
        is_distributed=config.get("is_distributed", True))

    if config.validate:
        valid_prog = paddle.static.Program()
        valid_fetchs, _, valid_feeds = program.build(
            config,
            valid_prog,
            startup_prog,
            is_train=False,
            is_distributed=config.get("is_distributed", True))
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = paddle.static.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)
    if config.get("use_pure_fp16", False):
        cast_parameters_to_fp16(place, train_prog, fluid.global_scope())
    # load pretrained models or checkpoints
    init_model(config, train_prog, exe)

    if not config.get("is_distributed", True):
        compiled_train_prog = program.compile(
            config, train_prog, loss_name=train_fetchs["loss"][0].name)
    else:
        compiled_train_prog = train_prog

    if not config.get('use_dali', False):
        train_dataloader = Reader(config, 'train', places=place)()
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = Reader(config, 'valid', places=place)()
            if use_xpu:
                compiled_valid_prog = valid_prog
            else:
                compiled_valid_prog = program.compile(config, valid_prog)
    else:
        assert use_gpu is True, "DALI only support gpu, please set use_gpu to True!"
        import dali
        train_dataloader = dali.train(config)
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = dali.val(config)
            compiled_valid_prog = program.compile(config, valid_prog)

    vdl_writer = None
    if args.vdl_dir:
        if version_info.major == 2:
            logger.info(
                "visualdl is just supported for python3, so it is disabled in python2..."
            )
        else:
            from visualdl import LogWriter
            vdl_writer = LogWriter(args.vdl_dir)

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
                    train_fetchs, epoch_id, 'train', config, vdl_writer,
                    lr_scheduler)
        if paddle.distributed.get_rank() == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_feeds,
                                       valid_fetchs, epoch_id, 'valid', config)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    if epoch_id % config.save_interval == 0:
                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path, "best_model")

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)

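# --- Hypothetical minimal config sketch (not from the original repo) ---
# Lists only the keys that main() above reads directly; a real config for this
# script carries many more sections (optimizer, LR schedule, reader,
# architecture hyper-parameters, ...), and the real `config` object also
# supports attribute access (e.g. config.validate), so a plain dict is only a
# sketch of the key layout. Values are illustrative.
example_config = {
    'is_distributed': False,
    'use_gpu': True,
    'use_xpu': False,
    'use_amp': False,
    'use_pure_fp16': False,
    'use_dali': False,
    'validate': True,
    'valid_interval': 1,
    'save_interval': 1,
    'epochs': 120,
    'model_save_dir': './output/',
    'ARCHITECTURE': {'name': 'ResNet50'},
}
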
else:
    best_train_loss = float('inf')

for times in epoch:  # `epoch` is expected to be an iterable of epoch indices
    # training
    train_losses, train_accuracy_list = train(dataloader, model, CRITERION,
                                              OPTIMIZER, DEVICE)

    # evaluate
    if SPLIT_DATASET:
        valid_losses, valid_accuracy_list = valid(valid_dataloader, model,
                                                  CRITERION, OPTIMIZER, DEVICE)

    # leave log
    train_message, train_loss, train_accuracy = leave_log(
        train_losses, train_accuracy_list, times, mode='train')
    print(train_message)
    if SPLIT_DATASET:
        valid_message, valid_loss, valid_accuracy = leave_log(
            valid_losses, valid_accuracy_list, times, mode='valid')
        print(valid_message + '\n')

    # model save: keep the checkpoint with the lowest loss seen so far
    if SAVE_MODEL:
        if SPLIT_DATASET:
            if best_valid_loss > valid_loss:
                save_model(model, MODEL_PATH)
                best_valid_loss = valid_loss
        else:
            if best_train_loss > train_loss:
                save_model(model, MODEL_PATH)
                best_train_loss = train_loss

############################################################################################