def do_train(args):
    train_data_loader, dev_data_loader = create_distill_loader(
        args.task_name,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        n_iter=args.n_iter)

    emb_tensor = load_embedding(
        args.vocab_path) if args.use_pretrained_emb else None

    model = BiLSTM(args.emb_dim, args.hidden_size, args.vocab_size,
                   args.output_dim, args.padding_idx, args.num_layers,
                   args.dropout_prob, args.init_scale, emb_tensor)

    if args.optimizer == 'adadelta':
        optimizer = paddle.optimizer.Adadelta(
            learning_rate=args.lr, rho=0.95, parameters=model.parameters())
    else:
        optimizer = paddle.optimizer.Adam(
            learning_rate=args.lr, parameters=model.parameters())

    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    klloss = nn.KLDivLoss()

    metric_class = TASK_CLASSES[args.task_name][1]
    metric = metric_class()

    teacher = TeacherModel(
        model_name=args.model_name, param_path=args.teacher_path)

    print("Start to distill student model.")

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.max_epoch):
        model.train()
        for i, batch in enumerate(train_data_loader):
            if args.task_name == 'qqp':
                (bert_input_ids, bert_segment_ids, student_input_ids_1,
                 seq_len_1, student_input_ids_2, seq_len_2, labels) = batch
            else:
                (bert_input_ids, bert_segment_ids, student_input_ids,
                 seq_len, labels) = batch

            # Calculate teacher model's forward.
            with paddle.no_grad():
                teacher_logits = teacher.model(bert_input_ids,
                                               bert_segment_ids)

            # Calculate student model's forward.
            if args.task_name == 'qqp':
                logits = model(student_input_ids_1, seq_len_1,
                               student_input_ids_2, seq_len_2)
            else:
                logits = model(student_input_ids, seq_len)

            # Distillation loss: hard-label cross entropy against the gold
            # labels plus MSE between student and teacher logits,
            # weighted by alpha.
            loss = args.alpha * ce_loss(logits, labels) + (
                1 - args.alpha) * mse_loss(logits, teacher_logits)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            if i % args.log_freq == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.4f step/s"
                    % (global_step, epoch, i, loss,
                       args.log_freq / (time.time() - tic_train)))
                tic_eval = time.time()
                acc = evaluate(args.task_name, model, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                tic_train = time.time()
            global_step += 1
def do_train(args):
    device = paddle.set_device(args.device)

    train_data_loader, dev_data_loader = create_distill_loader(
        args.task_name,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        n_iter=args.n_iter,
        whole_word_mask=args.whole_word_mask,
        seed=args.seed)

    model = BiLSTM(args.emb_dim, args.hidden_size, args.vocab_size,
                   args.output_dim, args.vocab_path, args.padding_idx,
                   args.num_layers, args.dropout_prob, args.init_scale,
                   args.embedding_name)

    if args.optimizer == 'adadelta':
        optimizer = paddle.optimizer.Adadelta(
            learning_rate=args.lr, rho=0.95, parameters=model.parameters())
    else:
        optimizer = paddle.optimizer.Adam(
            learning_rate=args.lr, parameters=model.parameters())

    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    klloss = nn.KLDivLoss()

    metric_class = TASK_CLASSES[args.task_name][1]
    metric = metric_class()

    teacher = TeacherModel(
        model_name=args.model_name, param_path=args.teacher_path)

    print("Start to distill student model.")

    # Optionally resume model and optimizer states from a checkpoint.
    if args.init_from_ckpt:
        model.set_state_dict(paddle.load(args.init_from_ckpt + ".pdparams"))
        optimizer.set_state_dict(paddle.load(args.init_from_ckpt + ".pdopt"))
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.max_epoch):
        model.train()
        for i, batch in enumerate(train_data_loader):
            global_step += 1
            if args.task_name == 'qqp':
                (bert_input_ids, bert_segment_ids, student_input_ids_1,
                 seq_len_1, student_input_ids_2, seq_len_2, labels) = batch
            else:
                (bert_input_ids, bert_segment_ids, student_input_ids,
                 seq_len, labels) = batch

            # Calculate teacher model's forward.
            with paddle.no_grad():
                teacher_logits = teacher.model(bert_input_ids,
                                               bert_segment_ids)

            # Calculate student model's forward.
            if args.task_name == 'qqp':
                logits = model(student_input_ids_1, seq_len_1,
                               student_input_ids_2, seq_len_2)
            else:
                logits = model(student_input_ids, seq_len)

            # Distillation loss: hard-label cross entropy against the gold
            # labels plus MSE between student and teacher logits,
            # weighted by alpha.
            loss = args.alpha * ce_loss(logits, labels) + (
                1 - args.alpha) * mse_loss(logits, teacher_logits)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            if global_step % args.log_freq == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.4f step/s"
                    % (global_step, epoch, i, loss,
                       args.log_freq / (time.time() - tic_train)))
                tic_eval = time.time()
                acc = evaluate(args.task_name, model, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                tic_train = time.time()

            # Periodically checkpoint both model and optimizer states.
            if global_step % args.save_steps == 0:
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.output_dir,
                                 "step_" + str(global_step) + ".pdparams"))
                paddle.save(
                    optimizer.state_dict(),
                    os.path.join(args.output_dir,
                                 "step_" + str(global_step) + ".pdopt"))
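
# A minimal entry-point sketch, not part of the original script: it assumes an
# argument-parsing helper (here called `parse_args()`, e.g. argparse-based in a
# sibling module) that supplies every `args.*` attribute used by do_train().
# The helper name is an assumption for illustration only.
if __name__ == "__main__":
    args = parse_args()
    do_train(args)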