Example #1
import time

import paddle
import paddle.nn as nn

# Project-local helpers (create_distill_loader, load_embedding, BiLSTM,
# TASK_CLASSES, TeacherModel, evaluate) are assumed to be importable from
# the surrounding repository.


def do_train(args):
    train_data_loader, dev_data_loader = create_distill_loader(
        args.task_name,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        n_iter=args.n_iter)

    emb_tensor = load_embedding(
        args.vocab_path) if args.use_pretrained_emb else None

    model = BiLSTM(args.emb_dim, args.hidden_size, args.vocab_size,
                   args.output_dim, args.padding_idx, args.num_layers,
                   args.dropout_prob, args.init_scale, emb_tensor)

    if args.optimizer == 'adadelta':
        optimizer = paddle.optimizer.Adadelta(learning_rate=args.lr,
                                              rho=0.95,
                                              parameters=model.parameters())
    else:
        optimizer = paddle.optimizer.Adam(learning_rate=args.lr,
                                          parameters=model.parameters())

    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    klloss = nn.KLDivLoss()  # instantiated but not used in the loss below

    metric_class = TASK_CLASSES[args.task_name][1]
    metric = metric_class()

    teacher = TeacherModel(model_name=args.model_name,
                           param_path=args.teacher_path)

    print("Start to distill student model.")

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.max_epoch):
        model.train()
        for i, batch in enumerate(train_data_loader):
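            # qqp is a sentence-pair task, so the batch carries two student
            # sequences; the other tasks here use a single sequence.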
            if args.task_name == 'qqp':
                bert_input_ids, bert_segment_ids, student_input_ids_1, seq_len_1, student_input_ids_2, seq_len_2, labels = batch
            else:
                bert_input_ids, bert_segment_ids, student_input_ids, seq_len, labels = batch

            # Teacher forward pass (no gradients; the teacher stays frozen).
            with paddle.no_grad():
                teacher_logits = teacher.model(bert_input_ids,
                                               bert_segment_ids)

            # Student forward pass.
            if args.task_name == 'qqp':
                logits = model(student_input_ids_1, seq_len_1,
                               student_input_ids_2, seq_len_2)
            else:
                logits = model(student_input_ids, seq_len)

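            # Distillation objective: a weighted mix of hard-label
            # cross-entropy and MSE against the teacher's logits, with
            # args.alpha balancing the two terms.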
            loss = args.alpha * ce_loss(logits, labels) + (
                1 - args.alpha) * mse_loss(logits, teacher_logits)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            if i % args.log_freq == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.4f step/s"
                    % (global_step, epoch, i, loss, args.log_freq /
                       (time.time() - tic_train)))
                tic_eval = time.time()
                acc = evaluate(args.task_name, model, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                tic_train = time.time()
            global_step += 1
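
For reference, a minimal sketch of how do_train might be launched. Each flag mirrors an args.* attribute read in Example #1; the flag names and defaults are assumptions for illustration, not the repository's actual CLI.

import argparse

# Hypothetical launcher: one flag per args.* attribute used above;
# defaults are illustrative only.
parser = argparse.ArgumentParser()
parser.add_argument("--task_name", default="sst-2")
parser.add_argument("--model_name", default="bert-base-uncased")
parser.add_argument("--vocab_path", default="./vocab.txt")
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--max_seq_length", type=int, default=128)
parser.add_argument("--n_iter", type=int, default=10)
parser.add_argument("--use_pretrained_emb", action="store_true")
parser.add_argument("--emb_dim", type=int, default=300)
parser.add_argument("--hidden_size", type=int, default=300)
parser.add_argument("--vocab_size", type=int, default=30522)
parser.add_argument("--output_dim", type=int, default=2)
parser.add_argument("--padding_idx", type=int, default=0)
parser.add_argument("--num_layers", type=int, default=1)
parser.add_argument("--dropout_prob", type=float, default=0.1)
parser.add_argument("--init_scale", type=float, default=0.1)
parser.add_argument("--optimizer", default="adadelta")
parser.add_argument("--lr", type=float, default=1.0)
parser.add_argument("--teacher_path", default="./teacher.pdparams")
parser.add_argument("--alpha", type=float, default=0.5)
parser.add_argument("--max_epoch", type=int, default=12)
parser.add_argument("--log_freq", type=int, default=10)

do_train(parser.parse_args())
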
Example #2
import os
import time

import paddle
import paddle.nn as nn

# Project-local helpers (create_distill_loader, BiLSTM, TASK_CLASSES,
# TeacherModel, evaluate) are assumed to be importable from the
# surrounding repository.


def do_train(args):
    paddle.set_device(args.device)
    train_data_loader, dev_data_loader = create_distill_loader(
        args.task_name,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        n_iter=args.n_iter,
        whole_word_mask=args.whole_word_mask,
        seed=args.seed)

    model = BiLSTM(args.emb_dim, args.hidden_size, args.vocab_size,
                   args.output_dim, args.vocab_path, args.padding_idx,
                   args.num_layers, args.dropout_prob, args.init_scale,
                   args.embedding_name)

    if args.optimizer == 'adadelta':
        optimizer = paddle.optimizer.Adadelta(learning_rate=args.lr,
                                              rho=0.95,
                                              parameters=model.parameters())
    else:
        optimizer = paddle.optimizer.Adam(learning_rate=args.lr,
                                          parameters=model.parameters())

    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    klloss = nn.KLDivLoss()  # instantiated but not used in the loss below

    metric_class = TASK_CLASSES[args.task_name][1]
    metric = metric_class()

    teacher = TeacherModel(model_name=args.model_name,
                           param_path=args.teacher_path)

    print("Start to distill student model.")

    if args.init_from_ckpt:
        model.set_state_dict(paddle.load(args.init_from_ckpt + ".pdparams"))
        optimizer.set_state_dict(paddle.load(args.init_from_ckpt + ".pdopt"))
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    # Make sure the checkpoint directory exists before the first save below.
    os.makedirs(args.output_dir, exist_ok=True)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.max_epoch):
        model.train()
        for i, batch in enumerate(train_data_loader):
            global_step += 1
            if args.task_name == 'qqp':
                bert_input_ids, bert_segment_ids, student_input_ids_1, seq_len_1, student_input_ids_2, seq_len_2, labels = batch
            else:
                bert_input_ids, bert_segment_ids, student_input_ids, seq_len, labels = batch

            # Teacher forward pass (no gradients; the teacher stays frozen).
            with paddle.no_grad():
                teacher_logits = teacher.model(bert_input_ids,
                                               bert_segment_ids)

            # Student forward pass.
            if args.task_name == 'qqp':
                logits = model(student_input_ids_1, seq_len_1,
                               student_input_ids_2, seq_len_2)
            else:
                logits = model(student_input_ids, seq_len)

            loss = args.alpha * ce_loss(logits, labels) + (
                1 - args.alpha) * mse_loss(logits, teacher_logits)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            if global_step % args.log_freq == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.4f step/s"
                    % (global_step, epoch, i, loss, args.log_freq /
                       (time.time() - tic_train)))
                tic_eval = time.time()
                acc = evaluate(args.task_name, model, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.output_dir,
                                 "step_" + str(global_step) + ".pdparams"))
                paddle.save(
                    optimizer.state_dict(),
                    os.path.join(args.output_dir,
                                 "step_" + str(global_step) + ".pdopt"))