Example #1
def fit(model, training_iter, eval_iter, num_epoch, pbar, lr_decay_mode, initial_lr, verbose=1):
    model.apply(weights_init)

    if use_cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)
    loss_fn = nn.CrossEntropyLoss()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []

    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    start = time.time()
    for e in range(num_epoch):
        if e > 0:
            lr_update(optimizer=optimizer, epoch=e, lr_decay_mode=lr_decay_mode)

        model.train()
        for index, (inputs, label, length) in enumerate(training_iter):
            if config.use_mem_track:
                gpu_tracker.track()
            if use_cuda:
                inputs = Variable(inputs.cuda())
                label = Variable(label.squeeze(1).cuda())
                length = Variable(length.cuda())

            y_preds = model(inputs, length)
            train_loss = loss_fn(y_preds, label)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            train_acc, _ = model.evaluate(y_preds, label)
            pbar.show_process(train_acc, train_loss.data, time.time() - start, index)

            if config.use_mem_track:
                gpu_tracker.track()

        if use_cuda:
            torch.cuda.empty_cache()

        model.eval()
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            count = 0
            for eval_inputs, eval_label, eval_length in eval_iter:
                if use_cuda:
                    eval_inputs, eval_label, eval_length = eval_inputs.cuda(), eval_label.squeeze(1).cuda(), eval_length.cuda()
                y_preds = model(eval_inputs, eval_length)
                eval_loss += loss_fn(y_preds, eval_label).data
                eval_accur, eval_f1_score = model.evaluate(y_preds, eval_label)
                eval_acc += eval_accur
                eval_f1 += eval_f1_score
                count += 1

            logger.info(
                '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc: %.4f - eval_acc: %.4f - eval_f1: %.4f\n'
                % (e + 1,
                   train_loss.data,
                   eval_loss/count,
                   train_acc,
                   eval_acc/count,
                   eval_f1/count))

            if e % verbose == 0:
                train_losses.append(train_loss.data)
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss/count)
                eval_accuracy.append(eval_acc/count)
    model.save()
    loss_acc_plot(history)
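The lr_update helper called at the start of each epoch above is project-specific and not shown on this page. Below is a minimal sketch of what such a per-epoch decay step could look like; the mode strings and the decay_rate/step_size defaults are assumptions, not taken from the original code.

def lr_update(optimizer, epoch, lr_decay_mode, decay_rate=0.9, step_size=10):
    # Hypothetical sketch: mutate each param group's learning rate in place.
    if lr_decay_mode == "exponential":
        # shrink the lr by a constant factor once per epoch
        for param_group in optimizer.param_groups:
            param_group["lr"] *= decay_rate
    elif lr_decay_mode == "step":
        # drop the lr by 10x every step_size epochs
        if epoch % step_size == 0:
            for param_group in optimizer.param_groups:
                param_group["lr"] *= 0.1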
Example #2
def fit(model,
        training_iter,
        eval_iter,
        num_epoch,
        pbar,
        num_train_steps,
        verbose=1):
    # ------------------ determine CUDA mode ----------------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()  # multi-GPU
        # n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    # --------------------- optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps

    ## --------------------- GPU half-precision (fp16) -----------------------------
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    ## ------------------------ GPU single-precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    # --------------------- model initialization ----------------------
    if args.fp16:
        model.half()

    model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)

    elif n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=[0, 1])

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []

    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    # ------------------------ training ------------------------------
    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(num_epoch):
        model.train()
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, start_positions, end_positions, answer_types = batch
            start_logits, end_logits, answer_type_logits = model(
                input_ids, segment_ids, input_mask)
            train_loss = loss_fn(start_logits, end_logits, answer_type_logits,
                                 start_positions, end_positions, answer_types)

            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            start_logits, end_logits = start_logits.cpu(), end_logits.cpu()
            start_positions, end_positions = start_positions.cpu(), end_positions.cpu()
            train_acc, f1 = qa_evaluate((start_logits, end_logits),
                                        (start_positions, end_positions))
            pbar.show_process(train_acc, train_loss.item(), f1,
                              time.time() - start, step)

# ----------------------- evaluation ----------------------------
        model.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_starts_predict, eval_ends_predict = [], []
        eval_starts_label, eval_ends_label = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions, answer_types = batch
                start_logits, end_logits, answer_type_logits = model(
                    input_ids, segment_ids, input_mask)
                eval_los = loss_fn(start_logits, end_logits,
                                   answer_type_logits, start_positions,
                                   end_positions, answer_types)
                eval_loss = eval_los + eval_loss
                count += 1
                eval_starts_predict.append(start_logits)
                eval_ends_predict.append(end_logits)
                eval_starts_label.append(start_positions)
                eval_ends_label.append(end_positions)
            eval_starts_predicted = torch.cat(eval_starts_predict, dim=0).cpu()
            eval_ends_predicted = torch.cat(eval_ends_predict, dim=0).cpu()
            eval_starts_labeled = torch.cat(eval_starts_label, dim=0).cpu()
            eval_ends_labeled = torch.cat(eval_ends_label, dim=0).cpu()
            eval_predicted = (eval_starts_predicted, eval_ends_predicted)
            eval_labeled = (eval_starts_labeled, eval_ends_labeled)

            eval_acc, eval_f1 = qa_evaluate(eval_predicted, eval_labeled)

            logger.info(
                '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc: %.4f - eval_acc: %.4f - eval_f1: %.4f\n'
                % (e + 1, train_loss.item(), eval_loss.item() / count,
                   train_acc, eval_acc, eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)

            if e % verbose == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item() / count)
                eval_accuracy.append(eval_acc)
    loss_acc_plot(history)
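The two BERT-style examples (this one and Example #5) scale the learning rate with a warmup_linear helper that is not defined on this page. The sketch below follows the linear warmup-then-decay schedule of early pytorch-pretrained-bert releases; treat it as an assumption about this project's version.

def warmup_linear(x, warmup=0.002):
    # x is global_step / t_total, i.e. the fraction of training completed.
    # Ramp up linearly during warmup, then decay linearly towards zero.
    if x < warmup:
        return x / warmup
    return 1.0 - x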
Example #3
def fit(model,
        training_iter,
        eval_iter,
        num_epoch,
        pbar,
        lr_decay_mode,
        initial_lr,
        verbose=1):
    model.apply(weights_init)

    if use_cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []

    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    best_f1 = 0

    start = time.time()
    for e in range(num_epoch):
        if e > 0:
            lr_update(optimizer=optimizer,
                      epoch=e,
                      lr_decay_mode=lr_decay_mode)

        model.train()
        for index, (inputs, label, length) in enumerate(training_iter):
            if config.use_mem_track:
                gpu_tracker.track()
            if use_cuda:
                inputs = Variable(inputs.cuda())
                label = Variable(label.cuda())
                length = Variable(length.cuda())

            output = model(inputs, length)
            train_loss = model.loss_fn(output, label, length)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            with torch.no_grad():
                predicts = model.predict(output, length)
                predicts = predicts.view(1, -1).squeeze()
                predicts = predicts[predicts != -1]
                label = label.view(1, -1).squeeze()
                label = label[label != -1]
                train_acc, _ = model.evaluate(predicts, label)
                pbar.show_process(train_acc, train_loss.detach(),
                                  time.time() - start, index)

            if config.use_mem_track:
                gpu_tracker.track()

        if use_cuda:
            torch.cuda.empty_cache()

        model.eval()
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            predict_set, label_set = [], []
            count = 0
            for eval_inputs, eval_label, eval_length in eval_iter:
                if use_cuda:
                    eval_inputs = eval_inputs.cuda()
                    eval_label = eval_label.cuda()
                    eval_length = eval_length.cuda()
                output = model(eval_inputs, eval_length)
                eval_loss += model.loss_fn(output, eval_label,
                                           eval_length).detach()
                eval_predicts = model.predict(output, eval_length)
                eval_predicts = eval_predicts.view(1, -1).squeeze()
                eval_predicts = eval_predicts[eval_predicts != -1]
                predict_set.append(eval_predicts)
                eval_label = eval_label.view(1, -1).squeeze()
                eval_label = eval_label[eval_label != -1]
                label_set.append(eval_label)
                count += 1
            predict_set = torch.cat(predict_set, dim=0)
            label_set = torch.cat(label_set, dim=0)

            eval_acc, eval_f1 = model.evaluate(predict_set, label_set)
            model.class_report(predict_set, label_set)

            logger.info(
                '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc: %.4f - eval_acc: %.4f - eval_f1: %.4f\n'
                % (e + 1, train_loss.detach(), eval_loss / count, train_acc,
                   eval_acc, eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                model.save()

            if e % verbose == 0:
                train_losses.append(train_loss.data)
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss / count)
                eval_accuracy.append(eval_acc)
    model.save()
    loss_acc_plot(history)
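Examples #1, #3 and #6 reset parameters with model.apply(weights_init) before training, but the initializer itself is not shown. A minimal sketch, assuming a standard Xavier initialization for linear and embedding layers (the layer choices are illustrative):

import torch.nn as nn

def weights_init(m):
    # Hypothetical sketch: model.apply() calls this on every submodule recursively.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Embedding):
        nn.init.xavier_uniform_(m.weight)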
Example #4
    def fit(self,
            training_data,
            eval_data,
            pbar,
            num_epochs=100,
            early_stopping_rounds=5,
            verbose=1,
            train_from_scratch=True):
        """train the model"""
        if train_from_scratch is False:
            self.restore_model()

        # Initialize best loss. This variable will store the lowest loss on the
        # eval dataset.
        best_loss = float("inf")

        # Initialize lists to accumulate the loss and accuracy of train and eval
        train_loss = []
        eval_loss = []
        train_accuracy = []
        eval_accuracy = []

        # Initialize dictionary to store the loss history
        self.history['train_loss'] = []
        self.history['eval_loss'] = []
        self.history['train_accuracy'] = []
        self.history['eval_accuracy'] = []

        count = early_stopping_rounds

        # Begin training
        for i in range(num_epochs):
            # Re-initialize the optimizer at the start of each epoch and decide whether to apply learning-rate decay
            learning_rate = lr_update(i + 1, mode=config.lr_mode)
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

            # Training with gradient descent
            start = time.time()
            for index, (sequence, label, _) in enumerate(training_data):
                # on CPU the inputs need an explicit cast, otherwise TF raises "Could not find valid device"
                sequence = tf.cast(sequence, dtype=tf.float32)
                label = tf.cast(label, dtype=tf.int64)
                grads = self.grads_fn(sequence, label, training=True)
                optimizer.apply_gradients(zip(grads, self.variables))
                pbar.show(index, use_time=time.time() - start)

            # Compute the loss on the training data after one epoch
            for sequence, label, _ in training_data:
                sequence = tf.cast(sequence, dtype=tf.float32)
                label = tf.cast(label, dtype=tf.int64)
                train_los = self.loss_fn(sequence, label, training=False)
                train_acc = self.get_accuracy(sequence, label, training=False)
                train_loss.append(train_los)
                train_accuracy.append(train_acc)
            self.history['train_loss'].append(np.mean(train_loss))
            self.history['train_accuracy'].append(np.mean(train_accuracy))

            # Compute the loss on the eval data after one epoch
            for sequence, label, _ in eval_data:
                sequence = tf.cast(sequence, dtype=tf.float32)
                label = tf.cast(label, dtype=tf.int64)
                eval_los = self.loss_fn(sequence, label, training=False)
                eval_acc = self.get_accuracy(sequence, label, training=False)
                eval_loss.append(eval_los)
                eval_accuracy.append(eval_acc)
            self.history['eval_loss'].append(np.mean(eval_loss))
            self.history['eval_accuracy'].append(np.mean(eval_accuracy))

            # Print train and eval losses
            if (i == 0) or ((i + 1) % verbose == 0):
                print(
                    'Epoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc: %.4f - eval_acc: %.4f'
                    % (i + 1, self.history['train_loss'][-1],
                       self.history['eval_loss'][-1],
                       self.history['train_accuracy'][-1],
                       self.history['eval_accuracy'][-1]))

            # Check for early stopping
            if self.history['eval_loss'][-1] < best_loss:
                best_loss = self.history['eval_loss'][-1]
                count = early_stopping_rounds
            else:
                count -= 1
            if count == 0:
                break
        # plot the loss/accuracy curves
        loss_acc_plot(history=self.history)
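Example #4 runs TensorFlow in eager mode and delegates the backward pass to self.grads_fn, which is not shown here. A sketch consistent with how it is called (gradients are zipped with self.variables and passed to apply_gradients), assuming a standard GradientTape pattern:

import tensorflow as tf

def grads_fn(self, sequence, label, training):
    # Hypothetical sketch: record the forward pass and return d(loss)/d(variables).
    with tf.GradientTape() as tape:
        loss = self.loss_fn(sequence, label, training=training)
    return tape.gradient(loss, self.variables)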
Example #5
def fit(model,
        training_iter,
        eval_iter,
        num_epoch,
        pbar,
        num_train_steps,
        verbose=1):
    # ------------------ determine CUDA mode ----------------------
    device = torch.device(args.device if torch.cuda.is_available()
                          and not args.no_cuda else "cpu")

    # --------------------- optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps

    ## --------------------- GPU half-precision (fp16) -----------------------------
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    ## ------------------------ GPU single-precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    # --------------------- model initialization ----------------------
    if args.fp16:
        model.half()

    model.to(device)

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []

    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    # ------------------------ training ------------------------------
    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(num_epoch):
        model.train()
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            # print("input_id", input_ids)
            # print("input_mask", input_mask)
            # print("segment_id", segment_ids)
            bert_encode = model(input_ids, segment_ids, input_mask).cpu()
            train_loss = model.loss_fn(bert_encode=bert_encode,
                                       tags=label_ids,
                                       output_mask=output_mask)

            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            predicts = model.predict(bert_encode, output_mask)
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            label_ids = label_ids.cpu()

            train_acc, f1 = model.acc_f1(predicts, label_ids)
            pbar.show_process(train_acc, train_loss.item(), f1,
                              time.time() - start, step)

# ----------------------- evaluation ----------------------------
        model.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, output_mask = batch
                bert_encode = model(input_ids, segment_ids, input_mask).cpu()
                eval_los = model.loss_fn(bert_encode=bert_encode,
                                         tags=label_ids,
                                         output_mask=output_mask)
                eval_loss = eval_los + eval_loss
                count += 1
                predicts = model.predict(bert_encode, output_mask)
                y_predicts.append(predicts)

                label_ids = label_ids.view(1, -1)
                label_ids = label_ids[label_ids != -1]
                y_labels.append(label_ids)

            eval_predicted = torch.cat(y_predicts, dim=0).cpu()
            eval_labeled = torch.cat(y_labels, dim=0).cpu()

            eval_acc, eval_f1 = model.acc_f1(eval_predicted, eval_labeled)
            model.class_report(eval_predicted, eval_labeled)

            logger.info(
                '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc: %.4f - eval_acc: %.4f - eval_f1: %.4f\n'
                % (e + 1, train_loss.item(), eval_loss.item() / count,
                   train_acc, eval_acc, eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)

            if e % verbose == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item() / count)
                eval_accuracy.append(eval_acc)

    loss_acc_plot(history)
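Examples #2 and #5 persist the best checkpoint through save_model(model, args.output_dir), which is not defined on this page. A minimal sketch, assuming the common convention of unwrapping DataParallel/DDP and saving the state dict (the file name pytorch_model.bin is an assumption):

import os
import torch

def save_model(model, output_dir):
    # Hypothetical sketch: save the unwrapped model's parameters to output_dir.
    os.makedirs(output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, "module") else model
    torch.save(model_to_save.state_dict(),
               os.path.join(output_dir, "pytorch_model.bin"))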
Example #6
def fit(model,
        training_iter,
        eval_iter,
        num_epoch,
        pbar,
        lr_decay_mode,
        initial_lr,
        verbose=1):
    model.apply(weights_init)

    if use_cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)
    loss_fn = nn.CrossEntropyLoss()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []

    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    best_f1 = 0

    start = time.time()
    for e in range(num_epoch):
        if e > 0:
            lr_update(optimizer=optimizer,
                      epoch=e,
                      lr_decay_mode=lr_decay_mode)

        model.train()
        for index, (inputs, label) in enumerate(training_iter):
            if config.use_mem_track:
                gpu_tracker.track()
            if use_cuda:
                inputs = Variable(inputs.cuda())
                label = Variable(label.squeeze(1).cuda())

            y_preds = model(inputs)
            train_loss = loss_fn(y_preds, label)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            train_acc, _ = model.evaluate(y_preds, label)
            pbar.show_process(train_acc, train_loss.data,
                              time.time() - start, index)

            if config.use_mem_track:
                gpu_tracker.track()

        if use_cuda:
            torch.cuda.empty_cache()

        model.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for eval_inputs, eval_label in eval_iter:
                if use_cuda:
                    eval_inputs = eval_inputs.cuda()
                    eval_label = eval_label.squeeze(1).cuda()
                eval_y_preds = model(eval_inputs)
                eval_loss += loss_fn(eval_y_preds, eval_label).data
                y_predicts.append(eval_y_preds)
                y_labels.append(eval_label)
                count += 1
            eval_predicted = torch.cat(y_predicts, dim=0)
            eval_labeled = torch.cat(y_labels, dim=0)

            eval_acc, eval_f1 = model.evaluate(eval_predicted, eval_labeled)
            model.class_report(eval_predicted, eval_labeled)

            logger.info(
                '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc: %.4f - eval_acc: %.4f - eval_f1: %.4f\n'
                % (e + 1, train_loss.data, eval_loss / count, train_acc,
                   eval_acc, eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                model.save()

            if e % verbose == 0:
                train_losses.append(train_loss.data)
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss / count)
                eval_accuracy.append(eval_acc)
    loss_acc_plot(history)
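Every example finishes with loss_acc_plot(history), a plotting helper that is not part of this page. A minimal matplotlib sketch that consumes the history dictionary built above (keyed as in most of the examples); the panel layout is an assumption:

import matplotlib.pyplot as plt

def loss_acc_plot(history):
    # Hypothetical sketch: one panel for losses, one for accuracies, per epoch.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
    ax_loss.plot(history["train_loss"], label="train_loss")
    ax_loss.plot(history["eval_loss"], label="eval_loss")
    ax_loss.set_xlabel("epoch")
    ax_loss.legend()
    ax_acc.plot(history["train_acc"], label="train_acc")
    ax_acc.plot(history["eval_acc"], label="eval_acc")
    ax_acc.set_xlabel("epoch")
    ax_acc.legend()
    plt.tight_layout()
    plt.show()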