def train(self):
        self.model.to(DEVICE)
        # weight_decay is the coefficient on the regularization term; since that term
        # reflects model complexity, weight_decay controls how strongly model complexity
        # contributes to the loss: with a large weight_decay, complex models incur a
        # large loss penalty.
        optimizer = optim.Adam(self.model.parameters(),
                               lr=self.learning_rate,
                               weight_decay=0.0005)
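        # Note: torch.optim.Adam applies weight_decay as an L2 penalty, i.e. it adds
        # weight_decay * w to each parameter's gradient, which is equivalent to adding
        # (weight_decay / 2) * ||w||^2 to the loss.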
        # schedule = ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.1, patience=100, eps=1e-4, verbose=True)
        total_size = math.ceil(self.dataset.get_train_length() /
                               self.batch_size)
        for epoch in range(self.epochs):
            for step in range(self.dataset.get_step() // self.epochs):
                self.model.train()
                # same effect as optimizer.zero_grad(), since the optimizer holds all model parameters
                self.model.zero_grad()
                x_train, y_train = self.dataset.next_train_batch()
                x_val, y_val = self.dataset.next_validation_batch()
                batch = tuple(
                    t.to(DEVICE) for t in create_batch_iter(
                        mode='train', X=x_train, y=y_train).dataset.tensors)
                b_input_ids, b_input_mask, b_labels, b_out_masks = batch
                bert_encode = self.model(b_input_ids, b_input_mask)
                loss = self.model.loss_fn(bert_encode=bert_encode,
                                          tags=b_labels,
                                          output_mask=b_out_masks)
                loss.backward()

                # gradient clipping (currently disabled)
                # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                optimizer.step()
                # schedule.step(loss)
                if step % 50 == 0:
                    self.model.eval()
                    eval_loss, eval_acc, eval_f1 = 0, 0, 0
                    with torch.no_grad():
                        batch = tuple(
                            t.to(DEVICE) for t in create_batch_iter(
                                mode='dev', X=x_val, y=y_val).dataset.tensors)
                        input_ids, input_mask, label_ids, output_mask = batch
                        bert_encode = self.model(input_ids, input_mask)
                        eval_los = self.model.loss_fn(bert_encode=bert_encode,
                                                      tags=label_ids,
                                                      output_mask=output_mask)
                        eval_loss += eval_los
                        predicts = self.model.predict(bert_encode, output_mask)

                        # drop padding positions (labelled -1) before computing metrics
                        label_ids = label_ids.view(1, -1)
                        label_ids = label_ids[label_ids != -1]

                        self.model.acc_f1(predicts, label_ids)
                        self.model.class_report(predicts, label_ids)
                        print('eval_loss: ', eval_loss)
                    print("-" * 50)
                    progress = ("█" * int(step * 25 / total_size)).ljust(25)
                    print("step {}".format(step))
                    print("epoch [{}] |{}| {}/{}\n\tloss {:.2f}".format(
                        epoch, progress, step, total_size, loss.item()))

        save_model(self.model, arguments.output_dir)
    def save_model(self, network, path, name=None, overwrite=False):
        save_model(model=network, output_dir=path)
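# A minimal sketch of how the gradient-clipping and ReduceLROnPlateau pieces that are
# commented out in train() above could be enabled. `model`, `optimizer` and `batch_loss`
# here are placeholders, not names defined in this file.
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

def make_plateau_scheduler(optimizer):
    # lower the LR by 10x when the monitored loss stops improving for 100 checks
    return ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=100, eps=1e-4)

def training_step(model, optimizer, scheduler, batch_loss):
    model.zero_grad()
    batch_loss.backward()
    # clip the global gradient norm to 1.0 before the update
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    # ReduceLROnPlateau steps on a monitored value, not unconditionally
    scheduler.step(batch_loss.item())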
def fit(model, training_iter, eval_iter, num_train_steps, device, n_gpu, verbose=1):
    # ------------------ result visualization (TensorBoard) ------------------------
    if args.local_rank in [-1, 0]:
        TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
        tb_writer = SummaryWriter('log/%s'%TIMESTAMP)
    # --------------------- optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    t_total = num_train_steps

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)#int(t_total*args.warmup_proportion)
    # --------------------- GPU half precision (fp16) -----------------------------
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # --------------------- model initialization ----------------------
    model.to(device)
    tr_loss, logging_loss = 0.0, 0.0
    # ------------------------ training ------------------------------
    best_f1 = 0
    #start = time.time()
    global_step = 0
    set_seed(args, n_gpu)  # Added here for reproducibility (even between Python 2 and 3)
    bar = tqdm(range(t_total), total = t_total)
    nb_tr_examples, nb_tr_steps = 0, 0

    for step in bar:
        model.train()
        batch = next(training_iter)
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                  # XLM doesn't use segment_ids
                  'labels': batch[3]}
        encode = model(**inputs)
        encode = encode[0]  # extract the prediction scores
        loss = model.loss_fn(encode, labels=inputs['labels'])

        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            #torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        else:
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        tr_loss += loss.item()
        train_loss = round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += inputs['input_ids'].size(0)
        nb_tr_steps += 1

        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            optimizer.zero_grad()
            global_step += 1

        if (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))


        if args.local_rank in [-1, 0] and \
                args.do_eval and (step+1)%(args.eval_steps*args.gradient_accumulation_steps)==0:

            # ----------------------- validation ----------------------------
            model.eval()
            y_predicts, y_labels = [], []
            eval_loss, eval_acc, eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0

            for _, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                          # XLM doesn't use segment_ids
                          'labels': batch[3]}
                with torch.no_grad():
                    encode = model(**inputs)
                    encode = encode[0]  # extract the prediction scores
                    eval_los = model.loss_fn(encode, labels=inputs['labels'])

                    predicts = model.predict(encode)#.detach().cpu().numpy()

                nb_eval_examples += inputs['input_ids'].size(0)
                nb_eval_steps += 1
                eval_loss += eval_los.mean().item()
                y_predicts.append(torch.from_numpy(predicts))

                labels = inputs['labels'].view(1, -1)
                labels = labels[labels != -1]
                y_labels.append(labels)

            eval_loss = eval_loss / nb_eval_steps
            eval_predicted = torch.cat(y_predicts, dim=0).cpu().numpy()
            eval_labeled = torch.cat(y_labels, dim=0).cpu().numpy()

            eval_f1 = model.acc_rec_f1(eval_predicted, eval_labeled)#eval_acc, eval_rec,

            logger.info(
                '\n\nglobal_step %d - train_loss: %4f - eval_loss: %4f - eval_f1:%4f\n'
                % (global_step,
                   train_loss,
                   eval_loss,
                   eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)

            if args.local_rank in [-1, 0]:
                tb_writer.add_scalar('train_loss', train_loss, step)
                tb_writer.add_scalar('eval_loss', eval_loss, step)
                tb_writer.add_scalar('eval_f1', eval_f1, step)
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
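# An illustrative, self-contained sketch of the optimization pattern fit() relies on:
# AdamW with weight decay disabled for biases/LayerNorm, a linear warmup + linear decay
# learning-rate schedule, and gradient accumulation. The names `encoder`, `warmup_steps`,
# `total_steps` and `accum_steps` are placeholders, not names from this file;
# WarmupLinearSchedule above comes from the older pytorch-transformers package, and
# LambdaLR is used here only as a stand-in with the same shape.
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

def build_optimizer(encoder, lr=3e-5, weight_decay=0.01, warmup_steps=100, total_steps=1000):
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    grouped = [
        {'params': [p for n, p in encoder.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in encoder.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(grouped, lr=lr)

    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)                                       # linear warmup
        return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))   # linear decay

    return optimizer, LambdaLR(optimizer, lr_lambda)

def accumulate_and_step(loss, step, optimizer, scheduler, accum_steps=2):
    # scale the loss so accumulated gradients match a large-batch update, as fit() does
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()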
    def train(self, train_source, train_target, dev_source, dev_target):
        if os.path.exists(self.args.output_dir) is True:
            shutil.rmtree(self.args.output_dir)

        train_dataloader = create_batch_iter(mode='train', X=train_source, y=train_target, batch_size=self.args.BATCH)
        dev_dataloader = create_batch_iter(mode='dev', X=dev_source, y=dev_target, batch_size=self.args.BATCH)

        self.model.to(DEVICE)

        # optimizer setup
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}]

        optimizer = AdamW(params=optimizer_grouped_parameters, lr=self.args.learning_rate)

        total_size = math.ceil(len(train_source) / self.args.BATCH)

        best_acc = 0
        for epoch in range(self.args.EPOCHS):
            for train_step, train_batch in enumerate(tqdm(train_dataloader, desc='Train_Iteration')):
                self.model.train()
                self.model.zero_grad()

                train_batch = tuple(t.to(DEVICE) for t in train_batch)
                t_input_ids, t_input_mask, t_labels, t_out_masks = train_batch

                t_bert_encode = self.model(t_input_ids, t_input_mask)
                loss = self.model.loss_fn(bert_encode=t_bert_encode, tags=t_labels, output_mask=t_out_masks)
                loss.backward()

                # gradient clipping (currently disabled)
                # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                optimizer.step()

                if train_step % 10 == 0:
                    self.model.eval()
                    eval_loss = 0

                    for dev_step, dev_batch in enumerate(dev_dataloader):
                        dev_batch = tuple(t.to(DEVICE) for t in dev_batch)
                        d_input_ids, d_input_mask, d_label_ids, d_output_mask = dev_batch

                        with torch.no_grad():
                            d_bert_encode = self.model(d_input_ids, d_input_mask)
                        eval_loss += self.model.loss_fn(bert_encode=d_bert_encode, tags=d_label_ids,
                                                        output_mask=d_output_mask)
                        predicts = self.model.predict(d_bert_encode, d_output_mask)

                        d_label_ids = d_label_ids.view(1, -1)
                        d_label_ids = d_label_ids[d_label_ids != -1]

                        eval_acc, eval_f1 = self.model.acc_f1(predicts, d_label_ids)

                        if eval_acc > best_acc:
                            best_acc = eval_acc
                            save_model(self.model, self.args.output_dir)

                        self.model.class_report(predicts, d_label_ids)

                    logger.info("\n>step {}".format(train_step))
                    logger.info("\n>epoch [{}] {}/{}\n\tloss {:.2f}".format(epoch, train_step, total_size, loss.item()))
        # if no best checkpoint was written during training, save the final model
        if not os.path.exists(self.args.output_dir):
            save_model(self.model, self.args.output_dir)
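# The slicing pattern `label_ids.view(1, -1)` followed by `label_ids[label_ids != -1]`
# drops positions padded with the label -1 before metrics are computed. A standalone
# sketch of that masking plus accuracy/F1 (sklearn is assumed here for illustration;
# the repository's model.acc_f1 may compute these scores differently):
import torch
from sklearn.metrics import accuracy_score, f1_score

def masked_acc_f1(pred_ids, label_ids, ignore_index=-1):
    # assumes pred_ids and label_ids flatten to the same length
    labels = label_ids.view(-1)
    preds = pred_ids.view(-1)
    keep = labels != ignore_index               # keep only real (non-padding) tokens
    labels = labels[keep].cpu().numpy()
    preds = preds[keep].cpu().numpy()
    return accuracy_score(labels, preds), f1_score(labels, preds, average='macro')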
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='data/bair')
    parser.add_argument('--model_path', type=str, default='model/bair')
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--horizon', type=int, default=10)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--cpu_workers', type=int, default=4)
    parser.add_argument('--gpu_id', type=int, default=0)
    parser.add_argument('--model_name', type=str, default='cdna')
    parser.add_argument('--start_point', type=int, default=0)
    parser.add_argument('--no-gif', dest='save_gif', action='store_false')
    parser.set_defaults(save_gif=True)

    args = parser.parse_args()

    setup_seed(args.seed)

    device = 'cuda:%d' % args.gpu_id if torch.cuda.device_count() > 0 else 'cpu'

    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # dataset setup
    train_set = VideoDataset(args.data_path,
                             'train',
                             args.horizon,
                             fix_start=False)
    val_set = VideoDataset(args.data_path, 'val', args.horizon, fix_start=True)

    config = train_set.get_config()
    H, W, C = config['observations']
    A = config['actions'][0]
    T = args.horizon

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.cpu_workers)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size,
                                             num_workers=args.cpu_workers)

    # model setup
    if args.model_name == 'cdna':
        model = CDNA(T, H, W, C, A)
    elif args.model_name == 'etd':
        model = ETD(H, W, C, A, T, 5)
    elif args.model_name == 'etds':
        model = ETDS(H, W, C, A, T, 5)
    elif args.model_name == 'etdm':
        model = ETDM(H, W, C, A, T, 5)
    elif args.model_name == 'etdsd':
        model = ETDSD(H, W, C, A, T, 5)
    else:
        raise ValueError('unknown model_name: {}'.format(args.model_name))

    model.to(device)

    model_path = os.path.join(args.model_path,
                              '{}_{}'.format(args.model_name, args.horizon))
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    if args.start_point > 0:
        load_model(model,
                   os.path.join(
                       model_path, '{}_{}.pt'.format(args.model_name,
                                                     args.start_point)),
                   eval_mode=False)

    opt = torch.optim.Adam(model.parameters(), lr=1e-3)

    # tensorboard
    writer = SummaryWriter()

    step = 0
    epoch = args.start_point
    while epoch < args.start_point + args.epoch:
        for j, data in enumerate(train_loader):
            observations = data['observations']
            actions = data['actions']

            # B x T ==> T x B
            observations = torch.transpose(observations, 0, 1).to(device)
            actions = torch.transpose(actions, 0, 1).to(device)

            predicted_observations = model(observations[0], actions)

            loss = mse_loss(observations,
                            predicted_observations) / args.batch_size
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10)

            opt.step()
            opt.zero_grad()

            # add summary
            if step % 100 == 0:
                writer.add_scalar('loss', loss.item(), global_step=step)
                writer.add_video('video',
                                 predicted_observations.permute(1, 0, 2, 3, 4),
                                 global_step=step,
                                 fps=10)

            step += 1

        epoch += 1
        save_model(
            model,
            os.path.join(model_path, '{}_{}.pt'.format(args.model_name,
                                                       epoch)))

        gif_path = os.path.join(model_path, 'val_{}'.format(epoch))
        if not os.path.exists(gif_path):
            os.makedirs(gif_path)

        losses = []
        videos = []

        for j, data in enumerate(val_loader):
            observations = data['observations']
            actions = data['actions']

            # B x T ==> T x B
            observations = torch.transpose(observations, 0, 1).to(device)
            actions = torch.transpose(actions, 0, 1).to(device)

            predicted_observations = model(observations[0], actions)

            video = torch.cat([
                observations[0, 0].unsqueeze(0),
                predicted_observations[0:T - 1, 0]
            ])  # tensor[T, C, H, W]
            videos.append(video.unsqueeze(0).detach())

            if args.save_gif:
                torch_save_gif(os.path.join(gif_path, "{}.gif".format(j)),
                               video.detach().cpu(),
                               fps=10)

            loss = mse_loss(observations,
                            predicted_observations).item() / args.batch_size
            losses.append(loss)

            opt.zero_grad()

        videos = torch.cat(videos, 0)
        writer.add_video('val_video', videos, global_step=epoch, fps=10)

        print("-" * 50)
        print("In epoch {}, loss in val set is {}".format(
            epoch, np.mean(losses)))
        print("-" * 50)
def main():
    """
    Project hyperparameters.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=8, type=int, help="batch size")
    args = parser.parse_args()

    # ------------------ select CUDA or CPU ----------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # ------------------ data preprocessing ----------------------
    dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)

    network = Net.from_pretrained(arguments.bert_model, num_tag=len(arguments.labels)).to(device)
    logger.info('\nPreprocessing finished!\n')
    # --------------------- optimizer -------------------------
    param_optimizer = list(network.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    t_total = int(dataset.get_train_length() / arguments.gradient_accumulation_steps / args.BATCH * args.EPOCHS)

    # --------------------- GPU half precision (fp16) -----------------------------
    if arguments.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=arguments.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if arguments.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=arguments.loss_scale)

    # ------------------------ GPU single precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=arguments.learning_rate,
                             warmup=arguments.warmup_proportion,
                             t_total=t_total
                             )

    # --------------------- model initialization ----------------------
    if arguments.fp16:
        network.half()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []

    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(args.EPOCHS):
        network.train()
        for step in range(dataset.get_step() // args.EPOCHS):
            x_train, y_train = dataset.next_train_batch()
            batch = create_batch_iter(mode='train', X=x_train, y=y_train).dataset.tensors
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = network(input_ids, segment_ids, input_mask)
            train_loss = network.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)

            if arguments.gradient_accumulation_steps > 1:
                train_loss = train_loss / arguments.gradient_accumulation_steps

            if arguments.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % arguments.gradient_accumulation_steps == 0:
                def warmup_linear(x, warmup=0.002):
                    # linear warmup over the first `warmup` fraction of training,
                    # then a linear decay of the LR multiplier towards zero
                    if x < warmup:
                        return x / warmup
                    return 1.0 - x

                # modify learning rate with special warm up BERT uses
                lr_this_step = arguments.learning_rate * warmup_linear(global_step / t_total,
                                                                       arguments.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            predicts = network.predict(bert_encode, output_mask)
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            label_ids = label_ids.cpu()

            train_acc, f1 = network.acc_f1(predicts, label_ids)

        logger.info("\n train_acc: %f - train_loss: %f - f1: %f - using time: %f - step: %d \n" % (train_acc,
                                                                                                   train_loss.item(),
                                                                                                   f1,
                                                                                                   (
                                                                                                           time.time() - start),
                                                                                                   step))

        # ----------------------- validation ----------------------------
        network.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step in range(dataset.get_step() // args.EPOCHS):
                x_val, y_val = dataset.next_validation_batch()
                batch = create_batch_iter(mode='dev', X=x_val, y=y_val).dataset.tensors
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, output_mask = batch
                bert_encode = network(input_ids, segment_ids, input_mask).cpu()
                eval_los = network.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)
                eval_loss += eval_los
                count += 1
                predicts = network.predict(bert_encode, output_mask)
                y_predicts.append(predicts)

                label_ids = label_ids.view(1, -1)
                label_ids = label_ids[label_ids != -1]
                y_labels.append(label_ids)

            eval_predicted = torch.cat(y_predicts, dim=0).cpu()
            eval_labeled = torch.cat(y_labels, dim=0).cpu()
            print('eval:')
            print(eval_predicted.numpy().tolist())
            print(eval_labeled.numpy().tolist())

            eval_acc, eval_f1 = network.acc_f1(eval_predicted, eval_labeled)
            network.class_report(eval_predicted, eval_labeled)

            logger.info(
                '\n\nEpoch %d - train_loss: %4f - eval_loss: %4f - train_acc:%4f - eval_acc:%4f - eval_f1:%4f\n'
                % (e + 1, train_loss.item(), eval_loss.item() / count, train_acc, eval_acc, eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(network, arguments.output_dir)

            if e % 1 == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item() / count)
                eval_accuracy.append(eval_acc)
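# The train_losses / eval_losses / train_accuracy / eval_accuracy lists accumulated above
# are not consumed in this excerpt; a simple way to visualize them after training
# (matplotlib is assumed here for illustration only):
import matplotlib.pyplot as plt

def plot_curves(train_losses, eval_losses, train_accuracy, eval_accuracy, out_path='curves.png'):
    epochs = range(1, len(train_losses) + 1)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(epochs, train_losses, label='train loss')
    ax1.plot(epochs, eval_losses, label='eval loss')
    ax1.set_xlabel('epoch')
    ax1.legend()
    ax2.plot(epochs, train_accuracy, label='train acc')
    ax2.plot(epochs, eval_accuracy, label='eval acc')
    ax2.set_xlabel('epoch')
    ax2.legend()
    fig.tight_layout()
    fig.savefig(out_path)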