Code Example #1
File: train.py  Project: NTT123/sketch-transformer
def _train(start_iteration, model, optimizer, device, train_dataloader,
           test_dataloader, args):
    train_loss = deque(maxlen=args.log_freq)
    test_loss = deque(maxlen=args.log_freq)
    model = model.to(device)
    start_time = time.perf_counter()
    test_iter = iter(test_dataloader)
    train_iter = iter(train_dataloader)
    loss_func = partial(_loss_func, model=model, device=device)
    oclr = OneCycleLR(optimizer,
                      args.learning_rate,
                      pct_start=0.01,
                      total_steps=1_000_000,
                      cycle_momentum=False,
                      last_epoch=start_iteration - 2)

    for iteration in range(start_iteration, 1 + args.num_training_steps):
        loss = loss_func(train_iter)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        oclr.step()
        train_loss.append(loss.detach())

        if iteration % (10 * args.log_freq) == 0:
            ckpt = f'checkpoint_{iteration:07d}.pt'
            print('Saving checkpoint', ckpt)
            torch.save(
                {
                    'iteration': iteration,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'args': args
                }, ckpt)

        if iteration % 20 == 0:
            with torch.no_grad():
                model.eval()
                test_loss.append(loss_func(test_iter).detach())
                model.train()

        if iteration % args.log_freq == 0:
            avg_train_loss = sum(train_loss).item() / len(train_loss)
            avg_test_loss = sum(test_loss).item() / len(test_loss)
            end_time = time.perf_counter()
            duration, start_time = end_time - start_time, end_time
            lr = oclr.get_last_lr()[0]
            with torch.no_grad():
                model.eval()
                cat = random.randrange(0, len(dataset.categories))
                sample = generate(model, device, cat)
                model.train()
            train_sample = next(train_iter)[0, :]
            test_sample = next(test_iter)[0, :]
            plot_encoded_figure(train_sample[:, 0].tolist(),
                                train_sample[0, 2], 'train_sample.png')
            plot_encoded_figure(test_sample[:, 0].tolist(), test_sample[0, 2],
                                'test_sample.png')
            plot_encoded_figure(sample, cat, 'random_sample.png')
            print(
                f'Iteration {iteration:07d}  Train loss {avg_train_loss:.3f}  Test loss {avg_test_loss:.3f}  LR {lr:.3e}  Duration {duration:.3f}'
            )
            if args.use_wandb:
                wandb.log({
                    'iteration': iteration,
                    'train loss': avg_train_loss,
                    'test loss': avg_test_loss,
                    'duration': duration,
                    'learning rate': lr,
                    'train sample': wandb.Image('train_sample.png'),
                    'test sample': wandb.Image('test_sample.png'),
                    'random sample': wandb.Image('random_sample.png'),
                })
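
The training loop above resumes its OneCycleLR schedule by reloading the optimizer from a checkpoint and rebuilding the scheduler with last_epoch=start_iteration - 2. Below is a minimal, self-contained sketch of that checkpoint-and-resume pattern; the model, learning rate and step counts are placeholders, not values from the project.

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR

# Placeholder model and optimizer.
model = torch.nn.Linear(4, 4)
optimizer = Adam(model.parameters(), lr=1e-3)
oclr = OneCycleLR(optimizer, 1e-3, pct_start=0.01,
                  total_steps=1_000, cycle_momentum=False)

# Train for a while, stepping the scheduler once per iteration, then checkpoint.
for iteration in range(1, 101):
    optimizer.zero_grad()
    model(torch.randn(2, 4)).sum().backward()
    optimizer.step()
    oclr.step()
ckpt = {'iteration': 100,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()}

# Resume: reloading the optimizer restores the initial_lr/max_lr/min_lr entries the
# scheduler wrote into its param groups, so OneCycleLR can be rebuilt mid-cycle by
# fast-forwarding it with last_epoch.
optimizer = Adam(model.parameters(), lr=1e-3)
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
start_iteration = ckpt['iteration'] + 1
oclr = OneCycleLR(optimizer, 1e-3, pct_start=0.01,
                  total_steps=1_000, cycle_momentum=False,
                  last_epoch=start_iteration - 2)
print(oclr.get_last_lr())  # learning rate at the resumed position
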
Code Example #2
def train(args, training_features, model, tokenizer):
    """ Train the model """
    wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"),
               config=args,
               name=args.run_name)
    wandb.watch(model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    else:
        amp = None

    # model recover
    recover_step = utils.get_max_epoch_model(args.output_dir)

    # if recover_step:
    #     model_recover_checkpoint = os.path.join(args.output_dir, "model.{}.bin".format(recover_step))
    #     logger.info(" ** Recover model checkpoint in %s ** ", model_recover_checkpoint)
    #     model_state_dict = torch.load(model_recover_checkpoint, map_location='cpu')
    #     optimizer_recover_checkpoint = os.path.join(args.output_dir, "optim.{}.bin".format(recover_step))
    #     checkpoint_state_dict = torch.load(optimizer_recover_checkpoint, map_location='cpu')
    #     checkpoint_state_dict['model'] = model_state_dict
    # else:
    checkpoint_state_dict = None

    model.to(args.device)
    model, optimizer = prepare_for_training(args,
                                            model,
                                            checkpoint_state_dict,
                                            amp=amp)

    if args.n_gpu == 0 or args.no_cuda:
        per_node_train_batch_size = args.per_gpu_train_batch_size * args.gradient_accumulation_steps
    else:
        per_node_train_batch_size = args.per_gpu_train_batch_size * args.n_gpu * args.gradient_accumulation_steps

    train_batch_size = per_node_train_batch_size * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)
    global_step = recover_step if recover_step else 0

    if args.num_training_steps == -1:
        args.num_training_steps = int(args.num_training_epochs *
                                      len(training_features) /
                                      train_batch_size)

    if args.warmup_portion:
        args.num_warmup_steps = args.warmup_portion * args.num_training_steps

    if args.scheduler == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.num_warmup_steps,
            num_training_steps=args.num_training_steps,
            last_epoch=-1)

    elif args.scheduler == "constant":
        scheduler = get_constant_schedule(optimizer, last_epoch=-1)

    elif args.scheduler == "1cycle":
        scheduler = OneCycleLR(optimizer,
                               max_lr=args.learning_rate,
                               total_steps=args.num_training_steps,
                               pct_start=args.warmup_portion,
                               anneal_strategy=args.anneal_strategy,
                               final_div_factor=1e4,
                               last_epoch=-1)

    else:
        assert False

    if checkpoint_state_dict:
        scheduler.load_state_dict(checkpoint_state_dict["lr_scheduler"])

    train_dataset = utils.Seq2seqDatasetForBert(
        features=training_features,
        max_source_len=args.max_source_seq_length,
        max_target_len=args.max_target_seq_length,
        vocab_size=tokenizer.vocab_size,
        cls_id=tokenizer.cls_token_id,
        sep_id=tokenizer.sep_token_id,
        pad_id=tokenizer.pad_token_id,
        mask_id=tokenizer.mask_token_id,
        random_prob=args.random_prob,
        keep_prob=args.keep_prob,
        offset=train_batch_size * global_step,
        num_training_instances=train_batch_size * args.num_training_steps,
    )

    logger.info("Check dataset:")
    for i in range(5):
        source_ids, target_ids, pseudo_ids, num_source_tokens, num_target_tokens = train_dataset[i]
        logger.info("Instance-%d" % i)
        logger.info("Source tokens = %s" %
                    " ".join(tokenizer.convert_ids_to_tokens(source_ids)))
        logger.info("Target tokens = %s" %
                    " ".join(tokenizer.convert_ids_to_tokens(target_ids)))

    logger.info("Mode = %s" % str(model))

    # Train!
    logger.info("  ***** Running training *****  *")
    logger.info("  Num examples = %d", len(training_features))
    logger.info("  Num Epochs = %.2f",
                len(train_dataset) / len(training_features))
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info("  Batch size per node = %d", per_node_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", args.num_training_steps)

    if args.num_training_steps <= global_step:
        logger.info(
            "Training is done. Please use a new dir or clean this dir!")
    else:
        # The training features are shuffled
        train_sampler = SequentialSampler(train_dataset) \
            if args.local_rank == -1 else DistributedSampler(train_dataset, shuffle=False)
        train_dataloader = DataLoader(
            train_dataset,
            sampler=train_sampler,
            batch_size=per_node_train_batch_size // args.gradient_accumulation_steps,
            collate_fn=utils.batch_list_to_batch_tensors)

        train_iterator = tqdm.tqdm(train_dataloader,
                                   initial=global_step,
                                   desc="Iter (loss=X.XXX, lr=X.XXXXXXX)",
                                   disable=args.local_rank not in [-1, 0])

        model.train()
        model.zero_grad()

        tr_loss, logging_loss = 0.0, 0.0

        for step, batch in enumerate(train_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'source_ids': batch[0],
                'target_ids': batch[1],
                'pseudo_ids': batch[2],
                'num_source_tokens': batch[3],
                'num_target_tokens': batch[4]
            }
            loss = model(**inputs)
            if args.n_gpu > 1:
                loss = loss.mean()  # average on multi-GPU parallel (not distributed) training

            train_iterator.set_description(
                'Iter (loss=%5.3f, lr=%9.7f)' %
                (loss.item(), scheduler.get_last_lr()[0]))

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            logging_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 \
                        and global_step % args.logging_steps == 0:
                    wandb.log(
                        {
                            'lr': scheduler.get_last_lr()[0],
                            'loss': logging_loss / args.logging_steps
                        },
                        step=global_step)

                    logger.info(" Step [%d ~ %d]: %.2f",
                                global_step - args.logging_steps, global_step,
                                logging_loss)
                    logging_loss = 0.0

                if args.local_rank in [-1, 0] and args.save_steps > 0 and \
                        (global_step % args.save_steps == 0 or global_step == args.num_training_steps):

                    save_path = os.path.join(args.output_dir,
                                             "ckpt-%d" % global_step)
                    os.makedirs(save_path, exist_ok=True)
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(save_path)

                    # optim_to_save = {
                    #     "optimizer": optimizer.state_dict(),
                    #     "lr_scheduler": scheduler.state_dict(),
                    # }
                    # if args.fp16:
                    #     optim_to_save["amp"] = amp.state_dict()
                    # torch.save(
                    #     optim_to_save, os.path.join(args.output_dir, 'optim.{}.bin'.format(global_step)))

                    logger.info("Saving model checkpoint %d into %s",
                                global_step, save_path)

    wandb.save(f'{save_path}/*')
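
Before building its scheduler, the example above turns epochs, dataset size, per-GPU batch size and gradient accumulation into a single optimizer-step budget (num_training_steps) and a warmup fraction. The short sketch below walks through that arithmetic with made-up numbers; none of the values come from the snippet.

# Illustrative values only.
num_examples = 100_000            # len(training_features)
per_gpu_batch_size = 8            # args.per_gpu_train_batch_size
n_gpu = 4
gradient_accumulation_steps = 4
num_training_epochs = 3
warmup_portion = 0.1

# Effective batch consumed per optimizer step on one node.
train_batch_size = per_gpu_batch_size * n_gpu * gradient_accumulation_steps   # 128
# Optimizer steps for the whole run (the args.num_training_steps == -1 branch).
num_training_steps = int(num_training_epochs * num_examples / train_batch_size)   # 2343
# Warmup expressed in steps for the linear schedule.
num_warmup_steps = warmup_portion * num_training_steps   # 234.3
print(train_batch_size, num_training_steps, num_warmup_steps)

The 1cycle branch hands the same fraction to OneCycleLR directly as pct_start, so warmup length and total length stay consistent across the scheduler choices.
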
Code Example #3
File: train.py  Project: yupeijei1997/NLPCollection
def train(args, writer):

    # 1. Data processing
    # Get the predefined fields and the pre-split training dataset.
    # Each row of train_dataset is a torchtext.data.Example object whose 'id', 'category' and
    # 'news_text' attributes hold the data of one row of the original CSV.
    # Nothing is numericalized yet; that only happens when the iterator is built.
    fields, train_dataset = build_and_cache_dataset(args, mode='train')

    # NEWS_TEXT and CATEGORY store the vocabularies and are used later when building the iterator
    ID, CATEGORY, NEWS_TEXT = fields
    # Pretrained word vectors
    vectors = Vectors(name=args.embed_path, cache=args.data_dir)

    # import gensim
    # word2vec = gensim.models.KeyedVectors.load_word2vec_format(args.embed_path, binary=True)

    # Build the dataset vocabulary and load the pretrained word vectors.
    # The vocabulary is a Vocab object stored on the Field object NEWS_TEXT: stoi maps words to
    # indices and vectors is the embedding matrix, and the two are aligned, so the word mapped
    # to index 0 also owns row 0 of vectors.
    NEWS_TEXT.build_vocab(
        train_dataset,  # build the vocabulary from the training dataset
        max_size=args.vocab_size,  # maximum vocabulary size
        vectors=vectors,  # pull the matching vectors for the vocabulary out of the pretrained embeddings
        unk_init=torch.nn.init.xavier_normal_,
    )
    # Build the label vocabulary, stored as a Vocab object on the Field object CATEGORY
    CATEGORY.build_vocab(train_dataset)
    # Instantiate the model
    model = TextClassifier(
        vocab_size=len(NEWS_TEXT.vocab),  # number of distinct words, i.e. the vocabulary size
        output_dim=args.num_labels,  # number of classes
        pad_idx=NEWS_TEXT.vocab.stoi[NEWS_TEXT.pad_token],  # index of the '<pad>' token, looked up in stoi
        dropout=args.dropout,
    )

    # Copy the pretrained vectors into the embedding layer.
    # nn.Embedding.from_pretrained is a classmethod that builds and returns a new module, so
    # calling it on the existing layer would discard the result; copy the weights in place instead.
    model.embedding.weight.data.copy_(NEWS_TEXT.vocab.vectors)

    # Build the training iterator; at this step the news_text attribute of each
    # torchtext.data.Example is numericalized.
    # Sentences in a batch are padded to the length of the longest sentence in that batch,
    # and batch.news_text also records every sentence's true length.
    bucket_iterator = BucketIterator(
        train_dataset,
        batch_size=args.train_batch_size,  # batch size
        sort_within_batch=True,  # sort within each batch
        shuffle=True,  # 2. shuffle the order of the batches
        sort_key=lambda x: len(x.news_text),  # 1. sort by sentence length; x is one Example (one row of the training set)
        device=args.device,  # place the tensors on the GPU
    )

    # 2. Training
    model.to(args.device)
    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = Adam(model.parameters(),
                     lr=args.learning_rate,
                     eps=args.adam_epsilon)
    # Learning-rate schedule (stepped once per batch)
    scheduler = OneCycleLR(optimizer,
                           max_lr=args.learning_rate * 10,
                           epochs=args.num_train_epochs,
                           steps_per_epoch=len(bucket_iterator))

    global_step = 0
    # Zero the gradients
    model.zero_grad()

    # trange(i) is shorthand for tqdm(range(i)); tqdm also accepts any list,
    # so the line below is equivalent to tqdm(range(0, args.num_train_epochs))
    train_trange = trange(0, args.num_train_epochs, desc="Train epoch")

    for _ in train_trange:
        epoch_iterator = tqdm(bucket_iterator, desc='Training')  # progress bar

        # For each batch: one forward pass, one backward pass, then a parameter update
        for step, batch in enumerate(epoch_iterator):  # the bar only reaches 100% once the loop finishes
            model.train()

            # news_text: all sentences stacked into one tensor; each column is one numericalized
            # sentence (a sequence of vocabulary indices), so there are batch_size columns
            # news_text_lengths: the true length of every sentence
            news_text, news_text_lengths = batch.news_text
            # print(batch.news_text)
            #
            # print(len(news_text))
            # print(news_text.shape)
            #
            # print(len(news_text_lengths))
            # print(news_text_lengths)
            category = batch.category  # the labels

            # Forward pass
            preds = model(news_text, news_text_lengths)

            # Compute the loss
            loss = criterion(preds, category)
            # Backpropagate to compute the gradients
            loss.backward()

            # Write the per-batch loss to TensorBoard
            writer.add_scalar('Train/Loss', loss.item(), global_step)
            # Write the per-batch learning rate to TensorBoard
            writer.add_scalar('Train/lr',
                              scheduler.get_last_lr()[0], global_step)

            # NOTE: the optimizer must be stepped before the scheduler
            # Update the parameters
            optimizer.step()
            # Update the learning rate
            scheduler.step()
            # Clear the gradients before the next batch
            optimizer.zero_grad()
            # Count how many batches have been used for parameter updates
            global_step += 1

            # Evaluation: run every args.logging_steps steps
            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # Returns a dict with the loss, precision, recall and f1_score
                results = evaluate(args, model, CATEGORY.vocab,
                                   NEWS_TEXT.vocab)

                # Write loss, precision, recall and f1_score to TensorBoard
                for key, value in results.items():
                    writer.add_scalar("Eval/{}".format(key), value,
                                      global_step)

            # Save the model every args.save_steps steps
            if args.save_steps > 0 and global_step % args.save_steps == 0:
                save_model(args, model, optimizer, scheduler, global_step)

    writer.close()
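
This example builds OneCycleLR from epochs and steps_per_epoch instead of an explicit total_steps, and steps it once per batch immediately after the optimizer. The self-contained sketch below reproduces just that pattern with a placeholder model and made-up sizes.

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR

model = torch.nn.Linear(10, 2)              # stand-in for TextClassifier
optimizer = Adam(model.parameters(), lr=1e-3)
num_epochs, steps_per_epoch = 3, 50         # steps_per_epoch plays the role of len(bucket_iterator)
scheduler = OneCycleLR(optimizer,
                       max_lr=1e-2,         # 10x the base learning rate, as in the snippet
                       epochs=num_epochs,
                       steps_per_epoch=steps_per_epoch)

for epoch in range(num_epochs):
    for step in range(steps_per_epoch):
        x = torch.randn(16, 10)             # dummy batch
        loss = model(x).pow(2).mean()       # dummy loss
        loss.backward()
        optimizer.step()                    # update the parameters first ...
        scheduler.step()                    # ... then advance the learning-rate schedule
        optimizer.zero_grad()               # clear gradients for the next batch
print(scheduler.get_last_lr())
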
Code Example #4
class Detector(object):
    def __init__(self, cfg):
        self.device = cfg["device"]
        self.model = Models().get_model(cfg["network"]) # cfg.network
        self.model.to(self.device)
        params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = AdamW(params, lr=0.00001)
        self.lr_scheduler = OneCycleLR(self.optimizer,
                                       max_lr=1e-4,
                                       epochs=cfg["nepochs"],
                                       steps_per_epoch=169,  # len(dataloader)/accumulations
                                       div_factor=25,  # for initial lr, default: 25
                                       final_div_factor=1e3,  # for final lr, default: 1e4
                                       )

    def fit(self, data_loader, accumulation_steps=4, wandb=None):
        self.model.train()
        #     metric_logger = utils.MetricLogger(delimiter="  ")
        #     metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        avg_loss = MetricLogger('scalar')
        total_loss = MetricLogger('dict')
        lr_log = MetricLogger('list')

        self.optimizer.zero_grad()
        device = self.device

        for i, (images, targets) in enumerate(data_loader):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.detach().item()
            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            losses.backward()
            if (i+1) % accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

                if self.lr_scheduler is not None:
                    self.lr_scheduler.step()
                    lr_log.update(self.lr_scheduler.get_last_lr())


            print(f"\rTrain iteration: [{i+1}/{len(data_loader)}]", end="")
            avg_loss.update(loss_value)
            total_loss.update(loss_dict)

            # metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            # metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        print()
        #print(loss_dict)
        return {"train_avg_loss": avg_loss.avg}, total_loss.avg


    def mixup_fit(self, data_loader, accumulation_steps=4, wandb=None):
        self.model.train()
        torch.cuda.empty_cache()
        #     metric_logger = utils.MetricLogger(delimiter="  ")
        #     metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        avg_loss = MetricLogger('scalar')
        total_loss = MetricLogger('dict')
        #lr_log = MetricLogger('list')

        self.optimizer.zero_grad()
        device = self.device

        for i, (batch1, batch2) in enumerate(data_loader):
            images1, targets1 = batch1
            images2, targets2 = batch2
            images = mixup_images(images1, images2)
            targets = merge_targets(targets1, targets2)
            del images1, images2, targets1, targets2, batch1, batch2

            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.detach().item()
            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            losses.backward()
            if (i+1) % accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

                if self.lr_scheduler is not None:
                    self.lr_scheduler.step()
                    #lr_log.update(self.lr_scheduler.get_last_lr())


            print(f"Train iteration: [{i+1}/{674}]\r", end="")
            avg_loss.update(loss_value)
            total_loss.update(loss_dict)

            # metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            # metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        print()
        #print(loss_dict)
        return {"train_avg_loss": avg_loss.avg}, total_loss.avg


    def evaluate(self, val_dataloader):
        device = self.device
        torch.cuda.empty_cache()
        # self.model.to(device)
        self.model.eval()
        mAp_logger = MetricLogger('list')
        with torch.no_grad():
            for (j, batch) in enumerate(val_dataloader):
                print(f"\rValidation: [{j+1}/{len(val_dataloader)}]", end="")
                images, targets = batch
                del batch
                images = [img.to(device) for img in images]
                # targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                predictions = self.model(images)#, targets)
                for i, pred in enumerate(predictions):
                    probas = pred["scores"].detach().cpu().numpy()
                    mask = probas > 0.6
                    preds = pred["boxes"].detach().cpu().numpy()[mask]
                    gts = targets[i]["boxes"].detach().cpu().numpy()
                    score, scores = map_score(gts, preds, thresholds=[.5, .55, .6, .65, .7, .75])
                    mAp_logger.update(scores)
            print()
        return {"validation_mAP_score": mAp_logger.avg}

    def get_checkpoint(self):
        self.model.eval()
        model_state = self.model.state_dict()
        optimizer_state = self.optimizer.state_dict()
        checkpoint = {'model_state_dict': model_state,
                      'optimizer_state_dict': optimizer_state
                      }
        # if self.lr_scheduler:
        #     scheduler_state = self.lr_scheduler.state_dict()
        #     checkpoint['lr_scheduler_state_dict'] = scheduler_state

        return checkpoint

    def load_checkpoint(self, checkpoint):
        self.model.eval()
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
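
Detector above steps the optimizer, and therefore the scheduler, only every accumulation_steps batches, so the steps_per_epoch handed to OneCycleLR must count optimizer steps per epoch rather than batches (the hard-coded 169 corresponds to the dataloader length divided by the accumulation count). The sketch below illustrates that bookkeeping with placeholder values; unlike the snippet, it also scales the loss by the accumulation factor, which is a common but separate choice.

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

model = torch.nn.Linear(8, 1)               # stand-in for the detection model
optimizer = AdamW(model.parameters(), lr=1e-5)

nepochs, batches_per_epoch, accumulation_steps = 2, 676, 4
scheduler = OneCycleLR(optimizer,
                       max_lr=1e-4,
                       epochs=nepochs,
                       steps_per_epoch=batches_per_epoch // accumulation_steps,  # optimizer steps, not batches
                       div_factor=25,
                       final_div_factor=1e3)

optimizer.zero_grad()
for epoch in range(nepochs):
    for i in range(batches_per_epoch):
        loss = model(torch.randn(4, 8)).mean()      # dummy loss
        (loss / accumulation_steps).backward()      # scale so accumulated gradients stay comparable
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()                        # exactly once per optimizer step
print(scheduler.get_last_lr())
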