Example #1
def train(model, train_list, test_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=Config.batch_size,
                                  shuffle=True,
                                  num_workers=2,
                                  collate_fn=collate_fn,
                                  drop_last=True)
    model.train()

    # total number of optimizer steps across all epochs
    total_steps = int(len(train_dataset) * Config.epochs /
                      Config.batch_size / Config.gradient_accumulation)
    print("total train step num: {}".format(total_steps))

    optimizer = BertAdam(model.parameters(),
                         lr=Config.lr,
                         warmup=0.05,
                         t_total=total_steps)
    print('start training...')
    # start training
    for epoch in range(Config.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # Note: GPT-2's forward() generates one token for the given context, not a whole sequence.
            # Given n token_ids, GPT2Model outputs n hidden states; the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(Config.device)

            outputs = model.forward(input_ids=input_ids)

            loss, accuracy = calculate_loss_and_accuracy(outputs,
                                                         labels=input_ids)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           Config.max_grad_norm)

            optimizer.step()
            optimizer.zero_grad()
            print('epoch:{}, step:{}, loss:{:.6f}, accuracy:{:.6f}'.format(
                epoch + 1, batch_idx + 1, loss, accuracy))

        average_acc, average_loss = evaluate(model, test_list)
        res = "VALID epoch:{}, loss {:6f},  acc {:6f}".format(
            epoch, average_loss, average_acc)
        print(res)
        res += '\n'
        with open('log.txt', 'a+') as f:
            f.write(res)
        # save the model after each epoch
        model_path = join(Config.model_output_path,
                          'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        epoch_finish_time = datetime.now()
        print('time for one epoch: {}'.format(epoch_finish_time -
                                              epoch_start_time))
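
The calculate_loss_and_accuracy helper called above is project-specific and not reproduced in this example. A minimal sketch of what such a helper commonly looks like for GPT-2 language-model training, assuming the model output exposes the logits as outputs[0] and that a pad_id value marks padding (both are assumptions of this sketch, not taken from the source):

import torch
import torch.nn.functional as F

def calculate_loss_and_accuracy(outputs, labels, pad_id=0):
    # Hypothetical helper: shift logits/labels so that position n predicts token n+1,
    # then compute cross entropy and token-level accuracy over non-padding positions.
    logits = outputs[0]                            # (batch, seq_len, vocab)
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()

    loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                           shift_labels.view(-1),
                           ignore_index=pad_id,
                           reduction='mean')

    preds = shift_logits.argmax(dim=-1)
    mask = shift_labels.ne(pad_id)
    accuracy = (preds.eq(shift_labels) & mask).sum().float() / mask.sum().clamp(min=1).float()
    return loss, accuracy
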
Example #2
def detect_test_data(model_path='weights/face_mask_weights.pth', num=20):
    yolo_detector = YOLO4_inference(model_path=model_path)
    test_data = MyDataset(test_root)
    for i in random.sample(range(len(test_data)), num):
        img, gt = test_data.getRaw(i)
        image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        boxes, labels, scores = yolo_detector.predict(image)
        detect_test_data_show(image, gt, boxes, labels, scores)
Example #3
def train(args):
    print(args)
    ds_train = MyDataset(My_PATH, set='train')
    ds_val = MyDataset(My_PATH, set='val')
    loader_train = data_utils.DataLoader(ds_train,
                                         batch_size=args.batch_size,
                                         num_workers=args.nb_worker,
                                         shuffle=True)
    loader_val = data_utils.DataLoader(ds_val,
                                       batch_size=1,
                                       num_workers=1,
                                       shuffle=False)

    model = torch.load('./models/2.pkl')
    model = model.cuda(0)

    trainAcc = 0
    trainNum = len(ds_train)
    for i, (images, label) in enumerate(loader_train):
        images = images.cuda(0)
        label = label.cuda(0)

        images = Variable(images)
        label = Variable(label)

        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        trainAcc += torch.sum(pred == label.data)
        print(i)
        print(trainAcc)
    print('-----------------------------')
    valAcc = 0
    for i, (images, label) in enumerate(loader_val):
        images = images.cuda(0)
        images = Variable(images)
        label = label.cuda(0)

        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        valAcc += torch.sum(pred == label)
        print(i)
        print(valAcc)
    print("Epoch [%d/%d], trainAcc: %.4f, valAcc: %.4f" %
          (1, args.nb_epoch, int(trainAcc) * 1.0 / trainNum,
           int(valAcc) * 1.0 / (i + 1)))
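
Both accuracy loops above run the network with gradients enabled and with the model left in whatever mode it was loaded in. For pure evaluation, the usual PyTorch pattern is eval mode plus torch.no_grad(); a generic sketch (not a drop-in replacement for the code above) looks like:

import torch

def count_correct(model, loader, device='cuda:0'):
    # Generic evaluation sketch: disable gradient tracking and switch to
    # eval mode (which affects dropout and batch-norm layers).
    model.eval()
    correct = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
    return correct
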
Example #4
def train(model, device, train_list, multi_gpu, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    total_steps = int(len(train_dataset) * args.epochs / args.batch_size /
                      args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))

    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=args.lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=args.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    running_loss = 0
    overall_step = -1

    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
    logger.info("running loss:{}, overall step:{}".format(
        running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            continue

        input_ids = input_ids.to(device)
        try:
            mu, logvar, bow_probs = model.forward(input=input_ids)

            bow_loss = calculate_bow(bow_probs, input_ids, device)

            loss = bow_loss

            if multi_gpu:
                loss = loss.mean()
            if args.gradient_accumulation > 1:
                loss = loss / args.gradient_accumulation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            if (batch_idx + 1) % args.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
                if (overall_step + 1) % args.log_step == 0 or (overall_step + 1
                                                               == total_steps):
                    logger.info("step {}, loss {:.6}".format(
                        overall_step, loss))
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % save_step == 0 or (overall_step
                                                       == total_steps):
                logger.info('saving for step {}'.format(overall_step))
                if not os.path.exists(args.model_output_path):
                    os.mkdir(args.model_output_path)

                torch.save(
                    {
                        # 'finished_epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'overall_step': overall_step,
                        'running_loss': running_loss
                    },
                    model_path)

                logger.info('finish saving for step {}'.format(overall_step))

        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory,times: {}".format(
                        oom_time))
                    logger.info("batch_idx = ", batch_idx)
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception

    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time -
                                                epoch_start_time))
    logger.info('training finished')
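
calculate_bow is another project-specific helper that is not shown. Purely as an illustration of what a bag-of-words auxiliary loss of this shape might look like, here is a hypothetical version; the (batch, vocab_size) layout of bow_probs and the pad_id argument are assumptions of this sketch, not taken from the source:

import torch.nn.functional as F

def calculate_bow(bow_probs, input_ids, device, pad_id=0):
    # Hypothetical bag-of-words loss: bow_probs is assumed to be one score vector
    # over the vocabulary per sequence, shape (batch, vocab_size). Every target
    # token should receive high probability, with padding positions ignored.
    # The device argument is kept only to match the call site above.
    log_probs = F.log_softmax(bow_probs, dim=-1)        # (batch, vocab_size)
    token_log_probs = log_probs.gather(1, input_ids)    # (batch, seq_len)
    mask = input_ids.ne(pad_id).float()
    loss = -(token_log_probs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return loss.mean()
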
Example #5
def train(model, device, train_list, multi_gpu, args, valid_list, test_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    model.train()
    # total number of optimizer steps across all epochs
    total_steps = int(len(train_dataset) * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))

    # set up the optimizer and use a warmup schedule at the start of training
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    # accumulated loss across gradient-accumulation steps
    running_loss = 0
    # total number of optimizer steps taken
    overall_step = 0
    # tensorboardX writer
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # number of out-of-memory events
    oom_time = 0
    # start training
    for epoch in range(args.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # Note: GPT-2's forward() generates one token for the given context, not a whole sequence.
            # Given n token_ids, GPT2Model outputs n hidden states; the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(device)
            # guard against CUDA out-of-memory errors during training
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)

                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if args.gradient_accumulation > 1:
                    loss = loss / args.gradient_accumulation
                    accuracy = accuracy / args.gradient_accumulation
                loss.backward()
                # gradient clipping: cap the gradient norm to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # update parameters after accumulating gradients for the configured number of steps
                if (batch_idx + 1) % args.gradient_accumulation == 0:
                    running_loss += loss.item()
                    # update parameters
                    optimizer.step()
                    # clear gradients
                    optimizer.zero_grad()
                    # advance the warmup schedule
                    scheduler.step()
                    overall_step += 1
                    # update the log and tensorboardX
                    if (overall_step + 1) % args.log_step == 0:
                        logger.info(
                            "batch {} of epoch {}, loss {}, accuracy {}".format(batch_idx + 1, epoch + 1, loss,
                                                                                accuracy))
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory,times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(exception))
                    raise exception
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if args.train_mmi:  # currently training the MMI model
            model_path = join(args.mmi_model_output_path, 'model_epoch{}'.format(epoch + 1))
        else:  # currently training the dialogue model
            model_path = join(args.dialogue_model_output_path, 'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
        
        logger.info ("Start Valid Set")
        evaluate(model, device, valid_list, multi_gpu, args)
        logger.info ("Start Test Set")
        evaluate(model, device, test_list, multi_gpu, args)
        
    logger.info('training finished')
Example #6
def train(model, device, train_list, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    total_steps = int(len(train_dataset) * args.epochs / args.batch_size)
    logger.info('total training steps = {}'.format(total_steps))

    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=args.lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=args.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    kl_anneal_x0 = int(total_steps * args.kl_anneal_percentage)

    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # finished_epoch = checkpoint['finished_epoch'] + 1
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
    logger.info("running loss:{}, overall step:{}".format(
        running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    # for epoch in range(finished_epoch, args.epochs):
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            continue

        input_ids = input_ids.to(device)
        try:
            outputs, mu, logvar, bow_probs = model.forward(input=input_ids)
            # anneal_function, step, k, x0
            ce, accuracy = calculate_loss_and_accuracy(outputs,
                                                       labels=input_ids,
                                                       device=device)

            kl_weight = min(
                0.5,
                kl_anneal_function(anneal_function=args.kl_anneal_function,
                                   step=overall_step,
                                   k=args.kl_anneal_k,
                                   x0=kl_anneal_x0))
            kld = (-0.5 *
                   torch.sum(logvar - torch.pow(mu, 2) - torch.exp(logvar) + 1,
                             1)).mean().squeeze()

            bow_loss = calculate_bow(bow_probs, input_ids, device)

            loss = ce + kl_weight * kld + args.bow_weight * bow_loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            overall_step += 1
            if overall_step == 0 or (
                    overall_step + 1) % args.log_step == 0 or (overall_step + 1
                                                               == total_steps):
                logger.info(
                    "step {}, ce {:.6}, kld {:.6}, kl_weight {:.6}, bow {:.6}, bow_weight {:.6}, loss {:.6}, accuracy {:.6}"
                    .format(overall_step, ce, kld, kl_weight, bow_loss,
                            args.bow_weight, loss, accuracy))
                tb_writer.add_scalar('ce', ce.item(), overall_step)
                tb_writer.add_scalar('kld', kld.item(), overall_step)
                tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % save_step == 0 or (overall_step + 1
                                                       == total_steps):
                logger.info('saving for step {}'.format(overall_step))
                if not os.path.exists(args.model_output_path):
                    os.mkdir(args.model_output_path)

                torch.save(
                    {
                        # 'finished_epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'overall_step': overall_step,
                        'running_loss': running_loss
                    },
                    model_path)

                decoder_path = join(args.model_output_path, 'decoder/')

                if not os.path.exists(decoder_path):
                    os.mkdir(decoder_path)

                model.save_decoder(decoder_path)
                logger.info('finish saving for step {}'.format(overall_step))

        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory,times: {}".format(
                        oom_time))
                    logger.info("batch_idx = ", batch_idx)
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception

    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time -
                                                epoch_start_time))
    logger.info('training finished')
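
Example #6 relies on a kl_anneal_function for KL-cost annealing. The logistic and linear schedules commonly used for text VAEs are a reasonable guess at what it implements; the sketch below is that standard schedule, not necessarily the project's exact code:

import math

def kl_anneal_function(anneal_function, step, k, x0):
    # Common KL-annealing schedules: 'logistic' ramps the weight smoothly
    # around step x0 with slope k; 'linear' ramps it linearly up to step x0.
    if anneal_function == 'logistic':
        return float(1 / (1 + math.exp(-k * (step - x0))))
    elif anneal_function == 'linear':
        return min(1.0, step / x0)
    else:
        raise ValueError('unknown anneal function: {}'.format(anneal_function))
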
Example #7
def train(model, device, train_list, multi_gpu, args, logdir):
    train_dataset = MyDataset(train_list, args.tensor_cache)
    # loader_batch_size = int(args.batch_size / args.gradient_accumulation / torch.cuda.device_count())
    # print(loader_batch_size)
    if args.distributed:
        loader_batch_size = int(args.batch_size / args.gradient_accumulation /
                                torch.cuda.device_count())
        sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=loader_batch_size,
                                      shuffle=False,
                                      num_workers=args.num_workers,
                                      collate_fn=collate_fn,
                                      sampler=sampler)
    else:
        loader_batch_size = int(args.batch_size / args.gradient_accumulation)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=loader_batch_size,
                                      shuffle=True,
                                      num_workers=args.num_workers,
                                      collate_fn=collate_fn)
    print(loader_batch_size)
    model.train()
    # total number of optimizer steps across all epochs
    total_steps = int(len(train_dataset) * args.epochs / args.batch_size)
    logger.info('total training steps = {}'.format(total_steps))

    # set up the optimizer; use a warmup schedule at the start of training
    base_optimizer = transformers.AdamW(model.parameters(),
                                        lr=args.lr,
                                        correct_bias=True)
    # if args.linear_schedule:
    #     optimizer = NoamOpt(model.embeddings_size, args.warmup_steps, base_optimizer, lr=args.lr, linear_schedule=True,
    #                         total_steps=total_steps, apex_level=None, loss_weight=None,
    #                         extra_module_lr_rate=1)
    # else:
    #     optimizer = NoamOpt(model.embeddings_size, args.warmup_steps, base_optimizer, lr=args.lr, linear_schedule=False,
    #                         apex_level=None, loss_weight=None, extra_module_lr_rate=1)
    optimizer = NoamOpt(768,
                        args.warmup_steps,
                        base_optimizer,
                        lr=args.lr,
                        linear_schedule=True,
                        total_steps=total_steps,
                        apex_level=None,
                        loss_weight=None,
                        extra_module_lr_rate=1)

    logger.info('starting training')
    # accumulated loss across gradient-accumulation steps
    running_loss = 0
    # total number of optimizer steps taken
    overall_step = 0
    log_loss = 0
    log_acc = 0
    log_ppl = 0
    # tensorboardX writer
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # number of out-of-memory events
    oom_time = 0
    # start training
    for epoch in tqdm(range(args.epochs), desc='Epoch'):
        tqdm_data = tqdm(train_dataloader,
                         desc="Train (epoch #{})".format(epoch))
        for batch_idx, input_ids in enumerate(tqdm_data):
            # Note: GPT-2's forward() generates one token for the given context, not a whole sequence.
            # Given n token_ids, GPT2Model outputs n hidden states; the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(device)
            # guard against CUDA out-of-memory errors in the try/except below;
            # first, pad the batch (by repeating the first row) so it splits evenly across the GPUs
            rest_size = 0 if device == 'cpu' else (
                torch.cuda.device_count() -
                input_ids.size(0) % torch.cuda.device_count())
            if rest_size != 0:
                input_ids = torch.cat([input_ids] +
                                      [input_ids[:1, :]] * rest_size,
                                      dim=0)
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = calculate_loss_and_accuracy(outputs,
                                                             labels=input_ids,
                                                             device=device)

                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if args.gradient_accumulation > 1:
                    loss = loss / args.gradient_accumulation
                    accuracy = accuracy / args.gradient_accumulation
                loss.backward()
                # gradient clipping: cap the gradient norm to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                # update parameters after accumulating gradients for the configured number of steps
                if (batch_idx + 1) % args.gradient_accumulation == 0:
                    running_loss += loss.item()
                    # update parameters
                    optimizer.step()
                    # clear gradients
                    optimizer.zero_grad()
                    # warmup is applied by the NoamOpt wrapper when it steps
                    overall_step += 1
                    log_loss = (log_loss * batch_idx +
                                loss.item()) / (batch_idx + 1)
                    log_acc = (log_acc * batch_idx +
                               accuracy.item()) / (batch_idx + 1)
                    tqdm_data.set_postfix({'loss': log_loss, 'acc': log_acc})
                    # update the log and tensorboardX
                    if (overall_step + 1) % args.log_step == 0:
                        if args.local_rank in [-1, 0]:
                            logger.info('loss is %.5f, acc is %.5f', log_loss,
                                        log_acc)
                            tb_writer.add_scalar('loss', loss.item(),
                                                 overall_step)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory,times: {}".format(
                        oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(exception))
                    raise exception
        if args.local_rank in [-1, 0]:
            logger.info('saving model for epoch {}'.format(epoch + 1))
            model_path = join(logdir, 'model_epoch{}'.format(epoch + 1))
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(model_path)
            logger.info('epoch {} finished'.format(epoch + 1))
    logger.info('training finished')
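
The NoamOpt wrapper used above comes from an external helper module and is not reproduced here. The schedule it is named after, the "Noam" learning-rate schedule from the Transformer paper, can be written as a standalone sketch; the wrapper class itself (with its apex_level, loss_weight and extra_module_lr_rate arguments) is assumed, not copied from the source:

def noam_lr(step, d_model=768, warmup_steps=4000, factor=1.0):
    # Learning-rate factor from "Attention Is All You Need": linear warm-up
    # for warmup_steps, then decay proportional to step ** -0.5.
    step = max(step, 1)
    return factor * (d_model ** -0.5) * min(step ** -0.5,
                                            step * warmup_steps ** -1.5)

The 768 passed to NoamOpt in the example corresponds to the d_model term of this formula (GPT-2 small's hidden size).
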
Example #8
def train(model, train_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=Config.batch_size,
                                  shuffle=True,
                                  num_workers=2,
                                  collate_fn=collate_fn,
                                  drop_last=True)
    model.train()

    # total number of optimizer steps across all epochs
    total_steps = int(len(train_dataset) * Config.epochs /
                      Config.batch_size / Config.gradient_accumulation)
    print("total train step num: {}".format(total_steps))

    # set up the optimizer and use a warmup schedule at the start of training
    optimizer = transformers.AdamW(model.parameters(),
                                   lr=Config.lr,
                                   correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=Config.warmup_steps,
        num_training_steps=total_steps)

    print("start training ...")

    running_loss = 0  # accumulated loss across gradient-accumulation steps
    overall_step = 0  # total number of optimizer steps taken
    tb_writer = SummaryWriter(log_dir=Config.writer_dir)  # tensorboardX log writer

    # start training
    for epoch in range(Config.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # print(batch_idx)
            # print(input_ids.size())   # e.g. torch.Size([2, 208]); max_len is padded per batch (see the collate_fn sketch after this example)

            # Note: GPT-2's forward() generates one token for the given context, not a whole sequence.
            # Given n token_ids, GPT2Model outputs n hidden states; the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(Config.device)

            outputs = model.forward(input_ids=input_ids)

            loss, accuracy = calculate_loss_and_accuracy(outputs,
                                                         labels=input_ids)

            if Config.gradient_accumulation > 1:
                loss = loss / Config.gradient_accumulation
                accuracy = accuracy / Config.gradient_accumulation
            loss.backward()

            # gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           Config.max_grad_norm)

            # update parameters after accumulating gradients for the configured number of steps
            if (batch_idx + 1) % Config.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()

                # advance the warmup schedule
                scheduler.step()
                overall_step += 1

                if (overall_step + 1) % Config.log_step == 0:
                    print('epoch:{}, step:{}, loss: {}, accuracy:{}'.format(
                        epoch + 1, batch_idx + 1, loss, accuracy))
                    tb_writer.add_scalar('loss', loss.item(), overall_step)

        # save the model after each epoch
        model_path = join(Config.model_output_path,
                          'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        epoch_finish_time = datetime.now()
        print('time for one epoch: {}'.format(epoch_finish_time -
                                              epoch_start_time))
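
The comment in the loop above notes that sequences are padded per batch; that is exactly what the shared collate_fn has to do. A minimal sketch of such a collate function, assuming a padding id of 0 (a real project would use its tokenizer's pad id):

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, pad_id=0):
    # Pad every sequence in the batch to the length of the longest one,
    # so each batch gets its own max_len instead of a global maximum.
    tensors = [torch.tensor(ids, dtype=torch.long) for ids in batch]
    return pad_sequence(tensors, batch_first=True, padding_value=pad_id)
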
Example #9
def train(model, device, train_list, multi_gpu, hparams):
    train_data = MyDataset(train_list)
    train_dataloader = DataLoader(train_data,
                                  batch_size=hparams.batch_size,
                                  shuffle=True,
                                  num_workers=hparams.num_workers,
                                  collate_fn=collate)
    model.train()
    total_steps = int(len(train_data) * hparams.epochs /
                      hparams.batch_size / hparams.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=hparams.lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=hparams.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    run_loss = 0
    over_step = 0
    tb_writer = SummaryWriter(log_dir=hparams.writer_dir)
    oom_time = 0  # number of out-of-memory events
    for epoch in range(hparams.epochs):
        start_time = datetime.now()
        for batch_index, input_ids in enumerate(train_dataloader):
            # Given n token_ids, GPT2Model outputs n hidden states; the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(device)
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = cal_loss_accuracy(outputs, input_ids, device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if hparams.gradient_accumulation > 1:
                    loss = loss / hparams.gradient_accumulation
                    accuracy = accuracy / hparams.gradient_accumulation
                loss.backward()
                # gradient clipping: cap the gradient norm to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hparams.max_grad_norm)
                if (batch_index + 1) % hparams.gradient_accumulation == 0:
                    run_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()  # clear gradients
                    scheduler.step()
                    over_step += 1
                    if (over_step + 1) % hparams.log_step == 0:
                        logger.info(
                            "batch {} of epoch {}, loss {}, accuracy {}".
                            format(batch_index + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), over_step)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory,times: {}".format(
                        oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(e))
                    raise
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if hparams.train_mmi:
            model_path = join(hparams.mmi_model_output_path,
                              "model_epoch{}".format(epoch + 1))
        else:
            model_path = join(hparams.dialog_model_output_path,
                              "model_epoch{}".format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time -
                                                    start_time))
    logger.info('training finished')
Example #10
def train(args):
    print(args)
    ds_train = MyDataset(My_PATH, set='train')
    ds_val = MyDataset(My_PATH, set='val')
    loader_train = data_utils.DataLoader(ds_train,
                                         batch_size=args.batch_size,
                                         num_workers=args.nb_worker,
                                         shuffle=True)
    loader_val = data_utils.DataLoader(ds_val,
                                       batch_size=1,
                                       num_workers=1,
                                       shuffle=False)
    model = models.vgg16(pretrained=True)
    model.classifier = torch.nn.Sequential(torch.nn.Linear(25088, 4096),
                                           torch.nn.ReLU(),
                                           torch.nn.Dropout(p=0.5),
                                           torch.nn.Linear(4096, 4096),
                                           torch.nn.ReLU(),
                                           torch.nn.Dropout(p=0.5),
                                           torch.nn.Linear(4096, 2))

    print ("init_params done.")
    model=model.cuda(0)
    if not os.path.exists("./models"):
        os.mkdir ("./models")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    cost=torch.nn.CrossEntropyLoss()
    cost=cost.cuda(0)
    for epoch in range(args.nb_epoch):
        starttime = datetime.datetime.now()
        trainAcc = 0
        trainNum = len(ds_train)
        for i, (images, label) in enumerate(loader_train):
            images = images.cuda(0)
            label = label.cuda(0)

            images = Variable(images)
            label = Variable(label)

            optimizer.zero_grad()
            outputs = model(images)
            loss = cost(outputs, label)
            _, pred = torch.max(outputs.data, 1)
            trainAcc += torch.sum(pred == label.data)
            loss.backward()
            optimizer.step()

        valAcc = 0
        for i, (images, label) in enumerate(loader_val):
            images = images.cuda(0)
            images = Variable(images)
            label = label.cuda(0)

            outputs = model(images)
            _, pred = torch.max(outputs.data, 1)
            valAcc += torch.sum(pred == label)

        print("Epoch [%d/%d] Loss: %.6f,trainAcc: %.4f,valAcc: %.4f" % (
        epoch + 1, args.nb_epoch, loss.data[0], int(trainAcc)* 1.0 / trainNum,int(valAcc)* 1.0 / (i + 1)))
        logging.info("Epoch [%d/%d] Loss: %.6f,trainAcc: %.4f,valAcc: %.4f" % (
        epoch + 1, args.nb_epoch, loss.data[0],int(trainAcc)* 1.0 / trainNum,int(valAcc)* 1.0 / (i + 1)))
        torch.save(model, "./models/{}.pkl".format(epoch))
        endtime = datetime.datetime.now()
        print((endtime - starttime).seconds)
    logging.info('time:{}'.format((endtime - starttime).seconds))
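
Example #10 expects an args namespace with batch_size, nb_worker, lr and nb_epoch. A minimal, hypothetical driver (the defaults are illustrative, not taken from the source):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--nb_worker', type=int, default=4)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--nb_epoch', type=int, default=10)
    train(parser.parse_args())
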