def init_optim(self):
    """
    Initialize the optimizer and learning-rate scheduler from the model
    parameters and training arguments stored on this instance.
    """
    # Set up AdamW and use a linear warmup schedule at the start of training.
    self.optimizer = transformers.AdamW(self.model.parameters(),
                                        lr=self.args.lr,
                                        correct_bias=True)
    self.scheduler = transformers.WarmupLinearSchedule(
        self.optimizer,
        warmup_steps=self.args.warmup_steps,
        t_total=self.total_steps)
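# For reference: WarmupLinearSchedule (from the transformers 2.x era used above) scales the
# base learning rate by a factor that rises linearly from 0 to 1 over `warmup_steps` and then
# decays linearly back to 0 at `t_total`. A minimal sketch of that factor, assuming only those
# two arguments (illustration, not the library implementation):
def warmup_linear_factor(step, warmup_steps, t_total):
    if step < warmup_steps:
        # warmup phase: ramp the learning rate up linearly
        return step / max(1, warmup_steps)
    # decay phase: ramp the learning rate down linearly until t_total
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

# Example: with lr=1.5e-4, warmup_steps=2000, t_total=10000, the effective rate at a given
# step is 1.5e-4 * warmup_linear_factor(step, 2000, 10000).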
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='path of the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='directory for the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus before training')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    args = parser.parse_args()
    args.device = '0'  # override: CUDA_VISIBLE_DEVICES expects a string
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    # use the GPT-2 tokenizer directly
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False

    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
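# The training loop above builds samples by sliding a window of n_ctx tokens over each
# tokenized piece with step `stride`, then appends one last window aligned to the end of the
# piece so no trailing tokens are lost. A small self-contained illustration of that windowing
# with hypothetical numbers (n_ctx and stride here are made up for the example):
def sliding_windows(tokens, n_ctx, stride):
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    if start_point < len(tokens):
        # final window, aligned to the end so it still has exactly n_ctx tokens
        samples.append(tokens[len(tokens) - n_ctx:])
    return samples

# sliding_windows(list(range(10)), n_ctx=4, stride=3)
# -> [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]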
def train(model, device, train_list, multi_gpu, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        # resume from the latest checkpoint
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
        logger.info("running loss: {}, overall step: {}".format(running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            # skip batches that were already processed before the resume point
            continue
        input_ids = input_ids.to(device)
        try:
            mu, logvar, bow_probs = model.forward(input=input_ids)
            bow_loss = calculate_bow(bow_probs, input_ids, device)
            loss = bow_loss
            if multi_gpu:
                loss = loss.mean()
            if args.gradient_accumulation > 1:
                loss = loss / args.gradient_accumulation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            if (batch_idx + 1) % args.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
                if (overall_step + 1) % args.log_step == 0 or (overall_step + 1 == total_steps):
                    logger.info("step {}, loss {:.6}".format(overall_step, loss))
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
                if (overall_step + 1) % save_step == 0 or (overall_step == total_steps):
                    logger.info('saving for step {}'.format(overall_step))
                    if not os.path.exists(args.model_output_path):
                        os.mkdir(args.model_output_path)
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'overall_step': overall_step,
                            'running_loss': running_loss
                        }, model_path)
                    logger.info('finish saving for step {}'.format(overall_step))
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    logger.info("batch_idx = {}".format(batch_idx))
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception
    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
    logger.info('training finished')
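# calculate_bow is not defined in this snippet. In CVAE-style dialogue models the bag-of-words
# auxiliary loss asks a projection of the latent variable to predict every token of the target
# utterance independently of position. The sketch below is an assumption about one common
# formulation, not the project's actual helper: it treats bow_probs as per-sequence
# log-probabilities over the vocabulary, gathers the log-probability of each target token, and
# averages the negative log-likelihood (pad_id is a hypothetical padding id to ignore).
def calculate_bow_sketch(bow_log_probs, input_ids, device, pad_id=0):
    # bow_log_probs: [batch, vocab_size]; input_ids: [batch, seq_len]
    target_log_probs = bow_log_probs.gather(1, input_ids)        # [batch, seq_len]
    mask = (input_ids != pad_id).float()                         # ignore padding positions
    nll = -(target_log_probs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return nll.mean()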
def train(model, device, train_list, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size)
    logger.info('total training steps = {}'.format(total_steps))
    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    kl_anneal_x0 = int(total_steps * args.kl_anneal_percentage)
    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        # resume from the latest checkpoint
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # finished_epoch = checkpoint['finished_epoch'] + 1
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
        logger.info("running loss: {}, overall step: {}".format(running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    # for epoch in range(finished_epoch, args.epochs):
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            # skip batches that were already processed before the resume point
            continue
        input_ids = input_ids.to(device)
        try:
            outputs, mu, logvar, bow_probs = model.forward(input=input_ids)
            # anneal_function, step, k, x0
            ce, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
            kl_weight = min(
                0.5,
                kl_anneal_function(anneal_function=args.kl_anneal_function,
                                   step=overall_step,
                                   k=args.kl_anneal_k,
                                   x0=kl_anneal_x0))
            kld = (-0.5 * torch.sum(logvar - torch.pow(mu, 2) - torch.exp(logvar) + 1, 1)).mean().squeeze()
            bow_loss = calculate_bow(bow_probs, input_ids, device)
            loss = ce + kl_weight * kld + args.bow_weight * bow_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            overall_step += 1
            if overall_step == 0 or (overall_step + 1) % args.log_step == 0 or (overall_step + 1 == total_steps):
                logger.info(
                    "step {}, ce {:.6}, kld {:.6}, kl_weight {:.6}, bow {:.6}, bow_weight {:.6}, loss {:.6}, accuracy {:.6}"
                    .format(overall_step, ce, kld, kl_weight, bow_loss, args.bow_weight, loss, accuracy))
                tb_writer.add_scalar('ce', ce.item(), overall_step)
                tb_writer.add_scalar('kld', kld.item(), overall_step)
                tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % save_step == 0 or (overall_step + 1 == total_steps):
                logger.info('saving for step {}'.format(overall_step))
                if not os.path.exists(args.model_output_path):
                    os.mkdir(args.model_output_path)
                torch.save(
                    {
                        # 'finished_epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'overall_step': overall_step,
                        'running_loss': running_loss
                    }, model_path)
                decoder_path = join(args.model_output_path, 'decoder/')
                if not os.path.exists(decoder_path):
                    os.mkdir(decoder_path)
                model.save_decoder(decoder_path)
                logger.info('finish saving for step {}'.format(overall_step))
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    logger.info("batch_idx = {}".format(batch_idx))
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception
    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
    logger.info('training finished')
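# kl_anneal_function is referenced above with (anneal_function, step, k, x0) but not shown in
# this snippet. A minimal sketch of the two schedules those argument names usually denote
# (logistic and linear KL annealing, as in sentence-VAE style training) -- an assumption about
# the helper, not its actual implementation:
import math

def kl_anneal_function_sketch(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        # smooth S-curve centred at step x0, with slope controlled by k
        return float(1 / (1 + math.exp(-k * (step - x0))))
    elif anneal_function == 'linear':
        # linear ramp that reaches 1.0 at step x0
        return min(1.0, step / x0)
    return 1.0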
def train(model, device, train_list, multi_gpu, args, valid_list, test_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    model.train()
    # total number of optimization steps across all epochs
    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    # set up the optimizer with a linear warmup schedule at the start of training
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    # accumulates the loss across each gradient-accumulation window
    running_loss = 0
    # counts how many optimization steps have been taken in total
    overall_step = 0
    # tensorboardX writer
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # counts how many times CUDA ran out of memory
    oom_time = 0
    # start training
    for epoch in range(args.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # note: for a given context, GPT-2's forward() predicts the next token, not a whole sequence;
            # when GPT2Model receives n token ids it returns n hidden states, and the n-th hidden
            # state is used to predict the (n+1)-th token
            input_ids = input_ids.to(device)
            # guard against CUDA out-of-memory errors during training
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if args.gradient_accumulation > 1:
                    loss = loss / args.gradient_accumulation
                    accuracy = accuracy / args.gradient_accumulation
                loss.backward()
                # gradient clipping caps the gradient norm to mitigate exploding/vanishing gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # update parameters after the configured number of gradient-accumulation steps
                if (batch_idx + 1) % args.gradient_accumulation == 0:
                    running_loss += loss.item()
                    # update parameters
                    optimizer.step()
                    # clear accumulated gradients
                    optimizer.zero_grad()
                    # advance the warmup schedule
                    scheduler.step()
                    overall_step += 1
                    # update logs and tensorboardX
                    if (overall_step + 1) % args.log_step == 0:
                        logger.info("batch {} of epoch {}, loss {}, accuracy {}".format(
                            batch_idx + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(exception))
                    raise exception
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if args.train_mmi:
            # currently training the MMI model
            model_path = join(args.mmi_model_output_path, 'model_epoch{}'.format(epoch + 1))
        else:
            # currently training the dialogue model
            model_path = join(args.dialogue_model_output_path, 'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
        logger.info("Start Valid Set")
        evaluate(model, device, valid_list, multi_gpu, args)
        logger.info("Start Test Set")
        evaluate(model, device, test_list, multi_gpu, args)
    logger.info('training finished')
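# calculate_loss_and_accuracy is not shown here. Because the n-th hidden state predicts the
# (n+1)-th token, the usual formulation shifts logits and labels by one position and ignores
# padding when averaging. The sketch below is an assumption about that helper (pad_id is a
# hypothetical padding id), not the project's actual code:
import torch.nn.functional as F

def calculate_loss_and_accuracy_sketch(outputs, labels, device, pad_id=0):
    logits = outputs[0]                                      # [batch, seq_len, vocab_size]
    shift_logits = logits[..., :-1, :].contiguous()          # positions 0..n-2 predict ...
    shift_labels = labels[..., 1:].contiguous().to(device)   # ... tokens 1..n-1
    loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                           shift_labels.view(-1),
                           ignore_index=pad_id, reduction='mean')
    # token-level accuracy over non-pad positions
    preds = shift_logits.argmax(dim=-1)
    mask = shift_labels.ne(pad_id)
    accuracy = (preds.eq(shift_labels) & mask).sum().float() / mask.sum().clamp(min=1)
    return loss, accuracy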
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='cuda visible devices')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model configuration file')
    parser.add_argument('--tokenizer_path', default='data/vocabs.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/samples.json', type=str, required=False, help='path of the samples file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='save the tokenized samples file to this dir')
    parser.add_argument('--raw', action='store_true', help='do tokenize before training; not needed if already tokenized with the same configuration')
    parser.add_argument('--epochs', default=24, type=int, required=False)
    parser.add_argument('--batch_size', default=16, type=int, required=False)
    parser.add_argument('--lr', default=2e-4, type=float, required=False)
    parser.add_argument('--warmup_steps', default=4000, type=int, required=False)
    parser.add_argument('--log_step', default=4000, type=int, required=False, help='period of reporting loss')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='save the model to this dir')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='pre-trained model dir')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.get_device_name(0))
    else:
        device = 'cpu'
    print(device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    n_ctx=n_ctx)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / n_ctx * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    tb_writer = SummaryWriter()  # tensorboard writer used for loss logging below
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        device_ids = []
        for i in args.device.split(','):
            try:
                print(torch.cuda.get_device_name(int(i)))
                device_ids.append(int(i))
            except:
                pass
        model = DataParallel(model, device_ids=device_ids)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        line = f.read().strip()
    tokens = line.split()
    tokens = [int(token) for token in tokens]
    start_point = 0
    samples = []
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += n_ctx
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])

    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        samples2 = copy.deepcopy(samples)
        random.shuffle(samples2)
        for step in range(len(samples2) // batch_size):  # drop last
            # prepare data
            batch = samples2[step * batch_size:(step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)
            # forward pass
            outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
            loss, logits = outputs[:2]
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation
            # loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            # optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step + 1,
                    epoch + 1,
                    running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

        print('saving model for epoch {}'.format(epoch + 1))
        temp_epoch = (epoch + 1) % 2  # alternate between two checkpoint dirs to save disk space
        if not os.path.exists(output_dir + 'model_epoch{}'.format(temp_epoch)):
            os.mkdir(output_dir + 'model_epoch{}'.format(temp_epoch))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(temp_epoch))
        # torch.save(scheduler, output_dir + 'model_epoch{}/scheduler.pt'.format(temp_epoch))
        # torch.save(optimizer, output_dir + 'model_epoch{}/optimizer.pt'.format(temp_epoch))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='path of the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='directory for the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus before training')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')

    # ---- configuration overrides ------------------------------------------------------
    args = parser.parse_args()
    args.device = '1'
    args.batch_size = 5
    from tokenizations import tokenization
    proj_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    vocab_file_path = "tokenizations/clue-vocab.txt"  # encode with the vocabulary shipped with the pretrained model
    text = '我是一个人'
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)
    line = tokenization.convert_to_unicode(text)
    bert_tokens = tokenizer.tokenize(line)
    encoded = tokenizer.convert_tokens_to_ids(bert_tokens)
    # below: how the dataset is prepared
    args.raw = True
    args.raw_data_path = '172166.txt'  # the -small suffix denotes the small version
    args.epochs = 200
    args.output_dir = 'model/'  # results are saved under final_model
    args.num_pieces = 10
    from pre_data_byOnlyOneBook import get_data as get_data
    name2 = args.raw_data_path.split('.')[0]
    get_data(name2 + '.txt', name2 + '.json')
    # below, just use 166893.json
    # ---- end of configuration ---------------------------------------------------------

    print('args:\n' + args.__repr__())
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx  # important: this is the maximum sequence length
    # full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)
    # use the GPT-2 tokenizer directly
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir
    # the encoded corpus is stored under 'data/tokenized/'

    if raw:
        print('building files')
        build_files(raw_data_path=name2 + '.json',
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False

    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    import math
    total_steps = math.ceil(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        loss_save = []
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:  # n_ctx is the context length
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):  # append the final window
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range((len(samples) // batch_size) + 1):  # one extra iteration so the final partial batch is seen
                # prepare data
                # if the index is already past the end, the remaining samples cannot form a batch, so stop
                if step * batch_size > len(samples) - 1:
                    break
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass: inputs and labels are the same sequence; the LM head shifts them internally.
                # For comparison, CTRL prepares its data like this:
                #   flag_input, inputs = numericalize(domain + tokenized_train_text[i:i + seq_length])  # input is prefixed with the domain
                #   flag_output, outputs = numericalize(tokenized_train_text[i:i + seq_length + 1])      # CTRL feeds tokens i:j and predicts i:j+1
                # See https://www.cnblogs.com/wwj99/p/12503545.html -- there, too, sample and label are the same sequence.
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss / log_step))
                    loss_save.append(running_loss / log_step)
                    running_loss = 0
            piece_num += 1

        # -------- check whether to stop early --------
        # stop once the ten most recent logged losses all sit within about 3% of their mean
        # and the latest loss is already below 0.05
        last = loss_save[-10:]
        avg1 = sum(last) / 10
        last = np.array(last)
        avg1 = np.array(avg1)
        tmp = np.all(last >= avg1 * 0.97) and np.all(last <= avg1 * 1.03)
        if len(last) >= 10 and tmp and loss_save[-1] < 0.05:
            break
        # ----------------------------------------------

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.makedirs(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="0,1,2,3", type=str, required=False, help="which GPUs to use")
    parser.add_argument("--model_config", default="config/model_config_small.json", type=str, required=False, help="path of the model config file")
    parser.add_argument("--tokenizer_path", default="cache/vocab_small.txt", type=str, required=False, help="path of the vocabulary file")
    parser.add_argument("--raw_data_path", default="data/train.json", type=str, required=False, help="path of the raw training corpus")
    parser.add_argument("--tokenized_data_path", default="data/tokenized/", type=str, required=False, help="directory for the tokenized corpus")
    parser.add_argument("--raw", action="store_true", help="tokenize the raw corpus before training")
    parser.add_argument("--epochs", default=5, type=int, required=False, help="number of training epochs")
    parser.add_argument("--batch_size", default=8, type=int, required=False, help="training batch size")
    parser.add_argument("--lr", default=1.5e-4, type=float, required=False, help="learning rate")
    parser.add_argument("--warmup_steps", default=2000, type=int, required=False, help="number of warmup steps")
    parser.add_argument("--log_step", default=1, type=int, required=False, help="report loss every this many steps; set to a multiple of gradient accumulation")
    parser.add_argument("--stride", default=768, type=int, required=False, help="stride of the training-data window")
    parser.add_argument("--gradient_accumulation", default=1, type=int, required=False, help="gradient accumulation steps")
    parser.add_argument("--fp16", action="store_true", help="mixed-precision training")
    parser.add_argument("--fp16_opt_level", default="O1", type=str, required=False)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, required=False)
    parser.add_argument("--num_pieces", default=100, type=int, required=False, help="number of pieces to split the training corpus into")
    parser.add_argument("--min_length", default=128, type=int, required=False, help="minimum article length to include")
    parser.add_argument("--output_dir", default="model/", type=str, required=False, help="model output directory")
    parser.add_argument("--pretrained_model", default="", type=str, required=False, help="path of the pretrained model to start from")
    parser.add_argument("--writer_dir", default="tensorboard_summary/", type=str, required=False, help="Tensorboard log directory")
    parser.add_argument("--segment", action="store_true", help="segment Chinese at the word level")
    parser.add_argument("--bpe_token", action="store_true", help="use subword (BPE) tokenization")
    parser.add_argument("--encoder_json", default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument("--vocab_bpe", default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print("config:\n" + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("using device:", device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print("building files")
        build_files(
            data_path=raw_data_path,
            tokenized_data_path=tokenized_data_path,
            num_pieces=num_pieces,
            full_tokenizer=full_tokenizer,
            min_length=min_length,
        )
        print("files built")

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print("number of parameters: {}".format(num_parameters))

    multi_gpu = False
    full_len = 0
    print("calculating total steps")
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print("total steps = {}".format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps
    )
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(",")])
        multi_gpu = True

    print("starting training")
    overall_step = 0
    running_loss = 0
    saving_time = datetime.now()
    for epoch in range(epochs):
        print("epoch {}".format(epoch + 1))
        now = datetime.now()
        print("time: {}".format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point : start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx :])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size : (step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar("loss", loss.item() * gradient_accumulation, overall_step)
                    print(
                        "now time: {}:{}. Step {} of piece {} of epoch {}, loss {}".format(
                            datetime.now().hour,
                            datetime.now().minute,
                            step + 1,
                            piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation / (log_step / gradient_accumulation),
                        )
                    )
                    running_loss = 0
                    # checkpoint at most every 30 minutes of wall-clock time
                    delta_time = datetime.now() - saving_time
                    if delta_time.seconds > 1800:
                        print("saving model for epoch {}".format(epoch + 1))
                        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
                            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
                        model_to_save = model.module if hasattr(model, "module") else model
                        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
                        saving_time = datetime.now()
                overall_step += 1
            piece_num += 1

        print("saving model for epoch {}".format(epoch + 1))
        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print("epoch {} finished".format(epoch + 1))
        then = datetime.now()
        print("time: {}".format(then))
        print("time for one epoch: {}".format(then - now))

    print("training finished")
    if not os.path.exists(output_dir + "final_model"):
        os.mkdir(output_dir + "final_model")
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir + "final_model")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='../config/model_config_small.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='../cache/vocab_small.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data_quantangshi', type=str, required=False, help='path of the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data_quantangshi/tokenized/', type=str, required=False, help='directory for the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus before training')
    parser.add_argument('--epochs', default=15, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=1024, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps; set to a multiple of gradient accumulation')
    parser.add_argument('--stride', default=468, type=int, required=False, help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--output_dir', default='model_quantangshi/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--writer_dir', default='../tensorboard_summary/', type=str, required=False, help='Tensorboard log directory')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='use subword (BPE) tokenization')
    parser.add_argument('--encoder_json', default="../tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="../tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # args.segment = False
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    size = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            piecestart = datetime.now()
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                    log('now time: {}:{}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        datetime.now().second,
                        step + 1,
                        piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
                # print('now time: {}:{}. Step {} of piece {} of epoch {} '.format(
                #     datetime.now().hour,
                #     datetime.now().minute,
                #     step + 1,
                #     piece_num,
                #     epoch + 1
                # ))
                size += 1
            pieceend = datetime.now()
            log('{} steps trained; piece {} time: start={}, end={}, total={}'.format(
                size, piece_num, piecestart, pieceend, pieceend - piecestart))
            piece_num += 1

        log('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        log('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        log('time: {}'.format(then))
        log('time for one epoch: {}'.format(then - now))

    log('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
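# log() is used above in place of print() but is not defined in this snippet. A plausible
# minimal helper -- an assumption for illustration, not the project's actual implementation --
# would print the message and append it to a log file (the 'train.log' filename here is
# hypothetical):
def log_sketch(msg, log_file='train.log'):
    line = '[{}] {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), msg)
    print(line)
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(line + '\n')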
def train(model, device, train_list, multi_gpu, hparams):
    train_data = MyDataset(train_list)
    train_dataloader = DataLoader(train_data,
                                  batch_size=hparams.batch_size,
                                  shuffle=True,
                                  num_workers=hparams.num_workers,
                                  collate_fn=collate)
    model.train()
    total_steps = int(train_data.__len__() * hparams.epochs / hparams.batch_size / hparams.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    optimizer = transformers.AdamW(model.parameters(), lr=hparams.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=hparams.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    run_loss = 0
    over_step = 0
    tb_writer = SummaryWriter(log_dir=hparams.writer_dir)
    oom_time = 0  # number of out-of-memory occurrences
    for epoch in range(hparams.epochs):
        start_time = datetime.now()
        for batch_index, input_ids in enumerate(train_dataloader):
            # when GPT2Model receives n token ids it returns n hidden states; the n-th hidden
            # state is used to predict the (n+1)-th token
            input_ids = input_ids.to(device)
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = cal_loss_accuracy(outputs, input_ids, device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if hparams.gradient_accumulation > 1:
                    loss = loss / hparams.gradient_accumulation
                    accuracy = accuracy / hparams.gradient_accumulation
                loss.backward()
                # gradient clipping to mitigate exploding/vanishing gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.max_grad_norm)
                if (batch_index + 1) % hparams.gradient_accumulation == 0:
                    run_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()  # clear accumulated gradients
                    scheduler.step()
                    over_step += 1
                    if (over_step + 1) % hparams.log_step == 0:
                        logger.info("batch {} of epoch {}, loss {}, accuracy {}".format(
                            batch_index + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), over_step)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(e))
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if hparams.train_mmi:
            model_path = join(hparams.mmi_model_output_path, "model_epoch{}".format(epoch + 1))
        else:
            model_path = join(hparams.dialog_model_output_path, "model_epoch{}".format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - start_time))
    logger.info('training finished')
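# A hypothetical minimal call sketch for the train() above, just to show which hparams fields
# it reads; every value below is made up for illustration, and MyDataset, the model, and the
# data are assumed to exist elsewhere in the project:
from types import SimpleNamespace

hparams_example = SimpleNamespace(
    batch_size=8, num_workers=1, epochs=5, gradient_accumulation=1,
    lr=1.5e-4, warmup_steps=2000, max_grad_norm=1.0, log_step=100,
    writer_dir='tensorboard_summary/', train_mmi=False,
    dialog_model_output_path='dialogue_model/', mmi_model_output_path='mmi_model/')
# train(model, device, train_list, multi_gpu=False, hparams=hparams_example)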