Example #1
    def init_optim(self):
        """
        Initialize optimizer with model parameters.

        :param params:
            parameters from the model

        :param optim_states:
            optional argument providing states of optimizer to load

        :param saved_optim_type:
            type of optimizer being loaded, if changed will skip loading
            optimizer states
        """
        # 设置优化器,并且在初始训练时,使用warmup策略
        self.optimizer = transformers.AdamW(self.model.parameters(),
                                            lr=self.args.lr,
                                            correct_bias=True)
        self.scheduler = transformers.WarmupLinearSchedule(
            self.optimizer,
            warmup_steps=self.args.warmup_steps,
            t_total=self.total_steps)
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='path to the model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='path to the vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warmup steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the sliding window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path of a pretrained model to start from')
    parser.add_argument('--segment', action='store_true', help='segment Chinese text at the word level')

    args = parser.parse_args()
    args.device = '0'  # CUDA_VISIBLE_DEVICES expects a string, not an int

    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    '''
    Use the GPT-2 (BERT-style) tokenizer directly.
    '''

    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(datetime.now().hour,
                                datetime.now().minute,
                                (step + 1) // gradient_accumulation, piece_num,
                                epoch + 1, running_loss / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
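build_files is called here (and again, with slightly different keyword arguments, in Examples #6-#8) but its definition is not part of this listing. Below is only a minimal sketch of what it is assumed to do: read a JSON list of article strings from raw_data_path, tokenize them, and write num_pieces files of space-separated token ids; the real helper may differ in detail.

import json
import os


def build_files(raw_data_path, tokenized_data_path, full_tokenizer, num_pieces):
    # Minimal sketch (assumption): split the tokenized corpus into num_pieces
    # files named tokenized_train_<i>.txt containing space-separated token ids.
    if not os.path.exists(tokenized_data_path):
        os.makedirs(tokenized_data_path)
    with open(raw_data_path, 'r', encoding='utf8') as f:
        lines = json.load(f)  # assumed: a JSON list of article strings
    per_piece = len(lines) // num_pieces
    for i in range(num_pieces):
        sublines = lines[per_piece * i:per_piece * (i + 1)]
        if i == num_pieces - 1:
            sublines.extend(lines[per_piece * (i + 1):])  # remainder goes to the last piece
        ids = []
        for line in sublines:
            tokens = full_tokenizer.tokenize(line)
            ids.extend(full_tokenizer.convert_tokens_to_ids(tokens))
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            f.write(' '.join(str(token_id) for token_id in ids))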
Example #3
def train(model, device, train_list, multi_gpu, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size /
                      args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))

    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=args.lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=args.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    running_loss = 0
    overall_step = -1

    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
    logger.info("running loss:{}, overall step:{}".format(
        running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            continue

        input_ids = input_ids.to(device)
        try:
            mu, logvar, bow_probs = model.forward(input=input_ids)

            bow_loss = calculate_bow(bow_probs, input_ids, device)

            loss = bow_loss

            if multi_gpu:
                loss = loss.mean()
            if args.gradient_accumulation > 1:
                loss = loss / args.gradient_accumulation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            if (batch_idx + 1) % args.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
                if (overall_step + 1) % args.log_step == 0 or (overall_step + 1
                                                               == total_steps):
                    logger.info("step {}, loss {:.6}".format(
                        overall_step, loss))
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % save_step == 0 or (overall_step
                                                       == total_steps):
                logger.info('saving for step {}'.format(overall_step))
                if not os.path.exists(args.model_output_path):
                    os.mkdir(args.model_output_path)

                torch.save(
                    {
                        # 'finished_epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'overall_step': overall_step,
                        'running_loss': running_loss
                    },
                    model_path)

                logger.info('finish saving for step {}'.format(overall_step))

        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory,times: {}".format(
                        oom_time))
                    logger.info("batch_idx = ", batch_idx)
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception

    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time -
                                                epoch_start_time))
    logger.info('training finished')
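calculate_bow is the whole training signal in this example but is not defined in the listing. The sketch below is only an assumption about its shape: it treats bow_probs as a per-sequence probability distribution over the vocabulary (shape [batch, vocab_size]) and scores every non-padding token of the sequence against it, ignoring word order; pad_id=0 is also an assumption.

import torch


def calculate_bow(bow_probs, input_ids, device, pad_id=0):
    # Minimal sketch (assumption): bag-of-words auxiliary loss that asks the
    # latent-derived distribution to cover every non-pad token in the sequence.
    log_probs = torch.log(bow_probs + 1e-10)                   # [batch, vocab]
    target_log_probs = torch.gather(log_probs, 1, input_ids)   # [batch, seq_len]
    mask = (input_ids != pad_id).float().to(device)
    loss = -(target_log_probs * mask).sum() / mask.sum().clamp(min=1.0)
    return loss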
Example #4
def train(model, device, train_list, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size)
    logger.info('total training steps = {}'.format(total_steps))

    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=args.lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=args.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    kl_anneal_x0 = int(total_steps * args.kl_anneal_percentage)

    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # finished_epoch = checkpoint['finished_epoch'] + 1
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
    logger.info("running loss:{}, overall step:{}".format(
        running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    # for epoch in range(finished_epoch, args.epochs):
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            continue

        input_ids = input_ids.to(device)
        try:
            outputs, mu, logvar, bow_probs = model.forward(input=input_ids)
            # anneal_function, step, k, x0
            ce, accuracy = calculate_loss_and_accuracy(outputs,
                                                       labels=input_ids,
                                                       device=device)

            kl_weight = min(
                0.5,
                kl_anneal_function(anneal_function=args.kl_anneal_function,
                                   step=overall_step,
                                   k=args.kl_anneal_k,
                                   x0=kl_anneal_x0))
            kld = (-0.5 *
                   torch.sum(logvar - torch.pow(mu, 2) - torch.exp(logvar) + 1,
                             1)).mean().squeeze()

            bow_loss = calculate_bow(bow_probs, input_ids, device)

            loss = ce + kl_weight * kld + args.bow_weight * bow_loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            overall_step += 1
            if overall_step == 0 or (
                    overall_step + 1) % args.log_step == 0 or (overall_step + 1
                                                               == total_steps):
                logger.info(
                    "step {}, ce {:.6}, kld {:.6}, kl_weight {:.6}, bow {:.6}, bow_weight {:.6}, loss {:.6}, accuracy {:.6}"
                    .format(overall_step, ce, kld, kl_weight, bow_loss,
                            args.bow_weight, loss, accuracy))
                tb_writer.add_scalar('ce', ce.item(), overall_step)
                tb_writer.add_scalar('kld', kld.item(), overall_step)
                tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % save_step == 0 or (overall_step + 1
                                                       == total_steps):
                logger.info('saving for step {}'.format(overall_step))
                if not os.path.exists(args.model_output_path):
                    os.mkdir(args.model_output_path)

                torch.save(
                    {
                        # 'finished_epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'overall_step': overall_step,
                        'running_loss': running_loss
                    },
                    model_path)

                decoder_path = join(args.model_output_path, 'decoder/')

                if not os.path.exists(decoder_path):
                    os.mkdir(decoder_path)

                model.save_decoder(decoder_path)
                logger.info('finish saving for step {}'.format(overall_step))

        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory,times: {}".format(
                        oom_time))
                    logger.info("batch_idx = ", batch_idx)
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception

    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time -
                                                epoch_start_time))
    logger.info('training finished')
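kl_anneal_function is referenced at the call site above but not shown. A minimal sketch of the usual annealing schedules (logistic and linear, as in standard sentence-VAE implementations), using the same argument names as the call:

import numpy as np


def kl_anneal_function(anneal_function, step, k, x0):
    # Minimal sketch (assumption): 'logistic' ramps the KL weight smoothly
    # towards 1 around step x0; 'linear' ramps it linearly and caps at 1.
    if anneal_function == 'logistic':
        return float(1 / (1 + np.exp(-k * (step - x0))))
    elif anneal_function == 'linear':
        return min(1.0, step / x0)
    raise ValueError('unknown anneal function: {}'.format(anneal_function))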
Example #5
def train(model, device, train_list, multi_gpu, args, valid_list, test_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    model.train()
    # compute total_steps: the total number of optimizer updates across all epochs
    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))

    # set up the optimizer and use a warmup schedule at the start of training
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    # accumulates the loss over each gradient-accumulation window
    running_loss = 0
    # counts how many optimizer steps have been taken in total
    overall_step = 0
    # tensorboardX logging
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # counts how many times we ran out of memory
    oom_time = 0
    # start training
    for epoch in range(args.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # Note: GPT-2's forward() predicts the next token for a given context; it does not generate a whole sequence.
            # Given n token ids, GPT2Model outputs n hidden states; the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(device)
            # handle CUDA out-of-memory errors raised during training
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)

                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if args.gradient_accumulation > 1:
                    loss = loss / args.gradient_accumulation
                    accuracy = accuracy / args.gradient_accumulation
                loss.backward()
                # gradient clipping caps the gradient norm at a threshold to guard against exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # update parameters after accumulating gradients for the configured number of steps
                if (batch_idx + 1) % args.gradient_accumulation == 0:
                    running_loss += loss.item()
                    # update parameters
                    optimizer.step()
                    # clear gradients
                    optimizer.zero_grad()
                    # advance the warmup/decay schedule
                    scheduler.step()
                    overall_step += 1
                    # update the log and tensorboardX
                    if (overall_step + 1) % args.log_step == 0:
                        logger.info(
                            "batch {} of epoch {}, loss {}, accuracy {}".format(batch_idx + 1, epoch + 1, loss,
                                                                                accuracy))
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory,times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(exception))
                    raise exception
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if args.train_mmi:  # currently training the MMI model
            model_path = join(args.mmi_model_output_path, 'model_epoch{}'.format(epoch + 1))
        else:  # currently training the dialogue model
            model_path = join(args.dialogue_model_output_path, 'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
        
        logger.info ("Start Valid Set")
        evaluate(model, device, valid_list, multi_gpu, args)
        logger.info ("Start Test Set")
        evaluate(model, device, test_list, multi_gpu, args)
        
    logger.info('training finished')
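calculate_loss_and_accuracy is used in Examples #4 and #5 but is not included in this listing. Below is a minimal sketch of the usual shifted next-token loss for GPT-2; the pad token id of 0 and outputs[0] holding the logits are assumptions.

import torch


def calculate_loss_and_accuracy(outputs, labels, device, pad_id=0):
    # Minimal sketch (assumption): logits at position t predict the token at
    # position t+1, so both tensors are shifted by one before cross-entropy.
    logits = outputs[0]                               # [batch, seq_len, vocab]
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous().to(device)

    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_id, reduction='sum')
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))

    preds = shift_logits.argmax(dim=-1)
    not_ignore = shift_labels.ne(pad_id)
    num_targets = max(not_ignore.long().sum().item(), 1)
    correct = (preds == shift_labels) & not_ignore
    accuracy = correct.float().sum() / num_targets
    loss = loss / num_targets
    return loss, accuracy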
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='cuda visible devices')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='path of the model configuration file')
    parser.add_argument('--tokenizer_path',
                        default='data/vocabs.txt',
                        type=str,
                        required=False,
                        help='path of the vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/samples.json',
                        type=str,
                        required=False,
                        help='path of the samples file')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='save the tokenized samples file to this dir')
    parser.add_argument(
        '--raw',
        action='store_true',
        help=
        'tokenize before training; not needed if already tokenized with the same configuration'
    )
    parser.add_argument('--epochs', default=24, type=int, required=False)
    parser.add_argument('--batch_size', default=16, type=int, required=False)
    parser.add_argument('--lr', default=2e-4, type=float, required=False)
    parser.add_argument('--warmup_steps',
                        default=4000,
                        type=int,
                        required=False)
    parser.add_argument('--log_step',
                        default=4000,
                        type=int,
                        required=False,
                        help='period of reporting loss')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='save the model to this dir')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='pre-trained model dir')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.get_device_name(0))
    else:
        device = 'cpu'
        print(device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    n_ctx=n_ctx)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')

    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0),
              'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])

    total_steps = int(full_len / n_ctx * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")

        device_ids = []
        for i in args.device.split(','):
            try:
                print(torch.cuda.get_device_name(int(i)))
                device_ids.append(int(i))
            except:
                pass
        model = DataParallel(model, device_ids=device_ids)
        multi_gpu = True
    print('starting training')
    # tb_writer is referenced in the logging block below but never created in the
    # original snippet; a default SummaryWriter is assumed here (requires
    # `from torch.utils.tensorboard import SummaryWriter` or tensorboardX).
    tb_writer = SummaryWriter()
    overall_step = 0
    running_loss = 0

    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0),
              'r') as f:
        line = f.read().strip()
    tokens = line.split()
    tokens = [int(token) for token in tokens]
    start_point = 0
    samples = []

    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += n_ctx
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])

    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))

        samples2 = copy.deepcopy(samples)
        random.shuffle(samples2)

        for step in range(len(samples2) // batch_size):  # drop last
            #  prepare data
            batch = samples2[step * batch_size:(step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)
            #  forward pass
            outputs = model.forward(input_ids=batch_inputs,
                                    labels=batch_inputs)
            loss, logits = outputs[:2]

            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            #  loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            #  optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                tb_writer.add_scalar('loss',
                                     loss.item() * gradient_accumulation,
                                     overall_step)
                print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute, step + 1, epoch + 1,
                    running_loss * gradient_accumulation /
                    (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

        print('saving model for epoch {}'.format(epoch + 1))
        temp_epoch = (epoch + 1) % 2  # save disk space

        if not os.path.exists(output_dir + 'model_epoch{}'.format(temp_epoch)):
            os.mkdir(output_dir + 'model_epoch{}'.format(temp_epoch))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(temp_epoch))
        #torch.save(scheduler, output_dir + 'model_epoch{}/scheduler.pt'.format(temp_epoch))
        #torch.save(optimizer, output_dir + 'model_epoch{}/optimizer.pt'.format(temp_epoch))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
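The window-slicing loop above reappears in Examples #2, #7 and #8 (there with a configurable stride). It can be factored into a small helper; a sketch, assuming tokens is a flat list of token ids:

def make_samples(tokens, n_ctx, stride):
    # Slide a window of n_ctx tokens across the corpus with the given stride,
    # then add one final window ending at the last token so the tail is kept.
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])
    return samples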
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='path to the model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='path to the vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warmup steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the sliding window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=1,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path of a pretrained model to start from')
    parser.add_argument('--segment', action='store_true', help='segment Chinese text at the word level')
    '''
    Parameter configuration ---------------------------------------------------
    '''
    args = parser.parse_args()
    args.device = '1'
    args.batch_size = 5
    from tokenizations import tokenization
    proj_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    vocab_file_path = "tokenizations/clue-vocab.txt"
    # encode with the vocabulary shipped with the pretrained model
    text = '我是一个人'
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                           do_lower_case=True)
    line = tokenization.convert_to_unicode(text)
    bert_tokens = tokenizer.tokenize(line)
    encoded = tokenizer.convert_tokens_to_ids(bert_tokens)

    # Dataset handling is configured below.
    args.raw = True
    args.raw_data_path = '172166.txt'  # the '-small' suffix marks the small version
    args.epochs = 200
    args.output_dir = 'model/'  # the final model is saved under output_dir/final_model
    args.num_pieces = 10  # split the corpus into 10 pieces
    from pre_data_byOnlyOneBook import get_data as get_data
    name2 = args.raw_data_path.split('.')[0]
    get_data(name2 + '.txt', name2 + '.json')
    # From here on, 166893.json can be used directly.
    '''
    ------------------------------------------------------------------------------
    '''

    # --------------- configuration finished
    print('args:\n' + args.__repr__())
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)  # 这个参数很重要,表示一句话的长度.
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    # full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                                do_lower_case=True)
    # Alternatively, use the GPT-2 (BERT-style) tokenizer commented out above.

    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir
    # the tokenized output is stored under 'data/tokenized/'
    if raw:
        print('building files')
        build_files(raw_data_path=name2 + '.json',
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    import math
    total_steps = math.ceil(full_len / stride * epochs / batch_size /
                            gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        loss_save = []
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:  # n_ctx is the context length
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):  # append the final (tail) sample
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range((len(samples) // batch_size) + 1):  # one extra iteration to cover the tail

                #  prepare data
                # break if the index runs past the data: the remainder cannot form a full batch
                if step * batch_size > len(samples) - 1:
                    break
                batch = samples[step * batch_size:(step + 1) * batch_size]

                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass: note that inputs and labels are identical; GPT-2
                #  shifts the labels internally for next-token prediction
                '''
                For comparison, the CTRL-style data preparation looks like this:

                    flag_input, inputs = numericalize(domain+tokenized_train_text[i:i+seq_length])  # the control code (domain) is prepended to the input
                    flag_output, outputs = numericalize(tokenized_train_text[i:i+seq_length+1])  # CTRL feeds tokens i:j and predicts i:j+1

                See https://www.cnblogs.com/wwj99/p/12503545.html for a discussion of this
                data setup; for GPT-2 the sample and the label really are the same
                sequence, and the one-token shift happens inside the model.
                '''
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(datetime.now().hour,
                                datetime.now().minute,
                                (step + 1) // gradient_accumulation, piece_num,
                                epoch + 1, running_loss / log_step))
                    loss_save.append(running_loss / log_step)
                    running_loss = 0
            piece_num += 1
        # -------------- check whether to stop early
        last = loss_save[-10:]
        if len(last) >= 10:
            avg1 = sum(last) / len(last)
            last = np.array(last)
            # stop once the last 10 logged losses all lie within 3% of their
            # average and the most recent loss is below 0.05
            if (np.all(last >= avg1 * 0.97) and np.all(last <= avg1 * 1.03)
                    and loss_save[-1] < 0.05):
                break


    # --------------------

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.makedirs(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
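get_data (imported from pre_data_byOnlyOneBook above) is not shown either; the sketch below is only a guess at its contract: turn a plain-text book into the JSON list of strings that build_files consumes. The chunk_size parameter is hypothetical.

import json


def get_data(txt_path, json_path, chunk_size=1000):
    # Hypothetical sketch: read the whole book, cut it into fixed-size chunks,
    # and dump them as a JSON list so they can be tokenized piece by piece.
    with open(txt_path, 'r', encoding='utf8') as f:
        text = f.read().replace('\n', ' ')
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    with open(json_path, 'w', encoding='utf8') as f:
        json.dump(chunks, f, ensure_ascii=False)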
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--device", default="0,1,2,3", type=str, required=False, help="设置使用哪些显卡"
    )
    parser.add_argument(
        "--model_config",
        default="config/model_config_small.json",
        type=str,
        required=False,
        help="选择模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="cache/vocab_small.txt",
        type=str,
        required=False,
        help="选择词库",
    )
    parser.add_argument(
        "--raw_data_path",
        default="data/train.json",
        type=str,
        required=False,
        help="原始训练语料",
    )
    parser.add_argument(
        "--tokenized_data_path",
        default="data/tokenized/",
        type=str,
        required=False,
        help="tokenized语料存放位置",
    )
    parser.add_argument("--raw", action="store_true", help="是否先做tokenize")
    parser.add_argument("--epochs", default=5, type=int, required=False, help="训练循环")
    parser.add_argument(
        "--batch_size", default=8, type=int, required=False, help="训练batch size"
    )
    parser.add_argument("--lr", default=1.5e-4, type=float, required=False, help="学习率")
    parser.add_argument(
        "--warmup_steps", default=2000, type=int, required=False, help="warm up步数"
    )
    parser.add_argument(
        "--log_step",
        default=1,
        type=int,
        required=False,
        help="多少步汇报一次loss,设置为gradient accumulation的整数倍",
    )
    parser.add_argument(
        "--stride", default=768, type=int, required=False, help="训练时取训练数据的窗口步长"
    )
    parser.add_argument(
        "--gradient_accumulation", default=1, type=int, required=False, help="梯度积累"
    )
    parser.add_argument("--fp16", action="store_true", help="混合精度")
    parser.add_argument("--fp16_opt_level", default="O1", type=str, required=False)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, required=False)
    parser.add_argument(
        "--num_pieces", default=100, type=int, required=False, help="将训练语料分成多少份"
    )
    parser.add_argument(
        "--min_length", default=128, type=int, required=False, help="最短收录文章长度"
    )
    parser.add_argument(
        "--output_dir", default="model/", type=str, required=False, help="模型输出路径"
    )
    parser.add_argument(
        "--pretrained_model", default="", type=str, required=False, help="模型训练起点路径"
    )
    parser.add_argument(
        "--writer_dir",
        default="tensorboard_summary/",
        type=str,
        required=False,
        help="Tensorboard路径",
    )
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--bpe_token", action="store_true", help="subword")
    parser.add_argument(
        "--encoder_json",
        default="tokenizations/encoder.json",
        type=str,
        help="encoder.json",
    )
    parser.add_argument(
        "--vocab_bpe", default="tokenizations/vocab.bpe", type=str, help="vocab.bpe"
    )

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config
    )
    print("config:\n" + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("using device:", device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print("building files")
        build_files(
            data_path=raw_data_path,
            tokenized_data_path=tokenized_data_path,
            num_pieces=num_pieces,
            full_tokenizer=full_tokenizer,
            min_length=min_length,
        )
        print("files built")

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model
        )
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print("number of parameters: {}".format(num_parameters))

    multi_gpu = False
    full_len = 0
    print("calculating total steps")
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print("total steps = {}".format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps
    )
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(",")])
        multi_gpu = True
    print("starting training")
    overall_step = 0
    running_loss = 0
    saving_time = datetime.now()
    for epoch in range(epochs):
        print("epoch {}".format(epoch + 1))
        now = datetime.now()
        print("time: {}".format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(
                tokenized_data_path + "tokenized_train_{}.txt".format(i), "r"
            ) as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point : start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx :])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size : (step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm
                        )
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                #  optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar(
                        "loss", loss.item() * gradient_accumulation, overall_step
                    )
                    print(
                        "now time: {}:{}. Step {} of piece {} of epoch {}, loss {}".format(
                            datetime.now().hour,
                            datetime.now().minute,
                            step + 1,
                            piece_num,
                            epoch + 1,
                            running_loss
                            * gradient_accumulation
                            / (log_step / gradient_accumulation),
                        )
                    )
                    running_loss = 0
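                # Checkpoint roughly every 30 minutes (1800 seconds).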
                delta_time = datetime.now() - saving_time
                if delta_time.seconds > 1800:
                    print("saving model for epoch {}".format(epoch + 1))
                    if not os.path.exists(
                        output_dir + "model_epoch{}".format(epoch + 1)
                    ):
                        os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(
                        output_dir + "model_epoch{}".format(epoch + 1)
                    )
                    saving_time = datetime.now()
                overall_step += 1
            piece_num += 1

        print("saving model for epoch {}".format(epoch + 1))
        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print("epoch {} finished".format(epoch + 1))

        then = datetime.now()
        print("time: {}".format(then))
        print("time for one epoch: {}".format(then - now))

    print("training finished")
    if not os.path.exists(output_dir + "final_model"):
        os.mkdir(output_dir + "final_model")
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir + "final_model")
Example #9
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='../config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='../cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data_quantangshi',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data_quantangshi/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs',
                        default=15,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=1,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=1024,
                        type=int,
                        required=False,
                        help='number of warm-up steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps; set to a multiple of gradient_accumulation')
    parser.add_argument('--stride',
                        default=468,
                        type=int,
                        required=False,
                        help='stride of the sliding window used to sample training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=1,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length',
                        default=128,
                        type=int,
                        required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir',
                        default='model_quantangshi/',
                        type=str,
                        required=False,
                        help='model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path of the pretrained model to start training from')
    parser.add_argument('--writer_dir',
                        default='../tensorboard_summary/',
                        type=str,
                        required=False,
                        help='TensorBoard log directory')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json',
                        default="../tokenizations/encoder.json",
                        type=str,
                        help="encoder.json")
    parser.add_argument('--vocab_bpe',
                        default="../tokenizations/vocab.bpe",
                        type=str,
                        help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # args.segment = False

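    # Word-level tokenization when --segment is set, otherwise the default
    # character-level BERT tokenizer.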
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process uses

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx

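    # BPE subword encoder when --bpe_token is set, otherwise a BERT-style
    # vocabulary tokenizer.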
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path

    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
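    # Loss is accumulated per optimizer step, so log_step must be a multiple of
    # gradient_accumulation for the averaged value below to be meaningful.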
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    size = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
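            # Track per-piece wall-clock time for the summary log after the piece.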
            piecestart = datetime.now()

            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss',
                                         loss.item() * gradient_accumulation,
                                         overall_step)
                    log('now time: {}:{}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute,
                            datetime.now().second, step + 1, piece_num,
                            epoch + 1, running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
                # print('now time: {}:{}. Step {} of piece {} of epoch {} '.format(
                #     datetime.now().hour,
                #             datetime.now().minute,
                #             step + 1,
                #             piece_num,
                #             epoch + 1
                # ))
                size += 1
            pieceend = datetime.now()
            log('{} steps trained: piece {} time: start={}; end={}; total={}'.format(
                size, piece_num, piecestart, pieceend, pieceend - piecestart))

            piece_num += 1

        log('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        log('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        log('time: {}'.format(then))
        log('time for one epoch: {}'.format(then - now))

    log('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
Example #10
0
def train(model, device, train_list, multi_gpu, hparams):
    train_data = MyDataset(train_list)
    train_dataloader = DataLoader(train_data,
                                  batch_size=hparams.batch_size,
                                  shuffle=True,
                                  num_workers=hparams.num_workers,
                                  collate_fn=collate)
    model.train()
    total_steps = int(train_data.__len__() * hparams.epochs /
                      hparams.batch_size / hparams.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=hparams.lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=hparams.warmup_steps, t_total=total_steps)

    logger.info('starting training')
    run_loss = 0
    over_step = 0
    tb_writer = SummaryWriter(log_dir=hparams.writer_dir)
    oom_time = 0  # number of out-of-memory occurrences
    for epoch in range(hparams.epochs):
        start_time = datetime.now()
        for batch_index, input_ids in enumerate(train_dataloader):
            # Given n token ids, GPT2Model outputs n hidden states; the n-th hidden state is used to predict the (n+1)-th token
            input_ids = input_ids.to(device)
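            # Run the step inside try/except so an out-of-memory error skips the
            # batch (after emptying the CUDA cache) instead of crashing the run.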
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = cal_loss_accuracy(outputs, input_ids, device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if hparams.gradient_accumulation > 1:
                    loss = loss / hparams.gradient_accumulation
                    accuracy = accuracy / hparams.gradient_accumulation
                loss.backward()
                # guard against exploding/vanishing gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hparams.max_grad_norm)
                if (batch_index + 1) % hparams.gradient_accumulation == 0:
                    run_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()  # clear accumulated gradients
                    scheduler.step()
                    over_step += 1
                    if (over_step + 1) % hparams.log_step == 0:
                        logger.info(
                            "batch {} of epoch {}, loss {}, accuracy {}".
                            format(batch_index + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), over_step)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory,times: {}".format(
                        oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(e))
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if hparams.train_mmi:
            model_path = join(hparams.mmi_model_output_path,
                              "model_epoch{}".format(epoch + 1))
        else:
            model_path = join(hparams.dialog_model_output_path,
                              "model_epoch{}".format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time -
                                                    start_time))
    logger.info('training finished')