Example #1
    def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, t_total: int):
        """
        Returns the correct learning rate scheduler
        """
        scheduler = scheduler.lower()
        if scheduler == 'constantlr':
            return pytorch_transformers.ConstantLRSchedule(optimizer)
        elif scheduler == 'warmupconstant':
            return pytorch_transformers.WarmupConstantSchedule(optimizer, warmup_steps=warmup_steps)
        elif scheduler == 'warmuplinear':
            return pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
        elif scheduler == 'warmupcosine':
            return pytorch_transformers.WarmupCosineSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
        elif scheduler == 'warmupcosinewithhardrestarts':
            return pytorch_transformers.WarmupCosineWithHardRestartsSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
        else:
            raise ValueError("Unknown scheduler {}".format(scheduler))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='path to the model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='path to the vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='directory for the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warm-up steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the sliding window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path of a pretrained model to start from')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='disable WordPiece tokenization')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.no_wordpiece:
        import tokenization_bert_without_wordpiece as tokenization_bert
    else:
        import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
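    # rough step estimate: number of stride-sized windows (full_len / stride) per epoch,
    # times epochs, divided by batch size and gradient accumulation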
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            running_loss = 0
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
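            # slice the token stream into windows of n_ctx ids, advancing by stride
            # (windows overlap when stride < n_ctx); any trailing remainder is dropped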
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                if (step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute,
                            (step + 1) // gradient_accumulation, piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
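
For orientation, here is a minimal sketch of how the pytorch_transformers optimizer and schedule used in this example fit together outside the full script. The toy linear model and the step counts are placeholders, not part of the original code; the schedule classes are the same ones `_get_scheduler` dispatches to.

import torch
import pytorch_transformers

# stand-in model, only used to obtain parameters for the sketch
toy_model = torch.nn.Linear(8, 8)
optimizer = pytorch_transformers.AdamW(toy_model.parameters(), lr=1.5e-4, correct_bias=True)
# warm up for 2000 steps, then decay linearly to zero over t_total steps
scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=2000, t_total=10000)

loss = toy_model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
scheduler.step()   # advance the learning-rate schedule (order mirrors the example above)
optimizer.step()
optimizer.zero_grad()
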
Example #3
def main():
    if raw:
        print('building files')
        build_files(data_path=raw_data_path)
        print('files built')

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
        config=model_config)
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            running_loss = 0
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                if (step + 1) % log_step == 0:
                    print('step {} of piece {} of epoch {}, loss {}'.format(
                        (step + 1) // gradient_accumulation, piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
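
Both of the examples above build training samples by sliding a window of n_ctx token ids over each tokenized piece with a step of stride. A standalone sketch of that chunking logic follows; the helper name and the numbers are illustrative only, not part of the original code.

def make_windows(tokens, n_ctx, stride):
    # windows of n_ctx ids, advancing by stride; windows overlap when stride < n_ctx
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    return samples

# 10000 tokens with n_ctx=1024 and stride=768 yield 12 overlapping windows
windows = make_windows(list(range(10000)), n_ctx=1024, stride=768)
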
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='path to the model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='path to the vocabulary file')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='directory for the tokenized corpus')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warm-up steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps')
    parser.add_argument('--save_per_step',
                        default=10000,
                        type=int,
                        required=False,
                        help='save a checkpoint every this many steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the sliding window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length',
                        default=128,
                        type=int,
                        required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path of a pretrained model to start from')
    parser.add_argument('--writer_dir',
                        default='tensorboard_summary/',
                        type=str,
                        required=False,
                        help='TensorBoard log directory')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json',
                        default="tokenizations/encoder.json",
                        type=str,
                        help="encoder.json")
    parser.add_argument('--vocab_bpe',
                        default="tokenizations/vocab.bpe",
                        type=str,
                        help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use

    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(
                os.path.join(tokenized_data_path,
                             'tokenized_train_{}.txt'.format(i)), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    running_step = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(
                    os.path.join(tokenized_data_path,
                                 'tokenized_train_{}.txt'.format(i)),
                    'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                running_loss += loss.item()
                running_step += 1
                mean_loss = running_loss * gradient_accumulation / running_step
                if (step + 1) % gradient_accumulation == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1

                    # how many steps to print loss log
                    if overall_step % log_step == 0:
                        now_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        print(
                            'now time: {}: Step {} of piece {} of epoch {}. Global Step: {}, Mean Loss: {}'
                            .format(now_time,
                                    (step + 1) // gradient_accumulation,
                                    piece_num, epoch + 1, overall_step,
                                    mean_loss))

                    # how many steps to save a checkpoint
                    if overall_step % args.save_per_step == 0:
                        if not os.path.exists(
                                os.path.join(
                                    output_dir, "model_step_%d" %
                                    (overall_step + 1))):
                            os.mkdir(
                                os.path.join(
                                    output_dir,
                                    "model_step_%d" % (overall_step + 1)))
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        model_to_save.save_pretrained(
                            os.path.join(output_dir,
                                         "model_step_%d" % (overall_step + 1)))

            piece_num += 1

        # save model per epoch
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(
                os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))):
            os.mkdir(
                os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(
            os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    # save the final model
    print('training finished')
    if not os.path.exists(os.path.join(output_dir, 'final_model')):
        os.mkdir(os.path.join(output_dir, 'final_model'))
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(os.path.join(output_dir, 'final_model'))
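
The update rule shared by these scripts, reduced to a sketch: divide the loss by gradient_accumulation, backpropagate every micro-batch, and only clip, step and zero the gradients once every gradient_accumulation micro-batches. The toy model, data and step counts below are stand-ins, and (unlike the examples above) gradients are clipped once per update rather than after every backward pass.

import torch
import pytorch_transformers

model = torch.nn.Linear(16, 1)
optimizer = pytorch_transformers.AdamW(model.parameters(), lr=1.5e-4, correct_bias=True)
scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=2, t_total=8)
gradient_accumulation = 4
max_grad_norm = 1.0

for step in range(32):
    x = torch.randn(8, 16)
    loss = model(x).pow(2).mean()
    loss = loss / gradient_accumulation   # average over the accumulated micro-batches
    loss.backward()                       # gradients add up in .grad across micro-batches
    if (step + 1) % gradient_accumulation == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        scheduler.step()
        optimizer.step()
        optimizer.zero_grad()
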
Example #5
    def train(self):
        if not self.pretrained_model:
            model = GPT2KWModel(config=self.model_config)
        else:
            model = GPT2KWModel.from_pretrained(self.pretrained_model)
        model.train()
        model.to(self.device)
        # count model parameters
        num_parameters = 0
        parameters = model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.print_and_log('number of model parameters: {}'.format(num_parameters))

        self.print_and_log("开始加载训练集")
        train_loader = self.create_dataloader()
        self.print_and_log("训练集加载完毕")

        epoch_steps = int(train_loader.sampler.num_samples / self.batch_size / self.accumulation_steps)
        total_steps = epoch_steps * self.epochs
        self.print_and_log('steps per epoch = {}'.format(epoch_steps))
        self.print_and_log('total steps = {}'.format(total_steps))

        optimizer = pytorch_transformers.AdamW(model.parameters(), lr=self.lr, correct_bias=True)
        scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)

        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level)

        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
            multi_gpu = True
        else:
            multi_gpu = False

        overall_step = 0

        for epoch in range(self.epochs):
            self.print_and_log('epoch {}'.format(epoch + 1))
            now = datetime.now()
            self.print_and_log('time: {}'.format(now))
            optimizer.zero_grad()
            running_loss = 0
            for i, batch_data in enumerate(train_loader):
                if torch.cuda.is_available():
                    keyword_ids = batch_data[0].to(self.device, non_blocking=True)
                    passage_ids = batch_data[1].to(self.device, non_blocking=True)
                    label_ids = passage_ids.clone().to(self.device, non_blocking=True)
                else:
                    keyword_ids = batch_data[0]
                    passage_ids = batch_data[1]
                    label_ids = passage_ids.clone()
                outputs = model(input_ids=passage_ids, keyword_ids=keyword_ids, labels=label_ids)
                loss, logits = outputs[:2]

                if multi_gpu:
                    loss = loss.mean()

                if self.gradient_accumulation > 1:
                    loss = loss / self.gradient_accumulation

                #  loss backward
                if self.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)

                if (i + 1) % self.gradient_accumulation == 0:
                    running_loss += loss.item()
                    scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                    overall_step += 1
                    #if (overall_step + 1) % self.log_step == 0:
                    #    self.tb_writer.add_scalar('loss', loss.item(), overall_step)

                if (overall_step + 1) % self.log_step == 0 and running_loss != 0:
                    self.print_and_log('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        overall_step + 1,
                        epoch + 1,
                        running_loss * self.gradient_accumulation / self.log_step))
                    running_loss = 0

            if not os.path.exists(self.output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.makedirs(self.output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(self.output_dir + 'model_epoch{}'.format(epoch + 1))
            # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
            # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

            then = datetime.now()
            self.print_and_log('time: {}'.format(then))
            self.print_and_log('time for one epoch: {}'.format(then - now))

        self.print_and_log('training finished')
        self.f_log.close()
        if not os.path.exists(self.output_dir + 'final_model'):
            os.makedirs(self.output_dir + 'final_model')
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(self.output_dir + 'final_model')
Example #6
    def train(self):
        if not self.pretrained_model:
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
                config=self.model_config)
        else:
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
                self.pretrained_model)
        model.train()
        model.to(self.device)
        # count model parameters
        num_parameters = 0
        parameters = model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.print_and_log('number of model parameters = {}'.format(num_parameters))

        if self.do_tokenize:
            self.print_and_log("开始加载训练集")
            self.tokenize_and_save()
            self.print_and_log("训练集加载完毕")

        full_len = 0
        for i in range(self.split_num):
            with open(
                    self.tokenized_data_path +
                    'tokenized_train_{}.txt'.format(i), 'r') as f:
                full_len += len(
                    [int(item) for item in f.read().strip().split()])
        sample_num = int(full_len / self.stride)
        epoch_steps = int(full_len / self.stride / self.batch_size /
                          self.gradient_accumulation)
        total_steps = int(full_len / self.stride * self.epochs /
                          self.batch_size / self.gradient_accumulation)
        self.print_and_log('number of samples = {}'.format(sample_num))
        self.print_and_log('steps per epoch = {}'.format(epoch_steps))
        self.print_and_log('total steps = {}'.format(total_steps))

        optimizer = pytorch_transformers.AdamW(model.parameters(),
                                               lr=self.lr,
                                               correct_bias=True)
        scheduler = pytorch_transformers.WarmupLinearSchedule(
            optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)

        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=self.fp16_opt_level)

        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
            multi_gpu = True
        else:
            multi_gpu = False

        overall_step = 0
        running_loss = 0
        for epoch in range(self.epochs):
            self.print_and_log('epoch {}'.format(epoch + 1))
            now = datetime.now()
            self.print_and_log('time: {}'.format(now))
            optimizer.zero_grad()
            split_indices = np.linspace(0,
                                        self.split_num - 1,
                                        self.split_num,
                                        dtype=np.int32)
            random.shuffle(split_indices)
            for split_index in split_indices:
                with open(
                        self.tokenized_data_path +
                        'tokenized_train_{}.txt'.format(split_index),
                        'r') as f:
                    line = f.read().strip()
                all_ids = line.split()
                all_ids = [int(x) for x in all_ids]
                start_point = 0
                samples = []
                while start_point < len(all_ids) - self.n_ctx:
                    samples.append(all_ids[start_point:start_point +
                                           self.n_ctx])
                    start_point += self.stride
                random.shuffle(samples)
                for i in range(len(samples) // self.batch_size):  # drop last
                    batch = samples[i * self.batch_size:(i + 1) *
                                    self.batch_size]
                    batch_labels = torch.tensor(batch, dtype=torch.long).to(
                        self.device)
                    batch_inputs = torch.tensor(batch, dtype=torch.long).to(
                        self.device)
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_labels)
                    loss, logits = outputs[:2]

                    if multi_gpu:
                        loss = loss.mean()

                    if self.gradient_accumulation > 1:
                        loss = loss / self.gradient_accumulation

                    #  loss backward
                    if self.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer),
                                self.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       self.max_grad_norm)

                    if (i + 1) % self.gradient_accumulation == 0:
                        running_loss += loss.item()
                        scheduler.step()
                        optimizer.step()
                        optimizer.zero_grad()
                        overall_step += 1

                    if (overall_step +
                            1) % self.log_step == 0 and running_loss != 0:
                        self.print_and_log(
                            'now time: {}:{}. Step {} of epoch {}, loss {}'.
                            format(
                                datetime.now().hour,
                                datetime.now().minute, overall_step + 1,
                                epoch + 1, running_loss *
                                self.gradient_accumulation / self.log_step))
                        running_loss = 0

            if not os.path.exists(self.output_dir +
                                  'model_epoch{}'.format(epoch + 1)):
                os.makedirs(self.output_dir +
                            'model_epoch{}'.format(epoch + 1))
            gpt2_model = model.transformer
            model_to_save = gpt2_model.module if hasattr(
                gpt2_model, 'module') else gpt2_model
            model_to_save.save_pretrained(self.output_dir +
                                          'model_epoch{}'.format(epoch + 1))
            # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
            # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

            then = datetime.now()
            self.print_and_log('time: {}'.format(then))
            self.print_and_log('time for one epoch: {}'.format(then - now))

        self.print_and_log('training finished')
        self.f_log.close()
        if not os.path.exists(self.output_dir + 'final_model'):
            os.makedirs(self.output_dir + 'final_model')
        gpt2_model = model.transformer
        model_to_save = gpt2_model.module if hasattr(gpt2_model,
                                                     'module') else gpt2_model
        model_to_save.save_pretrained(self.output_dir + 'final_model')
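
Every example unwraps DataParallel before calling save_pretrained, because the wrapper exposes the real model as .module. The same pattern as a small reusable sketch; the helper name is illustrative, not from the original code.

import os

def save_checkpoint(model, save_dir):
    # DataParallel hides the real model behind .module; save_pretrained lives on the inner model
    model_to_save = model.module if hasattr(model, 'module') else model
    os.makedirs(save_dir, exist_ok=True)
    model_to_save.save_pretrained(save_dir)
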
Example #7
def main():
    if raw:
        build_files(data_path=RAW_DATA_PATH)
        exit(1)

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
        config=model_config)
    MULTI_GPU = False
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        MULTI_GPU = True
    model.to(device)

    total_tokens = 0
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            total_tokens += len(f.read().split())
    num_chunks = total_tokens // stride
    total_steps = int(num_chunks * EPOCHS / BATCH_SIZE)
    print('total steps = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=LR,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=WARMUP_STEPS, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)
    print('starting training')
    for epoch in range(EPOCHS):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                running_loss = 0
                line = f.read()
                tokens = line.split()
                tokens = [int(token) for token in tokens]
                start_point = 0
                chunks = []
                while start_point < len(tokens) - n_ctx:
                    chunks.append(tokens[start_point:start_point + n_ctx])
                    start_point += stride
                random.shuffle(chunks)
                for step in range(len(chunks) // BATCH_SIZE):
                    batch = chunks[step * BATCH_SIZE:(step + 1) * BATCH_SIZE]
                    batch_labels = []
                    batch_inputs = []
                    for ids in batch:
                        int_ids_for_labels = [int(x) for x in ids]
                        int_ids_for_inputs = [int(x) for x in ids]
                        batch_labels.append(int_ids_for_labels)
                        batch_inputs.append(int_ids_for_inputs)
                    batch_labels = torch.tensor(batch_labels).long().to(device)
                    batch_inputs = torch.tensor(batch_inputs).long().to(device)

                    optimizer.zero_grad()
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_labels)
                    loss, logits = outputs[:2]

                    if MULTI_GPU:
                        loss = loss.mean()

                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_grad_norm)

                    running_loss += loss.item()
                    scheduler.step()
                    optimizer.step()
                    if (step + 1) % LOG_STEP == 0:
                        print(
                            'step {} of piece {} of epoch {}, loss {}'.format(
                                step + 1, piece_num, epoch + 1,
                                running_loss / LOG_STEP))
                        running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists('./model/model_epoch{}'.format(epoch + 1)):
            os.mkdir('./model/model_epoch{}'.format(epoch + 1))
        model.save_pretrained('./model/model_epoch{}'.format(epoch + 1))
        torch.save(scheduler.state_dict(),
                   './model/model_epoch{}/scheduler.pt'.format(epoch + 1))
        torch.save(optimizer.state_dict(),
                   './model/model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists('./model/final_model'):
        os.mkdir('./model/final_model')
    model.save_pretrained('./model/final_model')
    torch.save(scheduler.state_dict(), './model/final_model/scheduler.pt')
    torch.save(optimizer.state_dict(), './model/final_model/optimizer.pt')
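
This last example also saves the optimizer and scheduler state dicts, so a run could plausibly be resumed from such a checkpoint along the lines below. The hyper-parameter values are placeholders and would have to match the original run; the loading calls are the standard torch / pytorch_transformers ones.

import torch
import pytorch_transformers

checkpoint_dir = './model/final_model'
# save_pretrained wrote config.json and pytorch_model.bin into this directory
model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(checkpoint_dir)
# rebuild optimizer and schedule with the same hyper-parameters as the training run (placeholders here)
optimizer = pytorch_transformers.AdamW(model.parameters(), lr=1.5e-4, correct_bias=True)
scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=2000, t_total=10000)
optimizer.load_state_dict(torch.load(checkpoint_dir + '/optimizer.pt'))
scheduler.load_state_dict(torch.load(checkpoint_dir + '/scheduler.pt'))
model.train()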