def init_optim(self):
    """
    Initialize the optimizer and learning-rate scheduler from the model
    parameters and training arguments stored on this instance.
    """
    # Set up AdamW and use a linear warmup schedule at the start of training.
    self.optimizer = transformers.AdamW(self.model.parameters(),
                                        lr=self.args.lr,
                                        correct_bias=True)
    self.scheduler = transformers.WarmupLinearSchedule(
        self.optimizer,
        warmup_steps=self.args.warmup_steps,
        t_total=self.total_steps)
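# For reference: WarmupLinearSchedule (from the transformers 2.x era used above) scales the
# base learning rate by a factor that rises linearly from 0 to 1 over `warmup_steps` and then
# decays linearly back to 0 at `t_total`. A minimal sketch of that factor, assuming only those
# two arguments (illustration, not the library implementation):
def warmup_linear_factor(step, warmup_steps, t_total):
    if step < warmup_steps:
        # warmup phase: ramp the learning rate up linearly
        return step / max(1, warmup_steps)
    # decay phase: ramp the learning rate down linearly until t_total
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

# Example: with lr=1.5e-4, warmup_steps=2000, t_total=10000, the effective rate at a given
# step is 1.5e-4 * warmup_linear_factor(step, 2000, 10000).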
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='path of the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='directory for the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus before training')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    args = parser.parse_args()
    args.device = '0'  # override: CUDA_VISIBLE_DEVICES expects a string
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    # use the GPT-2 tokenizer directly
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False

    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
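# The training loop above builds samples by sliding a window of n_ctx tokens over each
# tokenized piece with step `stride`, then appends one last window aligned to the end of the
# piece so no trailing tokens are lost. A small self-contained illustration of that windowing
# with hypothetical numbers (n_ctx and stride here are made up for the example):
def sliding_windows(tokens, n_ctx, stride):
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    if start_point < len(tokens):
        # final window, aligned to the end so it still has exactly n_ctx tokens
        samples.append(tokens[len(tokens) - n_ctx:])
    return samples

# sliding_windows(list(range(10)), n_ctx=4, stride=3)
# -> [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]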
def train(model, device, train_list, multi_gpu, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        # resume from the latest checkpoint
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
        logger.info("running loss: {}, overall step: {}".format(running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            # skip batches that were already processed before the resume point
            continue
        input_ids = input_ids.to(device)
        try:
            mu, logvar, bow_probs = model.forward(input=input_ids)
            bow_loss = calculate_bow(bow_probs, input_ids, device)
            loss = bow_loss
            if multi_gpu:
                loss = loss.mean()
            if args.gradient_accumulation > 1:
                loss = loss / args.gradient_accumulation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            if (batch_idx + 1) % args.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
                if (overall_step + 1) % args.log_step == 0 or (overall_step + 1 == total_steps):
                    logger.info("step {}, loss {:.6}".format(overall_step, loss))
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
                if (overall_step + 1) % save_step == 0 or (overall_step == total_steps):
                    logger.info('saving for step {}'.format(overall_step))
                    if not os.path.exists(args.model_output_path):
                        os.mkdir(args.model_output_path)
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'overall_step': overall_step,
                            'running_loss': running_loss
                        }, model_path)
                    logger.info('finish saving for step {}'.format(overall_step))
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    logger.info("batch_idx = {}".format(batch_idx))
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception
    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
    logger.info('training finished')
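# calculate_bow is not defined in this snippet. In CVAE-style dialogue models the bag-of-words
# auxiliary loss asks a projection of the latent variable to predict every token of the target
# utterance independently of position. The sketch below is an assumption about one common
# formulation, not the project's actual helper: it treats bow_probs as per-sequence
# log-probabilities over the vocabulary, gathers the log-probability of each target token, and
# averages the negative log-likelihood (pad_id is a hypothetical padding id to ignore).
def calculate_bow_sketch(bow_log_probs, input_ids, device, pad_id=0):
    # bow_log_probs: [batch, vocab_size]; input_ids: [batch, seq_len]
    target_log_probs = bow_log_probs.gather(1, input_ids)        # [batch, seq_len]
    mask = (input_ids != pad_id).float()                         # ignore padding positions
    nll = -(target_log_probs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return nll.mean()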
def train(model, device, train_list, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size)
    logger.info('total training steps = {}'.format(total_steps))
    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    kl_anneal_x0 = int(total_steps * args.kl_anneal_percentage)
    model_path = join(args.model_output_path, "saved.pt")
    if os.path.exists(model_path):
        # resume from the latest checkpoint
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # finished_epoch = checkpoint['finished_epoch'] + 1
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
        logger.info("running loss: {}, overall step: {}".format(running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    # for epoch in range(finished_epoch, args.epochs):
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        if batch_idx <= overall_step:
            # skip batches that were already processed before the resume point
            continue
        input_ids = input_ids.to(device)
        try:
            outputs, mu, logvar, bow_probs = model.forward(input=input_ids)
            # anneal_function, step, k, x0
            ce, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
            kl_weight = min(
                0.5,
                kl_anneal_function(anneal_function=args.kl_anneal_function,
                                   step=overall_step,
                                   k=args.kl_anneal_k,
                                   x0=kl_anneal_x0))
            kld = (-0.5 * torch.sum(logvar - torch.pow(mu, 2) - torch.exp(logvar) + 1, 1)).mean().squeeze()
            bow_loss = calculate_bow(bow_probs, input_ids, device)
            loss = ce + kl_weight * kld + args.bow_weight * bow_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            overall_step += 1
            if overall_step == 0 or (overall_step + 1) % args.log_step == 0 or (overall_step + 1 == total_steps):
                logger.info(
                    "step {}, ce {:.6}, kld {:.6}, kl_weight {:.6}, bow {:.6}, bow_weight {:.6}, loss {:.6}, accuracy {:.6}"
                    .format(overall_step, ce, kld, kl_weight, bow_loss, args.bow_weight, loss, accuracy))
                tb_writer.add_scalar('ce', ce.item(), overall_step)
                tb_writer.add_scalar('kld', kld.item(), overall_step)
                tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % save_step == 0 or (overall_step + 1 == total_steps):
                logger.info('saving for step {}'.format(overall_step))
                if not os.path.exists(args.model_output_path):
                    os.mkdir(args.model_output_path)
                torch.save(
                    {
                        # 'finished_epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'overall_step': overall_step,
                        'running_loss': running_loss
                    }, model_path)
                decoder_path = join(args.model_output_path, 'decoder/')
                if not os.path.exists(decoder_path):
                    os.mkdir(decoder_path)
                model.save_decoder(decoder_path)
                logger.info('finish saving for step {}'.format(overall_step))
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    logger.info("batch_idx = {}".format(batch_idx))
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception
    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
    logger.info('training finished')
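# kl_anneal_function is referenced above with (anneal_function, step, k, x0) but not shown in
# this snippet. A minimal sketch of the two schedules those argument names usually denote
# (logistic and linear KL annealing, as in sentence-VAE style training) -- an assumption about
# the helper, not its actual implementation:
import math

def kl_anneal_function_sketch(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        # smooth S-curve centred at step x0, with slope controlled by k
        return float(1 / (1 + math.exp(-k * (step - x0))))
    elif anneal_function == 'linear':
        # linear ramp that reaches 1.0 at step x0
        return min(1.0, step / x0)
    return 1.0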
def train(model, device, train_list, multi_gpu, args, valid_list, test_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    model.train()
    # total number of optimization steps across all epochs
    total_steps = int(train_dataset.__len__() * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    # set up the optimizer with a linear warmup schedule at the start of training
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    # accumulates the loss across each gradient-accumulation window
    running_loss = 0
    # counts how many optimization steps have been taken in total
    overall_step = 0
    # tensorboardX writer
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # counts how many times CUDA ran out of memory
    oom_time = 0
    # start training
    for epoch in range(args.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # note: for a given context, GPT-2's forward() predicts the next token, not a whole sequence;
            # when GPT2Model receives n token ids it returns n hidden states, and the n-th hidden
            # state is used to predict the (n+1)-th token
            input_ids = input_ids.to(device)
            # guard against CUDA out-of-memory errors during training
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if args.gradient_accumulation > 1:
                    loss = loss / args.gradient_accumulation
                    accuracy = accuracy / args.gradient_accumulation
                loss.backward()
                # gradient clipping caps the gradient norm to mitigate exploding/vanishing gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # update parameters after the configured number of gradient-accumulation steps
                if (batch_idx + 1) % args.gradient_accumulation == 0:
                    running_loss += loss.item()
                    # update parameters
                    optimizer.step()
                    # clear accumulated gradients
                    optimizer.zero_grad()
                    # advance the warmup schedule
                    scheduler.step()
                    overall_step += 1
                    # update logs and tensorboardX
                    if (overall_step + 1) % args.log_step == 0:
                        logger.info("batch {} of epoch {}, loss {}, accuracy {}".format(
                            batch_idx + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(exception))
                    raise exception
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if args.train_mmi:
            # currently training the MMI model
            model_path = join(args.mmi_model_output_path, 'model_epoch{}'.format(epoch + 1))
        else:
            # currently training the dialogue model
            model_path = join(args.dialogue_model_output_path, 'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
        logger.info("Start Valid Set")
        evaluate(model, device, valid_list, multi_gpu, args)
        logger.info("Start Test Set")
        evaluate(model, device, test_list, multi_gpu, args)
    logger.info('training finished')
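# calculate_loss_and_accuracy is not shown here. Because the n-th hidden state predicts the
# (n+1)-th token, the usual formulation shifts logits and labels by one position and ignores
# padding when averaging. The sketch below is an assumption about that helper (pad_id is a
# hypothetical padding id), not the project's actual code:
import torch.nn.functional as F

def calculate_loss_and_accuracy_sketch(outputs, labels, device, pad_id=0):
    logits = outputs[0]                                      # [batch, seq_len, vocab_size]
    shift_logits = logits[..., :-1, :].contiguous()          # positions 0..n-2 predict ...
    shift_labels = labels[..., 1:].contiguous().to(device)   # ... tokens 1..n-1
    loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                           shift_labels.view(-1),
                           ignore_index=pad_id, reduction='mean')
    # token-level accuracy over non-pad positions
    preds = shift_logits.argmax(dim=-1)
    mask = shift_labels.ne(pad_id)
    accuracy = (preds.eq(shift_labels) & mask).sum().float() / mask.sum().clamp(min=1)
    return loss, accuracy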
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='cuda visible devices')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model configuration file')
    parser.add_argument('--tokenizer_path', default='data/vocabs.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/samples.json', type=str, required=False, help='path of the samples file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='save the tokenized samples file to this dir')
    parser.add_argument('--raw', action='store_true', help='do tokenize before training; not needed if already tokenized with the same configuration')
    parser.add_argument('--epochs', default=24, type=int, required=False)
    parser.add_argument('--batch_size', default=16, type=int, required=False)
    parser.add_argument('--lr', default=2e-4, type=float, required=False)
    parser.add_argument('--warmup_steps', default=4000, type=int, required=False)
    parser.add_argument('--log_step', default=4000, type=int, required=False, help='period of reporting loss')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='save the model to this dir')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='pre-trained model dir')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.get_device_name(0))
    else:
        device = 'cpu'
    print(device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    n_ctx=n_ctx)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / n_ctx * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    tb_writer = SummaryWriter()  # tensorboard writer used for loss logging below
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        device_ids = []
        for i in args.device.split(','):
            try:
                print(torch.cuda.get_device_name(int(i)))
                device_ids.append(int(i))
            except:
                pass
        model = DataParallel(model, device_ids=device_ids)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        line = f.read().strip()
    tokens = line.split()
    tokens = [int(token) for token in tokens]
    start_point = 0
    samples = []
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += n_ctx
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])

    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        samples2 = copy.deepcopy(samples)
        random.shuffle(samples2)
        for step in range(len(samples2) // batch_size):  # drop last
            # prepare data
            batch = samples2[step * batch_size:(step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)
            # forward pass
            outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
            loss, logits = outputs[:2]
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation
            # loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            # optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step + 1,
                    epoch + 1,
                    running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

        print('saving model for epoch {}'.format(epoch + 1))
        temp_epoch = (epoch + 1) % 2  # alternate between two checkpoint dirs to save disk space
        if not os.path.exists(output_dir + 'model_epoch{}'.format(temp_epoch)):
            os.mkdir(output_dir + 'model_epoch{}'.format(temp_epoch))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(temp_epoch))
        # torch.save(scheduler, output_dir + 'model_epoch{}/scheduler.pt'.format(temp_epoch))
        # torch.save(optimizer, output_dir + 'model_epoch{}/optimizer.pt'.format(temp_epoch))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='path of the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='directory for the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus before training')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')

    # ---- configuration overrides ------------------------------------------------------
    args = parser.parse_args()
    args.device = '1'
    args.batch_size = 5
    from tokenizations import tokenization
    proj_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    vocab_file_path = "tokenizations/clue-vocab.txt"  # encode with the vocabulary shipped with the pretrained model
    text = '我是一个人'
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)
    line = tokenization.convert_to_unicode(text)
    bert_tokens = tokenizer.tokenize(line)
    encoded = tokenizer.convert_tokens_to_ids(bert_tokens)
    # below: how the dataset is prepared
    args.raw = True
    args.raw_data_path = '172166.txt'  # the -small suffix denotes the small version
    args.epochs = 200
    args.output_dir = 'model/'  # results are saved under final_model
    args.num_pieces = 10
    from pre_data_byOnlyOneBook import get_data as get_data
    name2 = args.raw_data_path.split('.')[0]
    get_data(name2 + '.txt', name2 + '.json')
    # below, just use 166893.json
    # ---- end of configuration ---------------------------------------------------------

    print('args:\n' + args.__repr__())
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx  # important: this is the maximum sequence length
    # full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)
    # use the GPT-2 tokenizer directly
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir
    # the encoded corpus is stored under 'data/tokenized/'

    if raw:
        print('building files')
        build_files(raw_data_path=name2 + '.json',
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False

    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    import math
    total_steps = math.ceil(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        loss_save = []
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:  # n_ctx is the context length
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):  # append the final window
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range((len(samples) // batch_size) + 1):  # one extra iteration so the final partial batch is seen
                # prepare data
                # if the index is already past the end, the remaining samples cannot form a batch, so stop
                if step * batch_size > len(samples) - 1:
                    break
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass: inputs and labels are the same sequence; the LM head shifts them internally.
                # For comparison, CTRL prepares its data like this:
                #   flag_input, inputs = numericalize(domain + tokenized_train_text[i:i + seq_length])  # input is prefixed with the domain
                #   flag_output, outputs = numericalize(tokenized_train_text[i:i + seq_length + 1])      # CTRL feeds tokens i:j and predicts i:j+1
                # See https://www.cnblogs.com/wwj99/p/12503545.html -- there, too, sample and label are the same sequence.
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss / log_step))
                    loss_save.append(running_loss / log_step)
                    running_loss = 0
            piece_num += 1

        # -------- check whether to stop early --------
        # stop once the ten most recent logged losses all sit within about 3% of their mean
        # and the latest loss is already below 0.05
        last = loss_save[-10:]
        avg1 = sum(last) / 10
        last = np.array(last)
        avg1 = np.array(avg1)
        tmp = np.all(last >= avg1 * 0.97) and np.all(last <= avg1 * 1.03)
        if len(last) >= 10 and tmp and loss_save[-1] < 0.05:
            break
        # ----------------------------------------------

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.makedirs(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="0,1,2,3", type=str, required=False, help="which GPUs to use")
    parser.add_argument("--model_config", default="config/model_config_small.json", type=str, required=False, help="path of the model config file")
    parser.add_argument("--tokenizer_path", default="cache/vocab_small.txt", type=str, required=False, help="path of the vocabulary file")
    parser.add_argument("--raw_data_path", default="data/train.json", type=str, required=False, help="path of the raw training corpus")
    parser.add_argument("--tokenized_data_path", default="data/tokenized/", type=str, required=False, help="directory for the tokenized corpus")
    parser.add_argument("--raw", action="store_true", help="tokenize the raw corpus before training")
    parser.add_argument("--epochs", default=5, type=int, required=False, help="number of training epochs")
    parser.add_argument("--batch_size", default=8, type=int, required=False, help="training batch size")
    parser.add_argument("--lr", default=1.5e-4, type=float, required=False, help="learning rate")
    parser.add_argument("--warmup_steps", default=2000, type=int, required=False, help="number of warmup steps")
    parser.add_argument("--log_step", default=1, type=int, required=False, help="report loss every this many steps; set to a multiple of gradient accumulation")
    parser.add_argument("--stride", default=768, type=int, required=False, help="stride of the training-data window")
    parser.add_argument("--gradient_accumulation", default=1, type=int, required=False, help="gradient accumulation steps")
    parser.add_argument("--fp16", action="store_true", help="mixed-precision training")
    parser.add_argument("--fp16_opt_level", default="O1", type=str, required=False)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, required=False)
    parser.add_argument("--num_pieces", default=100, type=int, required=False, help="number of pieces to split the training corpus into")
    parser.add_argument("--min_length", default=128, type=int, required=False, help="minimum article length to include")
    parser.add_argument("--output_dir", default="model/", type=str, required=False, help="model output directory")
    parser.add_argument("--pretrained_model", default="", type=str, required=False, help="path of the pretrained model to start from")
    parser.add_argument("--writer_dir", default="tensorboard_summary/", type=str, required=False, help="Tensorboard log directory")
    parser.add_argument("--segment", action="store_true", help="segment Chinese at the word level")
    parser.add_argument("--bpe_token", action="store_true", help="use subword (BPE) tokenization")
    parser.add_argument("--encoder_json", default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument("--vocab_bpe", default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print("config:\n" + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("using device:", device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print("building files")
        build_files(
            data_path=raw_data_path,
            tokenized_data_path=tokenized_data_path,
            num_pieces=num_pieces,
            full_tokenizer=full_tokenizer,
            min_length=min_length,
        )
        print("files built")

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print("number of parameters: {}".format(num_parameters))

    multi_gpu = False
    full_len = 0
    print("calculating total steps")
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print("total steps = {}".format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps
    )
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(",")])
        multi_gpu = True

    print("starting training")
    overall_step = 0
    running_loss = 0
    saving_time = datetime.now()
    for epoch in range(epochs):
        print("epoch {}".format(epoch + 1))
        now = datetime.now()
        print("time: {}".format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point : start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx :])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size : (step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar("loss", loss.item() * gradient_accumulation, overall_step)
                    print(
                        "now time: {}:{}. Step {} of piece {} of epoch {}, loss {}".format(
                            datetime.now().hour,
                            datetime.now().minute,
                            step + 1,
                            piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation / (log_step / gradient_accumulation),
                        )
                    )
                    running_loss = 0
                    # checkpoint at most every 30 minutes of wall-clock time
                    delta_time = datetime.now() - saving_time
                    if delta_time.seconds > 1800:
                        print("saving model for epoch {}".format(epoch + 1))
                        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
                            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
                        model_to_save = model.module if hasattr(model, "module") else model
                        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
                        saving_time = datetime.now()
                overall_step += 1
            piece_num += 1

        print("saving model for epoch {}".format(epoch + 1))
        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print("epoch {} finished".format(epoch + 1))
        then = datetime.now()
        print("time: {}".format(then))
        print("time for one epoch: {}".format(then - now))

    print("training finished")
    if not os.path.exists(output_dir + "final_model"):
        os.mkdir(output_dir + "final_model")
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir + "final_model")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='../config/model_config_small.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='../cache/vocab_small.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data_quantangshi', type=str, required=False, help='path of the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data_quantangshi/tokenized/', type=str, required=False, help='directory for the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus before training')
    parser.add_argument('--epochs', default=15, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=1024, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps; set to a multiple of gradient accumulation')
    parser.add_argument('--stride', default=468, type=int, required=False, help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--output_dir', default='model_quantangshi/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--writer_dir', default='../tensorboard_summary/', type=str, required=False, help='Tensorboard log directory')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='use subword (BPE) tokenization')
    parser.add_argument('--encoder_json', default="../tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="../tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # args.segment = False
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    size = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            piecestart = datetime.now()
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                    log('now time: {}:{}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        datetime.now().second,
                        step + 1,
                        piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
                # print('now time: {}:{}. Step {} of piece {} of epoch {} '.format(
                #     datetime.now().hour,
                #     datetime.now().minute,
                #     step + 1,
                #     piece_num,
                #     epoch + 1
                # ))
                size += 1
            pieceend = datetime.now()
            log('{} steps trained; piece {} time: start={}, end={}, total={}'.format(
                size, piece_num, piecestart, pieceend, pieceend - piecestart))
            piece_num += 1

        log('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        log('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        log('time: {}'.format(then))
        log('time for one epoch: {}'.format(then - now))

    log('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
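# log() is used above in place of print() but is not defined in this snippet. A plausible
# minimal helper -- an assumption for illustration, not the project's actual implementation --
# would print the message and append it to a log file (the 'train.log' filename here is
# hypothetical):
def log_sketch(msg, log_file='train.log'):
    line = '[{}] {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), msg)
    print(line)
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(line + '\n')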
def train(model, device, train_list, multi_gpu, hparams):
    train_data = MyDataset(train_list)
    train_dataloader = DataLoader(train_data,
                                  batch_size=hparams.batch_size,
                                  shuffle=True,
                                  num_workers=hparams.num_workers,
                                  collate_fn=collate)
    model.train()
    total_steps = int(train_data.__len__() * hparams.epochs / hparams.batch_size / hparams.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    optimizer = transformers.AdamW(model.parameters(), lr=hparams.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=hparams.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    run_loss = 0
    over_step = 0
    tb_writer = SummaryWriter(log_dir=hparams.writer_dir)
    oom_time = 0  # number of out-of-memory occurrences
    for epoch in range(hparams.epochs):
        start_time = datetime.now()
        for batch_index, input_ids in enumerate(train_dataloader):
            # when GPT2Model receives n token ids it returns n hidden states; the n-th hidden
            # state is used to predict the (n+1)-th token
            input_ids = input_ids.to(device)
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = cal_loss_accuracy(outputs, input_ids, device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if hparams.gradient_accumulation > 1:
                    loss = loss / hparams.gradient_accumulation
                    accuracy = accuracy / hparams.gradient_accumulation
                loss.backward()
                # gradient clipping to mitigate exploding/vanishing gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.max_grad_norm)
                if (batch_index + 1) % hparams.gradient_accumulation == 0:
                    run_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()  # clear accumulated gradients
                    scheduler.step()
                    over_step += 1
                    if (over_step + 1) % hparams.log_step == 0:
                        logger.info("batch {} of epoch {}, loss {}, accuracy {}".format(
                            batch_index + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), over_step)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(e))
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if hparams.train_mmi:
            model_path = join(hparams.mmi_model_output_path, "model_epoch{}".format(epoch + 1))
        else:
            model_path = join(hparams.dialog_model_output_path, "model_epoch{}".format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - start_time))
    logger.info('training finished')
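# A hypothetical minimal call sketch for the train() above, just to show which hparams fields
# it reads; every value below is made up for illustration, and MyDataset, the model, and the
# data are assumed to exist elsewhere in the project:
from types import SimpleNamespace

hparams_example = SimpleNamespace(
    batch_size=8, num_workers=1, epochs=5, gradient_accumulation=1,
    lr=1.5e-4, warmup_steps=2000, max_grad_norm=1.0, log_step=100,
    writer_dir='tensorboard_summary/', train_mmi=False,
    dialog_model_output_path='dialogue_model/', mmi_model_output_path='mmi_model/')
# train(model, device, train_list, multi_gpu=False, hparams=hparams_example)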