# Module-level imports assumed by the training scripts below (build_files,
# get_encoder and the tokenization modules are project-local helpers):
import argparse
import os
import random
from datetime import datetime

import numpy as np
import torch
import pytorch_transformers
from torch.nn import DataParallel
from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the window over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--no_wordpiece', action='store_true', help='do not use WordPiece tokenization')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.no_wordpiece:
        import tokenization_bert_without_wordpiece as tokenization_bert
    else:
        import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program uses
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path, tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer, num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False

    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            running_loss = 0
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = [int(token) for token in line.split()]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step: update parameters, then advance the LR schedule
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour, datetime.now().minute,
                        (step + 1) // gradient_accumulation, piece_num, epoch + 1,
                        running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
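# A minimal, self-contained sketch (not part of the original script) of the
# sliding-window sampling the loop above performs: each training sample is a
# window of n_ctx token ids, consecutive windows start `stride` tokens apart,
# so they overlap by n_ctx - stride tokens whenever stride < n_ctx.
def make_windows(token_ids, n_ctx, stride):
    samples = []
    start_point = 0
    while start_point < len(token_ids) - n_ctx:
        samples.append(token_ids[start_point:start_point + n_ctx])
        start_point += stride
    return samples

# Example: 10 tokens, window of 4, stride of 3 -> windows start at positions 0 and 3.
assert make_windows(list(range(10)), n_ctx=4, stride=3) == [[0, 1, 2, 3], [3, 4, 5, 6]]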
def main():
    # NOTE: this variant takes its configuration (raw, raw_data_path, tokenized_data_path,
    # model_config, device, n_ctx, training hyperparameters, output_dir) from module-level
    # variables defined elsewhere in the script.
    if raw:
        print('building files')
        build_files(data_path=raw_data_path)
        print('files built')

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    model.to(device)

    # count model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            running_loss = 0
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = [int(token) for token in line.split()]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step: update parameters, then advance the LR schedule
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('step {} of piece {} of epoch {}, loss {}'.format(
                        (step + 1) // gradient_accumulation, piece_num, epoch + 1,
                        running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
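# A small worked example (illustration only, values are placeholders) of the
# total_steps estimate used above: full_len / stride approximates the number of
# windows per epoch, and dividing by batch_size and gradient_accumulation gives
# the number of optimizer steps.
full_len = 1_000_000                      # total token ids across all pieces
stride, epochs = 768, 5
batch_size, gradient_accumulation = 8, 1
total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
print(total_steps)                        # -> 813 optimizer steps for the whole run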
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='where the tokenized corpus is stored')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--save_per_step', default=10000, type=int, required=False, help='save a checkpoint every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the window over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='TensorBoard log directory')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program uses
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    # count model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(i)), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    running_step = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(i)), 'r') as f:
                line = f.read().strip()
            tokens = [int(token) for token in line.split()]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                running_loss += loss.item()
                running_step += 1
                mean_loss = running_loss * gradient_accumulation / running_step
                if (step + 1) % gradient_accumulation == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    # how many steps to print a loss log
                    if overall_step % log_step == 0:
                        now_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        print('now time: {}: Step {} of piece {} of epoch {}. Global Step: {}, Mean Loss: {}'.format(
                            now_time, (step + 1) // gradient_accumulation, piece_num,
                            epoch + 1, overall_step, mean_loss))
                    # how many steps to save a checkpoint
                    if overall_step % args.save_per_step == 0:
                        if not os.path.exists(os.path.join(output_dir, "model_step_%d" % (overall_step + 1))):
                            os.mkdir(os.path.join(output_dir, "model_step_%d" % (overall_step + 1)))
                        model_to_save = model.module if hasattr(model, 'module') else model
                        model_to_save.save_pretrained(os.path.join(output_dir, "model_step_%d" % (overall_step + 1)))
            piece_num += 1

        # save model per epoch
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))):
            os.mkdir(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    # save the final model
    print('training finished')
    if not os.path.exists(os.path.join(output_dir, 'final_model')):
        os.mkdir(os.path.join(output_dir, 'final_model'))
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(os.path.join(output_dir, 'final_model'))
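# The per-step, per-epoch and final checkpoints above all repeat the same pattern:
# create the directory, unwrap DataParallel, then call save_pretrained. A minimal
# helper sketch (not in the original script) that captures that pattern:
import os

def save_checkpoint(model, ckpt_dir):
    os.makedirs(ckpt_dir, exist_ok=True)
    # DataParallel wraps the real model in .module; save_pretrained lives on the wrapped model
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(ckpt_dir)

# e.g. save_checkpoint(model, os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))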
def train(self):
    # train() method of a trainer class; configuration comes from attributes set in __init__ (not shown).
    if not self.pretrained_model:
        model = GPT2KWModel(config=self.model_config)
    else:
        model = GPT2KWModel.from_pretrained(self.pretrained_model)
    model.train()
    model.to(self.device)

    # count model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    self.print_and_log('number of parameters: {}'.format(num_parameters))

    self.print_and_log('loading training set')
    train_loader = self.create_dataloader()
    self.print_and_log('training set loaded')

    epoch_steps = int(train_loader.sampler.num_samples / self.batch_size / self.accumulation_steps)
    total_steps = epoch_steps * self.epochs
    self.print_and_log('steps per epoch = {}'.format(epoch_steps))
    self.print_and_log('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=self.lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)
    if self.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
        multi_gpu = True
    else:
        multi_gpu = False

    overall_step = 0
    for epoch in range(self.epochs):
        self.print_and_log('epoch {}'.format(epoch + 1))
        now = datetime.now()
        self.print_and_log('time: {}'.format(now))
        optimizer.zero_grad()
        running_loss = 0
        for i, batch_data in enumerate(train_loader):
            if torch.cuda.is_available():
                keyword_ids = batch_data[0].to(self.device, non_blocking=True)
                passage_ids = batch_data[1].to(self.device, non_blocking=True)
                label_ids = passage_ids.clone().to(self.device, non_blocking=True)
            else:
                keyword_ids = batch_data[0]
                passage_ids = batch_data[1]
                label_ids = passage_ids.clone()

            outputs = model(input_ids=passage_ids, keyword_ids=keyword_ids, labels=label_ids)
            loss, logits = outputs[:2]
            if multi_gpu:
                loss = loss.mean()
            if self.gradient_accumulation > 1:
                loss = loss / self.gradient_accumulation

            # loss backward
            if self.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)

            # optimizer step: update parameters, then advance the LR schedule
            if (i + 1) % self.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
            # if (overall_step + 1) % self.log_step == 0:
            #     self.tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % self.log_step == 0 and running_loss != 0:
                self.print_and_log('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour, datetime.now().minute, overall_step + 1, epoch + 1,
                    running_loss * self.gradient_accumulation / self.log_step))
                running_loss = 0

        if not os.path.exists(self.output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.makedirs(self.output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(self.output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        then = datetime.now()
        self.print_and_log('time: {}'.format(then))
        self.print_and_log('time for one epoch: {}'.format(then - now))

    self.print_and_log('training finished')
    self.f_log.close()
    if not os.path.exists(self.output_dir + 'final_model'):
        os.makedirs(self.output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(self.output_dir + 'final_model')
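# print_and_log and self.f_log are class members that are not shown in this
# listing. The sketch below is only a guess at their intent (print to stdout and
# append the same line to an open log file); it is not the project's actual code.
class _LoggerSketch:
    def __init__(self, log_path):
        self.f_log = open(log_path, 'a', encoding='utf-8')

    def print_and_log(self, message):
        print(message)
        self.f_log.write(message + '\n')
        self.f_log.flush()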
def train(self):
    # train() method of a trainer class; configuration comes from attributes set in __init__ (not shown).
    if not self.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=self.model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(self.pretrained_model)
    model.train()
    model.to(self.device)

    # count model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    self.print_and_log('number of parameters = {}'.format(num_parameters))

    if self.do_tokenize:
        self.print_and_log('loading training set')
        self.tokenize_and_save()
        self.print_and_log('training set loaded')

    full_len = 0
    for i in range(self.split_num):
        with open(self.tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    sample_num = int(full_len / self.stride)
    epoch_steps = int(full_len / self.stride / self.batch_size / self.gradient_accumulation)
    total_steps = int(full_len / self.stride * self.epochs / self.batch_size / self.gradient_accumulation)
    self.print_and_log('number of samples = {}'.format(sample_num))
    self.print_and_log('steps per epoch = {}'.format(epoch_steps))
    self.print_and_log('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=self.lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)
    if self.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
        multi_gpu = True
    else:
        multi_gpu = False

    overall_step = 0
    running_loss = 0
    for epoch in range(self.epochs):
        self.print_and_log('epoch {}'.format(epoch + 1))
        now = datetime.now()
        self.print_and_log('time: {}'.format(now))
        optimizer.zero_grad()
        split_indices = np.linspace(0, self.split_num - 1, self.split_num, dtype=np.int32)
        random.shuffle(split_indices)
        for split_index in split_indices:
            with open(self.tokenized_data_path + 'tokenized_train_{}.txt'.format(split_index), 'r') as f:
                line = f.read().strip()
            all_ids = [int(x) for x in line.split()]
            start_point = 0
            samples = []
            while start_point < len(all_ids) - self.n_ctx:
                samples.append(all_ids[start_point:start_point + self.n_ctx])
                start_point += self.stride
            random.shuffle(samples)
            for i in range(len(samples) // self.batch_size):  # drop last
                batch = samples[i * self.batch_size:(i + 1) * self.batch_size]
                batch_labels = torch.tensor(batch, dtype=torch.long).to(self.device)
                batch_inputs = torch.tensor(batch, dtype=torch.long).to(self.device)

                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]
                if multi_gpu:
                    loss = loss.mean()
                if self.gradient_accumulation > 1:
                    loss = loss / self.gradient_accumulation

                # loss backward
                if self.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)

                # optimizer step: update parameters, then advance the LR schedule
                if (i + 1) % self.gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                if (overall_step + 1) % self.log_step == 0 and running_loss != 0:
                    self.print_and_log('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                        datetime.now().hour, datetime.now().minute, overall_step + 1, epoch + 1,
                        running_loss * self.gradient_accumulation / self.log_step))
                    running_loss = 0

        if not os.path.exists(self.output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.makedirs(self.output_dir + 'model_epoch{}'.format(epoch + 1))
        # unwrap DataParallel first, then save the base GPT2Model (without the LM head)
        gpt2_model = model.module if hasattr(model, 'module') else model
        model_to_save = gpt2_model.transformer
        model_to_save.save_pretrained(self.output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        then = datetime.now()
        self.print_and_log('time: {}'.format(then))
        self.print_and_log('time for one epoch: {}'.format(then - now))

    self.print_and_log('training finished')
    self.f_log.close()
    if not os.path.exists(self.output_dir + 'final_model'):
        os.makedirs(self.output_dir + 'final_model')
    gpt2_model = model.module if hasattr(model, 'module') else model
    model_to_save = gpt2_model.transformer
    model_to_save.save_pretrained(self.output_dir + 'final_model')
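# All the scripts above schedule the learning rate with
# pytorch_transformers.WarmupLinearSchedule. As a rough sketch of its behaviour
# (an approximation for illustration, not the library source): the learning-rate
# multiplier ramps linearly from 0 to 1 over warmup_steps, then decays linearly
# back to 0 at t_total.
def warmup_linear_multiplier(step, warmup_steps, t_total):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

# e.g. with warmup_steps=2000 and t_total=10000:
#   step 1000 -> 0.5 (warming up), step 6000 -> 0.5 (decaying), step 10000 -> 0.0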
def main():
    # NOTE: this variant takes its configuration (raw, RAW_DATA_PATH, tokenized_data_path,
    # model_config, device, n_ctx, num_pieces, stride, EPOCHS, BATCH_SIZE, LR, WARMUP_STEPS,
    # LOG_STEP, fp16, fp16_opt_level, max_grad_norm) from module-level variables defined
    # elsewhere in the script.
    if raw:
        build_files(data_path=RAW_DATA_PATH)
        exit(1)

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    MULTI_GPU = False
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        MULTI_GPU = True
    model.to(device)

    total_tokens = 0
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            total_tokens += len(f.read().split())
    num_chunks = total_tokens // stride
    total_steps = int(num_chunks * EPOCHS / BATCH_SIZE)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=LR, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    print('starting training')
    for epoch in range(EPOCHS):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                running_loss = 0
                line = f.read()
                tokens = [int(token) for token in line.split()]
                start_point = 0
                chunks = []
                while start_point < len(tokens) - n_ctx:
                    chunks.append(tokens[start_point:start_point + n_ctx])
                    start_point += stride
                random.shuffle(chunks)
                for step in range(len(chunks) // BATCH_SIZE):
                    batch = chunks[step * BATCH_SIZE:(step + 1) * BATCH_SIZE]
                    batch_labels = []
                    batch_inputs = []
                    for ids in batch:
                        int_ids_for_labels = [int(x) for x in ids]
                        int_ids_for_inputs = [int(x) for x in ids]
                        batch_labels.append(int_ids_for_labels)
                        batch_inputs.append(int_ids_for_inputs)
                    batch_labels = torch.tensor(batch_labels).long().to(device)
                    batch_inputs = torch.tensor(batch_inputs).long().to(device)

                    optimizer.zero_grad()
                    outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                    loss, logits = outputs[:2]
                    if MULTI_GPU:
                        loss = loss.mean()
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                    # update parameters, then advance the LR schedule
                    running_loss += loss.item()
                    optimizer.step()
                    scheduler.step()
                    if (step + 1) % LOG_STEP == 0:
                        print('step {} of piece {} of epoch {}, loss {}'.format(
                            step + 1, piece_num, epoch + 1, running_loss / LOG_STEP))
                        running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists('./model/model_epoch{}'.format(epoch + 1)):
            os.mkdir('./model/model_epoch{}'.format(epoch + 1))
        # unwrap DataParallel before saving; the wrapper itself has no save_pretrained
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained('./model/model_epoch{}'.format(epoch + 1))
        torch.save(scheduler.state_dict(), './model/model_epoch{}/scheduler.pt'.format(epoch + 1))
        torch.save(optimizer.state_dict(), './model/model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists('./model/final_model'):
        os.mkdir('./model/final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained('./model/final_model')
    torch.save(scheduler.state_dict(), './model/final_model/scheduler.pt')
    torch.save(optimizer.state_dict(), './model/final_model/optimizer.pt')
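# This last variant also saves scheduler.pt and optimizer.pt next to each
# checkpoint (the other variants leave those torch.save calls commented out).
# A minimal resume sketch under that assumption; the directory and the
# hyperparameter values are placeholders, not part of the original scripts:
import torch
import pytorch_transformers

ckpt_dir = './model/model_epoch1'  # hypothetical checkpoint directory written by the loop above
model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(ckpt_dir)
optimizer = pytorch_transformers.AdamW(model.parameters(), lr=1.5e-4, correct_bias=True)
scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=2000, t_total=10000)
optimizer.load_state_dict(torch.load(ckpt_dir + '/optimizer.pt'))
scheduler.load_state_dict(torch.load(ckpt_dir + '/scheduler.pt'))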