def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path,
                                         openai_config_file,
                                         pytorch_dump_folder_path):
    # Construct model
    if openai_config_file == "":
        config = OpenAIGPTConfig()
    else:
        config = OpenAIGPTConfig(openai_config_file)
    model = OpenAIGPTModel(config)

    # Load weights from numpy
    load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
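
# A minimal usage sketch for the converter above. Both paths are hypothetical
# placeholders, not paths shipped with this code; an empty config path falls
# back to the default OpenAIGPTConfig().
if __name__ == '__main__':
    convert_openai_checkpoint_to_pytorch(
        openai_checkpoint_folder_path='./openai_checkpoint',  # folder holding the TF/numpy checkpoint
        openai_config_file='',  # use the default configuration
        pytorch_dump_folder_path='./gpt_output')  # receives the WEIGHTS_NAME and CONFIG_NAME files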
import math
import os

import torch
from pytorch_pretrained_bert import (OpenAIGPTLMHeadModel, OpenAIGPTModel,
                                     OpenAIGPTTokenizer)
from pytorch_pretrained_bert.modeling import BertConfig
from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME,
                                                     OpenAIGPTConfig,
                                                     WEIGHTS_NAME)

model_path = 'openai-gpt'
output_dir = './language-quality-subreward/gpt_output'
WEIGHTS_NAME = 'pytorch_model.bin'
special_tokens = ['_start_', '_delimiter_', '_classify_']

# Load pre-trained model (weights)
with torch.no_grad():
    output_config_file = os.path.join(output_dir, CONFIG_NAME)
    config = OpenAIGPTConfig(output_config_file)
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    model_state_dict = torch.load(output_model_file, map_location='cpu')
    model = OpenAIGPTLMHeadModel(config)
    model.load_state_dict(model_state_dict)
    # model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
    # model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    model.eval()

# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path,
                                               cache_dir='./tmp/',
                                               special_tokens=special_tokens)

'''
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

# Load pre-trained model tokenizer (vocabulary)
'''
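
# Hedged follow-up sketch, not in the original file: score a sentence with the
# fine-tuned LM loaded above. It relies on the old pytorch_pretrained_bert
# behaviour where calling the LM head model with lm_labels returns the mean
# per-token cross-entropy loss, so exp(loss) is the sentence perplexity.
def sentence_perplexity(text):
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    input_ids = torch.tensor([ids], dtype=torch.long)
    with torch.no_grad():
        loss = model(input_ids, lm_labels=input_ids)
    return math.exp(loss.item())

print(sentence_perplexity("this is a quick sanity check ."))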
def __init__(self, args, tokenizer):
    self.args = args
    self.nli_tokenizer = BertTokenizer.from_pretrained(
        args.bert_model,
        do_lower_case=args.do_lower_case,
        cache_dir='.pytorch_pretrained_bert')
    self.output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    self.output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    self.nli_config = BertConfig(self.output_config_file)
    self.nli_model = BertForSequenceClassification(self.nli_config,
                                                   num_labels=3)
    self.nli_model.load_state_dict(
        torch.load(self.output_model_file, map_location=torch.device('cpu')))
    self.nli_model.to(args.device)
    self.nli_model.eval()

    if args.nli_uu_reward or args.nli_allres_reward:
        uu_output_config_file = os.path.join(args.uu_output_dir, CONFIG_NAME)
        uu_output_model_file = os.path.join(args.uu_output_dir, WEIGHTS_NAME)
        self.uu_nli_config = BertConfig(uu_output_config_file)
        self.uu_nli_model = BertForSequenceClassification(
            self.uu_nli_config, num_labels=3)
        self.uu_nli_model.load_state_dict(
            torch.load(uu_output_model_file, map_location=torch.device('cpu')))
        self.uu_nli_model.to(args.device)
        self.uu_nli_model.eval()

    bert_emb_modelpath = "bert-base-uncased"
    self.bert_emb_tokenizer = BertTokenizer.from_pretrained(
        bert_emb_modelpath, cache_dir='.pytorch_pretrained_bert')
    self.bert_emb_model = BertModel.from_pretrained(
        bert_emb_modelpath,
        cache_dir='.pytorch_pretrained_bert').to(args.device)
    self.bert_emb_model.eval()

    self.tokenizer = tokenizer

    if args.lm_reward:
        lm_model_path = 'openai-gpt'
        lm_output_dir = 'language-quality-subreward/gpt_output'
        lm_special_tokens = ['_start_', '_delimiter_', '_classify_']
        # Load pre-trained model (weights)
        with torch.no_grad():
            lm_output_config_file = os.path.join(lm_output_dir, CONFIG_NAME)
            lm_config = OpenAIGPTConfig(lm_output_config_file)
            lm_output_model_file = os.path.join(lm_output_dir, WEIGHTS_NAME)
            # lm_model_state_dict = torch.load(lm_output_model_file)
            lm_model_state_dict = torch.load(lm_output_model_file,
                                             map_location='cpu')
            self.lm_model = OpenAIGPTLMHeadModel(lm_config)
            self.lm_model.load_state_dict(lm_model_state_dict)

            # Load pre-trained model tokenizer (vocabulary)
            self.lm_tokenizer = OpenAIGPTTokenizer.from_pretrained(
                lm_model_path,
                special_tokens=lm_special_tokens,
                cache_dir='.pytorch_pretrained_bert')
            self.special_tokens_ids = list(
                self.lm_tokenizer.convert_tokens_to_ids(token)
                for token in lm_special_tokens)
        self.lm_model.to(args.device)
        self.lm_model.eval()
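
# Hedged construction sketch: the class this __init__ belongs to is not named
# in the excerpt, so `RewardCalculator` below is a placeholder. The attribute
# names on `args` mirror exactly those the method reads; the concrete values
# are illustrative, and `tokenizer` would be whatever tokenizer the caller
# already uses for its policy model.
from argparse import Namespace

args = Namespace(bert_model='bert-base-uncased',
                 do_lower_case=True,
                 output_dir='nli_output',
                 uu_output_dir='nli_uu_output',
                 nli_uu_reward=False,
                 nli_allres_reward=False,
                 lm_reward=True,
                 device=torch.device('cpu'))
rewards = RewardCalculator(args, tokenizer)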
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='./train_recipes.json')
    parser.add_argument('--eval_dataset', type=str, default='./val_recipes.json')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=2)
    parser.add_argument('--eval_batch_size', type=int, default=2)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-6)
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    parser.add_argument('--lr_schedule', type=str, default='warmup_cosine')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the recipes dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    config = OpenAIGPTConfig()
    # model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model = OpenAIGPTLMHeadModel(config)
    model.set_num_special_tokens(len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    '''
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_recipes_dataset(args.train_dataset)
    # Extra-length training examples are filtered out after encoding, below.
    print(train_dataset[0])
    eval_dataset = load_recipes_dataset(args.eval_dataset)
    print(len(eval_dataset))
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Remove extra-length training data
    selected_train_data = []
    print(len(encoded_datasets[0]))
    for ins in encoded_datasets[0]:
        if len(ins) <= 510:
            selected_train_data.append(ins)
    encoded_datasets[0] = selected_train_data
    print(len(encoded_datasets[0]))

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions - 2
    print(max_length)
    print(encoded_datasets[0][0])
    input_length = max(len(story[:max_length]) + 2
                       for dataset in encoded_datasets for story in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model
    print(input_length)

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]
    eval_tensor_dataset = tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)
    print(.002 * num_train_optimization_steps)

    total_loss = 0
    total_length = 0
    print(model.transformer.h)
    '''
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_dataloader, desc="Pre LM training train data ppl")
        for step, batch in enumerate(tqdm_bar):
            # print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels=lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            # print(np.sum(lengths))
            total_loss += loss.item() * np.sum(lengths)
            total_length += np.sum(lengths)
        print(total_loss / total_length)

    total_loss = 0
    total_length = 0
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(eval_dataloader, desc="Pre LM training val data ppl")
        for step, batch in enumerate(tqdm_bar):
            # print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels=lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            # print(np.sum(lengths))
            total_loss += loss.item() * np.sum(lengths)
            total_length += np.sum(lengths)
        print(total_loss / total_length)
    '''
    if args.do_train:
        print("=" * 80 + '\n')
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                # print(batch)
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()  # was missing; gradients would otherwise accumulate across steps
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, optimizer.get_lr()[0])

    total_loss = 0
    total_length = 0
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_dataloader,
                        desc="Post LM training train data ppl")
        for step, batch in enumerate(tqdm_bar):
            # print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels=lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            # print(np.sum(lengths))
            total_loss += loss.item() * np.sum(lengths)
            total_length += np.sum(lengths)
        print(total_loss / total_length)

    total_loss = 0
    total_length = 0
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(eval_dataloader, desc="Post LM training val data ppl")
        for step, batch in enumerate(tqdm_bar):
            # print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels=lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            # print(np.sum(lengths))
            total_loss += loss.item() * np.sum(lengths)
            total_length += np.sum(lengths)
        print(total_loss / total_length)
        print("=" * 80 + '\n')

    # Save a trained model
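    # Follow-up sketch, not in the original script: the loops above print
    # length-weighted average per-token losses (negative log-likelihoods);
    # exponentiating turns the last printed value into a perplexity.
    if args.do_train:
        print("post-training val ppl: {:.2f}".format(
            np.exp(total_loss / total_length)))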
def main():
    # Pre-trained model: eval_ppl = 104.29582476475977
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")

    # args = parser.parse_args()
    args = parser.parse_args([
        # '--do_train',
        '--do_eval',
        # '--dataset=../data/convai2/train_both_original.txt',
        '--dataset=data/convai2/convai2_data.models',
        '--output_dir=./language-quality-subreward/gpt_output/'
    ])
    print(args)

    # This commented code was used for parsing and pickling data from the original data file.
    '''
    data = Parser(persona_limit=None, set_relation=None)
    print('Parsing...')
    data.parse(args.dataset)
    file_utils.save_model('data/convai2', data, '.models', 'convai2_data')
    '''
    data = file_utils.read_model('', args.dataset, '')
    data = list(chain(*data.conversation))
    # data = data[:10]
    train_data_org = data[:int(0.9 * len(data))]
    eval_data_org = data[int(0.9 * len(data)):]
    del data
    print('')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the ConvAI2 dialogue data
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, cache_dir="./cache/", special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name,
        cache_dir="./cache/",
        num_special_tokens=len(special_tokens))
    model.to(device)

    '''
    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = train_data_org
    eval_dataset = eval_data_org
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(sent[:max_length]) + 2
                       for dataset in encoded_datasets for sent in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)
        # Yue: save the config:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model that you have fine-tuned
        '''
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        '''

    if args.do_eval:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = OpenAIGPTConfig(output_config_file)

        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        model.eval()

        eval_ppl = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            loss = model(input_ids, lm_labels=lm_labels)
            eval_ppl += math.exp(loss.item())
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_ppl = eval_ppl / nb_eval_steps

        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_ppl': eval_ppl, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
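
# The excerpt defines main() but shows no entry point; a standard guard (an
# assumption, not visible in the original) makes the script runnable directly.
# With the hard-coded argument list above, this runs evaluation only and writes
# eval_results.txt into ./language-quality-subreward/gpt_output/.
if __name__ == '__main__':
    main()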