def mine_triples(device, input_file, output_file, use_local_model=False):
    if use_local_model:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained("../models/BertForMaskedLM")
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained("../models/GPT2LMHeadModel")
    else:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained(bert_model)
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
    """
    'concat': KnowledgeMiner(
        os.path.join(data_repo, candidate_file), device, DirectTemplate, bert
    ),
    'template': KnowledgeMiner(
        os.path.join(data_repo, candidate_file), device, PredefinedTemplate, bert,
        grammar=False, template_loc=os.path.join(template_repo, single_templates)
    ),
    'template_grammar': KnowledgeMiner(
        os.path.join(data_repo, candidate_file), device, PredefinedTemplate, bert,
        grammar=True, template_loc=os.path.join(template_repo, single_templates)
    ),
    """
    knowledge_miners = {
        'coherency': KnowledgeMiner(input_file, device, EnumeratedTemplate, bert,
                                    language_model=gpt,
                                    template_loc=os.path.join(template_repo, multiple_templates),
                                    use_local_model=use_local_model)
    }
    for template_type in knowledge_miners.keys():
        predictions = run_experiment(template_type, knowledge_miners)
        triples = knowledge_miners[template_type].sentences.tuples
        scored_samples = list(zip(triples, predictions))
        scored_samples.sort(key=lambda x: x[1], reverse=True)
        with open(output_file, "w") as f:
            for triple, pred in scored_samples:
                rel, head, tail = triple
                triple = (rel.lower(), head, tail)
                f.write("\t".join(triple) + "\t" + "{:.5f}".format(pred))
                f.write("\n")
def __init__(self, type, model_name_or_path="gpt2"):
    super(LM, self).__init__()
    self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
    if type == '345M':
        self.model = GPT2LMHeadModel.from_pretrained('output/')
    elif type == '117M':
        self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    self.model.to(self.device)
    self.model.eval()
    self.start_token = '<|endoftext|>'
def fluency_score(rated_a, opt):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained(opt.pretrained_model_path)
    model = GPT2LMHeadModel.from_pretrained(opt.pretrained_model_path)
    model.to(device)
    model.eval()
    nb_steps, eval_loss, exp_average_loss = 0, 0, None
    score_list = []
    # k = "the book is on the desk. These impressions show , when alive , they had smooth skin , robust limbs with webbed feet , and a ridge of skin on their undersides."  tensor(169.6684, device='cuda:0')
    with torch.no_grad():
        for step, s in enumerate(rated_a):  # each step is effectively a batch with batch size 1
            if not s:
                print('space sentence')
                score_list.append(1e6)
                continue
            s = enc.encode(s)  # + [50256]  # 50256 is the token id for <|endoftext|>
            batch = torch.tensor([s]).to(device)
            loss = model(batch, lm_labels=batch)  # average -logp
            # print(loss * len(s))
            eval_loss += loss.item()
            nb_steps += 1
            score_list.append(loss.item())
    cutoff = np.quantile([-t for t in score_list], 0.05)
    modified_rating = np.array([cutoff if -t < cutoff else -t for t in score_list])
    normed_rating = (modified_rating - cutoff) / np.abs(cutoff)
    return normed_rating
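# Usage sketch for fluency_score above (an illustration, not from the original
# source): `opt` only needs a `pretrained_model_path` attribute, inferred from
# the function body, so a SimpleNamespace stands in for the real options object.
# Higher scores correspond to lower per-token loss, i.e. more fluent text.
from types import SimpleNamespace

opt = SimpleNamespace(pretrained_model_path='gpt2')
sentences = ["The book is on the desk.", "desk the on book the"]
for sent, score in zip(sentences, fluency_score(sentences, opt)):
    print("{:.3f}\t{}".format(score, sent))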
def init_model(seed=0, model_path='gpt2'):
    '''
    Parameters:
    ----------
    seed : int
        seed number for the different randomizers
    model_path : string, optional
        either a model name for an existing model or a path to a trained model
    '''
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model = model.module
    model.to(device)
    model.eval()
    return model, enc, device
def mine_from_wikipedia(hardware):
    print('loading BERT...')
    bert = BertForMaskedLM.from_pretrained(bert_model)
    print('loading GPT2...')
    gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
    knowledge_miners = {
        'concat': KnowledgeMiner(data_repo + wikipedia_candidates, hardware,
                                 DirectTemplate, bert),
        'template': KnowledgeMiner(data_repo + wikipedia_candidates, hardware,
                                   PredefinedTemplate, bert, grammar=False,
                                   template_loc=template_repo + single_templates),
        'template_grammar': KnowledgeMiner(data_repo + wikipedia_candidates, hardware,
                                           PredefinedTemplate, bert, grammar=True,
                                           template_loc=template_repo + single_templates),
        'coherency': KnowledgeMiner(data_repo + wikipedia_candidates, hardware,
                                    EnumeratedTemplate, bert, language_model=gpt,
                                    template_loc=template_repo + multiple_templates)
    }
    for template_type in knowledge_miners.keys():
        run_experiment(template_type, knowledge_miners)
def download_model(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    if not exists(PATH):
        print("# ", str(PATH), "not found, creating dir.")
        mkdir(PATH)
    print('# Downloading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            wget.download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded word2vec')
        else:
            print('# Already downloaded')
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            wget.download('http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip')
            zip_file = zipfile.ZipFile('./glove.840B.300d.zip')  # renamed from `zip` to avoid shadowing the builtin
            zip_file.extractall()
            _ = glove2word2vec('./glove.840B.300d.txt', join(PATH, name_path))
            print('# Downloaded glove')
        else:
            print('# Already downloaded')
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            wget.download('https://dict2vec.s3.amazonaws.com/dict2vec300.tar.bz2')
            tar = tarfile.open("dict2vec300.tar.bz2")
            tar.extractall()
            tar.close()
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded dict2vec')
        else:
            print('# Already downloaded')
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            wget.download('https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz')
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded Conceptnet Numberbatch')
        else:
            print('# Already downloaded')
    if name == 'bert' or name == 'bert-context':
        _ = BertTokenizer.from_pretrained('bert-large-uncased')
        _ = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()
        print('# Downloaded bert')
    if name == 'gpt2' or name == 'gpt2-context':
        _ = GPT2Tokenizer.from_pretrained('gpt2')
        _ = GPT2LMHeadModel.from_pretrained('gpt2')
        _ = GPT2Model.from_pretrained('gpt2')
        print('# Downloaded gpt-2')
def __init__(self, GPU, model_name_or_path="gpt2"):
    self.device = torch.device(GPU if torch.cuda.is_available() else "cpu")
    self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    self.model.to(self.device)
    self.model.eval()
    self.start_token = '<|endoftext|>'
    print("Loaded GPT-2 model!")
def __init__(self, model_name_or_path="gpt2"):
    super(LM, self).__init__()
    self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    self.model.to(self.device)
    self.model.eval()
    self.start_token = '<|endoftext|>'
    print("Loaded GPT-2 model!")
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)  # was type=int, which rejects fractional temperatures
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    while True:
        context_tokens = []
        if not args.unconditional:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
        generated = 0
        for _ in range(args.nsamples // args.batch_size):
            out = sample_sequence(
                model=model, length=args.length,
                context=context_tokens if not args.unconditional else None,
                start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
                batch_size=args.batch_size,
                temperature=args.temperature, top_k=args.top_k, device=device
            )
            out = out[:, len(context_tokens):].tolist()
            for i in range(args.batch_size):
                generated += 1
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)
        if args.unconditional:
            break
def __init__(self, model_name_or_path="/data/pradeesh/detecting-fake-text/pytorch/"):
    super(LM, self).__init__()
    self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    self.model.to(self.device)
    self.model.eval()
    self.start_token = '<|endoftext|>'
    print("Loaded GPT-2 model!")
def get_model(args, device):
    if args.scratch:
        config = GPT2Config(n_ctx=args.context_length, n_positions=args.context_length)
        model = GPT2LMHeadModel(config)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    # import torchsummary
    # torchsummary.summary(model, (args.context_length, vocab_size), args.train_batch_size)
    return model.to(device)
def __init__(self, args):
    super().__init__()
    if args.gpt2_model_dir is not None:
        # load GPT2 model from file
        gpt_model_name = str(args.gpt2_model_dir) + "/"
        dict_file = gpt_model_name
        print("loading GPT2 model from {}".format(gpt_model_name))
    else:
        # load GPT2 model from huggingface cache
        gpt_model_name = args.gpt2_model_name
        dict_file = gpt_model_name

    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = GPT2Tokenizer.from_pretrained(dict_file)

    # GPT-2 represents BPE pieces differently than BERT. In GPT-2, a token that
    # starts a new (whitespace-preceded) word carries a leading 'Ġ', while
    # continuation pieces are written as-is. In BERT, word-initial pieces are
    # written as-is, while the parts that must follow (not be followed!) have a
    # '##' prefix. There is no one-to-one conversion, but at least we may make
    # pieces that may form a full word look the same.
    # Note that we should be very careful now:
    # tokenizer.convert_tokens_to_ids won't work with our vocabulary.
    def convert_word(word):
        if word == GPT2_EOS:
            return word
        if word.startswith('Ġ'):  # the token starts with a whitespace
            return word[1:]
        # The token does not start with a whitespace: it may be a continuation
        # piece, or the head of a sentence.
        return f'_{word}_'

    _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
    self.vocab = [convert_word(word) for word in gpt_vocab]
    self._init_inverse_vocab()

    # Load pre-trained model (weights)
    self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)
    self.gpt_model.eval()
    # print(self.gpt_model.config)

    # Sanity check.
    assert len(self.vocab) == self.gpt_model.config.vocab_size
    # assert 0 == self.gpt_model.config.n_special

    self.eos_id = self.gpt_model.config.eos_token_id
    self.pad_id = self.gpt_model.config.eos_token_id
    self.unk_id = self.gpt_model.config.eos_token_id
    self.bos_id = self.gpt_model.config.bos_token_id
    self.model_vocab = self.vocab
def init():
    # seed = 42
    # np.random.seed(seed)
    # torch.random.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()
    return enc, model
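# Minimal sketch (not in the original) of using the pair returned by init() for
# greedy next-token prediction; assumes the pytorch_pretrained_bert API used
# throughout these snippets, where GPT2LMHeadModel returns (logits, past).
enc, model = init()
tokens = enc.encode("The capital of France is")
batch = torch.tensor([tokens]).to(next(model.parameters()).device)  # follow the model's device
with torch.no_grad():
    logits, _ = model(batch)
next_id = int(torch.argmax(logits[0, -1, :]))
print(enc.decode([next_id]))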
def mine(hardware):
    print('Loading GPT2...')
    gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
    knowledge_miners = {
        'coherency': KnowledgeMiner(
            data_repo + test_data, hardware, EnumeratedTemplate,
            language_model=gpt,
            template_loc=template_repo + multiple_templates)
    }
    return run_experiment('coherency', knowledge_miners)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', default=1, type=int, help='Batch size for inference')
    parser.add_argument('--model_name', default='gpt2', type=str, help='Pre-trained model name')
    parser.add_argument('--max_seq_length', default=128, type=int,
                        help='Maximum total input sequence length after tokenization')
    args = parser.parse_args()
    input_ids = torch.zeros([args.batch_size, args.max_seq_length], dtype=torch.long)
    model = GPT2LMHeadModel.from_pretrained(args.model_name)
    torch.onnx.export(model, input_ids, 'gpt2_' + 'batch' + str(args.batch_size) + '.onnx')
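# Hedged follow-up to the export above: loading the exported file back with
# onnxruntime to check it. The input name is read via get_inputs() rather than
# assumed; 'gpt2_batch1.onnx' matches the default --batch_size of 1.
import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession('gpt2_batch1.onnx')
input_name = sess.get_inputs()[0].name
dummy = np.zeros((1, 128), dtype=np.int64)  # matches the default --max_seq_length
outputs = sess.run(None, {input_name: dummy})
print([o.shape for o in outputs])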
def __init__(self, text_sequence, model_type, temperature=1.0, top_k=0,
             batch_size=1, length=1, nsamples=1, debug=True):
    self.text_sequence = text_sequence
    # eventually will differentiate between gpt-2, BERT, etc.
    self.model_type = model_type
    model_name = 'gpt2'
    self.debug = debug
    # detect device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.temperature = temperature
    self.top_k = top_k
    self.batch_size = batch_size
    self.length = length
    self.nsamples = nsamples
    # create encoder and model
    self.enc = GPT2Tokenizer.from_pretrained(model_name)
    self.model = GPT2LMHeadModel.from_pretrained(model_name)
    self.model.to(self.device)
    self.model.eval()
def context_score(questions, answers, opt):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained(opt.pretrained_model_path)
    model = GPT2LMHeadModel.from_pretrained(opt.pretrained_model_path)
    model.to(device)
    model.eval()
    score_list = []
    with torch.no_grad():
        for step, (question, answer) in enumerate(zip(questions, answers)):  # each step is effectively a batch with batch size 1
            if not answer:
                print('space sentence')
                score_list.append(-1e6)
                continue
            joint_enc = enc.encode(question + ' ' + answer)  # + [50256]  # 50256 is the token id for <|endoftext|>
            q = enc.encode(question)
            batch_joint = torch.tensor([joint_enc]).to(device)
            batch_q = torch.tensor([q]).to(device)
            loss_joint = model(batch_joint, lm_labels=batch_joint)  # average -logp
            loss_q = model(batch_q, lm_labels=batch_q)
            p_joint = -loss_joint * (len(joint_enc) - 1)
            p_q = -loss_q * (len(q) - 1)
            score = p_joint - p_q
            score_list.append(score.item())
    cutoff = np.quantile(score_list, 0.05)
    modified_rating = np.array([cutoff if t < cutoff else t for t in score_list])
    normed_rating = (modified_rating - cutoff) / np.abs(cutoff)
    return normed_rating
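# Note on the arithmetic above (inferred from the code, not stated in the source):
# the model returns the mean negative log-likelihood over predicted positions, so
# loss * (n_tokens - 1) recovers the total -log p of a sequence. The score is then
# log p(question, answer) - log p(question) = log p(answer | question),
# i.e. the conditional log-likelihood of the answer given the question.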
def load_model_fromlist(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    print('# Loading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path), binary=True)
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path))
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path),
                                                               binary=False, unicode_errors="ignore")
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path))
    if name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()
        return [model, tokenizer]
    if name == 'bert-context':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased', output_hidden_states=True)
        return [model, tokenizer]
    if name == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2').transformer.wte.weight.data.numpy()
        return [model, tokenizer]
    if name == 'gpt2-context':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
        return [model, tokenizer]
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument('--inputs_file', type=str, default=None)
    parser.add_argument('--output_file', type=str, default='results.json')
    parser.add_argument('--do_beam_search', action='store_true')  # was type=bool, which parses any non-empty string as True
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    if args.inputs_file is None:
        decode_interactive(model, enc, device, args)
    else:
        decode_from_file(model, enc, device, args)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default='tuned_gpt2', type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--source_eval', type=str, default='')
    parser.add_argument('--target_eval', type=str, default='')
    parser.add_argument('--source_train', type=str, default='')
    parser.add_argument('--target_train', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--effective_batch_size', type=int, default=64)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--bsz', type=int, default=20)
    parser.add_argument('--bptt', type=int, default=40)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    # print(args)
    model_type = 'gpt2'

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device(type='cuda')
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    # if not args.do_train and not args.do_eval:
    #     raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2').to('cuda')
    model.to(device)

    # file_train = args.train_dataset  # 'cnn_train.txt'
    # file_eval = args.eval_dataset  # 'cnn_valid.txt'
    bptt = args.bptt
    bsz = args.bsz
    # X_eval, nbatch_eval = load_dataset(file_eval, tokenizer, bptt, bsz)
    # X_train, nbatch_train = load_dataset(file_train, tokenizer, bptt, bsz)
    batches_eval, labels_eval, nbatch_eval = load_dataset(args.source_eval, args.target_eval, tokenizer, bptt, bsz)
    batches_train, labels_train, nbatch_train = load_dataset(args.source_train, args.target_train, tokenizer, bptt, bsz)

    # Prepare optimizer
    # param_optimizer = list(model.parameters())
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    print('here 3')
    # num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    num_train_optimization_steps = nbatch_train * args.num_train_epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)
    eval_loss_min = None
    print('here 4')
    model.to(device)
    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    model.train()
    for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0
        for i_batch in tqdm(list(range(nbatch_train)), desc='Training epoch {}'.format(epoch_i)):
            batch = batches_train[i_batch]  # X_train[:, i_batch*bsz:(1+i_batch)*bsz].permute(1,0)
            batch = batch.cuda()
            lm_labels = labels_train[i_batch].cuda()
            if batch.numel() == 0:
                break
            # loss = model(batch, lm_labels=labels_train[i_batch].cuda())
            # Compute the masked LM loss manually instead.
            loss_fct = CrossEntropyLoss(reduction='none')
            lm_logits, _ = model(batch)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = batch[:, 1:].contiguous()
            shift_labels_mask = (lm_labels[:, 1:].contiguous().view(-1) != -1).float()
            loss_mat = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss = (loss_mat * shift_labels_mask).view(-1).sum() / shift_labels_mask.sum()  # average over non-masked indices
            loss.backward()
            # only step the optimizer once 'effective_batch_size' examples have been seen
            if (i_batch * args.train_batch_size) % args.effective_batch_size == 0 and i_batch != 0:
                optimizer.step()
                optimizer.zero_grad()
            tr_loss += loss.item()
            exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
            nb_tr_steps += 1

            ###
            # Evaluations
            ###
            if i_batch % 1000 == 0:
                # get eval score
                eval_loss = eval_model(model, nbatch_eval, batches_eval, labels_eval, bsz)
                # if eval_loss improves, save model
                if eval_loss_min is None or eval_loss < eval_loss_min:
                    eval_loss_min = eval_loss
                    model_to_save = model
                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    to_json_file(model_to_save.config, output_config_file)
                print('eval_loss {}'.format(eval_loss))
                model.train()
            if i_batch % 200 == 0:
                # try generating from the model
                print("Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0]))
                model.eval()
                if model_type == 'gpt':
                    encode = lambda a: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(a))
                    decode = tokenizer.decode
                elif model_type == 'gpt2':
                    encode = tokenizer.encode
                    decode = tokenizer.decode
                generate_from_model(encode, decode, model=model, model_type=model_type)
                model.train()
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm import trange

import pytorch_pretrained_bert
from data_loader import get_data_loader
from model_sampler import print_samples
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer, OpenAIAdam

model_name = 'gpt2'
enc = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

device = 'cpu'
beam_width = 130
stopwords = []


def to_list(tensor):
    return list(tensor.cpu().numpy())


def predict(line, max_predictions):
    """Give continuation of the line with at most max_predictions BPE tokens.

    Returns line extended with predictions of the model."""
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl", help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")  # was type=int, which would reject the 0.7 default
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    data = get_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {
        "version": "squash-2.0",
        "data": [{
            "paragraphs": []
        }]
    }
    question_number = 0

    # For all the instances corresponding to one paragraph, the model input format is:
    # paragraph + answer + question. The paragraph is common across all the instances,
    # so "past" can be used to reuse its precomputed hidden state in subsequent predictions.
    import copy
    previous_para_index = None
    past = None
    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            current_para_index = inst['para_index']
            if current_para_index != previous_para_index:
                past = None
                current_inst = copy.deepcopy(inst)
                # only keep the paragraph details in the instance to get its hidden states
                current_inst['question'] = []
                current_inst['answer'] = []
                instance, _ = build_input_from_segments(current_inst, tokenizer, with_eos=False)
                input_ids = torch.tensor(instance['input_ids'][:-2], device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'][:-2], device=args.device).unsqueeze(0)
                # the returned "past" holds the paragraph hidden states
                _, past = model(input_ids, token_type_ids=token_type_ids, past=past)
            output = sample_sequence(inst, tokenizer, model, args, past)
            previous_para_index = current_para_index  # remember which paragraph the cache belongs to

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'], skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'], skip_special_tokens=True)
        para_index = inst['para_index']

        # Output in a SQuAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0]["paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index]['qas'].append({
                'id': 'question_%d' % question_number,
                'question': generated_question,
                'answers': [{
                    'text': original_answer,
                    'answer_start': original_paragraph.index(original_answer)
                }],
                'class': output['class'],
                'algorithm': output['algorithm'],
                'is_impossible': False
            })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_paragraph.index(original_answer)
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })
        question_number += 1

    with open("squash/temp/generated_questions.json", "w") as f:
        f.write(json.dumps(final_output_dict))
def __init__(self):
    self.model = GPT2LMHeadModel.from_pretrained('gpt2')
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.model.cuda()
    self.model.eval()
def run_model():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('--model-path', type=str, help='pretrained model path to local checkpoint')
    parser.add_argument("--batch-size", type=int, default=40)
    parser.add_argument('--data-dir', type=str, default='../data')
    parser.add_argument('--dataset', type=str, default='../data')
    parser.add_argument("--test", action='store_true', default=False)
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir='out/cache')

    if args.model_path:
        state = torch.load(args.model_path, map_location='cpu')
        model.load_state_dict(state)

    tokenizer = GPT2Tokenizer(os.path.join(args.data_dir, 'gpt2-vocab.json'),
                              os.path.join(args.data_dir, 'gpt2-merges.txt'))
    # Hack to allow tokenizing longer sequences.
    tokenizer.max_len = int(1e12)

    model.half().to(device)
    model.eval()
    print('Model loaded.')

    d_val = PromptDataset(
        os.path.join(args.data_dir, 'writingPrompts/{}.wp_source'.format('test' if args.test else 'valid')),
        os.path.join(args.data_dir, 'writingPrompts/{}.wp_target'.format('test' if args.test else 'valid')),
        wp_preprocess)
    d_val_raw = PromptDataset(
        os.path.join(args.data_dir, 'writingPrompts/{}.wp_source'.format('test' if args.test else 'valid')),
        os.path.join(args.data_dir, 'writingPrompts/{}.wp_target'.format('test' if args.test else 'valid')))
    print('Data loaded.')

    print('Running evaluation...')
    with torch.no_grad():
        ppls = []
        word_ppls = []
        token_diffs = []
        num_errs = 0
        batch = []
        for sample_id, (text, check_text) in enumerate(zip(d_val, d_val_raw)):
            bpe_tokens = [tokenizer.encoder['<|endoftext|>']] + tokenizer.encode(text)
            # (This limit applies to GPT2)
            bpe_tokens = bpe_tokens[:1025]
            # Pad
            batch.append((bpe_tokens + [0] * (1025 - len(bpe_tokens)),
                          len(bpe_tokens),
                          check_text.split('---\n')[1].split(' ')))

            if len(batch) == args.batch_size or len(word_ppls) == len(d_val) - 1:
                x, x_lens, raw_tokens = zip(*batch)
                token_tensor = torch.tensor(x, dtype=torch.long, device=device)
                # Compute log probs
                lps = compute_logprobs(token_tensor, model)
                token_tensor = token_tensor.cpu().numpy()

                # Compute individually
                for i in range(lps.shape[0]):
                    try:
                        # Mask out some tokens
                        target_tokens = token_tensor[i, 1:x_lens[i]]
                        log_probs = lps[i, :x_lens[i] - 1]
                        ppl, token_diff = word_level_ppl(
                            target_tokens, log_probs.cpu().float().numpy(), tokenizer, raw_tokens[i])
                        token_diffs.append(token_diff)
                        word_ppls.append(ppl)
                        ppls.append(torch.exp(-log_probs.mean()).item())
                    except Exception as e:
                        print('Skipping anomaly.')
                        print(e)
                        num_errs += 1
                print('Word Level PPL {:.2f} BPE PPL {:.2f} Diff {:.2f} Done: {:.2f}% Skip {}'.format(
                    np.mean(word_ppls), np.mean(ppls), np.mean(token_diffs),
                    sample_id / len(d_val) * 100, num_errs))
                batch = []
# "vocab_size": 50257 # } ## Predict hidden states features for each layer with torch.no_grad(): hidden_states_1, past = model(tokens_tensor_1) print(hidden_states_1.shape) # torch.Size([1, 6, 768]) print(len(past), past[0].shape) # 12 torch.Size([2, 1, 12, 6, 64]) hidden_states_2, past = model(tokens_tensor_2, past=past) print(hidden_states_2.shape) # torch.Size([1, 8, 768]) print(len(past), past[0].shape) # 12 torch.Size([2, 1, 12, 14, 64]); 14 = 8 + 6 ## past can be used to reuse precomputed hidden state in a subsequent predictions (see beam-search examples in the run_gpt2.py example). ################################################################## ## GPT2LMHeadModel model = GPT2LMHeadModel.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/gpt2/') model.eval() ## Predict all tokens with torch.no_grad(): predictions_1, past = model(tokens_tensor_1) predictions_2, past = model(tokens_tensor_2, past=past) print(hidden_states_2.shape) # torch.Size([1, 8, 768]) print(len(past), past[0].shape) # 12 torch.Size([2, 1, 12, 14, 64]) ## get the predicted last token predicted_index = torch.argmax(predictions_2[0, -1, :]).item(); print(predicted_index) # 508 predicted_token = tokenizer.decode([predicted_index]); print(predicted_token) # who ################################################################## ## Transformer-XL
def __init__(self, opt, shared=None):
    super(TransformerAgent, self).__init__(opt, shared)

    args = AttrDict(opt)  # to keep most commands identical to the interact.py script
    self.args = args

    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__file__)
    self.logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if shared is None:
        self.logger.info("Get pretrained model and tokenizer")
        if args.model_checkpoint == "":
            args.model_checkpoint = download_pretrained_model()

        if args.model.startswith('gpt2'):
            self.tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
            if self.args.eval_type == "hits@1":
                self.model_checkpoint = GPT2DoubleHeadsModel.from_pretrained(args.model_checkpoint)
            else:
                self.model_checkpoint = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
        elif args.model == 'openai-gpt':
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
            if self.args.eval_type == "hits@1":
                self.model_checkpoint = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_checkpoint)
            else:
                self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
        else:
            raise NotImplementedError(
                'model type "%s" not implemented. Use either "openai-gpt" or "gpt2"' % args.model)
        self.model_checkpoint.to(args.device)
        self.model_checkpoint.eval()

        self.logger.info("Build BPE prefix dictionary")
        convai_dict = build_dict()
        assert len(convai_dict) == 19304
        self.prefix2words = self.get_prefix2words(convai_dict)
    else:
        self.model_checkpoint = shared['model']
        self.tokenizer = shared['tokenizer']
        self.prefix2words = shared['prefix2words']

    self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    self.persona = []
    self.history = []
    self.labels = []

    self.reset()
def main():
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='gpt2', help='pretrained model name')
    parser.add_argument("--bucket_name", type=str, default="al-ml-data")
    parser.add_argument("--s3_key", type=str, default="e2e_training/gpt2_train_with_ids_indexed.pkl")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=16)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Set the seed for random, numpy, PyTorch
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    # special_tokens = ['<POS>', '<NEG>', '<END>']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
    # start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
    model = GPT2LMHeadModel.from_pretrained(args.model_name)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load and encode dataset
    def tokenize_and_encode(bucket_name, s3_key):
        '''
        This method downloads the pickled, pre-encoded training data from S3 and loads it
        :param bucket_name: Name of the S3 bucket, dtype: str
        :param s3_key: Key of the pickled dataset inside the bucket, dtype: str
        :return: encoded dataset, dtype: list
        '''
        s3 = boto3.resource('s3')
        try:
            s3.Bucket(bucket_name).download_file(s3_key, '/tmp/gpt2_train_v1.pkl')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                raise
        with open("/tmp/gpt2_train_v1.pkl", "rb") as fp1:
            data = pickle.load(fp1)
        os.remove("/tmp/gpt2_train_v1.pkl")
        return data

    logger.info("Encoding dataset...")
    train_dataset = tokenize_and_encode(args.bucket_name, args.s3_key)
    print(len(train_dataset))
    train_dataset = [c for c in train_dataset if len(c) > 0]
    print(len(train_dataset))
    # eval_dataset = tokenize_and_encode(args.eval_dataset)
    print("Training samples = {}".format(len(train_dataset)))
    # print("Validation samples = {}".format(len(eval_dataset)))
    print("Example = {}".format(train_dataset[0]))
    time.sleep(2)
    # train_dataset = [x for x in train_dataset if len(x) <= 300]
    # eval_dataset = [x for x in eval_dataset if len(x) <= 300]

    # Compute the max input length for the Transformer
    # input_length = max(max(len(t) for t in train_dataset), max(len(q) for q in eval_dataset))
    input_length = max(len(t) for t in train_dataset)
    if n_gpu > 1:
        input_length = min(input_length, model.module.config.n_positions)
    else:
        input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
    print("Input Length = {}".format(input_length))

    def pre_process_dataset(encoded_dataset, input_length):
        """
        This method creates torch tensors of input ids and lm labels
        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :return: torch.tensor of size [len(encoded_dataset), 2]
        """
        n_batch = len(encoded_dataset)
        input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length), fill_value=-1, dtype=np.int64)

        for i, tokens in enumerate(encoded_dataset):
            input_ids[i, :len(tokens)] = tokens[:input_length]
            start_token_index = tokens.index(9688)  # 9688 is the id for token 'start'
            lm_labels[i, start_token_index + 2:len(tokens) - 1] = tokens[start_token_index + 3:input_length]

        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        # tensor_dataset.append(torch.tensor(d) for d in all_inputs)
        return tensor_dataset

    # Prepare input tensors and dataloaders
    train_tensor_dataset = pre_process_dataset(train_dataset, input_length=input_length)
    # eval_tensor_dataset = pre_process_dataset(eval_dataset, input_length=input_length)
    print(train_tensor_dataset[0].shape, train_tensor_dataset[1].shape)
    print("Training Example Input ids = {}".format(train_tensor_dataset[0][0]))
    print("Training Example Language Modeling ids = {}".format(train_tensor_dataset[1][0]))
    time.sleep(10)

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
    # eval_data = TensorDataset(*eval_tensor_dataset)
    # eval_sampler = RandomSampler(eval_data)
    # eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
            '''
            if (step > 0 and step % 20 == 0):
                print("Saving Model....")
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir, "language_model_{}.bin".format(epoch + 1))
                config = model.module.config if hasattr(model, 'module') else model.config
                torch.save(model_to_save.state_dict(), output_model_file)
            '''
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(args.output_dir, "pytorch_model_final.bin")
        config = model.module.config if hasattr(model, 'module') else model.config
        torch.save(model_to_save.state_dict(), output_model_file)
        model_state_dict = torch.load(output_model_file)
        model = GPT2LMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

    # Save a trained model
    # if args.do_train:
    #     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    #     output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    #     config = model.config
    #     torch.save(model_to_save.state_dict(), output_model_file)
    #
    #     # Load a trained model that you have fine-tuned
    #     model_state_dict = torch.load(output_model_file)
    #     model = OpenAIGPTLMHeadModel(config)
    #     model.load_state_dict(model_state_dict)
    #     model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)
                eval_loss += lm_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def run_lm(data, year, model_name, predictions_dict):
    """
    Using BERT or GPT2 as language models
    :param data: The actual data of the year, stored in a dictionary
    :param year: The corresponding year of the data. It is used when we save the predictions
    :param model_name: Name of the LM we used (BERT or GPT2). It is used in the output file name
    :param predictions_dict: A dict where we save the predictions from our experiments
    :return: The updated predictions_dict
    """
    model, tokenizer, vocab_size = None, None, None
    if model_name == 'GPT2_LM':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        vocab_size = len(tokenizer.encoder)
        model = GPT2LMHeadModel.from_pretrained('gpt2')
    elif model_name == 'BERT_LM':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        vocab_size = len(tokenizer.vocab)
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()
    model.to('cuda')

    # Used when we normalize the predicted probabilities of the LMs to [0, 1]
    soft_max = torch.nn.Softmax()

    # For each κ, initialize a dict to store the predictions
    each_case_k_predictions = [{} for _ in range(MAX_BPES_TO_SEARCH)]
    for doc_id, doc in data.items():
        for j in range(MAX_BPES_TO_SEARCH):
            each_case_k_predictions[j].update({doc_id: {}})
        for peer_id, peer in doc['peer_summarizers'].items():
            summary = peer['system_summary']
            if not_valid(peer_id=peer_id, doc_id=doc_id):
                for j in range(MAX_BPES_TO_SEARCH):
                    each_case_k_predictions[j][doc_id].update({peer_id: vocab_size})
                continue
            indexed_summary = None
            if model_name == 'GPT2_LM':
                indexed_summary = tokenizer.encode(summary)
            elif model_name == 'BERT_LM':
                # BERT can handle at most 512 BPEs
                tokenized_summary = tokenizer.tokenize(summary)[:512]
                indexed_summary = tokenizer.convert_tokens_to_ids(tokenized_summary)
            # Convert the summary to a PyTorch tensor
            tokens_tensor = torch.tensor([indexed_summary])
            tokens_tensor = tokens_tensor.to('cuda')
            with torch.no_grad():
                if summary != '':
                    if model_name == 'GPT2_LM':
                        predictions, _ = model(tokens_tensor)  # GPT returns the present
                    elif model_name == 'BERT_LM':
                        predictions = model(tokens_tensor)  # BERT returns only the predictions
                    probability_distribution = []
                    # i --> index of the word that we are looking at (i+1 is the next one)
                    for i in range(predictions.shape[1] - 1):
                        # Normalize the LM predictions by passing them through the softmax
                        soft_predictions = soft_max(predictions[0, i, :]).reshape(vocab_size)
                        if model_name == 'GPT2_LM':
                            # GPT -> the predicted probabilities correspond to the next word
                            p = soft_predictions[tokens_tensor[0, i + 1]].item()
                        elif model_name == 'BERT_LM':
                            # BERT -> the predicted probabilities correspond to this word, which is masked
                            p = soft_predictions[tokens_tensor[0, i]].item()
                        probability_distribution.append(math.log(p, 2))
                    perplexities = get_perplexity(probabilities=probability_distribution)
                    for j in range(MAX_BPES_TO_SEARCH):
                        each_case_k_predictions[j][doc_id].update({peer_id: perplexities[j]})
                else:
                    print('BLANK')
                    for j in range(MAX_BPES_TO_SEARCH):
                        each_case_k_predictions[j][doc_id].update({peer_id: vocab_size})
    k = compute_correlations_of_each_k(data=data,
                                       predictions=each_case_k_predictions,
                                       model_name=model_name,
                                       year=year)
    return save_the_best_predictions(best_predictions=each_case_k_predictions[k - 1],
                                     predictions_dict=predictions_dict,
                                     year=year,
                                     model_name=model_name)
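# Worked sketch (an assumption; the source's get_perplexity is not shown) of how
# a perplexity follows from the base-2 log probabilities collected above:
# PPL = 2 ** (-mean(log2 p_i)).
import math

def perplexity(log2_probs):
    return 2 ** (-sum(log2_probs) / len(log2_probs))

# probabilities 0.25 and 0.5 -> mean log2 p = -1.5 -> PPL = 2 ** 1.5 ≈ 2.83
print(perplexity([math.log(0.25, 2), math.log(0.5, 2)]))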
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--nsamples", type=int, default=1) parser.add_argument("--batch_size", type=int, default=-1) parser.add_argument("--length", type=int, default=randrange(50, 150, 1)) parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--top_k", type=int, default=5) parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.') args = parser.parse_args() print(args) if args.batch_size == -1: args.batch_size = 1 assert args.nsamples % args.batch_size == 0 np.random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") tokenizer = GPT2Tokenizer.from_pretrained("gpt2") model = GPT2LMHeadModel.from_pretrained("gpt2") model.to(device) if args.length == -1: args.length = model.config.n_ctx // 2 elif args.length > model.config.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx) app.run("0.0.0.0", port=int(os.environ.get("PORT", 5000)))
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl", help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")  # was type=int, which would reject the 0.7 default
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # While using SQUASH in the pipeline mode, prefer using the --key flag
    parser.add_argument("--key", type=str, default=None,
                        help="Override the default settings if the key is set, used in pipeline mode")
    args = parser.parse_args()

    if args.key is not None:
        # Override the filename and top_p default settings if args.key is set
        # This is done when the question generation module is being used in the SQUASH pipeline mode
        args.filename = "squash/temp/%s/input.pkl" % args.key
        with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
            metadata = json.loads(f.read())
        args.top_p = metadata["settings"]["top_p"]

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    data = get_positional_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {"version": "squash-2.0", "data": [{"paragraphs": []}]}
    question_number = 0

    para_cache = {"index": None, "hidden_states": None}

    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            para_index = inst["para_index"]
            # Questions from the same paragraph all appear together
            # We can re-use the paragraph hidden representations for different questions in the same paragraph
            if para_index != para_cache["index"]:
                # Since we have moved to a new paragraph, generate its cache
                para_cache["hidden_states"] = None
                # Ignore the answer and question while building the input
                instance, _ = build_para_only_input_from_segments(inst, tokenizer)
                input_ids = torch.tensor(instance['input_ids'], device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'], device=args.device).unsqueeze(0)
                # Run a forward pass to generate the para caches
                _, para_cache["hidden_states"] = model(input_ids, token_type_ids=token_type_ids)

            # Sample a question using the paragraph cache
            output = sample_sequence(inst, tokenizer, model, args, para_cache)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'], skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'], skip_special_tokens=True)
        para_index = inst['para_index']
        para_cache["index"] = inst['para_index']

        # verify whether the answer position is correct, since this will be utilized for filtering
        original_ans_position = output["answer_position"]
        if original_paragraph[output["answer_position"]:output["answer_position"] + len(original_answer)] != original_answer:
            # This should never be executed, only used as a last resort
            logger.info("Answer mismatch!")
            original_ans_position = original_paragraph.index(original_answer)

        # Output in a SQuAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0]["paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index]['qas'].append({
                'id': 'question_%d' % question_number,
                'question': generated_question,
                'answers': [{
                    'text': original_answer,
                    'answer_start': original_ans_position,
                }],
                'class': output['class'],
                'algorithm': output['algorithm'],
                'is_impossible': False
            })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })
        question_number += 1

    with open("squash/temp/%s/generated_questions.json" % args.key, "w") as f:
        f.write(json.dumps(final_output_dict))
def __init__(self):
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.model = GPT2LMHeadModel.from_pretrained('gpt2')
    self.model.eval()
                              size_hint=(1, 1.5))
        self.window.add_widget(self.user)

        # button widget
        self.button = Button(text="Generate !",
                             size_hint=(1, 0.5),
                             bold=True,
                             background_color='32A67F',
                             background_normal='')
        self.button.bind(on_press=self.callback)
        self.window.add_widget(self.button)
        self.window.add_widget(self.m_output0)
        self.window.add_widget(self.m_output1)
        # self.window.add_widget(self.m_output2)

        return self.window

    def callback(self, instance):
        self.m_output1.text, _ = Gen_new(self.user.text, 20)


if __name__ == "__main__":
    # Configuration
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    translator = Translator()
    SayHello().run()