def run_pplm_example(pretrained_model="gpt2-medium",
                     cond_text="",
                     uncond=False,
                     num_samples=1,
                     bag_of_words=None,
                     discrim=None,
                     discrim_weights=None,
                     discrim_meta=None,
                     class_label=-1,
                     length=100,
                     stepsize=0.02,
                     temperature=1.0,
                     top_k=10,
                     sample=False,
                     num_iterations=3,
                     grad_length=10000,
                     horizon_length=1,
                     window_length=0,
                     decay=False,
                     gamma=1.5,
                     gm_scale=0.9,
                     kl_scale=0.01,
                     seed=0,
                     colorama=False):
    # set the random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
        print("discrim = {}, pretrained_model set to discriminator's = {}".format(
            discrim, pretrained_model))

    # load the pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out the conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
    else:
        raw_text = cond_text
        while not raw_text:
            # print("Did you forget to add `--cond_text`?")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)

    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
    )

    # collect bag-of-words token ids for highlighting
    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
        for single_bow_list in bow_indices:
            # filter out words that tokenize into more than one token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] is safe because the previous filter guarantees w has exactly one item
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize the perturbed text, highlighting bag-of-words tokens
            if colorama:
                import colorama
                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])
        except Exception:
            pass

    return pert_gen_text.replace('<|endoftext|>', '')
def run_model(batch_size, learning_rate, n_ctx, n_head, n_embd, n_layer,
              adaptive, bpe, masked_lm, classification, bpe_model_path,
              datasets, lm_corpus_file, pos_tags, dict_path, rnn, crf,
              config_id):
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=2019)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=batch_size)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--lr', type=float, default=learning_rate)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--initializer_range", type=float, default=0.02)
    parser.add_argument("--layer_norm_epsilon", type=float, default=1e-6)
    parser.add_argument("--n_ctx", type=int, default=n_ctx)
    parser.add_argument("--n_positions", type=int, default=n_ctx)
    parser.add_argument("--n_embd", type=int, default=n_embd)
    parser.add_argument("--n_head", type=int, default=n_head)
    parser.add_argument("--n_layer", type=int, default=n_layer)
    parser.add_argument("--max_vocab_size", type=int, default=0,
                        help='Zero means no limit.')
    parser.add_argument('--max_step', type=int, default=100000,
                        help='upper epoch limit')
    parser.add_argument('--eta_min', type=float, default=0.0,
                        help='min learning rate for cosine scheduler')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--kw_cut', type=int, default=10,
                        help='Precision and recall @k cutoff')
    parser.add_argument("--num_epoch", type=int, default=10)
    parser.add_argument('--data_path', type=str, default='data')
    parser.add_argument('--result_path', type=str, default='gpt_results_final.txt')
    parser.add_argument('--adaptive', action='store_true',
                        help='If true, use adaptive softmax.')
    parser.add_argument('--bpe', action='store_true',
                        help='If true, use byte pair encoding.')
    parser.add_argument('--masked_lm', action='store_true',
                        help='If true, use a masked language model objective for '
                             'pretraining instead of a regular language model.')
    parser.add_argument('--transfer_learning', action='store_true',
                        help='If true, use a pretrained language model.')
    parser.add_argument('--POS_tags', action='store_true', help='POS tags')
    parser.add_argument('--classification', action='store_true',
                        help='If true, train a classifier.')
    parser.add_argument('--rnn', action='store_true',
                        help='If true, use an RNN with attention in the '
                             'classification head.')
    parser.add_argument('--crf', action='store_true',
                        help='If true, use a CRF instead of the custom loss '
                             'function in the classification head.')
    parser.add_argument('--bpe_model_path', type=str, default=bpe_model_path)
    parser.add_argument('--datasets', type=str, default=datasets)
    parser.add_argument('--lm_corpus_file', type=str, default=lm_corpus_file)
    parser.add_argument('--trained_language_models_dir', type=str,
                        default='trained_language_models')
    parser.add_argument('--trained_classification_models_dir', type=str,
                        default='trained_classification_models')
    parser.add_argument('--dict_path', type=str, default=dict_path,
                        help='Path to dictionary')
    parser.add_argument('--lang', type=str, default='english',
                        help='Language of the corpus')
    parser.add_argument('--config_id', type=str, default=config_id,
                        help='Used to connect trained language models with '
                             'classification models')
    parser.add_argument('--cuda', action='store_false', help='If true, use gpu.')
    args = parser.parse_args()

    args.adaptive = adaptive
    args.classification = classification
    args.transfer_learning = True
    args.POS_tags = pos_tags
    args.bpe = bpe
    args.masked_lm = masked_lm
    args.rnn = rnn
    args.crf = crf
    args.cuda = True

    if not os.path.exists(args.trained_classification_models_dir):
        os.makedirs(args.trained_classification_models_dir)
    if not os.path.exists(args.trained_language_models_dir):
        os.makedirs(args.trained_language_models_dir)

    if args.bpe:
        sp = GPT2Tokenizer.from_pretrained("gpt2")
    else:
        sp = None

    # CRF and RNN classification heads are mutually exclusive
    if args.crf:
        assert not args.rnn
    if args.rnn:
        assert not args.crf

    print(args)

    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not args.classification:
        df_data = file_to_df(os.path.join(args.data_path, args.lm_corpus_file),
                             classification=False)
        df_data = df_data.sample(frac=1, random_state=2019)
        val_idx = int(0.8 * df_data.shape[0])
        test_idx = int(0.9 * df_data.shape[0])
        df_train = df_data[:val_idx]
        df_valid = df_data[val_idx:test_idx]
        df_test = df_data[test_idx:]

        print('------------------------------------------------------------------------------------------------------')
        print('Training language model on all data')
        print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
              "Test size: ", df_test.shape)
        print('------------------------------------------------------------------------------------------------------')
        print()

        train_test(df_train, df_valid, df_test, args, stemmer, sp)
    else:
        result_file = open(args.result_path, 'a', encoding='utf8')
        result_file.write("Classification results for config " + config_id + ":\n\n")
        result_file.write("Parameters:\n")
        result_file.write(str(args) + '\n------------------------------------------------\n')

        for folder in args.datasets.split(';'):
            print('------------------------------------------------------------------------------------------------------')
            print('Training on: ', folder)
            print('------------------------------------------------------------------------------------------------------')

            if folder == 'duc' or folder == 'nus':
                # cross validation
                kf = model_selection.KFold(n_splits=10)
                df_data = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)
                df_data = df_data.sample(frac=1, random_state=2019)
                print()
                print('Cross validation on', folder)

                fold_counter = 0
                total_pred = []
                total_true = []
                for train_index, test_index in kf.split(df_data):
                    fold_counter += 1
                    df_train, df_test = df_data.iloc[train_index], df_data.iloc[test_index]
                    sep_idx = int(df_train.shape[0] / 10)
                    df_valid = df_train[:sep_idx]
                    df_train = df_train[sep_idx:]
                    print("Train fold ", fold_counter, "fold size: ", df_train.shape,
                          "Valid fold size: ", df_valid.shape,
                          "Test fold size: ", df_test.shape)
                    print()

                    fold_pred, fold_true, num_parameters = train_test(
                        df_train, df_valid, df_test, args, stemmer, sp, folder)
                    total_pred.extend(fold_pred)
                    total_true.extend(fold_true)

                print()
                print('--------------------------------------------------------------------')
                print('Final CV results:')
                print()
            else:
                df_train = file_to_df(os.path.join(args.data_path, folder,
                                                   folder + '_valid.json'),
                                      classification=True)
                df_train = df_train.sample(frac=1, random_state=2019)
                val_idx = int(0.8 * df_train.shape[0])
                df_valid = df_train[val_idx:]
                df_train = df_train[:val_idx]
                df_test = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)
                print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
                      "Test size: ", df_test.shape)
                print()
                total_pred, total_true, num_parameters = train_test(
                    df_train, df_valid, df_test, args, stemmer, sp, folder)

            p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                total_pred, total_true, lang=args.lang)

            result_file.write("Dataset: " + folder + '\n')
            result_file.write('Precision@5: ' + str(p_5) + ' Recall@5: ' + str(r_5) +
                              ' F1@5: ' + str(f_5) + '\n')
            result_file.write('Precision@10: ' + str(p_10) + ' Recall@10: ' + str(r_10) +
                              ' F1@10: ' + str(f_10) + '\n')
            result_file.write('Precision@k: ' + str(p_k) + ' Recall@k: ' + str(r_k) +
                              ' F1@k: ' + str(f_k) + '\n')
            result_file.write('Precision@M: ' + str(p_M) + ' Recall@M: ' + str(r_M) +
                              ' F1@M: ' + str(f_M) + '\n')
            result_file.write('Num. trainable parameters: ' + str(num_parameters) + '\n')

            outputs = []
            for pred, true in zip(total_pred, total_true):
                pred = ";".join(list(pred))
                true = ";".join(list(true))
                outputs.append((pred, true))

            df_preds = pd.DataFrame(outputs, columns=['Predicted', 'True'])
            df_preds.to_csv('predictions/' + folder + '.csv', sep=',', encoding='utf8')

        result_file.write("\n-----------------------------------------------------------\n")
        result_file.write("\n-----------------------End of the run----------------------\n")
        result_file.write("\n-----------------------------------------------------------\n")
        result_file.close()
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path', type=str,
                        help='pretrained model path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.95)
    parser.add_argument('--top_p', type=float, default=0.95)
    parser.add_argument('--top_k', type=int, default=100)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--out-dir', type=str, default='out')
    parser.add_argument('--data_type', type=str, default='t1',
                        choices=['t' + str(i) for i in range(9)], help="t: type")
    parser.add_argument('--model_type', type=str, default='cvae',
                        choices=['cvae', 'ae_vae_fusion'])
    parser.add_argument('--dataset', type=str, default='wi',
                        choices=['wp', 'wi'], help="Dataset to use for training")

    # use GPU
    parser.add_argument('--gpu', default=2, type=int)
    parser.add_argument('--no_gpu', action="store_true")

    parser.add_argument('--add_input', action="store_true")
    parser.add_argument('--add_attn', action="store_true")
    parser.add_argument('--add_softmax', action="store_true")
    parser.add_argument('--attn_proj_vary', action="store_true")
    parser.add_argument('--learn_prior', action="store_true")

    args = parser.parse_args(
        '--model-path out/wi.1.proj_vary_cyc_cvae/model_0030000.pt '
        '--add_input --learn_prior '.split())
    print(args)

    if args.model_type == 'cvae':
        args.learn_prior = True
    else:
        args.learn_prior = False

    # GPU
    if not torch.cuda.is_available():
        args.no_gpu = True
    gpu = not args.no_gpu
    if gpu:
        torch.cuda.set_device(args.gpu)
    device = torch.device(args.gpu if gpu else "cpu")

    # randomness
    np.random.seed(args.seed)
    prng = np.random.RandomState()
    torch.random.manual_seed(args.seed)
    if gpu:
        torch.cuda.manual_seed(args.seed)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # logging
    save_folder = args.model_path + '.eval/'
    os.makedirs(save_folder, exist_ok=True)
    importlib.reload(logging)
    logging.basicConfig(filename=os.path.join(save_folder, 'eval.log'),
                        level=logging.INFO,
                        format='%(asctime)s--- %(message)s')
    logging.info('\n----------------------------------------------------------------------')

    print('Loading models...')
    cache_dir = os.path.join(args.out_dir, 'model_cache')
    os.makedirs(cache_dir, exist_ok=True)

    # Load pre-trained teacher tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    tokenizer.max_len = int(1e12)
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir)
    print('gpt2_params:', num_params(gpt2_model))  # gpt2: 124439808
    config = GPT2Config()

    # # add special tokens
    # special_tokens_dict = {
    #     'pad_token': '<|startoftext|>',
    #     'cls_token': '<|startofcond|>',
    #     'sep_token': '<|sepofcond|>',
    #     'mask_token': '<|endofcond|>'
    # }
    # num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    # print('We have added', num_added_toks, 'special tokens')
    # # Notice: resize_token_embeddings expects to receive the full size of the new vocab
    # gpt2_model.resize_token_embeddings(len(tokenizer))
    # assert tokenizer.pad_token == '<|startoftext|>'

    VAE = VAEModel(config,
                   add_input=args.add_input,
                   add_attn=args.add_attn,
                   add_softmax=args.add_softmax,
                   attn_proj_vary=args.attn_proj_vary,
                   learn_prior=args.learn_prior)
    init_para_frompretrained(VAE.transformer, gpt2_model.transformer, share_para=True)
    init_para_frompretrained(VAE.encoder, gpt2_model.transformer, share_para=False)
    if args.learn_prior:
        init_para_frompretrained(VAE.encoder_prior, VAE.encoder, share_para=True)
        VAE.encoder_prior.averageSelfAttention.attention_weights = \
            VAE.encoder.averageSelfAttention.attention_weights
    VAE.lm_head.weight = gpt2_model.lm_head.weight
    if VAE.add_softmax:
        VAE.lm_head_rep = Conv1D(*gpt2_model.lm_head.weight.size())
        # VAE.lm_head_rep = LM_head_rep(*gpt2_model.lm_head.weight.size()[::-1])
    print('VAE_params:', num_params(VAE))  # 286694400

    args.load = args.model_path
    if args.load:
        print('Loading model weights...')
        state = torch.load(os.path.join(args.load), map_location='cpu')
        if 'module' in list(state.keys())[0]:
            # model_path points to a DataParallel checkpoint whose keys carry a 'module.' prefix
            state_copy = copy.copy(state)
            keys = state_copy.keys()
            for k in keys:
                state[k.replace('module.', '')] = state.pop(k)
        VAE.load_state_dict(state)
        gc.collect()
    print('Model loaded.')

    print('Setup data...')
    seq_len = VAE.config.n_ctx
    test_loader = prepare_dataset(args.data_dir, args.dataset, tokenizer,
                                  1, seq_len, 1, seq_len, args.batch_size, seq_len,
                                  make_train=False, make_val=False, make_test=True,
                                  data_type=args.data_type)[0]
    print('Done.')

    VAE.eval()  # be careful about VAE.eval() vs VAE.train()
    VAE.to(device)
    loss_fn = nn.CrossEntropyLoss(reduction='none')

    logging.info('\n----------------------------------------------------------------------')
    logging.info("Testing loop. batches: %d" % len(test_loader))

    endoftext = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    startofcond = tokenizer.convert_tokens_to_ids("<|startofcond|>")
    endofcond = tokenizer.convert_tokens_to_ids("<|endofcond|>")

    n_samples = 0
    bleu4_sum = 0.0
    rouge_scores_values_sum = [0.0] * 9
    model_type = args.model_type

    # test_iter = iter(test_loader); x_mask, x_tokens, y_mask, y_tokens, input_tokens, target_tokens, mask = next(test_iter)
    with tqdm(total=len(test_loader)) as pbar:
        for i_test, (x_mask, x_tokens, y_mask, y_tokens,
                     input_tokens, target_tokens, mask) in enumerate(test_loader):
            length = args.length
            if length == -1:
                length = VAE.config.n_ctx - 1
            elif length > VAE.config.n_ctx - 1:
                raise ValueError("Can't get samples longer than window size: %s"
                                 % VAE.config.n_ctx)

            eff_samples = []
            n, l = target_tokens.size()
            storys = [tokenizer.decode(target_tokens[i, :]) for i in range(n)]
            storys_str = [
                s[:s.find("<|endoftext|>") + len("<|endoftext|>")]
                if "<|endoftext|>" in s else s for s in storys
            ]

            for _ in range(args.nsamples // args.batch_size):
                # model, batch_size, temperature, top_k, top_p, eos_token, sample = VAE, args.batch_size, args.temperature, args.top_k, args.top_p, tokenizer.encoder['<|endoftext|>'], True
                out, _ = sample_sequence(
                    model=VAE,
                    tokenizer=tokenizer,
                    length=length,
                    batch_size=args.batch_size,
                    x_mask=x_mask,
                    x_tokens=x_tokens,
                    y_mask=y_mask,
                    y_tokens=y_tokens,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    device=device,
                    eos_token=tokenizer.encoder['<|endoftext|>'],
                    model_type=model_type)
                out = out.tolist()

                # extract the story and compute metrics
                for i in range(len(out)):
                    text = out[i]
                    text = text[text.index(endoftext) + 1:]
                    if endoftext in text:
                        idx = text.index(endoftext)
                        text = text[:idx]
                    text = tokenizer.decode(text).strip()

                    # score for one long text; higher than 0.075 usually means repetition
                    # rep_score = repeat_score(text.split(), ngram=[3, 4, 5, 6, 7, 8])
                    # if rep_score > 0.075:
                    #     continue

                    try:
                        # check BLEU
                        bleu4 = sentence_bleu(
                            [storys_str[i].split()], text,
                            smoothing_function=SmoothingFunction().method7)
                        # check ROUGE
                        rouge = Rouge()
                        rouge_scores = rouge.get_scores(text, storys_str[i])
                        rouge_scores_values = [
                            v for k in rouge_scores[0].keys()
                            for v in rouge_scores[0][k].values()
                        ]
                        bleu4_sum += bleu4
                        rouge_scores_values_sum = [
                            v1 + v2 for v1, v2 in zip(rouge_scores_values_sum,
                                                      rouge_scores_values)
                        ]
                        n_samples += 1
                    except Exception:
                        bleu4 = 0.0
                        rouge_scores = [{
                            'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                            'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                            'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}
                        }]

                    eff_samples.append((text, bleu4, rouge_scores))

            # write samples to file
            samples_file = open(save_folder + 'batch-' + '%04d' % i_test + '.txt',
                                'w', encoding='utf8')
            for i in range(len(eff_samples)):
                samples_file.write("=" * 50 + " SAMPLE " + str(i) + " " + "=" * 50)
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Outlines " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(
                    tokenizer.decode(x_tokens[i, :][x_mask[i, :] == 1].tolist()))
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Story " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(storys_str[i])
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Generated " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(eff_samples[i][0])
                samples_file.write('\n' * 4)
                samples_file.flush()

            logging.info('batch %04d finished.', i_test)
            pbar.update(1)

    print('Test complete with %05d samples.' % n_samples)
    logging.info("Test complete with %05d samples.", n_samples)

    bleu4 = round(bleu4_sum / n_samples, 3)
    rouge_scores_values = [round(r / n_samples, 3) for r in rouge_scores_values_sum]
    print(' bleu-4:', bleu4)
    print(' rouge :', rouge_scores_values)
    logging.info(' bleu-4: %f', bleu4)
    logging.info(' rouge : %s', str(rouge_scores_values))
iteration = int(sys.argv[5])

# Tell pytorch to run this model on the GPU.
if use_gpu:
    device = torch.device('cuda:' + str(gpu_id))
    secondary_device = torch.device('cuda:' + str(secondary_gpu_id))
else:
    device = torch.device("cpu")
    secondary_device = torch.device("cpu")

df = pickle.load(open(pkl_dump_dir + "df_fine.pkl", "rb"))
parent_to_child = pickle.load(open(pkl_dump_dir + "parent_to_child.pkl", "rb"))
fine_labels = list(set(df.label.values))

coarse_tokenizer = GPT2Tokenizer.from_pretrained(coarse_tok_path, do_lower_case=True)
coarse_model = torch.load(model_path + model_name, map_location=device)
coarse_model.to(secondary_device)

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

all_true = []
all_preds = []

for p in [parent_label]:
    print("Training coarse label:", p)
random.shuffle(model_files)
model_files = model_files[:args.max_sample_num]
print(f"total len of files: {len(model_files)}")

entropies = []
max_probs = []
print(args.spec_name)

if 'pegasus' in args.model_name:
    from transformers import PegasusTokenizer
    bpe_tokenizer = PegasusTokenizer.from_pretrained(args.model_name)
    EOS_TOK_IDs = [106, bpe_tokenizer.eos_token_id, 2]  # <n>
elif 'gpt' in args.model_name:
    from transformers import GPT2Tokenizer
    bpe_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
elif 'bart' in args.model_name:
    from transformers import BartTokenizer
    bpe_tokenizer = BartTokenizer.from_pretrained(args.model_name)
    EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
else:
    raise NotImplementedError

try:
    outputs = []
    outputs_pos_entropy = []
    for f in model_files:
        with open(os.path.join(args.cur_dir, f), 'rb') as fd:
            data = pickle.load(fd)
        print(f"Finish loading {f}")
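# The snippet above accumulates `entropies` and `max_probs` from pickled model
# outputs. As a minimal sketch of what those quantities usually are (assuming the
# pickled `data` holds per-step logits, which the excerpt does not show):
import torch
import torch.nn.functional as F

def token_entropy_and_max_prob(logits: torch.Tensor):
    """Per-step predictive entropy and max probability from (seq_len, vocab) logits."""
    probs = F.softmax(logits, dim=-1)
    log_probs = F.log_softmax(logits, dim=-1)
    entropy = -(probs * log_probs).sum(dim=-1)   # (seq_len,)
    max_prob = probs.max(dim=-1).values          # (seq_len,)
    return entropy, max_prob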
def __init__(self, model_path='gpt2', tokenizer_path='gpt2'):
    self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    self.tokenizer.pad_token = '<|endoftext|>'
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
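# A minimal, self-contained usage sketch for a wrapper like the one above (only the
# __init__ appears in the source, so the class name `Generator` is hypothetical):
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class Generator:
    def __init__(self, model_path='gpt2', tokenizer_path='gpt2'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
        self.tokenizer.pad_token = '<|endoftext|>'
        self.model = GPT2LMHeadModel.from_pretrained(model_path)

gen = Generator()
batch = gen.tokenizer(['Hello world', 'GPT-2 is'], return_tensors='pt', padding=True)
out = gen.model.generate(batch['input_ids'],
                         attention_mask=batch['attention_mask'],
                         max_length=20,
                         pad_token_id=gen.tokenizer.pad_token_id)
print(gen.tokenizer.batch_decode(out, skip_special_tokens=True))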
ap.add_argument('--k', type=int, default=1)
ap.add_argument('--layer', type=int, default=-1)
ap.add_argument('--out_dir', type=str, default='results')
args = ap.parse_args()

algo = args.algo
k = args.k
layer = args.layer
out_dir = args.out_dir
model_type = args.model_type

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

tokenizer = GPT2Tokenizer.from_pretrained(model_type)
model = Model(device='cuda')
DEVICE = 'cuda'

templates = get_template_list()

if args.algo == 'topk':
    marg_contrib_path = out_dir + "/marg_contrib.pickle"
    if os.path.exists(marg_contrib_path):
        print('Using cached marginal contribution')
        marg_contrib = pickle.load(open(marg_contrib_path, "rb"))
        layer_list = marg_contrib['layer']
        neuron_list = marg_contrib['neuron']
    else:
        print('Computing marginal contribution')
        layer_list, neuron_list = get_all_contrib(templates, tokenizer,
def get_tokenizer(self, **kwargs):
    kwargs.update(self.special_tokens_map)
    return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
def main():
    parser = get_parser()
    args = parser.parse_args()

    if not args.model_name:
        args.model_name = args.model_path

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be larger than the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Set device
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.DEBUG if args.debug else logging.INFO,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    config = GPT2Config.from_pretrained(
        args.config_name if args.config_name else args.model_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = GPT2Tokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer.add_tokens(['question:', ':question'])
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.sep_token = tokenizer.eos_token
    tokenizer.encode = partial(tokenizer.encode, is_pretokenized=True, truncation=True)
    tokenizer.encode_plus = partial(tokenizer.encode_plus, is_pretokenized=True, truncation=True)

    model = GPT2LMHeadModel.from_pretrained(
        args.model_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution
    # of torch.einsum if args.fp16 is set. Otherwise it'll default to "promote" mode,
    # and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    train_dataset = load_and_cache_examples(args, tokenizer, 'quest_gen', evaluate=False, gpt=True)
    train_dataset = preprocess_dataset(train_dataset, tokenizer)
    dev_dataset = load_and_cache_examples(args, tokenizer, 'quest_gen', evaluate=True, gpt=True)
    dev_dataset = preprocess_dataset(dev_dataset, tokenizer)
    train(args, train_dataset, dev_dataset, model, tokenizer)
    logging.info('Finished training!')

    # Save the trained model, configuration, and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    # Good practice: save your training arguments together with the trained model.
    logger.info("Saving final model checkpoint to %s", args.output_dir)
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
def __init__(self, device='cuda'):
    self.model_name = 'sberbank-ai/rugpt3medium_based_on_gpt2'
    self.model_type = 'gpt2'
    self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
    self.model = GPT2LMHeadModel.from_pretrained(self.model_name)
    self.model.to(device)
import re

import pymorphy2
import spacy
import xx_ent_wiki_sm
from deeppavlov import build_model, configs

morph = pymorphy2.MorphAnalyzer()
nlp = xx_ent_wiki_sm.load()
nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
syntax_model = build_model(configs.syntax.syntax_ru_syntagrus_bert, download=True)

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

tokenizer = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3large_based_on_gpt2")
model = GPT2LMHeadModel.from_pretrained("sberbank-ai/rugpt3large_based_on_gpt2")

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
print("Success!")

# first- and second-person Russian pronouns, in all cases
russian_restricted_pronouns = ("я мной меня мною мне мы нас нам нами "
                               "ты тебя тебе тобою тобой вы вас вам вами").split()
extra_marks = re.compile(r"&[a-zA-Z0-9;]+")

expanding_startings = [
    "В то же время",  # "At the same time"
def test_fused_upper_triangle_mask_softmax():
    from megatron.model.gpt2_model import (
        gpt2_attention_mask_func as attention_mask_func,
    )
    from megatron.model.fused_softmax import FusedScaleMaskSoftmax, SoftmaxFusionTypes

    gpt = GPT2Model.from_pretrained("gpt2").cuda().half()
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    test_text = (
        "Hello. How are you? I am fine thank you and you? yes Good. "
        "hi hi hi hi hi hi hi"  # 24
    )
    tokens = tokenizer(
        [test_text] * 4,
        return_tensors="pt",
    )

    attention_mask = tokens["attention_mask"].cuda()
    attention_mask = attention_mask.view(attention_mask.size(0), -1)
    attention_mask = attention_mask[:, None, None, :]
    attention_mask = (1.0 - attention_mask) * -10000.0
    attention_mask = attention_mask.repeat(1, 1, attention_mask.size()[-1], 1)

    attn = gpt.h[0]
    hidden_states = gpt.wte(tokens["input_ids"].cuda())
    q, k, v = attn.attn.c_attn(hidden_states).split(768, dim=-1)
    q = attn.attn._split_heads(q, attn.attn.num_heads, attn.attn.head_dim)
    k = attn.attn._split_heads(k, attn.attn.num_heads, attn.attn.head_dim)
    attn_weights = torch.matmul(q, k.transpose(-1, -2))

    sq, sk = q.size(-2), k.size(-2)
    causal_mask = attn.attn.bias[:, :, sk - sq:sk, :sk].bool()
    total_mask = ~(causal_mask & (attention_mask == 0))
    """
    tensor([[[[False,  True,  True,  ...,  True,  True,  True],
              [False, False,  True,  ...,  True,  True,  True],
              [False, False, False,  ...,  True,  True,  True],
              ...,
              [False, False, False,  ..., False,  True,  True],
              [False, False, False,  ..., False, False,  True],
              [False, False, False,  ..., False, False, False]]]
    """

    fused_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        mask_func=attention_mask_func,
        fusion_type=SoftmaxFusionTypes.upper_triang,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())
    fused_softmax_output = fused_softmax(
        attn_weights,
        total_mask,
    )

    torch_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        fusion_type=SoftmaxFusionTypes.none,
        mask_func=attention_mask_func,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())
    torch_softmax_output = torch_softmax(
        attn_weights,
        total_mask,
    )

    test_result = (fused_softmax_output - torch_softmax_output).abs()
    while test_result.dim() != 1:
        test_result = test_result.mean(dim=-1)
    diff = test_result.mean(dim=-1)

    if diff <= 1e-3:
        print(
            f"\n[Success] test_fused_upper_triangle_mask_softmax"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}"
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )
    else:
        print(
            f"\n[Fail] test_fused_upper_triangle_mask_softmax"
            f"\n > mean_difference={diff}, "
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, "
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<PAD>')
# IMPORTANT: Note that setting the <PAD> token like this in the constructor gives the
# pad_token the pad_token_id = 50256, which normally belongs to the <BOS> token in GPT2.
# This is a very ugly way, which works at the moment, of setting the pad_token_id to the
# <BOS> token that is already included in the vocab size. This will be updated in the coming weeks!  # noqa: E501

prompt_text = [
    'in this paper we',
    'we are trying to',
    'The purpose of this workshop is to check whether we can',
]

# batch_encode_plus handles multiple sequences and automatically creates attention masks
seq_len = 11
encodings_dict = tokenizer.batch_encode_plus(prompt_text,
                                             max_length=seq_len,
                                             pad_to_max_length=True)

# ideally we should be able to just pass the following two variables to
# model.generate() ... => to be implemented soon!  # noqa: E501
input_ids = torch.tensor(encodings_dict['input_ids'])
attn_mask = torch.tensor(encodings_dict['attention_mask'])

num_tokens_to_produce = 20
pad_token_id = tokenizer.pad_token_id
eos_token_id = tokenizer.eos_token_id
eos_not_in_sents = torch.ones(input_ids.shape[0]).long()

# we need the indices of the last non-padded values
last_non_masked_idx = torch.sum(attn_mask, dim=1) - 1
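# The original snippet ends before the decoding loop. The following is a minimal
# sketch (an assumption, not the source's continuation) of how such a manual
# batched greedy loop is commonly written on top of the variables defined above:
position_ids = torch.tensor([list(range(seq_len)) for _ in range(input_ids.shape[0])])
for i, idx in enumerate(last_non_masked_idx):
    # positions after the last real token just repeat that position
    position_ids[i, idx:] = idx

for step in range(num_tokens_to_produce):
    outputs = model(input_ids, attention_mask=attn_mask, position_ids=position_ids)
    # at the first step, read the logits at each sequence's last real token;
    # afterwards, read the logits of the newly appended final position
    if step == 0:
        next_token_logits = outputs[0][range(input_ids.shape[0]), last_non_masked_idx, :]
    else:
        next_token_logits = outputs[0][:, -1, :]
    next_tokens = torch.argmax(next_token_logits, dim=-1)

    # keep emitting pad tokens for sequences that already produced EOS
    eos_not_in_sents.mul_((next_tokens != eos_token_id).long())
    tokens_to_add = next_tokens * eos_not_in_sents + pad_token_id * (1 - eos_not_in_sents)

    input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)
    attn_mask = torch.cat([attn_mask, torch.ones((attn_mask.shape[0], 1)).long()], dim=1)
    position_ids = torch.cat([position_ids, (position_ids[:, -1] + 1).unsqueeze(-1)], dim=1)

print([tokenizer.decode(ids) for ids in input_ids])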
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path', type=str,
                        help='pretrained model path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=8)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.95)
    parser.add_argument('--top_p', type=float, default=0.95)
    parser.add_argument('--top_k', type=int, default=100)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--out-dir', type=str, default='out')
    parser.add_argument('--model_type', type=str, default='m',
                        choices=['b0', 'b1', 'm'], help="b: baseline, m: model")
    parser.add_argument('--dataset', type=str, default='wp',
                        choices=['wp', 'wi'], help="Dataset to use for training")

    # use GPU
    parser.add_argument('--gpu', default=0, type=int)
    parser.add_argument('--no_gpu', action="store_true")

    args = parser.parse_args('--model-path out/wp4.0223/model_latest.pt'.split())
    print(args)

    # GPU
    if not torch.cuda.is_available():
        args.no_gpu = True
    gpu = not args.no_gpu
    if gpu:
        torch.cuda.set_device(args.gpu)
    device = torch.device(args.gpu if gpu else "cpu")

    # randomness
    np.random.seed(args.seed)
    prng = np.random.RandomState()
    torch.random.manual_seed(args.seed)
    if gpu:
        torch.cuda.manual_seed(args.seed)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # logging
    save_folder = args.model_path + '.eval/'
    os.makedirs(save_folder, exist_ok=True)
    importlib.reload(logging)
    logging.basicConfig(filename=os.path.join(save_folder, 'eval.log'),
                        level=logging.INFO,
                        format='%(asctime)s--- %(message)s')
    logging.info('\n----------------------------------------------------------------------')
    # logging.info("the configuration:")
    # logging.info(str(args).replace(',', '\n'))

    print('Loading models...')
    cache_dir = os.path.join(args.out_dir, 'model_cache')
    os.makedirs(cache_dir, exist_ok=True)

    # Load pre-trained teacher tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    tokenizer.max_len = int(1e12)
    model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir)

    # add special tokens
    special_tokens_dict = {
        'pad_token': '<|startoftext|>',
        'cls_token': '<|startofcond|>',
        'sep_token': '<|sepofcond|>',
        'mask_token': '<|endofcond|>'
    }
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'special tokens')
    # Notice: resize_token_embeddings expects to receive the full size of the new vocab
    model.resize_token_embeddings(len(tokenizer))
    assert tokenizer.pad_token == '<|startoftext|>'

    if args.model_path:
        state = torch.load(args.model_path, map_location='cpu')
        if 'module' in list(state.keys())[0]:
            # model_path points to a DataParallel checkpoint whose keys carry a 'module.' prefix
            state_copy = copy.copy(state)
            keys = state_copy.keys()
            for k in keys:
                state[k.replace('module.', '')] = state.pop(k)
        model.load_state_dict(state)
        logging.info('load model from ' + args.model_path)

    model.to(device)
    model.eval()
    print('Model loaded.')

    seq_len = model.config.n_ctx
    test_loader = prepare_dataset(args.data_dir, args.dataset, tokenizer,
                                  1, seq_len, 1, seq_len, args.batch_size, seq_len,
                                  make_train=False, make_val=False, make_test=True,
                                  model_type=args.model_type)[0]

    logging.info('\n----------------------------------------------------------------------')
    logging.info("Testing loop. batches: %d" % len(test_loader))

    endoftext = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    startofcond = tokenizer.convert_tokens_to_ids("<|startofcond|>")
    endofcond = tokenizer.convert_tokens_to_ids("<|endofcond|>")

    n_samples = 0
    bleu4_sum = 0.0
    rouge_scores_values_sum = [0.0] * 9

    with tqdm(total=len(test_loader)) as pbar:
        for i_test, (context, context_mask, keys, storys) in enumerate(test_loader):
            # test_iter = iter(test_loader); context, context_mask, keys, storys = next(test_iter)
            if all([len(key) == 0 for key in keys]):
                keys = None

            length = args.length
            if length == -1:
                length = model.config.n_ctx - context.size(1)
            elif length > model.config.n_ctx - context.size(1):
                raise ValueError("Can't get samples longer than window size: %s"
                                 % model.config.n_ctx)

            eff_samples = []
            # use '\n\n' as the paragraph separator
            storys_str = ['\n\n'.join([tokenizer.decode(s) for s in story])
                          for story in storys]

            for _ in range(args.nsamples // args.batch_size):
                # batch_size, temperature, top_k, top_p, eos_token, sample = args.batch_size, args.temperature, args.top_k, args.top_p, tokenizer.encoder['<|endoftext|>'], True
                out, _ = sample_sequence(
                    model=model,
                    tokenizer=tokenizer,
                    length=length,
                    batch_size=args.batch_size,
                    context=context,
                    context_mask=context_mask,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    device=device,
                    eos_token=tokenizer.encoder['<|endoftext|>'],
                    keys=keys
                )
                out = out.tolist()

                # just print
                # generated = 0
                # for i in range(args.batch_size):
                #     generated += 1
                #     text = tokenizer.decode(out[i])
                #     print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                #     print(text)

                # extract the story and compute metrics
                for i in range(len(out)):
                    text = out[i]
                    text = text[text.index(endoftext) + 1:]
                    if endoftext in text:
                        idx = text.index(endoftext)
                        text = text[:idx]

                    story_sample = []
                    while (startofcond in text and endofcond in text
                           and text.index(startofcond) < text.index(endofcond)):
                        idx = text.index(startofcond)
                        story_sample.append(text[:idx])
                        idx = text.index(endofcond)
                        text = text[idx + 1:]
                    if startofcond not in text and endofcond not in text:
                        story_sample.append(text)
                    text = '\n\n'.join([tokenizer.decode(s) for s in story_sample]).strip()

                    # score for one long text; higher than 0.075 usually means repetition
                    # rep_score = repeat_score(text.split(), ngram=[3, 4, 5, 6, 7, 8])
                    # if rep_score > 0.075:
                    #     continue

                    try:
                        # check BLEU
                        bleu4 = sentence_bleu(
                            [storys_str[i].split()], text,
                            smoothing_function=SmoothingFunction().method7)
                        # check ROUGE
                        rouge = Rouge()
                        rouge_scores = rouge.get_scores(text, storys_str[i])
                        rouge_scores_values = [v for k in rouge_scores[0].keys()
                                               for v in rouge_scores[0][k].values()]
                        bleu4_sum += bleu4
                        rouge_scores_values_sum = [
                            v1 + v2 for v1, v2 in zip(rouge_scores_values_sum,
                                                      rouge_scores_values)
                        ]
                        n_samples += 1
                    except Exception:
                        bleu4 = 0.0
                        rouge_scores = [{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}]

                    eff_samples.append((text, bleu4, rouge_scores))

            # write samples to file
            samples_file = open(save_folder + 'batch-' + '%04d' % i_test + '.txt',
                                'w', encoding='utf8')
            for i in range(len(eff_samples)):
                samples_file.write("=" * 50 + " SAMPLE " + str(i) + " " + "=" * 50)
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Outlines " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(
                    tokenizer.decode(context[i, :-1][context_mask[i, :] == 1].tolist()))
                if keys is not None:
                    samples_file.write('\n\n'.join([tokenizer.decode(s) for s in keys[i]]))
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Story " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(storys_str[i])
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Generated " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(eff_samples[i][0])
                samples_file.write('\n' * 2)
                samples_file.write(str(eff_samples[i][1:]))
                samples_file.write('\n' * 4)
                samples_file.flush()

            logging.info('batch %04d finished.', i_test)
            pbar.update(1)

    print('Test complete with %05d samples.' % n_samples)
    logging.info("Test complete with %05d samples.", n_samples)

    bleu4 = round(bleu4_sum / n_samples, 3)
    rouge_scores_values = [round(r / n_samples, 3) for r in rouge_scores_values_sum]
    print(' bleu-4:', bleu4)
    print(' rouge :', rouge_scores_values)
    logging.info(' bleu-4: %f', bleu4)
    logging.info(' rouge : %s', str(rouge_scores_values))
def initialize_model(config, d_out, is_featurizer=False):
    """
    Initializes models according to the config.

    Args:
        - config (dictionary): config dictionary
        - d_out (int): the dimensionality of the model output
        - is_featurizer (bool): whether to return a model or a
          (featurizer, classifier) pair that constitutes a model.

    Output:
        If is_featurizer=True:
        - featurizer: a model that outputs feature Tensors of shape
          (batch_size, ..., feature dimensionality)
        - classifier: a model that takes in feature Tensors and outputs
          predictions. In most cases, this is a linear layer.

        If is_featurizer=False:
        - model: a model that is equivalent to nn.Sequential(featurizer, classifier)

    Pretrained weights are loaded according to config.pretrained_model_path using
    either transformers.from_pretrained (for bert-based models) or our own
    utils.load function (for torchvision models, resnet18-ms, and gin-virtual).
    There is currently no support for loading pretrained weights from disk for
    other models.
    """
    if config.model in ('resnet18', 'resnet34', 'resnet50', 'resnet101',
                        'wideresnet50', 'densenet121'):
        if is_featurizer:
            featurizer = initialize_torchvision_model(
                name=config.model, d_out=None, **config.model_kwargs)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = initialize_torchvision_model(
                name=config.model, d_out=d_out, **config.model_kwargs)
    elif 'bert' in config.model:
        if is_featurizer:
            featurizer = initialize_bert_based_model(config, d_out, is_featurizer)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = initialize_bert_based_model(config, d_out)
    elif config.model == 'resnet18_ms':
        # multispectral resnet 18
        from models.resnet_multispectral import ResNet18
        if is_featurizer:
            featurizer = ResNet18(num_classes=None, **config.model_kwargs)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = ResNet18(num_classes=d_out, **config.model_kwargs)
    elif config.model == 'gin-virtual':
        from models.gnn import GINVirtual
        if is_featurizer:
            featurizer = GINVirtual(num_tasks=None, **config.model_kwargs)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = GINVirtual(num_tasks=d_out, **config.model_kwargs)
    elif config.model == 'code-gpt-py':
        from models.code_gpt import GPT2LMHeadLogit, GPT2FeaturizerLMHeadLogit
        from transformers import GPT2Tokenizer
        name = 'microsoft/CodeGPT-small-py'
        tokenizer = GPT2Tokenizer.from_pretrained(name)
        if is_featurizer:
            model = GPT2FeaturizerLMHeadLogit.from_pretrained(name)
            model.resize_token_embeddings(len(tokenizer))
            featurizer = model.transformer
            classifier = model.lm_head
            model = (featurizer, classifier)
        else:
            model = GPT2LMHeadLogit.from_pretrained(name)
            model.resize_token_embeddings(len(tokenizer))
    elif config.model == 'logistic_regression':
        assert not is_featurizer, "Featurizer not supported for logistic regression"
        model = nn.Linear(out_features=d_out, **config.model_kwargs)
    else:
        raise ValueError(f'Model: {config.model} not recognized.')

    # Load pretrained weights from disk using our utils.load function.
    # This has only been tested on some models (mostly vision), so run this code
    # only when we're sure it works. We've already loaded pretrained weights for
    # bert-based models using the transformers library.
    if (config.model not in ('code-gpt-py', 'logistic_regression', 'unet-seq', 'fasterrcnn')
            and 'bert' not in config.model):
        if config.pretrained_model_path and os.path.exists(config.pretrained_model_path):
            try:
                if type(model) is tuple:
                    # load both featurizer and classifier
                    prev_epoch, best_val_metric = load(
                        nn.Sequential(*model),
                        config.pretrained_model_path,
                        device=config.device)
                else:
                    prev_epoch, best_val_metric = load(
                        model, config.pretrained_model_path, device=config.device)
                print(
                    (f'Initialized model with pretrained weights from '
                     f'{config.pretrained_model_path} ')
                    + (f'previously trained for {prev_epoch} epochs ' if prev_epoch else '')
                    + (f'with previous val metric {best_val_metric} ' if best_val_metric else '')
                )
            except Exception:
                print('Something went wrong loading the pretrained model.')

    return model
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import torch
import logging

logging.getLogger().setLevel(logging.CRITICAL)

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

pt_model = 'gpt2'
print("Importing " + pt_model)
tokenizer = GPT2Tokenizer.from_pretrained(pt_model)
model = GPT2LMHeadModel.from_pretrained(pt_model)
model = model.to(device)
print(pt_model + " model imported")


# First select the top-n tokens from the probability list, then sample a random
# token ID from that renormalized n-word distribution.
def choose_from_top(probs, n=40):
    print("Selecting Word")
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # normalize
    choice = np.random.choice(n, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)
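# A minimal sketch (not part of the original snippet) of how choose_from_top would
# typically be used for a single sampling step with the model loaded above:
input_ids = tokenizer.encode("The weather today is", return_tensors='pt').to(device)
with torch.no_grad():
    logits = model(input_ids)[0]                       # (1, seq_len, vocab)
probs = torch.softmax(logits[0, -1], dim=-1).cpu().numpy()
next_id = choose_from_top(probs, n=40)
print(tokenizer.decode([next_id]))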
def __init__(self, config):
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"using device: {self.device}")
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self.model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
def __init__(self):
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.model = GPT2LMHeadModel.from_pretrained('gpt2')
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
def filter_nan(df):
    sel = df["Summary"].notnull()
    summaries = df["Summary"][sel].values.tolist()
    reviews = df["Text"][sel].values.tolist()
    return reviews, summaries


if __name__ == "__main__":
    import pandas as pd
    from transformers import GPT2Tokenizer

    len_dict = {
        "review": AMAZON_REVIEW_LENGTH,
        "summary": AMAZON_SUMMARY_LENGTH
    }
    PRETRAINED_MODEL_NAME = "gpt2"
    REVIEW_PATH = "../../data/amazon_fine_food_review/Reviews.csv"

    df = pd.read_csv(REVIEW_PATH)
    tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME,
                                              bos_token=BOS,
                                              eos_token=EOS,
                                              pad_token=PAD)

    # build dataset
    reviews, summaries = filter_nan(df)
    dataset = AmazonReviewV1(reviews, summaries, tokenizer, len_dict, BOS, EOS)
def main():
    # print("MODEL NAME, BATCH SIZE, AVG LATENCY (ms), AVG MEM USAGE (MiB)")

    # parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str)
    parser.add_argument('--num_inference', type=int)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--gpu', action="store_true", default=False)
    args = parser.parse_args()

    model_name = args.model_name
    num_inference = args.num_inference
    batch_size = args.batch_size
    use_gpu = args.gpu and torch.cuda.is_available()

    # stores latency / memory usage values
    l_inference_latency = list()
    l_memory_capacity = list()

    # call the corresponding DNN model...
    # TODO: ADD OTHER MODELS - RESNET50, ...
    # TODO: FIX NLP MODELS' SEQUENCE LENGTH
    if model_name == "resnet18":
        with torch.no_grad():
            model = models.resnet18(True, True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["RESNET18", str(batch_size), str_avg_inf_time,
                            str_avg_mem_usage]))
    elif model_name == "wide_resnet101_2":
        with torch.no_grad():
            model = models.wide_resnet101_2(True, True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["WIDE-RESNET101-2", str(batch_size), str_avg_inf_time,
                            str_avg_mem_usage]))
    elif model_name == "mobilenet":
        with torch.no_grad():
            model = models.mobilenet_v2(True, True)
            if use_gpu:
                model = model.cuda()
            # warmup
            for i in range(num_inference):
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["MOBILENET_V2", str(batch_size), str_avg_inf_time,
                            str_avg_mem_usage]))
    elif model_name == "bert":
        with torch.no_grad():
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            model = AutoModel.from_pretrained("bert-base-uncased")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # BERT maximum sequence length is 512
                sample_text = "BERT" * int(512 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["BERT-BASE-UNCASED", str(batch_size), str_avg_inf_time,
                            str_avg_mem_usage]))
    elif model_name == "gpt2":
        with torch.no_grad():
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            model = GPT2Model.from_pretrained("gpt2")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # GPT2 maximum sequence length is 1024
                sample_text = "GPT2" * int(1024 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["GPT2", str(batch_size), str_avg_inf_time,
                            str_avg_mem_usage]))
    elif model_name == "dlrm":
        print("Unimplemented model: DLRM")
        # TODO: MAKE IT WORK... PLEASE
        '''
        with torch.no_grad():
            model = DLRM_Net()
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                inputs = ????
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["DLRM", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
        '''
    else:
        print("Unidentified model name: {}".format(model_name))

    return
def tokenizer(self):
    return GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model1 = GPT2LMHeadModel.from_pretrained("gpt2")


def gen_text(min_length=20, max_length=40, temperature=1.0, sentence_prefix=''):
    min_length = int(min_length)
    max_length = int(max_length)
    temperature = float(temperature)

    input_ids = tokenizer.encode(
        sentence_prefix,
        add_special_tokens=False,
        return_tensors="pt",
        add_space_before_punct_symbol=True,
    )

    output_ids = model1.generate(
        input_ids=input_ids,
        temperature=temperature,
        do_sample=True,
        min_length=min_length,
        max_length=max_length,  # desired output sentence length
        pad_token_id=model1.config.eos_token_id,
import numpy as np
import pandas as pd
import timeit
import torch
from torch.utils.data import TensorDataset
import transformers
from transformers import GPT2Tokenizer
import json
import argparse
import nltk
from helperGPT2 import execute_tokenization

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

assert (transformers.__version__ == '2.6.0')

tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
}
tokenizer.add_special_tokens(special_tokens)
assert (len(tokenizer) == 50261)

"""
=================================================
END OF IMPORT AND INITIALIZATION
START OF THE HELPER FUNCTION SECTION
=================================================
"""
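# Quick sanity check (illustrative, not from the original file): add_special_tokens
# appends new tokens at the end of the vocabulary, which is why the assertion above
# expects 50257 + 4 = 50261 entries.
print(tokenizer.convert_tokens_to_ids(
    ['<|startoftext|>', '<pad>', '<|keyword|>', '<|summarize|>']))
print(tokenizer.pad_token, tokenizer.pad_token_id)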
        self.tokenizer = tokenizer
        self.tokenizer.max_len = 1500  # work around odd tokenizer max-length behavior
        # end-of-turn marker; an alternative is [628, 198], i.e. tokenizer.encode("\n\n\n")
        self.turn_ending = tokenizer.cls_token_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        dial_tokens = tokenizer.encode(self.data[index][0]) + [self.turn_ending]
        cls_token_location = dial_tokens.index(self.tokenizer.cls_token_id)
        dial_act = self.data[index][1]
        return dial_tokens, cls_token_location, dial_act

    def collate(self, unpacked_data):
        return unpacked_data


tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.add_special_tokens({'cls_token': '[CLS]'})


class GPT2DoubleHeadsModel_modified(GPT2DoubleHeadsModel):
    def __init__(self, config):
        super().__init__(config)
        # config.num_labels = 1
        config.num_labels = le.classes_.shape[0]
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)
        self.init_weights()


config = GPT2Config()
config = config.from_pretrained('gpt2-medium')
    return args


if __name__ == "__main__":
    # Load training parameters
    args = parse_args()

    # Claim the GPU (research cluster-specific issue)
    try:
        torch.ones(1).to(args.device)
    except RuntimeError as err:
        logging.error(err)
        sys.exit(1)

    # Load the pre-trained OpenAI GPT-2 model
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(args.device)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # hack to be able to batch-generate

    # Load and batch the prompts for quicker generation
    logging.info(f"Loading prompts from {args.prompt_path}")
    prompt_lst = []
    with open(args.prompt_path, 'r') as f:
        prompt_lst = [f"{line.strip()} [RESPONSE]" for line in f.readlines()]
    logging.info("total number of sentences = {}".format(len(prompt_lst)))
    prompt_lst_batch, prompt_lst_batch_idx = batchify(prompt_lst, args.bsz)
    logging.info("total batch size = {}".format(len(prompt_lst_batch)))

    # Start the output file
def __init__(self, model='bert', model_size='base', cased=True, fine_tune=False, use_proj=False, proj_dim=256): super(Encoder, self).__init__() assert (model in MODEL_LIST) self.base_name = model self.model = None self.tokenizer = None self.num_layers = None self.hidden_size = None # First initialize the model and tokenizer model_name = '' # Do we want the tokenizer to lower case or not do_lower_case = not cased # Model is one of the BERT variants if 'bert' in model: assert (model_size in BERT_MODEL_SIZES) model_name = model + "-" + model_size if model == 'bert' and not cased: # Only original BERT supports uncased models model_name += '-uncased' elif model == 'roberta': # RoBERTa model types have no casing suffix in HuggingFace map # So we don't modify the model name pass else: model_name += '-cased' if model == 'bert': self.model = BertModel.from_pretrained( model_name, output_hidden_states=True) self.tokenizer = BertTokenizer.from_pretrained( model_name, do_lower_case=do_lower_case) elif model == 'roberta': self.model = RobertaModel.from_pretrained( model_name, output_hidden_states=True) self.tokenizer = RobertaTokenizer.from_pretrained( model_name, do_lower_case=do_lower_case) elif model == 'spanbert': # Model is loaded in a different way # Earlier "pytorch_transformers" required a .tar.gz URL/file. # Updated library "transformers" requires pytorch_model.bin and config.json # separately. That's why we have to keep the SpanBERT codebase around and initialize # the model using that codebase (based on pytorch_pretrained_bert). # NOTE: By default transformer models are initialized to eval() mode! # Not using the eval() mode will result in randomness. self.model = SpanbertModel.from_pretrained(model_name).eval() # SpanBERT uses the same tokenizer as BERT (that's why the slicing in model name). # We use the tokenizer from "transformers" since it provides an almost unified API. self.tokenizer = BertTokenizer.from_pretrained( model_name[4:], do_lower_case=do_lower_case) self.num_layers = self.model.config.num_hidden_layers self.hidden_size = self.model.config.hidden_size elif model == "xlnet": model_name = model + "-" + model_size + "-cased" self.model = XLNetModel.from_pretrained(model_name, output_hidden_states=True) self.tokenizer = XLNetTokenizer.from_pretrained( model_name, do_lower_case=do_lower_case) self.num_layers = self.model.config.num_hidden_layers self.hidden_size = self.model.config.hidden_size elif model == 'gpt2': assert (model_size in GPT2_MODEL_SIZES) model_name = model if model_size != "small": model_name += "-" + model_size self.model = GPT2Model.from_pretrained(model_name, output_hidden_states=True) # Set the EOS token to be the PAD token since no explicit pad token # in GPT2 implementation. 
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            model_name,
            do_lower_case=do_lower_case,
            pad_token="<|endoftext|>")
        self.num_layers = self.model.config.n_layer
        self.hidden_size = self.model.config.n_embd

    # Set the model name
    self.model_name = model_name

    # Set shift sizes due to the introduction of special tokens
    if self.base_name == 'xlnet':
        # XLNet appends its special tokens ([SEP], [CLS]) at the end
        self.start_shift = 0
        self.end_shift = 2
    else:
        self.start_shift = (1 if self.tokenizer._cls_token else 0)
        self.end_shift = (1 if self.tokenizer._sep_token else 0)

    # Set requires_grad to False if not fine-tuning
    if not fine_tune:
        for param in self.model.parameters():
            param.requires_grad = False

    if use_proj:
        # Apply a projection layer to the output of the pretrained model
        self.proj = nn.Linear(self.hidden_size, proj_dim)
        # Update the hidden size
        self.hidden_size = proj_dim
    else:
        self.proj = None

    # Set parameters learned on top of the pre-trained model
    self.weighing_params = nn.Parameter(torch.ones(self.num_layers))
    # Attention-based span representation parameters - MIGHT NOT BE USED
    self.attention_params = nn.Linear(self.hidden_size, 1)
    nn.init.constant_(self.attention_params.weight, 0)
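# A minimal usage sketch for the Encoder above (assuming the surrounding
# class and constants such as MODEL_LIST and GPT2_MODEL_SIZES are in scope;
# the values are illustrative): a frozen GPT-2 encoder whose hidden states
# are projected down to 256 dimensions.
encoder = Encoder(model='gpt2', model_size='small',
                  fine_tune=False, use_proj=True, proj_dim=256)
print(encoder.model_name, encoder.hidden_size)  # expected: gpt2 256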
epsilon = 1e-8

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

SAVE_PATH = "/mnt/nfs/work1/llcao/zonghaiyao/LM/"

# I'm not really doing anything with the config here
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Load the GPT-2 tokenizer ('gpt2-medium' could be substituted here).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<|endoftext|>')

# Instantiate the model (MAX_LEN, CAN_NUM, and num_of_rerank are assumed to
# be defined earlier in the script).
model = rerankGPT2LMHeadModel_stage1_all_tokens_no_stage2.from_pretrained(
    "gpt2",
    config=configuration,
    MAX_LEN=MAX_LEN,
    CAN_NUM=CAN_NUM,
    num_of_rerank=num_of_rerank)

# This step is necessary because some tokens (bos_token, etc.) were added to
# the embeddings; otherwise the tokenizer and model tensors won't match up.
model.resize_token_embeddings(len(tokenizer))

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
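# The multi-GPU branch above ends at the print statement; a common
# continuation (an assumption, not confirmed by this snippet) is to wrap the
# model in DataParallel before moving it onto the device:
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))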
def test_batch_generation_2heads(self):
    model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
    model.to(torch_device)
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    tokenizer.padding_side = "left"

    # This tokenizer has no pad token, so we have to set it in some way
    # Define PAD Token = EOS Token = 50256
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    # use different length sentences to test batching
    sentences = [
        "Hello, my dog is a little",
        "Today, I",
    ]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"].to(torch_device)
    token_type_ids = torch.cat(
        [
            input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1),
                               0),
            input_ids.new_full((input_ids.shape[0], 1), 500),
        ],
        dim=-1,
    )

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"].to(torch_device),
    )

    outputs_tt = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"].to(torch_device),
        token_type_ids=token_type_ids,
    )

    inputs_non_padded = tokenizer(
        sentences[0], return_tensors="pt").input_ids.to(torch_device)
    output_non_padded = model.generate(input_ids=inputs_non_padded)

    num_paddings = inputs_non_padded.shape[-1] - inputs[
        "attention_mask"][-1].long().sum().cpu().item()
    inputs_padded = tokenizer(
        sentences[1], return_tensors="pt").input_ids.to(torch_device)
    output_padded = model.generate(
        input_ids=inputs_padded,
        max_length=model.config.max_length - num_paddings)

    batch_out_sentence = tokenizer.batch_decode(outputs,
                                                skip_special_tokens=True)
    batch_out_sentence_tt = tokenizer.batch_decode(
        outputs_tt, skip_special_tokens=True)
    non_padded_sentence = tokenizer.decode(output_non_padded[0],
                                           skip_special_tokens=True)
    padded_sentence = tokenizer.decode(output_padded[0],
                                       skip_special_tokens=True)

    expected_output_sentence = [
        "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
        "Today, I'm going to be doing a lot of research on this. I",
    ]
    self.assertListEqual(expected_output_sentence, batch_out_sentence)
    # token_type_ids should change the output
    self.assertTrue(batch_out_sentence_tt != batch_out_sentence)
    self.assertListEqual(expected_output_sentence,
                         [non_padded_sentence, padded_sentence])
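# Why left padding matters in the batched test above: GPT-2 is decoder-only
# and continues from the last position of each row, so right padding would
# make it generate after pad tokens. A standalone illustration (outside the
# test class; exact tensor shapes depend on the BPE segmentation):
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token
tok.padding_side = "left"
enc = tok(["Hello, my dog is a little", "Today, I"],
          return_tensors="pt", padding=True)
# The shorter prompt is padded on the left, so its attention mask starts
# with zeros and generation begins right after its final real token.
print(enc["attention_mask"])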
def run_pplm_example(pretrained_model="gpt2-medium",
                     cond_text="",
                     uncond=False,
                     num_samples=1,
                     bag_of_words=None,
                     discrim=None,
                     discrim_weights=None,
                     discrim_meta=None,
                     class_label=-1,
                     length=100,
                     stepsize=0.02,
                     temperature=1.0,
                     top_k=10,
                     sample=True,
                     num_iterations=3,
                     grad_length=10000,
                     horizon_length=1,
                     window_length=0,
                     decay=False,
                     gamma=1.5,
                     gm_scale=0.9,
                     kl_scale=0.01,
                     seed=0,
                     no_cuda=False,
                     colorama=False,
                     verbosity='regular'):
    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim,
                                                       pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token],
                                               add_special_tokens=False)
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text,
                                               add_special_tokens=False)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        verbosity_level=verbosity_level)

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
        print("= Unperturbed generated text =")
        print(unpert_gen_text)
        print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filter out words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] is safe because the previous filter guarantees that w has
            # exactly one item
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize the perturbed text
            if colorama:
                import colorama
                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        # highlight bag-of-words tokens in red
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except Exception:
            # skip samples that fail to decode rather than aborting the run
            pass

        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    return
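# A minimal usage sketch for the PPLM example above (argument values are
# illustrative, not prescriptive): steer generation toward a "military"
# bag of words, highlighting matched tokens via colorama.
run_pplm_example(cond_text="The potato",
                 num_samples=3,
                 bag_of_words='military',
                 length=50,
                 stepsize=0.03,
                 window_length=5,
                 kl_scale=0.01,
                 gm_scale=0.99,
                 colorama=True,
                 verbosity='regular')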
def compute_gpt_embeddings(annotation_data):
    pretrained_weights = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
    model = GPT2Model.from_pretrained(pretrained_weights)
    return compute_transformer_embeddings(model, tokenizer, annotation_data)
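# `compute_transformer_embeddings` is referenced above but not defined in
# this snippet. A minimal sketch of one plausible implementation
# (hypothetical; the real helper may pool layers or spans differently):
# mean-pool the final hidden states of each annotation string.
import torch

def compute_transformer_embeddings(model, tokenizer, annotation_data):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in annotation_data:
            input_ids = tokenizer.encode(text, return_tensors='pt')
            last_hidden = model(input_ids)[0]  # (1, seq_len, hidden_size)
            embeddings.append(last_hidden.mean(dim=1).squeeze(0))
    return torch.stack(embeddings)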