def __init__(self, *args, language_model=None,
             template_loc='./relation_map_multiple.json',
             use_local_model=False):
    super().__init__(*args, language_model=language_model,
                     template_loc=template_loc,
                     use_local_model=use_local_model)
    self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    if use_local_model:
        self.enc = GPT2Tokenizer.from_pretrained("../models/GPT2LMHeadModel")
    else:
        self.enc = GPT2Tokenizer.from_pretrained('gpt2')
    with open(self.template_loc, 'r') as f:
        self.templates = json.load(f)

def main(args):
    # TODO: specify vocab path to avoid download
    if args.tokenizer is not None:
        toker = GPT2Tokenizer.from_pretrained(args.tokenizer)
    else:
        toker = GPT2Tokenizer.from_pretrained('gpt2')
    assert args.corpus.endswith('.txt') or args.corpus.endswith('.tsv')
    db_path = f'{args.corpus[:-4]}.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))
    with open(args.corpus, "r", encoding="utf-8") as reader, \
            shelve.open(db_path, 'n') as db:
        chunk = []
        n_chunk = 0
        n_example = 0
        for line in tqdm(reader, total=_get_file_len(args.corpus)):
            if len(chunk) >= args.chunk_size:
                # save and renew chunk
                db[f'chunk_{n_chunk}'] = gzip.compress(
                    json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                chunk = chunk[args.chunk_size:]
                n_chunk += 1
            weights, inputs, attn_masks, position_ids, type_ids = \
                _get_inputs_from_text(line, toker)
            if len(weights) < 2:
                continue
            features = _make_features(n_example, weights, inputs, attn_masks,
                                      position_ids, type_ids, toker,
                                      args.max_seq_len)
            for feature in features:
                chunk.append(vars(feature))
                n_example += 1
        # save last chunk
        db[f'chunk_{n_chunk}'] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))
    # save relevant information needed to reproduce the run
    meta = {'n_example': n_example,
            'chunk_size': args.chunk_size,
            'max_seq_len': args.max_seq_len}
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
    torch.save(toker, join(dirname(db_path), 'tokenizer.pt'))

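# A hedged usage sketch (not part of the original script): read back the
# shelve DB written above. The path is hypothetical; each 'chunk_<n>' key
# holds a gzip-compressed JSON list of feature dicts.
import gzip
import json
import shelve

with shelve.open('data/corpus.db/db', 'r') as db:
    chunk = json.loads(gzip.decompress(db['chunk_0']).decode('utf-8'))
    print(len(chunk), sorted(chunk[0].keys()))
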
def fluency_score(rated_a, opt):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained(opt.pretrained_model_path)
    model = GPT2LMHeadModel.from_pretrained(opt.pretrained_model_path)
    model.to(device)
    model.eval()

    nb_steps, eval_loss = 0, 0
    score_list = []
    with torch.no_grad():
        for step, s in enumerate(rated_a):  # each item is a batch of size 1
            if not s:
                print('empty sentence')
                score_list.append(1e6)
                continue
            # optionally append 50256, the token id of <|endoftext|>
            s = enc.encode(s)
            batch = torch.tensor([s]).to(device)
            loss = model(batch, lm_labels=batch)  # average -log p per token
            eval_loss += loss.item()
            nb_steps += 1
            score_list.append(loss.item())

    cutoff = np.quantile([-t for t in score_list], 0.05)
    modified_rating = np.array(
        [cutoff if -t < cutoff else -t for t in score_list])
    normed_rating = (modified_rating - cutoff) / np.abs(cutoff)
    return normed_rating

def main(mode: str = 'baseline', max_length: int = None):
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    CORPUS_FILE = '/Users/ben/data/wikitext-2/wiki.train.tokens'
    with open(CORPUS_FILE) as f:
        corpus = f.read()
    if max_length:
        corpus = corpus[:max_length]

    # Re-encode the vocab as real bytes
    vocab = [bytes(enc.byte_decoder[c] for c in token)
             for token in enc.encoder]
    encoder = dict(zip(vocab, range(len(vocab))))
    greedy = Encoder(vocab)

    with Timer():
        if mode == 'baseline':
            out = enc.encode(corpus)
        elif mode == 'greedy':
            out = list(greedy.encode(corpus))
        elif mode == 'greedy-c':
            pass  # not implemented yet
        elif mode == 'numba':
            out = list(numba_bpe.numba_encode(numba_bpe.random_str(100000),
                                              numba_bpe.fake_vocab()))
        elif mode == 'nonumba':
            out = list(numba_bpe.encode(numba_bpe.random_str(100000),
                                        numba_bpe.fake_vocab()))
        else:
            raise ValueError(f'Unknown mode {mode}')
    print(f'Compression ratio {len(out) / len(corpus):.4f}')

def __init__(self, *args, language_model=None,
             template_loc='./relation_map_multiple.json'):
    super().__init__(*args, language_model=language_model,
                     template_loc=template_loc)
    self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    self.enc = GPT2Tokenizer.from_pretrained('gpt2')
    print("Loading template JSON.")
    with open(self.template_loc, 'r') as f:
        self.templates = json.load(f)

def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)
    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    device = torch.device("cuda" if torch.cuda.is_available()
                          and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # load the GPT-2 model
    config = GPT2Config.from_json_file(
        os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config), args.load_checkpoint,
                       args, verbose=True)
    model.to(device)
    model.eval()

    history = []
    while True:
        raw_text = input("USR >>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("USR >>> ")
        if raw_text.lower() == 'quit':
            print('SYS >>> Goodbye!')
            break
        history.append(raw_text)
        context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history], [])
        context_tokens = torch.tensor(context_tokens, device=device,
                                      dtype=torch.long).unsqueeze(0)
        position_ids = torch.arange(0, context_tokens.size(-1),
                                    dtype=torch.long,
                                    device=context_tokens.device)
        out = generate_sequence(model, context_tokens,
                                position_ids=position_ids,
                                length=args.generation_length,
                                temperature=args.temperature,
                                top_k=args.top_k, top_p=args.top_p)
        out = out.tolist()
        text = enc.decode(cut_seq_to_eos(out[0])).encode(
            'ascii', 'ignore').decode('ascii')
        print("SYS >>> ", text)
        history.append(text)
        history = history[-(2 * args.max_history + 1):]

def init_model(seed=0, model_path='gpt2'):
    '''
    Parameters
    ----------
    seed : int
        seed for the various randomizers
    model_path : string, optional
        either a pretrained model name or the path to a trained checkpoint
    '''
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model = model.module
    model.to(device)
    model.eval()
    return model, enc, device

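# A minimal usage sketch for init_model above; the checkpoint path is
# hypothetical, and the checkpoint is assumed to have been saved under
# nn.DataParallel (hence the load_state_dict call in init_model). With the
# pytorch_pretrained_bert API, the model returns (logits, presents).
model, enc, device = init_model(seed=42,
                                model_path='checkpoints/gpt2_finetuned.pt')
input_ids = torch.tensor([enc.encode("Hello, world!")], device=device)
with torch.no_grad():
    logits, past = model(input_ids)
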
def main(): """Preprocess a dataset.""" parser = argparse.ArgumentParser() parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--dataset_path', type=str, default='') parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name') parser.add_argument( '--min_file_len', type=int, help= "When loading dataset, throw out files with fewer than this many characters" ) parser.add_argument( '--max_file_len', type=int, help= "When loading dataset, throw out files with greater than this many characters" ) args = parser.parse_args() enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path) _ = lazy_load(args.dataset_path, enc, args)
def extract_gpt2_hidden_activations(text_path, save_activs_to):
    # read in text samples to pass through GPT-2
    with open(text_path, "rb") as infile:
        text_inputs = pickle.load(infile)

    # load the pre-trained tokenizer (vocabulary) and model once, up front;
    # assumes a GPU is available
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2Model.from_pretrained('gpt2')
    model.eval()
    model.to('cuda')

    # grab the hidden activations for each text sample
    layer_activs = []
    for text in text_inputs:
        indexed_tokens = tokenizer.encode(text)
        tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
        with torch.no_grad():
            hidden, _ = model(tokens_tensor)
            layer_activs.append(hidden.cpu().numpy().squeeze())
        # clear GPU memory in preparation for the next text sample
        torch.cuda.empty_cache()

    # save the activations
    with open(save_activs_to, "wb") as outfile:
        pickle.dump(layer_activs, outfile)

def download_model(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    if not exists(PATH):
        print("# ", str(PATH), "not found, creating dir.")
        mkdir(PATH)
    print('# Downloading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            wget.download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded word2vec')
        else:
            print('# Already downloaded')
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            wget.download('http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip')
            zf = zipfile.ZipFile('./glove.840B.300d.zip')
            zf.extractall()
            _ = glove2word2vec('./glove.840B.300d.txt', join(PATH, name_path))
            print('# Downloaded glove')
        else:
            print('# Already downloaded')
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            wget.download('https://dict2vec.s3.amazonaws.com/dict2vec300.tar.bz2')
            tar = tarfile.open("dict2vec300.tar.bz2")
            tar.extractall()
            tar.close()
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded dict2vec')
        else:
            print('# Already downloaded')
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            wget.download('https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz')
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded Conceptnet Numberbatch')
        else:
            print('# Already downloaded')
    if name == 'bert' or name == 'bert-context':
        _ = BertTokenizer.from_pretrained('bert-large-uncased')
        _ = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()
        print('# Downloaded bert')
    if name == 'gpt2' or name == 'gpt2-context':
        _ = GPT2Tokenizer.from_pretrained('gpt2')
        _ = GPT2LMHeadModel.from_pretrained('gpt2')
        _ = GPT2Model.from_pretrained('gpt2')
        print('# Downloaded gpt-2')

def construct_encoder(self):
    model = GPT2Model.from_pretrained(self.model_name)
    model.cuda()
    model = torch.nn.DataParallel(model)
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
    print("Model and tokenizer are constructed!")
    return model, tokenizer

def __init__(self, GPU, model_name_or_path="gpt2"):
    self.device = torch.device(GPU if torch.cuda.is_available() else "cpu")
    self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    self.model.to(self.device)
    self.model.eval()
    self.start_token = '<|endoftext|>'
    print("Loaded GPT-2 model!")

def __init__(self, model_name_or_path="gpt2"): super(LM, self).__init__() self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path) self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path) self.model.to(self.device) self.model.eval() self.start_token = '<|endoftext|>' print("Loaded GPT-2 model!")
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)  # float, not int
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s"
                         % model.config.n_ctx)

    while True:
        context_tokens = []
        if not args.unconditional:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
        generated = 0
        for _ in range(args.nsamples // args.batch_size):
            out = sample_sequence(
                model=model, length=args.length,
                context=context_tokens if not args.unconditional else None,
                start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
                batch_size=args.batch_size,
                temperature=args.temperature, top_k=args.top_k, device=device)
            out = out[:, len(context_tokens):].tolist()
            for i in range(args.batch_size):
                generated += 1
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)
        if args.unconditional:
            break

def get_tokenizer(tokenizer_name):
    if tokenizer_name == 'GPT-2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    elif tokenizer_name == 'GPT':
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    else:
        raise NotImplementedError(f'{tokenizer_name} -- No such tokenizer')
    return tokenizer

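# A quick illustrative call for get_tokenizer (the exact ids depend on the
# GPT-2 BPE vocabulary):
tokenizer = get_tokenizer('GPT-2')
ids = tokenizer.encode("Hello world")
print(ids)                    # BPE token ids
print(tokenizer.decode(ids))  # "Hello world"
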
def __init__(self,
             model_name_or_path="/data/pradeesh/detecting-fake-text/pytorch/"):
    super(LM, self).__init__()
    self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    self.model.to(self.device)
    self.model.eval()
    self.start_token = '<|endoftext|>'
    print("Loaded GPT-2 model!")

def __init__(self, type, model_name_or_path="gpt2"): super(LM, self).__init__() self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path) if type == '345M': self.model = GPT2LMHeadModel.from_pretrained('output/') elif type == '117M': self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path) self.model.to(self.device) self.model.eval() self.start_token = '<|endoftext|>'
def load_model_fromlist(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    print('# Loading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path), binary=True)
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path))
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path), binary=False, unicode_errors="ignore")
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path))
    if name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()
        return [model, tokenizer]
    if name == 'bert-context':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased',
                                          output_hidden_states=True)
        return [model, tokenizer]
    if name == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2').transformer.wte.weight.data.numpy()
        return [model, tokenizer]
    if name == 'gpt2-context':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
        return [model, tokenizer]

def __init__(self, args):
    super().__init__()
    if args.gpt2_model_dir is not None:
        # load the GPT-2 model from a local directory
        gpt_model_name = str(args.gpt2_model_dir) + "/"
        dict_file = gpt_model_name
        print("loading GPT2 model from {}".format(gpt_model_name))
    else:
        # load the GPT-2 model from the huggingface cache
        gpt_model_name = args.gpt2_model_name
        dict_file = gpt_model_name

    # Load the pre-trained model tokenizer (vocabulary)
    self.tokenizer = GPT2Tokenizer.from_pretrained(dict_file)

    # GPT represents BPE pieces differently than BERT. Namely, final suffixes
    # are indicated with a </w> suffix, while pieces that must be followed are
    # written as-is. In BERT the prefixes are written as-is, while the parts
    # that must follow (not be followed!) have a '##' prefix. There is no
    # one-to-one conversion, but we can at least make pieces that may form a
    # full word look the same.
    # Note that we must be careful from here on:
    # tokenizer.convert_tokens_to_ids won't work with our vocabulary.
    def convert_word(word):
        if word == GPT2_EOS:
            return word
        if word.startswith('Ġ'):  # the token starts with a whitespace
            return word[1:]
        # The token does not start with a whitespace: it may continue a word,
        # or it may start a sentence.
        return f'_{word}_'

    _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
    self.vocab = [convert_word(word) for word in gpt_vocab]
    self._init_inverse_vocab()

    # Load the pre-trained model (weights)
    self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)
    self.gpt_model.eval()

    # Sanity check.
    assert len(self.vocab) == self.gpt_model.config.vocab_size

    self.eos_id = self.gpt_model.config.eos_token_id
    self.pad_id = self.gpt_model.config.eos_token_id
    self.unk_id = self.gpt_model.config.eos_token_id
    self.bos_id = self.gpt_model.config.bos_token_id
    self.model_vocab = self.vocab

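# A standalone copy of the nested convert_word rule above, for illustration
# only (the real function also special-cases GPT2_EOS; actual vocabulary
# entries depend on the BPE merges):
def _convert_word_demo(word):
    if word.startswith('Ġ'):
        return word[1:]
    return f'_{word}_'

print(_convert_word_demo('Ġworld'))  # 'world'  -- token starts a new word
print(_convert_word_demo('world'))   # '_world_' -- continuation or sentence start
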
def init():
    # seed = 42
    # np.random.seed(seed)
    # torch.random.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()
    return enc, model

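# A hedged example of scoring a sentence with the pair returned by init().
# With the old pytorch_pretrained_bert API used throughout these snippets,
# passing lm_labels makes the model return the mean negative log-likelihood.
enc, model = init()
device = next(model.parameters()).device
ids = torch.tensor([enc.encode("The book is on the desk.")], device=device)
with torch.no_grad():
    loss = model(ids, lm_labels=ids)  # mean -log p per token
print(loss.item())
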
def token_split(s, method='split', tokenizer=None):
    '''Tokenize the string s with the given method.'''
    if method == 'split':
        return s.split()
    if method == 'moses':
        tokenized_text = mt.tokenize(s, return_str=True)
        return tokenized_text.split()
    if method == 'gpt2':
        if tokenizer is None:
            global global_tokenizer
            if global_tokenizer is None:
                global_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            tokenizer = global_tokenizer
        return gpt2_split(tokenizer, s)
    raise ValueError(f'Unknown tokenization method {method}')

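# Illustrative calls for token_split; the gpt2 branch lazily constructs a
# shared tokenizer on first use, and the second output depends on gpt2_split.
print(token_split("Hello there, world!"))                 # ['Hello', 'there,', 'world!']
print(token_split("Hello there, world!", method='gpt2'))  # BPE-based pieces
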
def __init__(self, cuda_device=-1):
    super(GPT2Embedder, self).__init__()
    self.cuda_device = 'cpu' if cuda_device == -1 else f'cuda:{cuda_device}'

    # Load the pre-trained model tokenizer (vocabulary)
    self.enc = GPT2Tokenizer.from_pretrained('gpt2')
    # Load the pre-trained model (weights)
    self.model = GPT2Model.from_pretrained('gpt2')
    self.model.to(self.cuda_device)
    self.model.eval()  # we only use the evaluation mode of the pretrained model

    self._bos_id = self.enc.encoder['<|endoftext|>']
    self._bos_past = None

def init(self, model_path, model_checkpoint):
    self.config = GPT2Config.from_json_file(
        os.path.join(model_path, "config.json"))
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    self.model = GPT2LMHeadModel(self.config)

    model_state_dict = fix_state_dict_namespace(torch.load(model_checkpoint))
    start_model = self.model
    if (hasattr(self.model, "transformer")
            and all(not s.startswith('transformer.')
                    for s in model_state_dict.keys())):
        print('loading transformer only')
        start_model = self.model.transformer
    start_model.load_state_dict(model_state_dict)

    if self.fp16:
        self.model.half()
    self.model.to(self.device)
    self.model.eval()

def __init__(self, text, lens, target, identity_df, weights,
             model="gpt2", split_point=0.25):
    super(TrainDataset, self).__init__()
    self._text = text
    self._lens = lens
    self._target = target
    self._identity_df = identity_df
    self._weights = weights
    self._split_point = split_point

    VOCAB_PATH = Path('../input/torch-bert-weights/%s' % (model))
    self._tokenizer = GPT2Tokenizer.from_pretrained(VOCAB_PATH)

def __init__(self, tuple_dir, device, language_model=None, template_loc=None):
    """
    Args:
        tuple_dir (string): Path to the csv file with commonsense tuples
    """
    # Load the pre-trained model tokenizer (vocabulary)
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.device = device
    self.model = language_model
    if self.model is not None:
        self.model.eval()
        self.model.to(self.device)
    self.template_loc = template_loc

    # Load tuples
    with open(tuple_dir) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        self.tuples = [row for row in reader]

def fetch_objects():
    bert = BertModel.from_pretrained(
        'bert-base-uncased').embeddings.position_embeddings.weight.data
    gpt = OpenAIGPTModel.from_pretrained(
        'openai-gpt').positions_embed.weight.data
    gpt2 = GPT2Model.from_pretrained('gpt2').wpe.weight.data

    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    gpt_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    return ({'bert': bert, 'gpt': gpt, 'gpt2': gpt2},
            {'bert': bert_tokenizer, 'gpt': gpt_tokenizer,
             'gpt2': gpt2_tokenizer})

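# A small sketch of consuming fetch_objects' output, e.g. to compare the
# positional-embedding tables of the three models:
embeddings, tokenizers = fetch_objects()
for name, table in embeddings.items():
    print(name, tuple(table.shape))  # (max positions, hidden size)
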
def __init__(self, text_sequence, model_type, temperature=1.0, top_k=0,
             batch_size=1, length=1, nsamples=1, debug=True):
    self.text_sequence = text_sequence
    # eventually this will differentiate between GPT-2, BERT, etc.
    self.model_type = model_type
    model_name = 'gpt2'
    self.debug = debug

    # detect device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.temperature = temperature
    self.top_k = top_k
    self.batch_size = batch_size
    self.length = length
    self.nsamples = nsamples

    # create encoder and model
    self.enc = GPT2Tokenizer.from_pretrained(model_name)
    self.model = GPT2LMHeadModel.from_pretrained(model_name)
    self.model.to(self.device)
    self.model.eval()

def context_score(questions, answers, opt):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained(opt.pretrained_model_path)
    model = GPT2LMHeadModel.from_pretrained(opt.pretrained_model_path)
    model.to(device)
    model.eval()

    score_list = []
    with torch.no_grad():
        # each (question, answer) pair is a batch of size 1
        for step, (question, answer) in enumerate(zip(questions, answers)):
            if not answer:
                print('empty sentence')
                score_list.append(-1e6)
                continue
            # optionally append 50256, the token id of <|endoftext|>
            joint_enc = enc.encode(question + ' ' + answer)
            q = enc.encode(question)
            batch_joint = torch.tensor([joint_enc]).to(device)
            batch_q = torch.tensor([q]).to(device)
            loss_joint = model(batch_joint, lm_labels=batch_joint)  # average -log p
            loss_q = model(batch_q, lm_labels=batch_q)
            p_joint = -loss_joint * (len(joint_enc) - 1)
            p_q = -loss_q * (len(q) - 1)
            score = p_joint - p_q
            score_list.append(score.item())

    cutoff = np.quantile(score_list, 0.05)
    modified_rating = np.array(
        [cutoff if t < cutoff else t for t in score_list])
    normed_rating = (modified_rating - cutoff) / np.abs(cutoff)
    return normed_rating

def transform(self, X):
    # Load the pre-trained tokenizer (vocabulary) and model (weights) once,
    # and put the model on the GPU up front
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2Model.from_pretrained('gpt2', cache_dir='tmp/gpt2/')
    model.eval()
    model.to('cuda')

    output = []
    for idx, row in tqdm(X.iterrows(), total=len(X)):
        # Encode the inputs and convert them to PyTorch tensors on the GPU
        indexed_tokens_1 = tokenizer.encode(row.text)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1]).to('cuda')

        # Predict hidden states features for each layer
        with torch.no_grad():
            hidden_states_1, past = model(tokens_tensor_1)

        tokens = [tokenizer.decoder[token].replace('Ġ', '')
                  for token in indexed_tokens_1]
        output.append([tokens, hidden_states_1.cpu()[0]])

    output = pd.DataFrame(output, columns=['tokens', 'layer_-1'])

    res = []
    for idx, row in X.iterrows():
        res.append(self.get_sample_props(output.loc[idx], **row)[1:])
    res = pd.DataFrame(res, columns=[
        'tokens', 'pronoun_offset_token', 'a_offset_token', 'b_offset_token',
        'a_span', 'b_span', 'pronoun_token', 'a_tokens', 'b_tokens',
        'bert', 'cls'])

    cols = set(X.columns).difference(res.columns)
    return {'X': pd.concat([X[cols], res], axis=1)}

def extract_gpt2_hidden_word_representations(word, save_activs_to):
    # Load the pre-trained tokenizer (vocabulary); assumes a GPU is available
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # tokenize the word
    indexed_tokens = tokenizer.encode(word)
    num_tokens = len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')

    # set up the model
    model = GPT2Model.from_pretrained('gpt2')
    model.eval()
    model.to('cuda')

    # get the word vector
    with torch.no_grad():
        # get token-wise activations
        hidden, _ = model(tokens_tensor)
        hidden_np = hidden.cpu().numpy()
        # dimension 1 of the hidden state indexes the tokens, so sum the
        # hidden layer element-wise along it to get a single word vector
        seq_dim = 1
        word_vec = np.sum(hidden_np, axis=seq_dim).squeeze()

    # clear GPU memory
    torch.cuda.empty_cache()

    # save the word vector
    with open(save_activs_to, "wb") as outfile:
        pickle.dump(word_vec, outfile)

import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm import trange

import pytorch_pretrained_bert
from data_loader import get_data_loader
from model_sampler import print_samples
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer, OpenAIAdam

model_name = 'gpt2'
enc = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

device = 'cpu'
beam_width = 130
stopwords = []


def to_list(tensor):
    return list(tensor.cpu().numpy())


def predict(line, max_predictions):
    """Give a continuation of the line with at most max_predictions BPE tokens.

    Returns the line extended with the model's predictions.
    """

## Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
print(predictions.shape)  # torch.Size([1, 14, 30522])

## confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
print(predicted_index)  # 27227
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)  # ['henson']

##################################################################
## OpenAI GPT2
##################################################################
## GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/gpt2/')
print(tokenizer.max_len)  # 1000000000000
print(len(tokenizer.encoder))  # 50257
print(type(tokenizer.encoder))  # <class 'dict'>
print(tokenizer.encoder.keys())
print(len(tokenizer.decoder))  # 50257
print(type(tokenizer.decoder))  # <class 'dict'>
print(tokenizer.decoder.get(56))

## Encode some inputs
text_1 = "Who was Jim Henson ?"  # note the mixed upper and lower case
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
print(indexed_tokens_1, type(indexed_tokens_1))  # [8241, 373, 5395, 367, 19069, 5633]; <class 'list'>
print(tokenizer.encode("who was jim henson ?"))  # [8727, 373, 474, 320, 30963, 1559, 5633]
print(tokenizer.decode(indexed_tokens_1))  # Who was Jim Henson ?
print(tokenizer.decode([8727, 373, 474, 320]))  # who was jim