# imports assume the pytorch-openai-transformer-lm repository layout
from model_pytorch import DEFAULT_CONFIG, LMModel, load_openai_pretrained_model
from text_utils import TextEncoder


def load_openai_gpt(n_special=1, n_ctx=512):
    text_encoder = TextEncoder("pytorch-openai-transformer-lm/model/encoder_bpe_40000.json",
                               "pytorch-openai-transformer-lm/model/vocab_40000.bpe")
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    # total embedding-table size: BPE vocabulary + special tokens + positional slots
    vocab = n_vocab + n_special + n_ctx
    args = DEFAULT_CONFIG
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                 path="pytorch-openai-transformer-lm/model/",
                                 path_names="pytorch-openai-transformer-lm/")
    # lm_model.to(device)
    lm_model.return_probs = False
    lm_model.eval()
    return lm_model, text_encoder
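# Usage sketch (illustrative, not part of the original sources): encode a prompt,
# build the (token, position) pairs the model expects, and run a forward pass.
# Position ids follow the convention used throughout these snippets: the positional
# slots sit after the n_vocab BPE ids (n_special=0 here, so they start at n_vocab).
import numpy as np
import torch

lm_model, text_encoder = load_openai_gpt(n_special=0)
n_vocab = len(text_encoder.encoder)
ids = text_encoder.encode(["The cat sat on the"])[0]  # placeholder prompt

X = np.expand_dims(np.array(ids), axis=0)                                # (1, seq_len) token ids
pos = np.expand_dims(np.arange(n_vocab, n_vocab + X.shape[-1]), axis=0)  # (1, seq_len) position ids
batch = torch.tensor(np.stack([X, pos], axis=-1), dtype=torch.long)      # (1, seq_len, 2)

with torch.no_grad():
    scores = lm_model(batch)  # per-position scores over the extended vocabulary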
log_dir = args.log_dir
submission_dir = args.submission_dir

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)

text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
n_special = 0  # no special tokens are needed for the language modeling task
# total embedding-table size: BPE vocabulary + special tokens + positional slots
# (these are BPE subword tokens, not characters)
vocab = n_vocab + n_special + n_ctx

lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
lm_model.to(device)
lm_model.eval()
# at this point the pretrained model and its vocabulary are loaded

text = input('Input some beginning words:')  # seed text to condition generation on
create_dictionary(text_encoder)
while text != 'q':
    X = text_encoder.encode([text, ])
    XMB = make_batch(X)
    for _ in range(args.gen_len):
        lm_probs = lm_model(XMB)  # next-token probabilities over the vocabulary
        if args.topk == 0:
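# `make_batch` and `create_dictionary` are used above but not defined in this
# snippet. A minimal sketch of `make_batch`, mirroring the method of the same
# name in the SurprisalAnalyzer class below (position ids occupy the embedding
# slots after the n_vocab BPE tokens and n_special special tokens):
def make_batch(X):
    X = np.array(X)
    assert X.ndim in [1, 2]
    if X.ndim == 1:
        X = np.expand_dims(X, axis=0)
    # second channel holds the position id of each token
    pos_enc = np.arange(n_vocab + n_special, n_vocab + n_special + X.shape[-1])
    pos_enc = np.expand_dims(pos_enc, axis=0)
    batch = np.stack([X, pos_enc], axis=-1)
    return torch.tensor(batch, dtype=torch.long).to(device)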
# imports assume the pytorch-openai-transformer-lm repository layout
from collections import Counter

import numpy as np
import spacy
import torch

from model_pytorch import DEFAULT_CONFIG, LMModel, load_openai_pretrained_model
from text_utils import TextEncoder


class SurprisalAnalyzer:
    def __init__(self):
        # initialize the language model, the text encoder, and everything else
        # set up the encoder to turn words into indices
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)
        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512  # number of positional embeddings (context length)
        vocab = self.nvocab + nctx
        # set up the pretrained OpenAI model
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval()  # eval mode disables dropout
        # set up spacy for POS tagging (the 'en' shortcut requires spaCy 2.x)
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])

    def make_batch(self, X):
        X = np.array(X)
        assert X.ndim in [1, 2]
        if X.ndim == 1:
            X = np.expand_dims(X, axis=0)
        # add positional encodings: a second channel that says which position each token is in
        pos_enc = np.arange(self.nvocab, self.nvocab + X.shape[-1])
        pos_enc = np.expand_dims(pos_enc, axis=0)
        batch = np.stack([X, pos_enc], axis=-1)
        batch = torch.tensor(batch, dtype=torch.long)
        return batch

    def _get_continuation_tensor(self, sent_vec):
        """Deals strictly with tensors."""
        sent_batch = self.make_batch(sent_vec)
        sent_res = self.lm_model(sent_batch)
        return sent_res

    def tensor_to_probs(self, tensor):
        """
        Converts a torch tensor to a clean numpy array holding the probabilities
        at the final position (basically just hides some nasty code).
        """
        return tensor[:, -1, :].flatten().detach().numpy()

    def get_continuation_probs(self, sentence):
        sent_vec = self.text_encoder.encode([sentence])
        tensor = self._get_continuation_tensor(sent_vec)
        return self.tensor_to_probs(tensor)

    def _get_continuations(self, sent_res, k=10, verbose=False):
        """
        Helper for the `get_continuations` wrapper that separates processing the
        sentence from extracting the top continuations.
        """
        probs, decode = sent_res[:, -1, :].topk(k)
        if verbose:
            for p, d in zip(probs.flatten(), decode.flatten()):
                print("\t...%s (%.4f)" % (self.text_encoder.decoder[d.item()], p.item()))
        words = [self.text_encoder.decoder[d.item()] for d in decode.flatten()]
        # strip the word-ending tags where present (tokens without </w> are partial continuations)
        for i in range(len(words)):
            if words[i][-4:] == "</w>":
                words[i] = words[i][:-4]
        probs = probs.flatten().detach().numpy()  # convert probs from tensor to numpy array
        return words, probs

    def get_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: a string that you want to get next words for
        k: how many next words you want to get
        verbose: whether to print the output
        """
        sent_vec = self.text_encoder.encode([sentence])
        sent_res = self._get_continuation_tensor(sent_vec)
        if verbose:
            print(sentence)
        return self._get_continuations(sent_res, k, verbose)

    def _get_pos_continuations(self, sentence, words, probs):
        """
        Helper for `get_pos_continuations` that takes the lists of words and
        probabilities and accumulates the probability mass of each POS tag,
        independent of processing an individual sentence.
        """
        # get the POS of each of the k continuations
        pos_counter = Counter()
        for word, prob in zip(words, probs):
            sentence_continuation = "{} {}".format(sentence, word)
            encoded = self.nlp(sentence_continuation)
            pos_counter[encoded[-1].pos_] += prob
        # format pos_counter's most_common output as two lists: POS tags and probabilities
        pos_counter_list = list(zip(*pos_counter.most_common()))
        pos_tags = list(pos_counter_list[0])
        pos_tag_probs = np.array(pos_counter_list[1], dtype=np.float32)
        return pos_tags, pos_tag_probs

    def get_pos_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: string you want next parts of speech for
        k: how many top words to analyze
        NOTE: unlike in `get_continuations`, k is NOT how many unique POS tags
        you get back; it is how many candidate words to consider.
        """
        # get likely next words
        words, probs = self.get_continuations(sentence, k, verbose=False)
        return self._get_pos_continuations(sentence, words, probs)

    ################################################################################
    # The following three functions calculate entropy/surprisal of a SINGLE word
    ################################################################################

    def _get_surprisal(self, distribution, index):
        word_prob = distribution[index]
        return -np.log2(word_prob)

    def get_surprisal(self, sentence, word):
        """Get the -log2 probability of `word` following `sentence`."""
        all_probs = self.get_continuation_probs(sentence)
        # if the word is not in the vocabulary in full, represent its probability by
        # the probability of the first piece of its BPE encoding (hence the second [0])
        word_index = self.text_encoder.encode([word])[0][0]
        return self._get_surprisal(all_probs, word_index)

    def _get_entropy(self, distribution):
        return -np.sum([p * np.log2(p) if p > 0 else 0 for p in distribution])

    def get_entropy(self, sentence):
        """Find the Shannon entropy of the distribution over the word following `sentence`."""
        all_probs = self.get_continuation_probs(sentence)
        return self._get_entropy(all_probs)

    def get_surprisal_entropy_ratio(self, sentence, word):
        """Ratio between surprisal and entropy at the end of the sentence for a given word."""
        all_probs = self.get_continuation_probs(sentence)
        word_index = self.text_encoder.encode([word])[0][0]
        entropy = self._get_entropy(all_probs)
        surprisal = self._get_surprisal(all_probs, word_index)
        return surprisal / entropy

    ####################################################################
    # Same as above but for part of speech
    ####################################################################

    def get_surprisal_pos(self, sentence, pos, k=1000):
        """
        Because the language model is not a POS tagger, we cannot calculate the
        surprisal of a POS tag directly from a full probability distribution;
        instead we use the degenerate distribution computed from the top k most
        probable continuations.

        sentence: the full sentence
        pos: the POS tag to get the surprisal of
        k: how many possible continuations to check
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        pos_index = pos_tags.index(pos)  # assumes the POS we want is in the list somewhere
        return self._get_surprisal(pos_tag_probs, pos_index)

    def get_entropy_pos(self, sentence, k=1000):
        """Same degenerate-distribution disclaimer as above."""
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        return self._get_entropy(pos_tag_probs)

    #####################################################################
    # Gets all of the above metrics for every word in a single sentence #
    #####################################################################

    def get_surprisal_sentence(self, sentence, prepend=None, start=1):
        """A little uglier, but faster: one forward pass scores every position."""
        surprisals = []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]
        # if you run the language model over the whole sentence, the output at each
        # position is the probability distribution for the next token
        sent_batch = self.make_batch([sent_enc])
        sent_tensor = self.lm_model(sent_batch)
        for i in range(start, len(sent_enc)):
            surprisals.append(-np.log2(sent_tensor[:, i - 1, sent_enc[i]].item()))
        return surprisals, sent_dec

    def get_s_h_shr_sentence(self, sentence, prepend=None, start=1):
        """
        Calculates the surprisal, entropy, and surprisal-entropy ratio at each word
        (as defined by the BPE) in the sentence.

        Returns, in order:
        1. The list of surprisals (len(sentence) - 1)
        2. The list of entropies (len(sentence) - 1)
        3. The list of ratios between surprisals and entropies (len(sentence) - 1)
        4. The decoded tokens used by the BPE encoder wrapper
        """
        surprisals, entropies, surprisal_entropy_ratios = [], [], []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]
        # start = max(0, min(1, start)) doesn't work: the model needs at least one token to condition on
        start = 1
        for i in range(start, len(sent_enc)):
            partial_sent_enc = [sent_enc[:i]]
            cont_tensor = self._get_continuation_tensor(partial_sent_enc)
            partial_probs = self.tensor_to_probs(cont_tensor)
            surprisals.append(self._get_surprisal(partial_probs, sent_enc[i]))
            entropies.append(self._get_entropy(partial_probs))
            surprisal_entropy_ratios.append(surprisals[-1] / entropies[-1])
        return surprisals, entropies, surprisal_entropy_ratios, sent_dec
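# Usage sketch (illustrative; the example sentences are placeholders):
# instantiate once (loads the pretrained weights), then query the metrics.
analyzer = SurprisalAnalyzer()

# top-5 likely continuations of a context, with their probabilities
words, probs = analyzer.get_continuations("The doctor told the patient that", k=5, verbose=True)

# surprisal (-log2 p) of one candidate next word, and entropy of the whole distribution
s = analyzer.get_surprisal("The doctor told the patient that", "he")
h = analyzer.get_entropy("The doctor told the patient that")
print(s, h, s / h)

# per-token surprisals for a full sentence in a single forward pass
surprisals, tokens = analyzer.get_surprisal_sentence("The horse raced past the barn fell.")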
n_special = 3
max_len = n_ctx // 2 - 2  # assumes n_ctx (e.g. args.n_ctx) is already defined at this point
n_ctx = 626 * 2 + 4       # then re-set to the fixed context length for this dataset
vocab = n_vocab + n_special + n_ctx
print(vocab)
trX, trM = transform_roc(trX)
vaX, vaM = transform_roc(vaX)
n_train = len(trX)
n_valid = len(vaX)
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train // n_batch_train) * args.n_iter

lm_model = LMModel(args, vocab, n_ctx)
criterion = nn.CrossEntropyLoss(reduction='none')  # per-token losses ('reduce=False' is deprecated)
model_opt = OpenAIAdam(lm_model.parameters(),
                       lr=args.lr,
                       schedule=args.lr_schedule,
                       warmup=args.lr_warmup,
                       t_total=n_updates_total,
                       b1=args.b1,
                       b2=args.b2,
                       e=args.e,
                       l2=args.l2,
                       vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)
compute_loss_fct = LMLossCompute(criterion, model_opt)
load_openai_pretrained_model(lm_model.transformer,
firstbpe, secondbpe = encode_dataset(*(firstsent, secondsent), encoder=text_encoder)
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
n_special = 3  # note: only two special tokens are registered above; a '_classify_' token may be missing
max_len = n_ctx // 2 - 2  # assumes n_ctx is already defined at this point
n_ctx = 1256
vocab = n_vocab + n_special + n_ctx
n_train = len(firstsent)
n_valid = len(secondsent)
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train // n_batch_train) * args.n_iter

dh_model = LMModel(args, vocab, n_ctx)
load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)
dh_model.to(device)
dh_model = nn.DataParallel(dh_model)

n_updates = 0
n_epochs = 0
desc = "challenge"
path = os.path.join(save_dir, desc, 'best_params')
dh_model.load_state_dict(torch.load(path))
arr = predict(firstsent, secondsent, firstbpe, secondbpe)
with open(os.path.join(os.getcwd(), 'part1.txt'), 'w') as w:
    for pred in arr:
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    # register the three special tokens in both directions (encoder and decoder)
    text_encoder.decoder[len(encoder)] = '_start_'
    encoder['_start_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_delimiter_'
    encoder['_delimiter_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_classify_'
    encoder['_classify_'] = len(encoder)
    n_special = 3
    vocab = n_vocab + n_special + n_ctx

    lm_model = LMModel(args, vocab, n_ctx, return_probs=True, doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        # strip the "module." prefix that nn.DataParallel adds to parameter names
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        # mask out the positional-embedding slots so they are never sampled as tokens
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)
    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    train_bar = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder,
                           num_workers=1, shuffle=True, max_size=args.n_iter)
    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for i, (pad_output, mask_output) in enumerate(tqdm(train_bar), 1):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)

    for i in range(len(hyps)):
        print("*" * 50)
        print("Source: {}".format(srcs[i]))
        print("Hypothesis: {}".format(hyps[i]))
        print("Reference: {}".format(refs[i]))
) + 3, n_ctx)
vocab = n_vocab + n_special + n_ctx
trX, trM = transform_roc(trX1, trX2, trX3)
vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
if submit:
    teX, teM = transform_roc(teX1, teX2, teX3)
n_train = len(trY)
n_valid = len(vaY)
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train // n_batch_train) * args.n_iter

# changed this one to LMModel
# dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)
lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
# load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
lm_model.to(device)

criterion = nn.CrossEntropyLoss(reduction='none')  # per-token losses ('reduce=False' is deprecated)
model_opt = OpenAIAdam(lm_model.parameters(),
                       lr=args.lr,
                       schedule=args.lr_schedule,
                       warmup=args.lr_warmup,
                       t_total=n_updates_total,
                       b1=args.b1,
                       b2=args.b2,
                       e=args.e,
                       l2=args.l2,
                       vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)
def main(args):
    # Constants
    n_ctx = args.n_ctx
    desc = args.desc

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    test_loader = get_loader(args.data_file, args.n_batch, encoder,
                             num_workers=1, shuffle=False, subset=args.subset)

    vocab = n_vocab + n_special + n_ctx
    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    print("Loading model...")
    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                 path="./model/", path_names="./")
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        # strip the "module." prefix that DataParallel adds to parameter names
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        # mask out the positional-embedding slots so they are never sampled as tokens
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        dh_model.load_state_dict(state_dict)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)

    stop_words = []
    if args.stop_words is not None:
        with open(args.stop_words) as f:
            for line in f:
                stop_words.append(line.strip())  # drop the trailing newline from each stop word
    evaluate_model(dh_model, test_loader, text_encoder, device,
                   args.beam, args.gen_len, args.k, args.decoding_strategy,
                   args.save_file, args.gen_dir, args.tgt_dir, args.max_len,
                   stop_words, args)
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch,
                              encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder,
                            num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)
    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                     path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    # first the domain-adaptive training (DAT) epochs with the LM loss,
    # then the fine-tuning (FT) epochs with the summarization loss
    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, lm_loss,
                                             model_opt, train_loader, val_loader,
                                             train_log_interval, val_log_interval, device,
                                             beam, gen_len, k, decoding_strategy, accum_iter,
                                             "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress,
                                             summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, summary_loss,
                                             model_opt, train_loader, val_loader,
                                             train_log_interval, val_log_interval, device,
                                             beam, gen_len, k, decoding_strategy, accum_iter,
                                             "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress)