import os
import pickle

import ilm.tokenize_util
from ilm.infer import infill_with_ilm
from transformers import GPT2LMHeadModel


def infilling_ngram(self, context: str):
    result = []

    # Load the id<->token maps that extend GPT-2's vocabulary with the ILM
    # infill control tokens, and register them with the tokenizer.
    with open(os.path.join(MODEL_DIR, 'additional_ids_to_tokens.pkl'), 'rb') as f:
        additional_ids_to_tokens = pickle.load(f)
    additional_tokens_to_ids = {v: k for k, v in additional_ids_to_tokens.items()}
    try:
        ilm.tokenize_util.update_tokenizer(additional_ids_to_tokens, tokenizer)
    except ValueError:
        print('Already updated')

    # Load model
    device = 'cpu'
    model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)
    model.eval()
    _ = model.to(device)

    context_ids = ilm.tokenize_util.encode(context, tokenizer)
    _blank_id = ilm.tokenize_util.encode(' _', tokenizer)[0]
    # Infilling type: one of sentence, document, mixture, paragraph, ngram, or word.
    context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_ngram|>']

    generated = infill_with_ilm(model, additional_tokens_to_ids, context_ids,
                                num_infills=5)
    for g in generated:
        result.append(ilm.tokenize_util.decode(g, tokenizer))
    return result
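# Usage sketch (assumptions: the method lives on a class, called `Infiller`
# here purely for illustration, and MODEL_DIR / tokenizer are module-level
# globals as above; each blank in the context string is written as ' _'):
#
#   infills = Infiller().infilling_ngram('She ate _ for breakfast.')
#   for s in infills:
#       print(s)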
def infilling_word(self, context: str, order: int, mask: list):
    # Unlike infilling_ngram above, this method assumes the model and the
    # token maps were loaded once (e.g. in __init__) and are reused through
    # self.model / self.additional_tokens_to_ids rather than reloaded per call.
    result = []

    context_ids = ilm.tokenize_util.encode(context, tokenizer)
    _blank_id = ilm.tokenize_util.encode(' _', tokenizer)[0]

    # Replace each blank from left to right with the control token named by
    # the corresponding entry of `mask` ('word' or 'sent').
    count_mask = 0
    for i in range(len(context_ids)):
        if context_ids[i] == _blank_id:
            if mask[count_mask] == 'word':
                context_ids[i] = self.additional_tokens_to_ids['<|infill_word|>']
            if mask[count_mask] == 'sent':
                context_ids[i] = self.additional_tokens_to_ids['<|infill_sentence|>']
            count_mask += 1

    generated = infill_with_ilm(self.model, self.additional_tokens_to_ids,
                                context_ids, num_infills=order)
    for g in generated:
        result.append(ilm.tokenize_util.decode(g, tokenizer))
    return result
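# Usage sketch (same hypothetical `Infiller` class as above): `mask` names one
# infill granularity per blank, left to right, and `order` is the number of
# infilled variants to generate.
#
#   infills = Infiller().infilling_word(
#       'She ate _ for breakfast. _', order=3, mask=['word', 'sent'])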
# Replace blanks with appropriate tokens from left to right.
_blank_id = ilm.tokenize_util.encode(' _', tokenizer)[0]
context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_word|>']
context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_word|>']
# Other granularities work the same way, e.g.:
# context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_ngram|>']
# context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_sentence|>']
print(ilm.tokenize_util.decode(context_ids, tokenizer))

from ilm.infer import infill_with_ilm

generated = infill_with_ilm(model, additional_tokens_to_ids, context_ids,
                            num_infills=10)
for g in generated:
    print('-' * 80)
    print(ilm.tokenize_util.decode(g, tokenizer))

blankCandidates = []
with open("/u/scr/mhahn/PRETRAINED/GLUE/glue_data/SST-2/dev_alternatives_c_sentBreak_new_finetuned_large.tsv",
          "r") as inFile:
    for line in inFile:
        if line.startswith("####"):
            # Each '####' marker is followed by one line to skip and then a
            # whitespace-tokenized sentence.
            next(inFile)
            tokenized = next(inFile).strip().split(" ")
            print("TOK", tokenized)
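# Sanity-check sketch for the blank convention used throughout: indexing
# context_ids by _blank_id only lines up with the written blanks if ' _'
# encodes to a single token id, so it can be worth asserting this once after
# loading the tokenizer (a minimal sketch, no new names assumed):
#
#   assert len(ilm.tokenize_util.encode(' _', tokenizer)) == 1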