        tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))
    ])
    masked_token = tokenizer.mask_token
    topk_filled_outputs = []
    # Re-insert each predicted token into the original sentence.
    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
        # '\u2581' is the SentencePiece word-boundary marker; turn it back into a space.
        predicted_token = predicted_token_bpe.replace('\u2581', ' ')
        if " {0}".format(masked_token) in masked_input:
            topk_filled_outputs.append((
                masked_input.replace(' {0}'.format(masked_token), predicted_token),
                values[index].item(),
                predicted_token,
            ))
        else:
            topk_filled_outputs.append((
                masked_input.replace(masked_token, predicted_token),
                values[index].item(),
                predicted_token,
            ))
    return topk_filled_outputs


tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForMaskedLM.from_pretrained('camembert-base')
model.eval()

masked_input = "Le camembert est <mask> :)"
print(fill_mask(masked_input, model, tokenizer, topk=3))
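Each element of the returned list is a (filled sentence, probability, predicted token) triple. A minimal usage sketch, not part of the original snippet and with illustrative loop-variable names, that prints one candidate per line:

# Hypothetical usage: unpack and pretty-print the top-k candidates.
for filled_sentence, score, token in fill_mask(masked_input, model, tokenizer, topk=3):
    print(f"{score:.3f}\t{token}\t{filled_sentence}")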
    def __init__(self, data_list, text_ptm_dir):
        self.data_list = data_list
        # Load the CamemBERT tokenizer from the given pretrained-model directory.
        self.tokenizer = CamembertTokenizer.from_pretrained(text_ptm_dir)
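This `__init__` presumably belongs to a PyTorch `Dataset` wrapper around raw text. A minimal sketch of how such a class might look, assuming `data_list` holds plain strings; the class name, `__len__`/`__getitem__` bodies, and `max_length=128` are assumptions for illustration, not taken from the original:

import torch
from torch.utils.data import Dataset
from transformers import CamembertTokenizer

class CamembertTextDataset(Dataset):  # hypothetical class name
    def __init__(self, data_list, text_ptm_dir):
        self.data_list = data_list
        self.tokenizer = CamembertTokenizer.from_pretrained(text_ptm_dir)

    def __len__(self):
        # One item per raw text string.
        return len(self.data_list)

    def __getitem__(self, idx):
        # Tokenize a single example; max_length=128 is an arbitrary choice.
        encoding = self.tokenizer(
            self.data_list[idx],
            truncation=True,
            max_length=128,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop the batch dimension added by return_tensors='pt'.
        return {k: v.squeeze(0) for k, v in encoding.items()}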