def test_inference_masked_lm(self): model = RobertaForMaskedLM.from_pretrained("roberta-base") input_ids = torch.tensor( [[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = torch.Size((1, 11, 50265)) self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. expected_slice = torch.tensor([[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]) # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base') # roberta.eval() # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach() self.assertTrue( torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def __init__(self, model_path='roberta-base', temperature=1.0,
             top_k=None, top_p=None, device='cuda'):
    super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
    self.model_path = model_path
    # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
    # self.model = AutoModel.from_pretrained(model_path)
    self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
    self.model = RobertaForMaskedLM.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
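
# Hedged sketch (not part of the original class) of how the stored
# temperature / top_k settings are typically applied to MLM logits before
# sampling a fill for a masked position; the helper name is an assumption.
import torch

def _sample_masked_token(logits, temperature=1.0, top_k=None):
    logits = logits / temperature  # flatten or sharpen the distribution
    if top_k is not None:
        # keep only the top_k highest-scoring tokens
        kth_best = torch.topk(logits, top_k).values[..., -1, None]
        logits = logits.masked_fill(logits < kth_best, float('-inf'))
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)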
def test_inference_masked_lm(self):
    model = RobertaForMaskedLM.from_pretrained('roberta-base')

    input_ids = torch.tensor(
        [[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 11, 50265))
    self.assertEqual(output.shape, expected_shape)
    # compare the actual values for a slice.
    expected_slice = torch.tensor(
        [[[33.8843, -4.3107, 22.7779],
          [4.6533, -2.8099, 13.6252],
          [1.8222, -3.6898, 8.8600]]])
    self.assertTrue(
        torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
def test_tokenize(self):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    vocab_path = os.path.join(current_dir, 'data', 'vocab.txt')
    tokenized_smiles = [
        12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17,
        16, 18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16,
        38, 23, 18, 17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16,
        22, 16, 21, 23, 20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20,
        22, 16, 16, 22, 16, 16, 22, 16, 20, 13
    ]
    model = RobertaForMaskedLM.from_pretrained(
        'seyonec/SMILES_tokenized_PubChem_shard00_50k')
    model.num_parameters()
    tokenizer = SmilesTokenizer(
        vocab_path, max_len=model.config.max_position_embeddings)
    assert tokenized_smiles == tokenizer.encode(
        "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"
    )
def __init__(self, args):
    # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    if args.model_path is not None:
        print("Testing CoLAKE...")
        print('loading model parameters from {}...'.format(args.model_path))
        config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
        self.model = RobertaForMaskedLM(config=config)
        states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
        self.model.load_state_dict(states_dict, strict=False)
    else:
        print("Testing RoBERTa baseline...")
        self.model = RobertaForMaskedLM.from_pretrained('roberta-base')

    self._build_vocab()
    self._init_inverse_vocab()
    self._model_device = 'cpu'
    self.max_sentence_length = args.max_sentence_length
def test_sequence():
    result_dir = "../results/"
    tok_dir = "tokenizer_model/"
    tokenizer = train.get_tok(tok_dir)
    csv_path = "../utils/test.csv"
    txt_name = "test.txt"
    utils.make_train_txt(csv_path, txt_name)
    dp = DP("../utils/test.csv")
    param = dp.param.to_numpy()[-7:]
    param = param.tolist()
    param.append("probability")
    mod_dir = "transformer_model/checkpoint-33000/"
    model = RobertaForMaskedLM.from_pretrained(mod_dir)
    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer, device=0)
    PD, PB, ill, count = start(fill_mask)
    denorm = dp.denormalize(ill)
    l = []
    for x in range(len(PD)):
        tmp = []
        for j in range(7):
            tmp.append(random.randrange(denorm[x][j][0], denorm[x][j][1] + 1))
        tmp.append(PB[x])
        l.append(tmp)
    df = pd.DataFrame(l, columns=param)
    df.to_csv(result_dir + "result.csv", index=False, encoding="cp949")
def __init__(self, config):
    super().__init__()
    self.train_config = config

    self.roberta = RobertaForMaskedLM.from_pretrained('roberta-base')
    _ = self.roberta.eval()
    for param in self.roberta.parameters():
        param.requires_grad = False
    self.pred_model = self.roberta.roberta
    self.enc_model = self.pred_model.embeddings.word_embeddings

    # self.proj_head = DVProjectionHead()
    # self.proj_head = DVProjectionHead_ActiFirst()
    self.proj_head = DVProjectionHead_EmbActi()

    self.tkz = RobertaTokenizer.from_pretrained("roberta-base")
    self.collator = TokenizerCollate(self.tkz)

    self.lossfunc = nn.BCEWithLogitsLoss()
    self.acc = Accuracy(threshold=0.0)
    self.f1 = F1(threshold=0.0)
def evaluate(args):
    """
    Args:
        ckpt: model checkpoints.
        hparams_file: the string should end with "hparams.yaml".
    """
    trainer = Trainer(gpus=args.gpus,
                      distributed_backend=args.distributed_backend,
                      deterministic=True)

    # reload test dataloader
    # print(trainer.test())
    print("path_to_model_checkpoint", args.path_to_model_checkpoint)
    # print(BertForQA)
    model = BertForQA.load_from_checkpoint(
        checkpoint_path=args.path_to_model_checkpoint,
        hparams_file=args.path_to_model_hparams_file,
        map_location=None,
        batch_size=args.eval_batch_size,
    )
    mlm_model = RobertaForMaskedLM.from_pretrained(
        './cached_models/roberta_squad1_covidmlm(train_and_dev)_3epoch/')
    model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())
    # mlm_model = RobertaForMaskedLM.from_pretrained('./cached_models/roberta_squad1_2epoch_covidmlm_3epoch/')
    # model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())

    # # evaluate ner
    # model = BertForNERTask.load_from_checkpoint(
    #     checkpoint_path=args.path_to_model_checkpoint,
    #     hparams_file=args.path_to_model_hparams_file,
    #     map_location=None,
    #     batch_size=args.eval_batch_size
    # )

    trainer.test(model=model)
parser = argparse.ArgumentParser(
    description="Extract some layers of the full RobertaForMaskedLM or "
                "GPT2LMHeadModel for Transfer Learned Distillation")
parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
parser.add_argument("--model_name", default="roberta-large", type=str)
parser.add_argument("--dump_checkpoint",
                    default="serialization_dir/tf_roberta_048131723.pth",
                    type=str)
parser.add_argument("--vocab_transform", action="store_true")
args = parser.parse_args()

if args.model_type == "roberta":
    model = RobertaForMaskedLM.from_pretrained(args.model_name)
    prefix = "roberta"
elif args.model_type == "gpt2":
    model = GPT2LMHeadModel.from_pretrained(args.model_name)
    prefix = "transformer"

state_dict = model.state_dict()
compressed_sd = {}

# Embeddings #
if args.model_type == "gpt2":
    for param_name in ["wte.weight", "wpe.weight"]:
        compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"]
else:
    for w in [
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=50265,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
model = RobertaForMaskedLM.from_pretrained('roberta-base', config=config)
# 12-layer, 768-hidden, 12-heads, 125M parameters; roberta-base uses the bert-base architecture

dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                file_path='IMDB_train.csv.txt',
                                block_size=128)
# dataset = load_dataset("./csv_for_ft_new.py", data_files=file_path)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

training_args = TrainingArguments(
    do_train=True,
    do_predict=True,
    output_dir=savedname,
    overwrite_output_dir=True,
from typing import Optional, List, Callable, Any, Dict

import torch
from transformers import FillMaskPipeline
from transformers import RobertaTokenizerFast, RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained("roberta-large")
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large")
device_number = torch.cuda.current_device() if torch.cuda.is_available() else -1
mask_predictor = FillMaskPipeline(model=model, tokenizer=tokenizer,
                                  device=device_number)


# def predict_mask(text, predictor=mask_predictor,
#                  options: Optional[List[str]] = None,
#                  num_results: Optional[int] = 1):
#     results = predictor(text, targets=options, top_k=num_results)
#     parsed_results = []
#     for result in results:
#         parsed_result = {"word": _postprocess_mask_prediction_token(result['token_str']),
#                          "softmax": result["score"]}
#         parsed_results.append(parsed_result)
#     return parsed_results


def _postprocess_mask_prediction_token(text):
    return text[1:] if text[0] == "Ġ" else text
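
# Hedged usage sketch of the pipeline above, mirroring the commented-out
# predict_mask helper; the example sentence is ours, and the 'token_str' /
# 'score' keys follow the transformers fill-mask pipeline output format.
results = mask_predictor("The goal of life is <mask>.", top_k=3)
for result in results:
    word = _postprocess_mask_prediction_token(result["token_str"])
    print(word, result["score"])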
def gen_neighborhood(args):
    # initialize seed
    if args.seed is not None:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    # load model and tokenizer
    # slow tokenizer is for non-unk decoding
    if args.is_roberta:
        r_model = RobertaForMaskedLM.from_pretrained(args.model)
        tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer, max_len=512)
        old_style_tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer, max_len=512)
        mask_length = max(len(tokenizer.vocab), r_model.lm_head.decoder.out_features)
        start_ignore = min(len(tokenizer.vocab), r_model.lm_head.decoder.out_features)
    else:
        tokenizer = BertTokenizerFast.from_pretrained(
            args.tokenizer,
            clean_text=True,
            tokenize_chinese_chars=True,
            strip_accents=False,
            do_lower_case=False,
        )
        old_style_tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer,
            do_lower_case=False,
            strip_accents=False,
            tokenize_chinese_chars=True,
        )
        r_model = BertForMaskedLM.from_pretrained(args.model)
        assert len(tokenizer.vocab) == r_model.cls.predictions.decoder.out_features
        mask_length = len(tokenizer.vocab)
        start_ignore = mask_length

    r_model.eval()
    if torch.cuda.is_available():
        r_model.cuda()

    # remove unused vocab and special ids from sampling
    softmax_mask = np.full(mask_length, False)
    softmax_mask[tokenizer.all_special_ids] = True
    for k, v in tokenizer.vocab.items():
        if "[unused" in k:
            softmax_mask[v] = True
    for i in range(start_ignore, mask_length):
        # this is what happens if your vocab is smaller than it claims to be;
        # we'll never use the rest of the ids anyway, so we should mask them
        softmax_mask[i] = True
    if not args.is_roberta:
        import pdb
        pdb.set_trace()

    # load the inputs and labels
    lines = [tuple(s.strip().split("\t")) for s in open(args.in_file).readlines()]
    num_lines = len(lines)
    # lines[i] is a list of [s], where s is each sentence in the ith column
    # of the file
    lines = [[[s] for s in s_list] for s_list in list(zip(*lines))]

    # load label file if it exists
    if args.label_file:
        labels = [s.strip() for s in open(args.label_file).readlines()]
        output_labels = True
    else:
        labels = [0] * num_lines
        output_labels = False

    # shard the input and labels
    if args.num_shards > 0:
        shard_start = (int(num_lines / args.num_shards) + 1) * args.shard
        shard_end = (int(num_lines / args.num_shards) + 1) * (args.shard + 1)
        lines = [s_list[shard_start:shard_end] for s_list in lines]
        labels = labels[shard_start:shard_end]

    # open output files
    if args.num_shards != 1:
        s_rec_file = open(args.output_prefix + "_" + str(args.shard), "w")
        unk_rec_file = open(args.output_prefix + "_unks_" + str(args.shard), "w")
        if output_labels:
            l_rec_file = open(args.output_prefix + "_" + str(args.shard) + ".label", "w")
    else:
        s_rec_file = open(args.output_prefix, "w")
        unk_rec_file = open(args.output_prefix + "_unks", "w")
        if output_labels:
            l_rec_file = open(args.output_prefix + ".label", "w")

    # sentences and labels to process
    sents = []
    l = []

    # number of sentences generated
    num_gen = []

    # sentence index to noise from
    gen_index = []

    # number of tries generating a new sentence
    num_tries = []

    # next sentence index to draw from
    next_sent = 0

    # indices and words corresponding to each instance of [UNK] / <unk> for
    # the sentences in sents
    unks = []

    sents, l, next_sent, num_gen, num_tries, gen_index, unks = fill_batch(
        args,
        tokenizer,
        sents,
        l,
        lines,
        labels,
        next_sent,
        num_gen,
        num_tries,
        gen_index,
        unks,
        old_style_tokenizer,
    )

    total_unks_in_base_corpus = 0

    # main augmentation loop
    while sents != []:

        # remove any sentences that are done generating and dump to file
        for i in range(len(num_gen))[::-1]:
            if num_gen[i] == args.num_samples or num_tries[i] > args.max_tries:

                # get sent info
                gen_sents = sents.pop(i)
                num_gen.pop(i)
                gen_index.pop(i)
                label = l.pop(i)
                unk_list = unks.pop(i)
                total_unks_in_base_corpus += len(unk_list[0][0])

                # write generated sentences
                for sg, unk in zip(gen_sents[1:], unk_list[1:]):
                    # the [1:-1] here refers to some weirdness that repr() does
                    # on strings -- namely, adding quotes at the start and end
                    de_unked = [
                        de_unk(repr(val)[1:-1], unk[i], tokenizer)
                        for i, val in enumerate(sg)
                    ]
                    orig = [repr(val)[1:-1] for val in sg]
                    s_rec_file.write("\t".join(de_unked) + "\n")
                    unk_rec_file.write("\t".join(orig) + "\n")
                    if output_labels:
                        l_rec_file.write(label + "\n")

        # fill batch
        sents, l, next_sent, num_gen, num_tries, gen_index, unks = fill_batch(
            args,
            tokenizer,
            sents,
            l,
            lines,
            labels,
            next_sent,
            num_gen,
            num_tries,
            gen_index,
            unks,
            old_style_tokenizer,
        )

        # break if done dumping
        if len(sents) == 0:
            print(f"Total unks in base corpus: {total_unks_in_base_corpus}")
            break

        # build batch
        toks = []
        masks = []
        for i in range(len(gen_index)):
            s = sents[i][gen_index[i]]
            tok, mask = hf_masked_encode(
                tokenizer,
                *s,
                noise_prob=args.noise_prob,
                random_token_prob=args.random_token_prob,
                leave_unmasked_prob=args.leave_unmasked_prob,
            )
            toks.append(tok)
            masks.append(mask)

        # pad up to max len input
        max_len = max([len(tok) for tok in toks])
        pad_tok = tokenizer.pad_token_id
        toks = [F.pad(tok, (0, max_len - len(tok)), "constant", pad_tok) for tok in toks]
        masks = [F.pad(mask, (0, max_len - len(mask)), "constant", pad_tok) for mask in masks]
        toks = torch.stack(toks)
        masks = torch.stack(masks)

        # load to GPU if available
        if torch.cuda.is_available():
            toks = toks.cuda()
            masks = masks.cuda()

        # predict reconstruction
        rec, rec_masks = hf_reconstruction_prob_tok(
            toks,
            masks,
            tokenizer,
            r_model,
            softmax_mask,
            reconstruct=True,
            topk=args.topk,
        )

        # decode reconstructions and append to lists
        for i in range(len(rec)):
            rec_work = rec[i].cpu().tolist()
            meaningful_ids = [val for val in rec_work if val != tokenizer.pad_token_id][1:-1]
            # TODO (maybe) make this work with multiple sentences;
            # this would involve splitting on tokenizer.sep_token_id and
            # doing this for each sentence
            curr_unk_data = get_current_unk_data(
                meaningful_ids, unks[i][0][0], tokenizer.unk_token_id)
            curr_unk_data = (curr_unk_data,)
            s_rec = [
                s.strip()
                for s in tokenizer.decode(meaningful_ids).split(tokenizer.sep_token)
            ]
            s_rec = tuple(s_rec)

            # check if identical reconstruction or empty
            if s_rec not in sents[i] and "" not in s_rec:
                sents[i].append(s_rec)
                num_gen[i] += 1
                num_tries[i] = 0
                gen_index[i] = 0
                unks[i].append(curr_unk_data)
            # otherwise try next sentence
            else:
                num_tries[i] += 1
                gen_index[i] += 1
                if gen_index[i] == len(sents[i]):
                    gen_index[i] = 0

        # clean up tensors
        del toks
        del masks
def _get_masked_language_model(self):
    """
    Initializes the RobertaForMaskedLM transformer.
    """
    self.mlm = RobertaForMaskedLM.from_pretrained(self.model)
    self.mlm.eval()
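
# Hedged usage sketch for the initializer above: score candidates for a masked
# position with the loaded MLM. `mlm` and `tokenizer` stand in for the
# attributes set up elsewhere in the class; the example sentence is ours.
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
mlm = RobertaForMaskedLM.from_pretrained("roberta-base").eval()

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = mlm(**inputs)[0]  # [batch, seq_len, vocab_size]
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero()[0]
top_ids = logits[0, mask_pos].topk(5).indices[0].tolist()
print([tokenizer.decode([i]).strip() for i in top_ids])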
def create_long_model(model_specified, attention_window, max_pos, save_model_to):
    """Starting from the `roberta-base` (or similar) checkpoint, convert the
    model into an instance of `RobertaLong`. This makes the following changes:

    1) extend the position embeddings from `512` positions to `max_pos`
       (in Longformer, we set `max_pos=4096`);
    2) initialize the additional position embeddings by copying the embeddings
       of the first `512` positions. This initialization is crucial for model
       performance (check table 6 in
       [the paper](https://arxiv.org/pdf/2004.05150.pdf) for performance
       without this initialization);
    3) replace the `modeling_bert.BertSelfAttention` objects with
       `modeling_longformer.LongformerSelfAttention` using an attention window
       of size `attention_window`.

    The output of this function works for long documents even without
    pretraining. Check tables 6 and 11 in
    [the paper](https://arxiv.org/pdf/2004.05150.pdf) to get a sense of the
    expected performance of this model before pretraining."""

    model = RobertaForMaskedLM.from_pretrained(
        model_specified)  # ,gradient_checkpointing=True)
    tokenizer = RobertaTokenizer.from_pretrained(model_specified,
                                                 model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0, 1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_embeddings.num_embeddings = len(new_pos_embed.data)

    # first, check that model.roberta.embeddings.position_embeddings.weight.data.shape
    # is correct -- it has to match your desired length (4096 by default)
    # model.roberta.embeddings.position_ids = torch.arange(
    #     0, model.roberta.embeddings.position_embeddings.num_embeddings
    # )[None]
    model.roberta.embeddings.position_ids.data = torch.tensor(
        [i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value = copy.deepcopy(layer.attention.self.value)

        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
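
# The docstring above mentions `RobertaLong`. In the reference Longformer
# conversion script, loading the checkpoint saved by create_long_model back
# requires a subclass that swaps in LongformerSelfAttention at construction
# time. The sketch below follows that pattern; the class name is an
# assumption, not something defined in this file.
class RobertaLongForMaskedLM(RobertaForMaskedLM):
    def __init__(self, config):
        super().__init__(config)
        for i, layer in enumerate(self.roberta.encoder.layer):
            # replace self-attention with its long-document counterpart
            layer.attention.self = LongformerSelfAttention(config, layer_id=i)

# model = RobertaLongForMaskedLM.from_pretrained(save_model_to)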
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1)
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold > 0:
            break
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # use only original rows
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            trn_df = trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  # + additional_tokens

        fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='</s>',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            use_category=False,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)

        model = RobertaForMaskedLM.from_pretrained(MODEL_PRETRAIN)
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            # model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch_ML(model, optimizer, trn_loader, DEVICE)
            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(trn_loss)
            else:
                histories['val_loss'][fold] = [trn_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(trn_loss)
            else:
                histories['val_metric'][fold] = [trn_loss, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(trn_loss)
            else:
                histories['val_metric_raws'][fold] = [trn_loss, ]
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ',
                logger)
            model = model.to('cpu')
            # model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model, optimizer, scheduler, histories,
                [], [], [], fold, epoch, trn_loss, trn_loss,
            )
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer, clean=False)
        del model

    send_line_notification('fini!')
    sel_log('now saving best checkpoints...', logger)
import torch
from transformers import RobertaForMaskedLM, RobertaTokenizer

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TODO: move these global variables to local as we add metrics.
# TODO: finetune `MODEL` on the QA/dialog dataset for a better metric
MODEL = RobertaForMaskedLM.from_pretrained('roberta-large').to(DEVICE)
TOKENIZER = RobertaTokenizer.from_pretrained('roberta-large')


def mlm_metric(prompt, response):
    """
    Masked language model metric. (Need to double-check implementation.)

    Parameters
    ----------
    prompt : str
        The user comment.
    response : str
        The bot's response.

    Returns
    -------
    Sum of negative probabilities (lower is better).
    """
    tokens = TOKENIZER([[prompt, response]], return_tensors='pt')
    input_ids = tokens['input_ids'][0]
    dividing_indices = [
tokenizer2 = PreTrainedTokenizerFast.from_pretrained(
    dataset_path / 'my-pretrained-tokenizer-fast2', max_len=128)


# 4. Check that the LM actually trained
def to_gpu(x, *args, **kwargs):
    return x.cuda(*args, **kwargs) if USE_GPU else x


# load trained model
# os.system('tar xzvf PanTadeuszRoBERTa.tgz')
model = RobertaForMaskedLM.from_pretrained(str(model_path))
model = to_gpu(model)
model.device

# ## generate
model.eval()
max_length = 100

eval = Evaluator(text_tokenizer, tokenizer2)
# gen1 = evaluate('chwycił na taśmie przypięty', max_length=max_length, temperature=1.0)
# print_eval(gen1)
# gen1
# print_eval(evaluate('Litwo! Ojczyzno', max_length=max_length, temperature=1.0))
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    raise ValueError(
        "Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)


# Prepare model
class tempmodel(nn.Module):
    def __init__(self, roberta, insert_net, delete_net):
        super().__init__()
        self.roberta = roberta
        self.insert_net = insert_net
        self.delete_net = delete_net


roberta = RobertaForMaskedLM.from_pretrained("roberta-base")
insert_net = nn.Linear(768, 3)
delete_net = nn.Linear(768, 3)
roberta.load_state_dict(
    torch.load(os.path.join(args.from_dir, 'bert_model.bin')))
# if args.delete:
#     insert_net.load_state_dict(torch.load(os.path.join(args.from_dir, 'insert_model.bin')))
# else:
from weight_init import weight_init
insert_net.apply(weight_init)
delete_net.apply(weight_init)

# init CTRL code
roberta.roberta.embeddings.word_embeddings.weight[50261, :].data = \
    roberta.roberta.embeddings.word_embeddings.weight[0, :].data
roberta.roberta.embeddings.word_embeddings.weight[
file_in = sys.argv[1]
file_out = sys.argv[2]

all_data_dict = dict()
max_length = 100
tail_hidd_list = list()

# device = "cpu"
device = "cuda"
pretrained_weights = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)
fine_tuned_weight = 'roberta-base'
model = RobertaForMaskedLM.from_pretrained(pretrained_weights,
                                           output_hidden_states=True,
                                           return_dict=True)
# model.load_state_dict(torch.load(fine_tuned_weight), strict=False)
# model.to(device).half()
model.to(device)
model.eval()

num_samples = 1000000
old = torch.FloatTensor(768)
with open(file_in) as f:
    # data = json.load(f)
    for index, d in tqdm(enumerate(f)):
        if index == 1000000:
            break
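
# Hedged sketch of how the hidden states requested via output_hidden_states=True
# can be read out; the example text is ours, and the indexing follows the
# transformers API (`hidden_states` is a tuple of num_layers + 1 tensors of
# shape [batch, seq_len, hidden]). Which token/layer the original script keeps
# is not shown in this excerpt.
inputs = tokenizer("hello world", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
tail_hidd = outputs.hidden_states[-1][0, -1]  # final layer, last token
tail_hidd_list.append(tail_hidd.cpu())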
    intermediate_size=2048,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    bos_token_id=tokenizer2._tokenizer.token_to_id("<s>"),
    eos_token_id=tokenizer2._tokenizer.token_to_id("</s>"),
    pad_token_id=tokenizer2._tokenizer.token_to_id("<pad>"),
    attention_probs_dropout_prob=0.0,
    hidden_dropout_prob=0.0,
)
print(config)

# model = RobertaForMaskedLM(config=config)
# !tar xzvf "PanTadeuszRoBERTa.tgz"
model = RobertaForMaskedLM.from_pretrained('runs/run_4/model')
print(f'model.num_parameters(): {model.num_parameters()}')

# Build training/eval Datasets
print(fn_corpus)
dataset = LineByLineTextDataset(
    tokenizer=tokenizer2,
    file_path=fn_corpus,
    block_size=128,
)
eval_dataset = LineByLineTextDataset(
    tokenizer=tokenizer2,
    file_path=fn_corpus_eval,
    block_size=128,
)
def main():
    # See all possible arguments in src/transformers/training_args.py, or pass
    # the --help flag to this script. We keep distinct sets of arguments for a
    # cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process:
    logger.warning(
        f"rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu} "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Set the verbosity to info for the main process:
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and
    # evaluation files (see below) or just provide the name of one of the public
    # datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the
    # first column. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only
    # one local process can download the dataset concurrently.
    if data_args.dataset_name is not None:
        # Download and load a dataset from the hub.
        # First make sure the loading script is cached locally.
        cache_script = os.path.join("data", data_args.dataset_name + ".py")
        if not os.path.exists(cache_script):
            raise Exception("Please check that the dataset script exists locally")
        datasets = load_dataset(path=cache_script,
                                name=data_args.dataset_config_name,
                                data_dir=data_args.data_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files,
    # python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can
    # concurrently download the model and vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config from scratch.")

    # tokenizer setup
    if model_args.tokenizer_name:
        if model_args.tokenizer_name == "myroberta":
            tokenizer = BertTokenizer.from_pretrained(
                model_args.tokenizer_name,
                cache_dir=model_args.cache_dir,
                use_fast=model_args.use_fast_tokenizer
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.tokenizer_name,
                cache_dir=model_args.cache_dir,
                use_fast=model_args.use_fast_tokenizer
            )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not "
            "supported by this script. You can train one elsewhere and load it "
            "here, using --tokenizer_name."
        )

    # model setup
    if model_args.model_name_or_path:
        if model_args.model_name_or_path == 'myroberta':
            model = RobertaForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
        else:
            model = AutoModelForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
    else:
        logger.info("Training a new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Resize the token embeddings; this is necessary when training a new model
    # from scratch with a new tokenizer.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            print(f"incoming batch line lengths: {[len(t) for t in examples['text']]}")
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            tokenizer_res = tokenizer(
                examples["text"],
                padding=padding,
                truncation=True,
                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see
                # below) is more efficient when it receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )
            print(f"tokenized lengths: {[len(t) for t in tokenizer_res['input_ids']]}")
            return tokenizer_res

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together
        # before splitting them into smaller parts.
        # We use `return_special_tokens_mask=True` because
        # DataCollatorForLanguageModeling (see below) is more efficient when it
        # receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        # By default this processes 1,000 rows at a time.
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        if data_args.max_seq_length is None:
            max_seq_length = tokenizer.model_max_length
        else:
            if data_args.max_seq_length > tokenizer.model_max_length:
                logger.warning(
                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger "
                    f"than the maximum length for the model ({tokenizer.model_max_length}). "
                    f"Using max_seq_length={tokenizer.model_max_length}."
                )
            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from the
        # dataset and generate chunks of max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; if the model supported it, padding
            # could be added instead of dropping. You can customize this part
            # to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i: i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together,
        # so group_texts throws away a remainder for each of those groups of
        # 1,000 texts. You can adjust that batch_size here, but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of
        # the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # This is where the random token masking is configured.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None
                and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
        'prediction': prediction,
        'word_distance': word_distance,
        'word_length': word_length,
        'wer': wer,
        'char_distance': char_distance,
        'char_length': char_length,
        'cer': cer
    }
    return result


from transformers import RobertaForMaskedLM, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("jordimas/julibert")
julibert = RobertaForMaskedLM.from_pretrained("jordimas/julibert").to("cuda")
print(julibert.config)


# Preprocessing the datasets.
# We need to read the audio files as arrays.
def evaluate(batch):
    inputs = processor(batch["speech"],
                       sampling_rate=16_000,
                       return_tensors="pt",
                       padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"),
                       attention_mask=inputs.attention_mask.to("cuda")).logits
    intermediate_size=256,
    max_position_embeddings=256,
    num_attention_heads=1,
    num_hidden_layers=1,
    type_vocab_size=1,
    bos_token_id=tokenizer._tokenizer.token_to_id("<s>"),
    eos_token_id=tokenizer._tokenizer.token_to_id("</s>"),
    pad_token_id=tokenizer._tokenizer.token_to_id("<pad>"),
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.3,
)
print(config)

# model = RobertaForMaskedLM(config=config)
model = RobertaForMaskedLM.from_pretrained('runs/esperberto/run_7/model')
print(f'model.num_parameters(): {model.num_parameters()}')

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path / "oscar.eo.1000x10.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir=str(run_path),
    logging_dir=str(run_path),
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(
    "trained_models/distilbako-base-akuapem-twi-cased",
    max_len=512,
    do_lower_case=True)
tokenizer.save_vocabulary(
    "trained_models/distilbako-base-asante-twi-uncased"
)  # save the pretrained tokenizer locally for the Asante case

from transformers import RobertaForMaskedLM

# model = RobertaForMaskedLM(config=config)  # from scratch, for Akuapem
model = RobertaForMaskedLM.from_pretrained(
    "trained_models/distilbako-base-akuapem-twi-cased"
)  # fine-tune from Akuapem weights, for Asante

print(model.num_parameters())

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="asante_twi_bible.txt",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
    '--evaluate_during_training',
    '--do_train',
    '--do_eval',
])
training_args.val_datapath = [
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/1_5001.txt',
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/5001_10001.txt'
]
training_args.train_datapath = [
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/10001_15001.txt',
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/15001_20001.txt'
]

if __name__ == "__main__":
    roberta_base = RobertaForMaskedLM.from_pretrained(
        './pretrained_model/roberta_chinese_base')
    roberta_base_tokenizer = BertTokenizerFast.from_pretrained(
        './pretrained_model/roberta_chinese_base')
    # logger.info('Evaluating roberta-base (seqlen: 512) for reference ...')
    # pretrain_and_evaluate(training_args, roberta_base, roberta_base_tokenizer, eval_only=True, model_path=None)

    model_path = f'{training_args.output_dir}/roberta-base-{model_args.max_pos}'
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    logger.info(f'Converting roberta-base into roberta-base-{model_args.max_pos}')
    model, tokenizer = create_long_model(
        save_model_to=model_path,
        attention_window=model_args.attention_window,
        max_pos=model_args.max_pos)
def __init__(self, model="bert", postag_file='saved_objects/postag_dict_all+VBN.p', syllables_file='saved_objects/cmudict-0.7b.txt', extra_stress_file='saved_objects/edwins_extra_stresses.txt', top_file='saved_objects/words/top_words.txt', templates_file='poems/scenery_templates.txt', mistakes_file='saved_objects/mistakes.txt'): #self.templates = [("FROM scJJS scNNS PRP VBZ NN", "0_10_10_1_01_01"), # ("THAT scJJ scNN PRP VBD MIGHT RB VB", "0_10_10_1_0_10_1"), # ("WHERE ALL THE scNNS OF PRP$ JJ NNS", "0_1_0_10_1_0_10_1"), # ("AND THAT JJ WHICH RB VBZ NN", "0_1_01_0_10_1_01")] with open(templates_file) as tf: self.templates = [(" ".join(line.split()[:-1]), line.split()[-1]) for line in tf.readlines()] with open(postag_file, 'rb') as f: self.postag_dict = pickle.load(f) self.pos_to_words, self.words_to_pos = helper.get_pos_dict( postag_file, mistakes_file=mistakes_file) self.special_words = helper.get_finer_pos_words() self.dict_meters = helper.create_syll_dict([syllables_file], extra_stress_file) if model == "bert": self.lang_model = BertForMaskedLM.from_pretrained( 'bert-base-uncased') self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') self.lang_vocab = list(self.tokenizer.vocab.keys()) self.lang_model.eval() self.vocab_to_num = self.tokenizer.vocab elif model == "roberta": self.lang_model = RobertaForMaskedLM.from_pretrained( 'roberta-base') # 'roberta-base' self.tokenizer = RobertaTokenizer.from_pretrained( 'roberta-base') # 'roberta-large' with open("saved_objects/roberta/vocab.json") as json_file: j = json.load(json_file) self.lang_vocab = list(j.keys()) self.lang_model.eval() self.vocab_to_num = { self.lang_vocab[x]: x for x in range(len(self.lang_vocab)) } else: self.lang_model = None self.poems = pd.read_csv('poems/kaggle_poem_dataset.csv')['Content'] with open(top_file) as tf: self.top_common_words = [line.strip() for line in tf.readlines()][:125] self.stemmer = PorterStemmer() self.api_url = 'https://api.datamuse.com/words' self.gender = random.choice([["he", "him", "his", "himself"], ["she", "her", "hers", "herself"]])
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16',
                                            config=config)
    # model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.
elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM
    config = RobertaConfig(
        vocab_size=50265,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=12,
        type_vocab_size=1,
    )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16',
                                               config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters; roberta-base uses the bert-base architecture
elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel
    config = XLMConfig(
        vocab_size=64139,
        emb_dim=1024,
        max_position_embeddings=512,
        n_heads=8,
        n_layers=6,
    )
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16',
                                               config=config)
    # 6-layer, 1024-hidden, 8-heads
def evaluate(args): """ Evaluate a masked language model using CrowS-Pairs dataset. """ print("Evaluating:") print("Input:", args.input_file) print("Model:", args.lm_model) print("=" * 100) logging.basicConfig(level=logging.INFO) # load data into panda DataFrame df_data = read_data(args.input_file) # supported masked language models if args.lm_model == "scibert-bert": tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased') model = BertForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased') uncased = True elif args.lm_model == "biobert-bert": tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1') model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1') uncased = True elif args.lm_model == "scibert-roberta": tokenizer = RobertaTokenizer.from_pretrained('allenai/scibert_scivocab_uncased') model = RobertaForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased') uncased = True elif args.lm_model == "biobert-roberta": tokenizer = RobertaTokenizer.from_pretrained('dmis-lab/biobert-v1.1') model = RobertaForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1') uncased = True elif args.lm_model == "roberta": tokenizer = RobertaTokenizer.from_pretrained('roberta-large') model = RobertaForMaskedLM.from_pretrained('roberta-large') uncased = False elif args.lm_model == "bert": tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') uncased = True model.eval() model.to('cuda') mask_token = tokenizer.mask_token log_softmax = torch.nn.LogSoftmax(dim=0) vocab = tokenizer.get_vocab() with open(args.lm_model + ".vocab", "w") as f: f.write(json.dumps(vocab)) lm = {"model": model, "tokenizer": tokenizer, "mask_token": mask_token, "log_softmax": log_softmax, "uncased": uncased } # score each sentence. # each row in the dataframe has the sentid and score for pro and anti stereo. 
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less',
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'])

    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']

            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                        }, ignore_index=True)

    df_score.to_csv(args.output_file)
    print('=' * 100)
    print('Total examples:', N)
    print('Metric score:', round((stereo_score + antistereo_score) / N * 100, 2))
    print('Stereotype score:', round(stereo_score / total_stereo * 100, 2))
    if antistereo_score != 0:
        print('Anti-stereotype score:',
              round(antistereo_score / total_antistereo * 100, 2))
    print("Num. neutral:", neutral, round(neutral / N * 100, 2))
    print('=' * 100)
    print()
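
# `mask_unigram` is called above but not defined in this excerpt. A hedged,
# simplified sketch of the CrowS-Pairs-style pseudo-log-likelihood it is
# expected to compute per sentence: mask one token at a time and sum the
# log-probabilities the MLM assigns to the original tokens. (The actual
# metric restricts masking to tokens shared by both sentences; this sketch
# masks every non-special token, and the function name is ours.)
def score_sentence(sent, lm):
    tokenizer, model = lm["tokenizer"], lm["model"]
    ids = tokenizer(sent, return_tensors="pt")["input_ids"].to("cuda")
    total = 0.0
    for i in range(1, ids.shape[1] - 1):  # skip the special tokens
        masked = ids.clone()
        masked[0, i] = tokenizer.mask_token_id
        with torch.no_grad():
            logits = model(masked)[0]
        log_probs = lm["log_softmax"](logits[0, i])
        total += log_probs[ids[0, i]].item()
    return total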
# %%
import torch
import string
from transformers import RobertaTokenizer, RobertaForMaskedLM

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])


def encode(tokenizer, text_sentence, add_special_tokens=True):
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    # if <mask> is the last token, append a "." so that models don't predict punctuation.
    if tokenizer.mask_token == text_sentence.split()[-1]:
        text_sentence += ' .'
    input_ids = torch.tensor([
        tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)
    ])
    mask_idx = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
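
# Hedged continuation (assuming `encode` returns the `(input_ids, mask_idx)`
# pair computed above; the return statement is cut off in this excerpt):
# run the MLM, take the logits at the mask position, and decode the top
# predictions with the helpers defined above. The example sentence is ours.
input_ids, mask_idx = encode(roberta_tokenizer, "Paris is the <mask> of France.")
with torch.no_grad():
    logits = roberta_model(input_ids)[0]
top_ids = torch.topk(logits[0, mask_idx], top_k).indices.tolist()
print(decode(roberta_tokenizer, top_ids, top_clean=5))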
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
export_model_path = os.path.join(output_dir, "phobert-base-formaskedlm.onnx")

model_name_or_path = "vinai/phobert-base"
cache_dir = "./cache_models"
enable_overwrite = True

tokenizer = PhobertTokenizer.from_pretrained(model_name_or_path)
input_ids = torch.tensor(
    tokenizer.encode("Hôm nay trời <mask> quá",
                     add_special_tokens=True)).unsqueeze(0)

ort_session = onnxruntime.InferenceSession(export_model_path)

# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(input_ids)}
ort_out = ort_session.run(None, ort_inputs)
print(len(ort_out[0][0][5]))

config = RobertaConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
model = RobertaForMaskedLM.from_pretrained(model_name_or_path,
                                           config=config,
                                           cache_dir=cache_dir)

with torch.no_grad():
    out = model(input_ids)

print("***** Verifying correctness *****")
print('PyTorch and ONNX Runtime output are close: {}'.format(
    np.allclose(to_numpy(out[0]), ort_out[0], rtol=1e-03, atol=1e-04)))