def __init__(self):
    super().__init__()
    self.bert = BertForMaskedLM.from_pretrained("bert-base-uncased")
def init():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=False,
        help="The input training data file (a text file).",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=False,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Optional parameters
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--do_evaluate",
        type=bool,
        default=False,
        help="Run model evaluation on out-of-sample data.",
    )
    parser.add_argument("--do_train", type=bool, default=False, help="Run training.")
    parser.add_argument(
        "--do_overwrite_output_dir",
        type=bool,
        default=False,
        help="Whether to overwrite the output dir.",
    )
    parser.add_argument(
        "--encoder_model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint to initialize the encoder's weights with.",
    )
    parser.add_argument(
        "--decoder_model_name_or_path",
        default="/data/zhuoyu/semantic_parsing/models",
        type=str,
        help="The model checkpoint to initialize the decoder's weights with.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        help="The decoder architecture to be fine-tuned.",
    )
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.",
    )
    parser.add_argument("--to_cpu", default=False, type=bool, help="Whether to force training on CPU.")
    parser.add_argument(
        "--num_train_epochs",
        default=10,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for eval.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--input_block_size",
        default=256,
        type=int,
        help="Max seq length for input",
    )
    parser.add_argument(
        "--output_block_size",
        default=128,
        type=int,
        help="Max seq length for output",
    )
    parser.add_argument(
        "--trained_checkpoints",
        default="/data/zhuoyu/semantic_parsing/chemistry_bert_parser_binary",
        type=str,
        help="trained_checkpoints",
    )
    parser.add_argument(
        "--decoding_type",
        default="decoding",
        type=str,
        help="",
    )
    parser.add_argument(
        "--encoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )
    parser.add_argument(
        "--decoder_lr",
        default=5e-4,
        type=float,
        help="decoder's learning rate",
    )
    parser.add_argument(
        "--encoder_warmup",
        default=10,
        type=int,
        help="encoder's warmup steps",
    )
    parser.add_argument(
        "--decoder_warmup",
        default=100,
        type=int,
        help="decoder's warmup steps",
    )
    parser.add_argument("--seed", default=42, type=int)
    args = parser.parse_args()

    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
    print(args.n_gpu)

    # Load the pretrained tokenizer and the trained encoder/decoder checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(
        args.encoder_model_name_or_path,
        never_split=['[unused0]', '[unused1]', '[unused2]', '[unused3]'])

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
        args.n_gpu,
        False,
        False,
    )
    logger.info("Training/evaluation parameters %s", args)

    checkpoint = args.trained_checkpoints
    encoder_checkpoint = os.path.join(checkpoint, "encoder")
    decoder_checkpoint_question_varibles = os.path.join(checkpoint, "decoder_0")
    decoder_checkpoint_conditions = os.path.join(checkpoint, "decoder_1")
    decoder_models = [
        BertForMaskedLM.from_pretrained(decoder_checkpoint_question_varibles),
        BertForMaskedLM.from_pretrained(decoder_checkpoint_conditions)
    ]
    model = Model2Models.from_pretrained(encoder_checkpoint,
                                         decoder_model=decoder_models)
    model.to(args.device)
    model.eval()

    processor = ChemistryProcessor()
    return args, model, tokenizer, processor
from flask import Flask, render_template, request, url_for, jsonify, make_response
from flask_cors import CORS
import json
import torch
import wordfreq
from transformers import BertTokenizer, BertForMaskedLM

app = Flask(__name__)
# https://stackoverflow.com/questions/37575089/disable-template-cache-jinja2
app.config['TEMPLATES_AUTO_RELOAD'] = True
CORS(app)

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def prepareInputs(init_text):
    # List of punctuation to determine where segments end
    punc_list = [".", "?", "!"]
    # Prepend the [CLS] tag
    prompt_text = "[CLS] " + init_text
    # Insert the [SEP] tags
    for i in range(0, len(prompt_text)):
        if prompt_text[i] in punc_list:
            prompt_text = prompt_text[:i + 1] + " [SEP]" + prompt_text[i + 1:]
    return prompt_text


def createSegIDs(tokenized_text):
    currentSeg = 0
def train_mlm(lr=3e-5, epoch=20, save_epoch_cnt=2, save_batch_cnt=500,
              mult_batch=True, mult_cnt=batch_mult_cnt):
    model = BertForMaskedLM.from_pretrained('../' + model_path)
    masked_ids, token_type_idss, attention_masks, input_idss = \
        pickle.load(open('../preprocess/train_data_for_mlm.pk', 'rb'))
    batch_cnt = len(masked_ids)
    print('data preparation finished, ' + str(len(masked_ids)) + ' batch in total')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    for i_epoch in range(epoch):
        model.train()
        epoch_total_loss = 0.0
        temp_batch_cnt = 0
        accu_loss = None
        epoch_run_time = 0
        for i_batch, masked_batch in enumerate(masked_ids):
            time_start = time.time()
            masked_batch = masked_batch.cuda()
            input_ids = input_idss[i_batch].cuda()
            token_type_ids = token_type_idss[i_batch].cuda()
            attention_mask = attention_masks[i_batch].cuda()
            if mult_batch:
                outputs = model(input_ids=masked_batch,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask,
                                labels=input_ids)
                if accu_loss is None:
                    accu_loss = outputs[0]
                else:
                    accu_loss = accu_loss + outputs[0]
                temp_batch_cnt += 1
                if temp_batch_cnt >= mult_cnt:
                    temp_batch_cnt = 0
                    accu_loss.backward()
                    optimizer.step()
                    model.zero_grad()
                    epoch_total_loss += accu_loss.float()
                    # record time
                    time_end = time.time()
                    epoch_run_time += time_end - time_start
                    speed = epoch_run_time / (i_batch + 1)
                    eta = int((batch_cnt - (i_batch + 1)) * speed)
                    print(
                        f'epoch:{i_epoch + 1} batch:{i_batch + 1} loss:{accu_loss.float()} avg loss:{epoch_total_loss / (i_batch + 1)} ETA:{eta // 3600}:{(eta % 3600) // 60}:{eta % 60}'
                    )
                    # reset the accumulated loss so the next accumulation window builds a fresh graph
                    accu_loss = None
            else:
                model.zero_grad()
                outputs = model(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask,
                                labels=masked_batch)
                loss = outputs[0]
                loss.backward()
                optimizer.step()
                epoch_total_loss += loss.float()
                # record time
                time_end = time.time()
                epoch_run_time += time_end - time_start
                speed = epoch_run_time / (i_batch + 1)
                eta = int((batch_cnt - (i_batch + 1)) * speed)
                print(
                    f'epoch:{i_epoch + 1} batch:{i_batch + 1} loss:{loss.float()} avg loss:{epoch_total_loss / (i_batch + 1)} ETA:{eta // 3600}:{(eta % 3600) // 60}:{eta % 60}'
                )
            if (i_batch + 1) % save_batch_cnt == 0:
                save_name = '../' + model_path + '_ContTrain_epoch_' + str(
                    i_epoch + 1) + '_batch_' + str(i_batch + 1) + '_bsz_' + str(mlm_bsz)
                print('saving models as:' + save_name)
                model.bert.save_pretrained(save_name)
        if (i_epoch + 1) % save_epoch_cnt == 0:
            save_name = '../' + model_path + '_ContTrain_epoch_' + str(
                i_epoch + 1) + '_bsz_' + str(mlm_bsz)
            print('saving models as:' + save_name)
            model.bert.save_pretrained(save_name)
df_lineage = pd.read_csv(lineage[1], sep='\t', names=["Repo"])[:20000]
print("CSVs loaded")

docstrings_avg_vec = np.load(docstrings_vecs[1], allow_pickle=True)

config = BertConfig.from_json_file(model_path[1] + '/config.json')
config.output_hidden_states = True
print("Tokenizer and model initialized")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device('cpu')
model = BertForMaskedLM.from_pretrained("bert-base-uncased", config=config)
model.load_state_dict(
    torch.load(model_path[1] + "/pytorch_model.bin", map_location=device))
model.eval()

# Initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(docstrings_avg_vec)
index.createIndex({'post': 2}, print_progress=True)
print("Index made")


# Routes:
@app.route('/hello')
def hello_world():
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input training data file (a text file).", ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Optional parameters parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--do_evaluate", type=bool, default=False, help="Run model evaluation on out-of-sample data.", ) parser.add_argument("--do_train", type=bool, default=False, help="Run training.") parser.add_argument( "--do_overwrite_output_dir", type=bool, default=False, help="Whether to overwrite the output dir.", ) parser.add_argument( "--encoder_model_name_or_path", default="bert-base-cased", type=str, help="The model checkpoint to initialize the encoder's weights with.", ) parser.add_argument( "--decoder_model_name_or_path", default="/data/zhuoyu/semantic_parsing/models", type=str, help="The model checkpoint to initialize the decoder's weights with.", ) parser.add_argument( "--model_type", default="bert", type=str, help="The decoder architecture to be fine-tuned.", ) parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--to_cpu", default=False, type=bool, help="Whether to force training on CPU.") parser.add_argument( "--num_train_epochs", default=10, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for eval.", ) parser.add_argument( "--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--input_block_size", default=256, type=int, help="Max seq length for input", ) parser.add_argument( "--output_block_size", default=64, type=int, help="Max seq length for output", ) parser.add_argument( "--trained_checkpoints", default="", type=str, help="trained_checkpoints", ) parser.add_argument( "--decoding_type", default="pnt", type=str, help="", ) parser.add_argument( "--encoder_lr", default=5e-4, type=float, help="encoder's learning rate", ) parser.add_argument( "--decoder_lr", default=5e-4, type=float, help="encoder's learning rate", ) parser.add_argument( "--encoder_warmup", default=10, type=int, help="encoder's learning rate", ) parser.add_argument( "--decoder_warmup", default=100, type=int, help="encoder's learning rate", ) parser.add_argument("--seed", default=42, type=int) parser.add_argument( "--decoder_version", default="v1", type=str, help="", ) args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.do_overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite." .format(args.output_dir)) # Set up training device if args.to_cpu or not torch.cuda.is_available(): args.device = torch.device("cpu") args.n_gpu = 0 else: args.device = torch.device("cuda") args.n_gpu = torch.cuda.device_count() print(args.n_gpu) # Load pretrained model and tokenizer. The decoder's weights are randomly initialized. 
tokenizer = AutoTokenizer.from_pretrained( args.encoder_model_name_or_path, never_split=['[unused0]', '[unused1]', '[unused2]', '[unused3]']) #config = BertConfig.from_pretrained(args.model_name_or_path) #config.num_hidden_layers=3 #config.is_decoder=True #decoder_model = BertForMaskedLM(config) decoder_models = [ BertForMaskedLM.from_pretrained(args.decoder_model_name_or_path), BertForMaskedLM.from_pretrained(args.decoder_model_name_or_path) ] model = Model2Models.from_pretrained(args.encoder_model_name_or_path, decoder_model=decoder_models) #model = Model2Model.from_pretrained( # args.model_name_or_path, decoder_model=None #) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 0, args.device, args.n_gpu, False, False, ) logger.info("Training/evaluation parameters %s", args) # Train the model if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: model.to(args.device) global_step, tr_loss = train(args, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save(args, os.path.join(args.output_dir, "training_arguments.bin")) # Evaluate the model results = {} if args.do_evaluate: checkpoints = [args.trained_checkpoints] logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: encoder_checkpoint = os.path.join(checkpoint, "encoder") decoder_checkpoint_question_varibles = os.path.join( checkpoint, "decoder_0") decoder_checkpoint_conditions = os.path.join( checkpoint, "decoder_1") decoder_models = [ BertForMaskedLM.from_pretrained( decoder_checkpoint_question_varibles), BertForMaskedLM.from_pretrained(decoder_checkpoint_conditions) ] model = Model2Models.from_pretrained(encoder_checkpoint, decoder_model=decoder_models) model.to(args.device) #model = PreTrainedEncoderDecoder.from_pretrained( # encoder_checkpoint, decoder_checkpoint #) #model = Model2Model.from_pretrained(encoder_checkpoint) #model.to(args.device) results = "placeholder" evaluate(args, model, tokenizer, "test") return results
        self.data = fact_ru + wiki_ru + lenta_ru

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.tokenizer.encode_plus(self.data[i], **self.tokenizer_params)


# check()

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased", max_len=128)
model_config = BertConfig.from_json_file("config.json")
model = BertForMaskedLM(model_config)
# model = BertForMaskedLM.from_pretrained("outputs/checkpoint-15000")

dataset = RuDataset(tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

trainer = Trainer(
    model,
    data_collator=data_collator,
    train_dataset=dataset,
    tokenizer=tokenizer,
    # prediction_loss_only=True,
    args=TrainingArguments(output_dir="outputs",
def main():
    cfg = args_parse()

    # If the training file does not exist, preprocess the data first
    if not os.path.exists(cfg.DATASETS.TRAIN):
        logger.debug('preprocess data')
        preprocess.main()

    logger.info(f'load model, model arch: {cfg.MODEL.NAME}')
    tokenizer = BertTokenizer.from_pretrained(cfg.MODEL.BERT_CKPT)
    collator = DataCollator(tokenizer=tokenizer)
    # Load the data
    train_loader, valid_loader, test_loader = make_loaders(
        collator,
        train_path=cfg.DATASETS.TRAIN,
        valid_path=cfg.DATASETS.VALID,
        test_path=cfg.DATASETS.TEST,
        batch_size=cfg.SOLVER.BATCH_SIZE,
        num_workers=4)

    if cfg.MODEL.NAME == 'softmaskedbert4csc':
        model = SoftMaskedBert4Csc(cfg, tokenizer)
    elif cfg.MODEL.NAME == 'macbert4csc':
        model = MacBert4Csc(cfg, tokenizer)
    else:
        raise ValueError("model not found.")

    # Load a previously saved model and continue training from it
    if cfg.MODEL.WEIGHTS and os.path.exists(cfg.MODEL.WEIGHTS):
        model.load_from_checkpoint(checkpoint_path=cfg.MODEL.WEIGHTS,
                                   cfg=cfg,
                                   map_location=device,
                                   tokenizer=tokenizer)

    # Configure model checkpointing
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    ckpt_callback = ModelCheckpoint(monitor='val_loss',
                                    dirpath=cfg.OUTPUT_DIR,
                                    filename='{epoch:02d}-{val_loss:.2f}',
                                    save_top_k=1,
                                    mode='min')

    # Train the model
    logger.info('train model ...')
    trainer = pl.Trainer(
        max_epochs=cfg.SOLVER.MAX_EPOCHS,
        gpus=None if device == torch.device('cpu') else cfg.MODEL.GPU_IDS,
        accumulate_grad_batches=cfg.SOLVER.ACCUMULATE_GRAD_BATCHES,
        callbacks=[ckpt_callback])
    # Run training only when train_loader actually has data
    torch.autograd.set_detect_anomaly(True)
    if 'train' in cfg.MODE and train_loader and len(train_loader) > 0:
        if valid_loader and len(valid_loader) > 0:
            trainer.fit(model, train_loader, valid_loader)
        else:
            trainer.fit(model, train_loader)
        logger.info('train model done.')

        # Convert the checkpoint so it can be loaded with transformers
        if ckpt_callback and len(ckpt_callback.best_model_path) > 0:
            ckpt_path = ckpt_callback.best_model_path
        elif cfg.MODEL.WEIGHTS and os.path.exists(cfg.MODEL.WEIGHTS):
            ckpt_path = cfg.MODEL.WEIGHTS
        else:
            ckpt_path = ''
        logger.info(f'ckpt_path: {ckpt_path}')
        if ckpt_path and os.path.exists(ckpt_path):
            model.load_state_dict(torch.load(ckpt_path)['state_dict'])
            # First save the original transformers BERT model and tokenizer
            tokenizer.save_pretrained(cfg.OUTPUT_DIR)
            bert = BertForMaskedLM.from_pretrained(cfg.MODEL.BERT_CKPT)
            bert.save_pretrained(cfg.OUTPUT_DIR)
            state_dict = torch.load(ckpt_path)['state_dict']
            new_state_dict = OrderedDict()
            if cfg.MODEL.NAME in ['macbert4csc']:
                for k, v in state_dict.items():
                    if k.startswith('bert.'):
                        new_state_dict[k[5:]] = v
            else:
                new_state_dict = state_dict
            # Then save the fine-tuned weights, replacing the original pytorch_model.bin
            torch.save(new_state_dict,
                       os.path.join(cfg.OUTPUT_DIR, 'pytorch_model.bin'))

    # Testing follows the same logic as training
    if 'test' in cfg.MODE and test_loader and len(test_loader) > 0:
        trainer.test(model, test_loader)
SAVE_MODEL + "/merges.txt", ) tokenizer.enable_truncation(max_length=512) print(tokenizer.encode("For it is in reality vain to profess")) config = BertConfig( vocab_size=52_000, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=6, type_vocab_size=1, ) tokenizer = BertTokenizer.from_pretrained(SAVE_MODEL, max_len=512) model = BertForMaskedLM(config=config) print(model.num_parameters()) dataset = LineByLineTextDataset( tokenizer=tokenizer, file_path=PATH + "/kant.txt", block_size=128, ) data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) training_args = TrainingArguments( output_dir=SAVE_MODEL, overwrite_output_dir=True,
def main(): parser = argparse.ArgumentParser() # Model and data are required parser.add_argument( "--dir_pretrained_model", type=str, required=True, help= "Dir containing pre-trained model (checkpoint), which may have been fine-tuned already." ) # Required for certain modes (--resume, --do_train, --eval_during_training, --do_eval or --do_pred) parser.add_argument( "--dir_train", type=str, help= ("Dir containing training data (n files named <lang>.train containing unlabeled text)" )) parser.add_argument( "--dir_output", type=str, help= "Directory in which model will be written (required if --do_train (but not --resume) or --do_pred)" ) parser.add_argument( "--path_dev", type=str, help="Path of 2-column TSV file containing labeled validation examples." ) parser.add_argument( "--path_test", type=str, required=False, help="Path of text file containing unlabeled test examples.") # Execution modes parser.add_argument( "--resume", action="store_true", help= "Resume training model in --dir_pretrained_model (note: --dir_output will be ignored)" ) parser.add_argument("--do_train", action="store_true", help="Run training") parser.add_argument("--eval_during_training", action="store_true", help="Run evaluation on dev set during training") parser.add_argument("--do_eval", action="store_true", help="Evaluate model on dev set") parser.add_argument("--do_pred", action="store_true", help="Run prediction on test set") # Score to optimize on dev set (by early stopping) parser.add_argument( "--score_to_optimize", choices=["track1", "track2", "track3"], default="track3", help="Score to optimize on dev set during training (by early stopping)." ) # Hyperparameters parser.add_argument( "--freeze_encoder", action="store_true", help= "Freeze weights of pre-trained encoder. (Note: in this case, we do not keep doing MLM.)" ) parser.add_argument( "--no_mlm", action="store_true", help= "Do not keep doing masked language modeling (MLM) during fine-tuning.") parser.add_argument( "--sampling_alpha", type=float, default=1.0, help= "Dampening factor for relative frequencies used to compute language sampling probabilities" ) parser.add_argument( "--weight_relevant", type=float, default=1.0, help= "Relative sampling frequency of relevant languages wrt irrelevant languages" ) parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for evaluation.") parser.add_argument( "--seq_len", default=128, type=int, help= "Length of input sequences. Shorter seqs are padded, longer ones are trucated" ) parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for AdamW optimizer.") parser.add_argument("--equal_betas", action='store_true', help="Use beta1=beta2=0.9 for AdamW optimizer.") parser.add_argument( "--correct_bias", action='store_true', help= "Correct bias in AdamW optimizer (correct_bias=False is meant to reproduce BERT behaviour exactly." ) parser.add_argument( "--max_train_steps", default=1000000, type=int, help= "Maximum number of training steps to perform. Note: # optimization steps = # train steps / # accumulation steps." ) parser.add_argument( "--num_train_steps_per_epoch", default=1000, type=int, help= "Number of training steps that equals one epoch. Note: # optimization steps = # train steps / # accumulation steps." ) parser.add_argument( '--grad_accum_steps', type=int, default=1, help= "Number of training steps (i.e. 
batches) to accumualte before performing a backward/update pass." ) parser.add_argument( "--num_gpus", type=int, default=-1, help="Num GPUs to use for training (0 for none, -1 for all available)") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() # Distributed or parallel? if args.local_rank != -1 or args.num_gpus > 1: raise NotImplementedError( "No distributed or parallel training available at the moment.") if torch.cuda.is_available(): args.device = torch.device("cuda") args.n_gpu = 1 else: args.device = torch.device("cpu") args.n_gpu = 0 # Check execution mode assert args.resume or args.do_train or args.do_eval or args.do_pred if args.resume: assert not args.do_train assert not args.do_eval assert not args.do_pred # Load checkpoint. This contains a pre-trained model which may or # may not have been fine-tuned for language identification already logger.info("Loading checkpoint...") checkpoint_path = os.path.join(args.dir_pretrained_model, "checkpoint.tar") checkpoint_data = torch.load(checkpoint_path) if args.resume: # Check progress logger.info("Resuming training. Currently at training step %d" % checkpoint_data["global_step"]) # Replace args with initial args for this job, except for # num_gpus, seed and model directory current_num_gpus = args.n_gpu current_dir_pretrained_model = args.dir_pretrained_model args = deepcopy(checkpoint_data["initial_args"]) args.num_gpus = current_num_gpus args.dir_pretrained_model = current_dir_pretrained_model args.resume = True logger.info("Args (most have been reloaded from checkpoint): %s" % args) else: if args.eval_during_training: assert args.do_train if args.do_train or args.do_pred: assert args.dir_output is not None if os.path.exists(args.dir_output) and os.path.isdir( args.dir_output) and len(os.listdir(args.dir_output)) > 1: msg = "%s already exists and is not empty" % args.dir_output raise ValueError(msg) if not os.path.exists(args.dir_output): os.makedirs(args.dir_output) if args.do_train: assert args.dir_train is not None train_paths = glob.glob(os.path.join(args.dir_train, "*.train")) assert len(train_paths) > 0 checkpoint_data["initial_args"] = args if args.do_train and args.freeze_encoder and not args.no_mlm: logger.warning( "Setting --no_mlm to True since --freeze_encoder is True, therefore doing MLM would be pointless." ) args.no_mlm = True if args.do_eval or args.eval_during_training: assert args.path_dev is not None assert os.path.exists(args.path_dev) if args.do_pred: assert args.path_test is not None assert os.path.exists(args.path_test) if args.grad_accum_steps < 1: raise ValueError( "Invalid grad_accum_steps parameter: {}, should be >= 1".format( args.grad_accum_steps)) # Create list of languages we handle lang_list = sorted(ALL_LANGS) # Seed RNGs np.random.seed(args.seed) torch.manual_seed(args.seed) if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Load tokenizer logger.info("Loading tokenizer...") tokenizer_path = os.path.join(args.dir_pretrained_model, "tokenizer.pkl") with open(tokenizer_path, "rb") as f: tokenizer = pickle.load(f) # Make encoder and model logger.info("Making encoder...") encoder_config = BertConfig.from_json_file( os.path.join(args.dir_pretrained_model, "config.json")) encoder = BertForMaskedLM(encoder_config) logger.info("Making model...") model = BertForLangID(encoder, lang_list) model.to(args.device) # Load model weights. 
First, check if we just have an encoder, or a previously fine-tuned model if "classifier.dense.weight" in checkpoint_data["model_state_dict"]: if "best_model_state_dict" in checkpoint_data and not args.resume: logger.info("Loading model weights from 'best_model_state_dict'") model.load_state_dict(checkpoint_data["best_model_state_dict"]) else: logger.info("Loading model weights from 'model_state_dict'") model.load_state_dict(checkpoint_data["model_state_dict"]) else: # Model has not previously been fine-tuned, so we only load encoder weights assert args.do_train logger.info("Loading encoder weights from 'model_state_dict'") model.encoder.load_state_dict(checkpoint_data["model_state_dict"]) if (args.do_train or args.resume) and args.freeze_encoder: model.freeze_encoder() # Write encoder config and tokenizer in output directory if (not args.resume) and args.do_train: path_config = os.path.join(args.dir_output, "config.json") model.encoder.config.to_json_file(path_config) path_tokenizer = os.path.join(args.dir_output, "tokenizer.pkl") with open(path_tokenizer, "wb") as f: pickle.dump(tokenizer, f) # Log some info on the model logger.info("Encoder config: %s" % repr(model.encoder.config)) logger.info("Model params:") for n, p in model.named_parameters(): msg = " %s" % n if not p.requires_grad: msg += " ***FROZEN***" logger.info(msg) logger.info("Nb model params: %d" % count_params(model)) logger.info("Nb params in encoder: %d" % count_params(model.encoder)) logger.info("Nb params in pooler: %d" % count_params(model.pooler)) logger.info("Nb params in classifier: %d" % count_params(model.classifier)) # Get data max_seq_length = args.seq_len + 2 # We add 2 for CLS and SEP if args.resume: # Reload training dataset(s) logger.info("Reloading training data from checkpoint") train_dataset = checkpoint_data["train_dataset"] train_dataset.prep_files_for_streaming() dev_dataset = checkpoint_data.get("dev_dataset", None) unk_dataset = checkpoint_data.get("unk_dataset", None) if unk_dataset: unk_dataset.prep_files_for_streaming() elif args.do_train: # Remove unk.train if present, and create a MLM dataset for it. path_unk = check_for_unk_train_data(train_paths) if path_unk is None: unk_dataset = None else: train_paths.remove(path_unk) logger.info("Creating MLM-only training set from %s..." % path_unk) unk_dataset = BertDatasetForMLM( [path_unk], tokenizer, max_seq_length, sampling_alpha=args.sampling_alpha, weight_relevant=args.weight_relevant, encoding="utf-8", seed=args.seed, verbose=DEBUG) logger.info("Creating training set from %s training files in %s..." % (len(train_paths), args.dir_train)) train_dataset = BertDatasetForClassification( train_paths, tokenizer, max_seq_length, include_mlm=True, sampling_alpha=args.sampling_alpha, weight_relevant=args.weight_relevant, encoding="utf-8", seed=args.seed, verbose=DEBUG) if path_unk is not None: assert len(unk_dataset) == len(train_dataset) # Check train_dataset.lang2id: keys should contain all langs, and nothing else, like that of the model assert train_dataset.lang2id == model.lang2id if not args.resume: dev_dataset = None if args.do_eval or args.eval_during_training: logger.info("Loading validation data from %s..." % args.path_dev) dev_dataset = BertDatasetForTesting(args.path_dev, tokenizer, model.lang2id, max_seq_length, require_labels=True, encoding="utf-8", verbose=DEBUG) if args.do_train and args.eval_during_training: checkpoint_data["dev_dataset"] = dev_dataset if args.do_pred: logger.info("Loading test data from %s..." 
% args.path_test) test_dataset = BertDatasetForTesting(args.path_test, tokenizer, model.lang2id, max_seq_length, require_labels=False, encoding="utf-8", verbose=DEBUG) # Compute number of epochs and steps, initialize number of training steps done. num_opt_steps_per_epoch = args.num_train_steps_per_epoch // args.grad_accum_steps args.num_epochs = math.ceil(checkpoint_data["max_opt_steps"] / num_opt_steps_per_epoch) if args.do_train and (not args.resume): checkpoint_data["global_step"] = 0 checkpoint_data[ "max_opt_steps"] = args.max_train_steps // args.grad_accum_steps # Training if args.do_train or args.resume: # Prepare optimizer logger.info("Preparing optimizer...") np_list = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] opt_params = [{ 'params': [p for n, p in np_list if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in np_list if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.equal_betas: betas = (0.9, 0.9) else: betas = (0.9, 0.999) optimizer = AdamW( opt_params, lr=args.learning_rate, betas=betas, correct_bias=args.correct_bias ) # To reproduce BertAdam specific behaviour, use correct_bias=False # Load optimizer state if resuming if args.resume: optimizer.load_state_dict(checkpoint_data["optimizer_state_dict"]) # Log some info before training logger.info("*** Training info: ***") logger.info(" Number of training steps completed: %d" % checkpoint_data["global_step"]) logger.info(" Max training steps: %d" % args.max_train_steps) logger.info(" Gradient accumulation steps: %d" % args.grad_accum_steps) logger.info(" Max optimization steps: %d" % checkpoint_data["max_opt_steps"]) logger.info(" Training dataset size: %d" % len(train_dataset)) logger.info(" Batch size: %d" % args.train_batch_size) logger.info(" # training steps/epoch: %d" % (args.num_train_steps_per_epoch)) logger.info(" # optimization steps/epoch: %d" % num_opt_steps_per_epoch) logger.info(" # epochs to do: %d" % args.num_epochs) if args.eval_during_training: logger.info("Validation dataset size: %d" % len(dev_dataset)) # Run training train(model, optimizer, train_dataset, args, checkpoint_data, dev_dataset=dev_dataset, unk_dataset=unk_dataset) # Reload model save_to_dir = args.dir_pretrained_model if args.resume else args.dir_output checkpoint_data = torch.load( os.path.join(save_to_dir, "checkpoint.tar")) if "best_model_state_dict" in checkpoint_data: model.load_state_dict(checkpoint_data["best_model_state_dict"]) else: model.load_state_dict(checkpoint_data["model_state_dict"]) # Evaluate model on dev set if args.do_eval: logger.info("*** Running evaluation... ***") scores = evaluate(model, dev_dataset, args) logger.info("***** Evaluation Results *****") for score_name in sorted(scores.keys()): logger.info("- %s: %.4f" % (score_name, scores[score_name])) # Get model's predictions on test set if args.do_pred: logger.info("*** Running prediction... ***") logits = predict(model, test_dataset, args) pred_class_ids = np.argmax(logits.cpu().numpy(), axis=1) pred_labels = [test_dataset.label_list[i] for i in pred_class_ids] path_pred = os.path.join(args.dir_output, "pred.txt") logger.info("Writing predictions in %s..." % path_pred) with open(path_pred, 'w', encoding="utf-8") as f: for x in pred_labels: f.write("%s\n" % x)
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Loading the dataset from local csv file. data_files = {} data_files["train"] = args.train_file data_files["validation"] = args.validation_file raw_datasets = load_dataset("json", data_files=data_files) # Get the label list label_list = raw_datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. tokenizer = MyBertTokenizer.from_pretrained("bert-base-cased") model = BertForMaskedLM.from_pretrained(args.model_name_or_path) # Preprocessing the datasets padding = "max_length" if args.pad_to_max_length else False # We have made sure that pos_token_id and neg_token_id is a single token label_to_text = {0: "terrible", 1: "fantastic"} logger.info(f"Label to text mapping: {label_to_text}") mask_token_id = tokenizer.mask_token_id pos_token_id = tokenizer(label_to_text[1])["input_ids"][1] neg_token_id = tokenizer(label_to_text[0])["input_ids"][1] label_to_token_id = {0: neg_token_id, 1: pos_token_id} def preprocess_function(examples): # Tokenize the source texts texts = examples["sentence"] result = tokenizer(texts, padding=padding, max_length=args.max_length, truncation=True) # Add prompt prompt_token_ids = tokenizer("The movie is [MASK].")["input_ids"][1:] input_ids = result["input_ids"] input_ids = [ids[:-1] + prompt_token_ids for ids in input_ids] result["input_ids"] = input_ids additional_len = len( prompt_token_ids) - 1 # Not including [BOS] and [EOS] # Add attention mask attention_mask = result["attention_mask"] attention_mask_with_prompt = [ x + [1] * additional_len for x in attention_mask ] result["attention_mask"] = attention_mask_with_prompt # Add token type token_type_ids = result["token_type_ids"] token_type_ids_with_prompt = [ x + [0] * additional_len for x in token_type_ids ] result["token_type_ids"] = token_type_ids_with_prompt # Important!!! 
Since we use padding, the mask position is not fixed in the end mask_positions = [ids.index(mask_token_id) for ids in input_ids] result["mask_positions"] = mask_positions # Prepare labels sentiments = [label_to_token_id[l] for l in examples["label"]] labels = [] for x, y, z in zip(input_ids, sentiments, mask_positions): label = [-100] * len(x) label[z] = y labels.append(label) # This is for all tokens result["labels"] = labels # # This is the true label for each sample result["targets"] = examples["label"] return result processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) metric = load_metric("accuracy") # Train! 
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Get the initial zero-shot accuracy model.eval() for step, batch in enumerate(eval_dataloader): # Extract targets targets = batch.pop("targets") mask_positions = batch.pop("mask_positions") outputs = model(**batch) predictions = [] for x, y in zip(outputs.logits, mask_positions): # logits size [8 X 49 X 30522] pos_logit = x[y][pos_token_id] neg_logit = x[y][neg_token_id] if pos_logit > neg_logit: predictions.append(1) else: predictions.append(0) predictions = torch.tensor(predictions).to(targets.device) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(targets), ) eval_metric = metric.compute( ) # After this, all batches in metric will be cleared. logger.info(f"zero-shot accuracy: {eval_metric}") exit(0) # Only show the progress bar once on each machine. completed_steps = 0 best_acc = 0.0 start = time.time() for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): # Extract targets batch.pop("targets") batch.pop("mask_positions") outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() completed_steps += 1 if completed_steps % 100 == 0: logger.info( f"Completed {completed_steps} steps, time passed: {time.time() - start}s." ) if completed_steps >= args.max_train_steps: break model.eval() for step, batch in enumerate(eval_dataloader): # Extract targets targets = batch.pop("targets") mask_positions = batch.pop("mask_positions") outputs = model(**batch) predictions = [] for x, y in zip(outputs.logits, mask_positions): # logits size [8 X 49 X 30522] pos_logit = x[y][pos_token_id] neg_logit = x[y][neg_token_id] if pos_logit > neg_logit: predictions.append(1) else: predictions.append(0) predictions = torch.tensor(predictions).to(targets.device) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(targets), ) eval_metric = metric.compute( ) # After this, all batches in metric will be cleared. logger.info(f"epoch {epoch}: {eval_metric}") if eval_metric["accuracy"] > best_acc: best_acc = eval_metric["accuracy"] accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
    attention_mask=train_data_feature['attention_mask'],
    masked_lm_labels=train_data_feature['masked_lm_labels'])
test_dataset = makeDataset(
    token_embeddings=test_data_feature['token_embeddings'],
    segement_embeddings=test_data_feature['segement_embeddings'],
    attention_mask=test_data_feature['attention_mask'],
    masked_lm_labels=test_data_feature['masked_lm_labels'])
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

# Configure the model.
# type_vocab_size: must be changed if token_type_ids uses more than 2 types.
# In addition, lines 469~471 of the local transformers/modeling_utils.py need to be
# commented out, because PyTorch otherwise raises
# "RuntimeError: Error(s) in loading state_dict for BertForMaskedLM: size mismatch for
# bert.embeddings.token_type_embeddings.weight: copying a param with shape
# torch.Size([2, 768]) from checkpoint, the shape in current model is torch.Size([3, 768])."
# The original BERT has 16 dimensions available, so using 3 here should be fine.
config = BertConfig.from_pretrained('bert-base-chinese', type_vocab_size=3)
model = BertForMaskedLM.from_pretrained(
    'bert-base-chinese',
    from_tf=bool('.ckpt' in 'bert-base-chinese'),
    config=config)
model.to(device)

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ],
    'weight_decay': 0.0
}, {
    'params': [
        p for n, p in model.named_parameters()
def main(args, logger): # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv') trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl') tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv') trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1) trn_df['is_original'] = 1 # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv') # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv') # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv') # clean texts # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer']) # load additional tokens # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin: # additional_tokens = pickle.load(fin) gkf = GroupKFold(n_splits=5).split( X=trn_df.question_body, groups=trn_df.question_body_le, ) histories = { 'trn_loss': {}, 'val_loss': {}, 'val_metric': {}, 'val_metric_raws': {}, } loaded_fold = -1 loaded_epoch = -1 if args.checkpoint: histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint) fold_best_metrics = [] fold_best_metrics_raws = [] for fold, (trn_idx, val_idx) in enumerate(gkf): if fold > 0: break if fold < loaded_fold: fold_best_metrics.append(np.max(histories["val_metric"][fold])) fold_best_metrics_raws.append( histories["val_metric_raws"][fold][np.argmax( histories["val_metric"][fold])]) continue sel_log( f' --------------------------- start fold {fold} --------------------------- ', logger) fold_trn_df = trn_df.iloc[trn_idx] # .query('is_original == 1') fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1) # use only original row fold_val_df = trn_df.iloc[val_idx].query('is_original == 1') fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1) if args.debug: fold_trn_df = fold_trn_df.sample(100, random_state=71) trn_df = trn_df.sample(100, random_state=71) fold_val_df = fold_val_df.sample(100, random_state=71) temp = pd.Series( list( itertools.chain.from_iterable( fold_trn_df.question_title.apply(lambda x: x.split(' ')) + fold_trn_df.question_body.apply(lambda x: x.split(' ')) + fold_trn_df.answer.apply(lambda x: x.split(' ')))) ).value_counts() tokens = temp[temp >= 10].index.tolist() # tokens = [] tokens = [ 'CAT_TECHNOLOGY'.casefold(), 'CAT_STACKOVERFLOW'.casefold(), 'CAT_CULTURE'.casefold(), 'CAT_SCIENCE'.casefold(), 'CAT_LIFE_ARTS'.casefold(), ] # + additional_tokens fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1) # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df], axis=0) trn_dataset = QUESTDataset( df=fold_trn_df, mode='train', tokens=tokens, augment=[], tokenizer_type=TOKENIZER_TYPE, pretrained_model_name_or_path=TOKENIZER_PRETRAIN, do_lower_case=DO_LOWER_CASE, LABEL_COL=LABEL_COL, t_max_len=T_MAX_LEN, q_max_len=Q_MAX_LEN, a_max_len=A_MAX_LEN, tqa_mode=TQA_MODE, TBSEP='[SEP]', pos_id_type='arange', MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN, use_category=False, ) # update token trn_sampler = RandomSampler(data_source=trn_dataset) trn_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, sampler=trn_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=True, pin_memory=True) model = BertForMaskedLM.from_pretrained('bert-base-uncased') optimizer = optim.Adam(model.parameters(), lr=3e-5) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_EPOCH, eta_min=1e-5) # 
load checkpoint model, optim, scheduler if args.checkpoint and fold == loaded_fold: load_checkpoint(args.checkpoint, model, optimizer, scheduler) for epoch in tqdm(list(range(MAX_EPOCH))): if fold <= loaded_fold and epoch <= loaded_epoch: continue # model = DataParallel(model) model = model.to(DEVICE) trn_loss = train_one_epoch_ML(model, optimizer, trn_loader, DEVICE) scheduler.step() if fold in histories['trn_loss']: histories['trn_loss'][fold].append(trn_loss) else: histories['trn_loss'][fold] = [ trn_loss, ] if fold in histories['val_loss']: histories['val_loss'][fold].append(trn_loss) else: histories['val_loss'][fold] = [ trn_loss, ] if fold in histories['val_metric']: histories['val_metric'][fold].append(trn_loss) else: histories['val_metric'][fold] = [ trn_loss, ] if fold in histories['val_metric_raws']: histories['val_metric_raws'][fold].append(trn_loss) else: histories['val_metric_raws'][fold] = [ trn_loss, ] sel_log( f'fold : {fold} -- epoch : {epoch} -- ' f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ', logger) model = model.to('cpu') # model = model.module save_checkpoint( f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model, optimizer, scheduler, histories, [], [], [], fold, epoch, trn_loss, trn_loss, ) save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer, clean=False) del model send_line_notification('fini!') sel_log('now saving best checkpoints...', logger)
                interval = end - start
                param.data[start:end, :].copy_(
                    pretrained_weight.data[:interval])
                start = end
        elif "decoder.cls.predictions.bias" in name or "cls.predictions.bias" in name:
            param.data[:pretrained_weight.shape[0]].copy_(
                pretrained_weight.data)
        else:
            param.data.copy_(pretrained_weight.data)
    else:
        print(name)


if __name__ == "__main__":
    config = AutoConfig.from_pretrained("bert-base-uncased")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    # pretrained_model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
    pretrained_model = AutoModelForCausalLM.from_pretrained("bert-base-uncased")
    add_tokens(tokenizer)
    config.max_position_embeddings = 1024 + 2
    config.vocab_size = len(tokenizer.get_vocab())
    # config = EncoderDecoderConfig.from_encoder_decoder_configs(config, config)
    # model = EncoderDecoderModel(config=config)
    model = BertForMaskedLM(config)
    load_weights(model, pretrained_model)
    model.save_pretrained('bert-base-uncased-itokens')
    tokenizer.save_pretrained('bert-base-uncased-itokens')
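A quick way to sanity-check the exported checkpoint is to reload it and run a single masked prediction. This is a minimal sketch, assuming the `bert-base-uncased-itokens` directory produced above exists on disk; the prompt text is only an illustrative smoke test.

import torch
from transformers import AutoTokenizer, BertForMaskedLM

# Reload the resized model and tokenizer saved above.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased-itokens')
model = BertForMaskedLM.from_pretrained('bert-base-uncased-itokens')
model.eval()

# Fill in a single [MASK] token as a smoke test.
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
print(tokenizer.decode(logits[0, mask_pos].argmax(dim=-1)))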
model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]

# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# Confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
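As a small extension (not part of the original snippet), the same `predictions` tensor can be turned into a ranked list of candidates for the masked position; `tokenizer` and `masked_index` are the variables already defined above.

# Rank the top 5 candidate tokens for the masked position by probability.
probs = torch.softmax(predictions[0, masked_index], dim=-1)
top_probs, top_ids = torch.topk(probs, k=5)
for p, idx in zip(top_probs.tolist(), top_ids.tolist()):
    print(f"{tokenizer.convert_ids_to_tokens([idx])[0]}: {p:.4f}")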
help="Location of the model", type=str, required=True) parser.add_argument("--tokenizer", help="Location of the tokenizer", type=str, required=True) parser.add_argument("--mode", type=str, choices=["mask_first", "mask_last"], required=True) parser.add_argument("--metric", help="Which metric to calculate ?", choices=["rank", "probability"], required=True) parser.add_argument("--metrics-output-path", type=str) args = parser.parse_args() tokenizer = BertTokenizer.from_pretrained(args.tokenizer) model = BertForMaskedLM.from_pretrained(args.model).cuda().eval() metrics_output_path = args.metrics_output_path if args.metrics_output_path is not None else args.model import os metrics_output_path = os.path.join( metrics_output_path, f"first_name_given_last_name/{args.mode}_{args.metric}") os.makedirs(metrics_output_path, exist_ok=True) evaluate(model, tokenizer, args.mode, args.metric, metrics_output_path)
import torch
from transformers import BertTokenizer, BertForMaskedLM
import sys

sys.path.append(".")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("shibing624/macbert4csc-base-chinese")
model = BertForMaskedLM.from_pretrained("shibing624/macbert4csc-base-chinese")
model = model.to(device)

# Example sentence with a deliberate typo: 高心 should be 高兴
sentence = "你找到你最喜欢的工作,我也很高心"
tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
print(tokens)

# with torch.no_grad():
#     outputs = model(**tokenizer(texts, padding=True, return_tensors='pt').to(device))

# Mask one position at a time and let the model predict the character there
for i in range(1, len(tokens) - 1):
    tmp = tokens[:i] + ['[MASK]'] + tokens[i + 1:]
    masked_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tmp)]).to(device)
    segment_ids = torch.tensor([[0] * len(tmp)]).to(device)
    attention_mask = torch.tensor([[1] * len(tmp)]).to(device)
    outputs = model(masked_ids,
                    attention_mask=attention_mask,
                    token_type_ids=segment_ids)
    prediction_scores = outputs[0]
    print(tmp)
    # Print the predicted character
    prediction_index = torch.argmax(prediction_scores[0, i]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([prediction_index])[0]
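A possible follow-up (an illustrative sketch, not part of the original script) is to collect the per-position predictions and keep only those that differ from the input, which yields a rough corrected sentence.

# Apply only the predictions that change a character.
corrected = list(tokens)
for i in range(1, len(tokens) - 1):
    tmp = tokens[:i] + ['[MASK]'] + tokens[i + 1:]
    masked_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tmp)]).to(device)
    with torch.no_grad():
        scores = model(masked_ids)[0]
    pred = tokenizer.convert_ids_to_tokens([torch.argmax(scores[0, i]).item()])[0]
    if pred != tokens[i]:
        corrected[i] = pred
print(''.join(corrected[1:-1]))  # e.g. 高心 -> 高兴, if the model corrects the typo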
def __init__(self, model_name_or_path: str):
    super(BertPretrain, self).__init__()
    self.bert_model = BertForMaskedLM.from_pretrained(model_name_or_path)
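For context, such a wrapper usually exposes the masked-LM loss in its forward pass. The sketch below is a plausible continuation, not taken from the original class; the `mlm_labels` argument name and the padding-id assumption are illustrative.

def forward(self, input_ids: torch.Tensor, mlm_labels: torch.Tensor):
    # Delegate to BertForMaskedLM; positions labeled -100 are ignored by the loss.
    outputs = self.bert_model(input_ids=input_ids,
                              attention_mask=(input_ids > 0).long(),  # assumes pad id 0
                              labels=mlm_labels)
    return outputs[0]  # masked-LM loss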
def main(): print('start of main') parser = argparse.ArgumentParser( description = '''This script computes probabilities for a masked token with words from the words file, and stores result in csv format to the output file ''') parser.add_argument("-s", type = str, required=True, dest = "sent_type", help = 'class name: "sv_agreement" or "anaphora"') parser.add_argument("-t", type = str, required=True, dest = "template", help = 'template name (see templates.txt)') parser.add_argument("-g", type = int, required=False, default=None, dest = "gpu_num", help = 'which gpu to run this on') parser.add_argument("-m", type = str, required=False, default='bert-base-uncased', dest = "model_path_or_name", help = 'path to the model or name of the model') args = parser.parse_args() if args.sent_type not in ['sv_agreement', 'anaphora']: parser.error("invalid sent_type argument for -s") print('creating results path') use_wug = args.model_path_or_name != 'bert-base-uncased' number = None if use_wug: model_type = args.model_path_or_name.split('/') if model_type[-1]=='': model_type = model_type[:-1] number = model_type[-3].lower() model_path = '/'.join(model_type[-3:]) results_path = FINE_TUNE_RESULTS_PATH[:-7] % model_path if not os.path.isdir(results_path): print('creating directory %s' % results_path) os.mkdir(results_path) results_path = FINE_TUNE_RESULTS_PATH[:-4] % (model_path, args.sent_type) if not os.path.isdir(results_path): print('creating directory %s' % results_path) os.mkdir(results_path) results_path = FINE_TUNE_RESULTS_PATH % (model_path, args.sent_type, args.template) else: results_path = RESULTS_PATH[:-4] % args.sent_type if not os.path.isdir(results_path): print('creating directory %s' % results_path) os.mkdir(results_path) results_path = RESULTS_PATH % (args.sent_type, args.template) results_filename = RESULTS_FILENAME % args.template outfilename = os.path.join(str(ABS_PATH), results_path, results_filename) if not os.path.isdir(results_path): print('creating directory %s' % results_path) os.mkdir(results_path) print('getting consts') sent_types = csp_consts.SENT_TYPES[args.sent_type] batch_sizes = csp_consts.BERT_BATCH_SIZES[args.sent_type] masked_types = csp_consts.MASKED_TYPE[args.sent_type] max_len_types = csp_consts.BERT_MAX_TYPE[args.sent_type] try: template_name = sent_types[args.template] batch_size_dict = batch_sizes[args.template] masked_type = masked_types[args.template] max_len = max_len_types[args.template] except KeyError: parser.error("Incompatible template for the given sentence type") sys.exit() print('loading model at', datetime.now()) bert_tokenizer = BertTokenizer.from_pretrained(args.model_path_or_name) bert_model = BertForMaskedLM.from_pretrained(args.model_path_or_name) bert_model.eval() if args.gpu_num is not None: device = torch.device('cuda:'+str(args.gpu_num) if torch.cuda.is_available() else 'cpu') print('running on GPU: %d' % args.gpu_num) else: device = torch.device('cpu') bert_model.to(device) batch_size = batch_size_dict['pairs'] num_sents = batch_size_dict['sents'] if use_wug: batch_size *= 2 num_sents //= 2 print('starting all computations at', datetime.now()) eval_from_file(bert_model, bert_tokenizer, template_name, outfilename, masked_type, batch_size, num_sents, max_len, device=device, use_wug=use_wug, number=number) print('completed all computations at', datetime.now())
def aspect_extractor_trainer(data_itr, model_name, bert_tokenizer, linguistic_vocab, required_features_list, lang, lowercase_data, H, lr, scheduler_patience_steps, scheduler_decay_factor, scheduler_min_lr, epochs, max_norm, no_improvement_tolerance=5000, save_model_name="project_sublayers.pt", relative_sizing=False, resolution_strategy="first", report_every=5000): """ Implementation of the sub-layer model trainer which pre-trains the transformer heads using the BERT vectors. """ assert len(required_features_list) > 0, "You have to select some features" assert linguistic_vocab is not None and len(linguistic_vocab) > 0 Hs = [] for rfl in required_features_list: if rfl in linguistic_vocab: if relative_sizing: print( "This might not be supported in the multi-head implementation" ) Hs.append(len(linguistic_vocab[rfl])) else: # TODO consider hierarchical encoding of features here Hs.append(1.0) assert len(Hs) > 0 Hs.append(max(Hs)) weight_ratio = int(float(H) / sum(Hs)) assert weight_ratio > 1 Hs = [int(weight_ratio * ind) for ind in Hs] Hs[-1] += max(0, (H - sum(Hs))) print( "Loading the pre-trained BertForMaskedLM model: {}".format(model_name)) bert_lm = BertForMaskedLM.from_pretrained( model_name, output_hidden_states=True).to(device) number_of_bert_layers = len(bert_lm.bert.encoder.layer) + 1 D_in = D_out = bert_lm.bert.config.hidden_size reverse_linguistic_vocab = create_reverse_linguistic_vocab( linguistic_vocab) print("Loading Spacy Tokenizers") spacy_tokenizer_1, spacy_tokenizer_2 = SpacyTokenizer( lang, lowercase_data), SpacyTokenizer(lang, lowercase_data) spacy_tokenizer_2.overwrite_tokenizer_with_split_tokenizer() print("Creating the model") model = AspectExtractor( D_in, Hs, D_out, [len(linguistic_vocab[f]) + 1 for f in required_features_list], number_of_bert_layers, required_features_list, reverse_linguistic_vocab).to(device) model.apply(weight_init) opt = optim.SGD(model.parameters(), lr=float(lr), momentum=0.9) scheduler = ReduceLROnPlateau(opt, mode='min', patience=scheduler_patience_steps, factor=scheduler_decay_factor, threshold=0.001, verbose=False, min_lr=scheduler_min_lr) print("Starting to train ...") break_condition = False for t in range(epochs): if break_condition: print( "Minimum {} batches have been observed without any accuracy improvements in classifiers, ending the training ..." 
.format(no_improvement_tolerance)) break all_loss = 0.0 all_tokens_count = 0.0 feature_pred_corrects = [0 for _ in range(len(required_features_list))] feature_pred_correct_all = 0.0 all_prediction = [[] for _ in required_features_list] all_actual = [[] for _ in required_features_list] # TODO use the actual dataset object instead of this iterator itr = data_itr() tolerance_counts = [0 for _ in required_features_list] tolerance_bests = [0.0 for _ in required_features_list] for batch_id, input_sentences in enumerate(itr): sequences = [ torch.tensor( bert_tokenizer.tokenizer.encode(input_sentence, add_special_tokens=True), device=device) for input_sentence in input_sentences ] features, feature_weights = map_sentences_to_vocab_ids( input_sentences, required_features_list, linguistic_vocab, spacy_tokenizer_1, spacy_tokenizer_2, bert_tokenizer) input_ids = torch.nn.utils.rnn.pad_sequence( sequences, batch_first=True, padding_value=bert_tokenizer.tokenizer.pad_token_id) if input_ids.size(1) > bert_lm.config.max_position_embeddings: continue outputs = bert_lm(input_ids, masked_lm_labels=input_ids)[ 2] # (batch_size * [input_length + 2] * 768) all_layers_embedded = torch.cat( [o.detach().unsqueeze(0) for o in outputs], dim=0) maxes = torch.max(model.bert_weights_for_average_pooling, dim=-1, keepdim=True)[0] x_exp = torch.exp(model.bert_weights_for_average_pooling - maxes) x_exp_sum = torch.sum(x_exp, dim=-1, keepdim=True) output_custom = x_exp / x_exp_sum embedded = torch.matmul(all_layers_embedded.permute(1, 2, 3, 0), output_custom) # sequence_length, batch_size, len(feats) predictions = torch.zeros(embedded.size(1), embedded.size(0), len(required_features_list)) for s in range(1, embedded.size(1) - 1): x = embedded.select(1, s) features_selected = [] feature_weights_selected = [] permitted_to_continue = True for f, fw in zip(features, feature_weights): if s < f.size(1): features_selected.append(f.select(1, s)) feature_weights_selected.append(fw.select(1, s)) else: permitted_to_continue = False if not permitted_to_continue: continue _, loss, feature_pred_correct, feat_predictions = model( x, features_selected, feature_weights_selected) predictions[s] = feat_predictions for ind, score in enumerate(feature_pred_correct): feature_pred_corrects[ind] += score.sum().item() feature_pred_correct_all += feature_pred_correct[0].size(0) model.zero_grad() loss.backward(retain_graph=True) nn.utils.clip_grad_norm_(model.parameters(), max_norm) opt.step() all_loss += loss.item() all_tokens_count += x.size(0) _classification_report_ = [ "{}:{:.2f}%".format( feat.upper(), float(feature_pred_corrects[ind] * 100) / feature_pred_correct_all) for ind, feat in enumerate(required_features_list) ] itr.set_description( "Epoch: {}, Average Loss: {:.2f}, [{}]".format( t, all_loss / all_tokens_count, "; ".join(_classification_report_))) # if model has not had any improvements in any of the classifier scores after {no_improvement_tolerance} batches, the training will stop. 
for ind, feat in enumerate(required_features_list): feat_score = round( float(feature_pred_corrects[ind] * 100) / feature_pred_correct_all, 3) if tolerance_bests[ind] < feat_score: tolerance_bests[ind] = feat_score tolerance_counts[ind] = 0 else: tolerance_counts[ind] = tolerance_counts[ind] + 1 break_condition = sum([ 1 if tolerance_counts[ind] >= no_improvement_tolerance else 0 for ind, feat in enumerate(required_features_list) ]) == len(required_features_list) if break_condition: break scheduler.step(all_loss / all_tokens_count) predictions = predictions.transpose(0, 1) for b in range(predictions.size(0)): for l in range(1, predictions.size(1) - 1): classes = predictions[b][l] for idx in range(len(required_features_list)): pred_id = int(classes[idx].item()) - 1 if idx >= len(features) or b >= features[idx].size( 0) or l >= features[idx].size(1): # print("WARNING: skipping access to index out of bounds for a tensor with size " # "({}, {}, {}) with indices [{}, {}, {}]".format(len(features), features[idx].size(0), # features[idx].size(1), idx, b, l)) continue actual_id = int(features[idx][b][l].item()) - 1 predicted_label = reverse_linguistic_vocab[ required_features_list[idx]][ pred_id] if pred_id > -1 else '__PAD__' actual_label = reverse_linguistic_vocab[ required_features_list[idx]][ actual_id] if actual_id > -1 else '__PAD__' # predicted_bis, predicted_label = separate_bis_label(predicted_label) # actual_bis, actual_label = separate_bis_label(actual_label) if actual_label != '__PAD__': all_actual[idx].append(actual_label) all_prediction[idx].append(predicted_label) # print(pred_tag, actual_label, actual_bis, predicted_label, predicted_bis, predicted_label == actual_label) if batch_id and batch_id % report_every == 0: print("Creating report/persisting trained model ...") create_train_report_and_persist_modules( model, save_model_name, all_actual, all_prediction, feature_pred_correct_all, feature_pred_corrects, required_features_list, resolution_strategy) print( "Cleaning up the collected actual/prediction labels [done due to prevent application getting killed for memory limits]" ) for idx in range(len(required_features_list)): del all_actual[idx][:] del all_prediction[idx][:] create_train_report_and_persist_modules(model, save_model_name, all_actual, all_prediction, feature_pred_correct_all, feature_pred_corrects, required_features_list, resolution_strategy) print("Training done.")
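# Note: the maxes / x_exp / x_exp_sum block inside the batch loop above is simply a
# numerically stable softmax over the learnable per-layer weights, followed by a
# weighted average over BERT's layer outputs. An equivalent, illustrative sketch
# (the function and variable names here are assumptions, not the module's API):
import torch

def weighted_layer_average(all_layers_embedded, layer_weights):
    # all_layers_embedded: (num_layers, batch, seq_len, hidden)
    # layer_weights:       (num_layers,) learnable scalar per BERT layer
    weights = torch.softmax(layer_weights, dim=-1)
    # move the layer axis last, then contract it against the softmaxed weights
    return torch.matmul(all_layers_embedded.permute(1, 2, 3, 0), weights)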
    # (the enclosing helper's "def" line is not included in this excerpt; it resolves
    # a path relative to this file's directory)
    basedir = os.path.dirname(__file__)
    return os.path.join(basedir, path)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Load a trained model and vocabulary that you have fine-tuned
model = BertForMaskedLM.from_pretrained(
    output_dir,
    output_attentions=False,    # Whether the model returns attention weights.
    output_hidden_states=True,  # Whether the model returns all hidden states.
)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
# model.to(device)
model.eval()


def clean(sent):
    sent = sent.translate(str.maketrans('', '', string.punctuation))
    sent = sent.lower().split()
    sent = [word for word in sent if word not in words]
    sent = ' '.join(sent)
    return sent
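# clean() relies on `words` being defined elsewhere in the original script as a
# collection of stopwords (and `output_dir` as the fine-tuned checkpoint directory).
# A small, assumed usage example; any stopword list would do:
from nltk.corpus import stopwords  # assumption; requires nltk.download('stopwords')

words = set(stopwords.words('english'))
print(clean("The model, surprisingly, predicts the masked token!"))
# -> "model surprisingly predicts masked token"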
def main(): parser = argparse.ArgumentParser() parser.add_argument("--extract_data", action="store_true", help="extract data from scratch (otherwise, load data from saved file)") parser.add_argument("--all_langs", action="store_true", help="Translate from all langs to all langs") parser.add_argument("--eval_pos", help="evaluate a specific pos tag: NOUN, VERB or ADJ") args = parser.parse_args() random_seed = 10 num_langs = 15 sent_per_lang = 5000 num_classifiers = 20 num_to_eval = 1000 random.seed(random_seed) print("random_seed", random_seed) print("num_langs", num_langs) print("sent_per_lang", sent_per_lang) print("num_classifiers", num_classifiers) print("num_to_eval", num_to_eval) print(args) # load mBERT pretrained_weights = 'bert-base-multilingual-uncased' tokenizer_mlm = BertTokenizer.from_pretrained(pretrained_weights) model_mlm = BertForMaskedLM.from_pretrained(pretrained_weights, output_hidden_states=True) output_embeddings = model_mlm.cls.predictions.decoder.weight.detach().cpu().numpy() # collect data (<sent_per_lang> sentences) from <num_langs> most frequent languages, from TED data, langs = collect_data_per_lang(num_langs=num_langs, sent_per_lang=sent_per_lang) # extract representations of random tokens from each sentence random.seed(random_seed) data_filename = "../data/data_with_states_{}lang_{}perlang_embeddings".format(num_langs, sent_per_lang) if args.extract_data: # extract the representations and dump them to a file data_with_states = extract_repr_random_token_bert_mlm(tokenizer_mlm, copy.deepcopy(data)[:num_langs * sent_per_lang], output_embeddings) with open(data_filename, "wb") as f: pickle.dump(data_with_states, f) print("extracted data") else: # load the representations instead of extracting them with open(data_filename, "rb") as f: data_with_states = pickle.load(f) print("loaded data") # data for lang_repr vecs, labels_lang = data_for_lang_repr(data_with_states, random_seed) # create a vector representation for each language, and save to file random.seed(random_seed) lang_repr = create_repr_per_lang(vecs, labels_lang) lang_repr_filename = "../data/lang_repr_{}lang_no_inlp".format(num_langs) with open(lang_repr_filename, "wb") as f: pickle.dump(lang_repr, f) # evaluation using northeuralex # extract north_euralex data and save # 'zh-tw', 'zh-cn' (chinese) and 'pt-br' (brazilian portuguese) are not in this data ted2eur = {"en": "eng", "ar": "arb", "he": "heb", "ru": "rus", "ko": "kor", "it": "ita", "ja": "jpn", "es": "spa", "fr": "fra", "nl": "nld", "ro": "ron", "tr": "tur"} eur2ted = {v: k for k, v in ted2eur.items()} all_translations, map_word_pos = extract_north_euralex(eur2ted) with open("../data/all_translations_north_euralex", "wb") as f: pickle.dump(all_translations, f) # names of files details_filename = "../data/north_euralex_details" repr_filename = "../data/representations_embed_{}lang_no_inlp".format(num_langs) # evaluate on north_euralex if args.all_langs: all_langs_1 = [] all_langs_5 = [] all_langs_10 = [] for source_lang in ["eng", "rus", "nld", "fra", "spa", "ita", "ron", "tur", "kor", "jpn", "arb", "heb"]: for target_lang in ["eng", "rus", "nld", "fra", "spa", "ita", "ron", "tur", "kor", "jpn", "arb", "heb"]: print(source_lang, "to", target_lang) rank_before, rank_after = evaluate_northeuralex_all_langs(source_lang, target_lang, lang_repr, eur2ted, all_translations, output_embeddings, tokenizer_mlm) acc1, acc5, acc10 = print_evals(rank_before, rank_after, return_accs=True) all_langs_1.append(acc1) all_langs_5.append(acc5) all_langs_10.append(acc10) 
all_langs_1 = np.array(all_langs_1) all_langs_5 = np.array(all_langs_5) all_langs_10 = np.array(all_langs_10) print(all_langs_1) print(all_langs_5) print(all_langs_10) np.save("../data/all_langs_1", all_langs_1.reshape(12, 12)) np.save("../data/all_langs_5", all_langs_5.reshape(12, 12)) np.save("../data/all_langs_10", all_langs_10.reshape(12, 12)) else: rank_before, rank_after = evaluate_northeuralex(lang_repr, eur2ted, all_translations, map_word_pos, output_embeddings, tokenizer_mlm, args.eval_pos, repr_filename=repr_filename, details_filename=details_filename) # print evaluations for lang in rank_before: print("\nlang:", lang) print_evals(rank_before[lang], rank_after[lang]) rank_before_all = [] rank_after_all = [] for lang in rank_before: rank_before_all += rank_before[lang] rank_after_all += rank_after[lang] print("\nall together\n") print_evals(rank_before_all, rank_after_all)
def AE(df): model_type = 'bert-base-uncased' tokenizer = BertTokenizer.from_pretrained(model_type) model = BertModel.from_pretrained(model_type, return_dict=True) mask_model = BertForMaskedLM.from_pretrained(model_type, return_dict=True) sep_token = '[SEP]' mask_token = '[MASK]' mask_id = tokenizer(mask_token)['input_ids'][1] sep_id = tokenizer(sep_token)['input_ids'][1] optimizer = AdamW(model.parameters()) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) mask_model.to(device) auxiliary_tokens = ['the', 'aspect', 'term', 'is'] df['mask_tokens'] = 0 df['auxiliary_tokens'] = 0 df = df.astype('object') for i in range(len(df)): #for j in range(len(df['aspect_terms'].iloc[i])): auxiliary_sents = [] for j in range(len(df['aspect_terms'].iloc[i])): aspect_terms = df['aspect_terms'].iloc[i][j] auxiliary_sent = auxiliary_tokens + [aspect_terms] + [ sep_token ] + df['tokens'].iloc[i] auxiliary_sents.append(auxiliary_sent) mask_sent = auxiliary_tokens + [mask_token] + [sep_token ] + df['tokens'].iloc[i] df['mask_tokens'].iloc[i] = mask_sent df['auxiliary_tokens'].iloc[i] = auxiliary_sents df['distance'] = 0 df = df.astype('object') for i in range(len(df)): tokenized = tokenizer.encode(df['mask_tokens'].iloc[i]) sep_index = tokenized.index(sep_id) mask_index = tokenized.index(mask_id) tokenized = pd.Series([tokenized]) padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post") attention_mask = np.where(padded != 0, 1, 0) input_ids = torch.tensor(padded).to(device) attention_mask = torch.tensor(attention_mask).to(device) with torch.no_grad(): last_hidden_states = model(input_ids, attention_mask=attention_mask) original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu( ).numpy() distance = [] for pertubed_index in range(sep_index + 1, MAX_LEN): padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post") if padded[0][pertubed_index] != 0 and padded[0][ pertubed_index] != sep_id: #print(padded.shape) cur_id = padded[0][pertubed_index] padded[0][pertubed_index] = mask_id cur_embedding = mask_embedding(model, padded, mask_index) d = dist(original_mask_embedding, cur_embedding) distance.append((cur_id, d)) df['distance'].iloc[i] = distance df['perturbed_mask_index'] = 0 df = df.astype('object') for i in range(len(df)): perturbed_mask_index = [] mask_threshold = calculate_threshold( np.array(df['distance'].iloc[i])[:, 1], std_strength) for dis_index in range(len(df['distance'].iloc[i])): if df['distance'].iloc[i][dis_index][1] < mask_threshold and df[ 'labels'].iloc[i][dis_index] != 'B' and df['labels'].iloc[ i][dis_index] != 'I': perturbed_mask_index.append(dis_index) df['perturbed_mask_index'].iloc[i] = perturbed_mask_index df['augment_token_id'] = 0 df = df.astype('object') for i in range(len(df)): augment_tokenizeds = [] for j in range(len(df['aspect_terms'].iloc[i])): tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j]) tokenized = torch.Tensor(tokenized).unsqueeze(0).to( torch.int64).to(device) augment_tokenized = tokenizer.encode( df['auxiliary_tokens'].iloc[i][j]) for k in range(len(df['perturbed_mask_index'].iloc[i])): mask_tokenized = tokenizer.encode( df['auxiliary_tokens'].iloc[i][j]) sep_index = mask_tokenized.index(sep_id) perturbed_mask_index = df['perturbed_mask_index'].iloc[i][ k] + sep_index + 1 mask_tokenized[perturbed_mask_index] = mask_id mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to( torch.int64).to(device) outputs 
= mask_model(mask_tokenized, labels=tokenized) augment_tokenized[perturbed_mask_index] = int( outputs.logits[:, perturbed_mask_index, :].argmax().cpu( ).numpy()) augment_tokenizeds.append(augment_tokenized) df['augment_token_id'].iloc[i] = augment_tokenizeds df['augment_tokens'] = 0 df = df.astype('object') for i in range(len(df)): tokens_lists = [] for j in range(len(df['aspect_terms'].iloc[i])): tokens_list = [] for k in range(1, len(df['augment_token_id'].iloc[i][j]) - 1): tokens_list.append( tokenizer.decode([df['augment_token_id'].iloc[i][j][k]])) sep_index = tokens_list.index(sep_token) tokens_list = tokens_list[sep_index + 1:] tokens_lists.append(tokens_list) df['augment_tokens'].iloc[i] = tokens_lists return df
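# mask_embedding, dist, calculate_threshold, MAX_LEN and std_strength all come from
# elsewhere in the original module. Rough sketches of what such helpers could look
# like (these are assumptions, not the original definitions):
import numpy as np
import torch

def mask_embedding(model, padded, mask_index):
    # Hidden state at the [MASK] position after one context token has been re-masked.
    device = next(model.parameters()).device
    input_ids = torch.tensor(padded).to(device)
    attention_mask = torch.tensor(np.where(padded != 0, 1, 0)).to(device)
    with torch.no_grad():
        out = model(input_ids, attention_mask=attention_mask)
    return out[0][:, mask_index, :].cpu().numpy()

def dist(a, b):
    # One plausible choice: Euclidean distance between the two mask embeddings.
    return float(np.linalg.norm(a - b))

def calculate_threshold(distances, strength):
    # Tokens whose perturbation shifts the mask embedding by less than
    # mean - strength * std are treated as safe to replace.
    return float(distances.mean() - strength * distances.std())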
from transformers import BertTokenizer, BertForMaskedLM, GPT2Tokenizer, GPT2LMHeadModel

bert_name = 'bert-base-uncased'
gpt2_name = 'gpt2'
bert_dir = './models/bert'
gpt2_dir = './models/gpt2'

# Download the pretrained checkpoints once and cache them locally.
bert_model = BertForMaskedLM.from_pretrained(bert_name)
bert_tokenizer = BertTokenizer.from_pretrained(bert_name)
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_name)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_name)

bert_model.save_pretrained(bert_dir)
bert_tokenizer.save_pretrained(bert_dir)
gpt2_model.save_pretrained(gpt2_dir)
gpt2_tokenizer.save_pretrained(gpt2_dir)
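# Once saved, both checkpoints reload entirely from the local directories, so later
# runs need no network access; for example:
bert_model = BertForMaskedLM.from_pretrained(bert_dir)
bert_tokenizer = BertTokenizer.from_pretrained(bert_dir)
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_dir)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_dir)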
description= "Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation" ) parser.add_argument("--model_type", default="bert", choices=["bert", "roberta"]) parser.add_argument("--model_name", default='bert-base-uncased', type=str) parser.add_argument( "--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str) parser.add_argument("--vocab_transform", action='store_true') args = parser.parse_args() if args.model_type == 'bert': model = BertForMaskedLM.from_pretrained(args.model_name) prefix = 'bert' elif args.model_type == 'roberta': model = RobertaForMaskedLM.from_pretrained(args.model_name) prefix = 'roberta' state_dict = model.state_dict() compressed_sd = {} for w in ['word_embeddings', 'position_embeddings']: compressed_sd[f'distilbert.embeddings.{w}.weight'] = \ state_dict[f'{prefix}.embeddings.{w}.weight'] for w in ['weight', 'bias']: compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \ state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
# Flatten the nested per-sentence samples into one list of (sentence, name, mask) tuples.
all_samples = [
    sample for sentence_samples in samples for sample in sentence_samples
]
sample_sentences, sample_names, sample_masks = list(zip(*all_samples))
sample_sentences, sample_names, sample_masks = (
    list(sample_sentences),
    list(sample_names),
    list(sample_masks),
)
print(len(sample_sentences))
# sys.exit(0)  # leftover debug exit; leaving it in would skip the evaluation below

tokenizer = BertTokenizerFast.from_pretrained(args.tokenizer)
model = BertForMaskedLM.from_pretrained(args.model).eval().cuda()
cmp_model = BertForMaskedLM.from_pretrained(args.comparator).eval().cuda()

metrics_output_path = args.model if args.metrics_output_path is None else args.metrics_output_path
print(f"Saving results to {metrics_output_path}")

# Score the same masked sentences under both models and compare their losses.
torch.cuda.empty_cache()
losses_under_model = batched_perplexity(model, tokenizer, sample_sentences, sample_masks)
torch.cuda.empty_cache()
losses_under_comparator = batched_perplexity(cmp_model, tokenizer, sample_sentences, sample_masks)
loss_diff = losses_under_comparator - losses_under_model
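# batched_perplexity is defined elsewhere; the usual way to get a comparable masked-LM
# loss per sentence is pseudo-log-likelihood: mask each chosen position in turn and
# average the cross-entropy at the masked slot. An illustrative single-sentence sketch,
# not the actual batched implementation:
import torch
import torch.nn.functional as F

def masked_lm_loss(model, tokenizer, sentence, mask_positions):
    enc = tokenizer(sentence, return_tensors='pt').to(model.device)
    losses = []
    for pos in mask_positions:
        ids = enc['input_ids'].clone()
        target = ids[0, pos].item()
        ids[0, pos] = tokenizer.mask_token_id
        with torch.no_grad():
            logits = model(input_ids=ids, attention_mask=enc['attention_mask']).logits
        losses.append(F.cross_entropy(logits[0, pos].unsqueeze(0),
                                      torch.tensor([target], device=ids.device)).item())
    return sum(losses) / len(losses)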
# https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb

# In[6]:
from transformers import BertForMaskedLM, BertConfig

configuration = BertConfig(
    vocab_size=80000,
    # max_position_embeddings=512,  # 512 + 2 more special tokens
    # num_attention_heads=12,
    # num_hidden_layers=12,
    # type_vocab_size=1,
)
# configuration.vocab_size = 20000

model = BertForMaskedLM(config=configuration)
# model = RobertaForMaskedLM.from_pretrained('./Roberta/checkpoint-200000')

# Accessing the model configuration
# model.config

# # Initializing Tokenizer
# ## Rewrite Tokenizer of bert_itos_80k with special tokens in front

# In[9]:
from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
from fastai.text.data import TokenizeProcessor, NumericalizeProcessor
def get_model(vocab_size):
    config = get_config(vocab_size)
    if transformer_type == 'roberta':
        return RobertaForMaskedLM(config=config)
    return BertForMaskedLM(config=config)
def __init__(self, cfg, tokenizer):
    super().__init__(cfg)
    self.cfg = cfg
    self.bert = BertForMaskedLM.from_pretrained(cfg.MODEL.BERT_CKPT)
    self.tokenizer = tokenizer
def __init__(self): self.src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') self.tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') self.tgt_tokenizer.bos_token = '<s>' self.tgt_tokenizer.eos_token = '</s>' #hidden_size and intermediate_size are both wrt all the attention heads. #Should be divisible by num_attention_heads encoder_config = BertConfig(vocab_size=self.src_tokenizer.vocab_size, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, hidden_dropout_prob=config.dropout_prob, attention_probs_dropout_prob=config.dropout_prob, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12) decoder_config = BertConfig(vocab_size=self.tgt_tokenizer.vocab_size, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, hidden_dropout_prob=config.dropout_prob, attention_probs_dropout_prob=config.dropout_prob, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, is_decoder=True) #Create encoder and decoder embedding layers. encoder_embeddings = torch.nn.Embedding(self.src_tokenizer.vocab_size, config.hidden_size, padding_idx=self.src_tokenizer.pad_token_id) decoder_embeddings = torch.nn.Embedding(self.tgt_tokenizer.vocab_size, config.hidden_size, padding_idx=self.tgt_tokenizer.pad_token_id) encoder = BertModel(encoder_config) encoder.set_input_embeddings(encoder_embeddings.cpu()) decoder = BertForMaskedLM(decoder_config) decoder.set_input_embeddings(decoder_embeddings.cpu()) input_dirs = config.model_output_dirs suffix = "pytorch_model.bin" decoderPath = os.path.join(input_dirs['decoder'], suffix) encoderPath = os.path.join(input_dirs['encoder'], suffix) decoder_state_dict = torch.load(decoderPath) encoder_state_dict = torch.load(encoderPath) decoder.load_state_dict(decoder_state_dict) encoder.load_state_dict(encoder_state_dict) self.model = TranslationModel(encoder, decoder, None, None, self.tgt_tokenizer, config) self.model.cpu() #model.eval() self.model.encoder.eval() self.model.decoder.eval()