def _freeze_bert_except_last_layer(model):
    """Freeze ``model.bert`` except its last encoder layer and its pooler.

    Every parameter under ``model.bert`` gets ``requires_grad = False``;
    the parameters of ``model.bert.encoder.layer[-1]`` and, when present,
    ``model.bert.pooler`` are then switched back on.  Parameters outside
    ``model.bert`` are not touched.
    """
    for param in model.bert.parameters():
        param.requires_grad = False

    trainable_modules = [model.bert.encoder.layer[-1]]
    # The pooler attribute may be None on models built without a pooling
    # layer; in that case there is nothing to unfreeze.
    if model.bert.pooler is not None:
        trainable_modules.append(model.bert.pooler)

    for module in trainable_modules:
        for param in module.parameters():
            param.requires_grad = True


def main():
    """Parse CLI options, then fine-tune and/or evaluate a BERT masked LM.

    Steps, in order: argument parsing and validation, optional remote
    debugger attach, device / distributed setup, logging and seeding,
    loading of config, tokenizers and model, partial freezing of BERT,
    optional training followed by saving and reloading the model, and
    optional evaluation of one or all checkpoints.

    Returns:
        dict: evaluation results, keyed ``"<metric>_<global_step>"`` (the
        step suffix is empty when a single checkpoint is evaluated).  Empty
        when evaluation is not run.

    Raises:
        ValueError: if ``--do_eval`` is given without ``--eval_data_file``,
            or if the output directory is non-empty while training without
            ``--overwrite_output_dir``.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file", default=None, type=str, required=True,
        help="The input training data file (a text file).")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    # Other parameters
    parser.add_argument(
        "--eval_data_file", default=None, type=str,
        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
    # NOTE: there is no --model_type option; the model family is fixed to BERT.
    parser.add_argument(
        "--model_name_or_path", default="bert-base-cased", type=str,
        help="The model checkpoint for weights initialization.")

    # Whether to train with the masked-LM objective
    parser.add_argument(
        "--mlm", action='store_true',
        help="Train with masked-language modeling loss instead of language modeling.")
    parser.add_argument(
        "--mlm_probability", type=float, default=0.15,
        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument(
        "--config_name", default="", type=str,
        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument(
        "--tokenizer_name", default="", type=str,
        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument(
        "--cache_dir", default="", type=str,
        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
    # NOTE(review): the original author marked this option as unclear.  The
    # clamping of block_size to the tokenizer's max single-sentence length is
    # disabled in this script, so the value is left exactly as parsed.
    parser.add_argument(
        "--block_size", default=-1, type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )

    # ====== Training ======
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    # Validation
    parser.add_argument(
        "--evaluate_during_training", action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Set this flag if you are using an uncased model.")

    # ====== Training options ======
    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")

    # NOTE: explicit --n_gpu / --device_ids options are intentionally absent;
    # the GPU count is derived from the environment further below.

    # Weight decay
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps", default=-1, type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        '--save_total_limit', type=int, default=None,
        help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default')
    parser.add_argument(
        "--eval_all_checkpoints", action='store_true',
        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache', action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level', type=str, default='O1',
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")

    # ======= Distributed training =========
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # NOTE(review): the upstream check that forces --mlm for BERT-like models
    # is disabled here, so running without --mlm is not rejected.
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        use_cuda = torch.cuda.is_available() and not args.no_cuda
        device = torch.device("cuda" if use_cuda else "cpu")
        # Report zero GPUs when CUDA is explicitly disabled, so that code
        # keyed on n_gpu does not take its GPU paths while running on CPU.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging: only the main process logs at INFO level
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Barrier to make sure only the first process in distributed training
        # download model & vocab
        torch.distributed.barrier()

    cache_dir = args.cache_dir if args.cache_dir else None
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=cache_dir)
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=cache_dir)
    jp_tokenizer = JumanTokenizer()
    model = BertForMaskedLM.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=cache_dir)

    # Freeze BERT except for the final encoder layer and the pooler
    _freeze_bert_except_last_layer(model)
    # Sanity check: the last encoder layer must still be trainable
    assert all(param.requires_grad
               for param in model.bert.encoder.layer[-1].parameters())
    model.to(args.device)

    if args.local_rank == 0:
        # End of barrier to make sure only the first process in distributed
        # training download model & vocab
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            # Barrier to make sure only the first process in distributed
            # training process the dataset, and the others will use the cache
            torch.distributed.barrier()

        train_dataset = load_and_cache_examples(args, bert_tokenizer,
                                                jp_tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model,
                                     bert_tokenizer, jp_tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and
    # tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Unwrap the model when it was wrapped for distributed/parallel
        # training (the wrapper exposes the real model as `.module`)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        bert_tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the
        # trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForMaskedLM.from_pretrained(args.output_dir)
        bert_tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        _freeze_bert_except_last_layer(model)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            weight_files = sorted(
                glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                          recursive=True))
            checkpoints = [os.path.dirname(path) for path in weight_files]
            # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # The step number is taken from the directory-name suffix, and
            # only when several checkpoints are being compared
            global_step = (checkpoint.split('-')[-1]
                           if len(checkpoints) > 1 else "")
            prefix = (checkpoint.split('/')[-1]
                      if checkpoint.find('checkpoint') != -1 else "")

            model = BertForMaskedLM.from_pretrained(checkpoint)
            _freeze_bert_except_last_layer(model)
            model.to(args.device)

            result = evaluate(args, model, bert_tokenizer, jp_tokenizer,
                              prefix=prefix)
            results.update({
                k + '_{}'.format(global_step): v
                for k, v in result.items()
            })

    return results
def __init__(self):
    """Load the 'bert-base-chinese' tokenizer and masked-LM model.

    The model is switched to eval mode before being stored on the instance.
    """
    pretrained_name = 'bert-base-chinese'
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    masked_lm = BertForMaskedLM.from_pretrained(pretrained_name)
    masked_lm.eval()
    self.model = masked_lm
def count_parameters(model):
    """Return the summed element count of all parameters with requires_grad set."""
    trainable_sizes = [
        param.numel() for param in model.parameters() if param.requires_grad
    ]
    return sum(trainable_sizes)


# Build config, tokenizer and model for the language model named by args.LM.
if args.LM == 'Bert':
    from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM

    config = BertConfig(
        vocab_size=28996,
        max_position_embeddings=512,
        num_attention_heads=12,
        num_hidden_layers=12,
        # type_vocab_size=2, default is 2
    )
    tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained(
        './multi-label_LM/multi-label_Bert_e10_b16', config=config)
    # Alternative checkpoint:
    # model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.
elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

    config = RobertaConfig(
        vocab_size=50265,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=12,
        type_vocab_size=1,
    )
    tokenizer = RobertaTokenizerFast.from_pretrained(
        'roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained(
        './multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture
# -*- coding: utf-8 -*- """ @author:XuMing([email protected]) @description: pip install gradio """ import gradio as gr import operator import torch from transformers import BertTokenizer, BertForMaskedLM tokenizer = BertTokenizer.from_pretrained( "shibing624/macbert4csc-base-chinese") model = BertForMaskedLM.from_pretrained("shibing624/macbert4csc-base-chinese") def ai_text(text): with torch.no_grad(): outputs = model(**tokenizer([text], padding=True, return_tensors='pt')) def get_errors(corrected_text, origin_text): sub_details = [] for i, ori_char in enumerate(origin_text): if ori_char in [' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤']: # add unk word corrected_text = corrected_text[:i] + ori_char + corrected_text[ i:] continue if i >= len(corrected_text): continue if ori_char != corrected_text[i]:
if __name__ == "__main__": parser = argparse.ArgumentParser( description= "Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation" ) parser.add_argument("--model_type", default="bert", choices=["bert"]) parser.add_argument("--model_name", default="bert-base-uncased", type=str) parser.add_argument( "--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str) parser.add_argument("--vocab_transform", action="store_true") args = parser.parse_args() if args.model_type == "bert": model = BertForMaskedLM.from_pretrained(args.model_name) prefix = "bert" else: raise ValueError('args.model_type should be "bert".') state_dict = model.state_dict() compressed_sd = {} for w in ["word_embeddings", "position_embeddings"]: compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[ f"{prefix}.embeddings.{w}.weight"] for w in ["weight", "bias"]: compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[ f"{prefix}.embeddings.LayerNorm.{w}"] std_idx = 0
@description: refer https://github.com/voidful/BertGenerate/blob/master/Bert_Generate.ipynb update bert_lstm model to seq2seq """ import torch import torch.nn as nn from transformers import BertTokenizer, BertModel, BertForMaskedLM from pycorrector import config input_text = "[CLS] I go to school by bus [SEP] " target_text = "我搭校车上学" modelpath = config.bert_model_dir tokenizer = BertTokenizer.from_pretrained(modelpath) model = BertForMaskedLM.from_pretrained(modelpath) # cuda # model.to('cuda') def get_example_pair(input_text, target_text): example_pair = dict() for i in range(0, len(target_text) + 1): tokenized_text = tokenizer.tokenize(input_text) tokenized_text.extend(target_text[:i]) tokenized_text.append('[MASK]') indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) tokens_tensor = torch.tensor([indexed_tokens]).to('cpu')
dtype=torch.long) all_input_attention = torch.tensor( [input_attention for input_attention in input_attention], dtype=torch.long) all_input_maskLM = torch.tensor( [input_maskLM for input_maskLM in input_maskLM], dtype=torch.long) full_dataset = TensorDataset(all_input_id, all_input_pos, all_input_attention, all_input_maskLM) return full_dataset # load model token_type_size = 13 config = BertConfig.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') model.bert.embeddings.token_type_embeddings = nn.Embedding( token_type_size, config.hidden_size) nlp = spacy.load( "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1") semantics_list = [ "about such concepts as absurdity it knew nothing .", ] ori_syntactic_list = ["he had no idea about such terms ."] syntactic_list_in_dict, part_maskLM_embedding_list_in_dict = data_preprocess.extrapolate_syntactic( ori_syntactic_list, nlp) token_embedding_id_list, segment_embedding_list, attention_embedding_list, maskLM_embedding_list = data_preprocess.get_embedding( semantics_list, ori_syntactic_list, syntactic_list_in_dict, part_maskLM_embedding_list_in_dict) pos_embedding_list = data_preprocess_pos.get_pos_embedding(
def download_bert_model():
    """Return the masked-LM model obtained from ``from_pretrained("bert-base-cased")``."""
    model_name = "bert-base-cased"
    masked_lm = BertForMaskedLM.from_pretrained(model_name)
    return masked_lm
def __init__(self, model_name_or_path: str):
    """Initialise the parent class and load the masked-LM backbone.

    Args:
        model_name_or_path: identifier or path handed unchanged to
            ``BertForMaskedLM.from_pretrained``.
    """
    super(BertPretrain, self).__init__()
    masked_lm = BertForMaskedLM.from_pretrained(model_name_or_path)
    self.bert_model = masked_lm
def __init__(self, args):
    """Set up the 'bert-base-uncased' masked LM, its tokenizer and the NT-Xent loss.

    Args:
        args: configuration object; ``args.temperature`` is read here for
            the loss, and the object itself is stored as ``self.args``.
    """
    super().__init__()
    pretrained_name = 'bert-base-uncased'
    self.args = args
    self.model = BertForMaskedLM.from_pretrained(pretrained_name)
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    self.ntxloss = NTXentLoss(temperature=args.temperature)
df_lineage = pd.read_csv(lineage[1], sep='\t', names=["Repo"])[:20000] print("CSVs loaded") docstrings_avg_vec = np.load(docstrings_vecs[1], allow_pickle=True) config = BertConfig.from_json_file(model_path[1] + '/config.json') config.output_hidden_states = True print("Tokenizer and model initialized") tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') device = torch.device('cpu') model = BertForMaskedLM.from_pretrained("bert-base-uncased", config=config) model.load_state_dict( torch.load(model_path[1] + "/pytorch_model.bin", map_location=device)) model.eval() # Initialize a new index, using a HNSW index on Cosine Similarity index = nmslib.init(method='hnsw', space='cosinesimil') index.addDataPointBatch(docstrings_avg_vec) index.createIndex({'post': 2}, print_progress=True) print("Index made") # Routes: @app.route('/hello') def hello_world():
def __init__(self, args):
    """Set up the 'bert-base-uncased' masked LM, its tokenizer and a BCE-with-logits loss.

    Args:
        args: configuration object, stored unchanged as ``self.args``.
    """
    super().__init__()
    pretrained_name = 'bert-base-uncased'
    self.args = args
    self.model = BertForMaskedLM.from_pretrained(pretrained_name)
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    self.bceloss = nn.BCEWithLogitsLoss()
def __init__(self,
             mlm_path: str = "bert-base-uncased",
             k: int = 50,
             threshold_pred_score: float = 0.3,
             max_length: int = 512,
             batch_size: int = 32,
             replace_rate: float = 1.0,
             insert_rate: float = 0.0,
             device: Optional[torch.device] = None,
             sentence_encoder=None,
             filter_words: Optional[List[str]] = None):
    """
    BAE: BERT-based Adversarial Examples for Text Classification. Siddhant Garg, Goutham Ramakrishnan. EMNLP 2020.
    `[pdf] <https://arxiv.org/abs/2004.01970>`__
    `[code] <https://github.com/QData/TextAttack/blob/master/textattack/attack_recipes/bae_garg_2019.py>`__

    This script is adapted from <https://github.com/LinyangLee/BERT-Attack> given the high similarity between the two attack methods.

    This attacker supports the 4 attack methods (BAE-R, BAE-I, BAE-R/I, BAE-R+I) in the paper.

    Args:
        mlm_path: The path to the masked language model. **Default:** 'bert-base-uncased'
        k: The k most important words / sub-words to substitute for. **Default:** 50
        threshold_pred_score: Threshold used in substitute module. **Default:** 0.3
        max_length: The maximum length of an input sentence for bert. **Default:** 512
        batch_size: The size of a batch of input sentences for bert. **Default:** 32
        replace_rate: Replace rate. **Default:** 1.0
        insert_rate: Insert rate. **Default:** 0.0
        device: A computing device for bert. When omitted, "cuda:0" is used if CUDA is available, otherwise "cpu".
        sentence_encoder: A sentence encoder to calculate the semantic similarity of two sentences. Default: :py:class:`.UniversalSentenceEncoder`
        filter_words: A list of words that will be preserved in the attack procedure.

    Raises:
        NotImplementedError: if ``replace_rate`` / ``insert_rate`` is not one of the
            supported combinations: (1, 0), (0, 1), rates summing to 1, or (1, 1).

    :Data Requirements: :py:data:`.TProcess.NLTKPerceptronPosTagger`
    :Classifier Capacity:
        * get_pred
        * get_prob
    :Language: english
    """
    # Resolve the substitution mode first so that an unsupported rate
    # combination fails before any encoder or model is loaded.
    if replace_rate == 1.0 and insert_rate == 0.0:
        sub_mode = 0  # only using replacement
    elif replace_rate == 0.0 and insert_rate == 1.0:
        sub_mode = 1  # only using insertion
    elif replace_rate + insert_rate == 1.0:
        sub_mode = 2  # replacement OR insertion for each token / subword
    elif replace_rate == 1.0 and insert_rate == 1.0:
        sub_mode = 3  # first replacement AND then insertion for each token / subword
    else:
        raise NotImplementedError(
            "Unsupported rate combination: replace_rate={}, insert_rate={}".format(
                replace_rate, insert_rate))

    if sentence_encoder is None:
        self.encoder = UniversalSentenceEncoder()
    else:
        self.encoder = sentence_encoder

    self.tokenizer_mlm = BertTokenizerFast.from_pretrained(
        mlm_path, do_lower_case=True)
    if device is not None:
        self.device = device
    else:
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
    config_atk = BertConfig.from_pretrained(mlm_path)
    self.mlm_model = BertForMaskedLM.from_pretrained(
        mlm_path, config=config_atk).to(self.device)

    self.k = k
    self.threshold_pred_score = threshold_pred_score
    self.max_length = max_length
    self.batch_size = batch_size

    self.replace_rate = replace_rate
    self.insert_rate = insert_rate
    self.sub_mode = sub_mode

    self.__lang_tag = TAG_English
    if filter_words is None:
        filter_words = get_default_filter_words(self.__lang_tag)
    self.filter_words = set(filter_words)
    check_language([self.encoder], self.__lang_tag)
def _load_pretrained(model_class, model_name):
    """Load ``model_class`` via ``from_pretrained(model_name)``, printing the elapsed time."""
    print('Start loading a model...')
    start_time = time.time()
    model = model_class.from_pretrained(model_name)
    print('End loading a model: {} secs.'.format(time.time() - start_time))
    return model


def _timed_inference(model, *args, **kwargs):
    """Put ``model`` in eval mode and run one forward pass without gradients, printing the elapsed time."""
    print('Start inferring...')
    start_time = time.time()
    model.eval()
    with torch.no_grad():
        outputs = model(*args, **kwargs)
    print('End inferring: {} secs.'.format(time.time() - start_time))
    return outputs


def bert_example():
    """Load each BERT head class in turn and run one timed forward pass.

    For every model class the function prints load and inference timings,
    unpacks the leading outputs to show their layout, and prints the class
    name once done.  The question-answering section additionally asserts
    that the extracted answer span is 'a nice puppet'.
    """
    # NOTE [info] >> Refer to example codes in the comment of forward() of each BERT class in https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py

    pretrained_model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

    input_ids = torch.tensor(tokenizer.encode('Hello, my dog is cute', add_special_tokens=True)).unsqueeze(0)  # Batch size 1.

    # The bare Bert Model transformer outputting raw hidden-states without any specific head on top.
    model = _load_pretrained(BertModel, pretrained_model_name)
    outputs = _timed_inference(model, input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple.
    print('{} processed.'.format(BertModel.__name__))

    # Bert Model with two heads on top as done during the pre-training: a 'masked language modeling' head and a 'next sentence prediction (classification)' head.
    model = _load_pretrained(BertForPreTraining, pretrained_model_name)
    outputs = _timed_inference(model, input_ids)
    prediction_scores, seq_relationship_scores = outputs[:2]
    print('{} processed.'.format(BertForPreTraining.__name__))

    # Bert Model with a 'language modeling' head on top.
    model = _load_pretrained(BertForMaskedLM, pretrained_model_name)
    # NOTE(review): `masked_lm_labels` is the keyword this example was written
    # against; later transformers releases are understood to use `labels`
    # instead - confirm against the installed version.
    outputs = _timed_inference(model, input_ids, masked_lm_labels=input_ids)
    loss, prediction_scores = outputs[:2]
    print('{} processed.'.format(BertForMaskedLM.__name__))

    # Bert Model with a 'next sentence prediction (classification)' head on top.
    model = _load_pretrained(BertForNextSentencePrediction, pretrained_model_name)
    outputs = _timed_inference(model, input_ids)
    seq_relationship_scores = outputs[0]
    print('{} processed.'.format(BertForNextSentencePrediction.__name__))

    # Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks.
    model = _load_pretrained(BertForSequenceClassification, pretrained_model_name)
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1.
    outputs = _timed_inference(model, input_ids, labels=labels)
    loss, logits = outputs[:2]
    print('{} processed.'.format(BertForSequenceClassification.__name__))

    # Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    model = _load_pretrained(BertForMultipleChoice, pretrained_model_name)
    choices = ['Hello, my dog is cute', 'Hello, my cat is amazing']
    input_ids0 = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices.
    labels = torch.tensor(1).unsqueeze(0)  # Batch size 1.
    outputs = _timed_inference(model, input_ids0, labels=labels)
    loss, classification_scores = outputs[:2]
    print('{} processed.'.format(BertForMultipleChoice.__name__))

    # Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
    model = _load_pretrained(BertForTokenClassification, pretrained_model_name)
    labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1.
    outputs = _timed_inference(model, input_ids, labels=labels)
    loss, scores = outputs[:2]
    print('{} processed.'.format(BertForTokenClassification.__name__))

    # Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute 'span start logits' and 'span end logits').
    model = _load_pretrained(BertForQuestionAnswering, 'bert-large-uncased-whole-word-masking-finetuned-squad')
    question, text = 'Who was Jim Henson?', 'Jim Henson was a nice puppet'
    encoding = tokenizer.encode_plus(question, text)
    input_ids0, token_type_ids = encoding['input_ids'], encoding['token_type_ids']
    outputs = _timed_inference(model, torch.tensor([input_ids0]), token_type_ids=torch.tensor([token_type_ids]))
    # Index the output like every other section does, instead of unpacking the
    # call result directly; slicing works whether the model returns a plain
    # tuple or an output object.
    start_scores, end_scores = outputs[:2]

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids0)
    answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
    assert answer == 'a nice puppet'
    print('{} processed.'.format(BertForQuestionAnswering.__name__))
def main():
    """Rank BERT-predicted substitutes for each target word of the evaluation set.

    Parses the command line with ``setup_parser()``, selects the torch device,
    seeds the RNGs, loads the tokenizer and ``BertForMaskedLM`` weights named by
    ``args.bert_model`` plus the embedding, word-frequency and stop-word
    resources, then (on the main process only) masks the target word of every
    evaluation sentence, takes the 40 highest-scoring predictions at the masked
    position, filters and ranks them, and hands one result row per sentence to
    ``save_output``.

    Raises:
        ValueError: if ``--do_eval`` was not given.
    """
    parser = setup_parser()
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("At least `do_eval` must be True.")

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    # Prepare model. Load pre-trained model weights.
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=cache_dir)
    if args.fp16:
        model.half()
    model.to(device)

    # Load fastText word embeddings
    print("Loading embeddings ...")
    fasttext_dico, fasttext_emb = getWordmap(args.word_embeddings)

    # Load word frequency
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point --word_frequency at a trusted file.
    with open(args.word_frequency, 'rb') as f:
        word_count = pickle.load(f)

    stopword_list1 = set(stopwords.words('english'))
    with open(args.stopwords, "r") as f:
        # NOTE(review): eval() runs whatever the stop-word file contains. If
        # the file only holds a Python literal, ast.literal_eval would be the
        # safe replacement -- confirm the file format before switching.
        stopword_list2 = set(eval(f.read()))
    stopword_list = stopword_list1.union(stopword_list2)

    ps = PorterStemmer()

    num_selection = args.num_selections
    window_context = 11

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Read dataset
        origin, sent_id, eval_examples, mask_words, CHV_selections, CHV_substitutions, CHV_sim_scores = read_df(
            args.eval_dir)
        print(sent_id)
        print(eval_examples)

        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))

        # Put the model in "evaluation" mode, meaning feed-forward operation.
        model.eval()

        eval_size = len(eval_examples)
        for i in tqdm(range(eval_size)):
            # One result row is collected and saved per sentence.
            substitution_df = []
            print(f'sentence {sent_id[i]}:\n{eval_examples[i]}')

            # NOTE(review): seq_length is fixed at 128 here while the feature
            # builders below use args.max_seq_length -- confirm this is intended.
            tokens, words, position = convert_sentence_to_token(
                sentence=eval_examples[i], tokenizer=tokenizer, seq_length=128)
            print("tokens: ", tokens)
            print("words: ", words)
            print("position: ", position)
            assert len(words) == len(position)

            # Locate the target word; fall back to its possessive form before
            # giving up on the sentence.
            target = mask_words[i].lower()  # use lower case if do_lower_case == True
            try:
                mask_index = words.index(target)
            except ValueError:
                print(f'"{mask_words[i]}" is not in list of words')
                try:
                    mask_index = words.index(target + "'s")
                except ValueError:
                    print(f'"{mask_words[i]}" + "\'s" is also not in list of words\nThis sentence will be skipped.\n')
                    continue

            mask_position = position[mask_index]
            mask_context = extract_context(words, mask_index, window_context)

            # A list position means the word spans several word pieces.
            if isinstance(mask_position, list):
                feature = convert_whole_word_to_feature(tokens_a=tokens, mask_position=mask_position,
                                                        seq_length=args.max_seq_length, tokenizer=tokenizer)
            else:
                feature = convert_token_to_feature(tokens_a=tokens, mask_position=mask_position,
                                                   seq_length=args.max_seq_length, tokenizer=tokenizer)
            print("feature.tokens: ", feature.tokens)

            # Convert inputs to PyTorch tensors on the same device as the model
            # (previously hard-coded to 'cuda', which broke CPU / --no_cuda runs).
            tokens_tensor = torch.tensor([feature.input_ids]).to(device)
            segments_tensor = torch.tensor([feature.input_type_ids]).to(device)
            attention_mask = torch.tensor([feature.input_mask]).to(device)

            # Predict all tokens
            with torch.no_grad():
                output = model(tokens_tensor, token_type_ids=segments_tensor, attention_mask=attention_mask)
            prediction_scores = output[0]

            # For a multi-piece word only the first piece's position is scored.
            if isinstance(mask_position, list):
                predicted_top = prediction_scores[0, mask_position[0]].topk(40)
            else:
                predicted_top = prediction_scores[0, mask_position].topk(40)

            pre_tokens = tokenizer.convert_ids_to_tokens(predicted_top[1].cpu().numpy())
            print("pre_tokens: ", pre_tokens)
            pre_prob_values = predicted_top[0].cpu().numpy()
            print("pre_prob_values: ", pre_prob_values)

            ss = substitution_selection(source_word=mask_words[i], pre_tokens=pre_tokens,
                                        pre_scores=pre_prob_values, stopwords=stopword_list,
                                        ps=ps, num_selection=num_selection)

            pre_word = substitution_ranking(source_word=mask_words[i], source_context=mask_context,
                                            substitution_selection=ss, fasttext_dico=fasttext_dico,
                                            fasttext_emb=fasttext_emb, word_count=word_count,
                                            tokenizer=tokenizer, maskedLM=model)
            MLM_sim_score = fuzzy_match(mask_words[i], pre_word)

            print('---------------------------------------')
            print("Sentence: ", eval_examples[i])
            print("Source word: ", mask_words[i])
            print("Substitution selection: ", ss)
            print("Model substitution: ", pre_word)
            print("Model sim score: ", MLM_sim_score)
            print("CHV substitution: ", CHV_substitutions[i])
            print("CHV sim score: ", CHV_sim_scores[i])
            print(" ")

            substitution_df.append(OrderedDict({"origin": origin[i],
                                                "sent_id": sent_id[i],
                                                "sentence": eval_examples[i],
                                                "source_term": mask_words[i],
                                                "CHV_selection": CHV_selections[i],
                                                "CHV_substitution": CHV_substitutions[i],
                                                "CHV_sim_score": CHV_sim_scores[i],
                                                "MLM_selection": ss,
                                                "MLM_substitution": pre_word,
                                                "MLM_sim_score": MLM_sim_score
                                                }))
            save_output(args.output_path, substitution_df)
# Replace one token of the already-tokenized sentence with the mask token and
# print the candidates the masked-LM proposes for that slot.
# `tokenizer` and `tokenized_text` are defined earlier in the script.
masked_index = 1
tokenized_text[masked_index] = '[MASK]'
# masked_index = 12
# tokenized_text[masked_index] = '[SEP]'
print(tokenized_text)
# ['テレビ', 'で', '[MASK]', 'の', '試合', 'を', '見る', '。']
# NOTE(review): the sample output above shows '[MASK]' at index 2, while
# masked_index is 1 -- presumably recorded from a different run; verify.

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# [571, 12, 4, 5, 608, 11, 2867, 8]

# Convert inputs to PyTorch tensors (the extra list adds a batch dimension of 1)
tokens_tensor = torch.tensor([indexed_tokens])
# tensor([[ 571, 12, 4, 5, 608, 11, 2867, 8]])

# Load pre-trained model
model = BertForMaskedLM.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking')
model.eval()

# Predict
with torch.no_grad():
    outputs = model(tokens_tensor)
    # Scores at the masked position of the single batch item; keep the 10 best.
    predictions = outputs[0][0, masked_index].topk(10)  # extract the top 10 predictions

# Show results: rank and token string for each of the top-k vocabulary ids
for i, index_t in enumerate(predictions.indices):
    index = index_t.item()
    token = tokenizer.convert_ids_to_tokens([index])[0]
    print(i, token)
# Module-level initialisation of the masked-LM service: read the model
# location from the environment, load tokenizer and model once at import time,
# then create the Flask app and its health-check endpoint.
PRETRAINED_MODEL_NAME_OR_PATH = os.environ.get("PRETRAINED_MODEL_NAME_OR_PATH")
# NOTE(review): os.environ.get returns None when the variable is unset; the
# from_pretrained calls below would then receive None -- confirm the
# deployment always sets it.
logging.info(f"PRETRAINED_MODEL_NAME_OR_PATH = {PRETRAINED_MODEL_NAME_OR_PATH}")
# presumably the vocabulary id of the [MASK] token -- verify against tokenizer.mask_token_id
MASK_ID = 103

try:
    cuda = torch.cuda.is_available()
    if cuda:
        torch.cuda.set_device(0)  # single GPU
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    logger.info(f"masked_lm is set to run on {device}")

    # init model
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
    model = BertForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
    model.eval()
    if cuda:
        model.cuda()

    logger.info("masked_lm model is ready")
except Exception as e:
    # Report the start-up failure to Sentry and the log, then re-raise so the
    # module import fails.
    sentry_sdk.capture_exception(e)
    logger.exception(e)
    raise e

app = Flask(__name__)
health = HealthCheck(app, "/healthcheck")
# Only show warnings and above from werkzeug's logger.
logging.getLogger("werkzeug").setLevel("WARNING")
import torch from transformers import BertTokenizer, BertModel, BertForMaskedLM import logging logging.basicConfig(level=logging.INFO) # OPTIONAL tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased') model.eval() # model.to('cuda') # if you have gpu #https://stackoverflow.com/questions/54978443/predicting-missing-words-in-a-sentence-natural-language-processing-model def predict_masked_sent(text, top_k=5): # Tokenize input text = "[CLS] %s [SEP]" % text tokenized_text = tokenizer.tokenize(text) #print(tokenizer.lang2id) masked_index = tokenized_text.index("[MASK]") indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) tokens_tensor = torch.tensor([indexed_tokens]) # tokens_tensor = tokens_tensor.to('cuda') # if you have gpu # Predict all tokens with torch.no_grad(): outputs = model(tokens_tensor) predictions = outputs[0] probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1) top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)
def _str2bool(value):
    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``no``.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value -- including ``"False"`` -- becomes ``True``. This parser
    maps the usual spellings explicitly and rejects everything else.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "on", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got {!r}.".format(value))


def main():
    """Fine-tune and/or evaluate the encoder-decoder (``Model2Model``) model.

    Parses the command line, refuses to train into a non-empty output
    directory unless ``--do_overwrite_output_dir`` is set, picks the device,
    builds the tokenizer and model, and then optionally trains (saving model,
    tokenizer and arguments to ``--output_dir``) and optionally evaluates.

    Returns:
        ``{}`` when evaluation is not requested, otherwise the string
        ``"placeholder"`` (``evaluate``'s own return value is not captured).

    Raises:
        ValueError: if training is requested and the output directory already
            exists, is not empty, and overwriting was not enabled.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input training data file (a text file).",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Optional parameters
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    # Boolean flags take an explicit value (e.g. `--do_train True`); they are
    # parsed with _str2bool so that `False` really means False.
    parser.add_argument(
        "--do_evaluate",
        type=_str2bool,
        default=False,
        help="Run model evaluation on out-of-sample data.",
    )
    parser.add_argument("--do_train", type=_str2bool, default=False, help="Run training.")
    parser.add_argument(
        "--do_overwrite_output_dir",
        type=_str2bool,
        default=False,
        help="Whether to overwrite the output dir.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint to initialize the encoder and decoder's weights with.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        help="The decoder architecture to be fine-tuned.",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--to_cpu", default=False, type=_str2bool, help="Whether to force training on CPU."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for eval.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--input_block_size",
        default=256,
        type=int,
        help="Max seq length for input",
    )
    parser.add_argument(
        "--output_block_size",
        default=64,
        type=int,
        help="Max seq length for output",
    )
    parser.add_argument(
        "--trained_checkpoints",
        default="",
        type=str,
        help="trained_checkpoints",
    )
    parser.add_argument(
        "--decoding_type",
        default="pnt",
        type=str,
        help="",
    )
    parser.add_argument(
        "--encoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )
    parser.add_argument(
        "--decoder_lr",
        default=5e-4,
        type=float,
        help="decoder's learning rate",
    )
    parser.add_argument(
        "--encoder_warmup",
        default=10,
        type=int,
        help="encoder's warmup",
    )
    parser.add_argument(
        "--decoder_warmup",
        default=100,
        type=int,
        help="decoder's warmup",
    )
    parser.add_argument("--seed", default=42, type=int)
    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.do_overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
                args.output_dir
            )
        )

    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
    print(args.n_gpu)

    # Load pretrained model and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    # NOTE(review): the decoder weights come from a hard-coded absolute path;
    # consider exposing it as a command-line option.
    decoder_model = BertForMaskedLM.from_pretrained(r'/data/zhuoyu/semantic_parsing/models')
    model = Model2Model.from_pretrained(
        args.model_name_or_path, decoder_model=decoder_model
    )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
        args.n_gpu,
        False,
        False,
    )
    logger.info("Training/evaluation parameters %s", args)

    # Train the model
    model.to(args.device)
    if args.do_train:
        global_step, tr_loss = train(args, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))

    # Evaluate the model
    results = {}
    if args.do_evaluate:
        checkpoints = [args.trained_checkpoints]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        # NOTE(review): the checkpoint path is only logged; nothing is loaded
        # from it here, so the in-memory model is what gets evaluated.
        for _ in checkpoints:
            results = "placeholder"
            evaluate(args, model, tokenizer, "test")

    return results
# Parse the command line, rebuild the requested masked-LM from a local
# config/vocabulary and a step-specific weight file, move it to the available
# device, and prepare one zeroed counter pair per POS label.
args = parser.parse_args()
print("Reconstruction. step = ", args.step)

if args.type_of_model == 'albert':
    tokenizer = AlbertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
    config = AlbertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
    config.output_hidden_states = True
    # map_location="cpu" lets weights saved on a GPU load on a CPU-only host;
    # the model is moved to the target device below.
    model = AlbertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=torch.load(
            os.path.join(args.path_to_pytorch_models, args.type_of_model,
                         'pytorch_model_' + args.step + '.bin'),
            map_location="cpu"))
elif args.type_of_model == 'bert':
    tokenizer = BertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
    config = BertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
    config.output_hidden_states = True
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=torch.load(
            os.path.join(args.path_to_pytorch_models, args.type_of_model,
                         'pytorch_model_' + args.step + '.bin'),
            map_location="cpu"))
else:
    raise NotImplementedError("The given model type %s is not supported" % args.type_of_model)

# The function must be *called*: the bare attribute `torch.cuda.is_available`
# is always truthy, which selected 'cuda' even on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.eval().to(device)

tag = {}
with open(os.path.join(args.data, 'ontonotes/const/pos/labels.txt')) as f:
    # One label per line; reading stops at the first empty line or at EOF.
    while True:
        pos = f.readline().rstrip()
        if pos == "":
            break
        tag[pos] = np.asarray([0, 0])
def _get_masked_language_model(self):
    """
    Loads the BertForMaskedLM weights named by ``self.model`` into ``self.mlm``
    and switches the loaded model to evaluation mode
    """
    masked_lm = BertForMaskedLM.from_pretrained(self.model)
    self.mlm = masked_lm
    masked_lm.eval()
def main():
    """Fine-tune a partially frozen BERT masked-LM as configured in ``bert_finetune.yaml``.

    Prints the loaded settings, exits when no GPU is visible to torch, builds
    the model, tokenizer, line-by-line train/validation datasets and MLM
    collator, then trains with ``Trainer`` and saves the result to the
    configured ``save_model_directory``.
    """
    # read the run configuration from the yaml file
    with open("bert_finetune.yaml") as config_file:
        args = yaml.load(config_file, Loader=yaml.FullLoader)

    print("Printing Arguments...")
    for name, value in args.items():
        print("- {}: {}".format(name, value))

    print("\nDevice used...")
    if not torch.cuda.is_available():
        print("GPU not seen by Torch, please check again. Exiting for now...")
        exit()
    print("Using GPU")

    # build the model
    print("\nLoading the model...")
    model = BertForMaskedLM.from_pretrained(args["model_name"])

    # indices 0..20 are handed to freeze_bert_fn (freeze all but three)
    frozen_layers = list(range(21))
    model = freeze_bert_fn(model, frozen_layers)
    print(model.config)
    print("Number of parameters: " + str(model.num_parameters()))

    # build the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args["model_name"])

    # build the datasets and the masking collator, timing the whole step
    print("\nLoading the dataset")
    started = time.time()
    dataset, val_dataset = (
        LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=args[path_key],
            block_size=512,
        )
        for path_key in ("train_data_file", "val_data_file")
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=args["mlm_probability"])
    print("Time taken: " + str(time.time() - started))

    # training
    batch_size = args["batch_size"]
    save_dir = args["save_model_directory"]
    training_args = TrainingArguments(
        output_dir=save_dir,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        logging_steps=args["logging_steps"],
        evaluation_strategy="steps",
        eval_steps=args["eval_steps"],
        num_train_epochs=args["num_train_epochs"],
        per_gpu_train_batch_size=batch_size,
        per_gpu_eval_batch_size=batch_size,
        save_steps=args["save_steps"],
        disable_tqdm=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
    trainer.save_model(save_dir)
import torch from transformers import BertForMaskedLM, BertTokenizer from torch.nn import CrossEntropyLoss import math from sentence_transformers import SentenceTransformer from transformers import XLNetModel, XLNetTokenizer import bert_preprocess import numpy as np st_model = SentenceTransformer('bert-base-nli-mean-tokens') bertmodelname = 'bert-large-uncased-whole-word-masking' tokenizer = BertTokenizer.from_pretrained(bertmodelname) model = BertForMaskedLM.from_pretrained(bertmodelname) """ bertmodelname = 'bert-large-uncased-whole-word-masking' tokenizer = BertTokenizer.from_pretrained(bertmodelname) bertsavedmodelname = "pytorch_model3.bin" model_state_dict = torch.load(bertsavedmodelname) model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path=bertmodelname, state_dict=model_state_dict) """ #text = '[CLS] I want to [MASK] the car because it is cheap . [SEP]' def predict_missing(text):
def main():
    """
    Modify substitute probabilities based on lexical similarity with target.

    Loads the pickled substitute lists (``--subs_path``) and the target word
    forms (``--targets_path``), makes sure every target form has a single
    vocabulary entry (adding tokens and resizing the model embeddings where
    needed), then runs every substitute sentence through ``BertForMaskedLM``
    and stores, per target and occurrence, the candidate token, its log
    probability, and the dot product between the target embedding and the
    last-layer hidden state at the candidate position. The result is pickled
    to ``--output_path``.
    """
    parser = argparse.ArgumentParser(
        description='Modify substitute probabilities based on lexical similarity with target.')
    parser.add_argument(
        '--model_name', type=str, required=True,
        help='HuggingFace model name or path')
    parser.add_argument(
        '--subs_path', type=str, required=True,
        help='Path to the pickle file containing substitute lists (output by substitutes.py).')
    parser.add_argument(
        '--targets_path', type=str, required=True,
        help='Path to the csv file containing target word forms.')
    parser.add_argument(
        '--output_path', type=str, required=True,
        help='Output path for pickle containing substitutes with lexical similarity values.')
    parser.add_argument(
        '--batch_size', type=int, default=64,
        help='The batch size per device (GPU core / CPU).')
    parser.add_argument(
        '--ignore_decoder_bias', action='store_true',
        help="Whether to ignore the decoder's bias vector during masked word prediction")
    parser.add_argument(
        '--normalise_embeddings', action='store_true',
        help="Whether to L2-normalise the embeddings before computing their dot products")
    parser.add_argument(
        '--local_rank', type=int, default=-1,
        help='For distributed training.')
    args = parser.parse_args()

    # Setup logging. This is the only basicConfig call: a second call is a
    # no-op once the root logger has handlers, so an earlier bare call would
    # silently discard the level and format requested here.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logging.info(__file__.upper())
    start_time = time.time()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        n_gpu = 1

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank,
        device,
        n_gpu,
        bool(args.local_rank != -1)
    )

    # Set seeds across modules
    set_seed(42, n_gpu)

    # Load target forms: every comma-separated field after the first one.
    target_forms = []
    with open(args.targets_path, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            forms = line.strip().split(',')[1:]
            target_forms.extend(forms)

    print('=' * 80)
    print('targets:', target_forms)
    print('=' * 80)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    # Load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name, never_split=target_forms, use_fast=False)
    model = BertForMaskedLM.from_pretrained(args.model_name, output_hidden_states=True)

    if args.ignore_decoder_bias:
        logger.warning('Ignoring bias vector for masked word prediction.')
        model.cls.predictions.decoder.bias = None

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    # Store vocabulary indices of target words
    targets_ids = [tokenizer.encode(t, add_special_tokens=False) for t in target_forms]
    assert len(target_forms) == len(targets_ids)

    # Add every target that is split into several pieces, or maps to the
    # unknown token, as a new vocabulary entry.
    words_added = []
    for t, t_id in zip(target_forms, targets_ids):
        if tokenizer.do_lower_case:
            t = t.lower()
        if t in tokenizer.added_tokens_encoder:
            continue
        if len(t_id) > 1 or (len(t_id) == 1 and t_id[0] == tokenizer.unk_token_id):
            if tokenizer.add_tokens([t]):
                model.resize_token_embeddings(len(tokenizer))
                words_added.append(t)
            else:
                # Placeholders are required: extra positional arguments without
                # them make the logging module fail to format the record.
                logger.error('Word not properly added to tokenizer: %s %s', t, tokenizer.tokenize(t))

    # check if correctly added
    # NOTE(review): targets_ids was computed before add_tokens ran, so this
    # reports the pre-insertion encodings -- confirm that is what is wanted.
    for t, t_id in zip(target_forms, targets_ids):
        if len(t_id) != 1:
            print(t, t_id)
    logger.warning("\nTarget words added to the vocabulary: {}.\n".format(', '.join(words_added)))

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use substitute files produced by a trusted run.
    with open(args.subs_path, 'rb') as f_in:
        substitutes_raw = pickle.load(f_in)

    # Same target -> occurrence layout as the input, with empty result lists.
    substitutes_new = {
        w: [{'candidates': [], 'logp': [], 'dot_products': []} for _ in substitutes_raw[w]]
        for w in substitutes_raw
    }

    def collate(batch):
        """Stack tensor fields into batch tensors and gather the other fields into lists."""
        return [
            {'input_ids': torch.cat([item[0]['input_ids'].unsqueeze(0) for item in batch], dim=0),
             'attention_mask': torch.cat([item[0]['attention_mask'].unsqueeze(0) for item in batch], dim=0)},
            [item[1] for item in batch],  # target
            [item[2] for item in batch],  # occurrence_idx
            [item[3] for item in batch],  # candidate_token
            torch.cat([torch.as_tensor(item[4]).unsqueeze(0) for item in batch], dim=0),  # embedding
            [item[5] for item in batch],  # logp
            [item[6] for item in batch]  # position
        ]

    dataset = SubstitutesDataset(substitutes_raw, tokenizer, args.normalise_embeddings)
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate)
    iterator = tqdm(dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])

    # Inference only: switch to eval mode once instead of on every batch.
    model.eval()
    for batch in iterator:
        inputs, tgt, occurrence_idxs, candidate_tokens, tgt_embedding, logps, positions = batch

        inputs['input_ids'] = inputs['input_ids'].to(device)
        inputs['attention_mask'] = inputs['attention_mask'].to(device)
        tgt_embedding = tgt_embedding.to(device)
        bsz = inputs['input_ids'].shape[0]

        with torch.no_grad():
            outputs = model(**inputs)  # n_sentences, max_sent_len, vocab_size

            # outputs[1] holds the per-layer hidden states (output_hidden_states=True);
            # pick the last layer at each item's candidate position.
            hidden_states = outputs[1]
            last_layer = hidden_states[-1][np.arange(bsz), positions, :]  # (bsz, hdims)
            if args.normalise_embeddings:
                last_layer = normalize(last_layer, p=2)

            dot_products = torch.sum(tgt_embedding * last_layer, dim=1)  # (bsz)

        if args.normalise_embeddings:
            assert all([d <= 1.01 for d in dot_products]), 'Dot product should not exceed 1 if vectors are normalised.'

        for b_id in range(bsz):
            entry = substitutes_new[tgt[b_id]][occurrence_idxs[b_id]]
            entry['candidates'].append(candidate_tokens[b_id])
            entry['logp'].append(logps[b_id])
            entry['dot_products'].append(dot_products[b_id].item())

    iterator.close()

    with open(args.output_path, 'wb') as f_out:
        pickle.dump(substitutes_new, f_out)

    logger.warning("--- %s seconds ---" % (time.time() - start_time))
# model.save_pretrained('./ernie-1.0') # tokenizer.save_pretrained('./ernie-1.0') # tf_model.save_pretrained("./ernie-1.0") import torch from transformers import BertTokenizer, BertForMaskedLM tokenizer = BertTokenizer.from_pretrained('nghuyong/ernie-1.0') input_tx = "[CLS] [MASK] [MASK] [MASK] 是黑龙江的省会城市[SEP]" tokenized_text = tokenizer.tokenize(input_tx) indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) tokens_tensor = torch.tensor([indexed_tokens]) segments_tensors = torch.tensor([[0] * len(tokenized_text)]) model = BertForMaskedLM.from_pretrained('nghuyong/ernie-1.0') model.eval() with torch.no_grad(): outputs = model(tokens_tensor, token_type_ids=segments_tensors) predictions = outputs[0] predicted_index = [ torch.argmax(predictions[0, i]).item() for i in range(0, (len(tokenized_text) - 1)) ] predicted_token = [ tokenizer.convert_ids_to_tokens([predicted_index[x]])[0] for x in range(1, (len(tokenized_text) - 1)) ]
return input_padded, index_list, len(clean_text) def complete_missing_word(pred_binary, index_list, len_list): list_cwi_predictions = list(pred_binary[0][:len_list]) for i in index_list: list_cwi_predictions.insert(i, 0) return list_cwi_predictions # Second part: The Candidates generation and selection using BERT # Load the BERT model for masked languge bert_model = 'bert-large-uncased' tokenizer = BertTokenizer.from_pretrained(bert_model) model = BertForMaskedLM.from_pretrained(bert_model) model.eval() zipf_frequency('stop', 'en') zipf_frequency('thwart', 'en') # Now the function to get the candidates out of BERT (MLM): def get_bert_candidates(input_text, list_cwi_predictions, numb_predictions_displayed=10): list_candidates_bert = [] for word, pred in zip(input_text.split(), list_cwi_predictions): if (pred and (pos_tag([word])[0][1] in ['NNS', 'NN', 'VBP', 'RB', 'VBG', 'VBD']) ) or (zipf_frequency(word, 'en')) < 3.1:
def __init__(self):
    """Build the SciBERT masked-LM backbone (hidden states exposed) and a 0.1 dropout layer."""
    super(MaskedLM, self).__init__()
    scibert = BertForMaskedLM.from_pretrained(
        'allenai/scibert_scivocab_uncased',
        output_hidden_states=True,
    )
    self.bert_layer = scibert
    self.dropout = nn.Dropout(p=0.1)
def __init__(self, model_name_or_path: str = 'bert-base-cased') -> None:
    """Load the tokenizer and masked-LM for ``model_name_or_path`` and the English stop-word list.

    Args:
        model_name_or_path: checkpoint name or path passed unchanged to both
            ``BertTokenizer.from_pretrained`` and ``BertForMaskedLM.from_pretrained``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    masked_lm = BertForMaskedLM.from_pretrained(model_name_or_path)
    english_stopwords = stopwords.words('english')

    self._tokenizer: PreTrainedTokenizer = tokenizer
    self._model = masked_lm
    self._STOPWORDS: List[str] = english_stopwords
return res def get_pronounce_dist_between_sentences(sent1, sent2): pron1 = pronounce_sentence(sent1) pron2 = pronounce_sentence(sent2) return LevenshteinDist(pron1, pron2, 1, 1, confusion, 1)[-1][-1] if LOAD_MODEL: # or 'BertForMaskedLM' not in locals(): from transformers import BertForMaskedLM, BertTokenizer, BertTokenizerFast import torch, math device = torch.device('cuda') # bertMaskedLM = BertForMaskedLM.from_pretrained( # '/home/akiralll/PycharmProjects/bert_mlm/distilbert-base-uncased-train_wiki_articles_lm-train_youtube/') bertMaskedLM = BertForMaskedLM.from_pretrained('distilbert-base-uncased') tokenizer = BertTokenizerFast.from_pretrained('distilbert-base-uncased') bertMaskedLM.to(device) def make_features_for_candidate(orig_text: list, candidate_text: list, l_context_index, r_context_index, client_vocab=None): orig_text_str = ''.join(orig_text) candidate_text_str = ''.join(candidate_text) orig_text_pronounce = pronounce_sentence(orig_text) cand_text_pronounce = pronounce_sentence(candidate_text) features = [
def main():
    """Rank the substitute candidates of five selection methods for every evaluation row.

    Loads the Chinese BERT masked-LM, the HowNet dictionary, the word2vec
    vectors and the word-frequency table, reads the candidate lists produced
    by the BERT, vector, dict, HowNet and hybrid selectors, ranks each list
    with ``substitute_ranking`` (a list whose first entry is ``'NULL'`` is
    passed through as ``'NULL'``), and writes one result per method and row
    with ``save_result``.
    """
    model_cache = './model/bert-base-chinese'
    word_2_vector_model_dir = './model/merge_sgns_bigram_char300.txt'
    word_freq_path = './dict/modern_chinese_word_freq.txt'
    eval_file_path = './dataset/annotation_data.csv'
    substitution_num = 10

    # Candidate files (input) and ranking files (output), in matching order.
    # ernie: './data/ernie_output.csv' (currently unused)
    selection_paths = (
        './data/bert_ss_res.csv',
        './data/vector_ss_res.csv',
        './data/dict_ss_res.csv',
        './data/hownet_ss_res.csv',
        './data/hybrid_ss_res.csv',
    )
    ranking_paths = (
        './test/data/nochnum/bert_sr_res_no_chnum.csv',
        './test/data/nochnum/vector_sr_res_no_chnum.csv',
        './test/data/nochnum/dict_sr_res_no_chnum.csv',
        './test/data/nochnum/hownet_sr_res_no_chnum.csv',
        './test/data/nochnum/hybrid_sr_res_no_chnum.csv',
    )

    print('loading models...')
    tokenizer = BertTokenizer.from_pretrained(model_cache)
    model = BertForMaskedLM.from_pretrained(model_cache)
    # OpenHowNet.download()
    hownet = OpenHowNet.HowNetDict(use_sim=True)
    model.to('cuda')
    model.eval()

    print('loading embeddings...')
    model_word2vector = gensim.models.KeyedVectors.load_word2vec_format(
        word_2_vector_model_dir, binary=False)

    print('loading files...')
    word_freq_dict = read_dict(word_freq_path)
    bert_res, vector_res, dict_res, hownet_res, hybrid_res = (
        read_ss_result(path) for path in selection_paths)
    row_lines, source_sentences, source_words = read_dataset(eval_file_path)

    rows = zip(row_lines, source_sentences, source_words,
               bert_res, vector_res, dict_res, hownet_res, hybrid_res)
    for row_line, source_sentence, source_word, *candidate_lists in rows:
        # Running every method may take a long time; trim the path tuples
        # above to run only the methods you need.
        # All methods are ranked first and only then saved.
        ranked = []
        for subs in candidate_lists:
            if subs[0] == 'NULL':
                ranked.append(('NULL', ['NULL']))
                continue
            pre_word, ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, hownet,
                source_sentence, source_word, subs, word_freq_dict,
                substitution_num)
            ranked.append((pre_word, ss_sorted))

        for (pre_word, ss_sorted), out_path in zip(ranked, ranking_paths):
            save_result(row_line, pre_word, ss_sorted, out_path)