def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Instantiate a BertForMaskedLM from ``config``, run one forward
    pass, and check the prediction-score shape and the loss output."""
    mlm_model = BertForMaskedLM(config=config)
    mlm_model.eval()
    loss, prediction_scores = mlm_model(input_ids, token_type_ids, input_mask, token_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    # Scores must be one logit per vocab entry for every token position.
    expected_shape = [self.batch_size, self.seq_length, self.vocab_size]
    self.parent.assertListEqual(list(result["prediction_scores"].size()), expected_shape)
    self.check_loss_output(result)
def __init__(self, bert_path):
    """Load the Juman tokenizer, the BERT tokenizer, and the masked-LM model.

    Args:
        bert_path: directory holding the pre-trained BERT model and its
            ``vocab.txt``.
    """
    vocab_file_name = 'vocab.txt'
    # Juman segments Japanese text into words before BERT tokenization.
    self.juman_tokenizer = JumanTokenizer()
    # Tokenizer of the pre-trained BERT model (Japanese vocab: keep case,
    # skip basic tokenization).
    self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                        do_lower_case=False,
                                        do_basic_tokenize=False)
    self.vocab_size = len(self.bert_tokenizer.vocab)
    # Masked-LM head of the pre-trained BERT model.
    # BUGFIX: the reviewed version first loaded a plain BertModel into
    # self.model and immediately overwrote it here — a wasted full model
    # load, now removed.
    self.model = BertForMaskedLM.from_pretrained(bert_path)
    # Special/header tokens that must never be proposed as predictions.
    except_tokens = ["[MASK]",
                     #"[PAD]",
                     "[UNK]", "[CLS]", "[SEP]",
                     "(", ")", "・", "/", "、", "。", "!", "?", "「", "」",
                     "…", "’", "』", "『", ":", "※"]
    self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]
    # Every vocab id except the excluded ones is a valid candidate.
    except_id_set = set(self.except_ids)  # O(1) membership instead of O(n) list scans
    self.candidate_ids = [i for i in range(self.vocab_size) if i not in except_id_set]
def initialize_detector(self):
    """Load every resource the detector needs: the kenlm statistical
    language model, word/char frequency dicts, the custom confusion set,
    custom segmentation dictionaries, the word tokenizer, and the BERT
    masked-LM model.  Each stage is timed and logged at debug level.
    Sets ``self.initialized_detector`` to True when done."""
    t1 = time.time()
    try:
        import kenlm
    except ImportError:
        raise ImportError(
            'mypycorrector dependencies are not fully installed, '
            'they are required for statistical language model.'
            'Please use "pip install kenlm" to install it.'
            'if you are Win, Please install kenlm in cgwin.')
    self.lm = kenlm.Model(self.language_model_path)
    logger.debug('Loaded language model: %s, spend: %s s' %
                 (self.language_model_path, str(time.time() - t1)))
    # Word and character frequency dictionaries.
    t2 = time.time()
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    self.char_freq = self.load_char_freq_dict(self.char_freq_path)
    t3 = time.time()
    logger.debug(
        'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
        (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
    # User-defined confusion set (wrong form -> correct form).
    self.custom_confusion = self._get_custom_confusion_dict(
        self.custom_confusion_path)
    t4 = time.time()
    logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(
                     self.custom_confusion), str(t4 - t3)))
    # User-defined segmentation dictionaries.
    self.custom_word_freq = self.load_word_freq_dict(
        self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # Merge the segmentation dictionaries and the custom dictionary.
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    t5 = time.time()
    # NOTE(review): this log line reports custom_confusion_path but times
    # the custom *word* dictionaries — possibly a copy/paste slip; confirm.
    logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(
                     self.custom_word_freq), str(t5 - t4)))
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    # Pre-trained BERT masked-LM model and its tokenizer.
    t6 = time.time()
    self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
    self.MASK_TOKEN = "[MASK]"
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
        [self.MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - t6))
    self.initialized_detector = True
def __init__(self, model_path, tokenizer_path):
    """Wrap a pre-trained BERT masked-LM together with its tokenizer.

    Args:
        model_path: path/name of the pre-trained masked-LM weights.
        tokenizer_path: path/name of the tokenizer vocabulary.
    """
    super(Bert, self).__init__()
    self.model_path = model_path
    self.tokenizer_path = tokenizer_path
    # Tokenizer and model may come from different locations.
    self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    self.model = BertForMaskedLM.from_pretrained(model_path)
def main(args):
    """Augment each line of ``args.input`` by replacing sampled tokens.

    Fix over the reviewed version: an unknown ``--sampling_strategy`` or
    ``--augmentation_strategy`` previously left ``sampling_fn`` /
    ``generator_fn`` unbound and crashed later with NameError; both now
    raise a descriptive ValueError up front.
    """
    # Tokenizer over a fixed, pre-defined vocabulary.
    vocab = PreDefinedVocab(
        vocab_file=args.vocab_file,
        unk_token='[UNK]',
        sep_token='[SEP]',
        pad_token='[PAD]',
        mask_token='[MASK]',
        cls_token='[CLS]',
    )
    tokenizer = WordpieceTokenizer(vocab)
    # True when the generator operates on whole words rather than wordpieces.
    to_word = False

    # select a sampling module
    if args.sampling_strategy == 'random':
        sampling_fn = sampler.UniformSampler()
    else:
        raise ValueError('unknown sampling strategy: {}'.format(args.sampling_strategy))

    # select a augmentation module
    if args.augmentation_strategy == 'dropout':
        generator_fn = generator.DropoutGenerator()
    elif args.augmentation_strategy == 'blank':
        generator_fn = generator.BlankGenerator(
            mask_token=tokenizer.vocab.mask_token)
    elif args.augmentation_strategy == 'unigram':
        generator_fn = generator.UnigramGenerator(
            args.unigram_frequency_for_generation)
        to_word = True
    elif args.augmentation_strategy == 'bigramkn':
        generator_fn = generator.BigramKNGenerator(
            args.bigram_frequency_for_generation)
        to_word = True
    elif args.augmentation_strategy == 'wordnet':
        generator_fn = generator.WordNetGenerator(lang=args.lang_for_wordnet)
        to_word = True
    elif args.augmentation_strategy == 'word2vec':
        generator_fn = generator.Word2vecGenerator(args.w2v_file)
        to_word = True
    elif args.augmentation_strategy == 'ppdb':
        generator_fn = generator.PPDBGenerator(args.ppdb_file)
        to_word = True
    elif args.augmentation_strategy == 'bert':
        # Heavy import kept local so other strategies don't pay for it.
        from pytorch_transformers import BertTokenizer, BertForMaskedLM
        bert = BertForMaskedLM.from_pretrained(args.model_name_or_path)
        generator_fn = generator.BertGenerator(tokenizer, bert, args.temparature)
    else:
        raise ValueError('unknown augmentation strategy: {}'.format(args.augmentation_strategy))

    augmentor_fn = augmentor.ReplacingAugmentor(tokenizer, sampling_fn,
                                                generator_fn, to_word=to_word)

    with open(args.input, 'r') as f:
        for line in f:
            line = line.rstrip()
            augmented_line = augmentor_fn(line, args.augmentation_rate)
            print(augmented_line)
def prepare_models():
    """Load and return (tokenizer, attention-emitting BertModel, masked-LM),
    all from the 'bert-base-uncased' checkpoint, models in eval mode."""
    pretrained = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained)
    model = BertModel.from_pretrained(pretrained, output_attentions=True)
    mask_model = BertForMaskedLM.from_pretrained(pretrained)
    # Inference only: disable dropout etc. on both models.
    for net in (model, mask_model):
        net.eval()
    return tokenizer, model, mask_model
def __init__(self, model_path='bert-base-uncased', tokenizer_path=None, device='cuda'):
    """Load a BERT masked-LM and its tokenizer onto ``device``.

    NOTE: matching the original interface, ``tokenizer_path`` is accepted
    but unused — the tokenizer is loaded from ``model_path``.
    """
    super().__init__()
    self.model_path = model_path
    self.device = device
    self.tokenizer = BertTokenizer.from_pretrained(model_path)
    mlm = BertForMaskedLM.from_pretrained(model_path)
    mlm.to(device)
    mlm.eval()
    self.model = mlm
def initialize_bert_detector(self):
    """Load the BERT tokenizer and masked-LM model used for detection,
    record the [MASK] token id, and mark the detector as initialized."""
    start = time.time()
    self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
    self.MASK_TOKEN = "[MASK]"
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids([self.MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - start))
    self.initialized_bert_detector = True
def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
    """Load Juman and the pre-trained Japanese BERT masked-LM.

    Args:
        bert_path: directory with the pre-trained model and vocab file.
        vocab_file_name: vocabulary file name inside ``bert_path``.
        use_cuda: whether to run on a CUDA GPU.
    """
    # Juman segments Japanese text before BERT wordpiece tokenization.
    self.juman_tokenizer = JumanTokenizer()
    # Masked-LM head of the pre-trained BERT model.
    self.model = BertForMaskedLM.from_pretrained(bert_path)
    # Tokenizer of the pre-trained model; Japanese vocab, so keep case
    # and skip basic tokenization.
    self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                        do_lower_case=False,
                                        do_basic_tokenize=False)
    # CUDA-GPU usage flag.
    self.use_cuda = use_cuda
def init(maxlen=512):
    """Initialise the module-level BERT config/tokenizer/model (with
    hidden states enabled) and the web sentence-similarity model.

    Args:
        maxlen: maximum sequence length stored in global MAX_LENGTH.
    """
    global config, tokenizer, model, sim_model, MAX_LENGTH
    MAX_LENGTH = maxlen
    pretrained = 'bert-base-uncased'
    config = BertConfig.from_pretrained(pretrained)
    # Downstream code reads all hidden states, not just the last layer.
    config.output_hidden_states = True
    tokenizer = BertTokenizer.from_pretrained(pretrained)
    model = BertForMaskedLM.from_pretrained(pretrained, config=config)
    model.to(DEVICE)
    model.eval()
    sim_model = smodel.WebBertSimilarity(device=DEVICE)
def __init__(self, vocabulary, config):
    """Set up spaCy, the BERT masked-LM on GPU, and guessing settings.

    Args:
        vocabulary: vocabulary the guesser draws candidates from.
        config: dict with keys bertModel, numberOfGuesses, useSimilarity,
            useSynonyms.
    """
    spacy.prefer_gpu()
    self.spacyNlp = spacy.load("en_core_web_lg")
    # Minimum similarity for a guess to count as a match.
    self.simThreshold = 0.70
    # Settings driven by the config dict.
    self.modelName = config["bertModel"]
    self.guesses = config["numberOfGuesses"]
    self.useSimilarity = config["useSimilarity"]
    self.useSynonyms = config["useSynonyms"]
    # BERT masked-LM on GPU plus its tokenizer.
    self.bertModel = BertForMaskedLM.from_pretrained(self.modelName).cuda()
    self.bertTokenizer = BertTokenizer.from_pretrained(self.modelName)
    # Special tokens used when building masked inputs.
    self.maskingToken = "[MASK]"
    self.paddingToken = "[PAD]"
    self.startToken = "[CLS]"
    self.endToken = "[SEP]"
    self.vocabulary = vocabulary
def sample_predict_token():
    """Demo: mask one token of a two-sentence input and recover it with a
    pre-trained BERT masked LM; asserts the model predicts 'henson'."""
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    # Replace the second "henson" (position 8) with the mask token.
    masked_index = 8
    tokenized_text[masked_index] = '[MASK]'
    assert tokenized_text == [
        '[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]',
        'was', 'a', 'puppet', '##eer', '[SEP]'
    ]
    # Token ids plus segment ids marking sentence A (0) vs sentence B (1).
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()
    # If you have a GPU, put everything on cuda
    #tokens_tensor = tokens_tensor.to('cuda')
    #segments_tensors = segments_tensors.to('cuda')
    #model.to('cuda')
    # Predict all tokens without tracking gradients.
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]
    # The highest-scoring token at the masked position should be 'henson'.
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token)
    assert predicted_token == 'henson'
def __init__(self, model_path='bert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
    """Sampling-enabled wrapper around a BERT masked LM.

    Args:
        model_path: pre-trained model name or directory.
        temperature: softmax temperature for sampling.
        top_k: keep only the k most likely tokens (None = disabled).
        top_p: nucleus-sampling probability mass (None = disabled).
        device: torch device string.
    """
    super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
    self.model_path = model_path
    self.tokenizer = BertTokenizer.from_pretrained(model_path)
    mlm = BertForMaskedLM.from_pretrained(model_path)
    mlm.to(self.device)
    mlm.eval()
    self.model = mlm
def main():
    """Interactive loop: read a sentence from stdin and print the top-k
    masked-LM predictions for it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--bert-model', type=str,
                        default='bert-base-multilingual-uncased')
    args = parser.parse_args()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    model = BertForMaskedLM.from_pretrained(args.bert_model).cuda()
    model.eval()
    while True:
        with torch.no_grad():
            sentence = input('> ')
            bundle = SingleInputBundle([tokenizer.tokenize(sentence)], tokenizer.vocab)
            bundle.cuda()
            print(predict_top_k(model, tokenizer.vocab,
                                tokenizer.ids_to_tokens, bundle))
def __init__(self, model_directory, vocab_file, lower=False):
    """Load a BERT masked-LM (moved to GPU when available) and its
    tokenizer, and cache the ids of the special tokens.

    Args:
        model_directory: directory with the pre-trained weights.
        vocab_file: path to the vocabulary file.
        lower: whether the tokenizer lower-cases input.
    """
    # Load pre-trained model (weights)
    self.model = BertForMaskedLM.from_pretrained(model_directory)
    self.model.eval()
    self.cuda = torch.cuda.is_available()
    if self.cuda:
        self.model = self.model.cuda()
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=lower)
    self.CLS = '[CLS]'
    self.SEP = '[SEP]'
    self.MASK = '[MASK]'
    # Cache special-token ids for fast reuse.
    token_id = lambda tok: self.tokenizer.convert_tokens_to_ids([tok])[0]
    self.mask_id = token_id(self.MASK)
    self.sep_id = token_id(self.SEP)
    self.cls_id = token_id(self.CLS)
def __init__(self, tokenizer, bert_path):
    """Store the tokenizer, load the masked-LM model, and precompute the
    vocabulary ids that may be proposed as predictions."""
    self.tokenizer = tokenizer
    self.vocab_size = len(tokenizer.vocab)
    # Masked-LM head of the pre-trained BERT model.
    self.model = BertForMaskedLM.from_pretrained(bert_path)
    # Header/special tokens that must never be suggested.
    excluded_tokens = [
        "[MASK]",
        #"[PAD]",
        "[UNK]",
        "[CLS]",
        "[SEP]"
    ]
    self.except_ids = [tokenizer.vocab[tok] for tok in excluded_tokens]
    # Everything in the vocabulary except the excluded ids is a candidate.
    banned = set(self.except_ids)
    self.candidate_ids = [i for i in range(self.vocab_size) if i not in banned]
#!/usr/bin/python3 import torch from pytorch_transformers import BertForMaskedLM, BertTokenizer import sys import torch.nn.functional as F import numpy as np # Load pre-trained model and tokenizer model = BertForMaskedLM.from_pretrained('bert-large-uncased') tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') # Read items from file with open('items_agr_punct.csv', encoding='utf8') as f: text = f.read().splitlines() # Write to file orig_stdout = sys.stdout f = open('out_agr_punct.txt', 'w') sys.stdout = f # Write Column Headers print("Surprisal, VerbCondition, FillerCondition, EmbeddingLevel") for s in text: splits = s.split(',') item = "[CLS] " + splits[0] + " [SEP]" tokenized_text = tokenizer.tokenize(item) # Find index of the masked token words = splits[0].split(' ') masked_index = words.index('[MASK]') + 1
def main():
    """Fine-tune/pre-train a BERT masked-LM on pregenerated epoch files.

    Fixes over the reviewed version:
    * the [NOI]-vocab patch used ``args.bert_model == 'bert-base-uncased'
      or 'bert-large-uncased'``, which is always truthy (a non-empty string
      literal), so the else branch was unreachable; replaced with a proper
      membership test,
    * the bare ``except:`` around tokenizer loading now catches
      ``Exception`` so KeyboardInterrupt/SystemExit are not swallowed by
      the retry loop.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model", type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory", action="store_true",
        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--wp", type=bool, default=False, help="if train on wp")
    parser.add_argument('--from_scratch', action='store_true',
                        help='do not load prtrain model, only random initialize')
    parser.add_argument("--output_step", type=int, default=100000,
                        help="Number of step to save model")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    # Count the available pregenerated epochs; if fewer than requested
    # training epochs, loop over what exists.
    samples_per_epoch = []
    num_data_epochs = args.epochs
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break

    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)
    args.output_mode = "classification"

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Load the tokenizer and repurpose an unused vocab slot as [NOI].
    while True:
        try:
            tokenizer = BertTokenizer.from_pretrained(
                args.bert_model, do_lower_case=args.do_lower_case)
            if tokenizer._noi_token is None:
                tokenizer._noi_token = '[NOI]'
                # BUGFIX: was `args.bert_model == 'bert-base-uncased' or
                # 'bert-large-uncased'`, which is always true.
                if args.bert_model in ('bert-base-uncased', 'bert-large-uncased'):
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused0]')
                else:
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused1]')
                # else:
                #     raise ValueError("No clear choice for insert NOI for tokenizer type {}".format(args.model_name_or_path))
                tokenizer.ids_to_tokens[1] = '[NOI]'
                logger.info("Adding [NOI] to the vocabulary 1")
        except Exception:  # was a bare except; retry on transient load failures
            continue
        break

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    if args.from_scratch:
        # NOTE(review): BertForMaskedLM() with no config will fail in
        # pytorch_transformers — confirm the intended from-scratch config.
        model = BertForMaskedLM()
    else:
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay for biases and LayerNorm params.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory,
            args=args)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            outputs = model(
                input_ids,
                segment_ids,
                input_mask,
                lm_label_ids,
            )
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
            if (step + 1) % args.gradient_accumulation_steps == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.output_step == 0 and args.local_rank in [-1, 0]:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
        if args.local_rank in [-1, 0]:
            # Save model checkpoint at the end of each epoch.
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
            logger.info("PROGRESS: {}%".format(
                round(100 * (epoch + 1) / args.epochs, 4)))
            logger.info("EVALERR: {}%".format(tr_loss))

    # Save a trained model
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
model_emb = load_embedding(args.modelname) allowed_vocabulary = None if args.allowed_vocabulary: with open(args.allowed_vocabulary) as handle: lines = [line.strip() for line in handle] encoded = [ model_emb.tokenizer.encode(token, add_special_tokens=False) for token in lines ] assert all([len(x) == 1 for x in encoded]) allowed_vocabulary = set([x[0] for x in encoded if len(x) == 1]) model = EmbInputBertModel.from_pretrained(args.modelname, output_attentions=True) language_model = BertForMaskedLM.from_pretrained(args.modelname).cls model = model.to(device=args.device) language_model = language_model.to(device=args.device) model.eval() language_model.eval() mappers = { method: load_mapper(f"{args.wikiname}.{args.modelname}.{method}") for method in args.methods } for pattern in tqdm(patterns): relation, template = pattern["relation"], pattern["template"]
def main():
    """Generate text with a fine-tuned BERT masked-LM (greedy or sampling).

    Fixes over the reviewed version:
    * ``--do_lower_case`` combined ``type=``/``default=`` with
      ``action="store_true"``, which argparse rejects with a TypeError at
      startup; it is now a plain ``boolean_string`` flag like the others,
    * ``REDUCE_LIST = REDUCE_LIST | STOP_LIST`` made REDUCE_LIST
      function-local and raised UnboundLocalError; the merge now uses a
      local name,
    * the reduce-id list referenced an undefined name ``reduce_list``
      (NameError); it now uses the merged set,
    * a leftover ``pdb.set_trace()`` breakpoint in the generation loop was
      removed,
    * the output file is now closed when generation finishes.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=False, default=None)
    parser.add_argument(
        "--bert_model", type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # BUGFIX: action="store_true" cannot be combined with type/default.
    parser.add_argument("--do_lower_case", type=boolean_string, default=False)
    parser.add_argument(
        "--reduce_memory", type=boolean_string, default=False,
        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", type=boolean_string, default=False,
                        help="Whether not to use CUDA when available")
    parser.add_argument("--batch_size", default=1, type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16', type=boolean_string, default=False,
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--type", default="greedy", type=str,
                        help="greedy: greedy generation. sample: sampling")
    parser.add_argument('--noi_decay', type=int, default=3,
                        help="round number to decay NOI prob")
    parser.add_argument('--reduce_decay', type=int, default=1,
                        help="round number to decay reduce prob")
    parser.add_argument('--verbose', type=int, default=1, help="verbose level")
    parser.add_argument('--n_test', type=int, default=5000,
                        help="number of test examples")
    parser.add_argument('--prevent', type=boolean_string, default=True,
                        help="avoid generating several words")
    parser.add_argument('--reduce_stop', type=boolean_string, default=True,
                        help="reduce stopwords")
    parser.add_argument('--lessrepeat', action='store_true',
                        help="reduce repetition (only for tokenwise)")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"
    if not args.output_dir:
        args.output_dir = args.bert_model
    epoch_file = args.pregenerated_data / f"test.key.txt"
    total_examples = 1000
    args.max_seq_length = 256

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)
    args.output_mode = "classification"

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    sep_tok = tokenizer.vocab['[SEP]']
    cls_tok = tokenizer.vocab['[CLS]']
    pad_tok = tokenizer.vocab['[PAD]']
    model.to(device)
    model.eval()
    print(args)

    logging.info("***** Running generation *****")
    logging.info(f" Num examples = {total_examples}")
    logging.info(" Batch size = %d", args.batch_size)
    epoch_dataset = PregeneratedDataset(epoch=0,
                                        training_path=args.pregenerated_data,
                                        tokenizer=tokenizer,
                                        num_data_epochs=1)
    epoch_sampler = SequentialSampler(epoch_dataset)
    generate_dataloader = DataLoader(epoch_dataset,
                                     sampler=epoch_sampler,
                                     batch_size=args.batch_size)
    file_name = os.path.join(args.output_dir, f"{args.type}.txt")
    f = open(file_name, "w", 1)
    print(file_name)

    # Vocabulary ids whose generation should be blocked / down-weighted.
    prevent = [tokenizer.vocab.get(x)
               for x in PREVENT_LIST] if args.prevent else None
    # BUGFIX: do not rebind the module-level REDUCE_LIST (UnboundLocalError)
    # and do not reference the undefined name `reduce_list`.
    reduce_vocab = REDUCE_LIST | STOP_LIST if args.reduce_stop else REDUCE_LIST
    reduce_ids = None
    if args.prevent:
        reduce_ids = [tokenizer.vocab.get(x) for x in reduce_vocab]
        reduce_ids = [s for s in reduce_ids if s]

    with tqdm(total=len(generate_dataloader), desc=f"Epoch {0}") as pbar:
        for step, batch in enumerate(generate_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            # BUGFIX: removed leftover pdb.set_trace() breakpoint here.
            if args.type == "greedy":
                predict_ids = greedy_search(model,
                                            input_ids,
                                            segment_ids,
                                            input_mask,
                                            args=args,
                                            tokenizer=tokenizer,
                                            prevent=prevent,
                                            reduce=reduce_ids)
            elif args.type == 'sampling':
                predict_ids = sample_generate(model,
                                              input_ids,
                                              segment_ids,
                                              input_mask,
                                              temperature=0.8,
                                              args=args,
                                              tokenizer=tokenizer,
                                              prevent=prevent,
                                              reduce=reduce_ids)
            else:
                raise NotImplementedError
            # Drop special tokens, ASCII-fold each wordpiece, and merge
            # "##"-continued wordpieces back into whole words.
            output = " ".join([
                str(
                    tokenizer.ids_to_tokens.get(x, "noa").encode(
                        'ascii', 'ignore').decode('ascii'))
                for x in predict_ids[0].detach().cpu().numpy()
                if x != sep_tok and x != pad_tok and x != cls_tok
            ]) + "\n"
            output = output.replace(" ##", "")
            f.write(output)
            pbar.update(1)
    f.close()
def __init__(self):
    """Load the tokenizer and masked-LM from MODEL_PATH and move the
    model (in eval mode) onto DEVICE."""
    self.tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
    mlm = BertForMaskedLM.from_pretrained(MODEL_PATH)
    mlm.eval()
    mlm.to(DEVICE)
    self.model = mlm
from flask import Flask, request from flask_cors import CORS import torch import numpy as np from pytorch_transformers import BertTokenizer, BertForMaskedLM import nltk app = Flask(__name__) CORS(app) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased', output_attentions=True) model.eval() @app.route('/fillblanks', methods=['POST']) def predict(): sentence_orig = request.form.get('text') if '____' not in sentence_orig: return sentence_orig sentence = sentence_orig.replace('____', 'MASK') tokens = nltk.word_tokenize(sentence) sentences = nltk.sent_tokenize(sentence) sentence = " [SEP] ".join(sentences) sentence = "[CLS] " + sentence + " [SEP]" tokenized_text = tokenizer.tokenize(sentence) masked_index = tokenized_text.index('mask') tokenized_text[masked_index] = "[MASK]" indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Make results reproducible across the torch / numpy / python RNGs.
torch.manual_seed(0)
np.random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
random.seed(0)

# CLI: input plot file, output file, and fine-tuned model directory.
parser = argparse.ArgumentParser()
parser.add_argument('--file', type=str, default='../PlotExtraction/fairy.txt')
parser.add_argument('--outfile', type=str, default='bert_fairy.txt')
parser.add_argument('--model', type=str, default='./bert/fairy')
args = parser.parse_args()

# Fine-tuned BERT masked-LM used for sentence scoring below.
tokenizer = BertTokenizer.from_pretrained(args.model)
model = BertForMaskedLM.from_pretrained(args.model, output_attentions=False)
model.eval()


def capitalizeFirst(phrase):
    # Capitalize only the first word; leave the rest of the phrase as-is.
    words = phrase.split()
    words[0] = words[0].capitalize()
    return ' '.join(words)


def is_punctuation(s):
    # True when the string contains at least one punctuation character.
    return len(set(s).intersection(set(string.punctuation))) > 0


def getScore(sentence):
    # NOTE(review): this definition continues beyond this chunk of the file.
    tokenized_text = tokenizer.tokenize('[CLS] ' + "[MASK] " + sentence + ' [SEP]')
input_ids = torch.tensor(tokenizer.encode(s), device=device).unsqueeze( 0) # Batch size 1 results.append( clf.forward(input_ids)[0].detach().cpu().numpy().flatten()) return np.array(results).reshape(-1, 2) print('loading models and data...') default = 'bert-base-uncased' mdir = '/scratch/users/vision/chandan/pacmed/glue/SST-2-3epoch' # '/scratch/users/vision/chandan/pacmed/glue/SST-2-middle/' device = 'cpu' tokenizer = BertTokenizer.from_pretrained(mdir) clf = BertForSequenceClassification.from_pretrained(mdir).eval().to(device) masked_predictor = BertForMaskedLM.from_pretrained(default).eval().to(device) lines = open('data/stsa.binary.test', 'r').read() lines = [line for line in lines.split('\n') if not line is ''] classes = [int(line[0]) for line in lines] reviews = [line[2:] for line in lines] num_reviews = 1821 # 1821 save_freq = 1 scores_iid = {} scores_conditional = {} scores_remove = {} scores_lime = {} # loop over reviews print('looping over dset...')
from flask import Flask, request from flask_cors import CORS import torch import random import numpy as np from pytorch_transformers import BertTokenizer, BertForMaskedLM import nltk app = Flask(__name__) CORS(app) base_dir = '/finetuned_lm-review/finetuned_lm' tokenizer = BertTokenizer.from_pretrained(base_dir) model = BertForMaskedLM.from_pretrained(base_dir, output_attentions=False) model.eval() def duplicates(lst, item): return [i for i, x in enumerate(lst) if x == item] @app.route('/autocomplete', methods=['POST']) def predict(): sentence = "" sentence_orig = request.form.get('text') sentence_length = request.form.get('len') decoding_type = request.form.get('decoding_type') domain_type = request.form.get('domain_type') filler = ' '.join(['MASK' for _ in range(int(sentence_length))]) if domain_type == 'review':
def main():
    """Generate or augment text with a Transformer-XL, BERT or GPT-2 LM,
    selected by command-line flags.  BERT runs mask-based augmentation of
    stdin lines; the other models run prefix-conditioned sampling."""
    parser = argparse.ArgumentParser()
    add_dict_options(parser, ARGS)
    args = parser.parse_args()
    set_seed(args.seed)
    # Optional sampler of sentence prefixes to condition generation on.
    # NOTE(review): prefix_sampler is referenced below even when
    # --prefix_file is not given — confirm callers always pass it.
    if args.prefix_file:
        prefix_sampler = torch.load(args.prefix_file)
    # Pick the model family from the flags.
    if args.transfo:
        tokenizer = TransfoXLTokenizer.from_pretrained(args.transfo_model)
        model = TransfoXLLMHeadModel.from_pretrained(args.transfo_model)
    elif args.bert:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(args.gpt2_model)
        model = GPT2LMHeadModel.from_pretrained(args.gpt2_model)
        init_sos(model)
    if args.resume:
        model.load_state_dict(
            torch.load(args.resume, map_location=lambda s, l: s))
    if not args.simple_sample:
        model = nn.DataParallel(model)
    model.cuda()
    # BERT path: randomly mask ~20% of whitespace tokens in each stdin
    # sentence and let the masked LM fill them back in, num_samples times.
    if args.bert:
        text_batches = list(split(list(sys.stdin), 128))
        for text_batch in tqdm(text_batches, desc='Augmenting'):
            for _ in range(args.num_samples):
                mtext_batch = [
                    ' '.join('[MASK]' if (
                        random.random() < 0.2 and '\t' not in x) else x
                             for x in sent.split(' ')) for sent in text_batch
                ]
                print('\n'.join(
                    x.replace('[SEP]', '\t').strip() for x in augment_texts(
                        model, tokenizer, mtext_batch, max_len=args.msl)))
                sys.stdout.flush()
        return
    # Non-BERT path: maintain a pool of sample buffers and draw from them.
    sample_batches = [
        SampleBatch(model, tokenizer, prefix_sampler)
        for _ in range(args.num_buffers)
    ]
    if args.simple_sample:
        for _ in tqdm(range(args.num_samples)):
            print(sample_batches[0].simple_sample(pair=args.paired,
                                                  transfo=args.transfo))
            sys.stdout.flush()
        return
    n_output = 0
    pbar = tqdm(total=args.num_samples, desc='Generating')
    while n_output < args.num_samples:
        try:
            sample_batch = random.choice(sample_batches)
            sample_batch.try_add_sample()
            fin_texts = sample_batch.step(pair=args.paired)
        except ValueError:
            # Batch not ready yet; top it up and retry.
            sample_batch.try_add_sample()
            continue
        for fin_text in fin_texts:
            if n_output >= args.num_samples:
                return
            print(fin_text.replace(EOS_TOKEN, '').replace('<eos>', '\t'))
            sys.stdout.flush()
            pbar.update(1)
            n_output += 1
        # Periodically rebalance the sample buffers.
        if (n_output + 1) % args.balance_every == 0:
            pbar.set_postfix(dict(last_balance=n_output))
            SampleBatch.balance(sample_batches)