def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default="data", type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--saved_model", default=None, type=str, help="Fine-tuned model to load weights from")
    parser.add_argument("--config_file", default=None, type=str, help="File to load config from")

    ## Other parameters
    parser.add_argument("--tensorboard_logdir", default="runs", type=str, required=False,
                        help="The output directory where the tensorboard event files are saved.")
    parser.add_argument("--cache_dir", default="~/local_model_cache", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=50, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--train_frac", default=1.0, type=float, help="What fraction of the training data to use")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.", default=True)
    parser.add_argument("--q_relevance", action='store_true', help="Whether to classify questions as confused or not.")
    parser.add_argument("--r_relevance", action='store_true', help="Whether to classify responses as confused or not.")
    parser.add_argument("--answer_extraction", action='store_true', help="Whether to extract answers")
    parser.add_argument("--answer_verification", action='store_true', help="Whether to verify answers", default=False)
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.", default=True)
    parser.add_argument("--do_mini", action='store_true', help="Whether to use the mini version of the data")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=128, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=128, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--attention_dropout", default=0.1, type=float, help="Dropout probability at attention layers")
    parser.add_argument("--hidden_dropout", default=0.1, type=float, help="Dropout probability at hidden layers")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    log_dir = os.path.join(
        args.tensorboard_logdir,
        datetime.now().strftime('%Y-%m-%d--%H-%M-%S') + '_' +
        os.path.basename(args.output_dir[:-1] if args.output_dir[-1] == '/' else args.output_dir))
    os.mkdir(log_dir)
    fh = logging.FileHandler(log_dir + '/run.log')
    fh.setLevel(logging.DEBUG)
    tb_writer = SummaryWriter(logdir=log_dir)

    def get_free_gpu():
        # Pick the GPU with the most free memory according to nvidia-smi.
        os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp')
        memory_available = [int(x.split()[2]) for x in open('tmp', 'r').readlines()]
        os.remove('tmp')
        return np.argmax(memory_available)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            f"cuda:{get_free_gpu()}" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = VQRProcessor(args.do_mini, args.q_relevance, args.r_relevance, args.train_frac)
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            np.ceil(len(train_examples) / args.train_batch_size /
                    args.gradient_accumulation_steps)) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
        train_features, train_token_mappings = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        train_dataloader = get_train_dataloader(train_features, args)

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_dataloader, eval_token_mappings = get_eval_dataloader(args, eval_examples, label_list, tokenizer)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    if args.saved_model is not None:
        print("Now loading from", args.saved_model)
        # Load a trained model and config that you have fine-tuned
        config = BertConfig(args.config_file)
        model = BertForVQR(config,
                           num_labels=2,
                           binary_only=args.binary_only,
                           answer_extraction_only=args.answer_extraction_only,
                           answer_verification=args.answer_verification)
        model.load_state_dict(torch.load(args.saved_model))
    else:
        config = BertConfig.from_pretrained(args.config_file)
        model = BertForVQR.from_pretrained(args.bert_model,
                                           cache_dir=cache_dir,
                                           config=config,
                                           num_labels=num_labels,
                                           q_relevance=args.q_relevance,
                                           r_relevance=args.r_relevance,
                                           answer_extraction=args.answer_extraction,
                                           answer_verification=args.answer_verification)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        train(device, model, optimizer, train_dataloader, args, eval_dataloader, tb_writer, train_examples,
              train_token_mappings, eval_examples, eval_token_mappings)

    if not args.do_train and args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval(device, model, eval_dataloader, args)

    tb_writer.close()
import torch
import torch.nn.functional as F
from torch import nn
from transformers import BertConfig, BertTokenizer
from transformers.modeling_bert import BertLayerNorm

from .adaptive_span import AdaptiveSpan
from .entmax import EntmaxAlpha
from .layerdrop import LayerDrop_Bert, LayerDrop_Cross
from .lxmert_utils import (VISUAL_CONFIG, BertPreTrainedModel, InputFeatures,
                           convert_sents_to_features, set_visual_config)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
MAX_VQA_LENGTH = 20
bert_config = BertConfig()


class GeLU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return F.gelu(x)


## BertEmbeddings
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
# filepath = 'data/match/pretrain_wo_aug.txt'
filepath = 'data/match/pretrain.txt'  # 150-epoch corpus, without pairwise data augmentation
vocab_file_dir = './output_bert/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(vocab_file_dir)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=filepath,
    block_size=32,  # 32
)

config = BertConfig(
    vocab_size=23737,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=512,
)
model = BertForMaskedLM(config)
# model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path='chinese-bert-wwm')
# model.resize_token_embeddings(new_num_tokens=23737)  # set this to the current number of tokens

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)  # 0.15
data_collator = DataCollatorForNgramMask(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)  # 0.15

training_args = TrainingArguments(
    output_dir='output_bert_wwm',
    overwrite_output_dir=True,
    num_train_epochs=100,
    if len(ck_list) == 0:
        return None
    if len(ck_list) == 1:
        return os.path.join(output_path, ck_list[0])
    else:
        return os.path.join(output_path, ck_list[-1])


latest_model_path = load_latest_path(output_path)
print(f"restore {latest_model_path}")

"""
Set Config BERT Base
"""
config = BertConfig(
    vocab_size=32_000,
    attention_probs_dropout_prob=0.1,
    directionality="bidi",
    gradient_checkpointing=False,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    layer_norm_eps=1e-12,
    max_position_embeddings=512,
    model_type="bert",
    num_attention_heads=12,
    num_hidden_layers=12,
    pad_token_id=0,
    pooler_fc_size=768,
def build_enc_dec_tokenizers(config):
    src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    # hidden_size and intermediate_size are both wrt all the attention heads.
    # Should be divisible by num_attention_heads
    encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12)
    decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12,
                                is_decoder=False)  # CHANGE is_decoder=True

    # Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size, config.hidden_size,
                                            padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size, config.hidden_size,
                                            padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    # encoder.set_input_embeddings(encoder_embeddings.cuda())
    encoder.set_input_embeddings(encoder_embeddings)  # 1
    decoder = BertForMaskedLM(decoder_config)
    # decoder.set_input_embeddings(decoder_embeddings.cuda())
    decoder.set_input_embeddings(decoder_embeddings)  # 2
    # model.cuda()

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    return encoder, decoder, tokenizers
        config = GPT2Config()
        model = GPT2LMHeadModel(config)
        model.resize_token_embeddings(len(tokenizer))
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        if train:
            train_dataset = datasets.Dataset.load_from_disk(os.path.join(data_dir, "lm_train"))

    elif model_type == "bert":
        dataset_properties = json.load(open(os.path.join(data_dir, "dataset_properties.json")))
        special_tokens = dataset_properties["special_tokens"]
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
        config = BertConfig()
        config.vocab_size = len(tokenizer)
        model = AutoModelForMaskedLM.from_config(config)
        model.resize_token_embeddings(len(tokenizer))
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

        # the NL inputs for the train dataset are the same for BERT and GPT-2 models, but they are tokenized
        # differently (using the corresponding BERT and GPT-2 tokenizers, respectively). The standard training
        # set is already tokenized with the BERT tokenizer, so we can reuse that set here.
        if train:
            train_dataset = datasets.Dataset.load_from_disk(os.path.join(data_dir, "arsenal_train"))

    else:
        raise ValueError("unknown model type")
import os
import re
import json
import string
import argparse

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from models import CABert
import pandas as pd
from metrics import evaluate

max_len = 512
configuration = BertConfig()  # default parameters and configuration for BERT

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

if __name__ == '__main__':
    my_parser = argparse.ArgumentParser(description='List the content of a folder')

    # Add the arguments
    my_parser.add_argument('train',
def build_model(config, train_loader, eval_loader):
    src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    # hidden_size and intermediate_size are both wrt all the attention heads.
    # Should be divisible by num_attention_heads
    encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12)
    decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12,
                                is_decoder=True)

    # Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size, config.hidden_size,
                                            padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size, config.hidden_size,
                                            padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings.cpu())
    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings.cpu())

    """
    input_dirs = config.model_output_dirs
    if os.listdir(input_dirs['decoder']) and os.listdir(input_dirs['encoder']):
        suffix = "pytorch_model.bin"
        decoderPath = os.path.join(input_dirs['decoder'], suffix)
        encoderPath = os.path.join(input_dirs['encoder'], suffix)
        decoder_state_dict = torch.load(decoderPath)
        encoder_state_dict = torch.load(encoderPath)
        decoder.load_state_dict(decoder_state_dict)
        encoder.load_state_dict(encoder_state_dict)
        model = TranslationModel(encoder, decoder, train_loader, eval_loader, tgt_tokenizer, config)
        model.cpu()
        return model
    """

    # model = TranslationModel(encoder, decoder)
    model = TranslationModel(encoder, decoder, train_loader, eval_loader, tgt_tokenizer, config)
    model.cpu()
    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    # return model, tokenizers
    return model
    def __init__(self, args, device, checkpoint=None, bert_from_extractive=None):
        super(AbsSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        if bert_from_extractive is not None:
            self.bert.model.load_state_dict(
                dict([(n[11:], p) for n, p in bert_from_extractive.items() if n.startswith("bert.model")]),
                strict=True,
            )

        if args.encoder == "baseline":
            bert_config = BertConfig(
                self.bert.model.config.vocab_size,
                hidden_size=args.enc_hidden_size,
                num_hidden_layers=args.enc_layers,
                num_attention_heads=8,
                intermediate_size=args.enc_ff_size,
                hidden_dropout_prob=args.enc_dropout,
                attention_probs_dropout_prob=args.enc_dropout,
            )
            self.bert.model = BertModel(bert_config)

        if args.max_pos > 512:
            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = \
                self.bert.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(args.max_pos - 512, 1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings

        self.vocab_size = self.bert.model.config.vocab_size
        tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
        if self.args.share_emb:
            tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)

        # Add a Transformer decoder on top of BERT.
        self.decoder = TransformerDecoder(
            self.args.dec_layers,
            self.args.dec_hidden_size,
            heads=self.args.dec_heads,
            d_ff=self.args.dec_ff_size,
            dropout=self.args.dec_dropout,
            embeddings=tgt_embeddings,
        )

        self.generator = get_generator(self.vocab_size, self.args.dec_hidden_size, device)
        self.generator[0].weight = self.decoder.embeddings.weight

        if checkpoint is not None:
            self.load_state_dict(checkpoint["model"], strict=True)
        else:
            for module in self.decoder.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=0.02)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            for p in self.generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
                else:
                    p.data.zero_()
            if args.use_bert_emb:
                tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
                tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)
                self.decoder.embeddings = tgt_embeddings
                self.generator[0].weight = self.decoder.embeddings.weight

        self.to(device)
def train(args):
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    if args.gpu != '-1' and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        device = torch.device('cpu')

    config = {
        'train': {'unchanged_variable_weight': 0.1, 'buffer_size': 5000},
        'encoder': {'type': 'SequentialEncoder'},
        'data': {'vocab_file': 'data/vocab.bpe10000/vocab'}
    }

    train_set = Dataset('data/preprocessed_data/train-shard-*.tar')
    dev_set = Dataset('data/preprocessed_data/dev.tar')
    vocab = Vocab.load('data/vocab.bpe10000/vocab')

    if args.decoder:
        vocab_size = len(vocab.all_subtokens) + 1
    else:
        vocab_size = len(vocab.source_tokens) + 1

    max_iters = args.max_iters
    lr = args.lr
    warm_up = args.warm_up
    batch_size = 4096
    effective_batch_size = args.batch_size
    max_embeds = 1000 if args.decoder else 512

    bert_config = BertConfig(vocab_size=vocab_size,
                             max_position_embeddings=max_embeds,
                             num_hidden_layers=6,
                             hidden_size=256,
                             num_attention_heads=4)
    model = BertForPreTraining(bert_config)

    if args.restore:
        state_dict = torch.load(os.path.join(args.save_dir, args.res_name))
        model.load_state_dict(state_dict['model'])
        batch_count = state_dict['step']
        epoch = state_dict['epoch']

    model.train()
    model.to(device)
    if len(args.gpu) > 1 and device == torch.device('cuda'):
        model = nn.DataParallel(model)

    def lr_func(step):
        if step > warm_up:
            return (max_iters - step) / (max_iters - warm_up)
        else:
            return step / warm_up

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_func, last_epoch=-1)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')

    if args.restore:
        optimizer.load_state_dict(state_dict['optim'])
        scheduler.load_state_dict(state_dict['scheduler'])

    batch_count = 0
    epoch = 0
    cum_loss = 0.0
    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(batch_size=batch_size,
                                                  return_examples=False,
                                                  config=config,
                                                  progress=True,
                                                  train=True,
                                                  max_seq_len=512,
                                                  num_readers=args.num_readers,
                                                  num_batchers=args.num_batchers)
        epoch += 1
        print("Epoch {}".format(epoch))
        loss = 0
        num_seq = 0
        optimizer.zero_grad()
        for batch in train_set_iter:
            if args.decoder:
                input_ids = batch.tensor_dict['prediction_target']['src_with_true_var_names']
            else:
                input_ids = batch.tensor_dict['src_code_tokens']
            attention_mask = torch.ones_like(input_ids)
            attention_mask[input_ids == 0] = 0.0
            assert torch.max(input_ids) < vocab_size
            assert torch.min(input_ids) >= 0
            if input_ids.shape[0] > max_embeds:
                print("Warning - length {} is greater than max length {}. Skipping.".format(
                    input_ids.shape[0], max_embeds))
                continue
            input_ids, labels = mask_tokens(inputs=input_ids,
                                            mask_token_id=vocab_size - 1,
                                            vocab_size=vocab_size,
                                            mlm_probability=0.15)
            input_ids[attention_mask == 0] = 0
            labels[attention_mask == 0] = -100
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels)
            # Per-sequence MLM loss, normalized by the number of masked positions in each sequence.
            unreduced_loss = loss_fn(outputs[0].view(-1, bert_config.vocab_size),
                                     labels.view(-1)).reshape(labels.shape) / (
                                         torch.sum(labels != -100, axis=1).unsqueeze(1) + 1e-7)
            loss += unreduced_loss.sum()
            num_seq += input_ids.shape[0]

            if num_seq > effective_batch_size:
                batch_count += 1
                loss /= num_seq
                cum_loss += loss.item()
                if batch_count % 20 == 0:
                    print("{} batches, Loss : {:.4}, LR : {:.6}".format(batch_count, cum_loss / 20,
                                                                        scheduler.get_lr()[0]))
                    cum_loss = 0.0
                if batch_count % 10000 == 0:
                    fname1 = os.path.join(args.save_dir, 'bert_{}_step_{}.pth'.format(
                        'decoder' if args.decoder else 'encoder', batch_count))
                    fname2 = os.path.join(args.save_dir, 'bert_{}.pth'.format(
                        'decoder' if args.decoder else 'encoder'))
                    state = {
                        'epoch': epoch,
                        'step': batch_count,
                        'model': model.module.state_dict(),
                        'optim': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict()
                    }
                    torch.save(state, fname1)
                    torch.save(state, fname2)
                    print("Saved file to path {}".format(fname1))
                    print("Saved file to path {}".format(fname2))
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                loss = 0
                num_seq = 0
                if batch_count == max_iters:
                    print('[Learner] Reached max iters', file=sys.stderr)
                    exit()
        print("Max_len = {}".format(max_len))
        break
def load_bert(bert_path, device):
    bert_config_path = os.path.join(bert_path, 'config.json')
    bert = BertModel(BertConfig(**load_json(bert_config_path))).to(device)
    bert_model_path = os.path.join(bert_path, 'model.bin')
    bert.load_state_dict(clean_state_dict(torch.load(bert_model_path)))
    return bert
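# Usage sketch (added for illustration, not part of the original snippet): the checkpoint
# directory name below is a placeholder and is assumed to contain the config.json / model.bin
# pair that load_bert reads.
if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert = load_bert("checkpoints/bert_base", device)  # placeholder path
    bert.eval()  # weights were already restored via load_state_dict above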
from transformers import BertConfig

config = BertConfig(
    vocab_size=21_128,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=2,
)

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("./model/wpe", max_len=512)

from transformers import BertForMaskedLM

model = BertForMaskedLM(config=config)
print(model.num_parameters())
model.resize_token_embeddings(len(tokenizer))

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./data/data_train.csv",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
# decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased",
#     add_cross_attention=True, is_decoder=True,
#     bos_token_id=decoder_tokenizer.vocab["[CLS]"],
#     eos_token_id=decoder_tokenizer.vocab["[SEP]"],
# )
# decoder.resize_token_embeddings(len(decoder_tokenizer))

# Fresh decoder config.
decoder_config = BertConfig(
    is_decoder=True,
    add_cross_attention=True,
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    bos_token_id=decoder_tokenizer.vocab["[BOS]"],
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)
# AutoConfig.from_pretrained("bert-base-uncased")
# decoder_config = BertGenerationDecoderConfig()

# From: https://github.com/huggingface/transformers/blob/master/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py#L464
# >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
# >>> model.config.pad_token_id = tokenizer.pad_token_id
# >>> model.config.vocab_size = model.config.decoder.vocab_size
# decoder_config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
# decoder_config.pad_token_type_id = 0 ?
OUTPUT_DIRECTORY = "./synthBERT"

# load vocab file to list
vocab_file = open(VOCAB_PATH, "r")
vocab = vocab_file.read().split("\n")

# use our pretrained tokenizer and model
tokenizer = BertTokenizer(VOCAB_PATH, do_basic_tokenize=True, additional_special_tokens=vocab)
# tokenizer.add_tokens(vocab)

# set BERT model parameters
config = BertConfig(
    vocab_size=141,
    max_position_embeddings=50,
    num_attention_heads=12,
    num_hidden_layers=6,
)

# instantiate the model
# transformers has some built specifically for masked language modeling
model = BertForMaskedLM(config=config)

# resize the model embedding to fit our own vocab
model.resize_token_embeddings(len(tokenizer))

# put corpus into a dataset helper
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=CORPUS_PATH,
    block_size=128,
def main():
    parser = argparse.ArgumentParser(description='Train the individual Transformer model')
    parser.add_argument('--dataset_folder', type=str, default='datasets')
    parser.add_argument('--dataset_name', type=str, default='zara1')
    parser.add_argument('--obs', type=int, default=8)
    parser.add_argument('--preds', type=int, default=12)
    parser.add_argument('--emb_size', type=int, default=1024)
    parser.add_argument('--heads', type=int, default=8)
    parser.add_argument('--layers', type=int, default=6)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--cpu', action='store_true')
    parser.add_argument('--output_folder', type=str, default='Output')
    parser.add_argument('--val_size', type=int, default=50)
    parser.add_argument('--gpu_device', type=str, default="0")
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--validation_epoch_start', type=int, default=30)
    parser.add_argument('--resume_train', action='store_true')
    parser.add_argument('--delim', type=str, default='\t')
    parser.add_argument('--name', type=str, default="zara1")
    args = parser.parse_args()
    model_name = args.name

    # Create output/model folders (ignore errors if they already exist).
    for folder in ['models', 'output', 'output/BERT', 'models/BERT',
                   f'output/BERT/{args.name}', f'models/BERT/{args.name}']:
        try:
            os.mkdir(folder)
        except:
            pass

    log = SummaryWriter('logs/BERT_%s' % model_name)
    log.add_scalar('eval/mad', 0, 0)
    log.add_scalar('eval/fad', 0, 0)
    try:
        os.mkdir(args.name)
    except:
        pass

    device = torch.device("cuda")
    if args.cpu or not torch.cuda.is_available():
        device = torch.device("cpu")
    args.verbose = True

    ## creation of the dataloaders for train and validation
    train_dataset, _ = baselineUtils.create_dataset(args.dataset_folder, args.dataset_name, 0, args.obs,
                                                    args.preds, delim=args.delim, train=True,
                                                    verbose=args.verbose)
    val_dataset, _ = baselineUtils.create_dataset(args.dataset_folder, args.dataset_name, 0, args.obs,
                                                  args.preds, delim=args.delim, train=False,
                                                  verbose=args.verbose)
    test_dataset, _ = baselineUtils.create_dataset(args.dataset_folder, args.dataset_name, 0, args.obs,
                                                   args.preds, delim=args.delim, train=False, eval=True,
                                                   verbose=args.verbose)

    from transformers import BertModel, BertConfig
    config = BertConfig(vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12,
                        intermediate_size=3072, hidden_act='relu', hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2,
                        initializer_range=0.02, layer_norm_eps=1e-12)
    model = BertModel(config).to(device)

    from individual_TF import LinearEmbedding as NewEmbed, Generator as GeneratorTS
    a = NewEmbed(3, 768).to(device)
    model.set_input_embeddings(a)
    generator = GeneratorTS(768, 2).to(device)
    # model.set_output_embeddings(GeneratorTS(1024, 2))

    tr_dl = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0)
    val_dl = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0)
    test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0)

    # optim = SGD(list(a.parameters())+list(model.parameters())+list(generator.parameters()), lr=0.01)
    # sched = torch.optim.lr_scheduler.StepLR(optim, 0.0005)
    optim = NoamOpt(768, 0.1, len(tr_dl),
                    torch.optim.Adam(list(a.parameters()) + list(model.parameters()) + list(generator.parameters()),
                                     lr=0, betas=(0.9, 0.98), eps=1e-9))
    # optim = Adagrad(list(a.parameters())+list(model.parameters())+list(generator.parameters()), lr=0.01, lr_decay=0.001)

    epoch = 0
    mean = train_dataset[:]['src'][:, :, 2:4].mean((0, 1)) * 0
    std = train_dataset[:]['src'][:, :, 2:4].std((0, 1)) * 0 + 1

    while epoch < args.max_epoch:
        epoch_loss = 0
        model.train()

        for id_b, batch in enumerate(tr_dl):
            optim.optimizer.zero_grad()
            r = 0
            rot_mat = np.array([[np.cos(r), np.sin(r)], [-np.sin(r), np.cos(r)]])

            inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
            inp = torch.matmul(inp, torch.from_numpy(rot_mat).float().to(device))
            trg_masked = torch.zeros((inp.shape[0], args.preds, 2)).to(device)
            inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
            trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1], 1).to(device)
            inp_cat = torch.cat((inp, trg_masked), 1)
            cls_cat = torch.cat((inp_cls, trg_cls), 1)
            net_input = torch.cat((inp_cat, cls_cat), 2)

            position = torch.arange(0, net_input.shape[1]).repeat(inp.shape[0], 1).long().to(device)
            token = torch.zeros((inp.shape[0], net_input.shape[1])).long().to(device)
            attention_mask = torch.ones((inp.shape[0], net_input.shape[1])).long().to(device)

            out = model(input_ids=net_input, position_ids=position, token_type_ids=token,
                        attention_mask=attention_mask)
            pred = generator(out[0])

            loss = F.pairwise_distance(
                pred[:, :].contiguous().view(-1, 2),
                torch.matmul(
                    torch.cat((batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                              1).contiguous().view(-1, 2).to(device),
                    torch.from_numpy(rot_mat).float().to(device))).mean()
            loss.backward()
            optim.step()
            print("epoch %03i/%03i frame %04i / %04i loss: %7.4f" %
                  (epoch, args.max_epoch, id_b, len(tr_dl), loss.item()))
            epoch_loss += loss.item()
        # sched.step()
        log.add_scalar('Loss/train', epoch_loss / len(tr_dl), epoch)

        with torch.no_grad():
            model.eval()
            gt = []
            pr = []
            val_loss = 0
            for batch in val_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros((inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1], 1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(inp.shape[0], 1).long().to(device)
                token = torch.zeros((inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros((inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input, position_ids=position, token_type_ids=token,
                            attention_mask=attention_mask)
                pred = generator(out[0])

                loss = F.pairwise_distance(
                    pred[:, :].contiguous().view(-1, 2),
                    torch.cat((batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                              1).contiguous().view(-1, 2).to(device)).mean()
                val_loss += loss.item()

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to('cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)
            log.add_scalar('validation/loss', val_loss / len(val_dl), epoch)
            log.add_scalar('validation/mad', mad, epoch)
            log.add_scalar('validation/fad', fad, epoch)

            model.eval()
            gt = []
            pr = []
            for batch in test_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros((inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1], 1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(inp.shape[0], 1).long().to(device)
                token = torch.zeros((inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros((inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input, position_ids=position, token_type_ids=token,
                            attention_mask=attention_mask)
                pred = generator(out[0])

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to('cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)

            torch.save(model.state_dict(), "models/BERT/%s/ep_%03i.pth" % (args.name, epoch))
            torch.save(generator.state_dict(), "models/BERT/%s/gen_%03i.pth" % (args.name, epoch))
            torch.save(a.state_dict(), "models/BERT/%s/emb_%03i.pth" % (args.name, epoch))

            log.add_scalar('eval/mad', mad, epoch)
            log.add_scalar('eval/fad', fad, epoch)

        epoch += 1
    ab = 1
def model_setting(args):
    loader, tokenizer = get_loader(args)

    if args.text_processor == 'roberta':
        config = RobertaConfig()
        roberta = RobertaModel(config)
        # text_processor = roberta.from_pretrained('roberta-base')
        ## split the text and feed the pieces into RoBERTa
        if args.dataset == 'MissO_split' or args.dataset == 'TVQA_split':
            text_processor_que = roberta.from_pretrained('roberta-base')
            text_processor_utt = roberta.from_pretrained('roberta-base')
        elif args.eval == 'True':
            memory_processor = roberta.from_pretrained('roberta-base')
            logic_processor = roberta.from_pretrained('roberta-base')
        else:
            text_processor = roberta.from_pretrained('roberta-base')
    elif args.text_processor == 'bert':
        config = BertConfig()
        bert = BertModel(config)
        text_processor = bert.from_pretrained('bert-base-uncased')
    else:
        text_processor = None

    if args.eval == 'False':
        if args.only_text_input == 'True':
            model = QuestionLevelDifficultyOT(args, tokenizer, text_processor)
        else:
            if args.dataset == 'MissO_split' or args.dataset == 'TVQA_split':
                model = QuestionLevelDifficulty_M_split(args, tokenizer, text_processor_que, text_processor_utt)
            else:
                model = QuestionLevelDifficulty_M(args, tokenizer, text_processor)

        criterion = get_loss_func(tokenizer)
        optimizer = get_optim(args, model)
        scheduler = get_scheduler(optimizer, args, loader['train'])

        model.to(args.device)
        criterion.to(args.device)

        config = {
            'loader': loader,
            'optimizer': optimizer,
            'criterion': criterion,
            'scheduler': scheduler,
            'tokenizer': tokenizer,
            'args': args,
            'model': model
        }
    else:
        memory_model = QuestionLevelDifficulty_M(args, tokenizer, memory_processor)
        logic_model = QuestionLevelDifficulty_L(args, tokenizer, logic_processor)

        memory_model.to(args.device)
        logic_model.to(args.device)

        config = {
            'loader': loader,
            'tokenizer': tokenizer,
            'args': args,
            'memory_model': memory_model,
            'logic_model': logic_model
        }

    return config
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--relation', '-r', type=str, required=True,
                        help=f'relation type that is trained on. Available: {", ".join(config.supported_relations)}')
    parser.add_argument('--dataset_name', '-d', required=True, type=str,
                        help='dataset used for train, eval and vocab')
    parser.add_argument('--output_model_name', '-o', type=str, default='',
                        help='Defaults to dataset_name if not stated.')
    parser.add_argument('--epochs', type=int, default=2000, help='Default is 2000 epochs')
    parser.add_argument('--batch_size', type=int, default=1024, help='Default is a batch size of 1024')
    parser.add_argument('--logging_steps', type=int, default=200,
                        help='After how many batches metrics are logged')
    parser.add_argument("--mlm_probability", type=float, default=0.15,
                        help="Ratio of tokens to mask for masked language modeling loss")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization. "
                             "The training datasets will be truncated in blocks of this size for training. "
                             "Defaults to the model max input length for single sentence inputs "
                             "(taking into account special tokens).")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=6e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X update steps.")
    parser.add_argument("--save_total_limit", type=int, default=2,
                        help="Saves this many checkpoints and deletes older ones")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--gpu_device", type=int, default=0, help="gpu number")
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_device)
    if args.output_model_name == '':
        args.output_model_name = args.dataset_name

    data_dir = Path('data') / args.relation / 'datasets' / args.dataset_name
    args.train_data_file = data_dir / 'train.txt'
    args.tokenizer_name = f'data/{args.relation}/vocab/{args.dataset_name}/'
    args.output_dir = f'output/models/{args.relation}/{args.output_model_name}'

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            f"Output directory ({args.output_dir}) already exists and is not empty. "
            f"Use --overwrite_output_dir to overcome.")

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                        datefmt="%m/%d/%Y %H:%M:%S",
                        level=logging.INFO)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
    model_config = BertConfig(vocab_size=tokenizer.vocab_size)

    if args.block_size <= 0:
        args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    corrects = {
        "eval": json.load(open(data_dir / 'subject_relation2object_eval.json', 'r')),
        "train": json.load(open(data_dir / 'subject_relation2object_train.json', 'r')),
    }
    for eval_type, d in corrects.items():
        corrects[eval_type] = batchify_dict(d, args, tokenizer)

    logger.info("Training new model from scratch")
    model = BertForMaskedLM(config=model_config)
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    train_dataset = LineByLineTextDataset(tokenizer, args, args.train_data_file, block_size=args.block_size)

    # train
    global_step, tr_loss = train(args, train_dataset, corrects, model, tokenizer)  # TRAIN
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using
    # from_pretrained()
    os.makedirs(args.output_dir, exist_ok=True)
    logger.info("Saving model checkpoint to %s", args.output_dir)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Load a trained model and vocabulary that you have fine-tuned
    model = BertForMaskedLM.from_pretrained(args.output_dir)
    tokenizer = BertTokenizer.from_pretrained(args.output_dir)
    model.to(args.device)

    # Evaluation
    results = {}
    checkpoints = [args.output_dir]
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
        prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
        model = BertForMaskedLM.from_pretrained(checkpoint)
        model.to(args.device)
        result = evaluate(args, corrects, model, tokenizer, prefix=prefix)
        result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
        results.update(result)

    return results
# # How to train a language model
# Highlights all the steps to effectively train a Transformer model on custom data
# - Colab (ipynb) version: https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb
# - MD version: https://github.com/huggingface/blog/blob/master/how-to-train.md
#
# # Pretrain Longformer: how to build a "long" version of existing pretrained models (Iz Beltagy)
# https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb

# In[6]:

from transformers import BertForMaskedLM, BertConfig

configuration = BertConfig(
    vocab_size=80000,
    # max_position_embeddings=512,  # 512 + 2 more special tokens
    # num_attention_heads=12,
    # num_hidden_layers=12,
    # type_vocab_size=1,
)
# configuration.vocab_size = 20000

model = BertForMaskedLM(config=configuration)
# model = RobertaForMaskedLM.from_pretrained('./Roberta/checkpoint-200000')

# Accessing the model configuration
model.config

# # Initializing Tokenizer
# ## Rewrite Tokenizer of bert_itos_80k with special tokens in front
    def __init__(self, bert_path):
        super().__init__()
        config = BertConfig()
        config.output_hidden_states = True
        self.bert_layer = BertModel.from_pretrained(bert_path, config=config)
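# Illustrative sketch (not from the original snippet): with output_hidden_states=True and a
# transformers 4.x-style ModelOutput, the wrapped encoder exposes the embedding output plus one
# tensor per layer. The tokenizer call and input names below are assumptions for illustration.
#
#     encoding = BertTokenizer.from_pretrained(bert_path)("example text", return_tensors="pt")
#     outputs = self.bert_layer(**encoding)
#     hidden_states = outputs.hidden_states   # tuple of 13 tensors for a 12-layer BERT
#     last_layer = hidden_states[-1]          # same tensor as outputs.last_hidden_state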
from transformers import BertTokenizer, BertConfig, BertModel
from sklearn.model_selection import train_test_split
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

logger = create_logger(__name__)

# bert parameters
MAX_LEN = 256  # max is 512 for BERT
config = BertConfig(
    vocab_size_or_config_json_file=32000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)


def load_train_test_sets(filepath: str = "IMDB Dataset.csv",
                         test_size: float = 0.10,
                         random_state: int = 42):
    data = pd.read_csv(filepath)
    # Sentiment score must be numeric
    data["sent_score"] = 1
    data.loc[data.sentiment == "negative", "sent_score"] = 0
    X, y = data["review"], data["sent_score"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
    def __init__(self):
        self.config = BertConfig("bert-base-chinese")
        self.model = BertForQuestionAnswering.from_pretrained("bert-base-chinese")
        self.data_processor = DataProcessor(max_len=self.config.max_position_embeddings)
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from Squad_example import SquadExample, ExactMatch
from utils import create_inputs_targets, normalize_text, create_squad_examples, create_model
from transformers import BertTokenizer, TFBertModel, BertConfig
from constant import max_len

configuration = BertConfig()

slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)

with open(train_path) as f:
    raw_train_data = json.load(f)
    args = get_eval_args()
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    res_file = os.path.join(args.output_dir, "./raw_res.csv")
    cache_dir = os.path.join(args.data_dir, "cache")
    cached_file = os.path.join(cache_dir, "test_examples_cache.dat")

    logging.basicConfig(level='INFO')
    logger = logging.getLogger(__name__)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    model = TBertT(BertConfig(), args.code_bert)
    if args.model_path and os.path.exists(args.model_path):
        model_path = os.path.join(args.model_path, MODEL_FNAME)
        model.load_state_dict(torch.load(model_path))
        logger.info("model loaded")

    start_time = time.time()
    test_examples = load_examples(args.data_dir, data_type="test", model=model,
                                  overwrite=args.overwrite, num_limit=args.test_num)
    test_examples.update_embd(model)
    m = test(args, model, test_examples, "cached_twin_test")
    exe_time = time.time() - start_time
    m.write_summary(exe_time)
        return np.stack(layer_vectors), np.stack(layer_attns)
    else:
        return np.stack(layer_vectors)

#%%
# random_model = True
random_model = False
# dep_tree = True
dep_tree = False

if random_model:
    model = BertModel(
        BertConfig(output_hidden_states=True, output_attentions=True, cache_dir='pretrained_models'))
else:
    model = BertModel.from_pretrained('bert-base-cased', output_hidden_states=True, output_attentions=True)
# config = AutoConfig.from_pretrained('bert-base-cased', output_hidden_states=True,
#                                     output_attentions=True, cache_dir='pretrained_models')
# model = AutoModel.from_config(config)
# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', cache_dir='pretrained_models')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

if dep_tree:
    dfile = SAVE_DIR + 'dependency_train_bracketed.txt'
    idx = np.load(SAVE_DIR + 'const_in_dep.npy').astype(int)
    # full_vecs = np.array(layer_full_vectors).squeeze().transpose((0, 2, 1))
    return np.stack(layer_Z_vectors), np.array(layer_att), np.array(layer_full_vectors)

#%%
random_model = False
# random_model = True
if random_model:
    # config = AutoConfig.from_pretrained(pretrained_weights, output_hidden_states=True,
    #                                     output_attentions=args.attention, cache_dir='pretrained_models')
    # model = AutoModel.from_config(config)
    model = BertModel(BertConfig(output_hidden_states=True, output_attentions=True))
else:
    model = BertModel.from_pretrained('bert-base-cased', output_hidden_states=True, output_attentions=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

dfile = SAVE_DIR + 'train_bracketed.txt'
# with open(SAVE_DIR + 'permuted_data.pkl', 'rb') as dfile:
#     dist = pkl.load(dfile)

#%%
max_num = 200
these_bounds = [0, 1, 2, 3, 4, 5, 6]
def train(args):
    # load squad data for pre-training.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    review_train_examples = np.load(os.path.join(args.review_data_dir, "data.npz"))

    num_train_steps = args.num_train_steps
    bar = tqdm(total=num_train_steps)

    # load bert pre-train data.
    review_train_data = TensorDataset(
        torch.from_numpy(review_train_examples["input_ids"]),
        torch.from_numpy(review_train_examples["segment_ids"]),
        torch.from_numpy(review_train_examples["input_mask"]),
        torch.from_numpy(review_train_examples["masked_lm_ids"]),
        torch.from_numpy(review_train_examples["next_sentence_labels"]))
    review_train_dataloader = DataLoader(review_train_data,
                                         sampler=RandomSampler(review_train_data),
                                         batch_size=args.train_batch_size,
                                         drop_last=True)

    # we do not have any validation for post-training
    model = BertForPreTraining.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
                                               cache_dir='../cache',
                                               config=BertConfig())
    model.train()
    model = BertForMTPostTraining(model, BertConfig())
    model.cuda()

    # Prepare optimizer
    param_optimizer = [(k, v) for k, v in model.named_parameters() if v.requires_grad == True]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(args.warmup_proportion * t_total),
                                                num_training_steps=t_total)

    global_step = 0
    step = 0
    batch_loss = 0.
    model.train()
    model.zero_grad()
    training = True
    review_iter = iter(review_train_dataloader)

    model_dir = os.path.join(args.output_dir, "saved_model")
    os.makedirs(model_dir, exist_ok=True)

    while training:
        try:
            batch = next(review_iter)
        except:
            review_iter = iter(review_train_dataloader)
            batch = next(review_iter)
        batch = tuple(t.cuda() for t in batch)
        input_ids, segment_ids, input_mask, masked_lm_ids, next_sentence_labels = batch

        review_loss = model("review",
                            input_ids=input_ids.long(),
                            token_type_ids=segment_ids.long(),
                            attention_mask=input_mask.long(),
                            masked_lm_labels=masked_lm_ids.long(),
                            next_sentence_label=next_sentence_labels.long())
        loss = review_loss
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        batch_loss += loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        if (step + 1) % args.gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            bar.update(1)
            if global_step % 50 == 0:
                logging.info("step %d batch_loss %f ", global_step, batch_loss)
                batch_loss = 0.
            if global_step % args.save_checkpoints_steps == 0:
                model.float()
                print('Saving model..')
                model.model.save_pretrained(model_dir + f"-{global_step}")
            if global_step >= num_train_steps:
                training = False
                break
        step += 1

    model.float()
    print('Saving model..')
    model.model.save_pretrained(model_dir + f"-{global_step}")
print('max_len of tokenized texts:', max([len(sent) for sent in tokenized_texts]))
print("Tokenize the first sentence:")
print(tokenized_texts[0])

# construct the vocabulary
vocab = list(set([w for sent in tokenized_texts for w in sent]))

# index the input words
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_or_truncate(input_ids, MAX_LEN)

bert_config = BertConfig(vocab_size_or_config_json_file=len(vocab))
heads = config["heads"]
heads_mwe = config["heads_mwe"]

all_test_indices = []
all_predictions = []
all_folds_labels = []
recorded_results_per_fold = []

splits = train_test_loader(input_ids, labels, A, A_MWE, target_token_idices, K, BATCH_TRAIN, BATCH_TEST)

for i, (train_dataloader, test_dataloader) in enumerate(splits):
    model = BertWithGCNAndMWE(MAX_LEN, bert_config, heads, heads_mwe, dropout)
    model.to(device)
def bert_model():
    config = BertConfig()
    model = BertForQuestionAnswering(config=config)
    return model
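# Quick smoke test for bert_model() (illustrative only, not part of the original: the weights are
# randomly initialized and the token ids are random, so only the shapes are meaningful):
if __name__ == '__main__':
    import torch
    qa_model = bert_model()
    dummy_ids = torch.randint(0, qa_model.config.vocab_size, (1, 16))
    outputs = qa_model(input_ids=dummy_ids)
    # start/end logits each have shape (batch_size, seq_len) == (1, 16)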