def main():
    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
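# A reusable helper equivalent to the counting loops above -- a minimal sketch assuming only PyTorch.
# The name count_trainable_parameters is illustrative and not part of the original snippet;
# Parameter.numel() is equivalent to multiplying the dimensions by hand as done above.
def count_trainable_parameters(model) -> int:
    """Return the number of trainable parameters of a PyTorch module."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Hypothetical usage:
# albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
# print('The total number of parameters in albert: ', count_trainable_parameters(albert_model))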
def get_albert_for_comparison():
    model_name = 'albert-base-v2'
    config = AlbertConfig.from_pretrained(model_name)
    config.output_hidden_states = False

    input_ids = tf.keras.Input(shape=(128,), name='input_ids', dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(128,), name='attention_mask', dtype=tf.int32)

    transformer_model = TFAlbertModel.from_pretrained(model_name, config=config)
    embedding_layer = transformer_model([input_ids, attention_mask])[0]

    X = tf.keras.layers.Dense(
        config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range),
        activation="relu",
        name="pre_classifier",
    )(embedding_layer[:, 0])
    X = tf.keras.layers.Dropout(config.classifier_dropout_prob)(X)
    output_ = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range),
        name="classifier",
    )(X)
    return tf.keras.Model([input_ids, attention_mask], output_)
def __init__(self, my_config, args):
    super(NqModel, self).__init__()
    # albert_base_configuration = AlbertConfig(vocab_size=30000, hidden_size=768, num_attention_heads=12,
    #                                          intermediate_size=3072, attention_probs_dropout_prob=0)
    self.my_mask = None
    self.args = args
    # mfeb/albert-xxlarge-v2-squad2
    self.bert_config = AlbertConfig.from_pretrained("albert-xxlarge-v2")
    # self.bert_config.gradient_checkpointing = True
    # self.bert_config.Extgradient_checkpointing = True
    self.bert = AlbertModel.from_pretrained("albert-xxlarge-v2", config=self.bert_config)
    # self.bert = AlbertModel.from_pretrained("albert-base-v2")
    my_config.hidden_size = self.bert.config.hidden_size
    self.right = 0
    self.all = 0
    # self.bert = AlbertModel(albert_base_configuration)
    # self.bert2 = BertModel(bert_config)
    # self.bert = BertModel(BertConfig())
    # self.bert = RobertaModel(RobertaConfig(max_position_embeddings=514, vocab_size=50265))
    # print(my_config, bert_config)
    # self.tok_dense = nn.Linear(my_config.hidden_size, my_config.hidden_size)
    self.tok_dense = nn.Linear(my_config.hidden_size * 2, my_config.hidden_size * 2)
    # self.tok_dense2 = nn.Linear(my_config.hidden_size, my_config.hidden_size)
    # self.para_dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
    # self.doc_dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
    self.dropout = nn.Dropout(my_config.hidden_dropout_prob)
    self.tok_outputs = nn.Linear(my_config.hidden_size * 2, 1)  # tuned to avoid falling into bad local optima
    # self.tok_outputs2 = nn.Linear(my_config.hidden_size, 1)
    # config.max_token_len, config.max_token_relative
    # self.para_outputs = nn.Linear(self.config.hidden_size, 1)
    # self.answer_type_outputs = nn.Linear(self.config.hidden_size, 2)
    # self.tok_to_label = nn.Linear(my_config.max_token_len, 2)
    # self.par_to_label = nn.Linear(my_config.max_paragraph_len, 2)
    self.encoder = Encoder(my_config)
    # self.encoder2 = Encoder(my_config)
    self.my_config = my_config
    self.model_choice = None
    self.ground_answer = None
    self.ACC = 0
    self.ALL = 0
    self.ErrId = []
def __init__(self, config):
    super(AlBert, self).__init__()
    model_config = AlbertConfig.from_pretrained(
        config.config_file,
        num_labels=config.num_labels,
        finetuning_task=config.task,
    )
    self.albert = AlbertModel.from_pretrained(
        config.model_name_or_path,
        config=model_config,
    )
    if config.requires_grad:
        for param in self.albert.parameters():
            param.requires_grad = True
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # add the weighted layer
    self.hidden_weight = config.weighted_layer_tag  # must modify the config.json
    self.pooling_tag = config.pooling_tag
    if self.hidden_weight:
        self.weight_layer = config.weighted_layer_num
        # self.weight = torch.zeros(self.weight_layer).to(config.device)
        self.weight = torch.nn.Parameter(torch.FloatTensor(self.weight_layer), requires_grad=True)
        self.softmax = nn.Softmax()
        self.pooler = nn.Sequential(nn.Linear(768, 768), nn.Tanh())
    elif self.pooling_tag:
        self.maxPooling = nn.MaxPool1d(64)
        self.avgPooling = nn.AvgPool1d(64)
        self.pooler = nn.Sequential(nn.Linear(768 * 3, 768), nn.Tanh())
def __init__(self, model_name, model_type):
    """
    Hyper-parameters found with the validation set:
        xlnet-large-cased  : epochs = 4, learning_rate = 1e-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epochs = 4, learning_rate = 3e-5, batch_size = 16, epsilon = 1e-8
        albert-xxlarge-v2  : epochs = 3, learning_rate = 5e-5, batch_size = 8,  epsilon = 1e-6
    To be improved...
    """
    self.model_name = model_name
    self.model_type = model_type

    # Per the transformers library, a batch size of 16 or 32 is advised for training; for memory reasons we use 16.
    # Gradient accumulation did not lead to a clear improvement and therefore is not used here.
    if model_type == 'albert':
        self.batch_size = 8
    else:
        self.batch_size = 16

    available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
    available_model_type = ["bert", "xlnet", "albert"]
    if self.model_name not in available_model_name:
        raise Exception("Error: model_name should be in", available_model_name)
    if self.model_type not in available_model_type:
        raise Exception("Error: model_type should be in", available_model_type)

    # Load the pretrained model with a single linear regression layer on top of the pooled output.
    # To load a fine-tuned model instead: e.g. BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
    if self.model_type == 'bert':
        self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for a regression task
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'xlnet':
        self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'albert':
        self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    self.model.cuda()

    if self.model_name == 'xlnet-large-cased':
        self.epochs = 4
        self.lr = 1e-5
        self.eps = 1e-6
    elif self.model_name == 'bert-large-uncased':
        self.epochs = 4
        self.lr = 3e-5
        self.eps = 1e-8
    elif self.model_name == 'albert-xxlarge-v2':
        self.epochs = 3
        self.lr = 5e-5
        self.eps = 1e-6

    # Gradient threshold: gradients whose norm exceeds this value are scaled down to match it.
    self.max_grad_norm = 1.0
    self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)
def load_model(self, model_path: str, do_lower_case=True):
    config = AlbertConfig.from_pretrained(model_path + "/config.json")
    tokenizer = AlbertTokenizer.from_pretrained(model_path)
    # tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2', do_lower_case=do_lower_case)
    model = AlbertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)
    return model, tokenizer
def load_model_and_tokenizer(manifest):
    zipped_model_path = download_zipped_model(manifest, assert_hash=True)
    unzipped_model_dir = get_unzipped_dir_path(zipped_model_path)
    config = AlbertConfig.from_pretrained(unzipped_model_dir)
    model = TFAlbertForSequenceClassification.from_pretrained(unzipped_model_dir, config=config)
    tokenizer = AlbertTokenizer.from_pretrained(unzipped_model_dir)
    return model, tokenizer
def __init__(self, transformer_model, is_train):
    super(LMNER, self).__init__()
    config = AlbertConfig.from_pretrained(transformer_model)
    self.transformer_model = AlbertForMaskedLM.from_pretrained(transformer_model, config=config)
    # Whether to fine-tune the pretrained transformer weights
    for name, param in self.transformer_model.named_parameters():
        param.requires_grad = is_train
def __init__(self, config):
    super(Model, self).__init__()
    self.config = AlbertConfig.from_pretrained(config.albert_config_path)
    self.albert = AlbertModel.from_pretrained(config.albert_model_path, config=self.config)
    for param in self.albert.parameters():
        param.requires_grad = True
    self.fc = nn.Linear(config.hidden_size, config.num_classes)
def load_pretrained_encoder(mpath, config="albert_config.json", model="albert_model.bin"):
    b_config = BC.from_pretrained(opt.join(mpath, config))
    encoder = AlbertModel.from_pretrained(opt.join(mpath, model), config=b_config)
    return encoder
def __init__(self, args, train_dataloader, test_dataloader=None):
    self.args = args
    cuda_condition = torch.cuda.is_available() and args.with_cuda
    self.device = torch.device("cuda" if cuda_condition else "cpu")
    print('Current cuda device ', torch.cuda.current_device())  # check

    if args.weight_load:
        config = AutoConfig.from_pretrained(args.pre_trained_model_path)
        model_state_dict = torch.load(os.path.join(args.pre_trained_model_path, 'pytorch_model.bin'))
        self.model = CXRBERT.from_pretrained(args.pre_trained_model_path, state_dict=model_state_dict,
                                             config=config, args=args).to(self.device)
        print('training restart with mid epoch')
        print(config)
    else:
        if args.bert_model == "albert-base-v2":
            config = AlbertConfig.from_pretrained(args.bert_model)
        elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
            config = AutoConfig.from_pretrained(args.bert_model)
        elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
            config = AutoConfig.from_pretrained(args.bert_model)
        elif args.bert_model == "bert-small-scratch":
            config = BertConfig.from_pretrained("google/bert_uncased_L-4_H-512_A-8")
        elif args.bert_model == "bert-base-scratch":
            config = BertConfig.from_pretrained("bert-base-uncased")
        else:
            config = BertConfig.from_pretrained(args.bert_model)  # bert-base, small, tiny
        self.model = CXRBERT(config, args).to(self.device)

    wandb.watch(self.model)

    if args.with_cuda and torch.cuda.device_count() > 1:
        print("Using %d GPUS for BERT" % torch.cuda.device_count())
        self.model = nn.DataParallel(self.model, device_ids=args.cuda_devices)

    self.train_data = train_dataloader
    self.test_data = test_dataloader

    self.optimizer = AdamW(self.model.parameters(), lr=args.lr)
    self.mlm_criterion = nn.CrossEntropyLoss(ignore_index=-100)
    self.itm_criterion = nn.CrossEntropyLoss()

    self.log_freq = args.log_freq
    self.step_cnt = 0

    print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
def load_pretrained(mpath, config="albert_config.json", model="albert_model.bin"):
    b_config = BC.from_pretrained(opt.join(mpath, config))
    encoder = AlbertModel.from_pretrained(opt.join(mpath, model), config=b_config)
    tokenizer = BertTokenizer.from_pretrained(mpath)
    return encoder, tokenizer
def load_albert(path):
    """Load the ALBERT model and its tokenizer."""
    vocab_file = os.path.join(path, 'vocab.txt')
    tokenizer = BertTokenizer.from_pretrained(vocab_file)
    # print(tokenizer)
    config = AlbertConfig.from_pretrained(path)
    model = AlbertModel.from_pretrained(path, config=config)
    return model, tokenizer
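# Minimal usage sketch for load_albert above, assuming `path` is a local ALBERT checkpoint directory
# containing vocab.txt, config.json and the model weights; the directory name and input text below
# are illustrative only, not from the original snippet.
import torch

model, tokenizer = load_albert('./albert_checkpoint_dir')
inputs = tokenizer("An example sentence.", return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
print(outputs[0].shape)  # token-level hidden states: (batch_size, seq_len, hidden_size)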
def init_model(self, model_name):
    if model_name == 'Bert':
        config = BertConfig.from_pretrained('bert-base-uncased')
        config.hidden_dropout_prob = 0.2
        config.attention_probs_dropout_prob = 0.2
        self.model = BertForMultipleChoice.from_pretrained(
            'pre_weights/bert-base-uncased_model.bin', config=config)
    elif model_name == 'Roberta':
        config = RobertaConfig.from_pretrained('roberta-large')
        config.hidden_dropout_prob = 0.2
        config.attention_probs_dropout_prob = 0.2
        self.model = RobertaForMultipleChoice.from_pretrained(
            'pre_weights/roberta-large_model.bin', config=config)
        # print('load csqa pretrain weights...')
        # self.model.load_state_dict(torch.load(
        #     'checkpoints/commonsenseQA_pretrain_temp.pth'
        # ))
    elif model_name == 'Albert':
        self.model = AlbertForMultipleChoice.from_pretrained(
            'pre_weights/albert-xxlarge_model.bin',
            config=AlbertConfig.from_pretrained('albert-xxlarge-v1'))
    elif model_name == 'RobertaLM':
        config = RobertaConfig.from_pretrained('roberta-large')
        config.hidden_dropout_prob = 0.2
        config.attention_probs_dropout_prob = 0.2
        self.model = RobertaForMultipleChoiceWithLM.from_pretrained(
            'pre_weights/roberta-large_model.bin', config=config)
    elif model_name == 'RobertaLM2':
        self.model = RobertaForMultipleChoiceWithLM2(self.tokenizer)
    elif 'GNN' in model_name:
        self.model = SOTA_goal_model(self.args)
    elif 'LM' in model_name:
        config = RobertaConfig.from_pretrained('roberta-large')
        config.hidden_dropout_prob = 0.2
        config.attention_probs_dropout_prob = 0.2
        self.model = RobertaForMultipleChoiceWithLM.from_pretrained(
            'pre_weights/roberta-large_model.bin', config=config)
    elif 'KBERT' in model_name:
        config = RobertaConfig.from_pretrained('roberta-large')
        config.hidden_dropout_prob = 0.2
        config.attention_probs_dropout_prob = 0.2
        self.model = RobertaForMultipleChoice.from_pretrained(
            'pre_weights/roberta-large_model.bin', config=config)
    else:
        pass

    self.model.to(self.args['device'])
    if torch.cuda.device_count() > 1 and self.args['use_multi_gpu']:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        self.model = torch.nn.DataParallel(self.model)
def __init__(self, coordinator_args: CoordinatorArguments, collab_optimizer_args: CollaborativeOptimizerArguments,
             averager_args: AveragerArguments, dht: hivemind.DHT):
    self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
    self.repo_path = coordinator_args.repo_path
    self.upload_interval = coordinator_args.upload_interval
    self.previous_step = -1

    config = AlbertConfig.from_pretrained(coordinator_args.model_config_path)
    self.model = AlbertForPreTraining(config)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    opt = Lamb(
        optimizer_grouped_parameters,
        lr=0.00176,
        weight_decay=0.01,
        clamp_value=10000.0,
        debias=True,
    )

    adjusted_target_batch_size = collab_optimizer_args.target_batch_size - collab_optimizer_args.batch_size_lead

    self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
        opt=opt,
        dht=dht,
        prefix=experiment_prefix,
        compression_type=hivemind.utils.CompressionType.Value(collab_optimizer_args.compression),
        throughput=collab_optimizer_args.bandwidth,
        target_batch_size=adjusted_target_batch_size,
        client_mode=collab_optimizer_args.client_mode,
        verbose=True,
        start=True,
        **asdict(averager_args),
    )
    self.previous_timestamp = time.time()
def init_model(cachedir='~/hashtag/', no_cuda=True):
    global tokenizer, model
    f_cachedir = os.path.expanduser(cachedir)
    bert_config = AlbertConfig.from_pretrained(f_cachedir)
    model = HashtagClassifier.from_pretrained(f_cachedir, config=bert_config)
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
    model.to(device)
    model.eval()
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
def download_albert_base():
    file = '../input/albert-base-v2'
    config = AlbertConfig.from_pretrained('albert-base-v2')
    config.save_pretrained(file)
    model = AlbertModel.from_pretrained('albert-base-v2')
    model.save_pretrained(file)
    tkn = AlbertTokenizer.from_pretrained('albert-base-v2')
    tkn.save_pretrained(file)
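# After download_albert_base() has populated the local directory, the same artifacts can be loaded
# back without network access. A minimal sketch of that reload; the offline reuse is an assumption
# about how the snippet is meant to be used, not something stated in the original.
local_dir = '../input/albert-base-v2'
config = AlbertConfig.from_pretrained(local_dir)
model = AlbertModel.from_pretrained(local_dir, config=config)
tokenizer = AlbertTokenizer.from_pretrained(local_dir)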
def build_pretrain_feature_model(self):
    mn = self.args.pretrain_feature_model_name
    if 'albert' in mn:
        pretrain_feature_tokenizer = BertTokenizer.from_pretrained(mn)
        config = AlbertConfig.from_pretrained(mn)
        config.output_hidden_states = True
        self.pretrain_feature_model = AlbertModel.from_pretrained(mn, config=config).to(self.device)
    else:
        pretrain_feature_tokenizer = AutoTokenizer.from_pretrained(mn)
        config = AutoConfig.from_pretrained(mn)
        config.output_hidden_states = True
        self.pretrain_feature_model = AutoModel.from_pretrained(mn, config=config).to(self.device)
    self.pretrain_feature_model.requires_grad_(False)
    # self.pretrain_feature_model.requires_grad_(True)

    # The pipeline expects raw text, but we already have ids, so the model is used directly.
    # self.pretrain_feature_pipeline = Pipeline('feature-extraction',
    #     model=self.pretrain_feature_model, tokenizer=pretrain_feature_tokenizer)

    # TODO: pre-compute the features and save them to a file; that uses less memory during training and is faster.
    # XXX: only this tokenizer's vocab is used, not its byte-pair splitting; for now we just split on spaces.
    utils.add_special_tokens_(self.pretrain_feature_model, pretrain_feature_tokenizer)

    # FIXME: these modified args should be saved to the checkpoint file.
    if self.args.pretrain_feature_type == 'mem_n2n':
        self.args.emb_dim = self.pretrain_feature_model.config.hidden_size
        self.args.d_model = self.pretrain_feature_model.config.hidden_size
    elif self.args.pretrain_feature_type == 'feature':
        self.args.emb_dim = self.pretrain_feature_model.config.hidden_size
    else:
        if self.pretrain_feature_model.base_model_prefix != 'bert':
            self.args.emb_dim = self.pretrain_feature_model.config.embedding_size
        else:
            self.args.emb_dim = self.pretrain_feature_model.config.hidden_size
        # XXX: for 'xlnet'
        # self.args.d_model = self.pretrain_feature_model.config.hidden_size
    if 'weight' in self.args.pretrain_feature_type:
        # has little effect
        self.args.d_model = self.pretrain_feature_model.config.hidden_size
        self.args.n_head = self.pretrain_feature_model.config.num_attention_heads
        self.args.d_ff = self.pretrain_feature_model.config.intermediate_size
        self.args.factor_ff = False

    self.vocab = datasets.ChatVocab(pretrain_feature_tokenizer)
    self.input_dim = len(self.vocab)
    self.pad_idx = self.vocab.stoi(utils.PAD)
    self.embeddings = None
    # too slow
    # self.tokenizer = pretrain_feature_tokenizer.tokenize
    self.tokenizer = None
def __init__(self):
    super(AlbertTweetModel, self).__init__()
    config = AlbertConfig.from_pretrained(
        './albert.torch/albert-base-v2/config.json', output_hidden_states=True)
    self.bert = AlbertModel.from_pretrained(
        './albert.torch/albert-base-v2/pytorch_model.bin', config=config)
    self.dropout = nn.Dropout(0.5)
    self.fc = nn.Linear(config.hidden_size, 2)
    nn.init.normal_(self.fc.weight, std=0.2)
    nn.init.normal_(self.fc.bias, 0)
def __init__(self):
    super(Model, self).__init__()
    # Load the pretrained model
    self.config = AlbertConfig.from_pretrained(Config.config_bert_path)
    self.albert = AlbertModel.from_pretrained(Config.model_bert_path, config=self.config)
    for param in self.albert.parameters():
        param.requires_grad = True
    self.qa_outputs = nn.Linear(1024, 2)
    self.loss_fct = CrossEntropyLoss()  # loss function
def model_setting(model_name):
    if model_name == 'bert':
        from transformers import AutoTokenizer, BertForSequenceClassification, BertConfig
        config = BertConfig.from_pretrained("bert-base-uncased", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
        return config, tokenizer, model
    elif model_name == 'albert':
        from transformers import AutoTokenizer, AlbertForSequenceClassification, AlbertConfig
        config = AlbertConfig.from_pretrained("albert-base-v2", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
        model = AlbertForSequenceClassification.from_pretrained("albert-base-v2")
        return config, tokenizer, model
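# A small sketch of how the (config, tokenizer, model) triple returned by model_setting might be used
# for a single classification forward pass. The input sentence is illustrative; note that model_setting
# does not pass its config into from_pretrained, so the two-label head comes from the library defaults.
import torch

config, tokenizer, model = model_setting('albert')
model.eval()
inputs = tokenizer("This movie was surprisingly good.", return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs)[0]  # (batch_size, num_labels)
print(logits.argmax(dim=-1).item())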
def retrieve_conf(trained_condition, trained_vocab):
    albertconf = AlbertConfig.from_pretrained(f'albert-{trained_condition.albert_scale}-v2')
    if 'smaller' in trained_condition.keys():
        if trained_condition.smaller:
            # originally 4H was used for the FFN, but due to memory issues 1H is used instead
            albertconf.hidden_size = trained_condition.hidden_size
            albertconf.num_hidden_layers = trained_condition.num_hidden_layers
            albertconf.num_attention_heads = trained_condition.num_attention_heads
            albertconf.intermediate_size = albertconf.hidden_size
    albertconf.vocab_size = len(trained_vocab.itos)
    albertconf.bos_token_id = trained_vocab.stoi['BOS']
    albertconf.eos_token_id = trained_vocab.stoi['EOS']
    albertconf.pad_token_id = trained_vocab.stoi['PAD']
    albertconf.max_position_embeddings = 40
    return albertconf
def load_model_and_tokenizer():
    unzipped_saved_model_dir = get_unzipped_dir_path(MODEL_ZIP_PATH, UNZIPPED_MODEL_PATH)
    print("Loading pretrained ALBERT classification model")
    start = time.time()
    config = AlbertConfig.from_pretrained(unzipped_saved_model_dir,
                                          num_labels=NUM_LABELS,
                                          max_length=DEFAULT_MAX_LEN)
    model = TFAlbertForSequenceClassification.from_pretrained(unzipped_saved_model_dir, config=config)
    tokenizer = AlbertTokenizer.from_pretrained(unzipped_saved_model_dir, do_lower_case=True)
    duration = time.time() - start
    print(f"Initializing model took {duration}")
    return model, tokenizer
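# Minimal inference sketch for the TensorFlow model returned above, assuming a transformers version
# whose models return an output object with a .logits attribute; the example text is illustrative
# and not part of the original snippet.
import tensorflow as tf

model, tokenizer = load_model_and_tokenizer()
inputs = tokenizer("An example sentence to classify.",
                   return_tensors='tf',
                   truncation=True,
                   max_length=DEFAULT_MAX_LEN)
logits = model(inputs).logits  # shape: (1, NUM_LABELS)
probs = tf.nn.softmax(logits, axis=-1)
print(int(tf.argmax(probs, axis=-1)[0]))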
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1', 'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased', 'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
def load_model(self, config=None):
    """Load the model and return the loaded components.

    Returns:
        [dict] -- [model components]
    """
    print("** loading model.. **")
    tokenizer = BertTokenizer.from_pretrained('../albert-small/', cache_dir=None, do_lower_case=True)
    bert_config = AlbertConfig.from_pretrained('../albert-small/')
    model = PairModel(config=bert_config)
    device = torch.device('cpu')
    state = torch.load(Path('../albert-small/pytorch_model.pt'), map_location=device)
    model.load_state_dict(state['model'])
    model.to(device)
    model.eval()
    self.model = model
    self.tokenizer = tokenizer
def __init__(self, *args, **kwargs):
    super(AlbertForComparison, self).__init__(*args, **kwargs)
    self.model_name = 'albert-base-v2'
    self.config = AlbertConfig.from_pretrained(self.model_name)
    self.config.output_hidden_states = False
    self.embedding_layer = TFAlbertModel.from_pretrained(self.model_name, config=self.config)
    self.pre_classifier = tf.keras.layers.Dense(
        self.config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.initializer_range),
        activation="relu",
        name="pre_classifier",
    )
    self.classifier = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.initializer_range),
        name="classifier",
    )
def get_model_and_tokenizer(cls, model_name):
    model = tokenizer = None
    if model_name == 'Bert':
        model = BertForMultipleChoice.from_pretrained(
            'pre_weights/bert-base-uncased_model.bin',
            config=BertConfig.from_pretrained('bert-base-uncased'))
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif model_name == 'Roberta':
        model = RobertaForMultipleChoice.from_pretrained(
            'pre_weights/roberta-large_model.bin',
            config=RobertaConfig.from_pretrained('roberta-large'))
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    elif model_name == 'Albert':
        model = AlbertForMultipleChoice.from_pretrained(
            'pre_weights/albert-xxlarge_model.bin',
            config=AlbertConfig.from_pretrained('albert-xxlarge-v1'))
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v1')
    elif model_name == 'RobertaLM':
        model = RobertaForMultipleChoiceWithLM.from_pretrained(
            'pre_weights/roberta-large_model.bin',
            config=RobertaConfig.from_pretrained('roberta-large'))
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    elif model_name == 'RobertaLM2':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMultipleChoiceWithLM2(tokenizer)
    elif 'GNN' in model_name:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = SOTA_goal_model(cls.args)
    elif 'LM' in model_name:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMultipleChoiceWithLM.from_pretrained(
            'pre_weights/roberta-large_model.bin',
            config=RobertaConfig.from_pretrained('roberta-large'))
    elif 'KBERT' in model_name:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMultipleChoice.from_pretrained(
            'pre_weights/roberta-large_model.bin',
            config=RobertaConfig.from_pretrained('roberta-large'))
    else:
        pass
    return model, tokenizer
def build_pretrain_feature_model(self):
    mn = self.model_config.pretrain_feature_model_name
    if 'albert' in mn:
        pretrain_feature_tokenizer = BertTokenizer.from_pretrained(mn)
        config = AlbertConfig.from_pretrained(mn)
        config.output_hidden_states = True
        self.pretrain_feature_model = AlbertModel.from_pretrained(mn, config=config).to(self.device)
    else:
        pretrain_feature_tokenizer = AutoTokenizer.from_pretrained(mn)
        config = AutoConfig.from_pretrained(mn)
        config.output_hidden_states = True
        self.pretrain_feature_model = AutoModel.from_pretrained(mn, config=config).to(self.device)
    self.pretrain_feature_model.requires_grad_(False)

    # The pipeline expects raw text, but we already have ids, so the model is used directly.
    # self.pretrain_feature_pipeline = Pipeline('feature-extraction',
    #     model=self.pretrain_feature_model, tokenizer=pretrain_feature_tokenizer)

    # TODO: pre-compute the features and save them to a file; that uses less memory during training and is faster.
    # XXX: only this tokenizer's vocab is used, not its byte-pair splitting; for now we just split on spaces.
    utils.add_special_tokens_(self.pretrain_feature_model, pretrain_feature_tokenizer)

    # FIXME: these modified args should be saved to the checkpoint file.
    # when using features:
    # self.args.emb_dim = self.pretrain_feature_model.config.hidden_size
    # self.model_config.emb_dim = self.pretrain_feature_model.config.hidden_size
    # when using embeddings:
    self.args.emb_dim = self.pretrain_feature_model.config.embedding_size
    self.model_config.emb_dim = self.pretrain_feature_model.config.embedding_size

    self.vocab = datasets.ChatVocab(pretrain_feature_tokenizer)
    self.input_dim = len(self.vocab)
    self.pad_idx = self.vocab.stoi(utils.PAD)

    # The pretrain_feature_model embeddings and weights are no longer needed; the trained model is used instead.
    self.pretrain_feature_model = None
    self.tokenizer = pretrain_feature_tokenizer.tokenize
def train(rank, args): ####################### ## distributed if args.distributed_enabled: torch.distributed.init_process_group( backend='nccl', init_method='env://', world_size=args.distributed_world_size, rank=rank) if args.gpu_enabled: device = torch.device('cuda:{}'.format(rank)) else: device = torch.device('cpu') is_master = True if not args.distributed_enabled else args.distributed_enabled and rank == 0 ####################### ## preamble set_gpus(rank) set_seed(rank) set_cuda(deterministic=args.gpu_deterministic) output_dir = f'{args.output_dir}/{rank}' os.makedirs(output_dir, exist_ok=False) setup_logging(filename=f'{output_dir}/output.log', console=is_master) ####################### ## dataset tokenizer = new_tokenizer(vocab_file=args.data_vocab_file) vocab_size = len(tokenizer.vocab) ds_train = wrap_example_builder( dataset=load_owt(owt_dir=args.data_dir, n_tensors_per_file=args.data_n_tensors_per_file), vocab=tokenizer.vocab, max_length=args.data_max_seq_length) pad_token_id = tokenizer.vocab['[PAD]'] mask_token_id = tokenizer.vocab['[MASK]'] cls_token_id = tokenizer.vocab['[CLS]'] sep_token_id = tokenizer.vocab['[SEP]'] def collate_batch(examples): input_ids = torch.nn.utils.rnn.pad_sequence( [example['input_ids'] for example in examples], batch_first=True, padding_value=pad_token_id) input_mask = torch.nn.utils.rnn.pad_sequence( [example['input_mask'] for example in examples], batch_first=True, padding_value=pad_token_id) segment_ids = torch.nn.utils.rnn.pad_sequence( [example['segment_ids'] for example in examples], batch_first=True, padding_value=pad_token_id) return input_ids, input_mask, segment_ids def cycle(iterable): while True: for x in iterable: yield x ds_train_loader = iter( cycle( DataLoader(ds_train, batch_size=args.opt_batch_size, collate_fn=collate_batch))) ####################### ## model def to_distributed_model(model): return model if not args.distributed_enabled else torch.nn.parallel.DistributedDataParallel( model, device_ids=[rank], find_unused_parameters=True) def tie_weights(generator, discriminator): generator.electra.embeddings.word_embeddings = discriminator.electra.embeddings.word_embeddings generator.electra.embeddings.position_embeddings = discriminator.electra.embeddings.position_embeddings generator.electra.embeddings.token_type_embeddings = discriminator.electra.embeddings.token_type_embeddings class LogitsAdapter(torch.nn.Module): def __init__(self, adaptee): super().__init__() self.adaptee = adaptee def forward(self, *args, **kwargs): return self.adaptee(*args, **kwargs)[0] from transformers import AutoConfig, ElectraForMaskedLM, ElectraForPreTraining generator = ElectraForMaskedLM( AutoConfig.from_pretrained(args.model_generator)) discriminator = AdaptedDiscriminator( AlbertConfig.from_pretrained(args.model_discriminator)) tie_weights(generator, discriminator) model = to_distributed_model( Electra(LogitsAdapter(generator), LogitsAdapter(discriminator), num_tokens=vocab_size, mask_token_id=mask_token_id, pad_token_id=pad_token_id, mask_prob=args.model_mask_prob, mask_ignore_token_ids=[ tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]'] ], random_token_prob=0.0).to(device)) ####################### ## optimizer def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): def lr_lambda(current_step): learning_rate = max( 0.0, 1. 
- (float(current_step) / float(num_training_steps))) learning_rate *= min(1.0, float(current_step) / float(num_warmup_steps)) return learning_rate return LambdaLR(optimizer, lr_lambda, last_epoch) def get_params_without_weight_decay_ln(named_params, weight_decay): no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ { 'params': [ p for n, p in named_params if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay, }, { 'params': [ p for n, p in named_params if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0, }, ] return optimizer_grouped_parameters optimizer = torch.optim.AdamW(get_params_without_weight_decay_ln( model.named_parameters(), weight_decay=0.1), lr=args.opt_lr, betas=(0.9, 0.999), eps=1e-08) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.opt_warmup_steps, num_training_steps=args.opt_num_training_steps) scaler = torch.cuda.amp.GradScaler(enabled=args.gpu_mixed_precision) ####################### ## train t, steps_s, eta_m = time(), 0., 0 for step in range(args.opt_num_training_steps + 1): input_ids, input_mask, segment_ids = next(ds_train_loader) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) assert input_ids.shape[1] <= args.data_max_seq_length optimizer.zero_grad() with torch.cuda.amp.autocast(enabled=args.gpu_mixed_precision): loss, loss_mlm, loss_disc, acc_gen, acc_disc, disc_labels, disc_pred = model( input_ids, attention_mask=input_mask, token_type_ids=segment_ids) scaler.scale(loss).backward() scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) scaler.step(optimizer) scaler.update() scheduler.step() metrics = { 'step': (step, '{:8d}'), 'loss': (loss.item(), '{:8.5f}'), 'loss_mlm': (loss_mlm.item(), '{:8.5f}'), 'loss_disc': (loss_disc.item(), '{:8.5f}'), 'acc_gen': (acc_gen.item(), '{:5.3f}'), 'acc_disc': (acc_disc.item(), '{:5.3f}'), 'lr': (scheduler.get_last_lr()[0], '{:8.7f}'), 'steps': (steps_s, '{:4.1f}/s'), 'eta': (eta_m, '{:4d}m'), } if step % args.step_log == 0: sep = ' ' * 2 logger.info( sep.join([ f'{k}: {v[1].format(v[0])}' for (k, v) in metrics.items() ])) if step > 0 and step % 100 == 0: t2 = time() steps_s = 100. / (t2 - t) eta_m = int(((args.opt_num_training_steps - step) / steps_s) // 60) t = t2 if step % 200 == 0: logger.info( np.array2string(disc_labels[0].cpu().numpy(), threshold=sys.maxsize, max_line_width=sys.maxsize)) logger.info( np.array2string(disc_pred[0].cpu().numpy(), threshold=sys.maxsize, max_line_width=sys.maxsize)) if step > 0 and step % args.step_ckpt == 0 and is_master: discriminator.electra.save_pretrained( f'{args.output_dir}/ckpt/{step}')
def main(): parser = argparse.ArgumentParser() parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") args = parser.parse_args() torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.device = device seed = 5003 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True # prepare input import pickle with open('../1_1/distribution_dict1.pickle', 'rb') as f: distribution_dict1 = pickle.load(f) with open('../1_1/distribution_dict2.pickle', 'rb') as f: distribution_dict2 = pickle.load(f) with open('../1_1/distribution_dict3.pickle', 'rb') as f: distribution_dict3 = pickle.load(f) with open('../1_1/distribution_dict4.pickle', 'rb') as f: distribution_dict4 = pickle.load(f) json_dir = '../../input/simplified-nq-train.jsonl' max_data = 999999999 id_list = [] neg_id_list = [] data_dict = {} neg_data_dict = {} with open(json_dir) as f: for n, line in tqdm(enumerate(f)): if n > max_data: break data = json.loads(line) is_pos = False annotations = data['annotations'][0] if annotations['yes_no_answer'] == 'YES': is_pos = True elif annotations['yes_no_answer'] == 'NO': is_pos = True elif annotations['short_answers']: is_pos = True elif annotations['long_answer']['candidate_index'] != -1: is_pos = True if is_pos and len(data['long_answer_candidates']) > 1: data_id = data['example_id'] id_list.append(data_id) # random sampling if data_id in distribution_dict1: candidate_index_list = np.array( distribution_dict1[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict1[data_id]['prob_list']), 1) elif data_id in distribution_dict2: candidate_index_list = np.array( distribution_dict2[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict2[data_id]['prob_list']), 1) elif data_id in distribution_dict3: candidate_index_list = np.array( distribution_dict3[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict3[data_id]['prob_list']), 1) else: candidate_index_list = np.array( distribution_dict4[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict4[data_id]['prob_list']), 1) prob_list /= sum(prob_list) negative_candidate_index = random_sample_negative_candidates( candidate_index_list, prob_list) # doc_words = data['document_text'].split() # negative candidate = data['long_answer_candidates'][ negative_candidate_index] negative_candidate_words = doc_words[ candidate['start_token']:candidate['end_token']] negative_candidate_start = candidate['start_token'] negative_candidate_end = candidate['end_token'] # positive candidate = data['long_answer_candidates'][ annotations['long_answer']['candidate_index']] positive_candidate_words = doc_words[ candidate['start_token']:candidate['end_token']] positive_candidate_start = candidate['start_token'] positive_candidate_end = candidate['end_token'] # initialize data_dict data_dict[data_id] = { 'question_text': data['question_text'], 'annotations': data['annotations'], 'positive_text': positive_candidate_words, 'positive_start': positive_candidate_start, 'positive_end': positive_candidate_end, 'negative_text': negative_candidate_words, 'negative_start': negative_candidate_start, 'negative_end': negative_candidate_end, } elif (not is_pos) and len(data['long_answer_candidates']) >= 1: data_id = data['example_id'] neg_id_list.append(data_id) # random 
sampling if data_id in distribution_dict1: candidate_index_list = np.array( distribution_dict1[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict1[data_id]['prob_list']), 1) elif data_id in distribution_dict2: candidate_index_list = np.array( distribution_dict2[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict2[data_id]['prob_list']), 1) elif data_id in distribution_dict3: candidate_index_list = np.array( distribution_dict3[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict3[data_id]['prob_list']), 1) else: candidate_index_list = np.array( distribution_dict4[data_id]['candidate_index_list']) prob_list = np.power( np.array(distribution_dict4[data_id]['prob_list']), 1) prob_list /= sum(prob_list) negative_candidate_index = random_sample_negative_candidates( candidate_index_list, prob_list) # doc_words = data['document_text'].split() # negative candidate = data['long_answer_candidates'][ negative_candidate_index] negative_candidate_words = doc_words[ candidate['start_token']:candidate['end_token']] negative_candidate_start = candidate['start_token'] negative_candidate_end = candidate['end_token'] # initialize data_dict neg_data_dict[data_id] = { 'question_text': data['question_text'], 'negative_text': negative_candidate_words, 'negative_start': negative_candidate_start, 'negative_end': negative_candidate_end, } print(len(id_list), len(neg_id_list)) random.shuffle(id_list) random.shuffle( neg_id_list ) # length of neg_id_list must be longer than id_list otherwise data generator will error # hyperparameters max_seq_len = 360 max_question_len = 64 learning_rate = 0.000004 batch_size = 3 ep = 0 # build model if args.local_rank not in [-1, 0]: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() model_path = 'model/' config = AlbertConfig.from_pretrained(model_path) config.num_labels = 5 config.vocab_size = 30010 tokenizer = AlbertTokenizer.from_pretrained(model_path, do_lower_case=True) #print(tokenizer.unk_token_id) model = AlbertForQuestionAnswering.from_pretrained('weights/epoch1/', config=config) # add new tokens new_token_dict = { '<P>': 'qw1', '<Table>': 'qw2', '<Tr>': 'qw3', '<Ul>': 'qw4', '<Ol>': 'qw5', '<Fl>': 'qw6', '<Li>': 'qw7', '<Dd>': 'qw8', '<Dt>': 'qw9', } new_token_list = [ 'qw1', 'qw2', 'qw3', 'qw4', 'qw5', 'qw6', 'qw7', 'qw8', 'qw9', 'qw99', ] num_added_toks = tokenizer.add_tokens(new_token_list) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) if args.local_rank == 0: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() model.to(args.device) optimizer = optim.Adam(model.parameters(), lr=learning_rate) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # training # iterator for training train_datagen = TFQADataset(id_list=id_list, neg_id_list=neg_id_list) train_sampler = DistributedSampler(train_datagen) train_collate = Collator(id_list=id_list, neg_id_list=neg_id_list, data_dict=data_dict, neg_data_dict=neg_data_dict, new_token_dict=new_token_dict, tokenizer=tokenizer, max_seq_len=max_seq_len, max_question_len=max_question_len) train_generator = DataLoader(dataset=train_datagen, sampler=train_sampler, collate_fn=train_collate, 
batch_size=batch_size, num_workers=3, pin_memory=True) # train losses1 = AverageMeter() # start losses2 = AverageMeter() # end losses3 = AverageMeter() # class accuracies1 = AverageMeter() # start accuracies2 = AverageMeter() # end accuracies3 = AverageMeter() # class model.train() for j, (batch_input_ids, batch_attention_mask, batch_token_type_ids, batch_y_start, batch_y_end, batch_y) in enumerate(train_generator): batch_input_ids = batch_input_ids.cuda() batch_attention_mask = batch_attention_mask.cuda() batch_token_type_ids = batch_token_type_ids.cuda() labels1 = batch_y_start.cuda() labels2 = batch_y_end.cuda() labels3 = batch_y.cuda() logits1, logits2, logits3 = model(batch_input_ids, batch_attention_mask, batch_token_type_ids) y_true = (batch_y_start, batch_y_end, batch_y) loss1, loss2, loss3 = loss_fn((logits1, logits2, logits3), (labels1, labels2, labels3)) loss = loss1 + loss2 + loss3 acc1, n_position1 = get_position_accuracy(logits1, labels1) acc2, n_position2 = get_position_accuracy(logits2, labels2) acc3, n_position3 = get_position_accuracy(logits3, labels3) losses1.update(loss1.item(), n_position1) losses2.update(loss2.item(), n_position2) losses3.update(loss3.item(), n_position3) accuracies1.update(acc1, n_position1) accuracies2.update(acc2, n_position2) accuracies3.update(acc3, n_position2) optimizer.zero_grad() with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() if args.local_rank == 0: print( 'epoch: {}, train_loss1: {}, train_loss2: {}, train_loss3: {}, train_acc1: {}, train_acc2: {}, train_acc3: {}' .format(ep, losses1.avg, losses2.avg, losses3.avg, accuracies1.avg, accuracies2.avg, accuracies3.avg), flush=True) out_dir = 'weights/epoch2/' if not os.path.exists(out_dir): os.makedirs(out_dir) torch.save(model.module.state_dict(), out_dir + 'pytorch_model.bin')