def __init__(
    self,
    pretrained_model_name=None,
    config_filename=None,
    vocab_size=None,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    max_position_embeddings=512,
):
    super().__init__()

    # Check that exactly one of pretrained_model_name, config_filename, and
    # vocab_size was passed in
    total = 0
    if pretrained_model_name is not None:
        total += 1
    if config_filename is not None:
        total += 1
    if vocab_size is not None:
        total += 1

    if total != 1:
        raise ValueError(
            "Only one of pretrained_model_name, vocab_size, "
            + "or config_filename should be passed into the "
            + "ALBERT constructor."
        )

    # Dispatch on whichever argument was provided (exactly one is set at this point).
    if vocab_size is not None:
        config = AlbertConfig(
            vocab_size_or_config_json_file=vocab_size,
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
        )
        model = AlbertModel(config)
    elif pretrained_model_name is not None:
        model = AlbertModel.from_pretrained(pretrained_model_name)
    elif config_filename is not None:
        config = AlbertConfig.from_json_file(config_filename)
        model = AlbertModel(config)
    else:
        raise ValueError(
            "Either pretrained_model_name or vocab_size must"
            + " be passed into the ALBERT constructor"
        )

    model.to(self._device)

    self.add_module("albert", model)
    self.config = model.config
    self._hidden_size = model.config.hidden_size
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        num_hidden_groups = 1
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        num_hidden_groups = 1
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 768
        num_hidden_groups = 1
        intermediate_size = 3072
        num_attention_heads = 12
    else:
        raise ValueError("Unknown model size; expected one of: debug, small, base")

    generator_config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        num_hidden_groups=num_hidden_groups,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )

    discriminator_config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        num_hidden_groups=num_hidden_groups,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_attention_heads=num_attention_heads,
    )

    model = Electra(args, gen_config=generator_config, dis_config=discriminator_config)
    return model
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
def __init__(self, my_config, args):
    super(NqModel, self).__init__()
    # albert_base_configuration = AlbertConfig(
    #     vocab_size=30000, hidden_size=768, num_attention_heads=12,
    #     intermediate_size=3072, attention_probs_dropout_prob=0)
    self.my_mask = None
    self.args = args

    # mfeb/albert-xxlarge-v2-squad2
    self.bert_config = AlbertConfig.from_pretrained("albert-xxlarge-v2")
    # self.bert_config.gradient_checkpointing = True
    # self.bert_config.Extgradient_checkpointing = True
    self.bert = AlbertModel.from_pretrained("albert-xxlarge-v2", config=self.bert_config)
    # self.bert = AlbertModel.from_pretrained("albert-base-v2")
    my_config.hidden_size = self.bert.config.hidden_size
    self.right = 0
    self.all = 0
    # self.bert = AlbertModel(albert_base_configuration)
    # self.bert2 = BertModel(bert_config)
    # self.bert = BertModel(BertConfig())
    # self.bert = RobertaModel(RobertaConfig(max_position_embeddings=514, vocab_size=50265))
    # print(my_config, bert_config)

    # self.tok_dense = nn.Linear(my_config.hidden_size, my_config.hidden_size)
    self.tok_dense = nn.Linear(my_config.hidden_size * 2, my_config.hidden_size * 2)
    # self.tok_dense2 = nn.Linear(my_config.hidden_size, my_config.hidden_size)
    # self.para_dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
    # self.doc_dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)

    self.dropout = nn.Dropout(my_config.hidden_dropout_prob)
    self.tok_outputs = nn.Linear(my_config.hidden_size * 2, 1)  # tuned to avoid falling into bad local optima
    # self.tok_outputs2 = nn.Linear(my_config.hidden_size, 1)
    # config.max_token_len, config.max_token_relative
    # self.para_outputs = nn.Linear(self.config.hidden_size, 1)
    # self.answer_type_outputs = nn.Linear(self.config.hidden_size, 2)
    # self.tok_to_label = nn.Linear(my_config.max_token_len, 2)
    # self.par_to_label = nn.Linear(my_config.max_paragraph_len, 2)

    self.encoder = Encoder(my_config)
    # self.encoder2 = Encoder(my_config)
    self.my_config = my_config

    self.model_choice = None
    self.ground_answer = None

    self.ACC = 0
    self.ALL = 0
    self.ErrId = []
def __init__(self, albert_name="ALBERT-base", device="cuda"):
    super().__init__()
    if albert_name == "ALBERT-base":
        albert_configuration = AlbertConfig(hidden_size=768,
                                            num_attention_heads=12,
                                            intermediate_size=3072)
    elif albert_name == "ALBERT-xxlarge":
        albert_configuration = AlbertConfig()
    else:
        raise ValueError(f"Unsupported albert_name: {albert_name}")

    self.device = device
    self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    # Note: the pretrained 'albert-base-v2' weights below are loaded with their own
    # config, regardless of the configuration built above.
    self.model = AlbertModel.from_pretrained('albert-base-v2').to(self.device)
    self.linear = nn.Linear(self.model.config.embedding_size, 2).to(self.device)
    self.dropout = nn.Dropout(0.1).to(self.device)
def __init__(self, path: str, device: str = 'cpu'):
    """Init the NER Albert."""
    if not os.path.exists(path):
        raise NotADirectoryError(
            f"{os.path.abspath(path)} must be a directory containing the model files: "
            "config, tokenizer, weights.")
    files = os.listdir(path)
    if CONFIG_JSON_FILE not in files:
        raise FileNotFoundError(f"{CONFIG_JSON_FILE} must be in {path}.")
    if WEIGHTS_FILE not in files:
        raise FileNotFoundError(f"{WEIGHTS_FILE} must be in {path}.")

    with open(os.path.join(path, CONFIG_JSON_FILE), "r") as f:
        config = json.load(f)

    self.tokenizer = AutoTokenizer.from_pretrained(path)
    weights = torch.load(os.path.join(path, WEIGHTS_FILE),
                         map_location=lambda storage, loc: storage)

    # Load pretrained model/tokenizer
    config = AlbertConfig.from_dict(config)
    self.model = AlbertForTokenClassification(config)
    self.model.load_state_dict(weights)
    self.model = self.model.eval()

    self.args = albert_args_ner
    if device == "cuda":
        logger.debug("Setting model with CUDA")
        self.args['device'] = 'cuda'
        self.model.to('cuda')
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = AlbertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
        return_dict=True,
    )

    return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_albert_for_comparison():
    model_name = 'albert-base-v2'

    config = AlbertConfig.from_pretrained(model_name)
    config.output_hidden_states = False

    input_ids = tf.keras.Input(shape=(128,), name='input_ids', dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(128,), name='attention_mask', dtype=tf.int32)

    transformer_model = TFAlbertModel.from_pretrained(model_name, config=config)
    embedding_layer = transformer_model([input_ids, attention_mask])[0]

    X = tf.keras.layers.Dense(
        config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        activation="relu",
        name="pre_classifier",
    )(embedding_layer[:, 0])
    X = tf.keras.layers.Dropout(config.classifier_dropout_prob)(X)
    output_ = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        name="classifier")(X)

    return tf.keras.Model([input_ids, attention_mask], output_)
def main():
    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
def load_model(self):
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.output_encoded_layers = True
    args.output_attention_layers = True
    args.output_att_score = True
    args.output_att_sum = True
    self.args = args
    # Parse the config files; the teacher and student models share the same vocab.
    self.vocab_file = "albert_model/vocab.txt"
    # The teacher config and the fine-tuned teacher model are used here; they can be
    # swapped for the student config and the distilled student model.
    # student config: config/chinese_bert_config_L4t.json
    # distil student model: distil_model/gs8316.pkl
    self.bert_config_file_S = "albert_model/config.json"
    self.tuned_checkpoint_S = "trained_teacher_model/test_components.pkl"
    self.max_seq_length = 70
    # Batch size used for prediction
    self.predict_batch_size = 64

    # Load the student config and check that the max sequence length fits within the configured length.
    bert_config_S = AlbertConfig.from_json_file(self.bert_config_file_S)
    bert_config_S.num_labels = self.num_labels

    # Load the tokenizer
    tokenizer = BertTokenizer(vocab_file=self.vocab_file)

    # Load the model
    model_S = AlbertSPC(bert_config_S)
    state_dict_S = torch.load(self.tuned_checkpoint_S, map_location=self.device)
    model_S.load_state_dict(state_dict_S)
    if self.verbose:
        print("Model loaded")
    return tokenizer, model_S
def __init__(self, args, token_vocab_size, output_dim=100):
    super(LMCDecoderBERT, self).__init__()
    self.pool_layers = args.pool_bert

    if args.debug_model:
        bert_dim = 100
        num_hidden_layers = 1
        embedding_size = 100
        intermediate_size = 100
        output_dim = 100
    else:
        bert_dim = 256
        num_hidden_layers = 2
        embedding_size = 128
        intermediate_size = 256

    num_attention_heads = max(1, bert_dim // 64)
    print('Using {} attention heads in decoder'.format(num_attention_heads))

    config = AlbertConfig(
        vocab_size=token_vocab_size,
        embedding_size=embedding_size,
        hidden_size=bert_dim,
        num_hidden_layers=num_hidden_layers,
        intermediate_size=intermediate_size,  # 3072 is default
        num_attention_heads=num_attention_heads,
        output_hidden_states=self.pool_layers
    )

    self.bert = AlbertModel(config)
    self.u = nn.Linear(bert_dim, output_dim, bias=True)
    self.v = nn.Linear(bert_dim, 1, bias=True)
    self.att_linear = nn.Linear(bert_dim, 1, bias=True)
    self.dropout = nn.Dropout(0.2)
def prediction():
    # data = input('Please enter test data: ')
    data = "Don't give me your attitude!"
    print(data)
    tokenized_data = tokenizer.tokenize(data)
    tokenized_data.insert(0, "[CLS]")
    tokenized_data.append("[SEP]")
    data_indexed = tokenizer.convert_tokens_to_ids(tokenized_data)
    data = torch.from_numpy(np.array(data_indexed)).to(device)
    data = data.unsqueeze(0)  # [1, seq_length]

    config = AlbertConfig(hidden_size=768)
    model = ALBertClassifyModel(config, num_class=2, fc_dropout=DROPOUT)
    model.load_state_dict(torch.load(SAVE_MODEL_PATH))
    model.to(device)
    model.eval()
    softmax = nn.Softmax(dim=1)
    with torch.no_grad():
        predict = model(data)
        predict_softmax = softmax(predict)
        print(predict_softmax)
        predict = torch.argmax(predict_softmax, dim=1)
        print(predict)
def from_hocon(cls: Type[QueryCodeSiamese], config: ConfigTree) -> QueryCodeSiamese:
    """Load Query1Code1_CodeSearchModel from a config tree"""
    if "training.model.encoder.type" in config:
        if config["training.model.encoder.type"] == "albert":
            logger.info("Creating QueryCodeSiamese with Albert encoder")
            albert_config = AlbertConfig(**config["training.model.encoder"])
            encoder = PreTrainedModelRecordable(AlbertModel(albert_config))
        elif config["training.model.encoder.type"] == "bert":
            logger.info("Creating QueryCodeSiamese with Bert encoder")
            bert_config = BertConfig(**config["training.model.encoder"])
            encoder = PreTrainedModelRecordable(BertModel(bert_config))
    else:
        # default is BERT now
        logger.info("Creating QueryCodeSiamese with Bert encoder")
        bert_config = BertConfig(**config["training.model.encoder"])
        encoder = PreTrainedModelRecordable(BertModel(bert_config))

    model = QueryCodeSiamese(
        encoder=encoder,
        pooler=MeanWeightedPooler(
            input_size=config["training.model.encoder.hidden_size"]))
    return model
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    attention_mask = None
    if self.use_attention_mask:
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    config = AlbertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_decoder=False,
        initializer_range=self.initializer_range,
    )

    return config, input_ids, token_type_ids, attention_mask
def load_macbert_model(self):
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.output_encoded_layers = True
    args.output_attention_layers = True
    args.output_att_score = True
    args.output_att_sum = True
    self.args = args
    # Parse the config files; the teacher and student models share the same vocab.
    self.vocab_file = "mac_bert_model/vocab.txt"
    # The teacher config and the fine-tuned teacher model are used here; they can be
    # swapped for the student config and the distilled student model.
    # student config: config/chinese_bert_config_L4t.json
    # distil student model: distil_model/gs8316.pkl
    self.bert_config_file_S = "mac_bert_model/config.json"
    self.tuned_checkpoint_S = "trained_teacher_model/macbert_teacher_max75len_5000.pkl"

    # Load the student config and check that the max sequence length fits within the configured length.
    bert_config_S = AlbertConfig.from_json_file(self.bert_config_file_S)

    # Load the tokenizer
    tokenizer = BertTokenizer(vocab_file=self.vocab_file)

    # Load the model
    model_S = AlbertSPC(bert_config_S)
    state_dict_S = torch.load(self.tuned_checkpoint_S, map_location=self.device)
    model_S.load_state_dict(state_dict_S)
    if self.verbose:
        print("Model loaded")
    self.predict_tokenizer = tokenizer
    self.predict_model = model_S
    logger.info("MacBERT prediction model loaded")
def __init__(self, config):
    super(AlBert, self).__init__()
    model_config = AlbertConfig.from_pretrained(
        config.config_file,
        num_labels=config.num_labels,
        finetuning_task=config.task,
    )
    self.albert = AlbertModel.from_pretrained(
        config.model_name_or_path,
        config=model_config,
    )
    if config.requires_grad:
        for param in self.albert.parameters():
            param.requires_grad = True
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # add the weighted layer
    self.hidden_weight = config.weighted_layer_tag  # must modify the config.json
    self.pooling_tag = config.pooling_tag
    if self.hidden_weight:
        self.weight_layer = config.weighted_layer_num
        # self.weight = torch.zeros(self.weight_layer).to(config.device)
        self.weight = torch.nn.Parameter(torch.FloatTensor(self.weight_layer),
                                         requires_grad=True)
        self.softmax = nn.Softmax()
        self.pooler = nn.Sequential(nn.Linear(768, 768), nn.Tanh())
    elif self.pooling_tag:
        self.maxPooling = nn.MaxPool1d(64)
        self.avgPooling = nn.AvgPool1d(64)
        self.pooler = nn.Sequential(nn.Linear(768 * 3, 768), nn.Tanh())
def __init__(self, model_name, model_type):
    """
    Hyper-parameters found with the validation set:
        xlnet-large-cased  : epochs = 4, learning_rate = 1e-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epochs = 4, learning_rate = 3e-5, batch_size = 16, epsilon = 1e-8
        albert-xxlarge-v2  : epochs = 3, learning_rate = 5e-5, batch_size = 8,  epsilon = 1e-6
    To be improved...
    """
    self.model_name = model_name
    self.model_type = model_type

    # Per the transformers library, a batch size of 16 or 32 is advised for training.
    # For memory reasons we use 16. Gradient accumulation has not led to a notable
    # improvement and therefore is not used here.
    if model_type == 'albert':
        self.batch_size = 8
    else:
        self.batch_size = 16

    available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
    available_model_type = ["bert", "xlnet", "albert"]
    if self.model_name not in available_model_name:
        raise Exception("Error: model_name should be in", available_model_name)
    if self.model_type not in available_model_type:
        raise Exception("Error: model_type should be in", available_model_type)

    # Load the pretrained model with a single linear regression layer on top of the pooled output.
    # To load our fine-tuned model: e.g. BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
    if self.model_type == 'bert':
        self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for regression task
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'xlnet':
        self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'albert':
        self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    self.model.cuda()

    if self.model_name == 'xlnet-large-cased':
        self.epochs = 4
        self.lr = 1e-5
        self.eps = 1e-6
    elif self.model_name == 'bert-large-uncased':
        self.epochs = 4
        self.lr = 3e-5
        self.eps = 1e-8
    elif self.model_name == 'albert-xxlarge-v2':
        self.epochs = 3
        self.lr = 5e-5
        self.eps = 1e-6

    self.max_grad_norm = 1.0  # Gradient threshold: gradient norms that exceed this value are scaled down to match it.

    self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)
def load_model(self, model_path: str, do_lower_case=True):
    config = AlbertConfig.from_pretrained(model_path + "/config.json")
    tokenizer = AlbertTokenizer.from_pretrained(model_path)
    # tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2', do_lower_case=do_lower_case)
    model = AlbertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)
    return model, tokenizer
def load_model_and_tokenizer(manifest):
    zipped_model_path = download_zipped_model(manifest, assert_hash=True)
    unzipped_model_dir = get_unzipped_dir_path(zipped_model_path)
    config = AlbertConfig.from_pretrained(unzipped_model_dir)
    model = TFAlbertForSequenceClassification.from_pretrained(unzipped_model_dir, config=config)
    tokenizer = AlbertTokenizer.from_pretrained(unzipped_model_dir)
    return model, tokenizer
def __init__(self, transformer_model, is_train):
    super(LMNER, self).__init__()
    config = AlbertConfig.from_pretrained(transformer_model)
    self.transformer_model = AlbertForMaskedLM.from_pretrained(transformer_model, config=config)
    # Whether to fine-tune the transformer weights
    for name, param in self.transformer_model.named_parameters():
        param.requires_grad = is_train
def launch(training_flag, test_flag):
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    if training_flag:
        model = AlbertForTokenClassification.from_pretrained('albert-base-v2', num_labels=len(tags_vals))

        ## ---------- 12. Optimizer -> weight regularization helps reduce overfitting in deep networks
        """
        Recent Keras guidance (2020): rates around 0.01 seem to be the best hyperparameter
        for weight regularization of weight layers, e.g.:

            from keras.layers import LSTM
            from keras.regularizers import l2
            model.add(LSTM(32, kernel_regularizer=l2(0.01),
                           recurrent_regularizer=l2(0.01),
                           bias_regularizer=l2(0.01)))

        Note: BERT does not include beta and gamma parameters for optimization.
        """
        FULL_FINETUNING = True
        if FULL_FINETUNING:
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [{
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01
            }, {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0
            }]
        else:
            param_optimizer = list(model.classifier.named_parameters())
            optimizer_grouped_parameters = [{
                "params": [p for n, p in param_optimizer]
            }]
        optimizer = Adam(optimizer_grouped_parameters, lr=args.lr)

        launch_training(training_path=args.training_data,
                        training_epochs=args.epochs,
                        valid_path=args.validate_data,
                        training_batch_size=1,
                        model=model,
                        model_path=model_path,
                        tokenizer=tokenizer,
                        optimizer=optimizer)

    if test_flag:
        if args.save:
            model_path = args.save + 'pytorch_model.bin'
            config = AlbertConfig.from_json_file(args.save + '/config.json')
            model = AlbertForTokenClassification.from_pretrained(args.save, config=config)
        else:
            model = AlbertForTokenClassification.from_pretrained('albert-base-v2', num_labels=len(tags_vals))
        launch_test_directory(test_path=test_flag,
                              model=model,
                              tokenizer=tokenizer)
def __init__(self, bert_model='bert-base-cased'):
    super(BERTRepresenter, self).__init__()
    if 'albert' in bert_model.lower():
        config = AlbertConfig()
        # from_pretrained loads the checkpoint's own config; the default config above is unused.
        self.bert = AlbertModel.from_pretrained(bert_model)
    else:
        config = BertConfig()
        # config = BertConfig(vocab_size=24000, hidden_size=264)
        self.bert = BertModel.from_pretrained(bert_model)
def __init__(self, config):
    super(Model, self).__init__()
    self.config = AlbertConfig.from_pretrained(config.albert_config_path)
    self.albert = AlbertModel.from_pretrained(config.albert_model_path, config=self.config)
    for param in self.albert.parameters():
        param.requires_grad = True
    self.fc = nn.Linear(config.hidden_size, config.num_classes)
def load_pretrained_encoder(mpath,
                            config="albert_config.json",
                            model="albert_model.bin"):
    b_config = BC.from_pretrained(opt.join(mpath, config))
    encoder = AlbertModel.from_pretrained(opt.join(mpath, model), config=b_config)
    return encoder
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)

    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def __init__(self, args, train_dataloader, test_dataloader=None):
    self.args = args

    cuda_condition = torch.cuda.is_available() and args.with_cuda
    self.device = torch.device("cuda" if cuda_condition else "cpu")
    print('Current cuda device ', torch.cuda.current_device())  # check

    if args.weight_load:
        config = AutoConfig.from_pretrained(args.pre_trained_model_path)
        model_state_dict = torch.load(
            os.path.join(args.pre_trained_model_path, 'pytorch_model.bin'))
        self.model = CXRBERT.from_pretrained(args.pre_trained_model_path,
                                             state_dict=model_state_dict,
                                             config=config,
                                             args=args).to(self.device)
        print('training restart with mid epoch')
        print(config)
    else:
        if args.bert_model == "albert-base-v2":
            config = AlbertConfig.from_pretrained(args.bert_model)
        elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
            config = AutoConfig.from_pretrained(args.bert_model)
        elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
            config = AutoConfig.from_pretrained(args.bert_model)
        elif args.bert_model == "bert-small-scratch":
            config = BertConfig.from_pretrained("google/bert_uncased_L-4_H-512_A-8")
        elif args.bert_model == "bert-base-scratch":
            config = BertConfig.from_pretrained("bert-base-uncased")
        else:
            config = BertConfig.from_pretrained(args.bert_model)  # bert-base, small, tiny
        self.model = CXRBERT(config, args).to(self.device)

    wandb.watch(self.model)

    if args.with_cuda and torch.cuda.device_count() > 1:
        print("Using %d GPUS for BERT" % torch.cuda.device_count())
        self.model = nn.DataParallel(self.model, device_ids=args.cuda_devices)

    self.train_data = train_dataloader
    self.test_data = test_dataloader

    self.optimizer = AdamW(self.model.parameters(), lr=args.lr)

    self.mlm_criterion = nn.CrossEntropyLoss(ignore_index=-100)
    self.itm_criterion = nn.CrossEntropyLoss()

    self.log_freq = args.log_freq
    self.step_cnt = 0

    print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
def load_pretrained(mpath,
                    config="albert_config.json",
                    model="albert_model.bin"):
    b_config = BC.from_pretrained(opt.join(mpath, config))
    encoder = AlbertModel.from_pretrained(opt.join(mpath, model), config=b_config)
    tokenizer = BertTokenizer.from_pretrained(mpath)
    return encoder, tokenizer
def load_albert(path):
    """Load the ALBERT model and tokenizer."""
    vocab_file = os.path.join(path, 'vocab.txt')
    tokenizer = BertTokenizer.from_pretrained(vocab_file)
    # print(tokenizer)
    config = AlbertConfig.from_pretrained(path)
    model = AlbertModel.from_pretrained(path, config=config)
    return model, tokenizer
def load_train_model(self):
    """
    Initialize the model for training.
    :return:
    """
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.output_encoded_layers = True
    args.output_attention_layers = True
    args.output_att_score = True
    args.output_att_sum = True
    self.learning_rate = 2e-05  # learning rate
    # warmup proportion
    self.warmup_proportion = 0.1
    self.num_train_epochs = 1
    # learning-rate scheduler to use
    self.schedule = 'slanted_triangular'
    self.s_opt1 = 30.0
    self.s_opt2 = 0.0
    self.s_opt3 = 1.0
    self.weight_decay_rate = 0.01
    # save a checkpoint every this many training epochs
    self.ckpt_frequency = 1
    # where the model and logs are saved
    self.output_dir = "output_root_dir/train_api"
    # gradient accumulation steps
    self.gradient_accumulation_steps = 1
    self.args = args
    # Parse the config files; the teacher and student models share the same vocab.
    self.vocab_file = "albert_model/vocab.txt"
    self.bert_config_file_S = "albert_model/config.json"
    self.tuned_checkpoint_S = "albert_model/pytorch_model.bin"

    # Load the student config and check that the max sequence length fits within the configured length.
    bert_config_S = AlbertConfig.from_json_file(self.bert_config_file_S)

    # Load the tokenizer
    tokenizer = BertTokenizer(vocab_file=self.vocab_file)

    # Load the model
    model_S = AlbertSPC(bert_config_S,
                        num_labels=self.num_labels,
                        args=self.args)
    state_dict_S = torch.load(self.tuned_checkpoint_S, map_location=self.device)
    state_weight = {
        k[5:]: v
        for k, v in state_dict_S.items() if k.startswith('bert.')
    }
    missing_keys, _ = model_S.bert.load_state_dict(state_weight, strict=False)
    # Verify that no parameters were lost
    assert len(missing_keys) == 0
    self.train_tokenizer = tokenizer
    self.train_model = model_S
    logger.info(f"Training model {self.tuned_checkpoint_S} loaded")
def __init__(self, coordinator_args: CoordinatorArguments,
             collab_optimizer_args: CollaborativeOptimizerArguments,
             averager_args: AveragerArguments, dht: hivemind.DHT):
    self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
    self.repo_path = coordinator_args.repo_path
    self.upload_interval = coordinator_args.upload_interval
    self.previous_step = -1

    config = AlbertConfig.from_pretrained(coordinator_args.model_config_path)
    self.model = AlbertForPreTraining(config)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    opt = Lamb(
        optimizer_grouped_parameters,
        lr=0.00176,
        weight_decay=0.01,
        clamp_value=10000.0,
        debias=True,
    )

    adjusted_target_batch_size = (collab_optimizer_args.target_batch_size
                                  - collab_optimizer_args.batch_size_lead)

    self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
        opt=opt,
        dht=dht,
        prefix=experiment_prefix,
        compression_type=hivemind.utils.CompressionType.Value(collab_optimizer_args.compression),
        throughput=collab_optimizer_args.bandwidth,
        target_batch_size=adjusted_target_batch_size,
        client_mode=collab_optimizer_args.client_mode,
        verbose=True,
        start=True,
        **asdict(averager_args))

    self.previous_timestamp = time.time()