def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask,
                                          sequence_labels, token_labels, choice_labels):
    model = BertForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
        "seq_relationship_score": seq_relationship_score,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.parent.assertListEqual(
        list(result["seq_relationship_score"].size()),
        [self.batch_size, 2])
    self.check_loss_output(result)
def test_model():
    hf_config = BertConfig(
        vocab_size=1000,
        hidden_size=100,
        num_attention_heads=2,
        intermediate_size=256,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        max_position_embeddings=100,
        num_hidden_layers=1,
    )
    config = BertBackboneConfig(
        hidden_dim=hf_config.hidden_size,
        n_heads=hf_config.num_attention_heads,
        layer_norm_eps=hf_config.layer_norm_eps,
        intermediate_dim=hf_config.intermediate_size,
        n_layers=hf_config.num_hidden_layers,
        n_pos=hf_config.max_position_embeddings,
        n_types=hf_config.type_vocab_size,
        vocab_size=hf_config.vocab_size,
        pad_token_id=hf_config.pad_token_id,
        attention_probs_dropout=hf_config.attention_probs_dropout_prob,
        hidden_dropout=hf_config.hidden_dropout_prob)
    bs = 8
    seq_len = 12

    seed_everything(228)
    hf_model = BertForPreTraining(hf_config)
    token_ids = torch.randint(low=0, high=hf_config.vocab_size, size=(bs, seq_len))
    clf_labels = torch.randint(low=0, high=2, size=(bs, ))
    hf_loss = hf_model(token_ids, masked_lm_labels=token_ids, next_sentence_label=clf_labels)[0]

    seed_everything(228)
    backbone = BertBackbone(config)
    model = BertPreTrainingModel(backbone=backbone)
    token_ids = torch.randint(low=0, high=hf_config.vocab_size, size=(bs, seq_len))
    clf_labels = torch.randint(low=0, high=2, size=(bs, ))
    inp = BertBackboneInput(token_ids=token_ids, token_type_ids=None, token_pos=None)
    loss = model(inp, head_labels={'lm': token_ids, 'clf': clf_labels}).loss
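    # The test above never compares the two losses, so it cannot fail on a regression.
    # A minimal closing assertion, assuming the custom backbone is meant to reproduce the
    # Hugging Face loss under the shared seed (the tolerance is a judgement call):
    assert torch.allclose(loss, hf_loss, atol=1e-5), f'{loss} != {hf_loss}'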
def get_bert_save_dict():
    import os
    state_path = 'data/bert-large.pt'
    if os.path.exists(state_path):
        state = torch.load(state_path)
    else:
        model = BertForPreTraining.from_pretrained(globals.bert_model)
        state = model.state_dict()
        # cache state
        torch.save(state, state_path)
    return state
def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, output_folder):
    # Instantiate model
    logger.info(f'Loading model based on config from {config_path}...')
    config = BertConfig.from_json_file(config_path)
    model = BertForPreTraining(config)

    # Load weights from checkpoint
    logger.info(f'Loading weights from checkpoint {tf_checkpoint_path}...')
    load_tf2_weights_in_bert(model, tf_checkpoint_path, config)

    # Create dirs
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)

    # Save pytorch-model
    f_out_model = os.path.join(output_folder, 'pytorch_model.bin')
    logger.info(f'Saving PyTorch model to {f_out_model}...')
    torch.save(model.state_dict(), f_out_model)

    # Save config to output
    f_out_config = os.path.join(output_folder, 'config.json')
    logger.info(f'Saving config to {f_out_config}...')
    config.to_json_file(f_out_config)
def _load_google_checkpoint(self):
    logger.info('Loading Checkpoint from Google for Pre training')
    download_and_extract(self.google_checkpoint_location, './')
    checkpoint_dir = os.path.join('./', self.google_checkpoint_root)
    config_location = os.path.join(checkpoint_dir, 'bert_config.json')
    index_location = os.path.join(checkpoint_dir, 'bert_model.ckpt.index')
    logger.info(f'Config file: {config_location}. Index file: {index_location}')
    config = BertConfig.from_json_file(config_location)
    self.bert = BertForPreTraining.from_pretrained(index_location, config=config, from_tf=True)
def from_pretrained(self, model_dir):
    self.encoder_config = BertConfig.from_pretrained(model_dir)
    self.tokenizer = BertTokenizer.from_pretrained(
        path.join(model_dir, 'tokenizer'), do_lower_case=args.do_lower_case)
    self.utt_encoder = BertForPreTraining.from_pretrained(
        path.join(model_dir, 'utt_encoder'))
    self.context_encoder = BertForSequenceClassification.from_pretrained(
        path.join(model_dir, 'context_encoder'))
    self.context_mlm_trans = BertPredictionHeadTransform(self.encoder_config)
    self.context_mlm_trans.load_state_dict(
        torch.load(path.join(model_dir, 'context_mlm_trans.pkl')))
    self.context_order_trans = SelfSorting(self.encoder_config.hidden_size)
    self.context_order_trans.load_state_dict(
        torch.load(path.join(model_dir, 'context_order_trans.pkl')))
    self.decoder_config = BertConfig.from_pretrained(model_dir)
    self.decoder = BertLMHeadModel.from_pretrained(path.join(model_dir, 'decoder'))
def create_and_check_for_pretraining(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = BertForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
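# The two test helpers above target different transformers releases: the first uses the
# pre-3.x interface (masked_lm_labels=..., tuple outputs), this one the current interface
# (labels=..., a BertForPreTrainingOutput with named fields). A minimal self-contained
# sketch of the current call, assuming a small randomly initialised config:
import torch
from transformers import BertConfig, BertForPreTraining

config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=1,
                    num_attention_heads=2, intermediate_size=64)
model = BertForPreTraining(config).eval()

input_ids = torch.randint(0, config.vocab_size, (2, 8))
out = model(input_ids,
            labels=input_ids,                          # masked-LM targets (was `masked_lm_labels`)
            next_sentence_label=torch.tensor([0, 1]),  # NSP targets
            return_dict=True)
print(out.loss, out.prediction_logits.shape, out.seq_relationship_logits.shape)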
def test():
    bert_model_path = '../checkpoints/bert-base-chinese/'  # pytorch_model.bin
    bert_config_path = '../checkpoints/bert-base-chinese/'  # bert_config.json
    vocab_path = '../checkpoints/bert-base-chinese/vocab.txt'
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    # model = BertModel.from_pretrained(bert_model_path, config=bert_config_path)
    model = BertForPreTraining.from_pretrained(bert_model_path, config=bert_config_path)
    text_batch = ["哈哈哈", "嘿嘿嘿", "嘿嘿嘿", "嘿嘿嘿"]
    encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
    input_ids = encoding['input_ids']
    print(input_ids)
    print(input_ids.shape)
    # Note: tuple unpacking assumes a transformers version that returns
    # (prediction_logits, seq_relationship_logits); on recent versions pass
    # return_dict=False or read the named output fields instead.
    output1, output2 = model(input_ids)
    print(output1)
    print(output2)
    print(output1.shape)
    print(output2.shape)
def __init__(self, pretrained_model, tokenizer_name_or_path: str, data_dir: str, batch_size: int,
             max_train_examples: int = None, max_eval_examples: int = None,
             train_strategy='train-all-lexical') -> None:
    super(LexicalTrainingModel, self).__init__()
    self.save_hyperparameters()
    if pretrained_model.startswith('google-checkpoint'):
        self._load_google_checkpoint()
    else:
        self.bert = BertForPreTraining.from_pretrained(pretrained_model)
    self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name_or_path)
    self.__setup_lexical_for_training()
    self.train_dataset = None
    self.eval_dataset = None
    self.test_dataset = None
from transformers import BertTokenizer, BertForPreTraining, BertForSequenceClassification
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

max_length = 100
k = 10
device = "cpu"

pretrained_weights = '/data5/private/suyusheng/task_selecte/bert-base-uncased-128/'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

fine_tuned_weight = '/data5/private/suyusheng/task_selecte/output_finetune/pytorch_model.bin_1314'
model = BertForPreTraining.from_pretrained(pretrained_weights, output_hidden_states=True, return_dict=True)
model.load_state_dict(torch.load(fine_tuned_weight), strict=False)
model.to(device)

# out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS.pt")
out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS_res.pt")
out_CLS = out_CLS.to(device)

# with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain.json") as f:
with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_res.json") as f:
    out_data = json.load(f)

with open("../data/restaurant/train.json") as f:
    data = json.load(f)

for index, d in enumerate(tqdm(data)):
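# The loop body is truncated above. One plausible use of the pieces already set up (a
# fine-tuned BertForPreTraining with hidden states enabled, a cached matrix of [CLS]
# vectors out_CLS, and k=10) is to embed a query and retrieve its top-k nearest cached
# examples by cosine similarity. The helper below is an illustrative sketch, not the
# original loop body; the second-to-last-layer pooling choice is an assumption.
import torch
import torch.nn.functional as F

def topk_similar(text, k=k):
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
        hidden_states = model(**enc).hidden_states
    cls_vec = hidden_states[-2][:, 0, :]                  # (1, hidden_size) [CLS] vector
    sims = F.cosine_similarity(cls_vec, out_CLS, dim=-1)  # score against cached CLS vectors
    return sims.topk(k)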
class NERTagger(pl.LightningModule): def __init__(self, hparams): """ input: hparams: namespace with the following items: 'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00' 'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo', help='BioELMo Directory') 'max_length' (int): Max Length. default: 1024 'lr' (float): Learning Rate. default: 1e-2 'fine_tune_bioelmo' (bool): Whether to Fine Tune BioELMo. default: False 'lr_bioelmo' (float): Learning Rate in BioELMo Fine-tuning. default: 1e-4 """ super().__init__() self.hparams = hparams self.itol = ID_TO_LABEL self.ltoi = {v: k for k, v in self.itol.items()} if self.hparams.model == "bioelmo": # Load Pretrained BioELMo DIR_ELMo = pathlib.Path(str(self.hparams.bioelmo_dir)) self.bioelmo = self.load_bioelmo( DIR_ELMo, not self.hparams.fine_tune_bioelmo ) self.bioelmo_output_dim = self.bioelmo.get_output_dim() # ELMo Padding token (In ELMo token with ID 0 is used for padding) VOCAB_FILE_PATH = DIR_ELMo / "vocab.txt" command = shlex.split(f"head -n 1 {VOCAB_FILE_PATH}") res = subprocess.Popen(command, stdout=subprocess.PIPE) self.bioelmo_pad_token = res.communicate()[0].decode("utf-8").strip() # Initialize Intermediate Affine Layer self.hidden_to_tag = nn.Linear(int(self.bioelmo_output_dim), len(self.itol)) elif self.hparams.model == "biobert": # Load Pretrained BioBERT PATH_BioBERT = pathlib.Path(str(self.hparams.biobert_path)) self.bertconfig = BertConfig.from_pretrained(self.hparams.bert_model_type) self.bertforpretraining = BertForPreTraining(self.bertconfig) self.bertforpretraining.load_tf_weights(self.bertconfig, PATH_BioBERT) self.biobert = self.bertforpretraining.bert self.tokenizer = BertTokenizer.from_pretrained(self.hparams.bert_model_type) # Freeze BioBERT if fine-tune not desired if not self.hparams.fine_tune_biobert: for n, m in self.biobert.named_parameters(): m.requires_grad = False # Initialize Intermediate Affine Layer self.hidden_to_tag = nn.Linear( int(self.bertconfig.hidden_size), len(self.itol) ) # Initialize CRF TRANSITIONS = conditional_random_field.allowed_transitions( constraint_type="BIO", labels=self.itol ) self.crf = conditional_random_field.ConditionalRandomField( # set to 3 because here "tags" means ['O', 'B', 'I'] # no need to include 'BOS' and 'EOS' in "tags" num_tags=len(self.itol), constraints=TRANSITIONS, include_start_end_transitions=False, ) self.crf.reset_parameters() @staticmethod def load_bioelmo(bioelmo_dir: str, freeze: bool) -> Elmo: # Load Pretrained BioELMo DIR_ELMo = pathlib.Path(bioelmo_dir) bioelmo = Elmo( DIR_ELMo / "biomed_elmo_options.json", DIR_ELMo / "biomed_elmo_weights.hdf5", 1, requires_grad=bool(not freeze), dropout=0, ) return bioelmo def get_device(self): return self.crf.state_dict()["transitions"].device def _forward_bioelmo(self, tokens) -> Tuple[torch.Tensor, torch.Tensor]: # character_ids: torch.tensor(n_batch, len_max) # documents will be padded to have the same token lengths as the longest document character_ids = batch_to_ids(tokens) character_ids = character_ids[:, : self.hparams.max_length, :] character_ids = character_ids.to(self.get_device()) # characted_ids -> BioELMo hidden state of the last layer & mask out = self.bioelmo(character_ids) hidden = out["elmo_representations"][-1] crf_mask = out["mask"].to(torch.bool).to(self.get_device()) return (hidden, crf_mask) def _forward_biobert( self, tokens: List[List[str]] ) -> Tuple[torch.Tensor, torch.Tensor]: """ Return BioBERT Hidden state for the tokenized documents. 
Documents with different lengths will be accepted. list(list(str)) -> tuple(torch.tensor, torch.tensor) """ # Convert each token of each document into a list of subwords. # e.g., # [['Admission', 'Date', ...], ['Service', ':', ...]] # | # V # [[['Ad', '##mission'], ['Date'], ...], [['Service'], [':'], ...]] subwords_unchained = [ [self.tokenizer.tokenize(tok) for tok in doc] for doc in tokens ] # Simply replace each token of each document with corresponding subwords. # e.g., # [['Admission', 'Date', ...], ['Service', ':', ...]] # | # V # [['Ad', '##mission', 'Date', ...], ['Service', ':', ...]] subwords = [ list(itertools.chain(*[self.tokenizer.tokenize(tok) for tok in doc])) for doc in tokens ] # Memorize (i) header place of each token and (ii) how many subwords each token gave birth. # e.g., # For document ['Admission', 'Date'] -> ['Ad', '##mission', 'Date'], # subword_info will be {'start':[0,2], 'length':[2,1]}. subword_info = [] for doc in subwords_unchained: word_lengths = [len(word) for word in doc] word_head_ix = [0] for i in range(len(word_lengths) - 1): word_head_ix.append(word_head_ix[-1] + word_lengths[i]) assert len(word_lengths) == len(word_head_ix) subword_info.append({"start": word_head_ix, "length": word_lengths}) assert [len(info["start"]) for info in subword_info] == [ len(doc) for doc in tokens ] # Split each document into chunks shorter than max_length. # Here, each document will be simply split at every 510 tokens. max_length = min( self.bertconfig.max_position_embeddings, self.hparams.max_length ) longest_length = max([len(doc) for doc in subwords]) n_chunks = (longest_length - 1) // (max_length - 2) + 1 chunks = [] for n in range(n_chunks): chunk_of_all_documents = [] for document in subwords: chunk_of_single_document = document[ (max_length - 2) * n : (max_length - 2) * (n + 1) ] if chunk_of_single_document == []: chunk_of_all_documents.append([""]) else: chunk_of_all_documents.append(chunk_of_single_document) chunks.append(chunk_of_all_documents) # Convert chunks into BERT input form. inputs = [] for chunk in chunks: if type(chunk) is str: unsqueezed_chunk = [[chunk]] elif type(chunk) is list: if type(chunk[0]) is str: unsqueezed_chunk = [chunk] elif type(chunk[0]) is list: unsqueezed_chunk = chunk inputs.append( self.tokenizer.batch_encode_plus( unsqueezed_chunk, pad_to_max_length=True, is_pretokenized=True, ) ) # Get BioBERT hidden states. hidden_states = [] for inpt in inputs: inpt_tensors = { k: torch.tensor(v).to(self.get_device()) for k, v in inpt.items() } hidden_state = self.biobert(**inpt_tensors)[0][:, 1:-1, :] hidden_states.append(hidden_state) # Concatenate hidden states from each chunk. hidden_states_cat = torch.cat(hidden_states, dim=1) # If a word was tokenized into multiple subwords, take average of them. # e.g. Hidden state for "Admission" equals average of hidden states for "Ad" and "##mission" hidden_states_shrunk = torch.zeros_like(hidden_states_cat) for n in range(hidden_states_cat.size()[0]): hidden_state_shrunk = torch.stack( [ torch.narrow(hidden_states_cat[n], dim=0, start=s, length=l).mean( dim=0 ) for s, l in zip(subword_info[n]["start"], subword_info[n]["length"]) ] ) hidden_states_shrunk[ n, : hidden_state_shrunk.size()[0], : ] = hidden_state_shrunk # Truncate lengthy tail that will not be used. hidden_states_shrunk = hidden_states_shrunk[ :, : max([len(doc) for doc in tokens]), : ] # Create mask for CRF. 
crf_mask = torch.zeros(hidden_states_shrunk.size()[:2]).to(torch.uint8) for i, length in enumerate([len(doc) for doc in tokens]): crf_mask[i, :length] = 1 crf_mask = crf_mask > 0 crf_mask = crf_mask.to(self.get_device()) return (hidden_states_shrunk, crf_mask) def _forward_crf( self, hidden: torch.Tensor, gold_tags_padded: torch.Tensor, crf_mask: torch.Tensor, ) -> Dict: """ input: hidden (torch.tensor) (n_batch, seq_length, hidden_dim) gold_tags_padded (torch.tensor) (n_batch, seq_length) crf_mask (torch.bool) (n_batch, seq_length) output: result (dict) 'log_likelihood' : torch.tensor 'pred_tags_packed' : torch.nn.utils.rnn.PackedSequence 'gold_tags_padded' : torch.tensor """ result = {} if not (hidden.size()[1] == gold_tags_padded.size()[1] == crf_mask.size()[1]): raise RuntimeError( "seq_length of hidden, gold_tags_padded, and crf_mask do not match: " + f"{hidden.size()}, {gold_tags_padded.size()}, {crf_mask.size()}" ) if gold_tags_padded is not None: # Training Mode # Log likelihood log_prob = self.crf.forward(hidden, gold_tags_padded, crf_mask) # top k=1 tagging Y = [ torch.tensor(result[0]) for result in self.crf.viterbi_tags(logits=hidden, mask=crf_mask) ] Y = rnn.pack_sequence(Y, enforce_sorted=False) result["log_likelihood"] = log_prob result["pred_tags_packed"] = Y result["gold_tags_padded"] = gold_tags_padded return result else: # Prediction Mode # top k=1 tagging Y = [ torch.tensor(result[0]) for result in self.crf.viterbi_tags(logits=hidden, mask=crf_mask) ] Y = rnn.pack_sequence(Y, enforce_sorted=False) result["pred_tags_packed"] = Y return result def forward(self, tokens, gold_tags=None): """ Main NER tagging function. Documents with different token lengths are accepted. input: tokens (list(list(str))): List of documents for the batch. Each document must be stored as a list of tokens. gold_tags (list(list(int))): List of gold labels for each document of the batch. output: result (dict) 'log_likelihood' : torch.tensor 'pred_tags_packed' : torch.nn.utils.rnn.PackedSequence 'gold_tags_padded' : torch.tensor """ if self.hparams.model == "bioelmo": # BioELMo features hidden, crf_mask = self._forward_bioelmo(tokens) elif self.hparams.model == "biobert": # BioELMo features hidden, crf_mask = self._forward_biobert(tokens) # Turn on gradient tracking # Affine transformation (Hidden_dim -> N_tag) hidden.requires_grad_() hidden = self.hidden_to_tag(hidden) if gold_tags is not None: gold_tags = [torch.tensor(seq) for seq in gold_tags] gold_tags_padded = rnn.pad_sequence( gold_tags, batch_first=True, padding_value=self.ltoi["O"] ) gold_tags_padded = gold_tags_padded[:, : self.hparams.max_length] gold_tags_padded = gold_tags_padded.to(self.get_device()) else: gold_tags_padded = None result = self._forward_crf(hidden, gold_tags_padded, crf_mask) return result def recognize_named_entity(self, token, gold_tags=None): """ Alias of self.forward(). """ return self.forward(token, gold_tags) def step(self, batch, batch_nb, *optimizer_idx): tokens_nopad = batch["tokens"] tags_nopad = batch["tags"] assert list(map(len, tokens_nopad)) == list( map(len, tags_nopad) ), "ERROR: the number of tokens and BIO tags are different in some record." # Negative Log Likelihood result = self.forward(tokens_nopad, tags_nopad) returns = { "loss": result["log_likelihood"] * (-1.0), "T": result["gold_tags_padded"], "Y": result["pred_tags_packed"], "I": batch["ix"], } assert ( torch.isnan(returns["loss"]).sum().item() == 0 ), "Loss function contains nan." 
return returns def unpack_pred_tags(self, Y_packed): """ input: Y_packed: torch.nn.utils.rnn.PackedSequence output: Y: list(list(str)) Predicted NER tagging sequence. """ Y_padded, Y_len = rnn.pad_packed_sequence( Y_packed, batch_first=True, padding_value=-1 ) Y_padded = Y_padded.numpy().tolist() Y_len = Y_len.numpy().tolist() # Replace B- tag with I- tag # because the original paper defines the NER task as identification of spans, not entities Y = [ [self.itol[ix].replace("B-", "I-") for ix in ids[:length]] for ids, length in zip(Y_padded, Y_len) ] return Y def unpack_gold_and_pred_tags(self, T_padded, Y_packed): """ input: T_padded: torch.tensor Y_packed: torch.nn.utils.rnn.PackedSequence output: T: list(list(str)) Gold NER tagging sequence. Y: list(list(str)) Predicted NER tagging sequence. """ Y = self.unpack_pred_tags(Y_packed) Y_len = [len(seq) for seq in Y] T_padded = T_padded.numpy().tolist() # Replace B- tag with I- tag # because the original paper defines the NER task as identification of spans, not entities T = [ [self.itol[ix] for ix in ids[:length]] for ids, length in zip(T_padded, Y_len) ] return T, Y def gather_outputs(self, outputs): if len(outputs) > 1: loss = torch.mean(torch.tensor([output["loss"] for output in outputs])) else: loss = outputs[0]["loss"] IX = [] Y = [] T = [] for output in outputs: T_batch, Y_batch = self.unpack_gold_and_pred_tags( output["T"].cpu(), output["Y"].cpu() ) T += T_batch Y += Y_batch IX += output["I"].cpu().numpy().tolist() returns = {"loss": loss, "T": T, "Y": Y, "I": IX} return returns def training_step(self, batch, batch_nb, *optimizer_idx) -> Dict: # Process on individual mini-batches """ (batch) -> (dict or OrderedDict) # Caution: key for loss function must exactly be 'loss'. """ return self.step(batch, batch_nb, *optimizer_idx) def training_epoch_end(self, outputs: Union[List[Dict], List[List[Dict]]]) -> Dict: """ outputs(list of dict) -> loss(dict or OrderedDict) # Caution: key must exactly be 'loss'. 
""" outs = self.gather_outputs(outputs) loss = outs["loss"] Y = outs["Y"] T = outs["T"] get_logger(self.hparams.version).info( f"========== Training Epoch {self.current_epoch} ==========" ) get_logger(self.hparams.version).info(f"Loss: {loss.item()}") get_logger(self.hparams.version).info( f"Entity-wise classification report\n{seq_classification_report(T, Y, 4)}" ) progress_bar = {"train_loss": loss} returns = {"loss": loss, "progress_bar": progress_bar} return returns def validation_step(self, batch, batch_nb) -> Dict: # Process on individual mini-batches """ (batch) -> (dict or OrderedDict) """ return self.step(batch, batch_nb) def validation_epoch_end( self, outputs: Union[List[Dict], List[List[Dict]]] ) -> Dict: """ For single dataloader: outputs(list of dict) -> (dict or OrderedDict) For multiple dataloaders: outputs(list of (list of dict)) -> (dict or OrderedDict) """ outs = self.gather_outputs(outputs) loss = outs["loss"] Y = outs["Y"] T = outs["T"] get_logger(self.hparams.version).info( f"========== Validation Epoch {self.current_epoch} ==========" ) get_logger(self.hparams.version).info(f"Loss: {loss.item()}") get_logger(self.hparams.version).info( f"Entity-wise classification report\n{seq_classification_report(T, Y, 4)}" ) progress_bar = {"val_loss": loss} returns = {"val_loss": loss, "progress_bar": progress_bar} return returns def test_step(self, batch, batch_nb) -> Dict: # Process on individual mini-batches """ (batch) -> (dict or OrderedDict) """ return self.step(batch, batch_nb) def test_epoch_end(self, outputs: Union[List[Dict], List[List[Dict]]]) -> Dict: """ For single dataloader: outputs(list of dict) -> (dict or OrderedDict) For multiple dataloaders: outputs(list of (list of dict)) -> (dict or OrderedDict) """ outs = self.gather_outputs(outputs) loss = outs["loss"] Y = outs["Y"] T = outs["T"] get_logger(self.hparams.version).info("========== Test ==========") get_logger(self.hparams.version).info(f"Loss: {loss.item()}") get_logger(self.hparams.version).info( f"Entity-wise classification report\n{seq_classification_report(T, Y, 4)}" ) progress_bar = {"test_loss": loss} returns = {"test_loss": loss, "progress_bar": progress_bar} return returns def configure_optimizers( self, ) -> Union[torch.optim.Optimizer, List[torch.optim.Optimizer]]: if self.hparams.model == "bioelmo": if self.hparams.fine_tune_bioelmo: optimizer_bioelmo_1 = optim.Adam( self.bioelmo.parameters(), lr=float(self.hparams.lr_bioelmo) ) optimizer_bioelmo_2 = optim.Adam( self.hidden_to_tag.parameters(), lr=float(self.hparams.lr_bioelmo) ) optimizer_crf = optim.Adam( self.crf.parameters(), lr=float(self.hparams.lr) ) return [optimizer_bioelmo_1, optimizer_bioelmo_2, optimizer_crf] else: optimizer = optim.Adam(self.parameters(), lr=float(self.hparams.lr)) return optimizer elif self.hparams.model == "biobert": if self.hparams.fine_tune_biobert: optimizer_biobert_1 = optim.Adam( self.biobert.parameters(), lr=float(self.hparams.lr_biobert) ) optimizer_biobert_2 = optim.Adam( self.hidden_to_tag.parameters(), lr=float(self.hparams.lr_biobert) ) optimizer_crf = optim.Adam( self.crf.parameters(), lr=float(self.hparams.lr) ) return [optimizer_biobert_1, optimizer_biobert_2, optimizer_crf] else: optimizer = optim.Adam(self.parameters(), lr=float(self.hparams.lr)) return optimizer def train_dataloader(self) -> torch.utils.data.DataLoader: ds_train = NERDataset.from_dirnames(self.hparams.train_dirs) dl_train = NERDataLoader( ds_train, batch_size=self.hparams.batch_size, shuffle=True ) return dl_train def 
val_dataloader(self) -> torch.utils.data.DataLoader: ds_val = NERDataset.from_dirnames(self.hparams.val_dirs) dl_val = NERDataLoader( ds_val, batch_size=self.hparams.batch_size, shuffle=False ) return dl_val def test_dataloader(self) -> torch.utils.data.DataLoader: ds_test = NERDataset.from_dirnames(self.hparams.test_dirs) dl_test = NERDataLoader( ds_test, batch_size=self.hparams.batch_size, shuffle=False ) return dl_test
def prepare_model(args, device): config = BertConfig.from_pretrained('bert-base-uncased', cache_dir=args.cache_dir) # config.num_hidden_layers = 12 if args.force_num_hidden_layers: logger.info("Modifying model config with num_hidden_layers to %d", args.force_num_hidden_layers) config.num_hidden_layers = args.force_num_hidden_layers model = BertForPreTraining(config) if args.init_state_dict is not None: model.load_state_dict(args.init_state_dict, strict=False) model_desc = bert_model_description(config) lr_scheduler = LinearWarmupLRScheduler(total_steps=int(args.max_steps), warmup=args.warmup_proportion) loss_scaler = amp.DynamicLossScaler() if args.fp16 else None options = orttrainer.ORTTrainerOptions({ 'batch': { 'gradient_accumulation_steps': args.gradient_accumulation_steps }, 'device': { 'id': str(device) }, 'mixed_precision': { 'enabled': args.fp16, 'loss_scaler': loss_scaler }, 'debug': { 'deterministic_compute': True, }, 'utils': { 'grad_norm_clip': True }, 'distributed': { 'world_rank': max(0, args.local_rank), 'world_size': args.world_size, 'local_rank': max(0, args.local_rank), 'allreduce_post_accumulation': args.allreduce_post_accumulation, 'deepspeed_zero_optimization': { 'stage': args.deepspeed_zero_stage } }, 'lr_scheduler': lr_scheduler }) param_optimizer = list(model.named_parameters()) no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"] params = [{ 'params': [ n for n, p in param_optimizer if any(no_decay_key in n for no_decay_key in no_decay_keys) ], "alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6 }, { 'params': [ n for n, p in param_optimizer if not any(no_decay_key in n for no_decay_key in no_decay_keys) ], "alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6 }] optim_config = optim.AdamConfig(params=params, lr=2e-5, do_bias_correction=True) model = orttrainer.ORTTrainer(model, model_desc, optim_config, options=options) return model
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default="manual_description.txt", type=str, help="The input train corpus.") parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default="out", type=str, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=200, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=4.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", default=True, action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForPreTraining.from_pretrained( args.bert_model, config=BertConfig.from_pretrained(args.bert_model)) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) for epoch in trange(1, int(args.num_train_epochs) + 1, desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 print("epoch=", epoch) for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", position=0)): with torch.no_grad(): batch = (item.cuda(device=device) for item in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch model.train() optimizer.zero_grad() prediction_scores, seq_relationship_score = model( input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids) if lm_label_ids is not None and is_next is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) #masked_lm_loss = loss_fct(prediction_scores.view(-1, model.config.vocab_size),lm_label_ids.view(-1)) next_sentence_loss = loss_fct( seq_relationship_score.view(-1, 2), is_next.view(-1)) total_loss = next_sentence_loss model.zero_grad() loss = total_loss if step % 200 == 0: print(loss) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate 
* warmup_linear( global_step / num_train_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if epoch % 5 == 0: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self checkpoint_prefix = 'checkpoint' + str(epoch) output_dir = os.path.join( args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if args.do_train: torch.save(args, os.path.join(output_dir, 'training_args.bin'))
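# Two spots in the inner loop above look unintended: the loop calls both optimizer.zero_grad()
# and model.zero_grad() within the same step, and loss.backward() sits in the `else` branch,
# so no backward pass happens once gradient_accumulation_steps > 1. A minimal sketch of the
# usual accumulation pattern with a transformers warmup schedule; compute_loss is a
# hypothetical stand-in for the forward pass above, while optimizer, args, num_train_steps
# and train_dataloader are the objects created in main():
from transformers import get_linear_schedule_with_warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(args.warmup_proportion * num_train_steps),
    num_training_steps=num_train_steps)

for step, batch in enumerate(train_dataloader):
    loss = compute_loss(batch) / args.gradient_accumulation_steps
    loss.backward()                      # always backpropagate, scaled by the accumulation factor
    if (step + 1) % args.gradient_accumulation_steps == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()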
import torch
from transformers import BertTokenizer, BertForPreTraining

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

# Generate pseudo-error (noise) captions and keep them in the img2info dictionary.
import json
import random
import pickle
import nltk

# word_tokenize
nltk.download('punkt')
# pos_tag
nltk.download('averaged_perceptron_tagger')
# wordnet
from nltk.corpus import wordnet as wn
from tqdm import tqdm


def build_img2info(json_obj, sim_value):
    # Image id as key; (key, caption, noise caption) as value.
    img2info = {}
    idx = 0
    for dic in tqdm(json_obj.values(), total=len(json_obj)):
        new_noise = []
        for caption in dic['captions']:
            noise_captions = []
            # Morphological analysis: tokenize and POS-tag the caption
            morph = nltk.word_tokenize(caption.lower())
            pos = nltk.pos_tag(morph)
def main_worker(gpu, ngpus_per_node, args): global best_acc1 args.gpu = gpu random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) # create model print("=> creating model 'bert'") model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True) if not torch.cuda.is_available(): print('using CPU, this will be slow') elif args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have # args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int( (args.workers + ngpus_per_node - 1) / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer criterion = BertPretrainingCriterion(vocab_size) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) if args.gpu is None: checkpoint = torch.load(args.resume) else: # Map model to be loaded to specified single gpu. 
loc = 'cuda:{}'.format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] if args.gpu is not None: # best_acc1 may be from a checkpoint from a different GPU best_acc1 = best_acc1.to(args.gpu) model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True args.max_predictions_per_seq = 80 # Data loading code traindir = os.path.join(args.data) epoch = 0 training_steps = 0 writer = None enable_tensorboard = args.rank <= 0 if enable_tensorboard: if args.rank == -1: # No DDP: writer = SummaryWriter(comment='_bert_no_ddp_' + args.data) else: writer = SummaryWriter(comment='_bert_' + args.dist_backend + '_' + str(args.world_size) + 'GPUs_' + args.data) train_raw_start = time.time() while True: batch_time = AverageMeter('Time', ':6.3f') data_time = AverageMeter('Data', ':6.3f') example_speed = AverageMeter('Speed', ':6.3f') losses = AverageMeter('Loss', ':.4e') files = [ os.path.join(traindir, f) for f in os.listdir(traindir) if os.path.isfile(os.path.join(traindir, f)) and 'training' in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 if torch.distributed.is_initialized() and get_world_size() > num_files: remainder = get_world_size() % num_files data_file = files[(f_start_id * get_world_size() + get_rank() + remainder * f_start_id) % num_files] else: data_file = files[(f_start_id * get_world_size() + get_rank()) % num_files] previous_file = data_file train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_data, shuffle=False) else: train_sampler = torch.utils.data.RandomSampler(train_data) train_dataloader = torch.utils.data.DataLoader( train_data, sampler=train_sampler, batch_size=args.batch_size, num_workers=4, pin_memory=True) pool = ProcessPoolExecutor(1) shared_file_list = {} for f_id in range(f_start_id + 1, len(files)): if get_world_size() > num_files: data_file = files[(f_id * get_world_size() + get_rank() + remainder * f_id) % num_files] else: data_file = files[(f_id * get_world_size() + get_rank()) % num_files] previous_file = data_file dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args) train_iter = train_dataloader end = time.time() progress = ProgressMeter( len(train_iter), [batch_time, data_time, example_speed, losses], prefix="Epoch: [{}]".format(epoch)) for step, batch in enumerate(train_iter): training_steps += 1 batch = [t.to(args.gpu) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) prediction_scores = outputs.prediction_logits seq_relationship_score = outputs.seq_relationship_logits loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) losses.update(loss.item()) # compute gradient and do SGD step # optimizer.zero_grad() loss.backward() optimizer.step() for param in model.parameters(): param.grad = None # measure elapsed time elapsed_time = time.time() - end batch_time.update(elapsed_time) end = time.time() speed = len(batch[0]) / elapsed_time 
example_speed.update(speed) global global_steps global global_examples global_examples += len(batch[0]) global_steps += 1 if step % args.print_freq == 0: progress.display(step) if writer is not None: writer.add_scalar('loss/step', loss.item(), global_steps) writer.add_scalar('speed/step', speed, global_steps) if global_steps >= (args.max_step / abs(args.world_size)): break if global_steps >= (args.max_step / abs(args.world_size)): break del train_dataloader train_dataloader, data_file = dataset_future.result(timeout=None) now = time.time() print('Global Steps: ' + str(global_steps)) print('Total Examples: ' + str(global_examples)) print('Train duration: ' + str(now - train_raw_start)) print('Example/Sec: ' + str(global_examples / (now - train_raw_start))) epoch += 1 if epoch >= args.epochs: break if writer is not None: writer.add_scalar('overall_speed/step', global_examples / (now - train_raw_start), global_steps) writer.close()
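# BertPretrainingCriterion is used above but not defined in this excerpt; in NVIDIA-style
# pretraining scripts it is simply the sum of a masked-LM and a next-sentence cross-entropy.
# A sketch under that assumption (-1 marking unmasked positions is also an assumption):
import torch.nn as nn

class BertPretrainingCriterion(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=-1)
        self.vocab_size = vocab_size

    def forward(self, prediction_scores, seq_relationship_score,
                masked_lm_labels, next_sentence_labels):
        masked_lm_loss = self.loss_fn(
            prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1))
        next_sentence_loss = self.loss_fn(
            seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1))
        return masked_lm_loss + next_sentence_loss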
def train(): logger.info('*' * 64) logger.info('token:%s' % current_time) logger.info('*' * 64) parser = ArgumentParser() parser.add_argument( "--train_file", type=str, default="./my_test/data/student/part1.txt", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./cache/', help="Path or url of the dataset cache") parser.add_argument("--batch_size", type=int, default=2, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-4, help="Learning rate") # parser.add_argument("--train_precent", type=float, default=0.7, help="Batch size for validation") parser.add_argument("--n_epochs", type=int, default=1, help="Number of training epochs") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") # parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--log_step", type=int, default=1, help="Multiple-choice loss coefficient") parser.add_argument("--base_model", type=str, default="bert-base-uncased") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) args = parser.parse_args() logger.info(args) device = torch.device(args.device) tokenizer = BertTokenizer.from_pretrained(args.base_model) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size) model = BertForPreTraining.from_pretrained(args.base_model) optimizer = optim.Adam(model.parameters(), lr=args.lr) steps = len(train_data_loader.dataset) // train_data_loader.batch_size steps = steps if steps > 0 else 1 logger.info('steps:%d' % steps) lr_warmup = get_cosine_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=1500, num_training_steps=steps * args.n_epochs) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") gpu_num = torch.cuda.device_count() gpu_list = [int(i) for i in range(gpu_num)] model = DataParallel(model, device_ids=gpu_list) multi_gpu = True if torch.cuda.is_available(): model.cuda() # model.to(device) # criterion.to(device) def update(engine, batch): model.train() # input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch """ input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, next_sentence_label=None, """ # loss = model(input_ids=batch[0],input_mask=batch[1],segment_ids=batch[2],lm_label_ids=batch[3],is_next=batch[4]) loss = model(input_ids=batch[0], attention_mask=batch[1], position_ids=batch[2], masked_lm_labels=batch[3], next_sentence_label=batch[4]) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() lr_warmup.step() if multi_gpu: loss = loss.mean() loss.backward() return loss.cpu().item() 
trainer = Engine(update) # def inference(engine, batch): # model.eval() # with torch.no_grad(): # input_ids = batch[0].to(device) # attention_mask = batch[1].to(device) # labels = batch[2].to(device) # output = model(input_ids=input_ids, attention_mask=attention_mask) # # predict = output.permute(1, 2, 0) # trg = labels.permute(1, 0) # loss = criterion(predict.to(device), trg.to(device)) # return predict, trg # # evaluator = Engine(inference) # metrics = {"nll": Loss(criterion, output_transform=lambda x: (x[0], x[1])), # "accuracy": Accuracy(output_transform=lambda x: (x[0], x[1]))} # for name, metric in metrics.items(): # metric.attach(evaluator, name) # # @trainer.on(Events.EPOCH_COMPLETED) # def log_validation_results(trainer): # evaluator.run(valid_data_loader) # ms = evaluator.state.metrics # logger.info("Validation Results - Epoch: [{}/{}] Avg accuracy: {:.6f} Avg loss: {:.6f}" # .format(trainer.state.epoch, trainer.state.max_epochs, ms['accuracy'], ms['nll'])) # '''======================early stopping ==========================''' # def score_function(engine): # val_loss = engine.state.metrics['nll'] # return -val_loss # handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer) # evaluator.add_event_handler(Events.COMPLETED, handler) '''==================print information by iterator=========================''' @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): if trainer.state.iteration % args.log_step == 0: logger.info("Epoch[{}/{}] Step[{}/{}] Loss: {:.6f}".format( trainer.state.epoch, trainer.state.max_epochs, trainer.state.iteration % steps, steps, trainer.state.output * args.gradient_accumulation_steps)) '''================add check point========================''' checkpoint_handler = ModelCheckpoint(checkpoint_dir, 'checkpoint', n_saved=3) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'BertClassificationModel': getattr(model, 'module', model) }) # "getattr" take care of distributed encapsulation '''==============run trainer=============================''' trainer.run(train_data_loader, max_epochs=args.n_epochs)
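# Two details in the update() closure above are worth flagging: optimizer.step() runs before
# loss.backward(), so each step uses stale (or empty) gradients, and the forward pass feeds
# batch[2] in as position_ids where the BERTDataset most likely provides segment ids. A
# hedged reordering of the step, keeping the same batch layout and reusing model, optimizer,
# lr_warmup, args, device and multi_gpu from train() above (old-style masked_lm_labels API):
def update(engine, batch):
    model.train()
    batch = tuple(t.to(device) for t in batch)
    loss = model(input_ids=batch[0],
                 attention_mask=batch[1],
                 token_type_ids=batch[2],   # segment ids, not position ids (assumption)
                 masked_lm_labels=batch[3],
                 next_sentence_label=batch[4])[0]
    if multi_gpu:
        loss = loss.mean()
    loss = loss / args.gradient_accumulation_steps
    loss.backward()
    if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_warmup.step()
        optimizer.zero_grad()
    return loss.item()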
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): print("Converting TensorFlow checkpoint from {} with config at {}".format( tf_checkpoint_path, bert_config_file)) # Load weights from TF model init_vars = tf.train.list_variables(tf_checkpoint_path) names = [] arrays = [] for name, shape in init_vars: print("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_checkpoint_path, name) names.append(name) arrays.append(array) # Initialise PyTorch model config = BertConfig.from_json_file(bert_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = BertForPreTraining(config) for name, array in zip(names, arrays): name = name.split('/') # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): print("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r'[A-Za-z]+_\d+', m_name): l = re.split(r'_(\d+)', m_name) else: l = [m_name] if l[0] == 'kernel' or l[0] == 'gamma': pointer = getattr(pointer, 'weight') elif l[0] == 'output_bias' or l[0] == 'beta': pointer = getattr(pointer, 'bias') elif l[0] == 'output_weights': pointer = getattr(pointer, 'weight') else: pointer = getattr(pointer, l[0]) if len(l) >= 2: num = int(l[1]) pointer = pointer[num] if m_name[-11:] == '_embeddings': pointer = getattr(pointer, 'weight') elif m_name == 'kernel': array = np.transpose(array) try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise print("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) # if __name__=='__main__': # convert_tf_checkpoint_to_pytorch(config.TF_PATH,config.BERT_CONFIG_FILE,config.BERT_WEIGHTS)
def convert_pytorch_checkpoint_to_tf(model: BertForPreTraining, ckpt_dir: str, model_name: str): """ Args: model: BertModel Pytorch model instance to be converted ckpt_dir: Tensorflow model directory model_name: model name Currently supported HF models: - Y BertModel - N BertForMaskedLM - N BertForPreTraining - N BertForMultipleChoice - N BertForNextSentencePrediction - N BertForSequenceClassification - N BertForQuestionAnswering """ tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") var_map = ( ("layer.", "layer_"), ("word_embeddings.weight", "word_embeddings"), ("position_embeddings.weight", "position_embeddings"), ("token_type_embeddings.weight", "token_type_embeddings"), ("cls.predictions.bias", "cls.predictions.output_bias"), (".", "/"), ("LayerNorm/weight", "LayerNorm/gamma"), ("LayerNorm/bias", "LayerNorm/beta"), ("weight", "kernel"), ("cls/seq_relationship/bias", "cls/seq_relationship/output_bias"), ("cls/seq_relationship/kernel", "cls/seq_relationship/output_weights"), ) if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) state_dict = model.state_dict() def to_tf_var_name(name: str): for patt, repl in iter(var_map): name = name.replace(patt, repl) return "bert/{}".format(name) if not name.startswith("cls") else name def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): tf_dtype = tf.dtypes.as_dtype(tensor.dtype) tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) session.run(tf.variables_initializer([tf_var])) session.run(tf_var) return tf_var tf.reset_default_graph() with tf.Session() as session: for var_name in state_dict: tf_name = to_tf_var_name(var_name) torch_tensor = state_dict[var_name].numpy() if any([x in var_name for x in tensors_to_transpose]): torch_tensor = torch_tensor.T tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) tf.keras.backend.set_value(tf_var, torch_tensor) tf_weight = session.run(tf_var) print("Successfully created {}: {}".format( tf_name, np.allclose(tf_weight, torch_tensor))) saver = tf.train.Saver(tf.trainable_variables()) saver.save( session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
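# A minimal driver for the conversion function above, assuming a TensorFlow 1.x-compatible
# environment (the function uses tf.Session) and placeholder model/output names:
if __name__ == "__main__":
    model = BertForPreTraining.from_pretrained("bert-base-uncased")
    convert_pytorch_checkpoint_to_tf(model=model,
                                     ckpt_dir="./tf_ckpt",
                                     model_name="bert-base-uncased")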
def __init__(self, metadata, timer, is_ZH, data_manager): super().__init__() self.timer = timer self.timer("bert-init") self.batch_per_train = 50 self.batch_size_eval = 64 self.max_seq_len = 301 self.batch_size = 48 self.weight_decay = 0 self.learning_rate = 5e-5 self.adam_epsilon = 1e-8 self.max_grad_norm = 1. self.total_epoch = 100 self.logging_step = -1 self.warmup_steps = 0 self.metadata = metadata self.num_class = self.metadata.get_output_size() self.bert_folder = extract_bert_model() bertConfig = BertConfig.from_json_file(self.bert_folder + '/config.json') self.model = BertClassification(None, bertConfig, self.num_class) self.bertTokenizer = BertTokenizer.from_pretrained(self.bert_folder) bertModel = BertForPreTraining.from_pretrained( self.bert_folder, num_labels=self.num_class, from_tf=BERT_V == 0) self.model.bert = bertModel.bert del bertModel.cls self.model.to(torch.device("cuda")) self.data = data_manager self.data.add_pipeline( BertPipeline(is_ZH, metadata, self.bertTokenizer, max_length=self.max_seq_len)) self.train_data_loader = None self.test_data_loader = None self.valid_data_loader = None self.done_training = False self.estimate_time_per_batch = None self.estimate_valid_time = None self.estimate_test_time = None # init optimizer and scheduler no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": self.weight_decay, }, { "params": [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.total_epoch * self.batch_per_train) # first, we only train the classifier self.optimizer_only_classifier = optim.Adam( self.model.classifier.parameters(), 0.0005) self.place = 'cpu' self.timer("bert-init") print('[bert init] time cost: %.2f' % (self.timer.accumulation["bert-init"]))
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument." ) if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name" ) if model_args.model_name_or_path: model = BertForPreTraining.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = BertForPreTraining.from_config(config) if model_args.cls_model_name_or_path: cls_config = AutoConfig.from_pretrained( model_args.cls_model_name_or_path, num_labels=2, finetuning_task="cola", cache_dir=model_args.cache_dir, ) cls_model = AutoModelForSequenceClassification.from_pretrained( model_args.cls_model_name_or_path, from_tf=bool(".ckpt" in model_args.cls_model_name_or_path), config=cls_config, cache_dir=model_args.cache_dir, ) cls_model.resize_token_embeddings(len(tokenizer)) mask_selector = MaskSelector(cls_model,training_args) model.resize_token_embeddings(len(tokenizer)) if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. 
They must be run using the --mlm " "flag (masked language modeling)." ) if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_dataset = get_dataset(data_args, tokenizer=tokenizer, model_args=model_args, cache_dir=model_args.cache_dir) if training_args.do_train else None eval_dataset = get_dataset(data_args, model_args=None, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None data_collator = DataCollatorForMixLM( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, prediction_loss_only=True, ) # Training if training_args.do_train: model_path = ( model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None ) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) result = {"perplexity": perplexity} output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) results.update(result) return results
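# One likely bug in the from-scratch branch above: BertForPreTraining has no from_config()
# classmethod (that helper belongs to the AutoModel* families), so that path raises
# AttributeError. Two equivalent ways to instantiate an untrained model from a config:
from transformers import AutoModelForPreTraining, BertConfig, BertForPreTraining

config = BertConfig()                                  # or the config built earlier in main()
model = BertForPreTraining(config)                     # direct constructor
model = AutoModelForPreTraining.from_config(config)    # Auto* equivalent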
labels = data['bert_label'].to(device).long() optim.zero_grad() outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels, next_sentence_label=next_sentence_label) loss = outputs['loss'] losses.append(loss.cpu().detach().numpy()) loss = np.mean(losses) return loss device = 'cuda' if torch.cuda.is_available() else 'cpu' config = BertConfig(vocab_size=len(WORDS) + 1) model = BertForPreTraining.from_pretrained('bert-base-chinese') model = model.to(device) # model=nn.DataParallel(model,device_ids=[0,1]) optim = torch.optim.Adam(model.parameters(), lr=2e-5) criterion = nn.CrossEntropyLoss() NUM_EPOCHS = 5 for epoch in range(NUM_EPOCHS): pbar = tqdm(train_loader) losses = [] for data_label in pbar: data = data_label[0] next_sentence_label = data_label[1].to(device).long() input_ids = data['input_ids'].to(device).long() token_type_ids = data['token_type_ids'].to(device).long() attention_mask = data['attention_mask'].to(device).long()
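# The fragment above opens with the tail of a loss-averaging helper whose `def`
# line and loop header are cut off. The sketch below is a hypothetical
# reconstruction only: the `evaluate` name, the `val_loader` argument, and its
# yielding the same (data, next_sentence_label) pairs as `train_loader` are all
# assumptions, not taken from the original source.
import numpy as np
import torch

def evaluate(model, val_loader, device):
    model.eval()
    losses = []
    with torch.no_grad():
        for data, next_sentence_label in val_loader:
            input_ids = data['input_ids'].to(device).long()
            token_type_ids = data['token_type_ids'].to(device).long()
            attention_mask = data['attention_mask'].to(device).long()
            labels = data['bert_label'].to(device).long()
            next_sentence_label = next_sentence_label.to(device).long()
            outputs = model(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask,
                            labels=labels,
                            next_sentence_label=next_sentence_label)
            losses.append(outputs['loss'].cpu().numpy())
    model.train()
    return np.mean(losses)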
def __init__(self, hparams):
    """
    input:
        hparams: namespace with the following items:
            'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00'
            'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo'
            'max_length' (int): Max Length. default: 1024
            'lr' (float): Learning Rate. default: 1e-2
            'fine_tune_bioelmo' (bool): Whether to fine-tune BioELMo. default: False
            'lr_bioelmo' (float): Learning Rate for BioELMo fine-tuning. default: 1e-4
    """
    super().__init__()
    self.hparams = hparams
    self.itol = ID_TO_LABEL
    self.ltoi = {v: k for k, v in self.itol.items()}

    if self.hparams.model == "bioelmo":
        # Load Pretrained BioELMo
        DIR_ELMo = pathlib.Path(str(self.hparams.bioelmo_dir))
        self.bioelmo = self.load_bioelmo(DIR_ELMo, not self.hparams.fine_tune_bioelmo)
        self.bioelmo_output_dim = self.bioelmo.get_output_dim()

        # ELMo padding token (in ELMo, the token with ID 0 is used for padding)
        VOCAB_FILE_PATH = DIR_ELMo / "vocab.txt"
        command = shlex.split(f"head -n 1 {VOCAB_FILE_PATH}")
        res = subprocess.Popen(command, stdout=subprocess.PIPE)
        self.bioelmo_pad_token = res.communicate()[0].decode("utf-8").strip()

        # Initialize Intermediate Affine Layer
        self.hidden_to_tag = nn.Linear(int(self.bioelmo_output_dim), len(self.itol))

    elif self.hparams.model == "biobert":
        # Load Pretrained BioBERT
        PATH_BioBERT = pathlib.Path(str(self.hparams.biobert_path))
        self.bertconfig = BertConfig.from_pretrained(self.hparams.bert_model_type)
        self.bertforpretraining = BertForPreTraining(self.bertconfig)
        self.bertforpretraining.load_tf_weights(self.bertconfig, PATH_BioBERT)
        self.biobert = self.bertforpretraining.bert
        self.tokenizer = BertTokenizer.from_pretrained(self.hparams.bert_model_type)

        # Freeze BioBERT if fine-tuning is not desired
        if not self.hparams.fine_tune_biobert:
            for n, m in self.biobert.named_parameters():
                m.requires_grad = False

        # Initialize Intermediate Affine Layer
        self.hidden_to_tag = nn.Linear(int(self.bertconfig.hidden_size), len(self.itol))

    # Initialize CRF
    TRANSITIONS = conditional_random_field.allowed_transitions(
        constraint_type="BIO", labels=self.itol)
    self.crf = conditional_random_field.ConditionalRandomField(
        # set to 3 because here "tags" means ['O', 'B', 'I'];
        # no need to include 'BOS' and 'EOS' in "tags"
        num_tags=len(self.itol),
        constraints=TRANSITIONS,
        include_start_end_transitions=False,
    )
    self.crf.reset_parameters()
def load_annotations(self, proposal_method, **kwargs):
    logger = logging.getLogger("vmr.trainer")
    logger.info("Preparing data from file {}, please wait...".format(self.anno_file))
    self.annos = []
    self.gts = []
    word2vec_cache_prefix = os.path.splitext(self.anno_file)[0]
    word2vec_cache_file = '{}_word2vec_{}.pkl'.format(word2vec_cache_prefix, self.word2vec)

    # Define word embedding function
    if os.path.exists(word2vec_cache_file):
        annos_original = None
        # Load word embedding cache if it exists
        logger.info("Word2vec cache exists, loading cache file.")
        with open(word2vec_cache_file, 'rb') as F:
            self.annos_query = pickle.load(F)

        def word_embedding(idx, sentence):
            assert self.annos_query[idx]['sentence'] == sentence, \
                'annotation file {} has been modified, cache file expired!'.format(self.anno_file)
            return self.annos_query[idx]['query'], self.annos_query[idx]['wordlen']
    else:
        annos_original = []
        # Compute word embeddings if there is no cache
        if self.word2vec == 'BERT':
            # Here we use the second-to-last hidden layer.
            # See 3.5 Pooling Strategy & Layer Choice in
            # https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#3-extracting-embeddings
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            bert = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
            bert.to('cuda')

            def word_embedding(idx, sentence):
                sentence_tokenized = tokenizer(sentence, return_tensors="pt")  # token_num = sentence_num + 2
                for key in sentence_tokenized:
                    sentence_tokenized[key] = sentence_tokenized[key].to('cuda')
                with torch.no_grad():
                    query = bert(**sentence_tokenized, output_hidden_states=True
                                 )['hidden_states'][-2].squeeze_().to('cpu')  # (token_num, 768)
                query = query[1:-1]
                return query, query.size(0)  # (sentence_len, 768) including punctuation
        elif self.word2vec == 'GloVe':
            def word_embedding(idx, sentence):
                word2vec = glove_embedding(sentence)
                return word2vec, word2vec.size(0)  # (sentence_len, 300) including punctuation
        else:
            raise NotImplementedError

    # Load annotations and generate ground truth for model proposal
    logger.info("loading annotations ...")
    with open(self.anno_file, 'r') as f:
        annos = json.load(f)
    for vid, anno in tqdm(annos.items()):
        duration = anno['duration'] if self.dataset_name != 'tacos' \
            else anno['num_frames'] / anno['fps']
        # Produce annotations
        for idx in range(len(anno['timestamps'])):
            timestamp = anno['timestamps'][idx]
            sentence = anno['sentences'][idx]
            if timestamp[0] < timestamp[1]:
                moment = torch.tensor([max(timestamp[0], 0), min(timestamp[1], duration)]) \
                    if self.dataset_name != 'tacos' \
                    else torch.tensor([max(timestamp[0] / anno['fps'], 0),
                                       min(timestamp[1] / anno['fps'], duration)])
                query, wordlen = word_embedding(len(self.annos), sentence)
                self.avg_wordvec += query.mean(dim=0)
                if annos_original is not None:
                    annos_original.append({
                        'sentence': sentence,
                        'query': query,
                        'wordlen': wordlen,
                    })
                adjmat = torch.tensor(anno['dependency_parsing_graph'][idx]) if self.dep_graph else None
                if self.consti_mask:
                    constimask = torch.tensor(anno['constituency_parsing_mask'][idx],
                                              dtype=torch.float32)
                    # The original tree is ordered from root to leaf
                    layers = torch.linspace(constimask.size(0) - 1, 0, self.tree_depth).long()
                    constimask = constimask[layers, :, :]
                else:
                    constimask = None
                if self.dep_graph:
                    padding = query.size(0) - adjmat.size(0)
                    adjmat = torch.nn.functional.pad(adjmat, (0, padding, 0, padding),
                                                     "constant", 0) if self.dep_graph else None
                if wordlen >= self.max_num_words:
                    wordlen = self.max_num_words
                    query = query[:self.max_num_words]
                    adjmat = adjmat[:self.max_num_words, :self.max_num_words] if self.dep_graph else None
                elif self.fix_num_words:
                    padding = self.max_num_words - wordlen
                    query = torch.nn.functional.pad(query, (0, 0, 0, padding), "constant", 0)
                    # print('padded:', query.shape)
                    if self.dep_graph:
                        padding = self.max_num_words - adjmat.size(0)
                        adjmat = torch.nn.functional.pad(adjmat, (0, padding, 0, padding),
                                                         "constant", 0) if self.dep_graph else None
                self.annos.append({
                    'vid': vid,
                    'moment': moment,
                    'sentence': sentence,
                    'query': query,
                    'querymask': torch.ones(wordlen, dtype=torch.int32),
                    'adjmat': adjmat,
                    'constimask': constimask,
                    'wordlen': wordlen,
                    'duration': duration,
                })
                gt_dict = self.__generate_ground_truth__(moment, duration, proposal_method, **kwargs)
                self.gts.append(gt_dict)

    self.avg_wordvec /= len(self.annos)
    if not os.path.exists(word2vec_cache_file):
        with open(word2vec_cache_file, 'wb') as F:
            word2vec_cache = [{
                'sentence': anno['sentence'],
                'query': anno['query'],
                'wordlen': anno['wordlen']
            } for anno in annos_original]
            pickle.dump(word2vec_cache, F)

    # Load visual features if in_memory
    if self.in_memory:
        logger.info("Loading visual features from {}, please wait...".format(self.feat_file))
        self.feats, self.seglens = video2feats(self.feat_file, annos.keys(),
                                               self.num_segments, self.dataset_name,
                                               self.upsample)
    logger.info("Dataset prepared!")
pretraindata, BATCH_SIZE, collate_fn=pretrain_collate_fn ) # Set the config of the bert config = BertConfig( num_hidden_layers=4, hidden_size=312, intermediate_size=1200, max_position_embeddings=1024, ) if args.target == "mobert": config.num_labels = pretraindata.token_num + 1 model = MoBert(config) elif args.target == "bert": model = BertForPreTraining(config) model = model.to(device) # Pre-train the MoBERT model optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) model.train() step = 1 total_loss = 0 total_loss_pre = 0 total_loss_cl = 0 start = time.time() for src, mlm, mask, nsp, mt, token_type_ids in dataloader: src = src.to(device) mlm = mlm.to(device) mask = mask.to(device)
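    # The loop body above is truncated. The lines below sketch one pre-training
    # step for the plain `bert` target only, assuming `src` holds the (masked)
    # input ids, `mlm` the masked-LM labels, `mask` the attention mask and `nsp`
    # the next-sentence labels; the variable names suggest this, but the original
    # body is not shown. The keyword names follow the current BertForPreTraining
    # API, and the MoBERT-specific losses (total_loss_pre / total_loss_cl) are
    # not reproduced here.
    nsp = nsp.to(device)
    token_type_ids = token_type_ids.to(device)

    outputs = model(input_ids=src,
                    attention_mask=mask,
                    token_type_ids=token_type_ids,
                    labels=mlm,
                    next_sentence_label=nsp)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    step += 1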
def __init__(self, scan_encoder_class=None, scan_encoder_args={}, bert_class=None, bert_args={}, scan_decoder_class=None, scan_decoder_args={}, task_configs=[], vocab_args={}, loss_weighting=None, optim_class="Adam", optim_args={}, scheduler_class=None, scheduler_args={}, pretrained_configs=[], cuda=True, devices=[0]): """ """ super().__init__(optim_class, optim_args, scheduler_class, scheduler_args, pretrained_configs, cuda, devices) self.encodes_scans = scan_encoder_class is not None if self.encodes_scans: self.scan_encoder = getattr( modules, scan_encoder_class)(**scan_encoder_args) self.scan_encoder = nn.DataParallel(self.scan_encoder, device_ids=self.devices) if bert_class == "BertModelPreTrained": self.bert = BertModel.from_pretrained(**bert_args) elif bert_class == "BertForPretraining": self.bert = BertForPreTraining.from_pretrained(**bert_args) elif bert_class == "BertModel": bert_args["config"] = BertConfig.from_dict(bert_args["config"]) self.bert = BertModel(**bert_args) else: self.bert = getattr(modules, bert_class)(**bert_args) self.bert = nn.DataParallel(self.bert, device_ids=self.devices) self.decodes_scans = scan_decoder_class is not None if self.decodes_scans: self.scan_decoder = getattr( modules, scan_decoder_class)(**scan_decoder_args) self.task_heads = {} self.task_inputs = {} for task_head_config in task_configs: task = task_head_config["task"] head_class = getattr(modules, task_head_config["class"]) args = task_head_config["args"] self.task_inputs[task] = (task_head_config["inputs"] if "inputs" in task_head_config else "pool") if "config" in args: # bert task heads take config object for parameters, must convert from dict config = args["config"] args["config"] = namedtuple("Config", config.keys())(*config.values()) if head_class is BertOnlyMLMHead: embs = self.bert.module.embeddings.word_embeddings.weight self.task_heads[task] = head_class( bert_model_embedding_weights=embs, **args) else: self.task_heads[task] = head_class(**args) self.task_heads = torch.nn.ModuleDict(self.task_heads) self.vocab = WordPieceVocab(**vocab_args) self._build_loss(loss_weighting) self._post_init()
def main():
    args = get_args()
    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # not parallelizing across GPUs because of deadlocks
    n_gpu = 1 if torch.cuda.device_count() > 0 else 0
    logging.info(f'device: {device} n_gpu: {n_gpu} seed: {args.seed}')
    res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    logging.info(
        f'mem: {res.used / (1024**2)} (MiB) ({100 * (res.used / res.total):.3f}%)')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Work out how many pregenerated epoch files are available and how many
    # training examples each of them contains.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) "
                  f"than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training "
                  "diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = total_train_examples // args.train_batch_size

    # Prepare model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model.to(device)

    # Prepare optimizer
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        tmp_fp = f'/media/data_1/darius/data/512epoch_{epoch}_dataset_255.pkl'
        if Path(tmp_fp).is_file():
            logging.info(f'Loading dataset from {tmp_fp}...')
            with open(tmp_fp, 'rb') as f:
                epoch_dataset = pickle.load(f)
        else:
            epoch_dataset = PregeneratedDataset(
                epoch=epoch,
                training_path=args.pregenerated_data,
                tokenizer=tokenizer,
                num_data_epochs=num_data_epochs,
                reduce_memory=args.reduce_memory)
            with open(tmp_fp, 'wb') as f:
                pickle.dump(epoch_dataset, f, protocol=4)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for _, (input_ids, input_mask, segment_ids, lm_label_ids,
                    is_next) in enumerate(train_dataloader):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                lm_label_ids = lm_label_ids.to(device)
                is_next = is_next.to(device)
                # breakpoint()
                outputs = model(input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=lm_label_ids,
                                next_sentence_label=is_next)
                # outputs = model(input_ids, segment_ids,
                #                 input_mask, lm_label_ids, is_next)
                loss = outputs.loss
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                optimizer.step()
                # Zero the gradients after each step; leaving this out makes
                # gradients accumulate across batches.
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

    # Save a trained model
    logging.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)
DATAPATH = '/datasets/shshi' pretrained_path = '%s/pretrained' % DATAPATH if args.model == 'bert_base': config = BertConfig.from_json_file('bert_base_config.json') else: config = BertConfig.from_json_file('bert_config.json') #config = BertConfig.from_json_file('bert_config.json') # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) vocab_size = config.vocab_size #tokenizer = BertTokenizer.from_pretrained(pretrained_path) #model = BertForPreTraining.from_pretrained(pretrained_path) model = BertForPreTraining(config) if args.cuda: model.cuda() #optimizer = AdamW(model.parameters(), optimizer = optim.Adam( model.parameters(), lr=2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) #optimizer = optim.SGD(model.parameters(), lr=2e-5) compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none use_bytescheduler = True import bytescheduler.pytorch.horovod as bsc
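# The snippet above stops right after importing ByteScheduler. The lines below
# sketch only the standard Horovod wiring that such a script typically needs;
# the ByteScheduler-specific wrapping is omitted because its API is not shown
# in the original code, and the call ordering is an assumption.
import horovod.torch as hvd

optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters(),
                                     compression=compression)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)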
# https://huggingface.co/transformers/model_doc/bert.html#bertconfig config = BertConfig(vocab_size=32000, hidden_size=256, num_hidden_layers=6, num_attention_heads=4, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, pad_token_id=0, position_embedding_type="absolute") model = BertForPreTraining(config=config) model.num_parameters() train_dataset = TextDatasetForNextSentencePrediction( tokenizer=tokenizer, file_path='/opt/ml/code/KBOBERT/KBOBERT_Data.txt', block_size=512, overwrite_cache=False, short_seq_probability=0.1, nsp_probability=0.5, ) # eval_dataset = TextDatasetForNextSentencePrediction( # tokenizer=tokenizer, # file_path='/opt/ml/code/KBOBERT/wiki_20190620_small.txt', # block_size=512,
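# The snippet is cut off after the commented-out eval dataset. A typical way to
# finish the wiring is sketched below; the collator and Trainer calls are the
# standard transformers API for NSP+MLM pretraining, but the TrainingArguments
# values and the './kbo-bert-pretrain' output path are assumptions, not taken
# from the original code.
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir='./kbo-bert-pretrain',
    num_train_epochs=1,
    per_device_train_batch_size=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()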
from torch.nn import CrossEntropyLoss from common import AverageMeter from custom_metrics import LMAccuracy from data_loader import Data_pretrain from config import Config if __name__ == '__main__': # training_path, file_id, tokenizer, data_name, reduce_memory=False tokenizer = BertTokenizer.from_pretrained('./bert_base_pretrain/vocab.txt') train_data_path = './data/processed_data0.json' txt = Data_pretrain(train_data_path, tokenizer) data_iter = DataLoader(txt, shuffle=True, batch_size=2) bert_config = BertConfig.from_pretrained(Config.config_path) # model = BertForPreTraining(config=bert_config) model = BertForPreTraining.from_pretrained( './bert_base_pretrain/pytorch_model.bin', config=bert_config) model.to(Config.device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }]
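# The file stops after building the parameter groups. A minimal way to consume
# them is sketched below; AdamW, the 2e-5 learning rate, and the assumption
# that Data_pretrain yields dicts of tensors keyed like the model's keyword
# arguments are all hypothetical, not taken from the original code.
from transformers import AdamW

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

model.train()
for batch in data_iter:
    batch = {k: v.to(Config.device) for k, v in batch.items()}
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()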
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask,
                                          sequence_labels, token_labels, choice_labels):
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    onnxruntime.set_seed(seed)

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels)
    model_desc = ModelDescription([
        self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc,
        self.masked_lm_labels_desc, self.next_sentence_label_desc
    ], [
        self.loss_desc, self.prediction_scores_desc, self.seq_relationship_scores_desc
    ])

    from collections import namedtuple
    MyArgs = namedtuple(
        "MyArgs",
        "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len")
    args = MyArgs(local_rank=0,
                  world_size=1,
                  max_steps=100,
                  learning_rate=0.00001,
                  warmup_proportion=0.01,
                  batch_size=13,
                  seq_len=7)

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

    # It would be better to test both with and without mixed precision and
    # allreduce_post_accumulation. However, a stress test of all four cases is
    # not stable, at least on the test machine, so we only test mixed precision
    # with allreduce_post_accumulation, which is the most useful combination.
    option_fp16 = [True]
    option_allreduce_post_accumulation = [True]
    option_gradient_accumulation_steps = [1, 8]
    option_use_internal_get_lr_this_step = [True, False]
    option_use_internal_loss_scaler = [True, False]
    option_split_batch = [BatchArgsOption.ListAndDict]

    for fp16 in option_fp16:
        for allreduce_post_accumulation in option_allreduce_post_accumulation:
            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                    for use_internal_loss_scaler in option_use_internal_loss_scaler:
                        for split_batch in option_split_batch:
                            print("gradient_accumulation_steps:", gradient_accumulation_steps)
                            print("use_internal_loss_scaler:", use_internal_loss_scaler)
                            loss_ort, prediction_scores_ort, seq_relationship_score_ort = \
                                run_test(model, model_desc, self.device, args,
                                         gradient_accumulation_steps, fp16,
                                         allreduce_post_accumulation,
                                         get_lr_this_step, use_internal_get_lr_this_step,
                                         loss_scaler, use_internal_loss_scaler,
                                         split_batch)
                            print(loss_ort)
                            print(prediction_scores_ort)
                            print(seq_relationship_score_ort)