def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a pretrained model by supplying

    * the name of a remote model on s3 ("bert-base-cased" ...)
    * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
    * OR a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
    :type pretrained_model_name_or_path: str
    """
    bert = cls()
    if "farm_lm_name" in kwargs:
        bert.name = kwargs["farm_lm_name"]
    else:
        bert.name = pretrained_model_name_or_path

    # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
    farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
    if os.path.exists(farm_lm_config):
        # FARM style
        bert_config = BertConfig.from_pretrained(farm_lm_config)
        farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
        bert.model = BertModel.from_pretrained(farm_lm_model, config=bert_config, **kwargs)
        bert.language = bert.model.config.language
    else:
        # Pytorch-Transformers style
        bert.model = BertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
        bert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
    return bert
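A minimal usage sketch of the loader above, assuming it is defined as a classmethod on a FARM-style language-model class called Bert here; the class name and both paths are placeholders, not part of the original snippet.

# Hypothetical usage; `Bert` stands in for the class that defines load(),
# and the arguments below are illustrative only.
lm = Bert.load("bert-base-cased")        # remote model name (transformers hub style)
lm = Bert.load("some_dir/farm_model")    # local FARM directory with language_model_config.json
print(lm.name, lm.language)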
def init_encoder(
    cls, cfg_name: str, projection_dim: int = 0, dropout: float = 0.1, **kwargs
) -> BertModel:
    cfg = BertConfig.from_pretrained(cfg_name if cfg_name else 'bert-base-uncased')
    if dropout != 0:
        cfg.attention_probs_dropout_prob = dropout
        cfg.hidden_dropout_prob = dropout
    return cls.from_pretrained(cfg_name, config=cfg, project_dim=projection_dim, **kwargs)
def __init__(self, model_path, vocab: Vocabulary):
    super().__init__(vocab)
    config = BertConfig.from_pretrained(model_path)
    bert_model = BertForPreTraining(config)
    self.bert = bert_model.bert
    tags = vocab.get_index_to_token_vocabulary("tags")
    num_tags = len(tags)
    self.projection = torch.nn.Linear(768, num_tags)
    self.metric = SpanBasedF1Measure(vocab, label_encoding='BMES')
def __init__(self, args: argparse.Namespace):
    super().__init__()
    self.args = args
    self.bert_config = BertConfig.from_pretrained(self.args.bert_path)
    self.model = BertForMaskedLM(self.bert_config)
    self.loss_fn = CrossEntropyLoss(reduction="none")
    self.train_acc = MaskedAccuracy()
    self.valid_acc = MaskedAccuracy()
def __init__(self, args):
    super().__init__()
    self.args = args
    self.bert_config = BertConfig.from_pretrained(
        self.args.bert_config_dir, output_hidden_states=False)
    self.bert = BertModel(self.bert_config)
    self.linear = nn.Linear(self.bert_config.hidden_size * 1001, 919)
    self.threshold = nn.Threshold(0, 1e-6)
    self.linear2 = nn.Linear(919, 919)
    self.sigmoid = nn.Sigmoid()
def _build_word_embedding(self):
    self.bert_config = BertConfig.from_pretrained(self.config.bert_model_name)
    if self.config.pretrained_bert:
        bert_model = BertForPreTraining.from_pretrained(self.config.bert_model_name)
        self.word_embedding = bert_model.bert.embeddings
        self.pooler = bert_model.bert.pooler
        self.pooler.apply(self.init_weights)
    else:
        self.pooler = BertPooler(self.bert_config)
        self.word_embedding = BertEmbeddings(self.bert_config)
def __init__(self, config, args): super().__init__(config) self.args = args if args.bert_model == "albert-base-v2": bert = AlbertModel.from_pretrained(args.bert_model) elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT": bert = AutoModel.from_pretrained(args.bert_model) elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12": bert = AutoModel.from_pretrained(args.bert_model) elif args.bert_model == "bert-small-scratch": config = BertConfig.from_pretrained( "google/bert_uncased_L-4_H-512_A-8") bert = BertModel(config) elif args.bert_model == "bert-base-scratch": config = BertConfig.from_pretrained("bert-base-uncased") bert = BertModel(config) else: bert = BertModel.from_pretrained( args.bert_model) # bert-base-uncased, small, tiny self.txt_embeddings = bert.embeddings self.img_embeddings = ImageBertEmbeddings(args, self.txt_embeddings) if args.img_encoder == 'ViT': img_size = args.img_size patch_sz = 32 if img_size == 512 else 16 self.img_encoder = Img_patch_embedding(image_size=img_size, patch_size=patch_sz, dim=2048) else: self.img_encoder = ImageEncoder_cnn(args) for p in self.img_encoder.parameters(): p.requires_grad = False for c in list(self.img_encoder.children())[5:]: for p in c.parameters(): p.requires_grad = True self.encoder = bert.encoder self.pooler = bert.pooler
def load_model(self):
    self.tokenizer = BertTokenizer.from_pretrained(self.args.pretrained_path,
                                                   do_lower_case=self.args.do_lower_case)
    self.config = BertConfig.from_pretrained(self.args.pretrained_path,
                                             num_labels=self.args.num_labels)
    if self.args.resume_model:
        self.model = BertForMultiLable.from_pretrained(self.args.resume_model_path, config=self.config)
        with open(self.threshold_path, 'r') as f:
            self.threshold = float(f.read())  # read the best model's threshold
    else:
        self.model = BertForMultiLable.from_pretrained(self.args.pretrained_path, config=self.config)
    if self.args.cuda:
        self.model.cuda()
    if self.args.n_gpus > 1:
        self.model = DataParallel(self.model)
def __init__(self, config, args):
    super().__init__(config)
    self.args = args
    if args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
        bert = AutoModel.from_pretrained(args.bert_model)
    elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
        bert = AutoModel.from_pretrained(args.bert_model)
    elif args.bert_model == "bert-small-scratch":
        config = BertConfig.from_pretrained("google/bert_uncased_L-4_H-512_A-8")
        bert = BertModel(config)
    elif args.bert_model == "bert-base-scratch":
        config = BertConfig.from_pretrained("bert-base-uncased")
        bert = BertModel(config)
    else:
        bert = BertModel.from_pretrained(args.bert_model)  # bert-base-uncased, small, tiny

    self.txt_embeddings = bert.embeddings
    self.encoder = bert.encoder
    self.pooler = bert.pooler
def __init__(self, model_path, vocab: Vocabulary):
    super().__init__(vocab)
    self.pretrained_tokenizer = BertForPreTraining.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    bert_model = BertForPreTraining(config)
    self.bert = bert_model.bert
    tags = vocab.get_index_to_token_vocabulary("tags")
    num_tags = len(tags)
    constraints = allowed_transitions(constraint_type="BMES", labels=tags)
    self.projection = torch.nn.Linear(768, num_tags)
    self.crf = ConditionalRandomField(num_tags=num_tags,
                                      constraints=constraints,
                                      include_start_end_transitions=False)
def test_model_from_pretrained(self):
    logging.basicConfig(level=logging.INFO)
    for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        config = BertConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, PretrainedConfig)

        model = BertModel.from_pretrained(model_name)
        model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, PreTrainedModel)
        for value in loading_info.values():
            self.assertEqual(len(value), 0)

        config = BertConfig.from_pretrained(model_name,
                                            output_attentions=True,
                                            output_hidden_states=True)
        model = BertModel.from_pretrained(model_name,
                                          output_attentions=True,
                                          output_hidden_states=True)
        self.assertEqual(model.config.output_attentions, True)
        self.assertEqual(model.config.output_hidden_states, True)
        self.assertEqual(model.config, config)
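The same config/model round trip, written as a standalone sketch for one public checkpoint rather than the test harness above; network access to download the weights is assumed.

# Standalone sketch of the round trip the test above checks.
from transformers import BertConfig, BertModel, PretrainedConfig, PreTrainedModel

config = BertConfig.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
assert isinstance(config, PretrainedConfig)
assert isinstance(model, PreTrainedModel)
assert model.config.hidden_size == config.hidden_size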
def load_model(self):
    self.tokenizer = MyBertTokenizer.from_pretrained(
        self.args.pretrained_path, do_lower_case=self.args.do_lower_case)
    self.config = BertConfig.from_pretrained(
        self.args.pretrained_path, num_labels=self.args.num_labels)
    if self.args.resume_model:
        self.model = BertCrfForNer.from_pretrained(
            self.args.resume_model_path, config=self.config)
    else:
        self.model = BertCrfForNer.from_pretrained(
            self.args.pretrained_path, config=self.config)
    if self.args.cuda:
        self.model.cuda()
    if self.args.n_gpus > 1:
        self.model = DataParallel(self.model)
def model_builder(model_name_or_path: str,
                  num_labels: int,
                  feat_config_path: str = None,
                  one_hot_embed: bool = True,
                  use_lstm=False,
                  device: torch.device = torch.device("cpu")):
    feature = None
    if feat_config_path is not None:
        feature = Feature(feat_config_path, one_hot_embed)
    config = BertConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
    model = NerModel.from_pretrained(model_name_or_path,
                                     config=config,
                                     feature=feature,
                                     use_lstm=use_lstm,
                                     device=device)
    return config, model, feature
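A hypothetical call to the builder above; the checkpoint name, label count, and the decision to skip the handcrafted feature embeddings are placeholders, not values from the original snippet.

# Hypothetical usage of model_builder; all argument values are illustrative.
import torch

config, model, feature = model_builder(
    "bert-base-cased",
    num_labels=9,
    feat_config_path=None,   # no extra handcrafted feature embeddings
    use_lstm=False,
    device=torch.device("cpu"),
)
model.eval()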
def __init__(self, config, args):
    super().__init__(config)
    if args.weight_load:
        config = AutoConfig.from_pretrained(args.load_pretrained_model)
        model_state_dict = torch.load(
            os.path.join(args.load_pretrained_model, 'pytorch_model.bin'))
        cxrbert = CXRBERT.from_pretrained(args.load_pretrained_model,
                                          state_dict=model_state_dict,
                                          config=config, args=args)
    else:
        config = BertConfig.from_pretrained('bert-base-uncased')
        cxrbert = CXRBERT(config, args)
    self.enc = cxrbert.enc
    self.itm = cxrbert.itm
def __init__(self, bert_model: str, max_layer=None, pool=True, freeze_embeddings=False):
    super().__init__()
    self.freeze_embeddings = freeze_embeddings
    config = BertConfig.from_pretrained(bert_model, cache_dir=TRANSFORMER_CACHE_DIR)
    if max_layer is not None and not pool:
        config.num_hidden_layers = max_layer
    self.pool = pool
    self.max_layer = max_layer
    self.embeddings = BertEmbeddings(config)
    if config.num_hidden_layers > 0:
        self.encoder = BertEncoder(config)
        self.encoder.output_hidden_states = True
    else:
        self.encoder = None
    if pool:
        self.pooler = BertPooler(config)
    else:
        self.pooler = None
    self.config = config
    self.bert_model = bert_model
def init_encoder(
    cls,
    cfg_name: str,
    num_hidden_layers: int,
    projection_dim: int = 0,
    dropout: float = 0.1,
    pretrained: bool = True,
    **kwargs
) -> BertModel:
    cfg = BertConfig.from_pretrained(cfg_name if cfg_name else "bert-base-uncased",
                                     num_hidden_layers=num_hidden_layers, **kwargs)
    if dropout != 0:
        cfg.attention_probs_dropout_prob = dropout
        cfg.hidden_dropout_prob = dropout
    if pretrained:
        return cls.from_pretrained(
            cfg_name,
            config=cfg,
            project_dim=projection_dim,
        )
    else:
        return cls(cfg, project_dim=projection_dim)
def model_builder_from_pretrained(model_name_or_path,
                                  num_labels,
                                  pre_train_path,
                                  feat_dir: str = None,
                                  one_hot_embed: bool = True,
                                  use_lstm=False,
                                  device: torch.device = torch.device("cpu")):
    feature = None
    if feat_dir is not None:
        feature = Feature(feat_dir + "/feature_config.json", one_hot_embed)
    config = BertConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
    # Note: the use_lstm argument is not forwarded here; the model is always built with use_lstm=False.
    model = NerModel.from_pretrained(model_name_or_path,
                                     config=config,
                                     feature=feature,
                                     use_lstm=False,
                                     device=device)
    model.load_state_dict(
        torch.load(pre_train_path + "/vner_model.bin", map_location='cpu'))
    model.eval()
    return config, model, feature
def init_encoder(cls,
                 cfg_name: str,
                 projection_dim: int = 0,
                 dropout: float = 0.1,
                 num_hidden_layers: int = 12,
                 num_attention_heads: int = 12,
                 pretrained: bool = True,
                 **kwargs) -> BertModel:
    cfg = BertConfig.from_pretrained(
        cfg_name if cfg_name else "bert-base-uncased")
    if dropout != 0:
        cfg.attention_probs_dropout_prob = dropout
        cfg.hidden_dropout_prob = dropout
    cfg.num_hidden_layers = num_hidden_layers
    cfg.num_attention_heads = num_attention_heads
    cfg.pooler_num_attention_heads = num_attention_heads  # careful here
    logger.info(f'new bert cfg:\n{cfg}')
    if pretrained:
        return cls.from_pretrained(cfg_name, config=cfg, project_dim=projection_dim, **kwargs)
    else:
        return HFBertEncoder(cfg, project_dim=projection_dim)
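A hypothetical invocation of this variant, assuming it is a classmethod on the HFBertEncoder subclass of BertModel referenced above; every argument value below is illustrative.

# Hypothetical usage; argument values are illustrative.
encoder = HFBertEncoder.init_encoder(
    "bert-base-uncased",
    projection_dim=0,
    dropout=0.1,
    num_hidden_layers=6,       # truncate the encoder to 6 layers
    num_attention_heads=12,
    pretrained=True,
)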
def test_forward(self):
    img_feature_dim = 2054
    bert_model_name = "bert-base-uncased"
    use_img_layernorm = True
    img_layer_norm_eps = 1e-12
    bert_config = BertConfig.from_pretrained(bert_model_name)
    # augment hf BertConfig for vinvl BertImgModel config
    bert_config.img_feature_dim = img_feature_dim
    bert_config.use_img_layernorm = use_img_layernorm
    bert_config.img_layer_norm_eps = img_layer_norm_eps
    model = VinVLBase(bert_config)
    model.eval()
    model = model.to(get_current_device())

    bs = 8
    num_feats = 70
    max_sentence_len = 25
    input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
    img_feat = torch.rand((bs, num_feats, img_feature_dim))

    with torch.no_grad():
        model_output = model(input_ids, img_feat).last_hidden_state
    # sequence length 95 = 25 text tokens + 70 image features
    self.assertEqual(model_output.shape, torch.Size([8, 95, 768]))
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_neg_data', type=Path, required=True) parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--max_seq_len", default=512, type=int) parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--kr_batch_size", default=8, type=int, help="Total batch size for training.") parser.add_argument("--kr_freq", default=0.7, type=float) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." 
) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: print(torch.cuda.is_available()) device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() print(n_gpu) print("no gpu?") else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) print("GPU Device: ", device) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) # if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) pt_output = Path(getenv('PT_OUTPUT_DIR', '')) args.output_dir = Path(os.path.join(pt_output, args.output_dir)) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model config = BertConfig.from_pretrained(args.bert_model) # config.num_hidden_layers = args.num_layers model = FuckWrapper(config) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() before_train_path = Path(os.path.join(args.output_dir, "before_training")) print("Before training path: ", before_train_path) before_train_path.mkdir(parents=True, exist_ok=True) model.save_pretrained(os.path.join(args.output_dir, "before_training")) tokenizer.save_pretrained(os.path.join(args.output_dir, "before_training")) neg_epoch_dataset = PregeneratedDataset( epoch=0, training_path=args.pregenerated_neg_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: neg_train_sampler = RandomSampler(neg_epoch_dataset) else: neg_train_sampler = DistributedSampler(neg_epoch_dataset) neg_train_dataloader = DataLoader(neg_epoch_dataset, sampler=neg_train_sampler, batch_size=args.train_batch_size) def inf_train_gen(): while True: for kr_step, kr_batch in enumerate(neg_train_dataloader): yield kr_step, kr_batch kr_gen = inf_train_gen() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1): logging.info("** ** * Saving fine-tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch outputs = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, masked_lm_labels=lm_label_ids, negated=False) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) if args.local_rank == 0 or args.local_rank == -1: nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() optimizer.zero_grad() global_step += 1 if random.random() > args.kr_freq: kr_step, kr_batch = next(kr_gen) kr_batch = tuple(t.to(device) for t in kr_batch) input_ids, input_mask, segment_ids, lm_label_ids = kr_batch outputs = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, masked_lm_labels=lm_label_ids, negated=True) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) if args.local_rank == -1: nb_tr_steps += 1 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1): logging.info("** ** * Saving fine-tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
if __name__ == '__main__':
    source_path = '../data/train/source.txt'
    target_path = '../data/train/target.txt'
    keyword_path = '../data/train/TextRank.txt'
    eval_source_path = '../data/eval/source.txt'
    eval_target_path = '../data/eval/target.txt'
    eval_keyword_path = '../data/eval/TextRank.txt'
    log_path = '../log/log.txt'
    bert_path = '../../chinese_wwm_ext_pytorch/'
    pre_trainModel = '../model/model.pth'  # model used when resuming training from a checkpoint

    log = open(log_path, 'w', encoding='utf-8')
    rouge = Rouge()  # evaluation metric
    device = torch.device('cuda:0')

    # Load the BERT model
    bert_config = BertConfig.from_pretrained(bert_path + 'bert_config.json')  # config file
    bert_model = BertModel.from_pretrained(bert_path + 'pytorch_model.bin', config=bert_config)  # model weights
    bert_model.to(device)
    tokenizer = BertTokenizer.from_pretrained(bert_path + 'vocab.txt')  # vocabulary

    config = Config()

    # Training set
    loader = DataLoader(dataset=MyDataSet(source_path, target_path, keyword_path, tokenizer),
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=0,
                        collate_fn=pad,
                        drop_last=False)  # keep the last (possibly smaller) batch
    # Evaluation set
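For comparison, a sketch of directory-based loading with the same transformers API. This assumes the checkpoint folder uses the standard file names (config.json, pytorch_model.bin, vocab.txt); the snippet above passes explicit bert_config.json and weight paths, which is what is needed when the folder does not follow that naming.

# Directory-based loading sketch; only valid if the folder contains
# config.json, pytorch_model.bin, and vocab.txt under the standard names.
from transformers import BertConfig, BertModel, BertTokenizer

bert_path = '../../chinese_wwm_ext_pytorch/'  # same directory as above
bert_config = BertConfig.from_pretrained(bert_path)
bert_model = BertModel.from_pretrained(bert_path, config=bert_config)
tokenizer = BertTokenizer.from_pretrained(bert_path)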
from oscar.modeling.modeling_distilbert import DistilBertForImageCaptioning
from transformers.modeling_bert import BertConfig

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dump_source", default="", type=str)
    parser.add_argument("--target_config", default="", type=str)
    parser.add_argument("--dump_target", default="", type=str)
    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

    f = open("/home/ubuntu/mmml/layers.log", 'w')
    model = BertForImageCaptioning.from_pretrained(args.dump_source)
    new_model = DistilBertForImageCaptioning(
        BertConfig.from_pretrained(args.target_config))
    state_dict = model.state_dict()
    compressed_sd = {}

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data.shape, file=f)
    print("\n\n", file=f)
    for name, param in new_model.named_parameters():
        if param.requires_grad:
            print(name, param.data.shape, file=f)

    prefix = "bert"
# Vocab and Tokenizer
ptr_dir = Path("pretrained")
vocab_filepath = ptr_dir / "{}-vocab.pkl".format(args.type)
with open(vocab_filepath, mode='rb') as io:
    vocab = pickle.load(io)

ptr_tokenizer = BertTokenizer.from_pretrained(args.type,
                                              do_lower_case="uncased" in args.type)
ptr_tokenizer = Tokenizer(vocab, ptr_tokenizer.tokenize)
preprocessor = PreProcessor(ptr_tokenizer, model_config.max_len)

# Load Model
config_filepath = ptr_dir / "{}-config.json".format(args.type)
config = BertConfig.from_pretrained(config_filepath, output_hidden_states=False)
model = BIIN(config, vocab, model_config.hidden_size,
             enc_num_layers=len(model_config.hidden_size))

# Data Loader
tr_ds = Corpus(data_config.tr_path,
               preprocessor.preprocess,
               sep='\t',
               doc_col='question1',
               label_col='is_duplicate',
               is_pair=True,
               doc_col_second='question2')
val_ds = Corpus(data_config.dev_path, preprocessor.preprocess,
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_neg_data', type=Path, required=True) parser.add_argument('--pregenerated_pos_data', type=Path, required=True) parser.add_argument('--validation_neg_data', type=Path, required=True) parser.add_argument('--validation_pos_data', type=Path, required=True) parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument('--exp_group', type=str, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--method", type=str, choices=[ 'neg_samebatch', 'distill_samebatch', 'distill_samebatch_lstm', 'distill', 'kl', 'unlikelihood' ]) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--save_before", action='store_true') parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--max_seq_len", default=512, type=int) parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--port_idx", type=int) parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--valid_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--kr_freq", default=0.0, type=float) parser.add_argument("--mlm_freq", default=0, type=float) parser.add_argument("--kl_w", default=1000, type=float) parser.add_argument("--ul_w", default=1, type=float) parser.add_argument("--gamma", default=0.5, type=float, help="coeff of UL and 1-coeff of LL") parser.add_argument('--no_mlm', action='store_true', help="don't do any MLM training") parser.add_argument("--no_tie", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--no_ul', action='store_true', help="don't do any UL training") parser.add_argument('--no_ll', action='store_true', help="don't do any LL training") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: print(torch.cuda.is_available()) device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() print("Num of gpus: ", n_gpu) else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) print("GPU Device: ", device) n_gpu = 1 dist_comms.init_distributed_training(args.local_rank, args.port_idx) # Initializes the distributed backend which will take care of sychronizing nodes/GPUs logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) # if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) pt_output = Path(getenv('PT_OUTPUT_DIR', '')) args.output_dir = Path(os.path.join(pt_output, args.output_dir)) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" 
) args.output_dir.mkdir(parents=True, exist_ok=True) if args.bert_model != "roberta-base": tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) else: tokenizer = RobertaTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) tokenizer.vocab = tokenizer.encoder total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model if args.bert_model != "roberta-base": if args.method == "neg_samebatch": config = BertConfig.from_pretrained(args.bert_model) config.bert_model = args.bert_model core_model = BertForNegSameBatch.from_pretrained(args.bert_model, args.gamma, config=config) core_model.init_orig_bert() elif args.method == "unlikelihood": config = BertConfig.from_pretrained(args.bert_model) core_model = BertForNegPreTraining.from_pretrained(args.bert_model, config=config) else: raise NotImplementedError( f"method {args.method} is not implemented") else: config = RobertaConfig.from_pretrained(args.bert_model) core_model = RobertaForNegPreTraining.from_pretrained(args.bert_model) core_model = core_model.to(device) # Prepare optimizer param_optimizer = list(core_model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) core_model, optimizer = amp.initialize(core_model, optimizer, opt_level=args.fp16_opt_level) model = torch.nn.parallel.DistributedDataParallel( core_model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() if args.local_rank == 0 or args.local_rank == -1: if args.save_before: before_train_path = Path( os.path.join(args.output_dir, "before_training")) print("Before training path: ", before_train_path) before_train_path.mkdir(parents=True, exist_ok=True) model.module.save_pretrained( os.path.join(args.output_dir, "before_training")) tokenizer.save_pretrained( os.path.join(args.output_dir, "before_training")) # writer = SummaryWriter(log_dir=args.output_dir) wandb.init(project="neg_v2", name=str(args.output_dir).split("/")[-1], group=args.exp_group, entity='negation') mlm_averagemeter = AverageMeter() ul_averagemeter = AverageMeter() ll_averagemeter = AverageMeter() kl_averagemeter = AverageMeter() neg_epoch_dataset = PregeneratedDataset( epoch=0, training_path=args.pregenerated_neg_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) pos_epoch_dataset = PregeneratedDataset( epoch=0, training_path=args.pregenerated_pos_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) neg_validation_dataset = PregeneratedDataset( epoch=0, training_path=args.validation_neg_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) pos_validation_dataset = PregeneratedDataset( epoch=0, training_path=args.validation_pos_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: neg_train_sampler = RandomSampler(neg_epoch_dataset) pos_train_sampler = RandomSampler(pos_epoch_dataset) neg_valid_sampler = RandomSampler(neg_validation_dataset) pos_valid_sampler = RandomSampler(pos_validation_dataset) else: neg_train_sampler = DistributedSampler(neg_epoch_dataset) pos_train_sampler = DistributedSampler(pos_epoch_dataset) neg_valid_sampler = DistributedSampler(neg_validation_dataset) pos_valid_sampler = DistributedSampler(pos_validation_dataset) neg_train_dataloader = DataLoader(neg_epoch_dataset, sampler=neg_train_sampler, batch_size=args.train_batch_size) pos_train_dataloader = DataLoader(pos_epoch_dataset, sampler=pos_train_sampler, batch_size=args.train_batch_size) neg_valid_dataloader = DataLoader(neg_validation_dataset, sampler=neg_valid_sampler, batch_size=args.valid_batch_size) pos_valid_dataloader = DataLoader(pos_validation_dataset, sampler=pos_valid_sampler, batch_size=args.valid_batch_size) def inf_train_gen(): while True: for kr_step, kr_batch in enumerate(neg_train_dataloader): yield kr_step, kr_batch kr_gen = inf_train_gen() def pos_inf_train_gen(): while True: for kr_step, kr_batch in enumerate(pos_train_dataloader): yield kr_step, kr_batch pos_kr_gen = pos_inf_train_gen() mlm_loss, neg_loss = 0, 0 mlm_nb_it, neg_nb_it = 1, 1 mlm_nb_ex, neg_nb_ex = 0, 0 for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: 
train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 ul_tr_loss = 0 nb_ul_tr_examples, nb_ul_tr_steps = 0, 1 ll_tr_loss = 0 nb_ll_tr_examples, nb_ll_tr_steps = 0, 1 kl_tr_loss = 0 nb_kl_tr_examples, nb_kl_tr_steps = 0, 1 if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1 and args.local_rank == 0): logging.info("** ** * Saving fine-tuned model ** ** * ") model.module.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): if not args.no_mlm and (random.random() > args.mlm_freq): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch outputs = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, masked_lm_labels=lm_label_ids, negated=False) loss = outputs[1] loss_dict = outputs[0] mlm_loss += loss_dict['mlm'].item() mlm_nb_it += 1 mlm_nb_ex += input_ids.size(0) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if args.local_rank == 0 or args.local_rank == -1: mlm_averagemeter.update(loss_dict['mlm'].item()) # writer.add_scalar('MLM/train', loss_dict['mlm'].item(), mlm_nb_it) wandb.log({'MLM/train': loss_dict['mlm'].item()}) nb_tr_steps += 1 nb_ll_tr_steps += 1 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str( f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}" ) if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() optimizer.zero_grad() global_step += 1 pbar.update(1) random_num = random.random() if random_num > args.kr_freq: if args.method in ["neg_samebatch"]: ul_step, ul_batch = next(kr_gen) ul_batch = tuple(t.to(device) for t in ul_batch) ul_input_ids, ul_input_mask, ul_segment_ids, ul_lm_label_ids = ul_batch ll_step, ll_batch = next(pos_kr_gen) ll_batch = tuple(t.to(device) for t in ll_batch) ll_input_ids, ll_input_mask, ll_segment_ids, ll_lm_label_ids = ll_batch batch_mask = torch.zeros( (ul_input_ids.size(0) + ll_input_ids.size(0)), dtype=ll_input_mask.dtype, device=device) batch_mask[:ul_input_ids.size(0)] = 1. 
outputs = model( input_ids=torch.cat([ul_input_ids, ll_input_ids], 0), attention_mask=torch.cat( [ul_input_mask, ll_input_mask], 0), token_type_ids=torch.cat( [ul_segment_ids, ll_segment_ids], 0), masked_lm_labels=torch.cat( [ul_lm_label_ids, ll_lm_label_ids], 0), negated=True, batch_neg_mask=batch_mask) loss = outputs[1] * args.ul_w loss_dict = outputs[0] if args.local_rank == 0 or args.local_rank == -1: wandb.log({ 'UL/train': loss_dict['neg'].item(), 'LL/train': loss_dict['pos'].item() }) ul_averagemeter.update(loss_dict['neg'].item()) ll_averagemeter.update(loss_dict['pos'].item()) neg_nb_it += 1 elif random.random() > 0.5 and not args.no_ul: kr_step, kr_batch = next(kr_gen) kr_batch = tuple(t.to(device) for t in kr_batch) input_ids, input_mask, segment_ids, lm_label_ids = kr_batch outputs = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, masked_lm_labels=lm_label_ids, negated=True) loss = outputs[1] * args.ul_w loss_dict = outputs[0] nb_ul_tr_steps += 1 neg_loss += loss_dict['neg'].item() if args.local_rank == 0 or args.local_rank == -1: wandb.log({ 'UL/train': loss_dict['neg'].item(), 'KL/train': loss_dict['kl'].item() * args.kl_w }) ul_averagemeter.update(loss_dict['neg'].item()) kl_averagemeter.update(loss_dict['kl'].item() * args.kl_w) neg_nb_it += 1 elif not args.no_ll: kr_step, kr_batch = next(pos_kr_gen) kr_batch = tuple(t.to(device) for t in kr_batch) input_ids, input_mask, segment_ids, lm_label_ids = kr_batch outputs = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, masked_lm_labels=lm_label_ids, negated=False) loss = outputs[1] loss_dict = outputs[0] nb_ll_tr_steps += 1 mlm_loss += loss_dict['mlm'].item() mlm_nb_it += 1 if args.local_rank == 0 or args.local_rank == -1: wandb.log({'LL/train': loss_dict['mlm'].item()}) ll_averagemeter.update(loss_dict['mlm'].item()) mlm_nb_ex += input_ids.size(0) else: continue if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if args.local_rank == 0 or args.local_rank == -1: nb_tr_steps += 1 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str( f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}" ) if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() optimizer.zero_grad() global_step += 1 if n_gpu > 1 and args.local_rank == -1 or ( n_gpu <= 1 and args.local_rank == 0): if False and (step + 1) % 100 == 0: neg_valid_res = validate( model=model, dataloader=neg_valid_dataloader, device=device, negated=True) pos_valid_res = validate( model=model, dataloader=pos_valid_dataloader, device=device, negated=False) wandb.log({ 'neg/valid/p@1': neg_valid_res % 100., 'pos/valid/p@1': pos_valid_res % 100. 
}) # Save a trained model if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1 and args.local_rank == 0): print("Saving model") logging.info("** ** * Saving fine-tuned model ** ** * ") model.module.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) print(str(wandb.run.id)) pickle.dump( str(wandb.run.id), open(os.path.join(args.output_dir, 'wandb_run_id.pkl'), 'wb'))
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = BertConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = BertConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = BertTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = BertTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. 
This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = BertForTagRankingLate.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = BertForTagRanking.from_config(config) # add vocab for special tokens and hashtags special_tokens = ['<img>', '<loc>', '<time>'] num_added_special_toks = tokenizer.add_tokens(special_tokens) print('We have added', num_added_special_toks, 'special tokens') tokenizer.img_token = '<img>' tokenizer.loc_token = '<loc>' tokenizer.time_token = '<time>' print(tokenizer.convert_tokens_to_ids(special_tokens)) assert tokenizer.img_token == '<img>' assert tokenizer.loc_token == '<loc>' assert tokenizer.time_token == '<time>' with open(data_args.tag_list) as f: tag_list = f.readlines() tag_list = ' '.join(tag_list).replace('\n', '').split() num_added_toks = tokenizer.add_tokens(tag_list) print('tag_list:', data_args.tag_list) print('We have added', num_added_toks, 'tokens for hashtags') print('total vocab_size:', len(tokenizer)) model.resize_token_embeddings(len(tokenizer)) if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_dataset = get_dataset( data_args, tokenizer=tokenizer) if training_args.do_train else None eval_dataset = get_dataset( data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None data_collator = DataCollatorForTagGeneration(config.vocab_size) training_args.per_device_eval_batch_size = 1 # force eval_batch as 1 # Initialize our Trainer trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") dataloader = trainer.get_eval_dataloader(eval_dataset) # multi-gpu eval if training_args.n_gpu > 1: model = torch.nn.DataParallel(model) description = "Evaluation" batch_size = dataloader.batch_size logger.info("***** Running %s *****", description) logger.info(" Num examples = %d", len(dataloader.dataset)) logger.info(" Batch size = %d", batch_size) model.eval() if is_torch_tpu_available(): dataloader = pl.ParallelLoader( dataloader, [training_args.device]).per_device_loader(training_args.device) results = {} for eid, example in enumerate(tqdm(dataloader, desc=description)): feature = convert_example_to_feature(example, tokenizer, data_args.block_size) image_ids = torch.tensor([feature['image_ids']], dtype=torch.long).to(training_args.device) location_ids = torch.tensor([feature['location_ids']], dtype=torch.long).to( training_args.device) time_ids = torch.tensor([feature['time_ids']], dtype=torch.long).to(training_args.device) text_ids = torch.tensor([feature['text_ids']], 
dtype=torch.long).to(training_args.device) pid = feature['pid'] inputs = { 'image_ids': image_ids, 'location_ids': location_ids, 'time_ids': time_ids, 'text_ids': text_ids } with torch.no_grad(): outputs = model(**inputs) logits = outputs[0] logit_for_cls = logits[0] orig_vocab_size = 30522 added_special_toks_size = 3 # <img>, <loc>, <time> logit_for_cls[:orig_vocab_size + added_special_toks_size] = -float('inf') probabilities = F.softmax(logit_for_cls, 0).detach().cpu() probs, predicted_indices = torch.topk(probabilities, k=10) predicted_tokens = tokenizer.convert_ids_to_tokens( predicted_indices) while pid in results: pid = pid + '_' results[pid] = predicted_tokens results_save_path = os.path.join(training_args.output_dir, 'results.json') with open(results_save_path, 'w') as f: logger.info("saved results.json into %s", training_args.output_dir) json.dump(results, f)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Model parameters %s", model_args) logger.info("Data parameters %s", data_args) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = BertConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = BertConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = BertConfig() logger.warning( "You are instantiating a new config instance from scratch.") config.loss_fct = model_args.loss_fct if model_args.tokenizer_name: tokenizer = BertTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = BertTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. 
This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = BertForTagGeneration.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir) else: logger.info("Training new model from scratch") model = BertForTagGeneration.from_config(config) # add vocab for special tokens and hashtags special_tokens = ['<img>', '<loc>', '<time>'] num_added_special_toks = tokenizer.add_tokens(special_tokens) print('We have added', num_added_special_toks, 'special tokens') tokenizer.img_token = '<img>' tokenizer.loc_token = '<loc>' tokenizer.time_token = '<time>' print(tokenizer.convert_tokens_to_ids(special_tokens)) assert tokenizer.img_token == '<img>' assert tokenizer.loc_token == '<loc>' assert tokenizer.time_token == '<time>' with open(data_args.tag_list) as f: tag_list = f.readlines() tag_list = ' '.join(tag_list).replace('\n', '').split() num_added_toks = tokenizer.add_tokens(tag_list) print('tag_list:', data_args.tag_list) print('We have added', num_added_toks, 'tokens for hashtags') print('total vocab_size:', len(tokenizer)) model.resize_token_embeddings(len(tokenizer)) if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) neptune_project_name = 'junmokang/bertinsta' neptune_experiment_name = 'bertinsta-generation' if not training_args.do_eval: if is_torch_tpu_available(): if xm.get_ordinal() == 0: neptune.init(neptune_project_name) neptune.create_experiment(name=neptune_project_name, params=training_args.__dict__) else: neptune.init(neptune_project_name) neptune.create_experiment(name=neptune_project_name, params=training_args.__dict__) # Get datasets train_dataset = get_dataset( data_args, tokenizer=tokenizer, loss_fct=model_args.loss_fct) if training_args.do_train else None eval_dataset = get_dataset( data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None data_collator = DataCollatorForTagGeneration(config.vocab_size, loss_fct=model_args.loss_fct) training_args.per_device_eval_batch_size = 1 # force eval_batch as 1 # Initialize our Trainer trainer = Trainer(model=model, args=training_args, neptune=neptune, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") dataloader = trainer.get_eval_dataloader(eval_dataset) # multi-gpu eval if training_args.n_gpu > 1: model = torch.nn.DataParallel(model) description = "Evaluation" batch_size = dataloader.batch_size logger.info("***** Running %s *****", description) logger.info(" Num examples = %d", len(dataloader.dataset)) logger.info(" Batch size = %d", batch_size) model.eval() if is_torch_tpu_available(): dataloader = pl.ParallelLoader( dataloader, [training_args.device]).per_device_loader(training_args.device) results = {} 
grouping_results = {} # interaction_matrix = np.zeros((6, 6)) # feature interaction beam_width = 1 top_k = 10 # tag to contexts mapping context_list = [ 'emotion', 'mood', 'location', 'time', 'object', 'activity', 'event', 'others' ] context2ids = {c: [] for c in context_list} if data_args.tag2contexts: with open(data_args.tag2contexts) as f: tag2contexts = json.load(f) for tag, contexts in tag2contexts.items(): for c in contexts: context2ids[c].append(tag) for c in context_list: context2ids[c] = tokenizer.convert_tokens_to_ids( context2ids[c]) for eid, example in enumerate(tqdm(dataloader, desc=description)): generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer, data_args.block_size, training_args.device) # generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer, data_args.block_size, training_args.device, None, interaction_matrix) # feature interaction results[example['pid']] = generated_tags grouping_results[example['pid']] = {} grouping_results[example['pid']]['all'] = generated_tags # print('all:', str(generated_tags)) # diverse generation (according to context) if data_args.tag2contexts: for context in context_list: generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer, data_args.block_size, training_args.device, context2ids[context]) grouping_results[example['pid']][context] = generated_tags # print(context, ':', str(generated_tags)) # with np.printoptions(precision=2, suppress=True): # feature interaction # print(interaction_matrix) # print(interaction_matrix.sum(1)) # print(interaction_matrix / interaction_matrix.sum(1)) results_save_path = os.path.join(training_args.output_dir, 'results.json') with open(results_save_path, 'w') as f: logger.info("saved results.json into %s", training_args.output_dir) json.dump(results, f) grouping_results_save_path = os.path.join(training_args.output_dir, 'grouping_results.json') with open(grouping_results_save_path, 'w') as f: logger.info("saved grouping_results.json into %s", training_args.output_dir) json.dump(grouping_results, f)
def run_bert_mwa_torch(args):
    vocab_file_path = os.path.join(
        bert_mwa_config.get("bert_pretrained_model_path"),
        bert_mwa_config.get("vocab_file"))
    bert_config_file = os.path.join(
        bert_mwa_config.get("bert_pretrained_model_path"),
        bert_mwa_config.get("bert_config_path"))
    slot_file = os.path.join(
        bert_mwa_config.get("slot_list_root_path"),
        bert_mwa_config.get("bert_slot_complete_file_name"))
    data_loader = bertWordPrepareData(vocab_file_path, slot_file,
                                      bert_mwa_config, None, 384, None, None,
                                      False, False, True)
    label2id = data_loader.tokenizer.slot2id

    train_features = data_loader.load_cache_train_dev_data()
    train_dataset = create_dataset_for_torch(train_features)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_features = data_loader.load_cache_train_dev_data(False)
    valid_dataset = create_dataset_for_torch(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.valid_batch_size)

    model = BertForMWA
    device = torch.device("cuda:0")

    if args.do_train:
        model = model.from_pretrained(
            bert_mwa_config.get("bert_pretrained_model_path"),
            device=device,
            label2ids=label2id)
        model = model.to(device)

        if data_loader.train_samples_nums % args.train_batch_size != 0:
            each_epoch_steps = int(
                data_loader.train_samples_nums / args.train_batch_size) + 1
        else:
            each_epoch_steps = int(data_loader.train_samples_nums /
                                   args.train_batch_size)
        train_steps_nums = each_epoch_steps * args.epochs

        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        # optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-6)
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.lr,
                             max_grad_norm=args.clip_norm,
                             warmup=0.1,
                             t_total=train_steps_nums)
        # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", data_loader.train_samples_nums)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", train_steps_nums)
        trainer(model, optimizer, train_dataloader, valid_dataloader,
                args.epochs, train_steps_nums, each_epoch_steps,
                data_loader.tokenizer.id2slot, device, logger, args)

    if args.do_test:
        test_features = data_loader.load_cache_train_dev_data(False, True)
        test_dataset = create_dataset_for_torch(test_features)
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)
        bertconfig = BertConfig.from_pretrained(
            bert_mwa_config.get("bert_pretrained_model_path"))
        model = BertForMWA(bertconfig, label2id, device)
        model.load_state_dict(state_dict=torch.load(
            bert_mwa_config.get(args.model_checkpoint_dir) +
            "/pytorch_model.bin"))
        # model = model.from_pretrained(, device=device, label2ids=label2id)
        model.to(device)
        predict_all_and_evaluate(model, test_dataloader,
                                 data_loader.tokenizer.id2slot, device, logger,
                                 "data/orig_data_test.txt", args)
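# Sketch of the AdamW path that the commented-out lines above hint at, using the
# transformers optimizer/scheduler API instead of BertAdam. The eps value comes
# from the commented AdamW call and the 10% warmup mirrors warmup=0.1; treat both
# as assumptions rather than the project's tested configuration.
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-6)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * train_steps_nums),
    num_training_steps=train_steps_nums)
# Unlike BertAdam, AdamW does no warmup or gradient clipping by itself: call
# scheduler.step() after each optimizer.step(), and clip gradients explicitly
# with torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm).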
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    if torch.cuda.is_available():
        print("current device: ", torch.cuda.current_device())

    # special tokens
    SOPH = '<soph>'
    NSOPH = '<nsoph>'

    config = BertConfig.from_pretrained('bert-base-uncased')

    # fix the random seed
    SEED = 1234
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    num_added_token = tokenizer.add_tokens([SOPH, NSOPH])

    INPUT_DIM = len(tokenizer)   # len(SRC.vocab)
    OUTPUT_DIM = len(tokenizer)  # len(TRG.vocab)
    HID_DIM = 768
    DEC_LAYERS = 3
    DEC_HEADS = 8
    DEC_PF_DIM = 512
    ENC_DROPOUT = 0.1
    DEC_DROPOUT = 0.1
    SRC_PAD_IDX = 0
    TRG_PAD_IDX = 0
    BATCH_SIZE = 100
    MAX_SEQ_LEN = 50
    N_EPOCHS = 5
    CLIP = 1
    LEARNING_RATE = 0.0005
    SAVE_PATH = 'tut6-model.pt'
    LOAD_PATH = 'tut6-model.pt'
    unfreeze_bert = False
    do_load = False
    do_train = False
    do_eval = False
    do_generate = True

    dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
                  DEC_DROPOUT, device)
    model = Seq2Seq(dec, SRC_PAD_IDX, TRG_PAD_IDX, config, device).to(device)

    # Resize token embeddings to account for the added special tokens
    model.bert_encoder.resize_token_embeddings(len(tokenizer))
    model.decoder.apply(initialize_weights)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
    best_valid_loss = float('inf')

    processor = DiscoFuseProcessor()
    valid_iterator, num_val_ex = make_DataLoader(data_dir='./',
                                                 processor=processor,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=MAX_SEQ_LEN,
                                                 batch_size=BATCH_SIZE,
                                                 mode="dev",
                                                 SOPH=SOPH,
                                                 NSOPH=NSOPH,
                                                 domain="sports")

    if do_train:
        for param in model.bert_encoder.parameters():
            param.requires_grad = unfreeze_bert
        print(f'The model has {count_parameters(model):,} trainable parameters')

        train_iterator, num_tr_ex = make_DataLoader(data_dir='./',
                                                    processor=processor,
                                                    tokenizer=tokenizer,
                                                    max_seq_length=MAX_SEQ_LEN,
                                                    batch_size=BATCH_SIZE,
                                                    mode="train",
                                                    SOPH=SOPH,
                                                    NSOPH=NSOPH)
        print("---- Begin Training ----")
        if do_load and os.path.exists(LOAD_PATH):
            print("---- Loading model from {} ----".format(LOAD_PATH))
            model.load_state_dict(torch.load(LOAD_PATH))

        for epoch in range(N_EPOCHS):
            start_time = time.time()
            num_batches_in_epoch = int(num_tr_ex / BATCH_SIZE)  # 10000
            train_loss = train(model, train_iterator, optimizer, criterion,
                               CLIP, num_batches_in_epoch, device=device)
            valid_loss, valid_exact = evaluate(model, valid_iterator, criterion,
                                               device=device, tokenizer=tokenizer)
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), SAVE_PATH)

            print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
            print(f'\t Val. EXACT: {valid_exact:.2f}')

    elif do_eval:
        print("Doing only evaluation")
        model.load_state_dict(torch.load(LOAD_PATH))
        valid_loss, valid_exact = evaluate(model, valid_iterator, criterion,
                                           device=device, tokenizer=tokenizer)
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. EXACT: {valid_exact:3.3f}')

    elif do_generate:
        print("Doing only generation")
        model.load_state_dict(torch.load(LOAD_PATH))
        all_predictions, all_trgs, all_counter_predictions = generate(
            model, valid_iterator, device, tokenizer)
        all_counter_pred_str = [" ".join(a).replace(" ##", "") for a in all_counter_predictions]
        all_pred_str = [" ".join(a).replace(" ##", "") for a in all_predictions]
        all_trgs_str = [" ".join(a).replace(" ##", "") for a in all_trgs]
        with open("generated_fuse.txt", 'a') as fp:
            for i in range(len(all_predictions)):
                counter_pred_line = "Counter pred: " + all_counter_pred_str[i] + "\n"
                pred_line = "Origin pred: " + all_pred_str[i] + "\n"
                trg_line = "Origin trg: " + all_trgs_str[i] + "\n\n"
                fp.write(counter_pred_line)
                fp.write(pred_line)
                fp.write(trg_line)

    else:
        raise ValueError("Error - must either train, evaluate, or generate!")
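# Small self-contained illustration of the " ##" detokenization used when writing
# generated_fuse.txt above; the token list is a made-up WordPiece example.
tokens = ['he', 'was', 'un', '##aware', 'of', 'the', 'rain', '.']
fused = " ".join(tokens).replace(" ##", "")
print(fused)  # -> he was unaware of the rain .
# BertTokenizer offers the same behaviour directly via
# tokenizer.convert_tokens_to_string(tokens).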