def forward( self, pixel_values=None, head_mask=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: Examples: ```python >>> from transformers import AutoFeatureExtractor, SwinForImageClassification >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") >>> model = SwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> logits = outputs.logits >>> # model predicts one of the 1000 ImageNet classes >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.swin( pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
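The head above picks its loss purely from `num_labels`: one label means regression with MSE, more than one means cross-entropy classification. A minimal standalone sketch of that switch on dummy tensors (batch size and label values are illustrative only):

```python
import torch
from torch.nn import CrossEntropyLoss, MSELoss

num_labels = 1  # set to >1 to exercise the classification branch instead
logits = torch.randn(4, num_labels)  # (batch_size, num_labels) from the classification head
labels = torch.randn(4) if num_labels == 1 else torch.randint(0, num_labels, (4,))

if num_labels == 1:
    # regression: mean-square error between the flattened logits and float targets
    loss = MSELoss()(logits.view(-1), labels.view(-1))
else:
    # classification: cross-entropy over num_labels classes
    loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
print(loss.item())
```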
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': opt.device = torch.device(f'cuda:{index}') if opt.distributed: opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index dist.init_process_group(backend='nccl', init_method=opt.dist_url, world_size=opt.world_size, rank=opt.dist_rank) opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) opt.n_threads = int( (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) opt.is_master_node = not opt.distributed or opt.dist_rank == 0 model = generate_model(opt) if opt.batchnorm_sync: assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.model, opt.n_finetune_classes) if opt.resume_path is not None: model = resume_model(opt.resume_path, opt.arch, model) model = make_data_parallel(model, opt.distributed, opt.device) if opt.pretrain_path: parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) else: parameters = model.parameters() if opt.is_master_node: print(model) # criterion = CrossEntropyLoss().to(opt.device) criterion = MSELoss().to(opt.device) if not opt.no_train: (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) if opt.resume_path is not None: opt.begin_epoch, optimizer, scheduler = resume_train_utils( opt.resume_path, opt.begin_epoch, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones if not opt.no_val: val_loader, val_logger = get_val_utils(opt) if opt.tensorboard and opt.is_master_node: from torch.utils.tensorboard import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None prev_val_loss = None for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) current_lr = get_lr(optimizer) train_epoch(i, train_loader, model, criterion, optimizer, opt.device, current_lr, train_logger, train_batch_logger, tb_writer, opt.distributed) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if not opt.no_val: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer, opt.distributed) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif not opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) # try doing test later here if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}_results.json'.format( opt.inference_subset) inference.inference(inference_loader, model, inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, "yelp": YelpProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", "yelp": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: 
preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
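The eval loops above collect logits batch by batch with `np.append` before turning them into predictions. A tiny standalone sketch of that accumulation pattern (array shapes and values are illustrative only):

```python
import numpy as np

# toy per-batch logits, shaped like what the eval loop above collects
batch_logits = [np.random.randn(8, 3), np.random.randn(8, 3)]

preds = []
for logits in batch_logits:
    if len(preds) == 0:
        preds.append(logits)
    else:
        preds[0] = np.append(preds[0], logits, axis=0)

preds = preds[0]                        # (16, 3) stacked logits
class_preds = np.argmax(preds, axis=1)  # "classification" output mode
print(class_preds.shape)
```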
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification (or regression if config.num_labels==1) loss. logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import AlbertTokenizer, AlbertForSequenceClassification import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForSequenceClassification.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions)
def train(self, train_examples, model): """ Trains the model. Parameters ---------- examples: list Contains the data as a list of InputExample's model: BertModel The Bert model to be trained. weights: list Contains class weights. Returns ------- model: BertModel The trained model. """ validation_examples = self.get_data('validation') global_step = 0 self.validation_losses = [] # Training train_dataloader = self.get_loader(train_examples, 'train') model.train() step_number = len(train_dataloader) i = 0 for _ in trange(int(self.config.num_train_epochs), desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc='Iteration')): if (self.config.gradual_unfreeze and i == 0): for param in model.bert.parameters(): param.requires_grad = False if (step % (step_number // 3)) == 0: i += 1 if (self.config.gradual_unfreeze and i > 1 and i < self.config.encoder_no): for k in range(i - 1): try: for param in model.bert.encoder.layer[ self.config.encoder_no - 1 - k].parameters(): param.requires_grad = True except: pass if (self.config.gradual_unfreeze and i > self.config.encoder_no + 1): for param in model.bert.embeddings.parameters(): param.requires_grad = True batch = tuple(t.to(self.device) for t in batch) input_ids, attention_mask, token_type_ids, label_ids, agree_ids = batch logits = model(input_ids, attention_mask, token_type_ids)[0] weights = self.class_weights.to(self.device) if self.config.output_mode == "classification": loss_fct = CrossEntropyLoss(weight=weights) loss = loss_fct(logits.view(-1, self.num_labels), label_ids.view(-1)) elif self.config.output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if self.config.gradient_accumulation_steps > 1: loss = loss / self.config.gradient_accumulation_steps else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % self.config.gradient_accumulation_steps == 0: if self.config.fp16: lr_this_step = self.config.learning_rate * warmup_linear( global_step / self.num_train_optimization_steps, self.config.warm_up_proportion) for param_group in self.optimizer.param_groups: param_group['lr'] = lr_this_step torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) self.optimizer.step() self.scheduler.step() self.optimizer.zero_grad() global_step += 1 # Validation validation_loader = self.get_loader(validation_examples, phase='eval') model.eval() valid_loss, valid_accuracy = 0, 0 nb_valid_steps, nb_valid_examples = 0, 0 for input_ids, attention_mask, token_type_ids, label_ids, agree_ids in tqdm( validation_loader, desc="Validating"): input_ids = input_ids.to(self.device) attention_mask = attention_mask.to(self.device) token_type_ids = token_type_ids.to(self.device) label_ids = label_ids.to(self.device) agree_ids = agree_ids.to(self.device) with torch.no_grad(): logits = model(input_ids, attention_mask, token_type_ids)[0] if self.config.output_mode == "classification": loss_fct = CrossEntropyLoss(weight=weights) tmp_valid_loss = loss_fct( logits.view(-1, self.num_labels), label_ids.view(-1)) elif self.config.output_mode == "regression": loss_fct = MSELoss() tmp_valid_loss = loss_fct(logits.view(-1), label_ids.view(-1)) valid_loss += tmp_valid_loss.mean().item() nb_valid_steps += 1 valid_loss = valid_loss / nb_valid_steps self.validation_losses.append(valid_loss) print("Validation losses: {}".format(self.validation_losses)) if valid_loss == min(self.validation_losses): try: 
os.remove(self.config.model_dir / ('temporary' + str(best_model))) except: # no previous best checkpoint exists yet (first epoch), nothing to remove print('No best model found') torch.save({ 'epoch': str(i), 'state_dict': model.state_dict() }, self.config.model_dir / ('temporary' + str(i))) best_model = i # Save the best model and the associated configuration checkpoint = torch.load(self.config.model_dir / ('temporary' + str(best_model))) model.load_state_dict(checkpoint['state_dict']) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join(self.config.model_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(self.config.model_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) os.remove(self.config.model_dir / ('temporary' + str(best_model))) return model
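The training loop above implements gradual unfreezing: everything starts frozen, then encoder blocks are opened from the top down at fixed step intervals, and finally the embeddings. A minimal sketch of that schedule on a tiny `BertModel` so it runs offline (the layer count, sizes, and `unfreeze_top_k` helper are illustrative, not the exact configuration used above):

```python
from transformers import BertConfig, BertModel

# small stand-in model; the loop above applies the same idea to its fine-tuning BERT
model = BertModel(BertConfig(num_hidden_layers=4, hidden_size=64,
                             num_attention_heads=4, intermediate_size=128))

# 1) start with everything frozen
for param in model.parameters():
    param.requires_grad = False

n_layers = model.config.num_hidden_layers

def unfreeze_top_k(model, k):
    """Unfreeze the top k encoder layers; once all layers are open, also unfreeze embeddings."""
    for layer in model.encoder.layer[n_layers - k:]:
        for param in layer.parameters():
            param.requires_grad = True
    if k >= n_layers:
        for param in model.embeddings.parameters():
            param.requires_grad = True

# 2) after each unfreezing interval (a third of an epoch above), open one more block from the top
unfreeze_top_k(model, 2)
print(sum(p.requires_grad for p in model.parameters()), "trainable parameter tensors")
```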
def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, )
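The `problem_type` inference above decides between regression, single-label, and multi-label losses from `num_labels` and the label dtype. A standalone sketch of the same decision on plain tensors (shapes and label counts are illustrative only):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

def infer_loss(logits, labels, num_labels):
    """Mirror of the problem_type inference above, on plain tensors."""
    if num_labels == 1:
        problem_type = "regression"
    elif labels.dtype in (torch.long, torch.int):
        problem_type = "single_label_classification"
    else:
        problem_type = "multi_label_classification"

    if problem_type == "regression":
        return MSELoss()(logits.squeeze(), labels.squeeze())
    if problem_type == "single_label_classification":
        return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
    return BCEWithLogitsLoss()(logits, labels.float())

print(infer_loss(torch.randn(4, 1), torch.randn(4), 1))                       # regression
print(infer_loss(torch.randn(4, 3), torch.randint(0, 3, (4,)), 3))            # single-label
print(infer_loss(torch.randn(4, 3), torch.randint(0, 2, (4, 3)).float(), 3))  # multi-label
```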
def __init__(self, exploration='epsilonGreedy', memory=10000, discount=0.99, uncertainty=True, uncertainty_weight=1, update_every=200, double=True, use_distribution=True, reward_normalization=False, encoder=None, hidden_size=40, state_difference=True, state_difference_weight=1, **kwargs) -> None: self.uncertainty = uncertainty self.hidden_size = hidden_size self.network = NetWork(self.hidden_size).to(device) self.createEncoder(encoder) self.network.hasEncoder = self.hasEncoder print("Number of parameters in network:", count_parameters(self.network)) self.criterion = MSELoss() self.memory = ReplayBuffer(int(memory)) self.remember = self.memory.remember() self.exploration = Exploration() if exploration == 'greedy': self.explore = self.exploration.greedy elif exploration == 'epsilonGreedy': self.explore = self.exploration.epsilonGreedy elif exploration == 'softmax': self.explore = self.exploration.softmax elif exploration == 'epsintosoftmax': self.explore = self.exploration.epsintosoftmax self.target_network = NetWork(self.hidden_size).to(device) self.target_network.hasEncoder = self.hasEncoder self.placeholder_network = NetWork(self.hidden_size).to(device) self.placeholder_network.hasEncoder = self.hasEncoder self.gamma, self.f = discount, 0 self.update_every, self.double, self.use_distribution = update_every, double, use_distribution self.counter = 0 self.reward_normalization = reward_normalization self.state_difference = state_difference self.true_state_trace = None self.uncertainty_weight = uncertainty_weight self.state_difference_weight = state_difference_weight if encoder is not None: self.optimizer_value = Adam( list(self.network.fromEncoder.parameters()) + list(self.network.lstm.parameters()) + list(self.network.linear.parameters()), lr=1e-4, weight_decay=1e-5) else: self.optimizer_value = Adam(list(self.network.color.parameters()) + list(self.network.conv1.parameters()) + list(self.network.lstm.parameters()) + list(self.network.linear.parameters()), lr=1e-4, weight_decay=1e-5) if self.uncertainty: self.optimizer_exploration = Adam(list( self.network.exploration_network.parameters()), lr=1e-4, weight_decay=1e-5) if self.state_difference: self.optimizer_state_avoidance = Adam(list( self.network.state_difference_network.parameters()), lr=1e-4, weight_decay=1e-5) self.onpolicy = True
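The constructor above wires up an online network, a target network, an `MSELoss` criterion, and an `update_every` counter. A minimal sketch of the value-update pattern those pieces usually support: MSE between Q(s, a) and a bootstrapped target, with a periodic hard copy into the target network (the tiny linear networks, shapes, and hyperparameters are placeholders, not the agent's real `NetWork` class):

```python
import torch
from torch.nn import Linear, MSELoss
from torch.optim import Adam

q_net, target_net = Linear(8, 4), Linear(8, 4)
target_net.load_state_dict(q_net.state_dict())
optimizer = Adam(q_net.parameters(), lr=1e-4)
criterion = MSELoss()
gamma, update_every = 0.99, 200

for step in range(1, 401):
    # stand-in replay batch
    state, next_state = torch.randn(32, 8), torch.randn(32, 8)
    action = torch.randint(0, 4, (32, 1))
    reward = torch.randn(32)

    q_sa = q_net(state).gather(1, action).squeeze(1)
    with torch.no_grad():
        target = reward + gamma * target_net(next_state).max(dim=1).values

    loss = criterion(q_sa, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % update_every == 0:  # hard copy, as the update_every counter suggests
        target_net.load_state_dict(q_net.state_dict())
```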
def forward(self, input_ids, src_probs=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None, loss_ignore_index=-100, hard_labels=None, hard_labels_mask=None, hard_label_loss_weight=0): outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits, ) + outputs[ 2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels)[active_loss] active_labels = labels.view(-1)[active_loss] loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss, ) + outputs if src_probs is not None: # ## KL Divergence # loss_KD_fct = KLDivLoss(reduction="mean") # log_probs = torch.nn.functional.log_softmax(logits, dim=-1) # if attention_mask is not None: # active_loss = attention_mask.view(-1) == 1 # active_log_probs = log_probs.view(-1, self.num_labels)[active_loss] # active_src_probs = src_probs.view(-1, self.num_labels)[active_loss] # # loss_KD = loss_KD_fct(active_log_probs, active_src_probs) # else: # loss_KD = loss_KD_fct(log_probs, src_probs) # ## CrossEntropy # loss_KD_fct = CrossEntropyLoss() # src_labels = torch.argmax(src_probs.view(-1, self.num_labels), dim=-1) # if attention_mask is not None: # active_loss = attention_mask.view(-1) == 1 # active_logits = logits.view(-1, self.num_labels)[active_loss] # active_src_labels = src_labels[active_loss] # # loss_KD = loss_KD_fct(active_logits, active_src_labels) # else: # loss_KD = loss_KD_fct(logits.view(-1, self.num_labels), src_labels) ## L2 Norm loss_KD_fct = MSELoss(reduction="mean") probs = torch.nn.functional.softmax(logits, dim=-1) if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 # Note that, even in TS learning, actually here we do NOT use the label information. Instead we just remove the loss w.r.t sub-word starting with "##" in BERT inactive_subword = labels.view(-1) == loss_ignore_index active_loss[inactive_subword] = 0 active_probs = probs.view(-1, self.num_labels)[active_loss] active_src_probs = src_probs.view(-1, self.num_labels)[active_loss] loss_KD = loss_KD_fct(active_probs, active_src_probs) else: loss_KD = loss_KD_fct(probs, src_probs) if hard_labels is not None: hard_active_loss = active_loss == 1 if hard_labels_mask is not None: hard_active_loss[hard_labels_mask.view(-1) == 0] = 0 active_hard_probs = probs.view( -1, self.num_labels)[hard_active_loss] activate_hard_labels = hard_labels.view( -1, self.num_labels)[hard_active_loss] loss_hard = loss_KD_fct(active_hard_probs, activate_hard_labels) if hard_label_loss_weight >= 0: # use both soft labels and hard labels for calculating loss loss_KD += loss_hard * hard_label_loss_weight else: # i.e., Only use the ensured labels for training, NO soft labels are used loss_KD = loss_hard outputs = (loss_KD, ) + outputs return outputs # (loss_KD), (loss), scores, (hidden_states), (attentions)
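The L2-norm branch above computes MSE between the student's token probabilities and the teacher's `src_probs`, restricted to attended tokens and excluding sub-words marked with the ignore index. A standalone sketch of that masked soft-label loss on dummy tensors (batch/sequence sizes and the ignore index are illustrative):

```python
import torch
from torch.nn import MSELoss

num_labels, ignore_index = 5, -100
logits = torch.randn(2, 7, num_labels)                            # student token logits
src_probs = torch.softmax(torch.randn(2, 7, num_labels), dim=-1)  # teacher probabilities
attention_mask = torch.ones(2, 7, dtype=torch.long)
labels = torch.randint(0, num_labels, (2, 7))
labels[:, 3] = ignore_index                                       # e.g. "##" sub-word pieces

active = attention_mask.view(-1) == 1
active[labels.view(-1) == ignore_index] = False  # drop ignored sub-words from the KD loss

probs = torch.softmax(logits, dim=-1)
loss_kd = MSELoss(reduction="mean")(
    probs.view(-1, num_labels)[active],
    src_probs.view(-1, num_labels)[active],
)
print(loss_kd.item())
```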
n_epochs = 20  # (lr = learning rate)

# Grabs the first graph file for training (you could loop over all of them)
data_list = torch.load('E:\\final_project\\data\\graphs_ran\\%s' % graphs[0])

## TRAINING
scaler = StandardScaler()  # scaler for the node features
loader = DataLoader(data_list, batch_size=batch_size)  # loads the graph file into batches
loss_list = list()  # holds the loss for plotting

model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)  # optimizer
loss_func = MSELoss()  # loss function

for i in range(0, len(loader)):  # loop over batches
    data_train = next(iter(loader))
    data_train = data_train.to(device)  # mounts data to device

    ## 'PRE'-PROCESSING
    # The next few lines scale the node features
    # (this was not done before putting them into the graph file)
    if i == 0:
        scaler.fit(data_train.x.cpu().numpy()[:, 0:5])
        data_train.x = torch.tensor(scaler.transform(data_train.x.cpu().numpy()[:, 0:5]),
                                    dtype=torch.float).float().cuda()
    else:
        data_train.x = torch.tensor(scaler.transform(data_train.x.cpu().numpy()[:, 0:5]),
                                    dtype=torch.float).float().cuda()
#     pin_memory=True
# )
checkpoint = '../model/{}_fold_{}_gpu_{}.pth'.format(args.model, fold, args.gpu)
model = get_model()

if args.finetune:
    state_dict = torch.load(checkpoint)
    model.load_state_dict(state_dict)
    # freeze everything except the last 109 parameter tensors
    for param in list(model.parameters())[:-109]:
        param.requires_grad = False

model.cuda()

if TRAIN:
    # criterion = mixed_loss
    criterion = MSELoss()
    if args.lr_scheduler == 'step':
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.lr / 100)
        scheduler = StepLR(optimizer, step_size=7, gamma=.1)
    else:
        optimizer = Adam(model.parameters(), lr=args.lr)
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='min',
                                      patience=3,
                                      verbose=1,
                                      cooldown=1,
                                      min_lr=args.lr / 1000,
                                      factor=.1)
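One detail worth showing for the scheduler setup above: `ReduceLROnPlateau` must be stepped with the monitored validation metric, while `StepLR` is stepped without arguments. A small runnable sketch with a placeholder model and synthetic validation losses:

```python
import torch
from torch.nn import Linear, MSELoss
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR

model = Linear(10, 1)
optimizer = Adam(model.parameters(), lr=1e-3)
lr_scheduler = 'plateau'  # or 'step', mirroring args.lr_scheduler above

if lr_scheduler == 'step':
    scheduler = StepLR(optimizer, step_size=7, gamma=0.1)
else:
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.1)

criterion = MSELoss()
for epoch in range(10):
    val_loss = criterion(model(torch.randn(16, 10)).squeeze(1), torch.randn(16)).item()
    if lr_scheduler == 'step':
        scheduler.step()            # StepLR is purely epoch-driven
    else:
        scheduler.step(val_loss)    # ReduceLROnPlateau needs the monitored metric
```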
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--data_dir", default="data/MNLI", type=str, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--teacher_model", default="pretrained/checkpoint-31280/", type=str, help="The teacher model dir.") parser.add_argument("--student_model", default="pretrained/generalbert", type=str, help="The student model dir.") parser.add_argument("--task_name", default="MNLI", type=str, help="The name of the task to train.") parser.add_argument( "--output_dir", default="output", type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=384, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=128, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--weight_decay', '--wd', default=1e-4, type=float, metavar='W', help='weight decay') parser.add_argument("--num_train_epochs", default=5.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) # added arguments parser.add_argument('--aug_train', action='store_true') parser.add_argument('--eval_step', type=float, default=0.1) parser.add_argument('--pred_distill', action='store_true') parser.add_argument('--data_url', type=str, default="") parser.add_argument('--temperature', type=float, default=1.) 
args = parser.parse_args() logger.info('The args: {}'.format(args)) # intermediate distillation default parameters default_params = { "cola": { "num_train_epochs": 50, "max_seq_length": 64 }, "mnli": { "num_train_epochs": 5, "max_seq_length": 128 }, "mrpc": { "num_train_epochs": 20, "max_seq_length": 128 }, "sst-2": { "num_train_epochs": 10, "max_seq_length": 64 }, "sts-b": { "num_train_epochs": 20, "max_seq_length": 128 }, "qqp": { "num_train_epochs": 5, "max_seq_length": 128 }, "qnli": { "num_train_epochs": 10, "max_seq_length": 128 }, "rte": { "num_train_epochs": 20, "max_seq_length": 128 } } acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] corr_tasks = ["sts-b"] mcc_tasks = ["cola"] # Prepare devices device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger.info("device: {} n_gpu: {}".format(device, n_gpu)) tb = SummaryWriter("./runs") # Prepare seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Prepare task settings if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name in default_params: args.max_seq_len = default_params[task_name]["max_seq_length"] if not args.pred_distill and not args.do_eval: if task_name in default_params: args.num_train_epoch = default_params[task_name][ "num_train_epochs"] if task_name not in processors: raise ValueError("Task not found: %s" % task_name) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=args.do_lower_case) if not args.do_eval: if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps train_data, _ = get_tensor_data(args, task_name, tokenizer, False, args.aug_train) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = int( len(train_dataloader) / args.gradient_accumulation_steps) * args.num_train_epochs eval_data, eval_labels = get_tensor_data(args, task_name, tokenizer, True, False) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) if not args.do_eval: teacher_model = TinyBertForSequenceClassification.from_pretrained( args.teacher_model, num_labels=num_labels) teacher_model.to(device) student_model = TinyBertForSequenceClassification.from_pretrained( args.student_model, num_labels=num_labels) student_model.to(device) if args.do_eval: logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_data)) logger.info(" Batch size = %d", args.eval_batch_size) student_model.eval() result = do_eval(student_model, task_name, eval_dataloader, device, output_mode, eval_labels, num_labels) logger.info("***** Eval results *****") for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) else: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_data)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if n_gpu > 1: student_model = torch.nn.DataParallel(student_model) teacher_model = torch.nn.DataParallel(teacher_model) # Prepare optimizer param_optimizer = list(student_model.named_parameters()) size = 0 for n, p in student_model.named_parameters(): logger.info('n: {}'.format(n)) size += p.nelement() logger.info('Total parameters: {}'.format(size)) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] schedule = 'warmup_linear' if not args.pred_distill: schedule = 'none' optimizer = BertAdam(optimizer_grouped_parameters, schedule=schedule, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # Prepare loss functions loss_mse = MSELoss() def soft_cross_entropy(predicts, targets): student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1) targets_prob = torch.nn.functional.softmax(targets, dim=-1) return (-targets_prob * student_likelihood).mean() # Train and evaluate global_step = 0 best_dev_acc = 0.0 output_eval_file = os.path.join(args.output_dir, "eval_results.txt") for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0. tr_att_loss = 0. tr_rep_loss = 0. tr_cls_loss = 0. student_model.train() nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", ascii=True)): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch if input_ids.size()[0] != args.train_batch_size: continue att_loss = 0. rep_loss = 0. cls_loss = 0. 
student_logits, student_atts, student_reps = student_model( input_ids, segment_ids, input_mask, is_student=True) with torch.no_grad(): teacher_logits, teacher_atts, teacher_reps = teacher_model( input_ids, segment_ids, input_mask) if not args.pred_distill: teacher_layer_num = len(teacher_atts) student_layer_num = len(student_atts) assert teacher_layer_num % student_layer_num == 0 layers_per_block = int(teacher_layer_num / student_layer_num) new_teacher_atts = [ teacher_atts[i * layers_per_block + layers_per_block - 1] for i in range(student_layer_num) ] for student_att, teacher_att in zip( student_atts, new_teacher_atts): student_att = torch.where( student_att <= -1e2, torch.zeros_like(student_att).to(device), student_att) teacher_att = torch.where( teacher_att <= -1e2, torch.zeros_like(teacher_att).to(device), teacher_att) tmp_loss = loss_mse(student_att, teacher_att) att_loss += tmp_loss new_teacher_reps = [ teacher_reps[i * layers_per_block] for i in range(student_layer_num + 1) ] new_student_reps = student_reps for student_rep, teacher_rep in zip( new_student_reps, new_teacher_reps): tmp_loss = loss_mse(student_rep, teacher_rep) rep_loss += tmp_loss loss = rep_loss + att_loss tr_att_loss += att_loss.item() tr_rep_loss += rep_loss.item() else: if output_mode == "classification": cls_loss = soft_cross_entropy( student_logits / args.temperature, teacher_logits / args.temperature) elif output_mode == "regression": loss_mse = MSELoss() cls_loss = loss_mse(student_logits.view(-1), label_ids.view(-1)) loss = cls_loss tr_cls_loss += cls_loss.item() if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tb.add_scalar("loss", loss.item(), global_step) tr_loss += loss.item() nb_tr_examples += label_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 if (global_step + 1) % int( args.eval_step * num_train_optimization_steps) == 0: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format( epoch_, global_step)) logger.info(" Num examples = %d", len(eval_data)) logger.info(" Batch size = %d", args.eval_batch_size) student_model.eval() loss = tr_loss / (step + 1) cls_loss = tr_cls_loss / (step + 1) att_loss = tr_att_loss / (step + 1) rep_loss = tr_rep_loss / (step + 1) result = {} if args.pred_distill: result = do_eval(student_model, task_name, eval_dataloader, device, output_mode, eval_labels, num_labels) result['global_step'] = global_step result['cls_loss'] = cls_loss result['att_loss'] = att_loss result['rep_loss'] = rep_loss result['loss'] = loss result_to_file(result, output_eval_file) if not args.pred_distill: save_model = True else: save_model = False if task_name in acc_tasks and result[ 'acc'] > best_dev_acc: best_dev_acc = result['acc'] save_model = True if task_name in corr_tasks and result[ 'corr'] > best_dev_acc: best_dev_acc = result['corr'] save_model = True if task_name in mcc_tasks and result[ 'mcc'] > best_dev_acc: best_dev_acc = result['mcc'] save_model = True if save_model: logger.info("***** Save model *****") model_to_save = student_model.module if hasattr( student_model, 'module') else student_model model_name = WEIGHTS_NAME # if not args.pred_distill: # model_name = "step_{}_{}".format(global_step, WEIGHTS_NAME) output_model_file = os.path.join( args.output_dir, model_name) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) 
torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Test mnli-mm if args.pred_distill and task_name == "mnli": task_name = "mnli-mm" if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_data, eval_labels = get_tensor_data( args, task_name, tokenizer, True, False) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) logger.info("***** Running mm evaluation *****") logger.info(" Num examples = %d", len(eval_data)) logger.info(" Batch size = %d", args.eval_batch_size) result = do_eval(student_model, task_name, eval_dataloader, device, output_mode, eval_labels, num_labels) result['global_step'] = global_step tmp_output_eval_file = os.path.join( args.output_dir + '-MM', "eval_results.txt") result_to_file(result, tmp_output_eval_file) task_name = 'mnli' student_model.train()
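The distillation loop above combines two kinds of losses: a temperature-scaled soft cross-entropy on the logits, and MSE between student and teacher attentions/representations after mapping every k-th teacher layer onto a student layer. A standalone sketch of both pieces on random tensors (layer counts, head counts, and sequence lengths are illustrative):

```python
import torch
from torch.nn import MSELoss

loss_mse = MSELoss()

def soft_cross_entropy(predicts, targets):
    student_log_probs = torch.nn.functional.log_softmax(predicts, dim=-1)
    teacher_probs = torch.nn.functional.softmax(targets, dim=-1)
    return (-teacher_probs * student_log_probs).mean()

temperature = 1.0
student_logits, teacher_logits = torch.randn(8, 3), torch.randn(8, 3)
cls_loss = soft_cross_entropy(student_logits / temperature, teacher_logits / temperature)

# intermediate-layer distillation: map every k-th teacher attention map onto a student layer
teacher_atts = [torch.randn(8, 12, 16, 16) for _ in range(12)]
student_atts = [torch.randn(8, 12, 16, 16) for _ in range(4)]
layers_per_block = len(teacher_atts) // len(student_atts)
att_loss = sum(
    loss_mse(s_att, teacher_atts[i * layers_per_block + layers_per_block - 1])
    for i, s_att in enumerate(student_atts)
)
print(cls_loss.item(), att_loss.item())
```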
import torch
from torch.nn import MSELoss

from diffeopt.group.ddmatch.representation import FunctionRepresentation, DensityRepresentation
from diffeopt.group.ddmatch.group import DiffeoGroup
from diffeopt.sum_representation import get_sum_representation
from diffeopt.distance.information import information_distance
from diffeopt.cometric.laplace import get_laplace_cometric
from diffeopt.utils import normalize
from diffeopt.optim import GroupOptimizer

mse = MSELoss()


def test_orbit_optimisation():
    shape = (16, 16)
    I0, I1 = [1 + torch.rand(*shape, dtype=torch.float64) for i in range(2)]
    group = DiffeoGroup(I0.shape)
    cometric = get_laplace_cometric(group, s=2)
    sum_rep = get_sum_representation(FunctionRepresentation(group),
                                     DensityRepresentation(group))
    oo = GroupOptimizer(sum_rep.parameters(), lr=.1, cometric=cometric)
    vol = normalize(torch.ones_like(I1))
    vol__ = vol + 1e-2 * torch.randn_like(vol)
    for i in range(2):
        oo.zero_grad()
        I_, vol_ = sum_rep(I0, vol)
        loss = mse(I_, I0) + information_distance(vol_, vol__)
        loss.backward()
        oo.step()
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, threshold=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). threshold (:obj:`float`): Threshold value (see :class:`~emmental.MaskedLinear`). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, threshold=threshold, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions)
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, input_ids_pub=None, labels_pub=None, attention_mask_pub=None, token_type_ids_pub=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0][:,0,:] # take <s> token (equiv. to [CLS]) # print(sequence_output.shape) # TextHide sequence_output_pub = None if input_ids_pub is not None: input_ids_pub = input_ids_pub[:len(input_ids)] #token_type_ids_pub = token_type_ids_pub[:len(token_type_ids)] # truncate to match size outputs_pub = self.roberta( input_ids_pub, attention_mask=attention_mask, token_type_ids=token_type_ids_pub, position_ids=position_ids, ) sequence_output_pub = outputs_pub[0][:,0,:] if labels is not None: sequence_output, mix_labels, lams = mixup( sequence_output, labels, k=self.num_k, embeds_help=sequence_output_pub) sequence_output = self.apply_mask(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() # loss = loss_fct(logits.view(-1), labels.view(-1)) # commented this since mixup_criterion() has incorporated this function else: loss_fct = CrossEntropyLoss() # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) # commented this since mixup_criterion() has incorporated this function loss = mixup_criterion( loss_fct, logits, mix_labels, lams, self.num_labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
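The forward pass above relies on external `mixup()` and `mixup_criterion()` helpers to mix pooled representations and labels. As a rough stand-in, here is a generic two-sample mixup on pooled features with the matching interpolated loss (this is plain mixup for illustration, not the repo's exact TextHide scheme, and `mixup_pair`/`mixup_loss` are hypothetical names):

```python
import torch
from torch.nn import CrossEntropyLoss

def mixup_pair(features, labels, alpha=0.4):
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(features.size(0))
    mixed = lam * features + (1.0 - lam) * features[perm]
    return mixed, labels, labels[perm], lam

def mixup_loss(loss_fct, logits, labels_a, labels_b, lam):
    return lam * loss_fct(logits, labels_a) + (1.0 - lam) * loss_fct(logits, labels_b)

features = torch.randn(8, 16)  # e.g. the <s> token representation
labels = torch.randint(0, 3, (8,))
classifier = torch.nn.Linear(16, 3)

mixed, labels_a, labels_b, lam = mixup_pair(features, labels)
loss = mixup_loss(CrossEntropyLoss(), classifier(mixed), labels_a, labels_b, lam)
print(loss.item())
```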
def forward( self, pixel_values: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, SequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: Examples: ```python >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassification >>> import torch >>> from PIL import Image >>> import requests >>> torch.manual_seed(3) # doctest: +IGNORE_RESULT >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here, >>> # so the head will be randomly initialized, hence the predictions will be random >>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224") >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> logits = outputs.logits >>> # model predicts one of the 1000 ImageNet classes >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) Predicted class: maillot ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.deit( pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.classifier(sequence_output[:, 0, :]) # we don't use the distillation token loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
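# Small illustration of the problem_type inference used above: regression when
# num_labels == 1, single-label classification for integer class indices, and
# multi-label classification otherwise (e.g. float multi-hot targets).
import torch

def infer_problem_type(num_labels, labels):
    if num_labels == 1:
        return "regression"
    if labels.dtype in (torch.long, torch.int):
        return "single_label_classification"
    return "multi_label_classification"

assert infer_problem_type(1, torch.tensor([0.3])) == "regression"
assert infer_problem_type(5, torch.tensor([2, 4])) == "single_label_classification"
assert infer_problem_type(5, torch.tensor([[0.0, 1.0, 0.0, 1.0, 0.0]])) == "multi_label_classification"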
def scalar_loss(self, prediction, target): return MSELoss(reduction='none')(prediction, target)
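# reduction='none' keeps one squared error per element instead of averaging, so the
# caller can weight or mask the losses before reducing. Quick self-contained check:
import torch
from torch.nn import MSELoss

pred = torch.tensor([1.0, 2.0, 4.0])
target = torch.tensor([1.0, 0.0, 1.0])
per_element = MSELoss(reduction='none')(pred, target)           # tensor([0., 4., 9.])
weighted = (per_element * torch.tensor([1.0, 0.5, 0.0])).mean()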
from torch.utils.data import DataLoader from Model import time_dataset, collate train_loader = DataLoader(dataset=time_dataset(X_train, y_train), batch_size=128, shuffle=False, collate_fn=collate) val_loader = DataLoader(dataset=time_dataset(X_val, y_val), batch_size=64, shuffle=False, collate_fn=collate) from Model import GGNN, Mol2NumNet_regressor from time import time from torch import optim from torch.nn import MSELoss from pytorch_tools import EarlyStopping ode_gnn = GGNN(in_feats=4, n_hidden=50, out_feats=4, n_iter_readout=1).cuda() #recurrent_layer=3, output_dim=5).cuda() ode_gnn = Mol2NumNet_regressor(node_input_dim=4, edge_input_dim=6, node_hidden_dim=12, edge_hidden_dim=20, n_classes=1).cuda() optimizer = optim.Adam(ode_gnn.parameters(), lr=0.01) criterion = MSELoss().cuda() early_stopping = EarlyStopping(patience=20) def train(model, epochs, loss_func, data_loader, val_data_loader): train_epochs_losses = [] val_epoch_losses = [] dur = [] for epoch in range(epochs): model.train() train_epochs_loss = 0 if epoch >= 1: t0 = time() for graph, state in data_loader: graph = graph.to(device) state = torch.stack(state).to(device, dtype=torch.float)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, head2labels=None, return_pooler_output=False, head2mask=None, nsp_loss_weights=None): device = "cuda" if torch.cuda.is_available() else "cpu" # Get logits output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, output_attentions=False, output_hidden_states=False, return_dict=True) pooled_output = self.dropout(output["pooler_output"]).to(device) head2logits = {} return_dict = {} for head_name, head in self.heads.items(): head2logits[head_name] = self.heads[head_name](pooled_output) head2logits[head_name] = head2logits[head_name].float() return_dict[head_name + "_logits"] = head2logits[head_name] if head2labels is not None: for head_name, labels in head2labels.items(): num_classes = head2logits[head_name].shape[1] # Regression (e.g. for politeness) if num_classes == 1: # Only consider positive examples if head2mask is not None and head_name in head2mask: num_positives = head2labels[head2mask[head_name]].sum( ) # use certain labels as mask if num_positives == 0: return_dict[head_name + "_loss"] = torch.tensor( [0]).to(device) else: loss_fct = MSELoss(reduction='none') loss = loss_fct(head2logits[head_name].view(-1), labels.float().view(-1)) return_dict[head_name + "_loss"] = loss.dot( head2labels[head2mask[head_name]].float().view( -1)) / num_positives else: loss_fct = MSELoss() return_dict[head_name + "_loss"] = loss_fct( head2logits[head_name].view(-1), labels.float().view(-1)) else: loss_fct = CrossEntropyLoss( weight=nsp_loss_weights.float()) return_dict[head_name + "_loss"] = loss_fct( head2logits[head_name], labels.view(-1)) if return_pooler_output: return_dict["pooler_output"] = output["pooler_output"] return return_dict
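# Sketch of the masked-regression pattern above: compute per-example MSE with
# reduction='none', then average only over the examples selected by a 0/1 mask
# (loss.dot(mask) / mask.sum()), falling back to 0 when the mask is empty.
# The values below are illustrative.
import torch
from torch.nn import MSELoss

preds = torch.tensor([0.2, 0.9, 0.4, 0.7])
targets = torch.tensor([0.0, 1.0, 0.0, 1.0])
mask = torch.tensor([1.0, 1.0, 0.0, 0.0])  # e.g. labels from another head used as a mask

per_example = MSELoss(reduction='none')(preds, targets)
num_positives = mask.sum()
loss = per_example.dot(mask) / num_positives if num_positives > 0 else torch.tensor(0.0)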
def forward( self, input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] logits = self.score(hidden_states) if input_ids is not None: batch_size, sequence_length = input_ids.shape[:2] else: batch_size, sequence_length = inputs_embeds.shape[:2] assert ( self.config.pad_token_id is not None or batch_size == 1 ), "Cannot handle batch sizes > 1 if no padding token is defined." if self.config.pad_token_id is None: sequence_lengths = -1 else: if input_ids is not None: sequence_lengths = torch.ne( input_ids, self.config.pad_token_id).sum(-1) - 1 else: sequence_lengths = -1 logger.warning( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " f"unexpected if using padding tokens in conjuction with `inputs_embeds.`" ) pooled_logits = logits[range(batch_size), sequence_lengths] loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(pooled_logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (pooled_logits, ) + transformer_outputs[1:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, logits=pooled_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
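# Quick check of the pooling rule above: with a pad token defined, the logits used for
# classification are those at the last non-padding position of each sequence.
import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],
                          [8, 9, 0, 0, 0]])
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1    # tensor([2, 1])
logits = torch.randn(2, 5, 3)                                       # (batch, seq_len, num_labels)
pooled_logits = logits[range(input_ids.size(0)), sequence_lengths]  # (batch, num_labels)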
class YOLOLayer(Module): """Detection layer""" def __init__(self, anchors, num_classes, img_dim): super(YOLOLayer, self).__init__() self.anchors = anchors self.num_anchors = len(anchors) self.num_classes = num_classes self.bbox_attrs = 5 + num_classes self.img_dim = img_dim self.ignore_thres = 0.5 self.lambda_coord = 1 self.mse_loss = MSELoss() self.bce_loss = BCELoss() def forward(self, x, targets=None): bs = x.size(0) g_dim = x.size(2) stride = self.img_dim / g_dim # Tensors for cuda support FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor prediction = x.view(bs, self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous() # Get outputs x = torch.sigmoid(prediction[..., 0]) # Center x y = torch.sigmoid(prediction[..., 1]) # Center y w = prediction[..., 2] # Width h = prediction[..., 3] # Height conf = torch.sigmoid(prediction[..., 4]) # Conf pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. # Calculate offsets for each grid grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor) grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor) scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors] anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape) anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape) # Add offset and scale with anchors pred_boxes = FloatTensor(prediction[..., :4].shape) pred_boxes[..., 0] = x.data + grid_x pred_boxes[..., 1] = y.data + grid_y pred_boxes[..., 2] = torch.exp(w.data) * anchor_w pred_boxes[..., 3] = torch.exp(h.data) * anchor_h # Training if targets is not None: if x.is_cuda: self.mse_loss = self.mse_loss.cuda() self.bce_loss = self.bce_loss.cuda() nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data, targets.cpu().data, scaled_anchors, self.num_anchors, self.num_classes, g_dim, self.ignore_thres, self.img_dim) nProposals = int((conf > 0.25).sum().item()) recall = float(nCorrect / nGT) if nGT else 1 # Handle masks mask = Variable(mask.type(FloatTensor)) cls_mask = Variable(mask.unsqueeze(-1).repeat(1, 1, 1, 1, self.num_classes).type(FloatTensor)) conf_mask = Variable(conf_mask.type(FloatTensor)) # Handle target variables tx = Variable(tx.type(FloatTensor), requires_grad=False) ty = Variable(ty.type(FloatTensor), requires_grad=False) tw = Variable(tw.type(FloatTensor), requires_grad=False) th = Variable(th.type(FloatTensor), requires_grad=False) tconf = Variable(tconf.type(FloatTensor), requires_grad=False) tcls = Variable(tcls.type(FloatTensor), requires_grad=False) # Mask outputs to ignore non-existing objects loss_x = self.lambda_coord * self.bce_loss(x * mask, tx * mask) loss_y = self.lambda_coord * self.bce_loss(y * mask, ty * mask) loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2 loss_h = self.lambda_coord * self.mse_loss(h * mask, th * mask) / 2 loss_conf = self.bce_loss(conf * conf_mask, tconf * conf_mask) loss_cls = self.bce_loss(pred_cls * cls_mask, tcls * cls_mask) loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls return loss, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(), loss_conf.item(), 
loss_cls.item(), recall else: # If not in training phase return predictions output = torch.cat((pred_boxes.view(bs, -1, 4) * stride, conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1) return output.data
def main(): parser = build_argparse() parser.add_argument('--kd_decay', type=float, default=0.999) parser.add_argument('--kd_coeff', type=float, default=1.0) args = parser.parse_args() # output dir if args.model_name is None: args.model_name = args.model_path.split("/")[-1] args.output_dir = args.output_dir + '{}'.format(args.model_name) os.makedirs(args.output_dir, exist_ok=True) prefix = "_".join([args.model_name, args.task_name]) logger = TrainLogger(log_dir=args.output_dir, prefix=prefix) # device logger.info("initializing device") args.device, args.n_gpu = prepare_device(args.gpu, args.local_rank) seed_everything(args.seed) args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] # data processor logger.info("initializing data processor") tokenizer = tokenizer_class.from_pretrained( args.model_path, do_lower_case=args.do_lower_case) processor = ColaProcessor(data_dir=args.data_dir, tokenizer=tokenizer, prefix=prefix) label_list = processor.get_labels() num_labels = len(label_list) args.num_labels = num_labels # model logger.info("initializing model and config") config = config_class.from_pretrained( args.model_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None) model = model_class.from_pretrained(args.model_path, config=config) model.to(args.device) # trainer logger.info("initializing traniner") trainer = SDATrainer(logger=logger, args=args, collate_fn=processor.collate_fn, input_keys=processor.get_input_keys(), kd_model=copy.deepcopy(model), kd_loss_fct=MSELoss(), metrics=[MattewsCorrcoef()]) # do train if args.do_train: train_dataset = processor.create_dataset(args.train_max_seq_length, 'train.tsv', 'train') eval_dataset = processor.create_dataset(args.eval_max_seq_length, 'dev.tsv', 'dev') trainer.train(model, train_dataset=train_dataset, eval_dataset=eval_dataset) if args.do_eval and args.local_rank in [-1, 0]: results = {} eval_dataset = processor.create_dataset(args.eval_max_seq_length, 'dev.tsv', 'dev') checkpoints = [args.output_dir] if args.eval_all_checkpoints or args.checkpoint_number > 0: checkpoints = get_checkpoints(args.output_dir, args.checkpoint_number, WEIGHTS_NAME) logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split("/")[-1].split("-")[-1] model = model_class.from_pretrained(checkpoint, config=config) model.to(args.device) trainer.evaluate(model, eval_dataset, save_preds=True, prefix=str(global_step)) if global_step: result = { "{}_{}".format(global_step, k): v for k, v in trainer.records['result'].items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") dict_to_text(output_eval_file, results) if args.do_predict: test_dataset = processor.create_dataset(args.eval_max_seq_length, 'test.tsv', 'test') if args.checkpoint_number == 0: raise ValueError("checkpoint number should > 0,but get %d", args.checkpoint_number) checkpoints = get_checkpoints(args.output_dir, args.checkpoint_number, WEIGHTS_NAME) for checkpoint in checkpoints: global_step = checkpoint.split("/")[-1].split("-")[-1] model = model_class.from_pretrained(checkpoint) model.to(args.device) trainer.predict(model, test_dataset=test_dataset, prefix=str(global_step))
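# SDATrainer is constructed above with kd_loss_fct=MSELoss(); its internals are not
# shown here. A generic sketch of what an MSE-based logit-distillation term looks like
# (illustrative only; kd_coeff mirrors the --kd_coeff argument):
import torch
from torch.nn import CrossEntropyLoss, MSELoss

def kd_step(student_logits, teacher_logits, labels, kd_coeff=1.0):
    ce_loss = CrossEntropyLoss()(student_logits, labels)
    kd_loss = MSELoss()(student_logits, teacher_logits.detach())
    return ce_loss + kd_coeff * kd_loss

student_logits = torch.randn(4, 2, requires_grad=True)
teacher_logits = torch.randn(4, 2)
labels = torch.randint(0, 2, (4,))
kd_step(student_logits, teacher_logits, labels).backward()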
def main(): parser = ArgumentParser() parser.add_argument( '--pregenerated_data', type=str, required=True, default='/nas/hebin/data/english-exp/books_wiki_tokens_ngrams') parser.add_argument('--s3_output_dir', type=str, default='huawei_yun') parser.add_argument('--student_model', type=str, default='8layer_bert', required=True) parser.add_argument('--teacher_model', type=str, default='electra_base') parser.add_argument('--cache_dir', type=str, default='/cache', help='') parser.add_argument("--epochs", type=int, default=2, help="Number of epochs to train for") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument("--max_seq_length", type=int, default=512) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--scratch', action='store_true', help="Whether to train from scratch") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument('--debug', action='store_true', help="Whether to debug") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument("--already_trained_epoch", default=0, type=int) parser.add_argument( "--masked_lm_prob", type=float, default=0.0, help="Probability of masking each token for the LM task") parser.add_argument( "--max_predictions_per_seq", type=int, default=77, help="Maximum number of tokens to mask in each sequence") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--warmup_steps", default=10000, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_workers", type=int, default=4, help="num_workers.") parser.add_argument("--continue_index", type=int, default=0, help="") parser.add_argument("--threads", type=int, default=27, help="Number of threads to preprocess input data") # Search space for sub_bart architecture parser.add_argument('--layer_num_space', nargs='+', type=int, default=[1, 8]) parser.add_argument('--hidden_size_space', nargs='+', type=int, default=[128, 768]) parser.add_argument('--qkv_size_space', nargs='+', type=int, default=[180, 768]) parser.add_argument('--intermediate_size_space', nargs='+', type=int, default=[128, 3072]) parser.add_argument('--head_num_space', nargs='+', type=int, default=[1, 12]) parser.add_argument('--sample_times_per_batch', type=int, default=1) parser.add_argument('--further_train', action='store_true') parser.add_argument('--mlm_loss', action='store_true') # Argument for Huawei yun parser.add_argument('--data_url', type=str, default='', help='s3 url') parser.add_argument("--train_url", type=str, default="", help="s3 url") args = parser.parse_args() assert (torch.cuda.is_available()) device_count = torch.cuda.device_count() args.rank = int(os.getenv('RANK', '0')) args.world_size = int(os.getenv("WORLD_SIZE", '1')) # Call the init process # init_method = 'tcp://' init_method = '' master_ip = os.getenv('MASTER_ADDR', 'localhost') master_port = os.getenv('MASTER_PORT', '6000') init_method += master_ip + ':' + master_port # Manually set the device ids. 
# if device_count > 0: # args.local_rank = args.rank % device_count torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) print('device_id: %s' % args.local_rank) print('device_count: %s, rank: %s, world_size: %s' % (device_count, args.rank, args.world_size)) print(init_method) torch.distributed.init_process_group(backend='nccl', world_size=args.world_size, rank=args.rank, init_method=init_method) LOCAL_DIR = args.cache_dir if oncloud: assert mox.file.exists(LOCAL_DIR) if args.local_rank == 0 and oncloud: logging.info( mox.file.list_directory(args.pregenerated_data, recursive=True)) logging.info( mox.file.list_directory(args.student_model, recursive=True)) local_save_dir = os.path.join(LOCAL_DIR, 'output', 'superbert', 'checkpoints') local_tsbd_dir = os.path.join(LOCAL_DIR, 'output', 'superbert', 'tensorboard') save_name = '_'.join([ 'superbert', 'epoch', str(args.epochs), 'lr', str(args.learning_rate), 'bsz', str(args.train_batch_size), 'grad_accu', str(args.gradient_accumulation_steps), str(args.max_seq_length), 'gpu', str(args.world_size), ]) bash_save_dir = os.path.join(local_save_dir, save_name) bash_tsbd_dir = os.path.join(local_tsbd_dir, save_name) if args.local_rank == 0: if not os.path.exists(bash_save_dir): os.makedirs(bash_save_dir) logger.info(bash_save_dir + ' created!') if not os.path.exists(bash_tsbd_dir): os.makedirs(bash_tsbd_dir) logger.info(bash_tsbd_dir + ' created!') local_data_dir_tmp = '/cache/data/tmp/' local_data_dir = local_data_dir_tmp + save_name if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) args.tokenizer = BertTokenizer.from_pretrained( args.student_model, do_lower_case=args.do_lower_case) args.vocab_list = list(args.tokenizer.vocab.keys()) config = BertConfig.from_pretrained( os.path.join(args.student_model, CONFIG_NAME)) logger.info("Model config {}".format(config)) if args.further_train: if args.mlm_loss: student_model = SuperBertForPreTraining.from_pretrained( args.student_model, config) else: student_model = SuperTinyBertForPreTraining.from_pretrained( args.student_model, config) else: if args.mlm_loss: student_model = SuperBertForPreTraining.from_scratch( args.student_model, config) else: student_model = SuperTinyBertForPreTraining.from_scratch( args.student_model, config) student_model.to(device) if not args.mlm_loss: teacher_model = BertModel.from_pretrained(args.teacher_model) teacher_model.to(device) # build arch space min_hidden_size, max_hidden_size = args.hidden_size_space min_ffn_size, max_ffn_size = args.intermediate_size_space min_qkv_size, max_qkv_size = args.qkv_size_space min_head_num, max_head_num = args.head_num_space hidden_step = 4 ffn_step = 4 qkv_step = 12 head_step = 1 number_hidden_step = int((max_hidden_size - min_hidden_size) / hidden_step) number_ffn_step = int((max_ffn_size - min_ffn_size) / ffn_step) number_qkv_step = int((max_qkv_size - min_qkv_size) / qkv_step) number_head_step = int((max_head_num - min_head_num) / head_step) layer_numbers = list( range(args.layer_num_space[0], args.layer_num_space[1] + 1)) hidden_sizes = [ i * hidden_step + min_hidden_size for i in range(number_hidden_step + 1) ] ffn_sizes = [ i * ffn_step + min_ffn_size for i in range(number_ffn_step + 1) ] 
qkv_sizes = [ i * qkv_step + min_qkv_size for i in range(number_qkv_step + 1) ] head_numbers = [ i * head_step + min_head_num for i in range(number_head_step + 1) ] ###### if args.local_rank == 0: tb_writer = SummaryWriter(bash_tsbd_dir) global_step = 0 step = 0 tr_loss, tr_rep_loss, tr_att_loss = 0.0, 0.0, 0.0 logging_loss, rep_logging_loss, att_logging_loss = 0.0, 0.0, 0.0 end_time, start_time = 0, 0 submodel_config = dict() if args.further_train: submodel_config['sample_layer_num'] = config.num_hidden_layers submodel_config['sample_hidden_size'] = config.hidden_size submodel_config[ 'sample_intermediate_sizes'] = config.num_hidden_layers * [ config.intermediate_size ] submodel_config[ 'sample_num_attention_heads'] = config.num_hidden_layers * [ config.num_attention_heads ] submodel_config['sample_qkv_sizes'] = config.num_hidden_layers * [ config.qkv_size ] for epoch in range(args.epochs): if epoch < args.continue_index: args.warmup_steps = 0 continue args.local_data_dir = os.path.join(local_data_dir, str(epoch)) if args.local_rank == 0: os.makedirs(args.local_data_dir) while 1: if os.path.exists(args.local_data_dir): epoch_dataset = load_doc_tokens_ngrams(args) break if args.local_rank == 0 and oncloud: logging.info('Dataset in epoch %s', epoch) logging.info( mox.file.list_directory(args.local_data_dir, recursive=True)) train_sampler = DistributedSampler(epoch_dataset, num_replicas=1, rank=0) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) step_in_each_epoch = len( train_dataloader) // args.gradient_accumulation_steps num_train_optimization_steps = step_in_each_epoch * args.epochs logging.info("***** Running training *****") logging.info(" Num examples = %d", len(epoch_dataset) * args.world_size) logger.info(" Num Epochs = %d", args.epochs) logging.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * args.world_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logging.info(" Num steps = %d", num_train_optimization_steps) if epoch == args.continue_index: # Prepare optimizer param_optimizer = list(student_model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] warm_up_ratio = args.warmup_steps / num_train_optimization_steps print('warm_up_ratio: {}'.format(warm_up_ratio)) optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, e=args.adam_epsilon, schedule='warmup_linear', t_total=num_train_optimization_steps, warmup=warm_up_ratio) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex" " to use fp16 training.") student_model, optimizer = amp.initialize( student_model, optimizer, opt_level=args.fp16_opt_level, min_loss_scale=1) # # apex student_model = DDP( student_model, message_size=10000000, gradient_predivide_factor=torch.distributed.get_world_size(), delay_allreduce=True) if not args.mlm_loss: teacher_model = DDP(teacher_model, message_size=10000000, gradient_predivide_factor=torch. 
distributed.get_world_size(), delay_allreduce=True) teacher_model.eval() logger.info('apex data paralleled!') from torch.nn import MSELoss loss_mse = MSELoss() student_model.train() for step_, batch in enumerate(train_dataloader): step += 1 batch = tuple(t.to(device) for t in batch) input_ids, input_masks, lm_label_ids = batch if not args.mlm_loss: teacher_last_rep, teacher_last_att = teacher_model( input_ids, input_masks) teacher_last_att = torch.where( teacher_last_att <= -1e2, torch.zeros_like(teacher_last_att).to(device), teacher_last_att) teacher_last_rep.detach() teacher_last_att.detach() for sample_idx in range(args.sample_times_per_batch): att_loss = 0. rep_loss = 0. rand_seed = int(global_step * args.world_size + sample_idx) # + args.rank % args.world_size) if not args.mlm_loss: if not args.further_train: submodel_config = sample_arch_4_kd( layer_numbers, hidden_sizes, ffn_sizes, qkv_sizes, reset_rand_seed=True, rand_seed=rand_seed) # knowledge distillation student_last_rep, student_last_att = student_model( input_ids, submodel_config, attention_mask=input_masks) student_last_att = torch.where( student_last_att <= -1e2, torch.zeros_like(student_last_att).to(device), student_last_att) att_loss += loss_mse(student_last_att, teacher_last_att) rep_loss += loss_mse(student_last_rep, teacher_last_rep) loss = att_loss + rep_loss if args.gradient_accumulation_steps > 1: rep_loss = rep_loss / args.gradient_accumulation_steps att_loss = att_loss / args.gradient_accumulation_steps loss = loss / args.gradient_accumulation_steps tr_rep_loss += rep_loss.item() tr_att_loss += att_loss.item() else: if not args.further_train: submodel_config = sample_arch_4_mlm( layer_numbers, hidden_sizes, ffn_sizes, head_numbers, reset_rand_seed=True, rand_seed=rand_seed) loss = student_model(input_ids, submodel_config, attention_mask=input_masks, masked_lm_labels=lm_label_ids) tr_loss += loss.item() if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward(retain_graph=True) else: loss.backward(retain_graph=True) if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(student_model.parameters(), args.max_grad_norm) optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.gradient_accumulation_steps * args.logging_steps) == 0 \ and args.local_rank < 2 or global_step < 100: end_time = time.time() if not args.mlm_loss: logger.info( 'Epoch: %s, global_step: %s/%s, lr: %s, loss is %s; ' 'rep_loss is %s; att_loss is %s; (%.2f sec)' % (epoch, global_step + 1, step_in_each_epoch, optimizer.get_lr()[0], loss.item() * args.gradient_accumulation_steps, rep_loss.item() * args.gradient_accumulation_steps, att_loss.item() * args.gradient_accumulation_steps, end_time - start_time)) else: logger.info( 'Epoch: %s, global_step: %s/%s, lr: %s, loss is %s; ' ' (%.2f sec)' % (epoch, global_step + 1, step_in_each_epoch, optimizer.get_lr()[0], loss.item() * args.gradient_accumulation_steps, end_time - start_time)) start_time = time.time() if args.logging_steps > 0 and global_step % args.logging_steps == 0 and args.local_rank == 0: tb_writer.add_scalar("lr", optimizer.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) if not args.mlm_loss: tb_writer.add_scalar("rep_loss", (tr_rep_loss - rep_logging_loss) / args.logging_steps, global_step) tb_writer.add_scalar("att_loss", (tr_att_loss - 
att_logging_loss) / args.logging_steps, global_step) rep_logging_loss = tr_rep_loss att_logging_loss = tr_att_loss logging_loss = tr_loss # Save a trained model if args.rank == 0: saving_path = bash_save_dir saving_path = Path(os.path.join(saving_path, "epoch_" + str(epoch))) if saving_path.is_dir() and list(saving_path.iterdir()): logging.warning( f"Output directory ({ saving_path }) already exists and is not empty!" ) saving_path.mkdir(parents=True, exist_ok=True) logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = student_model.module if hasattr(student_model, 'module')\ else student_model # Only save the model it-self output_model_file = os.path.join(saving_path, WEIGHTS_NAME) output_config_file = os.path.join(saving_path, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) args.tokenizer.save_vocabulary(saving_path) torch.save(optimizer.state_dict(), os.path.join(saving_path, "optimizer.pt")) logger.info("Saving optimizer and scheduler states to %s", saving_path) # debug if oncloud: local_output_dir = os.path.join(LOCAL_DIR, 'output') logger.info( mox.file.list_directory(local_output_dir, recursive=True)) logger.info('s3_output_dir: ' + args.s3_output_dir) mox.file.copy_parallel(local_output_dir, args.s3_output_dir) if args.local_rank == 0: tb_writer.close()
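# The distillation loop above zeroes attention entries at or below -1e2 (the large
# negative values injected by the additive padding mask) before taking the MSE, so
# masked positions do not dominate the loss. Minimal illustration:
import torch
from torch.nn import MSELoss

student_att = torch.tensor([[0.3, -1e4], [0.1, 0.2]])
teacher_att = torch.tensor([[0.4, -1e4], [0.0, 0.2]])

student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att), student_att)
teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att), teacher_att)
att_loss = MSELoss()(student_att, teacher_att)   # only the unmasked entries contribute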
def forward( self, pixel_values=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: Examples:: >>> from transformers import SegformerFeatureExtractor, SegformerForImageClassification >>> from PIL import Image >>> import requests >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) >>> feature_extractor = SegformerFeatureExtractor.from_pretrained('nvidia/mit-b0') >>> model = SegformerForImageClassification.from_pretrained('nvidia/mit-b0') >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> logits = outputs.logits >>> # model predicts one of the 1000 ImageNet classes >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.segformer( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] # reshape last hidden states to (batch_size, height*width, hidden_size) batch_size = sequence_output.shape[0] sequence_output = sequence_output.reshape(batch_size, -1, self.config.hidden_sizes[-1]) # global average pooling sequence_output = sequence_output.mean(dim=1) logits = self.classifier(sequence_output) loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits, ) + outputs[1:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
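# The classification head above flattens the spatial map to (batch, height*width,
# hidden_size) and averages over all positions (global average pooling) before the
# linear classifier. On an already-flattened tensor that is simply:
import torch

batch_size, seq_len, hidden_size = 2, 49, 256     # illustrative sizes
sequence_output = torch.randn(batch_size, seq_len, hidden_size)
pooled = sequence_output.mean(dim=1)              # (batch_size, hidden_size), one vector per image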
def train(self): step = 0 validation_step = 0 writer = SummaryWriter(properties.prep_tensor_board) temp_optimizer = optim.Adam( self.prep_model.parameters(), lr=0.01, weight_decay=0) temp_loss_fn = MSELoss().to(self.device) self.prep_model.train() for image, labels, names in self.loader_train: self.prep_model.zero_grad() X_var = image.to(self.device) preds = self.prep_model(X_var) loss = temp_loss_fn(preds, X_var) loss.backward() temp_optimizer.step() for epoch in range(self.epochs): training_loss = 0 self.prep_model.train() for image, label_dict, names in self.loader_train: self.prep_model.zero_grad() X_var = image.to(self.device) pred = self.prep_model(X_var) text_crops, labels = get_text_stack( pred[0], label_dict[0], self.input_size) batch, c, h, w = text_crops.shape grads = torch.zeros_like(text_crops).to(self.device) batch_loss = 0 for i in range(batch): noise = torch.randn( size=(self.p_samples, c, h, w)).to(self.device) noise = torch.cat((noise, -noise), dim=0) noisy_imgs = text_crops[i] + (noise*self.std) noisy_imgs = noisy_imgs.view(2*self.p_samples, c, -1) noisy_imgs -= noisy_imgs.min(2, keepdim=True)[0] noisy_imgs /= noisy_imgs.max(2, keepdim=True)[0] noisy_imgs = noisy_imgs.view(2*self.p_samples, c, h, w) noisy_labels = self.ocr.get_labels( noisy_imgs.detach().cpu()) loss = self._get_cer( noisy_labels, [labels[i]]*2*self.p_samples) mean_loss = loss.mean(dim=0) batch_loss += mean_loss.item() loss = loss.unsqueeze(1).unsqueeze(1).unsqueeze(1) loss = noise*loss.to(self.device) loss = torch.div(loss.mean(dim=0), self.std) grads[i] += loss training_loss += (batch_loss/batch) sec_loss = self.secondary_loss_fn(pred, torch.ones( pred.shape).to(self.device))*self.sec_loss_scalar sec_loss.backward(retain_graph=True) text_crops.backward(grads) self.optimizer.step() if step % 500 == 0: print("Iteration: %d => %f" % (step, batch_loss/batch)) step += 1 writer.add_scalar('Training Loss', training_loss / self.train_set_size, epoch + 1) self.prep_model.eval() validation_loss = 0 tess_crt_count = 0 tess_CER = 0 label_count = 0 with torch.no_grad(): for image, label_dict in self.loader_validation: X_var = image.to(self.device) pred = self.prep_model(X_var) text_crops, labels = get_text_stack( pred[0], label_dict[0], self.input_size) ocr_labels = self.ocr.get_labels(text_crops.detach().cpu()) loss = self._get_cer(ocr_labels, labels) mean_loss = loss.mean(dim=0) validation_loss += mean_loss.item() tess_crt, tess_cer = compare_labels(ocr_labels, labels) tess_crt_count += tess_crt tess_CER += tess_cer validation_step += 1 label_count += len(labels) writer.add_scalar('Accuracy/'+self.ocr_name+'_output', tess_crt_count/label_count, epoch + 1) writer.add_scalar('Validation Loss', validation_loss/self.val_set_size, epoch + 1) save_img(pred.cpu(), 'out_' + str(epoch), properties.img_out_path) if epoch == 0: save_img(image.cpu(), 'out_original', properties.img_out_path) print("%s correct count: %d; (validation set size:%d)" % ( self.ocr_name, tess_crt_count, label_count)) print("%s CER: %d;" % (self.ocr_name, tess_CER)) print("Epoch: %d/%d => Training loss: %f | Validation loss: %f" % ((epoch + 1), self.epochs, training_loss / (self.train_set_size // self.batch_size), validation_loss/(self.val_set_size//self.batch_size))) torch.save(self.prep_model, properties.prep_model_path + "Prep_model_"+str(epoch)) writer.flush() writer.close()
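# The training loop above cannot backpropagate through the OCR engine, so it estimates
# gradients with antithetic Gaussian perturbations: score the +noise/-noise copies with
# the (non-differentiable) CER, weight the noise by those scores, and divide by the
# noise scale. Toy sketch with a stand-in scorer:
import torch

def black_box_score(x):
    # Stand-in for the non-differentiable OCR CER.
    return ((x - 0.5) ** 2).mean(dim=(1, 2, 3))

x = torch.zeros(1, 1, 4, 4)                  # one "image"
std, n_samples = 0.1, 8
noise = torch.randn(n_samples, 1, 4, 4)
noise = torch.cat((noise, -noise), dim=0)    # antithetic pairs
scores = black_box_score(x + noise * std)    # shape (2 * n_samples,)
grad_estimate = (noise * scores.view(-1, 1, 1, 1)).mean(dim=0) / std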
def evaluate(self, model, examples): """ Evaluate the model. Parameters ---------- model: BertModel The model to be evaluated. examples: list Evaluation data as a list of InputExample's/ Returns ------- evaluation_df: pd.DataFrame A dataframe that includes for each example predicted probability and labels. """ eval_loader = self.get_loader(examples, phase='eval') logger.info("***** Running evaluation ***** ") logger.info(" Num examples = %d", len(examples)) logger.info(" Batch size = %d", self.config.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 predictions = [] labels = [] agree_levels = [] text_ids = [] for input_ids, attention_mask, token_type_ids, label_ids, agree_ids in tqdm( eval_loader, desc="Testing"): input_ids = input_ids.to(self.device) attention_mask = attention_mask.to(self.device) token_type_ids = token_type_ids.to(self.device) label_ids = label_ids.to(self.device) agree_ids = agree_ids.to(self.device) with torch.no_grad(): logits = model(input_ids, attention_mask, token_type_ids)[0] if self.config.output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, self.num_labels), label_ids.view(-1)) elif self.config.output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) np_logits = logits.cpu().numpy() if self.config.output_mode == 'classification': prediction = np.array(np_logits) elif self.config.output_mode == "regression": prediction = np.array(np_logits) for agree_id in agree_ids: agree_levels.append(agree_id.item()) for label_id in label_ids: labels.append(label_id.item()) for pred in prediction: predictions.append(pred) text_ids.append(input_ids) # tmp_eval_loss = loss_fct(logits.view(-1, self.num_labels), label_ids.view(-1)) # tmp_eval_loss = model(input_ids, token_type_ids, attention_mask, label_ids) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 # logits = logits.detach().cpu().numpy() # label_ids = label_ids.to('cpu').numpy() # tmp_eval_accuracy = accuracy(logits, label_ids) # eval_loss += tmp_eval_loss.mean().item() # eval_accuracy += tmp_eval_accuracy evaluation_df = pd.DataFrame({ 'predictions': predictions, 'labels': labels, "agree_levels": agree_levels }) return evaluation_df
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.deberta(
        input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    encoder_layer = outputs[0]
    pooled_output = self.pooler(encoder_layer)
    pooled_output = self.drop(pooled_output)
    logits = self.classifier(pooled_output)

    loss = None
    if labels is not None:
        if self.config.problem_type is None:
            if self.n_labels == 1:
                # regression task
                loss_fn = nn.MSELoss()
                logits = logits.view(-1).to(labels.dtype)
                loss = loss_fn(logits, labels.view(-1))
            elif labels.dim() == 1 or labels.size(-1) == 1:
                label_index = (labels >= 0).nonzero()
                labels = labels.long()
                if label_index.size(0) > 0:
                    labeled_logits = torch.gather(
                        logits, 0, label_index.expand(label_index.size(0), logits.size(1))
                    )
                    labels = torch.gather(labels, 0, label_index.view(-1))
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(
                        labeled_logits.view(-1, self.n_labels).float(), labels.view(-1)
                    )
                else:
                    loss = torch.tensor(0).to(logits)
            else:
                log_softmax = nn.LogSoftmax(-1)
                loss = -((log_softmax(logits) * labels).sum(-1)).mean()
        elif self.config.problem_type == "regression":
            loss_fct = MSELoss()
            if self.n_labels == 1:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.n_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)

    if not return_dict:
        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
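# The fallback branch above only scores rows whose label is >= 0 (negative labels mark
# "no gold label"). Compact illustration of the gather-based filtering:
import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(4, 3)
labels = torch.tensor([2, -1, 0, -1])   # rows labelled -1 are ignored

label_index = (labels >= 0).nonzero()
labeled_logits = torch.gather(
    logits, 0, label_index.expand(label_index.size(0), logits.size(1))
)
labeled_labels = torch.gather(labels, 0, label_index.view(-1))
loss = CrossEntropyLoss()(labeled_logits, labeled_labels)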
def main(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", default='data/', type=str, help="The data directory.") parser.add_argument("--model_dir", default='models/', type=str, help="The models directory.") parser.add_argument("--teacher_model", default=None, type=str, help="The models directory.") parser.add_argument("--student_model", default=None, type=str, help="The models directory.") parser.add_argument( "--output_dir", default='output', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--version_2_with_negative', action='store_true', help="Squadv2.0 if true else Squadv1.1 ") # default parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", default=0, type=int) parser.add_argument( '--null_score_diff_threshold', type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null." 
) parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--do_lower_case', #action='store_true', default=True, help="do lower case") parser.add_argument("--per_gpu_batch_size", default=16, type=int, help="Per GPU batch size for training.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument('--eval_step', type=int, default=200, help="Evaluate every X training steps") parser.add_argument('--pred_distill', action='store_true', help="Whether to distil with task layer") parser.add_argument('--intermediate_distill', action='store_true', help="Whether to distil with intermediate layers") parser.add_argument('--save_fp_model', action='store_true', help="Whether to save fp32 model") parser.add_argument('--save_quantized_model', action='store_true', help="Whether to save quantized model") parser.add_argument("--weight_bits", default=2, type=int, choices=[2, 8], help="Quantization bits for weight.") parser.add_argument("--input_bits", default=8, type=int, help="Quantization bits for activation.") parser.add_argument("--clip_val", default=2.5, type=float, help="Initial clip value.") args = parser.parse_args() summaryWriter = SummaryWriter(args.output_dir) if args.teacher_model is None: args.teacher_model = args.model_dir if args.student_model is None: args.student_model = args.model_dir args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) args.batch_size = args.n_gpu * args.per_gpu_batch_size logger.info(f'The args: {args}') if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.teacher_model, do_lower_case=True) # preparing training data input_file = 'train-v2.0' if args.version_2_with_negative else 'train-v1.1' input_file = os.path.join(args.data_dir, input_file) if os.path.exists(input_file): train_features = pickle.load(open(input_file, 'rb')) else: input_file = 'train-v2.0.json' if args.version_2_with_negative else 'train-v1.1.json' input_file = os.path.join(args.data_dir, input_file) _, train_examples = read_squad_examples( input_file=input_file, is_training=True, version_2_with_negative=args.version_2_with_negative) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) num_train_optimization_steps = int( len(train_features) / args.batch_size) * args.num_train_epochs logger.info("***** Running training *****") logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data 
= TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size) input_file = 'dev-v2.0.json' if args.version_2_with_negative else 'dev-v1.1.json' args.dev_file = os.path.join(args.data_dir, input_file) dev_dataset, eval_examples = read_squad_examples( input_file=args.dev_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) teacher_model = BertForQuestionAnswering.from_pretrained( args.teacher_model) teacher_model.to(args.device) teacher_model.eval() if args.n_gpu > 1: teacher_model = torch.nn.DataParallel(teacher_model) result = do_eval(args, teacher_model, eval_dataloader, eval_features, eval_examples, args.device, dev_dataset) em, f1 = result['exact_match'], result['f1'] logger.info(f"Full precision teacher exact_match={em},f1={f1}") student_config = BertConfig.from_pretrained(args.student_model, quantize_act=True, weight_bits=args.weight_bits, input_bits=args.input_bits, clip_val=args.clip_val) student_model = QuantBertForQuestionAnswering.from_pretrained( args.student_model, config=student_config) student_model.to(args.device) if args.n_gpu > 1: student_model = torch.nn.DataParallel(student_model) # Prepare optimizer param_optimizer = list(student_model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] schedule = 'warmup_linear' optimizer = BertAdam(optimizer_grouped_parameters, schedule=schedule, lr=args.learning_rate, warmup=0.1, t_total=num_train_optimization_steps) loss_mse = MSELoss() # Train and evaluate global_step = 0 best_dev_f1 = 0.0 flag_loss = float('inf') previous_best = None tr_loss = 0. tr_att_loss = 0. tr_rep_loss = 0. tr_cls_loss = 0. for epoch_ in range(int(args.num_train_epochs)): for step, batch in enumerate(train_dataloader): student_model.train() batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, segment_ids, start_positions, end_positions = batch att_loss = 0. rep_loss = 0. cls_loss = 0. 
loss = 0 student_logits, student_atts, student_reps = student_model( input_ids, segment_ids, input_mask) with torch.no_grad(): teacher_logits, teacher_atts, teacher_reps = teacher_model( input_ids, segment_ids, input_mask) if args.pred_distill: soft_start_ce_loss = soft_cross_entropy( student_logits[0], teacher_logits[0]) soft_end_ce_loss = soft_cross_entropy(student_logits[1], teacher_logits[1]) cls_loss = soft_start_ce_loss + soft_end_ce_loss loss += cls_loss tr_cls_loss += cls_loss.item() if args.intermediate_distill: for student_att, teacher_att in zip(student_atts, teacher_atts): student_att = torch.where( student_att <= -1e2, torch.zeros_like(student_att).to(args.device), student_att) teacher_att = torch.where( teacher_att <= -1e2, torch.zeros_like(teacher_att).to(args.device), teacher_att) tmp_loss = loss_mse(student_att, teacher_att) att_loss += tmp_loss for student_rep, teacher_rep in zip(student_reps, teacher_reps): tmp_loss = loss_mse(student_rep, teacher_rep) rep_loss += tmp_loss loss += rep_loss + att_loss tr_att_loss += att_loss.item() tr_rep_loss += rep_loss.item() if args.n_gpu > 1: loss = loss.mean() loss.backward() tr_loss += loss.item() optimizer.step() optimizer.zero_grad() global_step += 1 save_model = False if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1: logger.info("***** Running evaluation *****") logger.info(f" Epoch = {epoch_} iter {global_step} step") if previous_best is not None: logger.info(f"Previous best = {previous_best}") student_model.eval() result = do_eval(args, student_model, eval_dataloader, eval_features, eval_examples, args.device, dev_dataset) em, f1 = result['exact_match'], result['f1'] logger.info(f'{em}/{f1}') if f1 > best_dev_f1: previous_best = f"exact_match={em},f1={f1}" best_dev_f1 = f1 save_model = True summaryWriter.add_scalars('performance', { 'exact_match': em, 'f1': f1 }, global_step) loss = tr_loss / global_step cls_loss = tr_cls_loss / global_step att_loss = tr_att_loss / global_step rep_loss = tr_rep_loss / global_step summaryWriter.add_scalar('total_loss', loss, global_step) summaryWriter.add_scalars( 'distill_loss', { 'att_loss': att_loss, 'rep_loss': rep_loss, 'cls_loss': cls_loss }, global_step) #save quantiozed model if save_model: logger.info(previous_best) if args.save_fp_model: logger.info( "******************** Save full precision model ********************" ) model_to_save = student_model.module if hasattr( student_model, 'module') else student_model output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) if args.save_quantized_model: logger.info( "******************** Save quantized model ********************" ) output_quant_dir = os.path.join(args.output_dir, 'quant') if not os.path.exists(output_quant_dir): os.makedirs(output_quant_dir) model_to_save = student_model.module if hasattr( student_model, 'module') else student_model quant_model = copy.deepcopy(model_to_save) for name, module in quant_model.named_modules(): if hasattr(module, 'weight_quantizer'): module.weight.data = module.weight_quantizer.apply( module.weight, module.weight_clip_val, module.weight_bits, True) output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_quant_dir, CONFIG_NAME) torch.save(quant_model.state_dict(), output_model_file) 
model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_quant_dir)
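# soft_cross_entropy above comes from the surrounding project and is not shown. A
# common definition (hedged; the original may normalize differently) is the
# cross-entropy between the student's log-softmax and the teacher's softmax:
import torch
import torch.nn.functional as F

def soft_cross_entropy(student_logits, teacher_logits):
    student_log_probs = F.log_softmax(student_logits, dim=-1)
    teacher_probs = F.softmax(teacher_logits, dim=-1)
    return (-teacher_probs * student_log_probs).sum(dim=-1).mean()

student = torch.randn(2, 10, requires_grad=True)
teacher = torch.randn(2, 10)
soft_cross_entropy(student, teacher).backward()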
def forward_head(self, outputs, head_name=None, attention_mask=None, labels=None): head_name = head_name or self.active_head if not head_name: logger.debug("No prediction head is used.") return outputs if head_name not in self.config.prediction_heads: raise ValueError("Unknown head_name '{}'".format(head_name)) head = self.config.prediction_heads[head_name] sequence_output = outputs[0] if head["head_type"] == "classification": logits = self.heads[head_name](sequence_output[:, 0]) outputs = (logits,) + outputs[2:] if labels is not None: if head["num_labels"] == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, head["num_labels"]), labels.view(-1)) outputs = (loss,) + outputs elif head["head_type"] == "multilabel_classification": logits = self.heads[head_name](sequence_output[:, 0]) outputs = (logits,) + outputs[2:] if labels is not None: loss_fct = BCEWithLogitsLoss() if labels.dtype != torch.float32: labels = labels.float() loss = loss_fct(logits, labels) outputs = (loss,) + outputs elif head["head_type"] == "multiple_choice": logits = self.heads[head_name](sequence_output[:, 0]) logits = logits.view(-1, head["num_choices"]) outputs = (logits,) + outputs[2:] if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(logits, labels) outputs = (loss,) + outputs elif head["head_type"] == "tagging": logits = self.heads[head_name](sequence_output) outputs = (logits,) + outputs[2:] if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs else: raise ValueError("Unknown head_type '{}'".format(head["head_type"])) return outputs # (loss), logits, (hidden_states), (attentions)
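# The tagging branch above restricts the token-classification loss to positions where
# attention_mask == 1 by replacing padded positions with the loss's ignore_index:
import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)
labels = torch.tensor([[1, 0, 2, 2], [0, 1, 2, 0]])
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
active_labels = torch.where(
    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(logits.view(-1, num_labels), active_labels)   # padded tokens are ignored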
def __init__(self):
    super(TripletLoss, self).__init__()
    self.mseLoss = MSELoss()
    self.alpha = 1
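# Only the constructor of TripletLoss is shown above; its forward is defined elsewhere.
# A common MSE-distance triplet-margin formulation (illustrative only, not necessarily
# what this class computes) pushes the anchor-negative distance to exceed the
# anchor-positive distance by the margin alpha:
import torch
from torch.nn import MSELoss

def mse_triplet_loss(anchor, positive, negative, alpha=1.0):
    mse = MSELoss()
    d_pos = mse(anchor, positive)
    d_neg = mse(anchor, negative)
    return torch.clamp(d_pos - d_neg + alpha, min=0.0)

a, p, n = torch.randn(4, 8), torch.randn(4, 8), torch.randn(4, 8)
loss = mse_triplet_loss(a, p, n)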
def forward( self, input_ids=None, input_numericul=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) pooled_output = cat((pooled_output, input_numericul), dim=1) pooled_output = pooled_output.type(torch.float32) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
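# The classifier above consumes the BERT pooled output concatenated with extra numeric
# features, so its input width must be hidden_size plus the number of numeric columns.
# Minimal shape check (sizes are illustrative):
import torch
from torch import cat, nn

hidden_size, num_numeric, num_labels = 768, 5, 2
classifier = nn.Linear(hidden_size + num_numeric, num_labels)

pooled_output = torch.randn(4, hidden_size)
input_numericul = torch.randn(4, num_numeric)
logits = classifier(cat((pooled_output, input_numericul), dim=1).type(torch.float32))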