def train(args, train_dataset, dev_dataset, model, tokenizer):
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    if args.warmup_ratio:
        ws = args.warmup_ratio * t_total
    else:
        ws = args.warmup_steps

    # prepare the optimizer and scheduler (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=ws, t_total=t_total)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1 = 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc='Epoch',
                            disable=args.local_rank not in [-1, 0])
    for _ in train_iterator:
        epoch_loss = 0.0
        epoch_iteration = tqdm(train_dataloader, desc='Iter')
        for step, batch in enumerate(epoch_iteration):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'segment_ids': batch[2]
            }
            y_true = batch[3].view(-1, 1).float()
            logits = model(**inputs)

            loss_func = BCEWithLogitsLoss()
            loss = loss_func(logits, y_true)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            epoch_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if global_step % args.dev_step == 0:
                    curr_f1 = evaluate(args, dev_dataset, model, tokenizer, mode='dev')
                    logger.info('  current f1: %f', curr_f1)
                    if curr_f1 > best_f1:
                        best_f1 = curr_f1
                        logger.info('  best f1: %f', best_f1)
                        output_file = os.path.join(
                            args.output_dir,
                            str(args.max_seq_length) + '_' +
                            str(args.train_batch_size) + '_' +
                            args.task + 'best_model.bin')
                        model_to_save = model.module if hasattr(model, 'module') else model
                        torch.save(model_to_save, output_file)

        logger.info('Training loss current epoch: %f', epoch_loss)

    return global_step, tr_loss / global_step
def main(args, logger): # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv') trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl') trn_df['is_original'] = 1 # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv') half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv') opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv') # clean texts # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer']) # load additional tokens # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin: # additional_tokens = pickle.load(fin) gkf = GroupKFold( n_splits=5).split( X=trn_df.question_body, groups=trn_df.question_body_le, ) histories = { 'trn_loss': {}, 'val_loss': {}, 'val_metric': {}, 'val_metric_raws': {}, } loaded_fold = -1 loaded_epoch = -1 if args.checkpoint: histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint) fold_best_metrics = [] fold_best_metrics_raws = [] for fold, (trn_idx, val_idx) in enumerate(gkf): if fold < loaded_fold: fold_best_metrics.append(np.max(histories["val_metric"][fold])) fold_best_metrics_raws.append( histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])]) continue sel_log( f' --------------------------- start fold {fold} --------------------------- ', logger) fold_trn_df = trn_df.iloc[trn_idx] # .query('is_original == 1') fold_trn_df = fold_trn_df.drop( ['is_original', 'question_body_le'], axis=1) # use only original row fold_val_df = trn_df.iloc[val_idx].query('is_original == 1') fold_val_df = fold_val_df.drop( ['is_original', 'question_body_le'], axis=1) if args.debug: fold_trn_df = fold_trn_df.sample(100, random_state=71) fold_val_df = fold_val_df.sample(100, random_state=71) temp = pd.Series(list(itertools.chain.from_iterable( fold_trn_df.question_title.apply(lambda x: x.split(' ')) + fold_trn_df.question_body.apply(lambda x: x.split(' ')) + fold_trn_df.answer.apply(lambda x: x.split(' ')) ))).value_counts() tokens = temp[temp >= 10].index.tolist() # tokens = [] tokens = [ 'CAT_TECHNOLOGY'.casefold(), 'CAT_STACKOVERFLOW'.casefold(), 'CAT_CULTURE'.casefold(), 'CAT_SCIENCE'.casefold(), 'CAT_LIFE_ARTS'.casefold(), ]# + additional_tokens fold_trn_df = pd.concat([fold_trn_df, opt_pseudo_df, half_opt_pseudo_df], axis=0) trn_dataset = QUESTDataset( df=fold_trn_df, mode='train', tokens=tokens, augment=[], tokenizer_type=TOKENIZER_TYPE, pretrained_model_name_or_path=TOKENIZER_PRETRAIN, do_lower_case=DO_LOWER_CASE, LABEL_COL=LABEL_COL, t_max_len=T_MAX_LEN, q_max_len=Q_MAX_LEN, a_max_len=A_MAX_LEN, tqa_mode=TQA_MODE, TBSEP='[TBSEP]', pos_id_type='arange', MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN, ) # update token trn_sampler = RandomSampler(data_source=trn_dataset) trn_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, sampler=trn_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=True, pin_memory=True) val_dataset = QUESTDataset( df=fold_val_df, mode='valid', tokens=tokens, augment=[], tokenizer_type=TOKENIZER_TYPE, pretrained_model_name_or_path=TOKENIZER_PRETRAIN, do_lower_case=DO_LOWER_CASE, LABEL_COL=LABEL_COL, t_max_len=T_MAX_LEN, q_max_len=Q_MAX_LEN, a_max_len=A_MAX_LEN, tqa_mode=TQA_MODE, TBSEP='[TBSEP]', pos_id_type='arange', MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN, ) val_sampler = RandomSampler(data_source=val_dataset) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, 
sampler=val_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=False, pin_memory=True) fobj = BCEWithLogitsLoss() state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict() model = BertModelForBinaryMultiLabelClassifier(num_labels=len(LABEL_COL), config_path=MODEL_CONFIG_PATH, state_dict=state_dict, token_size=len( trn_dataset.tokenizer), MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN, cat_last_layer_num=1, do_ratio=0.2, ) optimizer = optim.Adam(model.parameters(), lr=3e-5) scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=MAX_EPOCH, eta_min=1e-5) # load checkpoint model, optim, scheduler if args.checkpoint and fold == loaded_fold: load_checkpoint(args.checkpoint, model, optimizer, scheduler) for epoch in tqdm(list(range(MAX_EPOCH))): if fold <= loaded_fold and epoch <= loaded_epoch: continue if epoch < 1: model.freeze_unfreeze_bert(freeze=True, logger=logger) else: model.freeze_unfreeze_bert(freeze=False, logger=logger) model = DataParallel(model) model = model.to(DEVICE) trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, DEVICE) val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test( model, fobj, val_loader, DEVICE, mode='valid') scheduler.step() if fold in histories['trn_loss']: histories['trn_loss'][fold].append(trn_loss) else: histories['trn_loss'][fold] = [trn_loss, ] if fold in histories['val_loss']: histories['val_loss'][fold].append(val_loss) else: histories['val_loss'][fold] = [val_loss, ] if fold in histories['val_metric']: histories['val_metric'][fold].append(val_metric) else: histories['val_metric'][fold] = [val_metric, ] if fold in histories['val_metric_raws']: histories['val_metric_raws'][fold].append(val_metric_raws) else: histories['val_metric_raws'][fold] = [val_metric_raws, ] logging_val_metric_raws = '' for val_metric_raw in val_metric_raws: logging_val_metric_raws += f'{float(val_metric_raw):.4f}, ' sel_log( f'fold : {fold} -- epoch : {epoch} -- ' f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_metric : {float(val_metric):.4f} -- ' f'val_metric_raws : {logging_val_metric_raws}', logger) model = model.to('cpu') model = model.module save_checkpoint( f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model, optimizer, scheduler, histories, val_y_preds, val_y_trues, val_qa_ids, fold, epoch, val_loss, val_metric, ) fold_best_metrics.append(np.max(histories["val_metric"][fold])) fold_best_metrics_raws.append( histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])]) save_and_clean_for_prediction( f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer, clean=False) del model # calc training stats fold_best_metric_mean = np.mean(fold_best_metrics) fold_best_metric_std = np.std(fold_best_metrics) fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}' sel_log(fold_stats, logger) send_line_notification(fold_stats) fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0) fold_raw_stats = '' for metric_stats_raw in fold_best_metrics_raws_mean: fold_raw_stats += f'{float(metric_stats_raw):.4f},' sel_log(fold_raw_stats, logger) send_line_notification(fold_raw_stats) sel_log('now saving best checkpoints...', logger)
def criterion(self, pred, target):
    loss_fn = BCEWithLogitsLoss()
    pred = pred.float()
    target = target.float()
    loss = loss_fn(input=pred, target=target)
    return loss
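# Note (editor's sketch, not part of the original snippet): BCEWithLogitsLoss
# expects floating-point targets with the same shape as the logits; integer
# label tensors raise a dtype error, which is why the casts above matter.
# Hypothetical standalone example:
import torch
from torch.nn import BCEWithLogitsLoss

logits = torch.randn(4, 3)                     # raw scores, no sigmoid applied
targets = torch.randint(0, 2, (4, 3)).float()  # 0/1 labels cast to float
loss = BCEWithLogitsLoss()(logits, targets)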
val_dataset = CXRDataset(args.image_dir, [validation_fold])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True)
print('Total number of training images: {}, validation images: {}.'.format(
    len(train_dataset), len(val_dataset)))

aux_criterion = CrossEntropyLoss().to(device)
if args.label_encoding == 'ordinal':
    BCE_loss_criterion = BCEWithLogitsLoss().to(device)
    add_softmax = False
    output_channels = 3
if args.label_encoding == 'onehot':
    BCE_loss_criterion = BCELoss().to(device)
    add_softmax = True
    output_channels = 4

# Create instance of a resnet model
if args.model_architecture == 'resnet18':
    model = ResidualNet('ImageNet', 18, output_channels, None, add_softmax=True)
if args.model_architecture == 'resnet14_1':
def evaluate(dataloader, model, device, no_labels=False):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    logits = []
    preds = []
    labels = []
    ids = []
    avg_loss = 0.
    loss_fct = BCEWithLogitsLoss()

    def append(all_tensors, batch_tensor):
        if len(all_tensors) == 0:
            all_tensors.append(batch_tensor)
        else:
            all_tensors[0] = np.append(all_tensors[0], batch_tensor, axis=0)
        return all_tensors

    def detach(tensor, dtype=None):
        if dtype:
            return tensor.detach().cpu().numpy().astype(dtype)
        else:
            return tensor.detach().cpu().numpy()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Iteration"):
            batch = tuple(t.to(device) for t in batch)
            if no_labels:
                b_inputs, b_ids = batch
            else:
                b_inputs, b_labels, b_ids = batch
            b_logits = model(b_inputs)
            if not no_labels:
                loss = loss_fct(b_logits, b_labels)
                avg_loss += loss.item()
            b_preds = (torch.sigmoid(b_logits).detach().cpu().numpy() >= 0.5).astype(int)
            b_logits = detach(b_logits, float)
            if not no_labels:
                b_labels = detach(b_labels, int)
            b_ids = detach(b_ids, int)
            preds = append(preds, b_preds)
            logits = append(logits, b_logits)
            if not no_labels:
                labels = append(labels, b_labels)
            ids = append(ids, b_ids)

    preds = preds[0]
    logits = logits[0]
    if not no_labels:
        labels = labels[0]
        avg_loss /= len(dataloader)
    ids = ids[0]

    if not no_labels:
        score = f1_score(y_true=labels, y_pred=preds, average='micro')
        print("\nEvaluation - loss: {:.6f} f1: {:.4f}%\n".format(avg_loss, score))
    else:
        score = 0.
    return score, (logits, preds, labels, ids, avg_loss)
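# Aside (editor's sketch, not part of the original snippet): thresholding
# sigmoid(logits) at 0.5 is equivalent to thresholding the raw logits at 0,
# so hard predictions like the ones above can also be computed without the
# sigmoid call.
import torch
x = torch.randn(8)
assert ((torch.sigmoid(x) >= 0.5) == (x >= 0)).all()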
def main(rank, dev_id, args): set_seed() # Remove the line below will result in problems for multiprocess if args['num_devices'] > 1: torch.set_num_threads(1) if dev_id == -1: args['device'] = torch.device('cpu') else: args['device'] = torch.device('cuda:{}'.format(dev_id)) # Set current device torch.cuda.set_device(args['device']) train_set, val_set = load_dataset(args) get_center_subset(train_set, rank, args['num_devices']) train_loader = DataLoader(train_set, batch_size=args['batch_size'], collate_fn=collate_center, shuffle=True) val_loader = DataLoader(val_set, batch_size=args['batch_size'], collate_fn=collate_center, shuffle=False) model = WLNReactionCenter(node_in_feats=args['node_in_feats'], edge_in_feats=args['edge_in_feats'], node_pair_in_feats=args['node_pair_in_feats'], node_out_feats=args['node_out_feats'], n_layers=args['n_layers'], n_tasks=args['n_tasks']).to(args['device']) model.train() if rank == 0: print('# trainable parameters in the model: ', count_parameters(model)) criterion = BCEWithLogitsLoss(reduction='sum') optimizer = Adam(model.parameters(), lr=args['lr']) if args['num_devices'] <= 1: from utils import Optimizer optimizer = Optimizer(model, args['lr'], optimizer, max_grad_norm=args['max_norm']) else: from utils import MultiProcessOptimizer optimizer = MultiProcessOptimizer(args['num_devices'], model, args['lr'], optimizer, max_grad_norm=args['max_norm']) total_iter = 0 rank_iter = 0 grad_norm_sum = 0 loss_sum = 0 dur = [] for epoch in range(args['num_epochs']): t0 = time.time() for batch_id, batch_data in enumerate(train_loader): total_iter += args['num_devices'] rank_iter += 1 batch_reactions, batch_graph_edits, batch_mol_graphs, \ batch_complete_graphs, batch_atom_pair_labels = batch_data labels = batch_atom_pair_labels.to(args['device']) pred, biased_pred = reaction_center_prediction( args['device'], model, batch_mol_graphs, batch_complete_graphs) loss = criterion(pred, labels) / len(batch_reactions) loss_sum += loss.cpu().detach().data.item() grad_norm_sum += optimizer.backward_and_step(loss) if rank_iter % args['print_every'] == 0 and rank == 0: progress = 'Epoch {:d}/{:d}, iter {:d}/{:d} | ' \ 'loss {:.4f} | grad norm {:.4f}'.format( epoch + 1, args['num_epochs'], batch_id + 1, len(train_loader), loss_sum / args['print_every'], grad_norm_sum / args['print_every']) print(progress) grad_norm_sum = 0 loss_sum = 0 if total_iter % args['decay_every'] == 0: optimizer.decay_lr(args['lr_decay_factor']) if total_iter % args['decay_every'] == 0 and rank == 0: if epoch >= 1: dur.append(time.time() - t0) print('Training time per {:d} iterations: {:.4f}'.format( rank_iter, np.mean(dur))) total_samples = total_iter * args['batch_size'] prediction_summary = 'total samples {:d}, (epoch {:d}/{:d}, iter {:d}/{:d}) '.format( total_samples, epoch + 1, args['num_epochs'], batch_id + 1, len(train_loader)) + \ reaction_center_final_eval(args, args['top_ks_val'], model, val_loader, easy=True) print(prediction_summary) with open(args['result_path'] + '/val_eval.txt', 'a') as f: f.write(prediction_summary) torch.save({'model_state_dict': model.state_dict()}, args['result_path'] + '/model_{:d}.pkl'.format(total_samples)) t0 = time.time() model.train() synchronize(args['num_devices'])
def __init__(self):
    self.loss_fn = BCEWithLogitsLoss()
def forward( self, pixel_values=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: Examples:: >>> from transformers import ImageGPTFeatureExtractor, ImageGPTForImageClassification >>> from PIL import Image >>> import requests >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small') >>> model = ImageGPTForImageClassification.from_pretrained('openai/imagegpt-small') >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( pixel_values, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] # average-pool the hidden states along the sequence dimension pooled_hidden_states = hidden_states.mean(dim=1) # project from (batch_size, hidden_size) to (batch_size, num_labels) logits = self.score(pooled_hidden_states) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, logits=logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
def Loss(self, logits, Y):
    return BCEWithLogitsLoss()(logits, Y)
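# Note (editor's sketch, not part of the original class): constructing the
# module on every call works, but the stateless functional form is the usual
# idiom when no weight/pos_weight/reduction state needs to be kept around.
# Equivalent method body, assuming no per-class weights are needed:
import torch.nn.functional as F

def Loss(self, logits, Y):
    return F.binary_cross_entropy_with_logits(logits, Y)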
def forward( self, input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] logits = self.classifier(hidden_states) if input_ids is not None: batch_size, sequence_length = input_ids.shape[:2] else: batch_size, sequence_length = inputs_embeds.shape[:2] assert ( self.config.pad_token_id is not None or batch_size == 1 ), "Cannot handle batch sizes > 1 if no padding token is defined." if self.config.pad_token_id is None: sequence_lengths = -1 else: if input_ids is not None: sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 else: sequence_lengths = -1 logger.warning( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" ) pooled_logits = logits[range(batch_size), sequence_lengths] loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) else: loss = loss_fct(pooled_logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[2:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=pooled_logits, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    start_positions=None,
    end_positions=None,
    output_attentions=None,
    output_hidden_states=None,
):
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=None,
    )
    sequence_output = outputs[0]

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    total_loss = None
    if start_positions is not None and end_positions is not None:
        # start_positions/end_positions are used as float targets here, so both
        # heads are trained with binary cross-entropy over the token logits.
        loss_fct = BCEWithLogitsLoss()
        start_loss = loss_fct(start_logits, start_positions.float())
        end_loss = loss_fct(end_logits, end_positions.float())
        total_loss = (start_loss + end_loss) / 2

    output = (start_logits, end_logits) + outputs[2:]
    return ((total_loss,) + output) if total_loss is not None else output
def train(self, dataset: BaseADDataset, net: BaseNet):
    logger = logging.getLogger()

    if self.train_loader is None:
        try:
            self.train_loader, _, _ = dataset.loaders(
                batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
        except Exception:
            self.train_loader, _ = dataset.loaders(
                batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)

    # Set device for network
    net = net.to(self.device)

    # Set optimizer (Adam optimizer for now)
    optimizer = optim.Adam(net.parameters(), lr=self.lr,
                           weight_decay=self.weight_decay)

    # Set learning rate scheduler
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=self.lr_milestones, gamma=0.1)

    # Set loss function
    criterion = BCEWithLogitsLoss()

    # Training
    logger.info('Starting training...')
    start_time = time.time()
    net.train()
    for epoch in range(self.n_epochs):
        epoch_loss = 0.0
        n_batches = 0
        epoch_start_time = time.time()
        for data in self.train_loader:
            inputs, targets, _, _ = data
            inputs, targets = inputs.to(self.device), targets.to(self.device)

            # Zero the network parameter gradients
            optimizer.zero_grad()

            # Update network parameters via backpropagation: forward + backward + optimize
            outputs = net(inputs)
            targets = targets.type_as(outputs)
            loss = criterion(outputs, targets.unsqueeze(1))
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            n_batches += 1

        scheduler.step()
        if epoch in self.lr_milestones:
            logger.info('  LR scheduler: new learning rate is %g'
                        % float(scheduler.get_lr()[0]))

        # log epoch statistics
        epoch_train_time = time.time() - epoch_start_time
        logger.info(
            f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s '
            f'| Train Loss: {epoch_loss / n_batches:.6f} |')

        if self.reporter:
            self._log_train(net, dataset)
            self.reporter(
                **{'train/loss/' + str(dataset.id): epoch_loss / n_batches})

    self.train_time = time.time() - start_time
    logger.info('Training Time: {:.3f}s'.format(self.train_time))
    logger.info('Finished training.')

    return net
def __init__(self):
    super().__init__()
    self.weight = None
    self.bce_with_logits_loss = BCEWithLogitsLoss(weight=self.weight,
                                                  reduction='mean')
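# Note (editor's sketch, not part of the original module): `weight` rescales
# every element of the loss, while `pos_weight` rescales only the positive
# term and is the usual knob for class imbalance in multi-label setups.
# Hypothetical example with 3 labels whose positives are weighted 2x:
import torch
from torch.nn import BCEWithLogitsLoss

pos_weight = torch.tensor([2.0, 2.0, 2.0])   # one entry per label
loss_fn = BCEWithLogitsLoss(pos_weight=pos_weight, reduction='mean')
loss = loss_fn(torch.randn(8, 3), torch.randint(0, 2, (8, 3)).float())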
def __init__(self):
    super().__init__()
    self.loss = BCEWithLogitsLoss(reduction='mean')
def main(args, logger): trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv') gkf = GroupKFold(n_splits=5).split(X=trn_df.question_body, groups=trn_df.question_body) histories = { 'trn_loss': [], 'val_loss': [], 'val_metric': [], } loaded_fold = -1 loaded_epoch = -1 if args.checkpoint: histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint) for fold, (trn_idx, val_idx) in enumerate(gkf): if fold < loaded_fold: continue fold_trn_df = trn_df.iloc[trn_idx] fold_val_df = trn_df.iloc[val_idx] if args.debug: fold_trn_df = fold_trn_df.sample(100, random_state=71) fold_val_df = fold_val_df.sample(100, random_state=71) temp = pd.Series( list( itertools.chain.from_iterable( fold_trn_df.question_title.apply(lambda x: x.split(' ')) + fold_trn_df.question_body.apply(lambda x: x.split(' ')) + fold_trn_df.answer.apply(lambda x: x.split(' ')))) ).value_counts() tokens = temp[temp >= 10].index.tolist() # tokens = [] tokens = [ 'CAT_TECHNOLOGY'.casefold(), 'CAT_STACKOVERFLOW'.casefold(), 'CAT_CULTURE'.casefold(), 'CAT_SCIENCE'.casefold(), 'CAT_LIFE_ARTS'.casefold(), ] trn_dataset = QUESTDataset( df=fold_trn_df, mode='train', tokens=tokens, augment=[], pretrained_model_name_or_path=TOKENIZER_PRETRAIN, ) # update token trn_sampler = RandomSampler(data_source=trn_dataset) trn_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, sampler=trn_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=True, pin_memory=True) val_dataset = QUESTDataset( df=fold_val_df, mode='valid', tokens=tokens, augment=[], pretrained_model_name_or_path=TOKENIZER_PRETRAIN, ) val_sampler = RandomSampler(data_source=val_dataset) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=False, pin_memory=True) fobj = BCEWithLogitsLoss() model = BertModelForBinaryMultiLabelClassifier( num_labels=30, pretrained_model_name_or_path=MODEL_PRETRAIN, # cat_num=5 ) model.resize_token_embeddings(len(trn_dataset.tokenizer)) optimizer = optim.Adam(model.parameters(), lr=3e-5) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_EPOCH, eta_min=1e-5) # load checkpoint model, optim, scheduler if args.checkpoint and fold == loaded_fold: load_checkpoint(args.checkpoint, model, optimizer, scheduler) for epoch in tqdm(list(range(MAX_EPOCH))): model = model.to(DEVICE) if fold <= loaded_fold and epoch <= loaded_epoch: continue trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader) val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test( model, fobj, val_loader) scheduler.step() histories['trn_loss'].append(trn_loss) histories['val_loss'].append(val_loss) histories['val_metric'].append(val_metric) sel_log( f'epoch : {epoch} -- fold : {fold} -- ' f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_metric : {float(val_metric):.4f}', logger) model = model.to('cpu') save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model, optimizer, scheduler, histories, val_y_preds, val_y_trues, val_qa_ids, fold, epoch, val_loss, val_metric) save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer) del model sel_log('now saving best checkpoints...', logger)
def forward( self, pixel_values=None, head_mask=None, labels=None, output_attentions=None, output_hidden_states=None, interpolate_pos_encoding=None, return_dict=None, ): r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.vit( pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.classifier(sequence_output[:, 0, :]) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
model = BertClassifier(args, cache_dir=BERT_DIR)
model.load_state_dict(torch.load(args.model_path, map_location=device))
if args.verbose:
    print(f"model loaded from {args.model_path}")
model.to(device)

text_preprocessor = build_preprocess(demojize=True,
                                     textify_emoji=True,
                                     mention_limit=3,
                                     punc_limit=3,
                                     lower_hashtag=True,
                                     segment_hashtag=True,
                                     add_cap_sign=True)

loss_fn_class = BCEWithLogitsLoss()
loss_fn_domain = CrossEntropyLoss()
loss_fn_domain_joint = BCELoss()

if args.file_path.endswith(".pkl"):
    data, bool_already_processed = import_pickle_file(args.file_path)
else:
    data = import_txt_file(args.file_path)
    bool_already_processed = False

if args.verbose:
    print("processing the data")
tgt_test = process_data(data, bool_already_processed)

if args.verbose:
    print("starting to generate")
def __init__(self,config,vocab): super(AlignCNN, self).__init__() self.config = config self.vocab = vocab # Character embeddings self.embedding = nn.Embedding(vocab.size+1, config.embedding_dim, padding_idx=0) # Sequence encoder of strings (LSTM) self.rnn = nn.LSTM(config.embedding_dim, config.rnn_hidden_size, 1, bidirectional = config.bidirectional, batch_first = True) if self.config.bidirectional: self.num_directions = 2 else: self.num_directions = 1 # Variables for initial states of LSTM (these are different for train and dev because dev might be of different batch sizes) self.h0 = Variable(torch.zeros(self.num_directions, config.batch_size, config.rnn_hidden_size).cuda(), requires_grad=False) self.c0 = Variable(torch.zeros(self.num_directions, config.batch_size, config.rnn_hidden_size).cuda(), requires_grad=False) self.h0_dev = Variable(torch.zeros(self.num_directions, config.dev_batch_size, config.rnn_hidden_size).cuda(), requires_grad=False) self.c0_dev = Variable(torch.zeros(self.num_directions, config.dev_batch_size, config.rnn_hidden_size).cuda(), requires_grad=False) # Define the CNN used to score the alignment matrix pool_output_height = int(np.floor(config.max_string_len/2.0)) # Select # of layers / increasing or decreasing filter size based on config if config.num_layers == 4: self.num_layers = 4 self.relu = nn.ReLU() if config.increasing == True: convlyr = nn.Conv2d(1, config.filter_count, 3, padding=1, stride=1) convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1) convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1) convlyr4 = nn.Conv2d(config.filter_count3, config.filter_count4, 7, padding=3, stride=1) else: convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1) convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1) convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1) convlyr4 = nn.Conv2d(config.filter_count3, config.filter_count4, 3, padding=1, stride=1) self.add_module("cnn2",convlyr2) self.add_module("cnn3",convlyr3) self.add_module("cnn4",convlyr4) self.align_weights = nn.Parameter(torch.randn(config.filter_count3, pool_output_height, pool_output_height).cuda(),requires_grad=True) elif config.num_layers == 3: self.num_layers = 3 self.relu = nn.ReLU() if config.increasing == True: convlyr = nn.Conv2d(1, config.filter_count, 5, padding=2, stride=1) convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1) convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 7, padding=3, stride=1) else: convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1) convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1) convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1) self.add_module("cnn2",convlyr2) self.add_module("cnn3",convlyr3) self.align_weights = nn.Parameter(torch.randn(config.filter_count3, pool_output_height, pool_output_height).cuda(),requires_grad=True) elif config.num_layers == 2: self.num_layers = 2 self.relu = nn.ReLU() convlyr = nn.Conv2d(1, config.filter_count, 5, padding=2, stride=1) convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 3, padding=1, stride=1) self.add_module("cnn2",convlyr2) self.align_weights = nn.Parameter(torch.randn(config.filter_count2, pool_output_height, pool_output_height).cuda(),requires_grad=True) else: self.num_layers = 1 convlyr = nn.Conv2d(1, config.filter_count, 
                                7, padding=3, stride=1)
            self.align_weights = nn.Parameter(
                torch.randn(config.filter_count, pool_output_height, pool_output_height).cuda(),
                requires_grad=True)
        self.add_module("cnn", convlyr)

        # Define pooling
        self.pool = nn.MaxPool2d((2, 2), stride=2)

        # Vector of ones (used for loss)
        self.ones = Variable(torch.ones(config.batch_size, 1).cuda())

        # Loss
        self.loss = BCEWithLogitsLoss()
def run(settings): settings.description = 'Training script for STARK-S, STARK-ST stage1, and STARK-ST stage2' # update the default configs with config file if not os.path.exists(settings.cfg_file): raise ValueError("%s doesn't exist." % settings.cfg_file) config_module = importlib.import_module("lib.config.%s.config" % settings.script_name) cfg = config_module.cfg config_module.update_config_from_file(settings.cfg_file) if settings.local_rank in [-1, 0]: print("New configuration is shown below.") for key in cfg.keys(): print("%s configuration:" % key, cfg[key]) print('\n') # update settings based on cfg update_settings(settings, cfg) # Record the training log log_dir = os.path.join(settings.save_dir, 'logs') if settings.local_rank in [-1, 0]: if not os.path.exists(log_dir): os.makedirs(log_dir) settings.log_file = os.path.join( log_dir, "%s-%s.log" % (settings.script_name, settings.config_name)) # Build dataloaders loader_train, loader_val = build_dataloaders(cfg, settings) # Create network if settings.script_name == "stark_s": net = build_starks(cfg) elif settings.script_name == "stark_st1" or settings.script_name == "stark_st2": net = build_starkst(cfg) else: raise ValueError("illegal script name") # wrap networks to distributed one net.cuda() if settings.local_rank != -1: net = DDP(net, device_ids=[settings.local_rank], find_unused_parameters=True) settings.device = torch.device("cuda:%d" % settings.local_rank) else: settings.device = torch.device("cuda:0") # Loss functions and Actors if settings.script_name == "stark_s" or settings.script_name == "stark_st1": objective = {'giou': giou_loss, 'l1': l1_loss} loss_weight = { 'giou': cfg.TRAIN.GIOU_WEIGHT, 'l1': cfg.TRAIN.L1_WEIGHT } actor = STARKSActor(net=net, objective=objective, loss_weight=loss_weight, settings=settings) elif settings.script_name == "stark_st2": objective = {'cls': BCEWithLogitsLoss()} loss_weight = {'cls': 1.0} actor = STARKSTActor(net=net, objective=objective, loss_weight=loss_weight, settings=settings) else: raise ValueError("illegal script name") if cfg.TRAIN.DEEP_SUPERVISION: raise ValueError("Deep supervision is not supported now.") # Optimizer, parameters, and learning rates optimizer, lr_scheduler = get_optimizer_scheduler(net, cfg) trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer, settings, lr_scheduler) # train process if settings.script_name == "stark_st2": trainer.train(cfg.TRAIN.EPOCH, load_latest=True, fail_safe=True, load_previous_ckpt=True) else: trainer.train(cfg.TRAIN.EPOCH, load_latest=True, fail_safe=True)
unlabeled_idxs = list(range(labelnum, 80))
batch_sampler = TwoStreamBatchSampler(
    labeled_idxs, unlabeled_idxs, batch_size, batch_size - labeled_bs)

def worker_init_fn(worker_id):
    random.seed(args.seed + worker_id)

trainloader = DataLoader(db_train, batch_sampler=batch_sampler,
                         num_workers=4, pin_memory=True,
                         worker_init_fn=worker_init_fn)

model_seg.train()
model_dis.train()

optimizer_seg = optim.SGD(model_seg.parameters(), lr=base_lr,
                          momentum=0.9, weight_decay=0.0001)
optimizer_dis = optim.SGD(model_dis.parameters(), lr=base_lr,
                          momentum=0.9, weight_decay=0.0001)
ce_loss = BCEWithLogitsLoss()
mse_loss = MSELoss()

if args.consistency_type == 'mse':
    consistency_criterion = losses.softmax_mse_loss
elif args.consistency_type == 'kl':
    consistency_criterion = losses.softmax_kl_loss
else:
    assert False, args.consistency_type

writer = SummaryWriter(snapshot_path + '/log')
logging.info("{} iterations per epoch".format(len(trainloader)))

iter_num = 0
max_epoch = max_iterations // len(trainloader) + 1
lr_ = base_lr
def main(args, logger): # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv') trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl') trn_df['is_original'] = 1 # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl') # aug_df['is_original'] = 0 # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True) gkf = GroupKFold(n_splits=5).split( X=trn_df.question_body, groups=trn_df.question_body_le, ) histories = { 'trn_loss': {}, 'val_loss': {}, 'val_metric': {}, 'val_metric_raws': {}, } loaded_fold = -1 loaded_epoch = -1 if args.checkpoint: histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint) # calc max_seq_len using quest dataset # max_seq_len = QUESTDataset( # df=trn_df, # mode='train', # tokens=[], # augment=[], # pretrained_model_name_or_path=TOKENIZER_PRETRAIN, # ).MAX_SEQUENCE_LENGTH # max_seq_len = 9458 # max_seq_len = 1504 max_seq_len = 512 fold_best_metrics = [] fold_best_metrics_raws = [] for fold, (trn_idx, val_idx) in enumerate(gkf): if fold < loaded_fold: fold_best_metrics.append(np.max(histories["val_metric"][fold])) fold_best_metrics_raws.append( histories["val_metric_raws"][fold][np.argmax( histories["val_metric"][fold])]) continue sel_log( f' --------------------------- start fold {fold} --------------------------- ', logger) fold_trn_df = trn_df.iloc[trn_idx] # .query('is_original == 1') fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1) # use only original row fold_val_df = trn_df.iloc[val_idx].query('is_original == 1') fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1) if args.debug: fold_trn_df = fold_trn_df.sample(100, random_state=71) fold_val_df = fold_val_df.sample(100, random_state=71) temp = pd.Series( list( itertools.chain.from_iterable( fold_trn_df.question_title.apply(lambda x: x.split(' ')) + fold_trn_df.question_body.apply(lambda x: x.split(' ')) + fold_trn_df.answer.apply(lambda x: x.split(' ')))) ).value_counts() tokens = temp[temp >= 10].index.tolist() # tokens = [] tokens = [ 'CAT_TECHNOLOGY'.casefold(), 'CAT_STACKOVERFLOW'.casefold(), 'CAT_CULTURE'.casefold(), 'CAT_SCIENCE'.casefold(), 'CAT_LIFE_ARTS'.casefold(), ] trn_dataset = QUESTDataset( df=fold_trn_df, mode='train', tokens=tokens, augment=[], pretrained_model_name_or_path=TOKENIZER_PRETRAIN, MAX_SEQUENCE_LENGTH=max_seq_len, ) # update token trn_sampler = RandomSampler(data_source=trn_dataset) trn_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, sampler=trn_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=True, pin_memory=True) val_dataset = QUESTDataset( df=fold_val_df, mode='valid', tokens=tokens, augment=[], pretrained_model_name_or_path=TOKENIZER_PRETRAIN, MAX_SEQUENCE_LENGTH=max_seq_len, ) val_sampler = RandomSampler(data_source=val_dataset) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=False, pin_memory=True) fobj = BCEWithLogitsLoss() # fobj = MSELoss() model = BertModelForBinaryMultiLabelClassifier( num_labels=30, pretrained_model_name_or_path=MODEL_PRETRAIN, # cat_num=5, token_size=len(trn_dataset.tokenizer), MAX_SEQUENCE_LENGTH=max_seq_len, ) optimizer = optim.Adam(model.parameters(), lr=3e-5) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_EPOCH, eta_min=1e-5) # load checkpoint model, optim, scheduler if args.checkpoint and fold == loaded_fold: load_checkpoint(args.checkpoint, model, 
optimizer, scheduler) for epoch in tqdm(list(range(MAX_EPOCH))): if fold <= loaded_fold and epoch <= loaded_epoch: continue if epoch < 1: model.freeze_unfreeze_bert(freeze=True, logger=logger) else: model.freeze_unfreeze_bert(freeze=False, logger=logger) model = DataParallel(model) model = model.to(DEVICE) trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader) val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test( model, fobj, val_loader) scheduler.step() if fold in histories['trn_loss']: histories['trn_loss'][fold].append(trn_loss) else: histories['trn_loss'][fold] = [ trn_loss, ] if fold in histories['val_loss']: histories['val_loss'][fold].append(val_loss) else: histories['val_loss'][fold] = [ val_loss, ] if fold in histories['val_metric']: histories['val_metric'][fold].append(val_metric) else: histories['val_metric'][fold] = [ val_metric, ] if fold in histories['val_metric_raws']: histories['val_metric_raws'][fold].append(val_metric_raws) else: histories['val_metric_raws'][fold] = [ val_metric_raws, ] logging_val_metric_raws = '' for val_metric_raw in val_metric_raws: logging_val_metric_raws += f'{float(val_metric_raw):.4f}, ' sel_log( f'fold : {fold} -- epoch : {epoch} -- ' f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_metric : {float(val_metric):.4f} -- ' f'val_metric_raws : {logging_val_metric_raws}', logger) model = model.to('cpu') model = model.module save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model, optimizer, scheduler, histories, val_y_preds, val_y_trues, val_qa_ids, fold, epoch, val_loss, val_metric) fold_best_metrics.append(np.max(histories["val_metric"][fold])) fold_best_metrics_raws.append( histories["val_metric_raws"][fold][np.argmax( histories["val_metric"][fold])]) save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer) del model # calc training stats fold_best_metric_mean = np.mean(fold_best_metrics) fold_best_metric_std = np.std(fold_best_metrics) fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}' sel_log(fold_stats, logger) send_line_notification(fold_stats) fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0) fold_raw_stats = '' for metric_stats_raw in fold_best_metrics_raws_mean: fold_raw_stats += f'{float(metric_stats_raw):.4f},' sel_log(fold_raw_stats, logger) send_line_notification(fold_raw_stats) sel_log('now saving best checkpoints...', logger)
def forward( self, pixel_values=None, head_mask=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: Examples: ```python >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassification >>> import torch >>> from PIL import Image >>> import requests >>> torch.manual_seed(3) # doctest: +IGNORE_RESULT >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here, >>> # so the head will be randomly initialized, hence the predictions will be random >>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224") >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> logits = outputs.logits >>> # model predicts one of the 1000 ImageNet classes >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) Predicted class: maillot ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.deit( pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.classifier(sequence_output[:, 0, :]) # we don't use the distillation token loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
def train(self, training_dataset, holdout_dataset, model_parameters):
    optimizer = Adam(self.__model.parameters(),
                     lr=model_parameters.learning_rate)
    loss = BCEWithLogitsLoss(reduction="sum")
    self.train_model(model_parameters, training_dataset, holdout_dataset,
                     loss, optimizer)
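# Note (editor's sketch, not part of the original class): `reduction` controls
# how the element-wise losses are combined; "sum" totals them, while the
# default "mean" averages over all elements. (The older `reduce` keyword took
# a bool and is deprecated.)
import torch
from torch.nn import BCEWithLogitsLoss

x, y = torch.randn(4, 2), torch.randint(0, 2, (4, 2)).float()
assert torch.isclose(BCEWithLogitsLoss(reduction="sum")(x, y),
                     BCEWithLogitsLoss(reduction="mean")(x, y) * y.numel())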
class CustomLoss3(nn.Module):
    def __init__(self):
        super(CustomLoss3, self).__init__()

    def forward(self, inputs, targets):
        inputs = inputs.flatten()
        targets = targets.flatten()
        loss = -(targets * log_sigmoid(inputs) +
                 (1 - targets) * torch.log(1 - sigmoid(inputs)))
        # print("l1 loss average: {}".format(torch.mean(loss)))
        return torch.mean(loss)


if __name__ == "__main__":
    loss_fn1 = CustomLoss1()
    loss_fn2 = CustomLoss2()
    loss_ = BCEWithLogitsLoss()
    y_true = torch.tensor([[0., 0., 1., 1.],
                           [0., 0., 1., 1.]])
    y_pred = torch.tensor([[0.1, 0.1, 0.9, 0.9],
                           [0.1, 0.1, 0.9, 0.8]])
    print(loss_fn1(y_pred, y_true))
    print(loss_fn2(y_pred, y_true))
    print(loss_(y_pred, y_true))
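# Note (editor's sketch, not part of the original script): the hand-written
# term torch.log(1 - sigmoid(inputs)) underflows to -inf for large positive
# inputs, whereas BCEWithLogitsLoss folds the sigmoid and log together in a
# numerically stable way. Since 1 - sigmoid(x) == sigmoid(-x), a stable
# equivalent of that term is F.logsigmoid(-inputs):
import torch
import torch.nn.functional as F

x = torch.tensor([50.0])
print(torch.log(1 - torch.sigmoid(x)))  # tensor([-inf])
print(F.logsigmoid(-x))                 # tensor([-50.]) -- finite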
        batch_size=batch_size,
        shuffle=True,
        num_workers=0)
    for dataset in split(dataset=dataset)
]

base_model = nn.Sequential(
    nn.Dropout(p=dropout_rate),
    nn.Linear(in_features=ds_info['feature_dim'], out_features=1, bias=True),
)
print(base_model)

t_handler = TrainTestHandler(
    base_model=base_model,
    criterion=BCEWithLogitsLoss(),
    device=device,
)

hyper_parameters = {
    'model_name': 'simple_feedforward',
    'batch_size': batch_size,
    'random_seed': random_seed,
    'num_train_samples': len(train_loader),
    'num_test_samples': len(test_loader),
    'num_val_samples': len(val_loader),
    'task_name': 'SIM',
    'dropout_rate': dropout_rate,
}

t_handler.train(train_loader, val_loader)
train_loader = create_loader(cfg.train_csv, data_dir, cfg, mode='train',
                             dicom=False, type=cfg.type)
val_loader = create_loader(cfg.dev_csv, data_dir, cfg, mode='val',
                           dicom=False, type=cfg.type)

# loss_func = BCELoss()
loss_func = BCEWithLogitsLoss()

metrics_dict = {
    'auc': AUC(),
    'sensitivity': Recall(),
    'specificity': Specificity(),
    'f1': F1()
}
loader_dict = {'train': train_loader, 'val': val_loader}

chexpert_model = CheXpert_model(cfg, loss_func, metrics_dict)
chexpert_model.load_backbone(cfg.ckp_path, strict=False)
# chexpert_model.freeze_backbone()

writer = SummaryWriter(os.path.join('experiment', cfg.log_dir))
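# Note (editor's sketch, not part of the original script): swapping between
# the commented-out BCELoss and BCEWithLogitsLoss also changes what the model
# must output: BCELoss expects probabilities (sigmoid already applied), while
# BCEWithLogitsLoss expects raw logits and applies the sigmoid internally.
import torch
from torch.nn import BCELoss, BCEWithLogitsLoss

logits = torch.randn(4, 1)
targets = torch.randint(0, 2, (4, 1)).float()
assert torch.isclose(BCEWithLogitsLoss()(logits, targets),
                     BCELoss()(torch.sigmoid(logits), targets))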
def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + distilbert_output[1:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, )
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits, ) + outputs[ 2:] # add hidden states and attention if they are here # if labels is not None: # if labels is not None and any(labels): if labels is not None and torch.sum(labels) > 0: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: # # logger.debug(f"logits: {type(logits)}, {logits}") # preds = logits_to_preds(logits.detach().cpu().numpy()) # preds = torch.tensor(preds).float().cuda() # loss = FocalLoss(gamma=1.5, alpha=None)(preds.view(-1), # labels.view(-1)) # # loss_fct = CrossEntropyLoss() # loss = loss_fct(logits.view(-1, self.num_labels), # labels.view(-1)) # logger.debug(f"preds: {type(preds)}, {preds}") assert self.loss_type in [ 'FocalLoss', 'DiceLoss', 'CircleLoss', 'CrossEntropyLoss', 'BCEWithLogitsLoss' ] if self.loss_type == 'FocalLoss': # 0.58 loss = FocalLoss(gamma=self.focalloss_gamma, alpha=self.focalloss_alpha)(logits.view( -1, self.num_labels), labels.view(-1)) elif self.loss_type == 'CircleLoss': from 
...losses.circleloss import CircleLoss # 0.496885 (4/5) loss = CircleLoss(gamma=15, m=1e-6, similarity='cos')( logits.view(-1, self.num_labels), labels.view(-1)) # loss = CircleLoss(gamma=15, m=1e-6, similarity='dot')( # logits.view(-1, self.num_labels), labels.view(-1)) elif self.loss_type == 'DiceLoss': # loss = DiceLoss(weight=self.diceloss_weight)(logits.view( # -1, self.num_labels), labels.view(-1)) loss = DiceLoss(epsilon=1e-5)(logits.view( -1, self.num_labels), labels.view(-1, self.num_labels)) elif self.loss_type == 'BCEWithLogitsLoss': loss_fct = BCEWithLogitsLoss() # logger.debug(f"logits: {logits.shape}, {logits}") # logger.debug(f"labels: {labels.shape}, {labels}") # loss = loss_fct(logits, labels.float()) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels).float()) else: loss_fct = CrossEntropyLoss() # logger.warning(f"logits: {logits.shape}") # logger.warning(f"labels: {labels.shape}") loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) # loss = loss_fct(logits.view(-1, self.num_labels), # labels.view(-1, self.num_labels)) # from ...losses import FocalLoss # # loss = FocalLoss(gamma=1.5, # # alpha=None)(logits.view(-1, self.num_labels), # # labels.view(-1)) # loss = FocalLoss( # gamma=1.5, # alpha=[ # # # --- 0.9354 (10/10) # # # 0.5, # # # 0.40, # # # 0.10, # # # 0.05, # # # 0.5, # # # 0.30, # # # 0.25, # # # 0.35 # # # --- 0.9383 (10/10) # # 0.5, # # 0.40, # # 0.10, # # 0.05, # # 0.5, # # 0.40, # # 0.40, # # 0.40 # # # focalloss_sympotom --- 0.9423 (10/10) # # # online f1: 0.92590 # # 0.50, # 10 # # 0.40, # 52 # # 0.10, # 1267 # # 0.05, # 2550 # # 0.50, # 7 # # 0.40, # 147 # # 0.35, # 867 # # 0.40, # 100 # # # focalloss_sympotom --- 0.9396 (10/10) # # 0.50, # 10 # # 0.40, # 52 # # 0.10, # 1267 # # 0.05, # 2550 # # 0.50, # 7 # # 0.40, # 147 # # 0.38, # 867 # # 0.40, # 100 # # # focalloss_disease --- 0.9407 (7/10) # # # online f1: 0.92330 # # 0.50, # 10 # # 0.40, # 52 # # 0.08, # 1267 # # 0.05, # 2550 # # 0.50, # 7 # # 0.40, # 147 # # 0.35, # 867 # # 0.40, # 100 # # # focalloss_others --- 0.9396 (10/10) # # 0.50, # 10 # # 0.40, # 52 # # 0.10, # 1267 # # 0.05, # 2550 # # 0.50, # 7 # # 0.40, # 147 # # 0.35, # 867 # # 0.30, # 100 # # focalloss_checks --- 0.9397 (8/10) # # # online f1: 0.92640 # 0.50, # 10 # 0.40, # 52 # 0.10, # 1267 # 0.05, # 2550 # 0.50, # 7 # 0.38, # 147 # 0.35, # 867 # 0.40, # 100 # ])(logits.view(-1, self.num_labels), labels.view(-1)) # # glue_labels = ['病毒', '细菌', '疾病', '药物', '医学专科', '检查科目', '症状', 'NoneType'] # 10, 52, 1267, 2550, 7, 147, 867, 100 # [0.200, 0.200, 0.025, 0.025, 0.200, 0.100, 0.050, 0.200] # from ...losses import CircleLoss # loss = CircleLoss(m=0.25, # gamma=128) # loss = CircleLoss(scale=32, margin=0.25)(logits.view( # -1, self.num_labels), labels.view(-1)) # outputs = (loss, ) + outputs return (loss, ) + outputs else: # (loss), logits, (hidden_states), (attentions) return (torch.tensor(0.0).cuda(), ) + outputs
def prep_for_training(num_training_steps):
    if args.model == "language_only":
        model = AlbertForSequenceClassification.from_pretrained(
            "albert-base-v2", num_labels=1
        )
    elif args.model == "acoustic_only":
        model = Transformer(ACOUSTIC_DIM, num_layers=args.n_layers,
                            nhead=args.n_heads, dim_feedforward=args.fc_dim)
    elif args.model == "visual_only":
        model = Transformer(VISUAL_DIM, num_layers=args.n_layers,
                            nhead=args.n_heads, dim_feedforward=args.fc_dim)
    elif args.model == "hcf_only":
        model = Transformer(HCF_DIM, num_layers=args.n_layers,
                            nhead=args.n_heads, dim_feedforward=args.fc_dim)
    elif args.model == "HKT":
        # The HKT model has 4 unimodal encoders. The language encoder is a pretrained
        # ALBERT model, while the other encoders are trained from scratch on low-level
        # features. We found that most of the gradient often flows to the ALBERT encoder
        # only, since it already has rich contextual representations, so early in training
        # the other encoders (trained from low-level features) are effectively ignored.
        # Initializing the acoustic, visual, and HCF encoders of HKT from the best unimodal
        # models we already trained for the ablation study makes the model converge much
        # faster; otherwise it takes a very long time to converge.
        if args.dataset == "humor":
            visual_model = Transformer(VISUAL_DIM, num_layers=7, nhead=3,
                                       dim_feedforward=128)
            visual_model.load_state_dict(
                torch.load("./model_weights/init/humor/humorVisualTransformer.pt"))
            acoustic_model = Transformer(ACOUSTIC_DIM, num_layers=8, nhead=3,
                                         dim_feedforward=256)
            acoustic_model.load_state_dict(
                torch.load("./model_weights/init/humor/humorAcousticTransformer.pt"))
            hcf_model = Transformer(HCF_DIM, num_layers=3, nhead=2,
                                    dim_feedforward=128)
            hcf_model.load_state_dict(
                torch.load("./model_weights/init/humor/humorHCFTransformer.pt"))
        elif args.dataset == "sarcasm":
            visual_model = Transformer(VISUAL_DIM, num_layers=8, nhead=4,
                                       dim_feedforward=1024)
            visual_model.load_state_dict(
                torch.load("./model_weights/init/sarcasm/sarcasmVisualTransformer.pt"))
            acoustic_model = Transformer(ACOUSTIC_DIM, num_layers=1, nhead=3,
                                         dim_feedforward=512)
            acoustic_model.load_state_dict(
                torch.load("./model_weights/init/sarcasm/sarcasmAcousticTransformer.pt"))
            hcf_model = Transformer(HCF_DIM, num_layers=8, nhead=4,
                                    dim_feedforward=128)
            hcf_model.load_state_dict(
                torch.load("./model_weights/init/sarcasm/sarcasmHCFTransformer.pt"))

        text_model = AlbertModel.from_pretrained('albert-base-v2')
        model = HKT(text_model, visual_model, acoustic_model, hcf_model, args)
    else:
        raise ValueError("Requested model is not available")

    model.to(DEVICE)

    loss_fct = BCEWithLogitsLoss()

    # Prepare the optimizers; different learning rates are used for the different components.
    if args.model == "HKT":
        acoustic_params, visual_params, hcf_params, other_params = model.get_params()

        optimizer_o, scheduler_o = get_optimizer_scheduler(
            other_params, num_training_steps, learning_rate=args.learning_rate)
        optimizer_h, scheduler_h = get_optimizer_scheduler(
            hcf_params, num_training_steps, learning_rate=args.learning_rate_h)
        optimizer_v, scheduler_v = get_optimizer_scheduler(
            visual_params, num_training_steps, learning_rate=args.learning_rate_v)
        optimizer_a, scheduler_a = get_optimizer_scheduler(
            acoustic_params, num_training_steps, learning_rate=args.learning_rate_a)

        optimizers = [optimizer_o, optimizer_h, optimizer_v, optimizer_a]
        schedulers = [scheduler_o, scheduler_h, scheduler_v, scheduler_a]
    else:
        params = list(model.named_parameters())
        optimizer_l, scheduler_l = get_optimizer_scheduler(
            params, num_training_steps, learning_rate=args.learning_rate)
        optimizers = [optimizer_l]
        schedulers = [scheduler_l]

    return model, optimizers, schedulers, loss_fct
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained model downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { # "cola": ColaProcessor, # "mnli": MnliProcessor, # "mnli-mm": MnliMismatchedProcessor, # "mrpc": MrpcProcessor, # "sst-2": Sst2Processor, # "sts-b": StsbProcessor, # "qqp": QqpProcessor, # "qnli": QnliProcessor, # "rte": RteProcessor, # "wnli": WnliProcessor, "self": SelfProcessor, } output_modes = { # "cola": "classification", # "mnli": "classification", # "mrpc": "classification", # "sst-2": "classification", # "sts-b": "regression", # "qqp": "classification", # "qnli": "classification", # "rte": "classification", # "wnli": "classification", "self": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForMultiLabelSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) # 修改分类器类名 if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) # 修改 elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": # loss_fct = CrossEntropyLoss() # 注释 loss_fct = BCEWithLogitsLoss() # 修改 loss = loss_fct(logits.view(-1), label_ids.view(-1)) # 修改 elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultiLabelSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) # 修改分类器类名 tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultiLabelSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) # 修改分类器类名 model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) # 修改 elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": # loss_fct = CrossEntropyLoss() # 注释 loss_fct = BCEWithLogitsLoss() # 添加 tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) # 修改 elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = 
loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": # preds = np.argmax(preds, axis=1) # 注释 pass # 添加 elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
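# NOTE (editor's sketch): compute_metrics() is imported from elsewhere and is not shown in
# this file. Because the classification branch above keeps the raw logits (the argmax is
# commented out) and uses BCEWithLogitsLoss with float labels, the evaluation is presumably
# multi-label. The helper below is an illustrative sketch of such a metric: the 0.5 sigmoid
# threshold, the micro/macro F1 choice, and the name `multilabel_metrics_sketch` are
# assumptions, not the project's actual compute_metrics implementation.
import numpy as np
from sklearn.metrics import f1_score


def multilabel_metrics_sketch(logits, label_ids, threshold=0.5):
    # logits:    (num_examples, num_labels) raw model outputs
    # label_ids: (num_examples, num_labels) binary ground-truth matrix
    probs = 1.0 / (1.0 + np.exp(-logits))      # sigmoid over each label independently
    preds = (probs >= threshold).astype(int)   # per-label decision at the threshold
    return {
        "micro_f1": f1_score(label_ids, preds, average="micro", zero_division=0),
        "macro_f1": f1_score(label_ids, preds, average="macro", zero_division=0),
    }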