def get_param_weight_decay_dict(param_group_name_list):
    return [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in param_group_name_list)
            ],
            'weight_decay': args.weight_decay
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in param_group_name_list)
            ],
            'weight_decay': 0.0
        },
    ]
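# A minimal usage sketch (assumption: `model` and `args.weight_decay` are in scope as above).
# The two grouped dicts are passed straight to an optimizer so that bias and LayerNorm
# parameters are excluded from weight decay:
import torch

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer = torch.optim.AdamW(get_param_weight_decay_dict(no_decay), lr=3e-5)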
def save_layer_fig(model, exam_id, org_input, org_target, prediction,
                   slice_level_performance, result_dir):
    result_exam_dir = os.path.join(result_dir, exam_id)
    if not os.path.exists(result_exam_dir):
        os.makedirs(result_exam_dir)
    for name, param in model.named_parameters():
        print(name, '\t\t', param.shape)
def model_save(fn, all_model=1, model_para=0):
    if all_model:
        with open(fn, 'wb') as f:
            torch.save([model, optimizer], f)
    if model_para:
        para_dic = {}
        for name, para in model.named_parameters():
            para_dic[name] = para.clone().cpu().data.numpy()
        with open(fn, 'wb') as f:
            pickle.dump(para_dic, f)
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    criterion = nn.CrossEntropyLoss()
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        logits, hidden = model(data, hidden)
        loss_values = model.calculate_loss_values(logits, targets)
        loss_values_data = tuple(map(lambda x: x.data[0], loss_values))
        cross_entropy_val, center_loss_val = loss_values_data
        cross_entropy_loss = loss_values[0]
        center_loss = loss_values[1]
        if center_loss_factor > 0:
            train_loss = cross_entropy_loss + center_loss_factor * center_loss
        else:
            train_loss = cross_entropy_loss
        train_loss.backward()

        train_loss_val = train_loss.data[0]
        perplexity_val = math.exp(cross_entropy_val)
        train_metrics['train_loss'].append(train_loss_val)
        train_metrics['center_loss'].append(center_loss_val)
        train_metrics['cross_entropy'].append(cross_entropy_val)
        train_metrics['perplexity'].append(perplexity_val)

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for name, p in model.named_parameters():
            if p.requires_grad:
                p.data.add_(-lr, p.grad.data)

        total_loss += cross_entropy_val

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'ce loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            print('| train loss: {:.3f} | center loss: {:.3f} | cross entropy: {:.3f} | '
                  'perplexity: {:.3f}'.format(train_loss_val, center_loss_val,
                                              cross_entropy_val, perplexity_val))
            total_loss = 0
            start_time = time.time()
def print_model_param_info(model):
    print('[Model parameters to train]')
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data)
    # for param in model.parameters():
    #     if param.requires_grad:
    #         print(param.name, param.size())
    #         print(param.data)
    return
def trainable_params(model, feature_extract):
    """
    Prints and returns all the trainable parameters in model.

    :param model: the model instance
    :param feature_extract: if True, only params with *requires_grad* will be returned.
    :return: a list containing the model's trainable params.
    """
    params_to_update = model.parameters()
    print("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name, param in model.named_parameters():
            if param.requires_grad:
                print("\t", name)
    return params_to_update
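# A minimal usage sketch for trainable_params (assumption: a torchvision ResNet-18 stands in
# as an example feature-extraction backbone; only the freshly created head stays trainable):
import torch
import torchvision

backbone = torchvision.models.resnet18()
for p in backbone.parameters():
    p.requires_grad = False                                   # freeze the backbone
backbone.fc = torch.nn.Linear(backbone.fc.in_features, 2)    # new head, trainable by default
optimizer = torch.optim.SGD(trainable_params(backbone, feature_extract=True),
                            lr=0.001, momentum=0.9)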
def finetune(cur_iter, cur_learning_rate):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    print("cur learning rate = ", cur_learning_rate)
    print("iteration", cur_iter)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # print(data.size())
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset
        # (previous batches).
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # Discriminative learning rates: each parameter takes its own step size,
        # i.e. p.data -= get_discriminative_lr(name, cur_learning_rate) * p.grad.data
        for name, p in model.named_parameters():
            if p.requires_grad:
                p.data.add_(-get_discriminative_lr(name, cur_learning_rate), p.grad.data)

        total_loss += loss.item()
        cur_learning_rate = scheduled_slanted_lr(cur_iter, T, args.cut_frac, args.ratio, args.lr)
        cur_iter += 1
        # print(batch)

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:04.4f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, cur_learning_rate,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    return cur_iter, cur_learning_rate
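# The helpers scheduled_slanted_lr and get_discriminative_lr are referenced above but not
# defined in this snippet. A plausible sketch, assuming they follow the slanted triangular
# schedule and per-layer learning-rate discounting from the ULMFiT paper; the exact
# signatures and the way the layer index is read from the parameter name are guesses.
import math

def scheduled_slanted_lr(t, T, cut_frac, ratio, lr_max):
    # Slanted triangular learning rate: short linear warm-up, then long linear decay.
    cut = max(1, math.floor(T * cut_frac))
    if t < cut:
        p = t / cut
    else:
        p = 1 - (t - cut) / (cut * (1 / cut_frac - 1))
    return lr_max * (1 + p * (ratio - 1)) / ratio

def get_discriminative_lr(param_name, base_lr, decay=2.6):
    # Lower layers get smaller learning rates; the layer depth is read from the first
    # numeric token in the parameter name (e.g. "rnns.0.weight_hh_l0").
    layer = 0
    for token in param_name.split('.'):
        if token.isdigit():
            layer = int(token)
            break
    return base_lr / (decay ** layer)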
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)

        orth_reg = 0.0
        for name, param in model.named_parameters():
            if param.requires_grad and 'U' in name:
                d = min(list(param.shape))
                diff = torch.matmul(param.t(), param) - torch.eye(d).cuda()
                orth_reg += torch.sum(diff**2) / (d * d)
            elif param.requires_grad and 'V' in name:
                d = min(list(param.shape))
                diff = torch.matmul(param, param.t()) - torch.eye(d).cuda()
                orth_reg += torch.sum(diff**2) / (d * d)

        total = loss + args.od * orth_reg
        total.backward()
        model.rnn.svd_grad()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
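# The orthogonality penalty above is computed inline over parameters whose names contain
# 'U' or 'V'. A small standalone sketch of the same idea (not part of the original code),
# assuming W is a 2-D weight; the Gram matrix of its thinner dimension is pushed toward
# the identity:
import torch

def orthogonal_penalty(W):
    d = min(W.shape)
    gram = W.t().matmul(W) if W.shape[0] >= W.shape[1] else W.matmul(W.t())
    diff = gram - torch.eye(d, device=W.device)
    return (diff ** 2).sum() / (d * d)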
def unfreeze_rln(self):
    """ Method to unfreeze RLN parameters """
    if isinstance(self.net, nn.DataParallel):
        model = self.net.module.model
    else:
        model = self.net.model
    for name, param in model.named_parameters():
        param.learn = True
    for name, param in model.bert.named_parameters():
        param.learn = True
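# A matching freeze_rln is not shown above; a minimal sketch under the same assumptions
# (the custom `learn` attribute is whatever flag the surrounding training code checks):
def freeze_rln(self):
    """ Method to freeze RLN parameters """
    model = self.net.module.model if isinstance(self.net, nn.DataParallel) else self.net.model
    for name, param in model.named_parameters():
        param.learn = False
    for name, param in model.bert.named_parameters():
        param.learn = False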
def test(epoch):
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_index, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            inputs = Variable(inputs)
            targets = Variable(targets)

            # Forward
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Results
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            print('Loss: %.3f | Accuracy: %.3f' % (test_loss / 100, 100. * correct / total))

    # Save the model
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/Baseline.ckpt')
        best_acc = acc

    # Plot the model
    info = {'loss': test_loss, 'accuracy': acc}
    for tag, value in info.items():
        logger.scalar_summary(tag, value, epoch + 1)
    for tag, value in model.named_parameters():
        tag = tag.replace('.', '/')
        logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
        logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)
def train(lr, epoch=0):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)

        # if args.admm:
        if stage == 'admm':
            ce_loss = loss
            admm.admm_update(args, ADMM, model, None, None, None, epoch, None, batch)  # update Z and U
            ce_loss, admm_loss, mixed_loss = admm.append_admm_loss(args, ADMM, model, ce_loss)  # append admm loss
            loss = mixed_loss

        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        if stage == 'masked_retrain':
            for name, W in model.named_parameters():
                if name in config.masks:
                    W.grad.data *= config.masks[name]

        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def eval_smooth(prev_model, model, num_pts=1):
    alphas = np.arange(1, num_pts + 1) / (num_pts + 1)
    gnorm = eval_grad(prev_model)
    update_size = utils.norm_diff(utils.get_model_params(model),
                                  utils.get_model_params(prev_model))
    max_smooth = -1
    for alpha in alphas:
        new_model = copy.deepcopy(prev_model)
        for n, p in new_model.named_parameters():
            p.data = alpha * p.data + (1 - alpha) * {n: p for n, p in model.named_parameters()}[n].data
        eval_grad(new_model)
        smooth = utils.norm_diff(utils.get_model_grads(new_model),
                                 utils.get_model_grads(prev_model)) / (update_size * (1 - alpha))
        max_smooth = max(smooth, max_smooth)
    return max_smooth, gnorm
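# eval_smooth relies on small helpers from `utils` that are not shown here. A plausible
# sketch of what they might look like (names and behaviour are assumptions, not the
# original implementations):
import torch

def get_model_params(model):
    # Detached copies of all parameter tensors.
    return [p.data.clone() for p in model.parameters()]

def get_model_grads(model):
    # Detached copies of all gradient tensors that exist.
    return [p.grad.data.clone() for p in model.parameters() if p.grad is not None]

def norm_diff(list_a, list_b):
    # L2 norm of the concatenated element-wise difference between two tensor lists.
    total = sum(((a - b) ** 2).sum() for a, b in zip(list_a, list_b))
    return float(total) ** 0.5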
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        # print(output.size())
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        # print(model.state_dict())
        # print("Done***")
        for n, p in model.named_parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | perplexity {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train(epoch, optimizer, compression_scheduler=None):
    # Turn on training mode which enables dropout.
    model.train()
    total_samples = train_data.size(0)
    steps_per_epoch = math.ceil(total_samples / args.bptt)
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    # The line below was fixed as per: https://github.com/pytorch/examples/issues/214
    for batch, i in enumerate(range(0, train_data.size(0), args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)

        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(
                epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)

        if compression_scheduler:
            # Before running the backward phase, we add any regularization loss computed by the scheduler
            regularizer_loss = compression_scheduler.before_backward_pass(
                epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch, loss=loss)
            loss += regularizer_loss

        optimizer.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()

        if compression_scheduler:
            compression_scheduler.on_minibatch_end(
                epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            lr = optimizer.param_groups[0]['lr']
            msglogger.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} '
                '| loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
            stats = ('Performance/Training/',
                     OrderedDict([('Loss', cur_loss),
                                  ('Perplexity', math.exp(cur_loss)),
                                  ('LR', lr),
                                  ('Batch Time', elapsed * 1000)]))
            steps_completed = batch + 1
            distiller.log_training_progress(stats, model.named_parameters(), epoch,
                                            steps_completed, steps_per_epoch,
                                            args.log_interval, [tflogger])
}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# GPU check and setting the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(torch.cuda.get_device_name(0))

# Object of RobertaBaseClass and setting to device
model = model.RobertaBaseClass()
model.to(device)

# Model parameters
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
def main():
    dataframe = pd.read_csv('../input/imdb.csv')  # load dataframe
    # sentiment is a categorical target variable, so we label-encode it; we can do it
    # by hand like this, or with sklearn.preprocessing.LabelEncoder
    dataframe.sentiment = dataframe.sentiment.apply(lambda x: 1 if x == 'positive' else 0)

    # now split data into validation and training
    df_train, df_valid = model_selection.train_test_split(
        dataframe,
        test_size=0.1,  # 10 percent of the dataframe is held out for validation
        random_state=42,  # a fixed random state gives the same split on every run
        shuffle=True,  # shuffle indices
        stratify=dataframe.sentiment.values  # same class distribution in train and valid
    )
    df_train = df_train.reset_index(drop=True)  # reset indices from 0 to len(df_train)
    df_valid = df_valid.reset_index(drop=True)  # reset indices from 0 to len(df_valid)

    # make datasets with our class in order to build data loaders
    training_dataset = dataset.BERTdataset(review=df_train.review.values,
                                           sentiment=df_train.sentiment.values)
    # from dataset to dataloader
    training_data_loader = torch.utils.data.DataLoader(
        dataset=training_dataset,
        batch_size=config.TRAINING_BATCH_SIZE,
        shuffle=True,
        num_workers=4)

    validation_dataset = dataset.BERTdataset(
        review=df_valid.review.values,
        sentiment=df_valid.sentiment.values,
    )
    # from dataset to dataloader
    validation_data_loader = torch.utils.data.DataLoader(
        dataset=validation_dataset,
        batch_size=config.VALIDATION_BATCH_SIZE,
        shuffle=False,
        num_workers=4)

    device = torch.device('cuda')
    model = model.BERTsentiment()
    model.to(device)  # move model to the CUDA device

    # params to optimize: no weight decay for bias and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    number_of_training_steps = int(
        len(df_train) / config.TRAINING_BATCH_SIZE * config.EPOCHS)

    # AdamW decouples weight decay from the gradient update, which tends to generalize better
    optimizer = AdamW(params=optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=number_of_training_steps,
    )

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        print('EPOCH:', epoch + 1)
        engine.training_loop(training_data_loader, model, optimizer, scheduler, device)
        outputs, sentiments = engine.validation_loop(validation_data_loader, model, device)
        # class distribution is 50/50, so plain accuracy is a reasonable metric
        outputs = np.array(outputs) >= 0.5  # positive class
        accuracy = metrics.accuracy_score(sentiments, outputs)
        print('ACCURACY SCORE:', accuracy)
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)  # save model in working dir
            best_accuracy = accuracy
def main():
    """
    Training and validation.
    """
    global epochs_since_improvement, start_epoch, label_map, best_loss, epoch, checkpoint

    # Initialize model or load checkpoint
    if checkpoint is None:
        model = SSD300(n_classes=n_classes)
        # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo
        biases = list()
        not_biases = list()
        for param_name, param in model.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    biases.append(param)
                else:
                    not_biases.append(param)
        optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr},
                                            {'params': not_biases}],
                                    lr=lr, momentum=momentum, weight_decay=weight_decay)
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_loss = checkpoint['best_loss']
        print('\nLoaded checkpoint from epoch %d. Best loss so far is %.3f.\n' % (start_epoch, best_loss))
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to default device
    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

    # Custom dataloaders
    train_dataset = PascalVOCDataset(data_folder, split='train')
    val_dataset = PascalVOCDataset(data_folder, split='test')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                               collate_fn=train_dataset.collate_fn,
                                               num_workers=workers,
                                               pin_memory=True)  # note that we're passing the collate function here
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                                             collate_fn=val_dataset.collate_fn,
                                             num_workers=workers,
                                             pin_memory=True)

    # Epochs
    for epoch in range(start_epoch, epochs):
        # The paper describes decaying the learning rate at the 80000th, 100000th, and 120000th 'iteration',
        # i.e. model update or batch. The paper uses a batch size of 32, which means there were about
        # 517 iterations in an epoch. Therefore, to find the epochs to decay at, you could do:
        #   if epoch in {80000 // 517, 100000 // 517, 120000 // 517}:
        #       adjust_learning_rate(optimizer, 0.1)
        # In practice, I just decayed the learning rate when the loss stopped improving for long periods,
        # and I would resume from the last best checkpoint with the new learning rate,
        # since there's no point in resuming at the most recent and significantly worse checkpoint.
        # So, when you're ready to decay the learning rate, just set checkpoint = 'BEST_checkpoint_ssd300.pth.tar'
        # above and call adjust_learning_rate(optimizer, 0.1) BEFORE this 'for' loop.

        # One epoch's training
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch)

        # One epoch's validation
        val_loss = validate(val_loader=val_loader, model=model, criterion=criterion)

        # Did validation loss improve?
        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)

        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer, val_loss, best_loss, is_best)
def run():
    df_train1 = pd.read_csv(config.TRAIN_PATH_1, usecols=['comment_text', 'toxic']).fillna('none')
    df_train2 = pd.read_csv(config.TRAIN_PATH_2, usecols=['comment_text', 'toxic']).fillna('none')

    df_train = pd.concat([df_train1, df_train2], axis=0).reset_index(drop=True)
    df_train = df_train.sample(frac=1).reset_index(drop=True).head(400000)

    df_valid = pd.read_csv(config.VALID_PATH)

    tokenizer = config.tokenizer

    train_targets = df_train.toxic.values
    valid_targets = df_valid.toxic.values

    train_dataset = dataset.BERTDatasetTraining(
        comment_text=df_train.comment_text.values,
        targets=train_targets,
        tokenizer=tokenizer,
        max_length=config.MAX_LEN
    )
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        sampler=train_sampler,
        drop_last=True,
        num_workers=4
    )

    valid_dataset = dataset.BERTDatasetTraining(
        comment_text=df_valid.comment_text.values,
        targets=valid_targets,
        tokenizer=config.tokenizer,
        max_length=config.MAX_LEN
    )
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        sampler=valid_sampler,
        drop_last=True,
        num_workers=4
    )

    device = xm.xla_device()
    model = BERTBaseUncased(bert_path=config.MODEL_PATH).to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    lr = 3e-5 * xm.xrt_world_size()
    num_train_steps = int(len(train_dataset) / config.TRAIN_BATCH_SIZE / xm.xrt_world_size() * config.EPOCHS)
    xm.master_print(f'num_train_steps={num_train_steps}, world_size={xm.xrt_world_size()}')

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    # get_constant_schedule_with_warmup only takes the warm-up length;
    # the total number of training steps is not part of its signature.
    scheduler = get_constant_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0
    )

    for epoch in range(config.EPOCHS):
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        engine.train_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=scheduler)

        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        o, t = engine.eval_fn(para_loader.per_device_loader(device), model, device)

        xm.save(model.state_dict(), 'model.bin')
        auc = metrics.roc_auc_score(np.array(t) >= 0.5, o)
        xm.master_print(f'AUC = {auc}')
            elapsed = time.time() - start_time
            logging.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len

for k, v in model.named_parameters():
    print(k)
    print(v)

# Loop over epochs.
lr = args.lr
best_val_loss = []
stored_loss = 100000000

# At any point you can hit Ctrl + C to break out of training early.
try:
    if args.continue_train:
        optimizer_state = torch.load(os.path.join(args.save, 'optimizer.pt'))
        if 't0' in optimizer_state['param_groups'][0]:
            optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0,
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        # Perturb the weights with noise before the forward pass (sharpness smoothing).
        noises = {}
        if args.sharpness_smoothing:
            for key, p in model.named_parameters():
                if hasattr(model, 'quiet_parameters') and (key in model.quiet_parameters):
                    continue
                if args.adapt_type == 'weight':
                    noise = (torch.cuda.FloatTensor(p.size()).uniform_() * 2. - 1.) * \
                        args.sharpness_smoothing * torch.abs(p.data)
                elif args.adapt_type == 'none':
                    noise = (torch.cuda.FloatTensor(p.size()).uniform_() * 2. - 1.) * args.sharpness_smoothing
                else:
                    raise ValueError('Unknown --adapt-type')
                noises[key] = noise
                p.data.add_(noise)

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # denoise @ each mini-mini-batch.
        if args.sharpness_smoothing:
            for key, p in model.named_parameters():
                if key in noises:
                    p.data.sub_(noises[key])

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    retraction=opt.retraction,
    lr=opt.lr * hvd.size(),
)
print(f'Size of hvd process : {hvd.size()}')
# optimizer = Adam(
#     model.parameters(),
#     lr=opt.lr * hvd.size(),
# )
lr = opt.lr

# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters())

_lr_multiplier = 0.1
NoDisplayObj = 66  # Number of entities to display on the graph
# scheduler = MultiStepLR(optimizer,
#                         milestones=[opt.burnin] + list(range(int(opt.epochs / 10), opt.epochs, int(opt.epochs / 10))),
#                         gamma=_lr_multiplier)
# scheduler = StepLR(optimizer, step_size=10, gamma=0.9)

# Broadcast parameters from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)

loader = th.utils.data.DataLoader(
    data,
    batch_size=opt.batchsize,
    # shuffle=True,
    num_workers=opt.ndproc,
        val_loss2 = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                  epoch, (time.time() - epoch_start_time), val_loss2,
                  math.exp(val_loss2), val_loss2 / math.log(2)))
        print('-' * 89)

print("MAX EPOCH = ", args.epochs + 1)
for epoch in range(args.start, args.epochs + 1):
    epoch_start_time = time.time()
    train(epoch)
    print("TRAIN FINISHED")
    if 't0' in optimizer.param_groups[0]:
        tmp = {}
        for param_name, prm in model.named_parameters():
            tmp[param_name] = prm.data.clone()
            try:
                prm.data = optimizer.state[prm]['ax'].clone()
            except:
                pass

        val_loss2 = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                  epoch, (time.time() - epoch_start_time), val_loss2,
                  math.exp(val_loss2), val_loss2 / math.log(2)))
        print('-' * 89)

        if epoch % 30 == 0:
# Make results dir
out_dir = os.path.join(args.result_dir, args.name)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.class_type, args.r, args.model, ntokens, args.emsize,
                       args.nhid, args.nlayers, args.dropout, args.tied).to(device)

# Print params
for name, param in model.named_parameters():
    if param.requires_grad:
        print(('Parameter name, shape: ', name, param.data.shape))

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
def train_model(model, trainds, testds, config, device, writer=None):
    batch_size = config['data']['batch_size']
    status = config['training']['status']
    epochs = config['training']['epochs']
    balanced_loss = config['loss']['balanced']
    # nval = config['nval']
    nval_tests = config['nval_tests']
    nsave = config['nsave']
    model_save = config['model_save']
    rank = config['rank']
    nranks = config['nranks']
    hvd = config['hvd']
    num_classes = config['data']['num_classes']

    ## create samplers for these datasets
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainds, nranks, rank, shuffle=True, drop_last=True)
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        testds, nranks, rank, shuffle=True, drop_last=True)

    ## create data loaders
    train_loader = torch.utils.data.DataLoader(
        trainds,
        shuffle=False,
        sampler=train_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)
    test_loader = torch.utils.data.DataLoader(
        testds,
        shuffle=False,
        sampler=test_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)

    loss_func = loss.get_loss(config)
    ave_loss = CalcMean.CalcMean()
    acc_func = accuracy.get_accuracy(config)
    ave_acc = CalcMean.CalcMean()

    opt_func = optimizer.get_optimizer(config)
    opt = opt_func(model.parameters(), **config['optimizer']['args'])
    lrsched_func = optimizer.get_learning_rate_scheduler(config)
    lrsched = lrsched_func(opt, **config['lr_schedule']['args'])

    # Add Horovod Distributed Optimizer
    if hvd:
        opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters())
        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    model.to(device)

    for epoch in range(epochs):
        logger.info(' epoch %s of %s', epoch, epochs)

        train_sampler.set_epoch(epoch)
        test_sampler.set_epoch(epoch)

        model.to(device)
        for batch_counter, (inputs, targets, class_weights, nonzero_mask) in enumerate(train_loader):

            # move data to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            class_weights = class_weights.to(device)
            nonzero_mask = nonzero_mask.to(device)

            # zero grads
            opt.zero_grad()
            outputs, endpoints = model(inputs)

            # set the weights
            if balanced_loss:
                weights = class_weights
                nonzero_to_class_scaler = torch.sum(nonzero_mask.type(torch.float32)) / torch.sum(class_weights.type(torch.float32))
            else:
                weights = nonzero_mask
                nonzero_to_class_scaler = torch.ones(1, device=device)

            loss_value = loss_func(outputs, targets.long())
            loss_value = torch.mean(loss_value * weights) * nonzero_to_class_scaler

            # backward calc grads
            loss_value.backward()

            # apply grads
            opt.step()

            ave_loss.add_value(float(loss_value.to('cpu')))

            # calc acc
            ave_acc.add_value(float(acc_func(outputs, targets, weights).to('cpu')))

            # print statistics
            if batch_counter % status == 0:
                logger.info('<[%3d of %3d, %5d of %5d]> train loss: %6.4f acc: %6.4f',
                            epoch + 1, epochs, batch_counter,
                            len(trainds) / nranks / batch_size,
                            ave_loss.mean(), ave_acc.mean())

                if writer and rank == 0:
                    global_batch = epoch * len(trainds) / nranks / batch_size + batch_counter
                    writer.add_scalars('loss', {'train': ave_loss.mean()}, global_batch)
                    writer.add_scalars('accuracy', {'train': ave_acc.mean()}, global_batch)
                    # writer.add_histogram('input_trans', endpoints['input_trans'].view(-1), global_batch)

                ave_loss = CalcMean.CalcMean()
                ave_acc = CalcMean.CalcMean()

            # release tensors for memory
            del inputs, targets, weights, endpoints, loss_value

            if config['batch_limiter'] and batch_counter > config['batch_limiter']:
                logger.info('batch limiter enabled, stop training early')
                break

        # save at end of epoch
        torch.save(model.state_dict(), model_save + '_%05d.torch_model_state_dict' % epoch)

        if nval_tests == -1:
            nval_tests = len(testds) / nranks / batch_size
        logger.info('epoch %s complete, running validation on %s batches', epoch, nval_tests)

        model.to(device)
        # every epoch, evaluate validation data set
        with torch.no_grad():

            vloss = CalcMean.CalcMean()
            vacc = CalcMean.CalcMean()
            vious = [CalcMean.CalcMean() for i in range(num_classes)]

            for valid_batch_counter, (inputs, targets, class_weights, nonzero_mask) in enumerate(test_loader):

                inputs = inputs.to(device)
                targets = targets.to(device)
                class_weights = class_weights.to(device)
                nonzero_mask = nonzero_mask.to(device)

                # set the weights
                if balanced_loss:
                    weights = class_weights
                    nonzero_to_class_scaler = torch.sum(nonzero_mask.type(torch.float32)) / torch.sum(class_weights.type(torch.float32))
                else:
                    weights = nonzero_mask
                    nonzero_to_class_scaler = torch.ones(1, device=device)

                outputs, endpoints = model(inputs)

                loss_value = loss_func(outputs, targets.long())
                loss_value = torch.mean(loss_value * weights) * nonzero_to_class_scaler
                vloss.add_value(float(loss_value.to('cpu')))

                # calc acc
                vacc.add_value(float(acc_func(outputs, targets, weights).to('cpu')))

                # calc ious
                ious = get_ious(outputs, targets, weights, num_classes)
                for i in range(num_classes):
                    vious[i].add_value(float(ious[i]))

                if valid_batch_counter > nval_tests:
                    break

            mean_acc = vacc.mean()
            mean_loss = vloss.mean()
            # if config['hvd'] is not None:
            #     mean_acc = config['hvd'].allreduce(torch.tensor([mean_acc]))
            #     mean_loss = config['hvd'].allreduce(torch.tensor([mean_loss]))

            mious = float(torch.sum(torch.FloatTensor([x.mean() for x in vious]))) / num_classes
            ious_out = {
                'jet': vious[0].mean(),
                'electron': vious[1].mean(),
                'bkgd': vious[2].mean(),
                'all': mious
            }

            # add validation to tensorboard
            if writer and rank == 0:
                global_batch = epoch * len(trainds) / nranks / batch_size + batch_counter
                writer.add_scalars('loss', {'valid': mean_loss}, global_batch)
                writer.add_scalars('accuracy', {'valid': mean_acc}, global_batch)
                writer.add_scalars('IoU', ious_out, global_batch)

            logger.warning('>[%3d of %3d, %5d of %5d]<<< ave valid loss: %6.4f ave valid acc: %6.4f on %s batches >>>',
                           epoch + 1, epochs, batch_counter,
                           len(trainds) / nranks / batch_size,
                           mean_loss, mean_acc, valid_batch_counter + 1)
            logger.warning(' >> ious: %s', ious_out)

        # update learning rate
        lrsched.step()
        T.RandomSizedCrop(base_model.input_side),
        T.RandomHorizontalFlip(),
        normalize
    ]), download=True)
dataset_eval = opts.dataset(opts.data, train=False, transform=transforms.Compose([
    T.Scale(256),
    T.CenterCrop(base_model.input_side),
    normalize
]), download=True)

adapt_sampler = lambda batch, dataset, sampler, **kwargs: type('', (torch.utils.data.Sampler,), dict(
    __len__=dataset.__len__,
    __iter__=lambda _: itertools.chain.from_iterable(sampler(batch, dataset, **kwargs))))()

loader_train = torch.utils.data.DataLoader(dataset_train,
                                           sampler=adapt_sampler(opts.batch, dataset_train, opts.sampler),
                                           num_workers=opts.threads, batch_size=opts.batch,
                                           drop_last=True, pin_memory=True)
loader_eval = torch.utils.data.DataLoader(dataset_eval, shuffle=False, num_workers=opts.threads,
                                          batch_size=opts.batch, pin_memory=True)

model = opts.model(base_model, dataset_train.num_training_classes).cuda()
model_weights, model_biases, base_model_weights, base_model_biases = [
    [p for k, p in model.named_parameters()
     if p.requires_grad and ('bias' in k) == is_bias and ('base' in k) == is_base]
    for is_base in [False, True] for is_bias in [False, True]
]

base_model_lr_mult = model.optimizer_params.pop('base_model_lr_mult', 1.0)
optimizer = model.optimizer([
    dict(params=base_model_weights, lr=base_model_lr_mult * model.optimizer_params['lr']),
    dict(params=base_model_biases, lr=base_model_lr_mult * model.optimizer_params['lr'], weight_decay=0.0),
    dict(params=model_biases, weight_decay=0.0)
], **model.optimizer_params)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, **model.lr_scheduler_params)

log = open(opts.log, 'w', 0)
for epoch in range(opts.epochs):
    scheduler.step()
    model.train()
    loss_all, norm_all = [], []
    for batch_idx, batch in enumerate(loader_train if model.criterion is not None else []):
        tic = time.time()
        images, labels = [tensor.cuda() for tensor in batch]
        loss = model.criterion(model(images), labels)
        loss_all.append(float(loss))
def run():
    df1 = pd.read_csv("../data/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../data/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"])
    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    df_valid = pd.read_csv("../data/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values, target=df_train.toxic.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values, target=df_valid.toxic.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"AUC Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
                       0., 0., 0., 0., 0., 0.,
                       args.n_experts, args.emblocks, args.emdensity,
                       sparse_mode=args.sparse_mode, sparse_fract=args.sparse_fract)

if args.cuda:
    if not args.multi_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model

logging('Args: {}'.format(args))

params_total, params_encoder, params_rnns = 0, 0, 0
for n, p in model.named_parameters():
    # print('param {}: {}'.format(n, p.nelement()))
    if 'encoder' in n:
        params_encoder += p.nelement()
    elif 'rnns' in n:
        params_rnns += p.nelement()
    params_total += p.nelement()
logging('params encoder: {}M'.format(params_encoder / 1.e6))
logging('params rnns: {}M'.format(params_rnns / 1.e6))
logging('params total: {}M'.format(params_total / 1.e6))
log_value('params rnn', params_rnns, 0)
log_value('params encoder', params_encoder, 0)
log_value('params total', params_total, 0)

# write out model
def train(model, trainloader):
    if opt.use_cuda:
        model = model.cuda()
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    lr = opt.lr
    # optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=opt.weight_decay)

    weight_p = []
    bias_p = []
    for name, para in model.named_parameters():
        if 'bias' in name:
            bias_p += [para]
        else:
            weight_p += [para]
    optimizer = torch.optim.Adam([{
        'params': weight_p,
        'weight_decay': opt.weight_decay
    }, {
        'params': bias_p,
        'weight_decay': 0
    }], lr=lr)

    previous_loss = 1e10
    pEpoch = []
    pLoss = []
    for epoch in range(opt.epoch):
        loss_all = 0
        total_accuracy = 0
        for i, (input, target) in enumerate(trainloader):
            if opt.use_cuda:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            pred = torch.max(score, 1)[1]
            accuracy = float((pred == target).sum())
            accuracy = accuracy * 100 / input.size(0)
            # print((pred == target).sum(dim=0, keepdim=False))
            total_accuracy += accuracy
            loss_all += float(loss)
            if i % opt.printinter == 0:
                print("Epoch: ", epoch, "| Iter:", i, "| Loss:", float(loss),
                      "| Accuracy:", accuracy, "%")

        avgloss = loss_all / len(trainloader)
        avgaccuracy = total_accuracy / len(trainloader)
        print("the end of Epoch: ", epoch, "| AVGLoss:", avgloss,
              "| Accuracy:", avgaccuracy, "%")
        save(model, epoch)

        # plot
        pEpoch.append(epoch)
        pLoss.append(avgloss)
        plotlc(pEpoch, pLoss)
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(1) - 1 - 1:
        # seq_len = args.bptt
        lr2 = optimizer.param_groups[0]['lr']
        # optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batchlz(train_data, train_label, i, args.batch_size)
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        # Training step
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, None, return_h=True)
        raw_loss = criterion(output, targets)
        wri.add_scalar('raw_loss', raw_loss, epoch * lzarg + batch)
        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
            wri.add_scalar('alpha_loss', loss - raw_loss, epoch * lzarg + batch)
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
            wri.add_scalar('alphabeta_loss', loss - raw_loss, epoch * lzarg + batch)
        loss.backward()
        # wri.add_scalar('loss', loss, epoch * 179 + batch)

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval  # average loss over the last log_interval batches
            wri.add_scalar('bat_curraw_loss', cur_loss,
                           epoch * (lzarg // args.log_interval) + batch / args.log_interval)
            elapsed = time.time() - start_time
            pred_y = torch.max(output, 1)[1].data
            accuracy = (pred_y == targets).float().sum() / len(targets)
            wri.add_scalar('train_accuracy', accuracy,
                           epoch * (lzarg // args.log_interval) + batch / args.log_interval)
            print('ACC={:3.2f} | epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'avg loss {:5.2f} | ppl {:5.2f} | bpc {:5.3f}'.format(
                      accuracy, epoch, batch, lzarg, optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / args.log_interval, cur_loss,
                      math.exp(cur_loss), cur_loss / math.log(2)), '\n')
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += args.batch_size

    # Save parameter histograms for visualization every 10 epochs.
    if epoch % 10 == 0 and epoch != 0:
        # targets_image = image[targets]
        # wri.add_embedding(rnn_hs[0][-1, :, :].clone().cpu().data.numpy(), metadata=targets,
        #                   label_img=targets_image.data, global_step=epoch / 10)
        for name, param in model.named_parameters():
            wri.add_histogram(name, param.clone().cpu().data.numpy(), epoch / 10)
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

criterion = None

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.chunk_size,
                       args.nlayers, args.dropout, args.dropouth, args.dropouti,
                       args.dropoute, args.wdrop, args.tied)
for name, weight in model.named_parameters():
    print(name, weight.numel())
###
if args.resume:
    print('Resuming model ...')
    model_load(args.resume)
    optimizer.param_groups[0]['lr'] = args.lr
    model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
    if args.wdrop:
        for rnn in model.rnn.cells:
            rnn.hh.dropout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion