def train():
    """Run one epoch of language-model training with a manual SGD update.

    Uses module-level globals: model, corpus, train_data, criterion, args,
    lr, epoch, and the helpers get_batch / repackage_hidden.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` (in-place, trailing underscore) helps prevent the
        # exploding gradient problem in RNNs / LSTMs; the underscore-less
        # `clip_grad_norm` is deprecated and removed in modern PyTorch.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # Manual SGD step; `alpha=` keyword replaces the deprecated
        # `add_(scalar, tensor)` overload.
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        # Accumulate a Python float: indexing a 0-dim tensor (`total_loss[0]`)
        # is invalid on modern PyTorch.
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0.
            start_time = time.time()
def train(self, epoch_idx, batch_size, max_norm):
    """Run a single training epoch over the training split.

    epoch_idx  -- epoch counter, used only for logging
    batch_size -- number of samples per batch
    max_norm   -- gradient-norm clipping threshold; <= 0 disables clipping
    """
    self.logger.info('At %d-th epoch with lr %f.',
                     epoch_idx, self.optimizer.param_groups[0]['lr'])
    self.model.train()
    # Total batch count is only needed so tqdm can render a progress bar.
    total_batches = ceil(self.data.nb_train / batch_size)
    progress = tqdm(self.data.train_batch_sample(batch_size), total=total_batches)
    for src, src_mask, trg, _ in progress:
        prediction = self.model(src, src_mask, trg)
        # Targets are shifted one step relative to the decoder input.
        loss = self.model.loss(prediction, trg[1:])
        self.optimizer.zero_grad()
        loss.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm)
        self.logger.debug('loss %f with total grad norm %f',
                          loss, util.grad_norm(self.model.parameters()))
        self.optimizer.step()
def valid(epoch, quesfeaShu, labelShu, lengthShu): losses = AverageMeter() top1 = AverageMeter() model.eval() start_time = time.time() for i in range(0, len(quesfeaShu) / args.batch_size): if i == len(quesfeaShu) / args.batch_size - 1: batchend = len(quesfeaShu) else: batchend = (i + 1) * (args.batch_size) # print batchend batchstart = i * (args.batch_size) batch_size = batchend - batchstart quesfeabatch = [] labelbatch = [] lengthbatch = [] quesfeaOri = quesfeaShu[batchstart:batchend] labelOri = labelShu[batchstart:batchend] lengthOri = lengthShu[batchstart:batchend] idxbatch = sorted(range(len(lengthOri)), key=lambda x: lengthOri[x], reverse=True) for j in range(len(idxbatch)): quesfeabatch.append(quesfeaOri[idxbatch[j]]) labelbatch.append(labelOri[idxbatch[j]]) lengthbatch.append(lengthOri[idxbatch[j]]) questrainarray = np.asarray(quesfeabatch) labeltrainarray = np.asarray(labelbatch) lengthtrainarray = np.asarray(lengthbatch) tmp = [questrainarray, labeltrainarray, lengthtrainarray] tmp = [Variable(torch.from_numpy(_), requires_grad=False) for _ in tmp] trques, trlabel, length = tmp if args.cuda: trlabel.cuda() output = model(trques, length) # print output loss = criterion(output, trlabel) / (batch_size) prec1, = accuracy(output.data, trlabel.data, topk=(1,), ori_label=labeltrainarray) # label 0 or 1 losses.update(loss.data[0], batch_size) top1.update(prec1[0], batch_size) # loss.backward() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) print str(top1.avg) + ' ' + str(loss.data[0]) + ' ' + 'batch_valid ' + str(i) # update better performance model global best_score if top1.avg > best_score: torch.save(model, args.save) print 'save model' best_score = top1.avg print str(top1.avg) + ' ' + str(loss.data[0]) + ' ' + 'epoch_valid ' + str(epoch)
def train(epoch, optimizer, quesfeaShu, labelShu, lengthShu): losses = AverageMeter() top1 = AverageMeter() model.train() for i in range(0, len(quesfeaShu) / args.batch_size): if i == len(quesfeaShu) / args.batch_size - 1: batchend = len(quesfeaShu) else: batchend = (i + 1) * (args.batch_size) batchstart = i * (args.batch_size) batch_size = batchend - batchstart quesfeabatch = [] labelbatch = [] lengthbatch = [] quesfeaOri = quesfeaShu[batchstart:batchend] labelOri = labelShu[batchstart:batchend] lengthOri = lengthShu[batchstart:batchend] idxbatch = sorted(range(len(lengthOri)), key=lambda x: lengthOri[x], reverse=True) for j in range(len(idxbatch)): quesfeabatch.append(quesfeaOri[idxbatch[j]]) labelbatch.append(labelOri[idxbatch[j]]) lengthbatch.append(lengthOri[idxbatch[j]]) questrainarray = np.asarray(quesfeabatch) labeltrainarray = np.asarray(labelbatch) lengthtrainarray = np.asarray(lengthbatch) tmp = [questrainarray, labeltrainarray, lengthtrainarray] tmp = [Variable(torch.from_numpy(_), requires_grad=False) for _ in tmp] trques, trlabel, length = tmp if args.cuda: trlabel.cuda() output = model(trques, length) loss = criterion(output, trlabel) / (batch_size) prec1, = accuracy(output.data, trlabel.data, topk=(1,)) losses.update(loss.data[0], batch_size) top1.update(prec1[0], batch_size) optimizer.zero_grad() loss.backward() optimizer.step() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) print str(top1.avg) + ' ' + str(top1.val) + ' ' + str(loss.data[0]) + ' ' + 'batch ' + str(i) print str(top1.avg) + ' ' + str(top1.val) + ' ' + str(loss.data[0]) + ' ' + 'epoch ' + str(epoch)
def train():
    """Run one AWD-LSTM-style training epoch with random BPTT lengths.

    Uses module-level globals: model, corpus, train_data, criterion,
    optimizer, args, epoch, get_batch, repackage_hidden.
    """
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)
        lr2 = optimizer.param_groups[0]['lr']
        # Scale the learning rate by the sampled sequence length so short and
        # long sequences contribute comparably.
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)
        loss = raw_loss
        # Activiation Regularization
        loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()
        # `clip_grad_norm_` (in-place, trailing underscore) replaces the
        # deprecated `clip_grad_norm`; it prevents exploding gradients in
        # RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            # `.item()` replaces the removed 0-dim tensor indexing `total_loss[0]`.
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
# python main.py --log_interval 200 --lr 0.1 --nhid 200 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 64 # | end of epoch 1 | time: 143.43s | valid loss 4.94 | valid perplexity 140.00 # python main.py --log_interval 200 --lr 0.1 --nhid 150 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 64 # | end of epoch 1 | time: 121.41s | valid loss 4.98 | valid perplexity 144.75 # python main.py --log_interval 200 --lr 0.1 --nhid 150 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 128 # | end of epoch 1 | time: 89.41s | valid loss 4.97 | valid perplexity 144.64 # python main.py --log_interval 200 --lr 0.1 --nhid 128 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 128 # | end of epoch 1 | time: 78.13s | valid loss 4.98 | valid perplexity 145.55 # python main.py --log_interval 200 --lr 0.1 --nhid 128 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 12 --batch_size 128 # | end of epoch 1 | time: 74.73s | valid loss 4.99 | valid perplexity 147.03 optimizer = optim.Adagrad(model.parameters(), lr=args.lr, lr_decay=1e-4, weight_decay=1e-5) ############################################################################### # Training code ############################################################################### def repackage_hidden(h): """Wraps hidden states in new Variables, to detach them from their history.""" if type(h) == Variable: return Variable(h.data) else: return tuple(repackage_hidden(v) for v in h)
def main(model, path):
    """Train an LSTM classifier on the data found at `path`.

    model -- the project's model *module* (it is immediately shadowed below by
             the instantiated network; NOTE(review): renaming the parameter
             would avoid the shadowing)
    path  -- dataset location handed to dataloader.train_val_loader

    Recreates the Model_Checkpoints folder from scratch, trains for a fixed
    10 epochs with Adam + BCEWithLogitsLoss, and saves a state_dict
    checkpoint after every epoch.
    """
    print(path)
    t1 = time.time()
    checkpoint_folder = "Model_Checkpoints"
    project_path = os.getcwd()
    save_path = os.path.join(project_path, checkpoint_folder)
    # Start with an empty checkpoint directory: create it, or wipe and
    # recreate it if it already exists.
    if not os.path.exists(checkpoint_folder):
        os.makedirs(checkpoint_folder)
    else:
        shutil.rmtree(save_path)
        os.makedirs(checkpoint_folder)
    # Network dimensions; 391 is the output size passed to Rnn_Lstm
    # (presumably the number of target labels -- confirm against the model).
    in_features = 300
    hidden_size = 256
    layer_num = 2
    print("\n")
    print(" Loading Data ... ")
    print("="*30)
    print("\n")
    train_dl, valid_dl, trn, vld = dataloader.train_val_loader(path)
    print(" Got train_dataloader and validation_dataloader ")
    print("="*30)
    print("\n")
    print(" Loading LSTM Model ...")
    print("="*30)
    print("\n")
    model = model.Rnn_Lstm(in_features, hidden_size, layer_num, 391)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-2)
    criterion = nn.BCEWithLogitsLoss()
    epochs = 10
    print(" Training started ... ")
    print("="*30)
    print("\n")
    for epoch in range(1, epochs + 1):
        checkpoint_name = "checkpoint_"+ str(epoch) +".pth"
        checkpoint_save_path = os.path.join(save_path, checkpoint_name)
        running_loss = 0.0
        model.train()  # turn on training mode
        for x, y in tqdm.tqdm(train_dl):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            preds = model(x)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            # weight batch losses by batch size for a proper epoch mean
            running_loss += loss.item() * x.size(0)
        epoch_loss = running_loss / len(trn)
        # calculate the validation loss for this epoch
        # NOTE(review): no torch.no_grad() here -- gradients are still
        # tracked during validation; wrapping this loop would save memory.
        val_loss = 0.0
        model.eval()  # turn on evaluation mode
        for x, y in valid_dl:
            x, y = x.to(device), y.to(device)
            preds = model(x)
            loss = criterion(preds, y)
            val_loss += loss.item() * x.size(0)
        val_loss /= len(vld)
        print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f} \n'.format(epoch, epoch_loss, val_loss))
        print("Checkpoint saved after {} epoch\n".format(epoch))
        torch.save(model.state_dict(), checkpoint_save_path)
    print("Training completed -> Finished -- {} \n".format(time.time()-t1))
    print("="*30)
    print("\n")
# NOTE(review): this chunk begins mid-expression -- the opening of the
# model_config construction is outside this view.
        args.emsize, args.nhid, args.encinit, args.decinit, args.weightinit,
        args.dropout, args.optim, args.lr, args.tied, args.shuffle, ntokens,
        args.vocab)
])
print(
    'Pytorch | RnnType | Clip | #Layers | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer| LR | Tied | Shuffle | Ntokens | VocabSize'
)
print(model_config)
# Loop over epochs.
lr = args.lr
prev_val_loss = None
# Choose the optimizer from the command line; stays None for any other value.
optimizer = None
if args.optim == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
elif args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Track the best validation perplexity seen so far (sentinel: large value).
best_val_perplex = 99999
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()
    train(optimizer)
    val_loss = evaluate(val_data)
    # Checkpoint only on perplexity improvement.
    if math.exp(val_loss) < best_val_perplex:
        best_val_perplex = math.exp(val_loss)
        if args.save != '':
            # save the model
            torch.save(model, args.save)
# save model state_dict to avoid pytorch version problems
# Infer the input volume shape from the dataset path naming convention.
if '64-64-64' in args.data_path:
    args.input_dim = (64, 64, 64)
    args.input_nf = 1
# Axis treated as "up" in the volume grid.
UP_AXIS = 0
print(args)
# specify gpu
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
# create model
model = model.GenModel(args.encoder_dim, args.input_dim, args.input_nf,
                       args.coarse_feat_dim, args.refine_feat_dim,
                       args.num_hierarchy_levels, not args.no_pass_occ,
                       not args.no_pass_feats, args.use_skip_sparse,
                       args.use_skip_dense).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                             weight_decay=args.weight_decay)
# Optionally resume model + optimizer state from a checkpoint; the starting
# epoch comes from the checkpoint unless explicitly overridden.
if args.retrain:
    print('loading model:', args.retrain)
    checkpoint = torch.load(args.retrain)
    args.start_epoch = args.start_epoch if args.start_epoch != 0 else checkpoint[
        'epoch']
    model.load_state_dict(checkpoint['state_dict'])  #, strict=False)
    optimizer.load_state_dict(checkpoint['optimizer'])
# Halve the LR every `decay_lr` epochs, resuming the schedule when retraining.
last_epoch = -1 if not args.retrain else args.start_epoch - 1
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                            step_size=args.decay_lr,
                                            gamma=0.5,
                                            last_epoch=last_epoch)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(modeltype, ntokens, emsize, nhid, nlayers, dropout, tied)
if cuda:
    model.cuda()
criterion = nn.CrossEntropyLoss()
print("number of parameters: ",
      sum(param.numel() for param in model.parameters()))

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history.

    Recurses through nested tuples (e.g. LSTM (h, c) pairs), rebuilding each
    leaf Variable from raw data so backprop stops at the batch boundary.
    """
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)


# get_batch subdivides the source data into chunks of length bptt.
# Build the model ############################################################################### ntokens = len(corpus.dictionary) if args.continue_train: model = torch.load(os.path.join(args.save, 'finetune_model.pt')) else: model = torch.load(os.path.join(args.save, 'model.pt')) if args.cuda: if args.single_gpu: parallel_model = model.cuda() else: parallel_model = nn.DataParallel(model, dim=1).cuda() else: parallel_model = model total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters()) logging('Args: {}'.format(args)) logging('Model total parameters: {}'.format(total_params)) criterion = nn.CrossEntropyLoss() ############################################################################### # Training code ############################################################################### def evaluate(data_source, batch_size=10): # Turn on evaluation mode which disables dropout. model.eval() total_loss = 0 ntokens = len(corpus.dictionary) hidden = model.init_hidden(batch_size)
import torch.optim as optim
import matplotlib.pyplot as plt

"""Parameters and user defined configs for model"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
# NOTE(review): this prints the bound method object, not parameter values;
# `model.parameters()` was probably intended.
print(model.parameters)

# parameters (user defined)
in_channels = 3
num_classes = 2
learning_rate = 0.001
BATCH_SIZE = 66
EPOCHS = 20
momentum = 0.9

# Loss and Loss_function criterion
# As we are using cross entropy loss we dont have to use Softmax at the end
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

"""Actual Training"""


def train(model_net, train_data, label_train, loss_lst):
    """Iterate EPOCHS times over (train_data, label_train) in BATCH_SIZE
    slices, recording each batch loss in loss_lst."""
    # Features and labels arrive already separated, so we slice manually
    # instead of using a torch DataLoader; if the data came as a single unit,
    # a DataLoader + enumerate would replace this inner loop.
    for epoch in range(EPOCHS):
        for i in range(0, len(train_data), BATCH_SIZE):
            # print(i, i+BATCH_SIZE)
            # converting data into "CUDA" or device standards
            X_train_batch = train_data[i:i + BATCH_SIZE].to(device=device)
            y_train_batch = label_train[i:i + BATCH_SIZE].to(device=device)
            # Forward pass; cast to float, otherwise a RUNTIME error about
            # expecting DOUBLE is raised.
            y_pred_outs = model_net(X_train_batch.float())
            loss = loss_function(y_pred_outs, y_train_batch.long())
            # NOTE(review): appends the loss *tensor* (keeps its autograd
            # graph alive) -- loss.item() would be cheaper. Also no
            # zero_grad/backward/step appear in this view; confirm the chunk
            # is not truncated, otherwise the model is never updated.
            loss_lst.append(loss)
        if epoch % 2 == 1:
            print("Epoch number:{} and loss:{}".format(epoch, loss.item()))
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.dropouth,
                       args.dropouti, args.dropoute, args.wdrop, args.tied)
if args.cuda:
    model.cuda()
# NOTE(review): only handles 1-D and 2-D parameters when counting.
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters())
print('Args:', args)
print('Model total parameters:', total_params)
# criterion = nn.CrossEntropyLoss()
# master branch has a bug here, see this:
# https://github.com/salesforce/awd-lstm-lm/issues/28
# it should not be using CrossEntropyLoss()
# Choose adaptive-softmax split points by vocabulary size.
splits = []
if ntokens > 500000:
    # One Billion
    # This produces fairly even matrix mults for the buckets:
    # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
    splits = [4200, 35000, 180000]
elif ntokens > 75000:
    # NOTE(review): this chunk is cut off here; the elif body is outside
    # this view.
def train_model(model, trainds, testds, config, device, writer=None):
    """Train `model` on `trainds` and validate on `testds` in a distributed
    (Horovod-style) setup.

    model   -- torch module, moved to `device` before training
    trainds -- training dataset
    testds  -- validation dataset
    config  -- nested dict of settings (data / training / loss / optimizer /
               lr_schedule / rank info)
    device  -- target torch device
    writer  -- optional SummaryWriter-like object, used on rank 0 only
    """
    batch_size = config['data']['batch_size']
    status = config['training']['status']  # batches between log reports
    epochs = config['training']['epochs']
    balanced_loss = config['loss']['balanced']
    # nval = config['nval']
    nval_tests = config['nval_tests']  # -1 means "use all test batches"
    nsave = config['nsave']
    model_save = config['model_save']  # checkpoint filename prefix
    rank = config['rank']
    nranks = config['nranks']
    hvd = config['hvd']
    num_classes = config['data']['num_classes']

    ## create samplers for these datasets
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainds, nranks, rank, shuffle=True, drop_last=True)
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        testds, nranks, rank, shuffle=True, drop_last=True)

    ## create data loaders (shuffling is delegated to the samplers)
    train_loader = torch.utils.data.DataLoader(
        trainds,
        shuffle=False,
        sampler=train_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)
    test_loader = torch.utils.data.DataLoader(
        testds,
        shuffle=False,
        sampler=test_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)

    # Loss / accuracy / optimizer / LR schedule all come from project helper
    # factories driven by config.
    loss_func = loss.get_loss(config)
    ave_loss = CalcMean.CalcMean()
    acc_func = accuracy.get_accuracy(config)
    ave_acc = CalcMean.CalcMean()

    opt_func = optimizer.get_optimizer(config)
    opt = opt_func(model.parameters(), **config['optimizer']['args'])
    lrsched_func = optimizer.get_learning_rate_scheduler(config)
    lrsched = lrsched_func(opt, **config['lr_schedule']['args'])

    # Add Horovod Distributed Optimizer
    if hvd:
        opt = hvd.DistributedOptimizer(
            opt, named_parameters=model.named_parameters())
        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    model.to(device)

    for epoch in range(epochs):
        logger.info(' epoch %s of %s', epoch, epochs)
        # Re-seed samplers so each epoch shuffles shards differently.
        train_sampler.set_epoch(epoch)
        test_sampler.set_epoch(epoch)
        model.to(device)

        for batch_counter, (inputs, targets, class_weights,
                            nonzero_mask) in enumerate(train_loader):
            # move data to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            class_weights = class_weights.to(device)
            nonzero_mask = nonzero_mask.to(device)
            # zero grads
            opt.zero_grad()
            outputs, endpoints = model(inputs)
            # set the weights: class-balanced weights (rescaled so the
            # weighted mean stays comparable to the unweighted one), or the
            # plain nonzero mask.
            if balanced_loss:
                weights = class_weights
                nonzero_to_class_scaler = torch.sum(
                    nonzero_mask.type(torch.float32)) / torch.sum(
                        class_weights.type(torch.float32))
            else:
                weights = nonzero_mask
                nonzero_to_class_scaler = torch.ones(1, device=device)
            loss_value = loss_func(outputs, targets.long())
            loss_value = torch.mean(
                loss_value * weights) * nonzero_to_class_scaler
            # backward calc grads
            loss_value.backward()
            # apply grads
            opt.step()
            ave_loss.add_value(float(loss_value.to('cpu')))
            # calc acc
            ave_acc.add_value(
                float(acc_func(outputs, targets, weights).to('cpu')))
            # print statistics
            if batch_counter % status == 0:
                logger.info(
                    '<[%3d of %3d, %5d of %5d]> train loss: %6.4f acc: %6.4f',
                    epoch + 1, epochs, batch_counter,
                    len(trainds) / nranks / batch_size, ave_loss.mean(),
                    ave_acc.mean())
                if writer and rank == 0:
                    global_batch = epoch * len(
                        trainds) / nranks / batch_size + batch_counter
                    writer.add_scalars('loss', {'train': ave_loss.mean()},
                                       global_batch)
                    writer.add_scalars('accuracy', {'train': ave_acc.mean()},
                                       global_batch)
                    #writer.add_histogram('input_trans',endpoints['input_trans'].view(-1),global_batch)
                # restart running means after every report
                ave_loss = CalcMean.CalcMean()
                ave_acc = CalcMean.CalcMean()
                # release tensors for memory
                del inputs, targets, weights, endpoints, loss_value
            if config['batch_limiter'] and batch_counter > config[
                    'batch_limiter']:
                logger.info('batch limiter enabled, stop training early')
                break

        # save at end of epoch
        torch.save(model.state_dict(),
                   model_save + '_%05d.torch_model_state_dict' % epoch)

        if nval_tests == -1:
            nval_tests = len(testds) / nranks / batch_size
        logger.info('epoch %s complete, running validation on %s batches',
                    epoch, nval_tests)

        model.to(device)
        # every epoch, evaluate validation data set
        with torch.no_grad():
            vloss = CalcMean.CalcMean()
            vacc = CalcMean.CalcMean()
            vious = [CalcMean.CalcMean() for i in range(num_classes)]
            for valid_batch_counter, (inputs, targets, class_weights,
                                      nonzero_mask) in enumerate(test_loader):
                inputs = inputs.to(device)
                targets = targets.to(device)
                class_weights = class_weights.to(device)
                nonzero_mask = nonzero_mask.to(device)
                # set the weights (same scheme as training)
                if balanced_loss:
                    weights = class_weights
                    nonzero_to_class_scaler = torch.sum(
                        nonzero_mask.type(torch.float32)) / torch.sum(
                            class_weights.type(torch.float32))
                else:
                    weights = nonzero_mask
                    nonzero_to_class_scaler = torch.ones(1, device=device)
                outputs, endpoints = model(inputs)
                loss_value = loss_func(outputs, targets.long())
                loss_value = torch.mean(
                    loss_value * weights) * nonzero_to_class_scaler
                vloss.add_value(float(loss_value.to('cpu')))
                # calc acc
                vacc.add_value(
                    float(acc_func(outputs, targets, weights).to('cpu')))
                # calc ious
                ious = get_ious(outputs, targets, weights, num_classes)
                for i in range(num_classes):
                    vious[i].add_value(float(ious[i]))
                if valid_batch_counter > nval_tests:
                    break

            mean_acc = vacc.mean()
            mean_loss = vloss.mean()
            # if config['hvd'] is not None:
            #     mean_acc = config['hvd'].allreduce(torch.tensor([mean_acc]))
            #     mean_loss = config['hvd'].allreduce(torch.tensor([mean_loss]))
            mious = float(
                torch.sum(torch.FloatTensor([x.mean()
                                             for x in vious]))) / num_classes
            # NOTE(review): the class labels below assume exactly 3 classes --
            # confirm against config['data']['num_classes'].
            ious_out = {
                'jet': vious[0].mean(),
                'electron': vious[1].mean(),
                'bkgd': vious[2].mean(),
                'all': mious
            }
            # add validation to tensorboard
            if writer and rank == 0:
                global_batch = epoch * len(
                    trainds) / nranks / batch_size + batch_counter
                writer.add_scalars('loss', {'valid': mean_loss}, global_batch)
                writer.add_scalars('accuracy', {'valid': mean_acc},
                                   global_batch)
                writer.add_scalars('IoU', ious_out, global_batch)
            logger.warning(
                '>[%3d of %3d, %5d of %5d]<<< ave valid loss: %6.4f ave valid acc: %6.4f on %s batches >>>',
                epoch + 1, epochs, batch_counter,
                len(trainds) / nranks / batch_size, mean_loss, mean_acc,
                valid_batch_counter + 1)
            logger.warning(' >> ious: %s', ious_out)
        # update learning rate
        lrsched.step()
# Adaptive-softmax split points chosen by vocabulary size.
if ntokens > 500000:
    # One Billion
    # This produces fairly even matrix mults for the buckets:
    # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
    splits = [4200, 35000, 180000]
elif ntokens > 75000:
    # WikiText-103
    splits = [2800, 20000, 76000]
print('Using', splits)
criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
# The criterion has trainable parameters too, so count both.
params = list(model.parameters()) + list(criterion.parameters())
# NOTE(review): only handles 1-D and 2-D parameters when counting.
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    # NOTE(review): this definition is cut off at the end of this chunk; the
    # rest of its body lies outside this view.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
def train():
    """Run one training epoch, splitting each batch into small sub-batches
    whose gradients are accumulated before a single optimizer step.

    Uses module-level globals: model, parallel_model, corpus, train_data,
    optimizer, args, epoch, get_batch, repackage_hidden, logging.
    """
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    # One hidden state per sub-batch.
    hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)

        lr2 = optimizer.param_groups[0]['lr']
        # Scale the LR by the sampled sequence length so short and long
        # sequences contribute comparably.
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activiation Regularization
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])

            # Scale so the accumulated gradient matches a full-batch update.
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm_` (in-place, trailing underscore) replaces the
        # deprecated `clip_grad_norm`; it prevents exploding gradients in
        # RNNs / LSTMs. (The sibling train() in this file already uses it.)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            # `.item()` replaces the removed 0-dim indexing `total_loss[0]`.
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
# Resume from the saved checkpoint when continuing training, otherwise build
# a fresh mixture-of-softmaxes RNN model.
if args.continue_train:
    model = torch.load(os.path.join(args.save, 'model.pt'))
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nhidlast, args.nlayers, args.dropout,
                           args.dropouth, args.dropouti, args.dropoute,
                           args.wdrop, args.tied, args.dropoutl,
                           args.n_experts)
if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        # Scatter along dim=1 (the batch dim of seq-first RNN tensors).
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model
total_params = sum(x.data.nelement() for x in model.parameters())
logging('Args: {}'.format(args))
logging('Model total parameters: {}'.format(total_params))
criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    # NOTE(review): this definition is cut off at the end of this chunk; the
    # rest of its body lies outside this view.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
def train():
    """Run one epoch of distillation-style training with gradient
    accumulation over small sub-batches.

    Uses module-level globals: model, parallel_model, corpus, train_data,
    optimizer, args, epoch, get_batch, repackage_hidden, logging.
    """
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    total_student_loss = 0
    start_time = time.time()
    ntokens = len(corpus.vocab)
    # One hidden state per sub-batch.
    hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)

        lr2 = optimizer.param_groups[0]['lr']
        # Scale LR by sampled sequence length so sequences contribute comparably.
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        # NOTE: eval, not train -- dropout is deliberately disabled here.
        model.eval() # disable dropout
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])

            # The parallel model returns a flattened list:
            # [log_prob, hidden..., rnn_hs..., dropped_rnn_hs..., distill_loss]
            parallel_rv = parallel_model(*hidden[s_id], input=cur_data, return_h=True, return_student_distill_loss=True, flatten_returned_lists=True, enable_rnd_tune=True)
            # reassemble return values
            log_prob, student_distill_loss = parallel_rv[0], parallel_rv[-1].sum()
            # Split the remaining flat list back into 3 equal groups:
            # hidden states, raw RNN outputs, dropped RNN outputs.
            parallel_rv = np.array(parallel_rv[1:-1]).reshape((3, -1)).tolist()
            hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_rv

            raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activiation Regularization
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            # Student distillation loss -- tracked for logging only; the
            # commented lines show previously-tried ways of adding it to the
            # training objective.
            total_student_loss += student_distill_loss.data / args.batch_size
            #loss = loss + args.distillossw * student_distill_loss
            #loss = student_distill_loss

            # Scale so the accumulated gradient matches a full-batch update.
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            cur_student_loss = total_student_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | distill loss {:5.4f} | post scaling gain0 {:5.4f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss),
                cur_student_loss * args.small_batch_size / args.batch_size,
                model.rnd_models[0].post_scaling_gain.item()))
            total_loss = 0
            total_student_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
train_data = batchify(corpus.train, args.batch_size) # size(total_len//bsz, bsz) val_data = batchify(corpus.valid, eval_batch_size) test_data = batchify(corpus.test, eval_batch_size) # Build the model interval = 200 # interval to report ntokens = len(corpus.dictionary) # 10000 model = model.RNNModel(ntokens, args.embed_size, args.n_hid, args.n_layers, args.dropout) print(model) criterion = nn.CrossEntropyLoss() l_rate = args.l_rate best_val_loss = None opt = torch.optim.SGD(model.parameters(), lr=l_rate) if args.opt == 'Adam': opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99)) l_rate = 0.001 if args.opt == 'Momentum': opt = torch.optim.SGD(model.parameters(), lr=l_rate, momentum=0.8) try: for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train_loss = train() val_loss = evaluate(val_data) print('-' * 80) print( '| end of epoch {:3d} | time: {:5.2f}s | train_loss {:5.2f} | valid loss {:5.2f} | ' 'valid ppl {:8.2f} | train ppl {:8.2f}'.format(
test_data = batchify(corpus.test, test_batch_size, args) ############################################################################### # Build the model ############################################################################### ntokens = len(corpus.dictionary) model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied, args.bytes) if args.cuda: model.cuda() total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters()) print('Args:', args) print('Model total parameters:', total_params) criterion = nn.CrossEntropyLoss() ############################################################################### # Training code ############################################################################### def evaluate(data_source, batch_size=10): # Turn on evaluation mode which disables dropout. model.eval() if args.model == 'QRNN': model.reset() total_loss = 0
# NOTE(review): this chunk starts inside the tail of a validation function
# (Python 2); the enclosing def is outside this view.
if top1.avg > best_score:
    torch.save(model, args.save)
    print 'save model'
    best_score = top1.avg
print str(top1.avg) + ' ' + str(loss.data[0]) + ' ' + 'epoch_valid ' + str(epoch)

# Loop over epochs.
lr = args.lr
best_val_loss = None
# At any point you can hit Ctrl + C to break out of training early.
# NOTE(review): leftover debugger breakpoint (ipdb/pdb set_trace alias);
# should be removed before running unattended.
st(context=27)
best_score = 0
try:
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    for epoch in range(1, args.epochs + 1):
        train(epoch, optimizer, questrainfealistShu, labeltrainlistShu,
              lengthtrainlistShu)
        valid(epoch, questrainfealistShu_valid, labeltrainlistShu_valid,
              lengthtrainlistShu_valid)
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')


def test(model, quesfeaShu, labelShu, lengthShu):
    # NOTE(review): this definition is cut off at the end of this chunk; the
    # rest of its body lies outside this view.
    model.eval()
    # Sort all samples by descending length (packed RNN input convention).
    idx = sorted(range(len(lengthShu)), key=lambda x: lengthShu[x], reverse=True)
    _quesfeaShu = []
# Load checkpoint if args.checkpoint != '': if args.cuda: model = torch.load(args.checkpoint) else: # Load GPU model on CPU model = torch.load(args.checkpoint, map_location=lambda storage, loc: storage) if args.cuda: model.cuda() else: model.cpu() print(model) print('------------------------------------------------------') print('\t\t Total parameters in model : ', sum(param.numel() for param in model.parameters())) print('------------------------------------------------------\n') def repackage_hidden(h): """Wraps hidden states in new Variables, to detach them from their history.""" return [state.detach() for state in h] def get_batch(source, i, evaluation=False): seq_len = min(args.bptt, len(source) - 1 - i) data = Variable(source[i:i + seq_len], volatile=evaluation) target = Variable(source[i + 1:i + 1 + seq_len].view(-1)) return data, target
def get_trainable_parameters(model):
    """Yield only the parameters of *model* that require gradients (skip frozen ones)."""
    return (weight for weight in model.parameters() if weight.requires_grad)
# --- Script fragment: batchify data and build the RNN LM (variant with args.nout). ---
eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
# NOTE: the `model` module is shadowed by the model instance from here on.
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nout,
                       args.nlayers, args.dropout, args.dropouth, args.dropouti,
                       args.dropoute, args.wdrop, args.tied)
if args.cuda:
    model.cuda()
# Matrices contribute rows*cols; 1-D parameters (biases) contribute their length.
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in model.parameters())
print('Args:', args)
print('Model total parameters:', total_params)
sys.stdout.flush()
criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
# --- Fragment: continuation of an `if` (pretrained-embedding branch) whose header is above. ---
    print("freeze embedding:", freeze)
    init_embedding = utils.read_pretrained_embeddings(options.word_embeddings, w2i,
                                                      options.embed_dim)
    embed = nn.Embedding.from_pretrained(torch.FloatTensor(init_embedding), freeze=freeze)
else:
    # No pretrained vectors: train the embedding table from scratch.
    embed = nn.Embedding(len(w2i), options.embed_dim)

# NOTE: the `model` module is shadowed by the Generator instance from here on.
model = model.Generator(embed, options.embed_dim, len(w2i), options.hidden_size,
                        num_layers=options.layers, dropout=options.dropout,
                        use_API=options.API)
optimizer = torch.optim.Adam(model.parameters(), lr=options.lr)
#optimizer=torch.optim.SGD(model.parameters(), lr = options.lr,momentum=0)
criterion = nn.CrossEntropyLoss()
if options.gpu:
    model = model.cuda()
    criterion = criterion.cuda()
loss_meter = meter.AverageValueMeter()
if options.old_model:
    # incremental training: resume weights from a previous run
    print("Incremental training from old model: {}".format(options.old_model))
    model.load_state_dict(torch.load(options.old_model))
best_model = "{}_{}_{}_{}".format(options.name, options.API,
else: model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nhidlast, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied, args.dropoutl, args.n_experts) if args.cuda: if args.single_gpu: parallel_model = model.cuda() else: parallel_model = nn.DataParallel(model, dim=1).cuda() else: parallel_model = model total_params = sum(x.data.nelement() for x in model.parameters()) logging('Args: {}'.format(args)) logging('Model total parameters: {}'.format(total_params)) criterion = nn.CrossEntropyLoss() import pdb pdb.set_trace() ############################################################################### # Training code ############################################################################### def evaluate(data_source, batch_size=10): # Turn on evaluation mode which disables dropout.
# --- Fragment: tail of a DataLoader(...) call whose opening is above this chunk. ---
                          shuffle=False, num_workers=config.cpu_processor, drop_last=True)
dev_loader = DataLoader(dev_data, batch_size=config.batch, shuffle=False,
                        num_workers=config.cpu_processor, drop_last=True)
# Model setup
device = torch.device(config.gpu if torch.cuda.is_available() else 'cpu')
# NOTE: the `model` module is shadowed by the classifier instance from here on.
model = model.classifier(vocab_list, embedding)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
loss_function = nn.CrossEntropyLoss()
# Training
step_list = []
loss_list = []
acc_test_list = []
acc_dev_list = []
step = 0
for i in range(config.epoch):
    print("epoch = ", i)
    start = time.time()
    for n, (label, sent1, sent2) in enumerate(train_loader):
        optimizer.zero_grad()  # reset gradients
        label = Variable(label.to(device))
        sent1 = Variable(torch.stack(sent1).to(device))
# This produces fairly even matrix mults for the buckets: # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422 splits = [4200, 35000, 180000] elif ntokens > 75000: # WikiText-103 splits = [2800, 20000, 76000] print('Using', splits) if args.split_cross: criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False) else: criterion = nn.CrossEntropyLoss( ) # SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False) ### params = list(filter(lambda p: not p is model.scale, model.parameters())) params = list(params) # + list(criterion.parameters()) if args.split_cross: params = params + list(criterion.parameters()) print(args.split_cross) if args.cuda: model = model.cuda() if args.split_cross: criterion = criterion.cuda() # criterion = criterion.cuda() params = list(params) # + list(criterion.parameters()) ### # for param in params: # print(param.size()) total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        # Weight each chunk's mean loss by its length so the final average is per-token.
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)


# Select the optimizer by name. NOTE(review): the first tests are plain `if`s
# rather than one `if`/`elif` chain; it works only because each name matches a
# single branch.
if args.optim == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98])  # 0.0001
if args.optim == 'sparseadam':
    optimizer = optim.SparseAdam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98])  # 0.0001
if args.optim == 'adamax':
    optimizer = optim.Adamax(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98])  # 0.0001
elif args.optim == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=1e-10)  # 0.0001
elif args.optim == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)  # 0.01
elif args.optim == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optim == 'adadelta':
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)


def train():
    # Turn on training mode which enables dropout.
def train():
    # One epoch of training with randomized BPTT length (AWD-LSTM style).
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        # 95% of the time use args.bptt, otherwise half of it; then jitter with N(bptt, 5).
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)
        # Scale the lr by the actual sequence length so shorter chunks take
        # proportionally smaller steps; restored after the optimizer step below.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        if args.split_cross:
            raw_loss = criterion(model.decoder, output, targets)
        else:
            raw_loss = criterion(output, targets)
        loss = raw_loss
        # Activiation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
        if not args.collect_stats:
            loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()
        total_loss += raw_loss.data
        # Anneal the learnable output scale upward until it reaches 1.
        if model.scale.data.item() < 1:
            model.scale.data.add_(args.scale_alpha)
        # Restore the un-scaled learning rate for the next iteration.
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            out = ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | ' +\
                   'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f} | scale {:.3}').format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss),
                cur_loss / math.log(2), model.scale.data.item())
            print(out)
            with open(args.log_out, "a") as f:
                f.write(out + "\n")
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
# uses ggplot instead of default plotter matplotlib.style.use('ggplot') # define the computation device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # a list to save all the reconstructed images in PyTorch grid format grid_images = [] # initialize the model model = model.ConvVAE().to(device) # define the learning parameters lr = 0.0001 epochs = 200 batch_size = 64 optimizer = optim.Adam(model.parameters(), lr=lr) criterion = nn.BCELoss(reduction='sum') # initialize the transform transform = transform() # prepare the training and validation data loaders train_data, valid_data = prepare_dataset(root_path='../input/catsNdogs/') trainset = LFWDataset(train_data, transform=transform) trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True) validset = LFWDataset(valid_data, transform=transform) validloader = DataLoader(validset, batch_size=batch_size) train_loss = [] valid_loss = [] for epoch in range(epochs): print(f"Epoch {epoch+1} of {epochs}")
# --- Fragment: tail of a dataset constructor call, then Iris loaders and training-loop head. ---
                          transforms_target=transforms.NumpyToLongTensor())
train_idx, valid_idx = helper.indice_splitter(iris_dataset, valid_size=0.2)
train_loader = data.DataLoader(iris_dataset, batch_size=BSIZE,
                               sampler=SubsetRandomSampler(train_idx), num_workers=0)
valid_loader = data.DataLoader(iris_dataset, batch_size=BSIZE,
                               sampler=SubsetRandomSampler(valid_idx), num_workers=0)
# NOTE: the `model` module is shadowed by the network instance from here on.
model = model.IrisNetwork(4, 32, 3)  # 4 input features, 32 hidden units, 3 classes
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LRATE)
best_loss = 1.5
history = {
    'epoch': [],
    'train_loss': [],
    'valid_loss': [],
}
for epoch in range(NUM_EPOCH):
    batch_time = meter.AverageMeter()
    data_time = meter.AverageMeter()
    losses = meter.AverageMeter()
    end_time = time.time()
    for idx, (x_train, y_train) in enumerate(train_loader):
        data_time.update(time.time() - end_time)
# --- Fragment: continuation of a loop (over RND sub-models) whose header is above. ---
    rnd_model = model.rnd_models[l]
    rnd_model.freeze_student(args.rnd_nofreeze_student)
    # Re-enable and reset the learnable post-scaling gain for distillation.
    rnd_model.post_scaling_gain.requires_grad = True
    rnd_model.post_scaling_gain.data = torch.scalar_tensor(1.0)
    rnd_model.scaling_coefficient = torch.scalar_tensor(args.rnd_scaling_coefficient)
model.freeze_for_rnd_distillation()
if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        # Split batches across GPUs along dim=1 (the batch dim for seq-first data).
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model
total_params = sum(x.data.nelement() for x in model.parameters())
logging('Args: {}'.format(args))
logging('Model total parameters: {}'.format(total_params))
criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, data_source_mask, batch_size=10, average_ensemble=True):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    total_student_loss = 0
    ntokens = len(corpus.vocab)
def train():
    # One epoch of manual SGD with L1 + structured group-lasso regularization on
    # the stacked LSTM weight matrices (sparsity-inducing training), with
    # periodic hard-thresholding and sparsity reporting.
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    reg_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        l1 = nn.L1Loss(size_average=False)
        # Concatenate input-to-hidden and hidden-to-hidden weights per LSTM layer.
        weight_l0 = torch.cat((model.rnn.weight_ih_l0, model.rnn.weight_hh_l0), 1)  # shape (4*hidden_size x input_size)
        weight_l1 = torch.cat((model.rnn.weight_ih_l1, model.rnn.weight_hh_l1), 1)
        weight_l2 = model.decoder.weight  # decoder layer weight of shape (out_features x in_features)
        if l1_reg:
            # L1 penalty implemented as L1Loss against all-zero dummy tensors.
            if args.cuda:
                dummy1 = Variable(torch.cuda.FloatTensor(
                    weight_l0.size()).zero_(), requires_grad=False)
                dummy2 = Variable(torch.cuda.FloatTensor(
                    weight_l1.size()).zero_(), requires_grad=False)
            else:
                dummy1 = Variable(torch.FloatTensor(weight_l0.size()).zero_(), requires_grad=False)
                dummy2 = Variable(torch.FloatTensor(weight_l1.size()).zero_(), requires_grad=False)
            loss += (0.00001*l1(weight_l0,dummy1)) + \
                (0.00001*l1(weight_l1,dummy2))
        # Structured group-lasso term coupling consecutive layers' weights.
        structure_glasso_reg = 0.00245 * add_structure_glasso(weight_l0, weight_l1, 2) + \
            0.00245 * add_structure_glasso(weight_l1, weight_l2, 1)
        final = (loss * args.bptt) + structure_glasso_reg
        final.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        # Manual SGD step (legacy two-argument add_: p -= lr * grad).
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
        total_loss += loss.data
        reg_loss += structure_glasso_reg.data
        # zero out and get statistics
        if batch % args.log_interval == 0 and batch > 0:
            nonzero_cnt = 0.
            total_cnt = 0.
            # zero out gradient
            # Hard-threshold small weights to exactly zero, then report sparsity.
            threshold = zero_threshold
            for param in model.parameters():
                cond = torch.abs(param.data) < threshold
                param.data[cond] = 0
                # statistics
                # variable contains tensor
                s = torch.nonzero(param.data)
                if len(s.shape) != 0:
                    nonzero_cnt += s.shape[0]
                total_cnt += get_num(param.data)
            print("nonzero percentage:", nonzero_cnt / total_cnt)
            weight_l0 = torch.cat(
                (model.rnn.weight_ih_l0, model.rnn.weight_hh_l0), 1)
            weight_l1 = torch.cat(
                (model.rnn.weight_ih_l1, model.rnn.weight_hh_l1), 1)
            weight_l2 = model.decoder.weight
            row_col_sparsity(weight_l0, 'weight_l0')
            row_col_sparsity(weight_l1, 'weight_l1')
            row_col_sparsity(weight_l2, 'weight_l2')
            cur_loss = total_loss[0] / args.log_interval
            cur_reg_loss = reg_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} | reg_loss {:5.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss),
                    cur_reg_loss))
            total_loss = 0
            reg_loss = 0
            start_time = time.time()
def test_model(N,Dc,Dd,Db,L,K,X_c=None,X_d=None,X_b=None):
    # Fit a heterogeneous mixture model (Gaussian + categorical + Bernoulli
    # likelihoods) by alternating variational E/M gradient steps, plot the
    # convergence curves, and return the fitted parameters plus hard profile
    # assignments.
    # NOTE(review): assumes X_c, X_d, X_b are array-likes with N rows -- the
    # `=None` defaults would crash in torch.Tensor(...) below; verify callers.
    batch_size = int(N/4)
    epsilon = 1e0
    # ----------- Model ------------
    gaussian = Gaussian(Dc, L, K)
    categorical = Categorical(Dd, L, K)
    bernoulli = Bernoulli(Db, L, K)
    likelihoods = [gaussian,bernoulli,categorical]
    model = Mixture_Model(N, L, likelihoods)
    optim = torch.optim.Adagrad(model.parameters(), lr=0.01)
    autograd.set_detect_anomaly(True)
    # optim = torch.optim.SGD(model.parameters(),lr=0.001, momentum= 0.9)
    data_set = torch.utils.data.TensorDataset(torch.Tensor(X_c), torch.Tensor(X_d),torch.Tensor(X_b))
    #data_set = torch.utils.data.TensorDataset(torch.Tensor(X_c),torch.Tensor(X_b))
    data_loader = torch.utils.data.DataLoader(data_set, batch_size=batch_size, shuffle=False)  # shuffle a true?
    #data_loader= torch.utils.data.DataLoader(X_c, batch_size = batch_size, shuffle=False) #shuffle a true?
    num_epochs = 100
    ll_list = []
    loss_list = []
    KL_z_list = []
    KL_s_list = []
    rik_epochs = []
    term_1_list = []
    term_2_list = []
    term_3_list = []
    past_loss = 0
    for epoch in range(num_epochs):
        loss_epoch = 0
        ll_epoch = 0
        KL_z_epoch = 0
        KL_s_epoch = 0
        term_1_epoch = 0
        term_2_epoch = 0
        term_3_epoch = 0
        # for x_batch_real, x_batch_discrete in data_loader:
        for index, x_batch in enumerate(data_loader):
            x_batch_real = x_batch[0]
            x_batch_disc = x_batch[1]
            x_batch_bin = x_batch[2]
            # ----- Variational E ----- fix θ
            # Optimize the variational parameters with the likelihood params frozen.
            optim.zero_grad()
            util.fix_model_params(likelihoods, set=False)
            util.fix_variational_params(model, set=True)
            loss, LL, KL_z, KL_s, rik, term_1, term_2,term_3 = model(index, X_c=x_batch_real.numpy(), X_d=x_batch_disc.numpy(), X_b=x_batch_bin.numpy())
            loss.backward()
            optim.step()
            # ----- Variational M ----- fix φ
            # Optimize the likelihood parameters with the variational params frozen.
            optim.zero_grad()
            util.fix_model_params(likelihoods, set=True)
            util.fix_variational_params(model, set=False)
            loss, LL, KL_z, KL_s, rik, term_1, term_2,term_3 = model(index, X_c=x_batch_real.numpy(), X_d=x_batch_disc.numpy(), X_b=x_batch_bin.numpy())
            loss.backward()
            optim.step()
            ll_epoch += LL
            KL_s_epoch += KL_s
            KL_z_epoch += KL_z
            loss_epoch += loss
            term_1_epoch += term_1
            term_2_epoch += term_2
            term_3_epoch += term_3
        #print(f"Epoch = {epoch}, Loglik ={ll_epoch}, -ELBO ={loss_epoch}")
        rik_epochs.append(rik)
        KL_z_list.append(KL_z_epoch)
        KL_s_list.append(KL_s_epoch)
        loss_list.append(loss_epoch)
        term_1_list.append(term_1_epoch)
        term_2_list.append(term_2_epoch)
        term_3_list.append(term_3_epoch)
        ll_list.append(ll_epoch)
    # Extract the fitted parameters for the caller.
    z_mean = model.q_z_mean
    W_c = model.gaussian.W_c
    var_c =model.gaussian.var_c
    W_b = model.bernoulli.W_d
    W_d = model.categorical.W_d
    #W_d = None
    mu_d = model.categorical.mu_d
    #mu_d = None
    mu_b = model.bernoulli.mu_d
    # Posterior cluster responsibilities -> hard (1-based) profile assignment.
    param = torch.nn.functional.softmax(model.q_s_param, dim=1).detach().numpy()
    #print(param)
    profiles = np.argmax(param, axis=1) + 1
    '''
    plt.figure()
    plt.plot(np.arange(num_epochs), KL_z_list)
    plt.title(f'Convergence of KL_z for K={K}')
    plt.xlabel('Epochs')
    plt.ylabel('Kullback-Leibler divergence')
    plt.savefig('KL_z_'+str(K)+'.png')
    plt.figure()
    plt.plot(np.arange(num_epochs), KL_s_list)
    plt.title(f'Convergence of KL_s for K={K}')
    plt.xlabel('Epochs')
    plt.ylabel('Kullback-Leibler divergence')
    plt.savefig('KL_s_'+str(K)+'.png')
    '''
    plt.figure()
    plt.plot(np.arange(num_epochs), term_1_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend([ 'Gaussian Term '])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('GaussianTerm_'+str(K)+'.png')
    plt.figure()
    plt.plot(np.arange(num_epochs), term_2_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Bernoulli term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('BernoulliTerm_'+str(K)+'.png')
    plt.figure()
    plt.plot(np.arange(num_epochs), term_3_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Categorical term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('CategoricalTerm_'+str(K)+'.png')
    plt.figure()
    plt.plot(np.arange(num_epochs), ll_list)
    plt.plot(np.arange(num_epochs), loss_list)
    plt.title(f'Performance in epochs for K={K}')
    plt.legend(['Likelihood evolution', 'Loss evolution'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('Convergence_'+str(K)+'.png')
    #plt.show()
    return ll_list[-1],z_mean,W_c,W_b,mu_b,mu_d,W_d,var_c,profiles
# This produces fairly even matrix mults for the buckets: # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422 splits = [4200, 35000, 180000] elif ntokens > 75000: # WikiText-103 splits = [2800, 20000, 76000] print('Using', splits) criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False) ### if args.cuda: model = model.cuda() criterion = criterion.cuda() ### params = list(model.parameters()) + list(criterion.parameters()) total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size()) print('Args:', args) print('Model total parameters:', total_params) ############################################################################### # Training code ############################################################################### def evaluate(data_source, batch_size=10): # Turn on evaluation mode which disables dropout. model.eval() if args.model == 'QRNN': model.reset()
def configure_optimizers(model):
    """Build the Adam optimizer and its step-decay schedule for *model*.

    Returns:
        (optimizer, scheduler): Adam at lr=1e-3, plus a StepLR that halves the
        learning rate every 5 steps (gamma=0.5).
    """
    adam = torch.optim.Adam(model.parameters(), lr=1.e-3)
    # optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01)
    step_decay = torch.optim.lr_scheduler.StepLR(adam, step_size=5, gamma=0.5)
    # loss_function = torch.nn.MSELoss()
    return adam, step_decay
# --- Fragment: tail of a loaders dict, then UNet model, optimizer, and LR-scheduler selection. ---
               shuffle=True, num_workers=0, drop_last=True),
}
trainloader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)

# Model
# NOTE: the `model` module is shadowed by the UNet instance from here on.
model = model.UNet(num_class)
model.to(device)

# Optimizer
optimizer_ft = optim.Adam(model.parameters(), lr=1e-5)

if args.lr_scheduler == "linear":
    # Constant factor of 1.0 => effectively no schedule.
    lr_scheduler = LambdaLR(optimizer_ft, lr_lambda=lambda epoch: 1.0)
    print("none")
elif args.lr_scheduler == "cyclic":
    print("cyclicLR")
    # NOTE(review): this assignment shadows the `lr_scheduler` name used on the
    # right-hand side (presumably the torch.optim.lr_scheduler module); it works
    # once but would fail if re-executed -- verify the import.
    lr_scheduler = lr_scheduler.CyclicLR(
        optimizer_ft,
        base_lr=1e-5,
        max_lr=0.1,
        step_size_up=100,
        step_size_down=1000,
        mode="exp_range",
        gamma=0.98,
def train():
    # One epoch over traindataloader mixing a max-ent model and/or a neural LM
    # (optionally with NCE), optimized through an LBFGS(+Adam) closure; reports
    # running loss/perplexity and returns nothing.
    if not args.maxent:
        model.train()
        hidden = model.init_hidden(minibatch)
    total_loss = 0.
    total_nword = 0
    cur_loss = 0.
    nword = 0
    start_time = time.time()
    i = 0
    for chunk, (input, target, sent_lens) in enumerate(traindataloader):
        # Flatten padded targets to the packed layout used by the loss.
        target_packed = pack_padded_sequence(target, sent_lens)[0]
        if not args.nnlm:
            output_me = memodel.forward(input, target, sent_lens)
        if not args.maxent:
            hidden = repackage_hidden(hidden)
            model.zero_grad()
            # Draw NCE noise samples for this chunk.
            if args.noisedist == 'uniform':
                noise = noisesampler.draw_uniform(sent_lens[0].item(), args.ncesample)
            else:
                noise = noisesampler.draw(sent_lens[0].item(), args.ncesample)
            output_nn = model(input, target, hidden, sent_lens, noise=noise)
        if args.nnlm:
            if args.nce:
                # With NCE the model returns the loss directly.
                loss = output_nn
            else:
                output = output_nn.view(-1, vocsize)
        elif args.maxent:
            output = output_me.view(-1, vocsize)
        else:
            # Combine max-ent and NN logits.
            output = torch.add(output_me.view(-1, vocsize), output_nn.view(-1, vocsize))
        if not args.nce:
            loss = ce_crit(output, target_packed.view(-1))
        # loss.backward()
        # if not args.nnlm:
        #     memodel.update(lr)
        # if not args.maxent:
        #     nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        #     #print('New period',i)
        #     i = i + 1
        #     for p in model.parameters():
        #         if p.grad is not None:
        #             #print(p.grad.data.size())
        #             p.data.add_(-lr, p.grad.data)
        # pytorch LBFGS optimization
        # NOTE(review): a fresh optimizer (and LBFGS history) is created per chunk.
        optimizer = LBFGS_withAdam(model.parameters(), lr=lr, max_iter=5,
                                   history_size=10, use_Adam=use_Adam_flag)

        def closure():
            # Re-evaluate the loss for the LBFGS line search; redraws noise each call.
            optimizer.zero_grad()
            if args.noisedist == 'uniform':
                noise = noisesampler.draw_uniform(sent_lens[0].item(), args.ncesample)
            else:
                noise = noisesampler.draw(sent_lens[0].item(), args.ncesample)
            output_nn = model(input, target, hidden, sent_lens, noise=noise)
            if args.nnlm:
                if args.nce:
                    loss = output_nn
                else:
                    output = output_nn.view(-1, vocsize)
            elif args.maxent:
                output = output_me.view(-1, vocsize)
            else:
                output = torch.add(output_me.view(-1, vocsize), output_nn.view(-1, vocsize))
                loss = ce_crit(output, target_packed.view(-1))
            loss.backward(retain_graph=True)
            return loss

        if not args.nnlm:
            memodel.update(lr)
        if not args.maxent:
            nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        loss = optimizer.step(closure)
        # Accumulate word-weighted loss for running and epoch averages.
        nvalidword = torch.sum(sent_lens).item()
        nword += nvalidword
        cur_loss += loss.item() * nvalidword
        total_nword += nvalidword
        total_loss += loss.item() * nvalidword
        if chunk % args.log_interval == 0 and chunk > 0:
            cur_loss = cur_loss / nword
            elapsed = time.time() - start_time
            # sys.stdout.write ('Epoch {:3d} learn rate {:02.2f} train speed {:5.2f} word/sec, percent: {:5.2f} loss {:5.2f}, ppl {:8.2f} time: fw: {:.2f} s1: {:.2f} bw:{:.2f} \r'.format(epoch, lr, total_nword/elapsed, total_nword/TrainData.nword, cur_loss, math.exp(cur_loss), memodel.time_forward, memodel.time_step1-memodel.time_forward, memodel.time_backward))
            sys.stdout.write(
                'Epoch {:3d} learn rate {:02.2f} train speed {:5.2f} word/sec, percent: {:5.2f} loss {:5.2f}, ppl {:8.2f} \r'
                .format(epoch, lr, total_nword / elapsed, total_nword / TrainData.nword,
                        cur_loss, math.exp(cur_loss)))
            cur_loss = 0
            nword = 0
    total_loss = total_loss / total_nword
    elapsed = time.time() - start_time
    sys.stdout.write(
        'Epoch {:3d} learn rate {:02.2f} speed {:5.2f} word/sec, train loss {:5.2f}, ppl {:8.2f}, '
        .format(epoch, lr, total_nword / elapsed, total_loss, math.exp(total_loss)))
torch.cuda.manual_seed(args.seed) # data loader and model assert args.type in ['cifar10', 'cifar100'], args.type if args.type == 'cifar10': train_loader, test_loader = dataset.get10(batch_size=args.batch_size, num_workers=1) model = model.cifar10(n_channel=args.channel) else: train_loader, test_loader = dataset.get100(batch_size=args.batch_size, num_workers=1) model = model.cifar100(n_channel=args.channel) model = torch.nn.DataParallel(model, device_ids= range(args.ngpu)) if args.cuda: model.cuda() # optimizer optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) decreasing_lr = list(map(int, args.decreasing_lr.split(','))) print('decreasing_lr: ' + str(decreasing_lr)) best_acc, old_file = 0, None t_begin = time.time() try: # ready to go for epoch in range(args.epochs): model.train() if epoch in decreasing_lr: optimizer.param_groups[0]['lr'] *= 0.1 for batch_idx, (data, target) in enumerate(train_loader): indx_target = target.clone() if args.cuda: data, target = data.cuda(), target.cuda() data, target = Variable(data), Variable(target)
# 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422 splits = [4200, 35000, 180000] elif ntokens > 75000: # WikiText-103 splits = [2800, 20000, 76000] criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False) cprint('Using splits ' + str(criterion.splits)) ### if args.cuda: # Because we have more embedding matrices we have to load them on CPU model = model.cuda() criterion = criterion.cuda() ### params = list(model.parameters()) + list(criterion.parameters()) total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size()) cprint('Args: ' + str(args)) cprint('Model total parameters: ' + str(total_params)) ############################################################################### # Training code ############################################################################### def evaluate(data_source, batch_size=10, return_breakdown=False): # Turn on evaluation mode which disables dropout. model.eval() if args.model == 'QRNN': model.reset()
# --- Script fragment: batchify, model/optimizer construction, and repackage_hidden helper. ---
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
# NOTE: the `model` module is shadowed by the model instance from here on.
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                       args.dropout, args.tied).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True)
#optimizer = Adagrad(model.parameters(), args.lr)
#optimizer = Adam(model.parameters(), betas=(0.9, 0.999))
#optimizer = RMSprop(model.parameters())

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    # Recurses so both a bare tensor (e.g. GRU state) and nested tuples
    # (e.g. an LSTM (h, c) pair) are handled.
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)