Code Example #1
File: main.py Project: Camuslu/examples
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
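
The snippet above targets pre-0.4 PyTorch: `clip_grad_norm` was later renamed to `clip_grad_norm_`, and the manual SGD update now takes an `alpha` keyword. A minimal, self-contained sketch of the same clip-then-update step on current PyTorch (the tiny linear model and hyperparameters are illustrative, not from the project):

import torch
import torch.nn as nn

model = nn.Linear(10, 2)           # stand-in for the language model
criterion = nn.CrossEntropyLoss()
lr, clip = 0.1, 0.25               # illustrative hyperparameters

data, targets = torch.randn(32, 10), torch.randint(0, 2, (32,))

model.zero_grad()
loss = criterion(model(data), targets)
loss.backward()

# clip_grad_norm_ (trailing underscore) is the in-place replacement for clip_grad_norm
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
with torch.no_grad():
    for p in model.parameters():
        p.add_(p.grad, alpha=-lr)  # same manual SGD step as p.data.add_(-lr, p.grad.data)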
Code Example #2
File: train.py Project: UriSha/sigmorphon
 def train(self, epoch_idx, batch_size, max_norm):
     logger, model, data = self.logger, self.model, self.data
     logger.info('At %d-th epoch with lr %f.', epoch_idx,
                 self.optimizer.param_groups[0]['lr'])
     model.train()
     nb_train_batch = ceil(data.nb_train / batch_size)
     for src, src_mask, trg, _ in tqdm(
             data.train_batch_sample(batch_size), total=nb_train_batch):
         out = model(src, src_mask, trg)
         loss = model.loss(out, trg[1:])
         self.optimizer.zero_grad()
         loss.backward()
         if max_norm > 0:
             torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
         logger.debug('loss %f with total grad norm %f', loss,
                      util.grad_norm(model.parameters()))
         self.optimizer.step()
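
`util.grad_norm` here is project-specific; a rough stand-in that computes the total L2 norm over all gradients (the same quantity `clip_grad_norm_` returns) might look like this:

import torch

def grad_norm(parameters):
    """Total L2 norm over all existing gradients."""
    total = 0.0
    for p in parameters:
        if p.grad is not None:
            total += p.grad.detach().norm(2).item() ** 2
    return total ** 0.5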
Code Example #3
File: main.py Project: xiabofei/python_details
def valid(epoch, quesfeaShu, labelShu, lengthShu):
    losses = AverageMeter()
    top1 = AverageMeter()
    model.eval()

    start_time = time.time()
    for i in range(0, len(quesfeaShu) / args.batch_size):
        if i == len(quesfeaShu) / args.batch_size - 1:
            batchend = len(quesfeaShu)
        else:
            batchend = (i + 1) * (args.batch_size)
        # print batchend
        batchstart = i * (args.batch_size)
        batch_size = batchend - batchstart
        quesfeabatch = []
        labelbatch = []
        lengthbatch = []
        quesfeaOri = quesfeaShu[batchstart:batchend]
        labelOri = labelShu[batchstart:batchend]
        lengthOri = lengthShu[batchstart:batchend]
        idxbatch = sorted(range(len(lengthOri)), key=lambda x: lengthOri[x], reverse=True)
        for j in range(len(idxbatch)):
            quesfeabatch.append(quesfeaOri[idxbatch[j]])
            labelbatch.append(labelOri[idxbatch[j]])
            lengthbatch.append(lengthOri[idxbatch[j]])

        questrainarray = np.asarray(quesfeabatch)
        labeltrainarray = np.asarray(labelbatch)
        lengthtrainarray = np.asarray(lengthbatch)

        tmp = [questrainarray, labeltrainarray, lengthtrainarray]
        tmp = [Variable(torch.from_numpy(_), requires_grad=False) for _ in tmp]
        trques, trlabel, length = tmp
        if args.cuda:
            trlabel.cuda()
        output = model(trques, length)
        # print output
        loss = criterion(output, trlabel) / (batch_size)
        prec1, = accuracy(output.data, trlabel.data, topk=(1,), ori_label=labeltrainarray)
        # label 0 or 1
        losses.update(loss.data[0], batch_size)
        top1.update(prec1[0], batch_size)

        # loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        print str(top1.avg) + ' ' + str(loss.data[0]) + ' ' + 'batch_valid ' + str(i)
    # save the model when validation accuracy improves
    global best_score
    if top1.avg > best_score:
        torch.save(model, args.save)
        print 'save model'
        best_score = top1.avg
    print str(top1.avg) + ' ' + str(loss.data[0]) + ' ' + 'epoch_valid ' + str(epoch)
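
Note that this validation loop never calls backward(), so the `clip_grad_norm` call has no effect; on PyTorch 0.4+ the evaluation pass is usually wrapped in torch.no_grad() and the clipping dropped. A minimal sketch (not the project's code):

import torch

def valid_step(model, criterion, batch, targets):
    model.eval()
    with torch.no_grad():        # no autograd bookkeeping, so there is nothing to clip
        output = model(batch)
        loss = criterion(output, targets)
    return loss.item()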
Code Example #4
File: main.py Project: xiabofei/python_details
def train(epoch, optimizer, quesfeaShu, labelShu, lengthShu):
    losses = AverageMeter()
    top1 = AverageMeter()

    model.train()

    for i in range(0, len(quesfeaShu) / args.batch_size):
        if i == len(quesfeaShu) / args.batch_size - 1:
            batchend = len(quesfeaShu)
        else:
            batchend = (i + 1) * (args.batch_size)
        batchstart = i * (args.batch_size)
        batch_size = batchend - batchstart

        quesfeabatch = []
        labelbatch = []
        lengthbatch = []

        quesfeaOri = quesfeaShu[batchstart:batchend]
        labelOri = labelShu[batchstart:batchend]
        lengthOri = lengthShu[batchstart:batchend]

        idxbatch = sorted(range(len(lengthOri)), key=lambda x: lengthOri[x], reverse=True)
        for j in range(len(idxbatch)):
            quesfeabatch.append(quesfeaOri[idxbatch[j]])
            labelbatch.append(labelOri[idxbatch[j]])
            lengthbatch.append(lengthOri[idxbatch[j]])

        questrainarray = np.asarray(quesfeabatch)
        labeltrainarray = np.asarray(labelbatch)
        lengthtrainarray = np.asarray(lengthbatch)

        tmp = [questrainarray, labeltrainarray, lengthtrainarray]
        tmp = [Variable(torch.from_numpy(_), requires_grad=False) for _ in tmp]
        trques, trlabel, length = tmp
        if args.cuda:
            trlabel.cuda()

        output = model(trques, length)
        loss = criterion(output, trlabel) / (batch_size)
        prec1, = accuracy(output.data, trlabel.data, topk=(1,))

        losses.update(loss.data[0], batch_size)
        top1.update(prec1[0], batch_size)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        print str(top1.avg) + ' ' + str(top1.val) + ' ' + str(loss.data[0]) + ' ' + 'batch ' + str(i)
    print str(top1.avg) + ' ' + str(top1.val) + ' ' + str(loss.data[0]) + ' ' + 'epoch ' + str(epoch)
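
The slicing and length-sorting above is the usual preparation for pack_padded_sequence, which historically required batches sorted by length in descending order. A compact sketch of the same idea, with illustrative names:

import numpy as np
import torch

def make_sorted_batch(features, labels, lengths, start, end):
    """Slice [start:end) and reorder the batch by descending sequence length."""
    idx = sorted(range(start, end), key=lambda k: lengths[k], reverse=True)
    ques = torch.from_numpy(np.asarray([features[k] for k in idx]))
    lab = torch.from_numpy(np.asarray([labels[k] for k in idx]))
    lens = torch.from_numpy(np.asarray([lengths[k] for k in idx]))
    return ques, lab, lens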
Code Example #5
File: main.py Project: duyvuleo/awd-lstm-lm
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)

        loss = raw_loss
        # Activation Regularization
        loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
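
The length-randomization trick above (from AWD-LSTM) samples the BPTT window around args.bptt and rescales the learning rate in proportion to the sampled length. As a standalone sketch:

import numpy as np

def sample_seq_len_and_lr(bptt, base_lr, std=5, min_len=5):
    """Sample a BPTT window and the proportionally rescaled learning rate."""
    target = bptt if np.random.random() < 0.95 else bptt / 2.0
    seq_len = max(min_len, int(np.random.normal(target, std)))
    return seq_len, base_lr * seq_len / bptt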
Code Example #6
# python main.py --log_interval 200 --lr 0.1 --nhid 200 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 64
# | end of epoch   1 | time: 143.43s | valid loss  4.94 | valid perplexity   140.00

# python main.py --log_interval 200 --lr 0.1 --nhid 150 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 64
# | end of epoch   1 | time: 121.41s | valid loss  4.98 | valid perplexity   144.75

# python main.py --log_interval 200 --lr 0.1 --nhid 150 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 128
# | end of epoch   1 | time: 89.41s | valid loss  4.97 | valid perplexity   144.64

# python main.py --log_interval 200 --lr 0.1 --nhid 128 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 10 --batch_size 128
# | end of epoch   1 | time: 78.13s | valid loss  4.98 | valid perplexity   145.55

# python main.py --log_interval 200 --lr 0.1 --nhid 128 --nlayer 1 --epochs 40 --dropout 0 --model GRU --lr_decay 0.5 --bptt 12 --batch_size 128
# | end of epoch   1 | time: 74.73s | valid loss  4.99 | valid perplexity   147.03

optimizer = optim.Adagrad(model.parameters(),
                          lr=args.lr,
                          lr_decay=1e-4,
                          weight_decay=1e-5)

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)
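
Since PyTorch 0.4 merged Variable into Tensor, the same detaching is usually written with .detach() (a sketch in the style of the current upstream word_language_model example):

import torch

def repackage_hidden(h):
    """Detach hidden states from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)
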
Code Example #7
def main(model, path):
	print(path)
	t1 = time.time()

	checkpoint_folder = "Model_Checkpoints"
	project_path = os.getcwd()
	save_path = os.path.join(project_path, checkpoint_folder)

	if not os.path.exists(checkpoint_folder):
		os.makedirs(checkpoint_folder)
	else:
		shutil.rmtree(save_path)
		os.makedirs(checkpoint_folder)

	in_features = 300
	hidden_size = 256
	layer_num = 2

	print("\n")
	print(" Loading Data ... ")
	print("="*30)
	print("\n")

	train_dl, valid_dl, trn, vld = dataloader.train_val_loader(path)

	print(" Got train_dataloader and validation_dataloader ")
	print("="*30)
	print("\n")

	print(" Loading LSTM Model ...")
	print("="*30)
	print("\n")
	model = model.Rnn_Lstm(in_features, hidden_size, layer_num, 391)

	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	model.to(device)

	optimizer = optim.Adam(model.parameters(), lr=1e-2)
	criterion = nn.BCEWithLogitsLoss()

	epochs = 10

	print(" Training started ... ")
	print("="*30)
	print("\n")


	for epoch in range(1, epochs + 1):
		checkpoint_name = "checkpoint_"+ str(epoch) +".pth"
		checkpoint_save_path = os.path.join(save_path, checkpoint_name)
		running_loss = 0.0

		model.train() # turn on training mode
		for x, y in tqdm.tqdm(train_dl):
			x, y = x.to(device), y.to(device)
			optimizer.zero_grad()

			preds = model(x)
			loss = criterion(preds, y)
			loss.backward()
			optimizer.step()
			
			running_loss += loss.item() * x.size(0)
			
		epoch_loss = running_loss / len(trn)
		
		# calculate the validation loss for this epoch
		val_loss = 0.0
		model.eval() # turn on evaluation mode
		for x, y in valid_dl:
			x, y = x.to(device), y.to(device)
			preds = model(x)
			loss = criterion(preds, y)
			val_loss += loss.item() * x.size(0)

		val_loss /= len(vld)
		print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f} \n'.format(epoch, epoch_loss, val_loss))
		print("Checkpoint saved after {} epoch\n".format(epoch))
		torch.save(model.state_dict(), checkpoint_save_path)

	print("Training completed -> Finished -- {} \n".format(time.time()-t1))
	print("="*30)
	print("\n")
Code Example #8
                     args.emsize, args.nhid, args.encinit, args.decinit,
                     args.weightinit, args.dropout, args.optim, args.lr,
                     args.tied, args.shuffle, ntokens, args.vocab)
])

print(
    'Pytorch | RnnType | Clip | #Layers | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer| LR | Tied | Shuffle | Ntokens | VocabSize'
)
print(model_config)

# Loop over epochs.
lr = args.lr
prev_val_loss = None
optimizer = None
if args.optim == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
elif args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

best_val_perplex = 99999

for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()
    train(optimizer)
    val_loss = evaluate(val_data)
    if math.exp(val_loss) < best_val_perplex:
        best_val_perplex = math.exp(val_loss)
        if args.save != '':
            # save the model
            torch.save(model, args.save)
            # save model state_dict to avoid pytorch version problems
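
The trailing comment mentions saving the state_dict to avoid PyTorch version problems; the usual pattern (a sketch with assumed filenames) is:

import torch

def save_weights(model, path='model_state.pt'):
    """Save only the state_dict, which is more portable than pickling the whole module."""
    torch.save(model.state_dict(), path)

def load_weights(model, path='model_state.pt'):
    """Rebuild the architecture with the same hyperparameters first, then restore its weights."""
    model.load_state_dict(torch.load(path))
    return model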
Code Example #9
File: train.py Project: zhanghua7099/sgnn
    if '64-64-64' in args.data_path:
        args.input_dim = (64, 64, 64)
args.input_nf = 1
UP_AXIS = 0
print(args)

# specify gpu
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

# create model
model = model.GenModel(args.encoder_dim, args.input_dim, args.input_nf,
                       args.coarse_feat_dim, args.refine_feat_dim,
                       args.num_hierarchy_levels, not args.no_pass_occ,
                       not args.no_pass_feats, args.use_skip_sparse,
                       args.use_skip_dense).cuda()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
if args.retrain:
    print('loading model:', args.retrain)
    checkpoint = torch.load(args.retrain)
    args.start_epoch = args.start_epoch if args.start_epoch != 0 else checkpoint[
        'epoch']
    model.load_state_dict(checkpoint['state_dict'])  #, strict=False)
    optimizer.load_state_dict(checkpoint['optimizer'])
last_epoch = -1 if not args.retrain else args.start_epoch - 1
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                            step_size=args.decay_lr,
                                            gamma=0.5,
                                            last_epoch=last_epoch)
Code Example #10
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(modeltype, ntokens, emsize, nhid, nlayers, dropout,
                       tied)
if cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()
print("number of parameters: ",
      sum(param.numel() for param in model.parameters()))

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)


# get_batch subdivides the source data into chunks of length bptt.
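
The comment refers to the get_batch helper from the upstream word_language_model example, which slices a bptt-long window of inputs together with targets shifted by one token. Reproduced here as a sketch (the bptt default is illustrative):

def get_batch(source, i, bptt=35):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target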
Code Example #11
File: finetune.py Project: pengyulong/mos
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.continue_train:
    model = torch.load(os.path.join(args.save, 'finetune_model.pt'))
else:
    model = torch.load(os.path.join(args.save, 'model.pt'))
if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters())
logging('Args: {}'.format(args))
logging('Model total parameters: {}'.format(total_params))

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
Code Example #12
import torch.optim as optim
import matplotlib.pyplot as plt
"""Parameters and user defined configs for model"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
print(model.parameters)
#parameters(user defined)
in_channels = 3
num_classes = 2
learning_rate = 0.001
BATCH_SIZE = 66
EPOCHS = 20
momentum = 0.9
# Loss function / criterion
loss_function = nn.CrossEntropyLoss()   # Since we use cross-entropy loss, we don't need a Softmax at the end
optimizer = optim.SGD(model.parameters(),lr=learning_rate,momentum=momentum)
"""Actual Training"""
def train(model_net,train_data,label_train,loss_lst):

    for epoch in range(EPOCHS):  # features and labels are already separated,
        for i in range(0, len(train_data), BATCH_SIZE):  # so step through the training data in chunks of BATCH_SIZE;
            # print(i, i+BATCH_SIZE)            # if the data arrived as a single dataset, the built-in torch
                                                # DataLoader could enumerate data and labels in one loop instead
            X_train_batch = train_data[i : i+BATCH_SIZE].to(device=device)  # move the batch to the selected device
            y_train_batch = label_train[i : i+BATCH_SIZE].to(device=device)
            # Forward pass
            y_pred_outs = model_net(X_train_batch.float())  # cast to float, otherwise a runtime error about expecting Double is raised
            loss = loss_function(y_pred_outs,y_train_batch.long())
            loss_lst.append(loss)
            if epoch % 2 ==1:
                print("Epoch number:{} and loss:{}".format(epoch,loss.item()))
Code Example #13
File: finetune.py Project: esvhd/awd-lstm-lm
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.dropouth,
                       args.dropouti, args.dropoute, args.wdrop, args.tied)
if args.cuda:
    model.cuda()
total_params = sum(x.size()[0] *
                   x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in model.parameters())
print('Args:', args)
print('Model total parameters:', total_params)

# criterion = nn.CrossEntropyLoss()

# master branch has a bug here, see this:
# https://github.com/salesforce/awd-lstm-lm/issues/28
# it should not be using CrossEntropyLoss()
splits = []
if ntokens > 500000:
    # One Billion
    # This produces fairly even matrix mults for the buckets:
    # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
    splits = [4200, 35000, 180000]
elif ntokens > 75000:
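
The vocabulary-size thresholds for SplitCrossEntropyLoss recur across the awd-lstm-lm snippets in this section; as a small helper (thresholds copied from the code above):

def choose_splits(ntokens):
    if ntokens > 500000:       # One Billion Word
        return [4200, 35000, 180000]
    if ntokens > 75000:        # WikiText-103
        return [2800, 20000, 76000]
    return []                  # small vocabularies need no split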
Code Example #14
def train_model(model, trainds, testds, config, device, writer=None):
    batch_size = config['data']['batch_size']
    status = config['training']['status']
    epochs = config['training']['epochs']
    balanced_loss = config['loss']['balanced']
    # nval = config['nval']
    nval_tests = config['nval_tests']
    nsave = config['nsave']
    model_save = config['model_save']
    rank = config['rank']
    nranks = config['nranks']
    hvd = config['hvd']
    num_classes = config['data']['num_classes']

    ## create samplers for these datasets
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainds, nranks, rank, shuffle=True, drop_last=True)
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        testds, nranks, rank, shuffle=True, drop_last=True)

    ## create data loaders
    train_loader = torch.utils.data.DataLoader(
        trainds,
        shuffle=False,
        sampler=train_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)
    test_loader = torch.utils.data.DataLoader(
        testds,
        shuffle=False,
        sampler=test_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)

    loss_func = loss.get_loss(config)
    ave_loss = CalcMean.CalcMean()
    acc_func = accuracy.get_accuracy(config)
    ave_acc = CalcMean.CalcMean()

    opt_func = optimizer.get_optimizer(config)
    opt = opt_func(model.parameters(), **config['optimizer']['args'])

    lrsched_func = optimizer.get_learning_rate_scheduler(config)
    lrsched = lrsched_func(opt, **config['lr_schedule']['args'])

    # Add Horovod Distributed Optimizer
    if hvd:
        opt = hvd.DistributedOptimizer(
            opt, named_parameters=model.named_parameters())

        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    model.to(device)

    for epoch in range(epochs):
        logger.info(' epoch %s of %s', epoch, epochs)

        train_sampler.set_epoch(epoch)
        test_sampler.set_epoch(epoch)
        model.to(device)
        for batch_counter, (inputs, targets, class_weights,
                            nonzero_mask) in enumerate(train_loader):

            # move data to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            class_weights = class_weights.to(device)
            nonzero_mask = nonzero_mask.to(device)

            # zero grads
            opt.zero_grad()
            outputs, endpoints = model(inputs)

            # set the weights
            if balanced_loss:
                weights = class_weights
                nonzero_to_class_scaler = torch.sum(
                    nonzero_mask.type(torch.float32)) / torch.sum(
                        class_weights.type(torch.float32))
            else:
                weights = nonzero_mask
                nonzero_to_class_scaler = torch.ones(1, device=device)

            loss_value = loss_func(outputs, targets.long())
            loss_value = torch.mean(
                loss_value * weights) * nonzero_to_class_scaler

            # backward calc grads
            loss_value.backward()

            # apply grads
            opt.step()

            ave_loss.add_value(float(loss_value.to('cpu')))

            # calc acc
            ave_acc.add_value(
                float(acc_func(outputs, targets, weights).to('cpu')))

            # print statistics
            if batch_counter % status == 0:

                logger.info(
                    '<[%3d of %3d, %5d of %5d]> train loss: %6.4f acc: %6.4f',
                    epoch + 1, epochs, batch_counter,
                    len(trainds) / nranks / batch_size, ave_loss.mean(),
                    ave_acc.mean())

                if writer and rank == 0:
                    global_batch = epoch * len(
                        trainds) / nranks / batch_size + batch_counter
                    writer.add_scalars('loss', {'train': ave_loss.mean()},
                                       global_batch)
                    writer.add_scalars('accuracy', {'train': ave_acc.mean()},
                                       global_batch)
                    #writer.add_histogram('input_trans',endpoints['input_trans'].view(-1),global_batch)

                ave_loss = CalcMean.CalcMean()
                ave_acc = CalcMean.CalcMean()

            # release tensors for memory
            del inputs, targets, weights, endpoints, loss_value

            if config['batch_limiter'] and batch_counter > config[
                    'batch_limiter']:
                logger.info('batch limiter enabled, stop training early')
                break

        # save at end of epoch
        torch.save(model.state_dict(),
                   model_save + '_%05d.torch_model_state_dict' % epoch)

        if nval_tests == -1:
            nval_tests = len(testds) / nranks / batch_size
        logger.info('epoch %s complete, running validation on %s batches',
                    epoch, nval_tests)

        model.to(device)
        # every epoch, evaluate validation data set
        with torch.no_grad():

            vloss = CalcMean.CalcMean()
            vacc = CalcMean.CalcMean()

            vious = [CalcMean.CalcMean() for i in range(num_classes)]

            for valid_batch_counter, (inputs, targets, class_weights,
                                      nonzero_mask) in enumerate(test_loader):

                inputs = inputs.to(device)
                targets = targets.to(device)
                class_weights = class_weights.to(device)
                nonzero_mask = nonzero_mask.to(device)

                # set the weights
                if balanced_loss:
                    weights = class_weights
                    nonzero_to_class_scaler = torch.sum(
                        nonzero_mask.type(torch.float32)) / torch.sum(
                            class_weights.type(torch.float32))
                else:
                    weights = nonzero_mask
                    nonzero_to_class_scaler = torch.ones(1, device=device)

                outputs, endpoints = model(inputs)

                loss_value = loss_func(outputs, targets.long())
                loss_value = torch.mean(
                    loss_value * weights) * nonzero_to_class_scaler
                vloss.add_value(float(loss_value.to('cpu')))

                # calc acc
                vacc.add_value(
                    float(acc_func(outputs, targets, weights).to('cpu')))

                # calc ious
                ious = get_ious(outputs, targets, weights, num_classes)
                for i in range(num_classes):
                    vious[i].add_value(float(ious[i]))

                if valid_batch_counter > nval_tests:
                    break

            mean_acc = vacc.mean()
            mean_loss = vloss.mean()
            # if config['hvd'] is not None:
            #    mean_acc  = config['hvd'].allreduce(torch.tensor([mean_acc]))
            #    mean_loss = config['hvd'].allreduce(torch.tensor([mean_loss]))
            mious = float(
                torch.sum(torch.FloatTensor([x.mean()
                                             for x in vious]))) / num_classes
            ious_out = {
                'jet': vious[0].mean(),
                'electron': vious[1].mean(),
                'bkgd': vious[2].mean(),
                'all': mious
            }
            # add validation to tensorboard
            if writer and rank == 0:
                global_batch = epoch * len(
                    trainds) / nranks / batch_size + batch_counter
                writer.add_scalars('loss', {'valid': mean_loss}, global_batch)
                writer.add_scalars('accuracy', {'valid': mean_acc},
                                   global_batch)
                writer.add_scalars('IoU', ious_out, global_batch)

            logger.warning(
                '>[%3d of %3d, %5d of %5d]<<< ave valid loss: %6.4f ave valid acc: %6.4f on %s batches >>>',
                epoch + 1, epochs, batch_counter,
                len(trainds) / nranks / batch_size, mean_loss, mean_acc,
                valid_batch_counter + 1)
            logger.warning('      >> ious: %s', ious_out)

        # update learning rate
        lrsched.step()
Code Example #15
File: main.py Project: batermj/awd-lstm-lm
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)

###############################################################################
# Training code
###############################################################################

def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
Code Example #16
File: main.py Project: pengyulong/mos
def train():
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activation Regularization
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
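
The inner `while start < args.batch_size` loop implements gradient accumulation: each sub-batch loss is scaled by small_batch_size / batch_size so the summed gradients match a single full-batch step. A minimal sketch of the same idea with illustrative names (batch-first here, unlike the time-major slicing above):

def accumulate_step(model, criterion, optimizer, inputs, targets, small_bs):
    batch_size = inputs.size(0)
    optimizer.zero_grad()
    for start in range(0, batch_size, small_bs):
        end = min(start + small_bs, batch_size)
        loss = criterion(model(inputs[start:end]), targets[start:end])
        (loss * (end - start) / batch_size).backward()  # scale before accumulating
    optimizer.step()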
Code Example #17
File: main.py Project: pengyulong/mos
if args.continue_train:
    model = torch.load(os.path.join(args.save, 'model.pt'))
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nhidlast, args.nlayers, 
                       args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, 
                       args.tied, args.dropoutl, args.n_experts)

if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model

total_params = sum(x.data.nelement() for x in model.parameters())
logging('Args: {}'.format(args))
logging('Model total parameters: {}'.format(total_params))

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
Code Example #18
def train():
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    total_student_loss = 0
    start_time = time.time()
    ntokens = len(corpus.vocab)
    hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.eval() # disable dropout
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])

            parallel_rv = parallel_model(*hidden[s_id], input=cur_data, return_h=True, return_student_distill_loss=True, flatten_returned_lists=True, enable_rnd_tune=True)
            # reassemble return values
            log_prob, student_distill_loss = parallel_rv[0], parallel_rv[-1].sum()
            parallel_rv = np.array(parallel_rv[1:-1]).reshape((3, -1)).tolist()
            hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_rv
            
            raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activation Regularization
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            # Student distillation loss
            total_student_loss += student_distill_loss.data / args.batch_size
            #loss = loss + args.distillossw * student_distill_loss
            #loss = student_distill_loss
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            cur_student_loss = total_student_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | distill loss {:5.4f} | post scaling gain0 {:5.4f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), 
                cur_student_loss * args.small_batch_size / args.batch_size, model.rnd_models[0].post_scaling_gain.item()))
            total_loss = 0
            total_student_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
Code Example #19
    train_data = batchify(corpus.train,
                          args.batch_size)  # size(total_len//bsz, bsz)
    val_data = batchify(corpus.valid, eval_batch_size)
    test_data = batchify(corpus.test, eval_batch_size)

    # Build the model
    interval = 200  # interval to report
    ntokens = len(corpus.dictionary)  # 10000
    model = model.RNNModel(ntokens, args.embed_size, args.n_hid, args.n_layers,
                           args.dropout)

    print(model)
    criterion = nn.CrossEntropyLoss()
    l_rate = args.l_rate
    best_val_loss = None
    opt = torch.optim.SGD(model.parameters(), lr=l_rate)
    if args.opt == 'Adam':
        opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
        l_rate = 0.001
    if args.opt == 'Momentum':
        opt = torch.optim.SGD(model.parameters(), lr=l_rate, momentum=0.8)

    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train()
            val_loss = evaluate(val_data)
            print('-' * 80)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | train_loss {:5.2f} | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | train ppl {:8.2f}'.format(
Code Example #20
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.dropouth,
                       args.dropouti, args.dropoute, args.wdrop, args.tied,
                       args.bytes)
if args.cuda:
    model.cuda()
total_params = sum(x.size()[0] *
                   x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in model.parameters())
print('Args:', args)
print('Model total parameters:', total_params)

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
Code Example #21
File: main.py Project: xiabofei/python_details
    if top1.avg > best_score:
        torch.save(model, args.save)
        print 'save model'
        best_score = top1.avg
    print str(top1.avg) + ' ' + str(loss.data[0]) + ' ' + 'epoch_valid ' + str(epoch)


# Loop over epochs.
lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
st(context=27)
best_score = 0
try:
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    for epoch in range(1, args.epochs + 1):
        train(epoch, optimizer, questrainfealistShu, labeltrainlistShu, lengthtrainlistShu)
        valid(epoch, questrainfealistShu_valid, labeltrainlistShu_valid, lengthtrainlistShu_valid)
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')


def test(model, quesfeaShu, labelShu, lengthShu):

    model.eval()

    idx = sorted(range(len(lengthShu)), key=lambda x: lengthShu[x], reverse=True)

    _quesfeaShu = []
Code Example #22
File: test_only.py Project: Daniboy370/Deep-Learning
# Load checkpoint
if args.checkpoint != '':
    if args.cuda:
        model = torch.load(args.checkpoint)
    else:
        # Load GPU model on CPU
        model = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)

if args.cuda:
    model.cuda()
else:
    model.cpu()
print(model)

print('------------------------------------------------------')
print('\t\t Total parameters in model : ', sum(param.numel() for param in model.parameters()))
print('------------------------------------------------------\n')


def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    return [state.detach() for state in h]


def get_batch(source, i, evaluation=False):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = Variable(source[i:i + seq_len], volatile=evaluation)
    target = Variable(source[i + 1:i + 1 + seq_len].view(-1))
    return data, target

Code Example #23
File: train.py Project: xiangfasong/feedbackprop
 def get_trainable_parameters(model):
     for param in model.parameters():
         if param.requires_grad:
             yield param
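
A hedged usage sketch: with some layers frozen via requires_grad = False, the generator above (treated here as a free function) yields only the parameters the optimizer should update.

import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
for p in model[0].parameters():
    p.requires_grad = False                    # freeze the first layer

optimizer = optim.SGD(get_trainable_parameters(model), lr=0.01)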
Code Example #24
File: finetune.py Project: jkkummerfeld/emnlp20lm
eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nout, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
if args.cuda:
    model.cuda()
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters())
print('Args:', args)
print('Model total parameters:', total_params)
sys.stdout.flush()

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
Code Example #25
    print("freeze embedding:", freeze)
    init_embedding = utils.read_pretrained_embeddings(options.word_embeddings,
                                                      w2i, options.embed_dim)
    embed = nn.Embedding.from_pretrained(torch.FloatTensor(init_embedding),
                                         freeze=freeze)
else:
    embed = nn.Embedding(len(w2i), options.embed_dim)

model = model.Generator(embed,
                        options.embed_dim,
                        len(w2i),
                        options.hidden_size,
                        num_layers=options.layers,
                        dropout=options.dropout,
                        use_API=options.API)
optimizer = torch.optim.Adam(model.parameters(), lr=options.lr)
#optimizer=torch.optim.SGD(model.parameters(), lr = options.lr,momentum=0)
criterion = nn.CrossEntropyLoss()

if options.gpu:
    model = model.cuda()
    criterion = criterion.cuda()

loss_meter = meter.AverageValueMeter()

if options.old_model:
    # incremental training
    print("Incremental training from old model: {}".format(options.old_model))
    model.load_state_dict(torch.load(options.old_model))

best_model = "{}_{}_{}_{}".format(options.name, options.API,
Code Example #26
File: main.py Project: abstatic/mos
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nhidlast, args.nlayers, args.dropout,
                           args.dropouth, args.dropouti, args.dropoute,
                           args.wdrop, args.tied, args.dropoutl,
                           args.n_experts)

if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model

total_params = sum(x.data.nelement() for x in model.parameters())
logging('Args: {}'.format(args))
logging('Model total parameters: {}'.format(total_params))

criterion = nn.CrossEntropyLoss()

import pdb
pdb.set_trace()

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
Code Example #27
File: main.py Project: modudeepnlp/SentenceSimiarity
                             shuffle=False,
                             num_workers=config.cpu_processor,
                             drop_last=True)

    dev_loader = DataLoader(dev_data,
                            batch_size=config.batch,
                            shuffle=False,
                            num_workers=config.cpu_processor,
                            drop_last=True)

    # Model setup
    device = torch.device(config.gpu if torch.cuda.is_available() else 'cpu')
    model = model.classifier(vocab_list, embedding)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    loss_function = nn.CrossEntropyLoss()

    # Training
    step_list = []
    loss_list = []
    acc_test_list = []
    acc_dev_list = []
    step = 0
    for i in range(config.epoch):
        print("epoch = ", i)
        start = time.time()
        for n, (label, sent1, sent2) in enumerate(train_loader):
            optimizer.zero_grad()  # 초기화
            label = Variable(label.to(device))
            sent1 = Variable(torch.stack(sent1).to(device))
Code Example #28
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    if args.split_cross:
        criterion = SplitCrossEntropyLoss(args.emsize,
                                          splits=splits,
                                          verbose=False)
    else:
        criterion = nn.CrossEntropyLoss(
        )  # SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
params = list(filter(lambda p: not p is model.scale, model.parameters()))
params = list(params)  # + list(criterion.parameters())
if args.split_cross:
    params = params + list(criterion.parameters())
print(args.split_cross)
if args.cuda:
    model = model.cuda()
    if args.split_cross:
        criterion = criterion.cuda()
    # criterion = criterion.cuda()
    params = list(params)  # + list(criterion.parameters())
###
# for param in params:
#     print(param.size())
total_params = sum(x.size()[0] *
                   x.size()[1] if len(x.size()) > 1 else x.size()[0]
Code Example #29
File: main.py Project: ixaxaar/pytorch-sublstm
def evaluate(data_source):
  # Turn on evaluation mode which disables dropout.
  model.eval()
  total_loss = 0
  ntokens = len(corpus.dictionary)
  hidden = model.init_hidden(eval_batch_size)
  for i in range(0, data_source.size(0) - 1, args.bptt):
    data, targets = get_batch(data_source, i, evaluation=True)
    output, hidden = model(data, hidden)
    output_flat = output.view(-1, ntokens)
    total_loss += len(data) * criterion(output_flat, targets).data
    hidden = repackage_hidden(hidden)
  return total_loss[0] / len(data_source)

if args.optim == 'adam':
  optimizer = optim.Adam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001
if args.optim == 'sparseadam':
  optimizer = optim.SparseAdam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001
if args.optim == 'adamax':
  optimizer = optim.Adamax(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001
elif args.optim == 'rmsprop':
  optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=1e-10) # 0.0001
elif args.optim == 'sgd':
  optimizer = optim.SGD(model.parameters(), lr=args.lr) # 0.01
elif args.optim == 'adagrad':
  optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optim == 'adadelta':
  optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

def train():
  # Turn on training mode which enables dropout.
Code Example #30
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data,
                                                       hidden,
                                                       return_h=True)

        if args.split_cross:
            raw_loss = criterion(model.decoder, output, targets)
        else:
            raw_loss = criterion(output, targets)

        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta *
                              (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
        if not args.collect_stats:
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            if args.clip:
                torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
            optimizer.step()

        total_loss += raw_loss.data
        if model.scale.data.item() < 1:
            model.scale.data.add_(args.scale_alpha)
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            out = ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | ' +\
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f} | scale {:.3}').format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2), model.scale.data.item())
            print(out)
            with open(args.log_out, "a") as f:
                f.write(out + "\n")
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
Code Example #31
# uses ggplot instead of default plotter
matplotlib.style.use('ggplot')

# define the computation device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# a list to save all the reconstructed images in PyTorch grid format
grid_images = []

# initialize the model
model = model.ConvVAE().to(device)
# define the learning parameters
lr = 0.0001
epochs = 200
batch_size = 64
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss(reduction='sum')

# initialize the transform
transform = transform()
# prepare the training and validation data loaders
train_data, valid_data = prepare_dataset(root_path='../input/catsNdogs/')
trainset = LFWDataset(train_data, transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
validset = LFWDataset(valid_data, transform=transform)
validloader = DataLoader(validset, batch_size=batch_size)

train_loss = []
valid_loss = []
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
Code Example #32
File: train.py Project: nunenuh/iris.pytorch
    transforms_target=transforms.NumpyToLongTensor())

train_idx, valid_idx = helper.indice_splitter(iris_dataset, valid_size=0.2)

train_loader = data.DataLoader(iris_dataset,
                               batch_size=BSIZE,
                               sampler=SubsetRandomSampler(train_idx),
                               num_workers=0)
valid_loader = data.DataLoader(iris_dataset,
                               batch_size=BSIZE,
                               sampler=SubsetRandomSampler(valid_idx),
                               num_workers=0)

model = model.IrisNetwork(4, 32, 3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LRATE)

best_loss = 1.5
history = {
    'epoch': [],
    'train_loss': [],
    'valid_loss': [],
}
for epoch in range(NUM_EPOCH):
    batch_time = meter.AverageMeter()
    data_time = meter.AverageMeter()
    losses = meter.AverageMeter()

    end_time = time.time()
    for idx, (x_train, y_train) in enumerate(train_loader):
        data_time.update(time.time() - end_time)
Code Example #33
        rnd_model = model.rnd_models[l]
        rnd_model.freeze_student(args.rnd_nofreeze_student)
        rnd_model.post_scaling_gain.requires_grad=True
        rnd_model.post_scaling_gain.data = torch.scalar_tensor(1.0)
        rnd_model.scaling_coefficient = torch.scalar_tensor(args.rnd_scaling_coefficient)
    model.freeze_for_rnd_distillation()
    
if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model

total_params = sum(x.data.nelement() for x in model.parameters())
logging('Args: {}'.format(args))
logging('Model total parameters: {}'.format(total_params))

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def evaluate(data_source, data_source_mask, batch_size=10, average_ensemble=True):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    total_student_loss = 0
    ntokens = len(corpus.vocab)
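The body of evaluate() is cut off here, and this project's version clearly carries extra machinery (the mask argument, ensemble averaging, a student loss). Purely as a generic illustration, a plain no-grad evaluation loop for a word-level language model could look like the sketch below; get_batch, repackage_hidden and the (output, hidden) model signature follow the usual word_language_model convention and are not taken from this file.

import torch

def evaluate_generic(model, criterion, data_source, ntokens, bptt, batch_size=10):
    model.eval()
    total_loss = 0.0
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            total_loss += len(data) * criterion(output.view(-1, ntokens), targets).item()
    return total_loss / (len(data_source) - 1)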
Code example #34
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    reg_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        l1 = nn.L1Loss(size_average=False)

        weight_l0 = torch.cat((model.rnn.weight_ih_l0, model.rnn.weight_hh_l0),
                              1)  # shape (4*hidden_size, input_size + hidden_size)
        weight_l1 = torch.cat((model.rnn.weight_ih_l1, model.rnn.weight_hh_l1),
                              1)
        weight_l2 = model.decoder.weight  # decoder layer weight of shape (out_features x in_features)

        if l1_reg:
            if args.cuda:
                dummy1 = Variable(torch.cuda.FloatTensor(
                    weight_l0.size()).zero_(),
                                  requires_grad=False)
                dummy2 = Variable(torch.cuda.FloatTensor(
                    weight_l1.size()).zero_(),
                                  requires_grad=False)
            else:
                dummy1 = Variable(torch.FloatTensor(weight_l0.size()).zero_(),
                                  requires_grad=False)
                dummy2 = Variable(torch.FloatTensor(weight_l1.size()).zero_(),
                                  requires_grad=False)
            loss += (0.00001*l1(weight_l0,dummy1)) + \
                    (0.00001*l1(weight_l1,dummy2))

        structure_glasso_reg = 0.00245 * add_structure_glasso(weight_l0, weight_l1, 2) + \
                               0.00245 * add_structure_glasso(weight_l1, weight_l2, 1)

        final = (loss * args.bptt) + structure_glasso_reg

        final.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data
        reg_loss += structure_glasso_reg.data

        # zero out and get statistics
        if batch % args.log_interval == 0 and batch > 0:
            nonzero_cnt = 0.
            total_cnt = 0.

            # zero out gradient
            threshold = zero_threshold
            for param in model.parameters():
                cond = torch.abs(param.data) < threshold
                param.data[cond] = 0

                # statistics
                # variable contains tensor
                s = torch.nonzero(param.data)
                if len(s.shape) != 0:
                    nonzero_cnt += s.shape[0]

                total_cnt += get_num(param.data)

            print("nonzero percentage:", nonzero_cnt / total_cnt)

            weight_l0 = torch.cat(
                (model.rnn.weight_ih_l0, model.rnn.weight_hh_l0), 1)
            weight_l1 = torch.cat(
                (model.rnn.weight_ih_l1, model.rnn.weight_hh_l1), 1)
            weight_l2 = model.decoder.weight

            row_col_sparsity(weight_l0, 'weight_l0')
            row_col_sparsity(weight_l1, 'weight_l1')
            row_col_sparsity(weight_l2, 'weight_l2')

            cur_loss = total_loss[0] / args.log_interval
            cur_reg_loss = reg_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} | reg_loss {:5.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss), cur_reg_loss))
            total_loss = 0
            reg_loss = 0
            start_time = time.time()
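add_structure_glasso and row_col_sparsity are used above but not shown in this excerpt. The sketch below is only a guess at the underlying idea of a structured group-lasso penalty that ties each hidden unit's gate rows in one weight matrix to the column that reads it in the next matrix; the grouping, the helper name, and the role of the third argument in the calls above are not taken from the project.

import torch

def structure_glasso_sketch(weight_out, weight_in, nhid):
    # For each hidden unit h: gather its gate rows in weight_out (PyTorch LSTM
    # weights stack the four gates, so those rows are h, nhid+h, 2*nhid+h,
    # 3*nhid+h) together with column h of weight_in, and penalise the L2 norm
    # of that joint group. Driving a whole group to zero removes the unit from
    # both adjacent layers at once.
    penalty = weight_out.new_zeros(())
    for h in range(nhid):
        rows = weight_out[h::nhid, :]
        col = weight_in[:, h]
        penalty = penalty + torch.sqrt(rows.pow(2).sum() + col.pow(2).sum())
    return penalty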
Code example #35
def test_model(N,Dc,Dd,Db,L,K,X_c=None,X_d=None,X_b=None):

    batch_size = int(N/4)
    epsilon = 1e0

    #  ----------- Model ------------
    gaussian = Gaussian(Dc, L, K)
    categorical = Categorical(Dd, L, K)
    bernoulli = Bernoulli(Db, L, K)

    likelihoods = [gaussian,bernoulli,categorical]

    model = Mixture_Model(N, L, likelihoods)
    optim = torch.optim.Adagrad(model.parameters(), lr=0.01)
    autograd.set_detect_anomaly(True)
    # optim = torch.optim.SGD(model.parameters(),lr=0.001, momentum= 0.9)

    data_set = torch.utils.data.TensorDataset(torch.Tensor(X_c), torch.Tensor(X_d),torch.Tensor(X_b))
    #data_set = torch.utils.data.TensorDataset(torch.Tensor(X_c),torch.Tensor(X_b))
    data_loader = torch.utils.data.DataLoader(data_set, batch_size=batch_size, shuffle=False)  # set shuffle to True?
    #data_loader= torch.utils.data.DataLoader(X_c, batch_size = batch_size, shuffle=False)  # set shuffle to True?

    num_epochs = 100
    ll_list = []
    loss_list = []
    KL_z_list = []
    KL_s_list = []
    rik_epochs = []
    term_1_list = []
    term_2_list = []
    term_3_list = []

    past_loss = 0

    for epoch in range(num_epochs):

        loss_epoch = 0
        ll_epoch = 0
        KL_z_epoch = 0
        KL_s_epoch = 0
        term_1_epoch = 0
        term_2_epoch = 0
        term_3_epoch = 0

        # for x_batch_real, x_batch_discrete in data_loader:
        for index, x_batch in enumerate(data_loader):
            x_batch_real = x_batch[0]
            x_batch_disc = x_batch[1]
            x_batch_bin = x_batch[2]


            # ----- Variational E ----- fix θ
            optim.zero_grad()
            util.fix_model_params(likelihoods, set=False)
            util.fix_variational_params(model, set=True)
            loss, LL, KL_z, KL_s, rik, term_1, term_2,term_3  = model(index, X_c=x_batch_real.numpy(), X_d=x_batch_disc.numpy(), X_b=x_batch_bin.numpy())
            loss.backward()
            optim.step()

            # ----- Variational M ----- fix φ

            optim.zero_grad()
            util.fix_model_params(likelihoods, set=True)
            util.fix_variational_params(model, set=False)
            loss, LL, KL_z, KL_s, rik, term_1, term_2,term_3  = model(index, X_c=x_batch_real.numpy(), X_d=x_batch_disc.numpy(), X_b=x_batch_bin.numpy())
            loss.backward()
            optim.step()
            ll_epoch += LL
            KL_s_epoch += KL_s
            KL_z_epoch += KL_z
            loss_epoch += loss

            term_1_epoch += term_1
            term_2_epoch += term_2
            term_3_epoch += term_3



        #print(f"Epoch = {epoch}, Loglik ={ll_epoch}, -ELBO ={loss_epoch}")
        rik_epochs.append(rik)
        KL_z_list.append(KL_z_epoch)
        KL_s_list.append(KL_s_epoch)
        loss_list.append(loss_epoch)
        term_1_list.append(term_1_epoch)
        term_2_list.append(term_2_epoch)
        term_3_list.append(term_3_epoch)
        ll_list.append(ll_epoch)


    z_mean = model.q_z_mean
    W_c = model.gaussian.W_c
    var_c =model.gaussian.var_c
    W_b = model.bernoulli.W_d
    W_d = model.categorical.W_d
    #W_d = None
    mu_d = model.categorical.mu_d
    #mu_d = None
    mu_b = model.bernoulli.mu_d
    param = torch.nn.functional.softmax(model.q_s_param, dim=1).detach().numpy()
    #print(param)

    profiles = np.argmax(param, axis=1) + 1

    '''
    plt.figure()
    plt.plot(np.arange(num_epochs), KL_z_list)
    plt.title(f'Convergence of KL_z for K={K}')
    plt.xlabel('Epochs')
    plt.ylabel('Kullback-Leibler divergence')
    plt.savefig('KL_z_'+str(K)+'.png')

    plt.figure()
    plt.plot(np.arange(num_epochs), KL_s_list)
    plt.title(f'Convergence of KL_s for K={K}')
    plt.xlabel('Epochs')
    plt.ylabel('Kullback-Leibler divergence')
    plt.savefig('KL_s_'+str(K)+'.png')

    '''

    plt.figure()
    plt.plot(np.arange(num_epochs), term_1_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Gaussian term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('GaussianTerm_'+str(K)+'.png')



    plt.figure()
    plt.plot(np.arange(num_epochs), term_2_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Bernoulli term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('BernoulliTerm_'+str(K)+'.png')

    plt.figure()
    plt.plot(np.arange(num_epochs), term_3_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Categorical term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('CategoricalTerm_'+str(K)+'.png')


    plt.figure()
    plt.plot(np.arange(num_epochs), ll_list)
    plt.plot(np.arange(num_epochs), loss_list)
    plt.title(f'Performance in epochs for K={K}')
    plt.legend(['Likelihood evolution', 'Loss evolution'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('Convergence_'+str(K)+'.png')

    #plt.show()


    return ll_list[-1],z_mean,W_c,W_b,mu_b,mu_d,W_d,var_c,profiles
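A hedged usage example for test_model with synthetic data is shown below; the sizes, the random inputs, and in particular the expected encoding of the categorical and binary blocks are assumptions made purely for illustration.

import numpy as np

# Hypothetical synthetic data: N rows with Dc real-valued, Dd categorical
# (integer-coded here, which may or may not match the model's expectation)
# and Db binary columns; L latent dimensions, K mixture components.
N, Dc, Dd, Db, L, K = 400, 3, 2, 2, 5, 3
X_c = np.random.randn(N, Dc)
X_d = np.random.randint(0, 3, size=(N, Dd)).astype(float)
X_b = np.random.randint(0, 2, size=(N, Db)).astype(float)

ll, z_mean, W_c, W_b, mu_b, mu_d, W_d, var_c, profiles = test_model(
    N, Dc, Dd, Db, L, K, X_c=X_c, X_d=X_d, X_b=X_b)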
Code example #36
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize,
                                      splits=splits,
                                      verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] *
                   x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
Code example #37
def configure_optimizers(model):
    optimizer =  torch.optim.Adam(model.parameters(), lr=1.e-3)
    # optimizer =  torch.optim.SGD(mlp.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
    # loss_function = torch.nn.MSELoss()
    return optimizer, scheduler
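A small usage sketch of how the returned pair is typically consumed: the optimizer is stepped every batch and the StepLR scheduler once per epoch (halving the learning rate every 5 epochs as configured above). The placeholder MLP and random data are mine.

import torch

model = torch.nn.Sequential(torch.nn.Linear(10, 32), torch.nn.ReLU(), torch.nn.Linear(32, 1))
optimizer, scheduler = configure_optimizers(model)
x, y = torch.randn(128, 10), torch.randn(128, 1)

for epoch in range(20):
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()
    scheduler.step()   # decay the learning rate on the epoch schedule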
Code example #38
File: main.py Project: ferdaoussebs/BEng-BME-2020
               shuffle=True,
               num_workers=0,
               drop_last=True),
}

trainloader = DataLoader(train_set,
                         batch_size=batch_size,
                         shuffle=True,
                         num_workers=0)

# Model
model = model.UNet(num_class)
model.to(device)

# Optimizer
optimizer_ft = optim.Adam(model.parameters(), lr=1e-5)

if args.lr_scheduler == "linear":
    lr_scheduler = LambdaLR(optimizer_ft, lr_lambda=lambda epoch: 1.0)
    print("none")

elif args.lr_scheduler == "cyclic":
    print("cyclicLR")
    lr_scheduler = lr_scheduler.CyclicLR(
        optimizer_ft,
        base_lr=1e-5,
        max_lr=0.1,
        step_size_up=100,
        step_size_down=1000,
        mode="exp_range",
        gamma=0.98,
Code example #39
def train():
    if not args.maxent:
        model.train()
        hidden = model.init_hidden(minibatch)
    total_loss = 0.
    total_nword = 0
    cur_loss = 0.
    nword = 0
    start_time = time.time()

    i = 0

    for chunk, (input, target, sent_lens) in enumerate(traindataloader):
        target_packed = pack_padded_sequence(target, sent_lens)[0]
        if not args.nnlm:
            output_me = memodel.forward(input, target, sent_lens)

        if not args.maxent:
            hidden = repackage_hidden(hidden)
            model.zero_grad()
            if args.noisedist == 'uniform':
                noise = noisesampler.draw_uniform(sent_lens[0].item(),
                                                  args.ncesample)
            else:
                noise = noisesampler.draw(sent_lens[0].item(), args.ncesample)
            output_nn = model(input,
                              target,
                              hidden,
                              sent_lens,
                              noise=noise)

        if args.nnlm:
            if args.nce:
                loss = output_nn
            else:
                output = output_nn.view(-1, vocsize)
        elif args.maxent:
            output = output_me.view(-1, vocsize)
        else:
            output = torch.add(output_me.view(-1, vocsize),
                               output_nn.view(-1, vocsize))

        if not args.nce:
            loss = ce_crit(output, target_packed.view(-1))

        # loss.backward()

        # if not args.nnlm:
        #     memodel.update(lr)
        # if not args.maxent:
        #     nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        #     #print('New period',i)
        #     i = i + 1
        #     for p in model.parameters():
        #         if p.grad is not None:
        #             #print(p.grad.data.size())
        #             p.data.add_(-lr, p.grad.data)


        # PyTorch LBFGS optimization (a new optimizer instance is built for each data chunk)

        optimizer = LBFGS_withAdam(model.parameters(),
                                   lr=lr,
                                   max_iter=5,
                                   history_size=10,
                                   use_Adam=use_Adam_flag)

        def closure():
            optimizer.zero_grad()

            if args.noisedist == 'uniform':
                noise = noisesampler.draw_uniform(sent_lens[0].item(),
                                                  args.ncesample)
            else:
                noise = noisesampler.draw(sent_lens[0].item(), args.ncesample)

            output_nn = model(input, target, hidden, sent_lens, noise=noise)

            if args.nnlm:
                if args.nce:
                    loss = output_nn
                else:
                    output = output_nn.view(-1, vocsize)
            elif args.maxent:
                output = output_me.view(-1, vocsize)
            else:
                output = torch.add(output_me.view(-1, vocsize),
                                   output_nn.view(-1, vocsize))

            loss = ce_crit(output, target_packed.view(-1))

            loss.backward(retain_graph=True)

            return loss

        if not args.nnlm:
            memodel.update(lr)
        if not args.maxent:
            nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            loss = optimizer.step(closure)

        nvalidword = torch.sum(sent_lens).item()
        nword += nvalidword
        cur_loss += loss.item() * nvalidword
        total_nword += nvalidword
        total_loss += loss.item() * nvalidword

        if chunk % args.log_interval == 0 and chunk > 0:
            cur_loss = cur_loss / nword
            elapsed = time.time() - start_time
            # sys.stdout.write ('Epoch {:3d}   learn rate {:02.2f} train speed {:5.2f} word/sec, percent: {:5.2f} loss {:5.2f}, ppl {:8.2f} time: fw: {:.2f} s1: {:.2f} bw:{:.2f} \r'.format(epoch, lr, total_nword/elapsed, total_nword/TrainData.nword, cur_loss, math.exp(cur_loss), memodel.time_forward, memodel.time_step1-memodel.time_forward, memodel.time_backward))
            sys.stdout.write(
                'Epoch {:3d}   learn rate {:02.2f} train speed {:5.2f} word/sec, percent: {:5.2f} loss {:5.2f}, ppl {:8.2f} \r'
                .format(epoch, lr, total_nword / elapsed,
                        total_nword / TrainData.nword, cur_loss,
                        math.exp(cur_loss)))
            cur_loss = 0
            nword = 0
    total_loss = total_loss / total_nword
    elapsed = time.time() - start_time
    sys.stdout.write(
        'Epoch {:3d}   learn rate {:02.2f} speed {:5.2f} word/sec, train loss {:5.2f}, ppl {:8.2f},    '
        .format(epoch, lr, total_nword / elapsed, total_loss,
                math.exp(total_loss)))
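LBFGS_withAdam above is project-specific, but the closure pattern it relies on is the standard torch.optim.LBFGS interface: the optimizer may re-evaluate the objective several times within a single step, so the forward and backward passes are wrapped in a closure passed to step(). A minimal standalone illustration on a toy regression problem:

import torch

x = torch.randn(64, 10)
y = torch.randn(64, 1)
net = torch.nn.Linear(10, 1)
optimizer = torch.optim.LBFGS(net.parameters(), lr=0.1, max_iter=5, history_size=10)

def closure():
    # Called repeatedly by LBFGS within one step, so it must zero the
    # gradients and recompute the loss and gradients every time.
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(net(x), y)
    loss.backward()
    return loss

for _ in range(20):
    loss = optimizer.step(closure)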
Code example #40
File: train.py Project: ZJU-PLP/pytorch-playground
    torch.cuda.manual_seed(args.seed)

# data loader and model
assert args.type in ['cifar10', 'cifar100'], args.type
if args.type == 'cifar10':
    train_loader, test_loader = dataset.get10(batch_size=args.batch_size, num_workers=1)
    model = model.cifar10(n_channel=args.channel)
else:
    train_loader, test_loader = dataset.get100(batch_size=args.batch_size, num_workers=1)
    model = model.cifar100(n_channel=args.channel)
model = torch.nn.DataParallel(model, device_ids= range(args.ngpu))
if args.cuda:
    model.cuda()

# optimizer
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
decreasing_lr = list(map(int, args.decreasing_lr.split(',')))
print('decreasing_lr: ' + str(decreasing_lr))
best_acc, old_file = 0, None
t_begin = time.time()
try:
    # ready to go
    for epoch in range(args.epochs):
        model.train()
        if epoch in decreasing_lr:
            optimizer.param_groups[0]['lr'] *= 0.1
        for batch_idx, (data, target) in enumerate(train_loader):
            indx_target = target.clone()
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
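The body of the batch loop is cut off here. A generic sketch of how such a step usually continues (forward pass, cross-entropy, backward, optimizer step, running accuracy against the cloned CPU targets); the train_step helper is an assumption, not this project's code.

import torch

def train_step(model, optimizer, data, target, indx_target):
    optimizer.zero_grad()
    output = model(data)
    loss = torch.nn.functional.cross_entropy(output, target)
    loss.backward()
    optimizer.step()
    pred = output.data.max(1)[1].cpu()   # index of the max log-probability
    correct = pred.eq(indx_target).sum().item()
    return loss.item(), correct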
Code example #41
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    criterion = SplitCrossEntropyLoss(args.emsize,
                                      splits=splits,
                                      verbose=False)
    cprint('Using splits ' + str(criterion.splits))
###
if args.cuda:
    # Because we have more embedding matrices we have to load them on CPU
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] *
                   x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in params if x.size())
cprint('Args: ' + str(args))
cprint('Model total parameters: ' + str(total_params))

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10, return_breakdown=False):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
Code example #42
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True)
#optimizer = Adagrad(model.parameters(), args.lr)
#optimizer = Adam(model.parameters(), betas=(0.9, 0.999))
#optimizer = RMSprop(model.parameters())

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
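get_batch is used throughout these excerpts but never shown. In the PyTorch word_language_model examples it is conventionally written as below; this is a sketch of that convention (with bptt as an explicit parameter instead of args.bptt), not code from any of the projects above.

def get_batch(source, i, bptt=35):
    # Slice a (seq_len, batch) chunk of the batchified data starting at
    # position i, and the corresponding next-token targets flattened for the loss.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target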