def decompositionNet(data, lookBack, batchSize):

    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(data)

    # Split the sequence into samples and arrange them into RNN input form
    trainData, testData = divideTrainTest(dataset)

    trainX, trainY = createSamples(trainData, lookBack, RNN=False)
    testX, testY = createSamples(testData, lookBack, RNN=False)
    print("testX shape:", testX.shape)
    print("testy shape:", testY.shape)
    print("trainX shape:", trainX.shape)
    print("trainy shape:", trainY.shape)

    net1 = DecompositionNetModel(inputDim=24, hiddenNum=100, outputDim=24)
    net2 = RNNModel(inputDim=1,
                    hiddenNum=100,
                    outputDim=1,
                    layerNum=1,
                    cell="RNN")

    optimizer1 = optim.RMSprop(net1.parameters(), lr=1e-4)
    optimizer2 = optim.SGD(net2.parameters(), lr=1e-3)

    prime = net1.forward()
Example #2
 def train_model(self):
     args = self.args
     # Load data
     corpus = Corpus(args.file)
     train_data = train.batchify(corpus.train, args.batch_size, self.device)
     # Build the model
     ntokens = len(corpus.dictionary)
     model = RNNModel(args.model, ntokens, args.emsize, args.nhid,
                      args.nlayers, args.dropout, args.tied).to(self.device)
     # criterion = nn.NLLLoss()
     # criterion = nn.MSELoss()
     criterion = self.args.criterion
     optimizer = optim.Adam(model.parameters(), lr=args.lr)
     # Training code
     # Loop over epochs.
     lr = args.lr
     # At any point you can hit Ctrl + C to break out of training early.
     try:
         for epoch in range(1, args.epochs + 1):
             epoch_start_time = time.time()
             train.train(train_data, args, model, optimizer, criterion,
                         corpus, epoch, lr, self.device)
             print('-' * 89)
             with open(args.save, 'wb') as f:
                 torch.save(model, f)
             lr /= 4.0
     except KeyboardInterrupt:
         print('-' * 89)
         print('Exiting from training early')
     return model
Example #3
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
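
The training loop above relies on repackage_hidden and get_batch, which are not shown in this snippet. A minimal sketch of what these helpers typically look like in word-level LM scripts of this kind (the shapes and the bptt default are assumptions, not taken from the source):

import torch

def repackage_hidden(h):
    """Detach hidden states from their history so backprop stops at the batch boundary."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)

def get_batch(source, i, bptt=35):
    # source: (num_steps, batch_size) tensor of token ids; slice out one BPTT window
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    return data, target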
Example #4
def train():

    # training file on disk
    train_file = args.train_data

    # training data class
    print("starting")
    trainData = textData(train_file, args.vocab_size)
    # model
    model = RNNModel(embedding_size=args.embedding_size,
                     bidir=args.bidir,
                     hidden_units=args.hidden_units,
                     vocab_size=args.vocab_size,
                     batch_size=args.batch_size,
                     num_layers=args.num_layers,
                     num_entities=args.num_entities)

    # create the generator for the training set and validation set
    params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'num_workers': 1,
        'collate_fn': collate_fn
    }

    train_gen = data.DataLoader(trainData, **params)

    max_epochs = args.epochs

    # loss function and optimizer
    loss_func = nn.NLLLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)

    for epoch in range(max_epochs):
        for batch_x, batch_y in train_gen:
            if batch_y.size()[0] < args.batch_size:
                continue
            print(batch_x)
            # make zero grad
            optimizer.zero_grad()

            output = model(batch_x)
            loss = loss_func(output, batch_y)
            loss.backward()
            optimizer.step()
            print(loss)
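
Note that nn.NLLLoss expects log-probabilities, so the RNNModel here is assumed to end with a log_softmax layer; a small self-contained illustration of that contract (the numbers are arbitrary):

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)                 # hypothetical raw scores: batch of 4, 10 classes
log_probs = F.log_softmax(logits, dim=-1)   # what NLLLoss expects as its input
targets = torch.tensor([3, 1, 0, 7])
loss = nn.NLLLoss()(log_probs, targets)     # equivalent to CrossEntropyLoss applied to the raw logits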
Example #5
def build_model(args, corpus):
    criterion = None
    ntokens = len(corpus.dictionary)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    ###
    if args.resume:
        logging.info('Resuming model ...')
        model, criterion, optimizer = model_load(args.resume_path)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
        if args.wdrop:
            from weight_drop import WeightDrop
            for rnn in model.rnns:
                if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
                elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
    ###
    if not criterion:
        splits = []
        if ntokens > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif ntokens > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        logging.info(f'Using {splits}')
        criterion = SplitCrossEntropyLoss(args.emsize,
                                          splits=splits,
                                          verbose=False)
    ###
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] *
                       x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    logging.info(f'Args: {args}')
    logging.info(f'Model total parameters: {total_params}')

    if args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    return model, criterion
Example #6
def build_model(args, ntokens: int):
    """
    Returns model and loss function.
    """
    print('INFO: Building model')
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    if args.cuda:
        print('INFO: Moving model to GPU')
        model.cuda()
    total_params = sum(x.size()[0] *
                       x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in model.parameters())
    print('INFO: Model total parameters:', total_params)

    criterion = nn.CrossEntropyLoss()

    return model, criterion
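
For reference, the parameter count above can be written more compactly with Tensor.numel(); an equivalent one-liner (assuming the same model object as above):

total_params = sum(p.numel() for p in model.parameters())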
Example #7
                     args.n_layers).to(device)

    # Load the dictionaries
    with open(os.path.join(args.data_dir, "char_dict.pkl"), "rb") as f:
        model.char2int = pickle.load(f)

    with open(os.path.join(args.data_dir, "int_dict.pkl"), "rb") as f:
        model.int2char = pickle.load(f)

    print("Model loaded with embedding_dim {}, hidden_dim {}, vocab_size {}.".
          format(args.embedding_dim, args.hidden_dim, args.vocab_size))

    # Train the model.
    # Define Loss, Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    train_main(model, optimizer, criterion, train_loader, num_batches,
               val_batches, args.batch_size, args.max_len, args.epochs,
               args.clip_norm, device)

    # Save the parameters used to construct the model
    model_info_path = os.path.join(args.model_dir, 'model_info.pth')
    with open(model_info_path, 'wb') as f:
        model_info = {
            'n_layers': args.n_layers,
            'embedding_dim': args.embedding_dim,
            'hidden_dim': args.hidden_dim,
            'vocab_size': args.vocab_size,
            'drop_rate': 0.2
        }
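
The snippet is cut off inside the with block; presumably the collected dictionary is then written out, e.g. (an assumed continuation, not shown in the source):

        torch.save(model_info, f)  # persist the constructor arguments for later reloading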
Example #8
def run(args):

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    ###############################################################################
    # Load data
    ###############################################################################

    def model_save(fn):
        with open(fn, 'wb') as f:
            torch.save([model, optimizer], f)

    def model_load(fn):
        global model, criterion, optimizer
        with open(fn, 'rb') as f:
            model, optimizer = torch.load(f)

    import os
    import hashlib
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    # get token frequencies and eos_tokens
    frequencies, eos_tokens = None, None
    if not args.uni_freq: frequencies = corpus.frequencies
    if args.reinit_h: eos_tokens = corpus.reset_idxs

    # batchify
    eval_batch_size = 1
    test_batch_size = 1
    print(corpus.dictionary)
    if args.reinit_h:
        ntokens = len(corpus.dictionary) + 1 if args.batch_size > 1 else len(corpus.dictionary)
        train_data, seq_lens = batchify_padded(corpus.train, args.batch_size, args, ntokens, eos_tokens)    
    else:
        ntokens = len(corpus.dictionary)
        train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    ###############################################################################
    # Build the model
    ###############################################################################

    model = RNNModel(ntokens, args.emsize, args.nhid, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.nsamples,
                    args.temperature, frequencies, args.no_bias, args.bias_reg, args.dist_fn, args.activation_fn)
    ###
    if args.resume:
        print('Resuming model ...')
        model_load(args.resume)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute

    ###
    if args.cuda:
        model = model.cuda()

    ###
    params = list(model.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
    print('Args:', args)
    print('Model total parameters:', total_params)

    ###############################################################################
    # Training code
    ###############################################################################

    def evaluate(data_source, epoch, batch_size=1):
        # Turn on evaluation mode which disables dropout.
        model.eval()

        if args.dump_hiddens:
            loss, entropy, hiddens = model.evaluate(data_source, eos_tokens, args.dump_hiddens)
            dump_hiddens(hiddens, 'hiddens_' + str(epoch))
        else:
            loss, entropy = model.evaluate(data_source, eos_tokens)
        
        if args.dump_words:
            dump_words(model.encoder.weight.detach().cpu().numpy(), 'words_' + str(epoch))

        if args.dump_entropy is not None:
            dump(entropy, args.dump_entropy + str(epoch))

        return loss


    def train():
        # Turn on training mode which enables dropout.
        total_loss, avrg_loss = 0, 0
        start_time = time.time()
        ntokens = len(corpus.dictionary)
        batch, i = 0, 0
        hidden = model.init_hidden(args.batch_size)
        while i < train_data.size(0)-1:

            if args.reinit_h:
                seq_len = seq_lens[batch] - 1
            else:
                bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
                # Prevent excessively small or negative sequence lengths
                seq_len = max(5, int(np.random.normal(bptt, 5)))
                # There's a very small chance that it could select a very long sequence length resulting in OOM
                # seq_len = min(seq_len, args.bptt + 10)

            lr2 = optimizer.param_groups[0]['lr']
            optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            model.train()
            data = get_batch(train_data, i, args, seq_len=seq_len)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            reset_hidden = args.reinit_h
            if reset_hidden:
                hidden = model.init_hidden(args.batch_size)

            hidden = repackage_hidden(hidden)
            optimizer.zero_grad()

            #raw_loss = model.train_crossentropy(data, eos_tokens)
            raw_loss, hidden = model(data, hidden)

            loss = raw_loss
            '''
            See what we can do here! We don't need the regularization as it is implicit!

            # Activation Regularization
            if args.alpha: loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            if args.beta: loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            '''
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
            optimizer.step()

            total_loss += loss.data
            optimizer.param_groups[0]['lr'] = lr2
            if batch % args.log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                    epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
                avrg_loss = avrg_loss + total_loss
                total_loss = 0
                start_time = time.time()
            ###
            batch += 1
            i += seq_len + 1

        return avrg_loss / train_data.size(0)

    # Loop over epochs.
    lr = args.lr
    best_val_loss = []
    valid_loss = []
    stored_loss = 100000000

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        optimizer = None
        # Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. Adaptive Softmax)
        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
        if args.optimizer == 'adam':
            optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
        for epoch in range(1, args.epochs+1):
            epoch_start_time = time.time()
            train_loss = train()
            _, s, _= np.linalg.svd(model.rnn.module.weight_hh_l0.cpu().detach().numpy())
            print(s[0])
            #dump(model.decoder.bias.cpu().detach().numpy(), 'bias_' + str(epoch) +'.out')
            
            # skip to beginning if not in evaluation mode
            if epoch % args.evaluate_every > 0:
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} |'.format(
                        epoch, (time.time() - epoch_start_time), train_loss))
                print('-' * 89) 
                continue

            # evaluate validation loss 
            if 't0' in optimizer.param_groups[0]:
                tmp = {}
                for prm in model.parameters():
                    #if 'ax' in optimizer.state[prm]:
                    tmp[prm] = prm.data.clone()
                    if 'ax' in optimizer.state[prm]:
                        prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(val_data, epoch)
                valid_loss.append(val_loss2)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
                print('-' * 89)

                if val_loss2 < stored_loss:
                    model_save(args.save)
                    print('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()

            else:
                val_loss = evaluate(val_data, epoch, eval_batch_size)
                valid_loss.append(val_loss)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                  epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2)))
                print('-' * 89)

                if val_loss < stored_loss:
                    model_save(args.save)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[0] and (len(best_val_loss)>args.nonmono and val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)

                if epoch in args.when:
                    print('Saving model before learning rate decreased')
                    model_save('{}.e{}'.format(args.save, epoch))
                    print('Dividing learning rate by 10')
                    optimizer.param_groups[0]['lr'] /= 10.

                best_val_loss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    model_load(args.save)

    # Run on test data.
    test_loss = evaluate(test_data, args.epochs+1, test_batch_size)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'.format(
        test_loss, math.exp(test_loss), test_loss / math.log(2)))
    print('=' * 89)

    return np.array(valid_loss), test_loss
Example #9
print(args)
input_size = 88
X_train, X_valid, X_test = data_generator(args.data)

nhid = args.nhid
dropout = args.dropout
rnn_type = args.rnn_type

model = RNNModel(rnn_type, input_size, input_size, nhid)

if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()
lr = args.lr
optimizer = getattr(optim, args.optim)(model.parameters(), lr=lr)


def evaluate(X_data):
    model.eval()
    eval_idx_list = np.arange(len(X_data), dtype="int32")
    total_loss = 0.0
    count = 0
    for idx in eval_idx_list:
        data_line = X_data[idx]
        x, y = Variable(data_line[:-1]), Variable(data_line[1:])
        if args.cuda:
            x, y = x.cuda(), y.cuda()
        output = model(x.unsqueeze(0)).squeeze(0)
        loss = -torch.trace(
            torch.matmul(y,
Example #10
else:
    print('Building model and criterion...')
    model = RNNModel(corpus.ntoken, args.emsize, corpus.weight, args.nhid,
                     args.nlayers, args.dropouti, args.dropoutrnn,
                     args.dropout, args.wdrop)

if args.cuda:
    model = model.cuda()

print('-' * 89)
print('Args:', args)
print('Model parameters:', count_parameters(model))

criterion = nn.CrossEntropyLoss()
if args.optimizer == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                weight_decay=args.wdecay)
else:
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.wdecay)


###############################################################################
# evaluate function
###############################################################################
def evaluate(model, criterion, data_source, batch_size):
    total_loss = 0
    model.eval()
    with torch.no_grad():
        hidden = None
        for i in range(0, data_source.size(0) - 1, args.bptt):
Example #11
                       tieweights=args.tieweights)
else:
    LMModel_start = torch.load(args.start_model).cpu()
    # Note: be careful if the model class has different methods from the loaded one
    LMModel = RNNModel(vocab_size=vocab_size,
                       embed_size=args.embedsz,
                       hidden_size=args.hiddensz,
                       num_layers=args.numlayers,
                       dropout=args.dropout,
                       padid=padid,
                       tieweights=args.tieweights)
    LMModel.load_state_dict(LMModel_start.state_dict())

# LMModel = torch.load(args.save).cpu()

model_size = sum(p.nelement() for p in LMModel.parameters())
logging('-' * 30, f_log=f_log)
logging(f'Model total parameters: {model_size}', f_log=f_log)
logging('-' * 30, f_log=f_log)

# print('-' * 30)
# print(f'Model total parameters: {model_size}')
# print('-' * 30)

if torch.cuda.is_available() and cuda_device != 'cpu':
    LMModel = LMModel.cuda(cuda_device)

LMModel_parallel = None
if torch.cuda.is_available() and args.devids != 'off':
    LMModel_parallel = torch.nn.DataParallel(LMModel,
                                             device_ids=device_ids,
Example #12
    collate_fn=lambda x: lm_collate(x, tokenizer.term2id['PAD']))
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    num_workers=NUM_WORKERS,
    collate_fn=lambda x: lm_collate(x, tokenizer.term2id['PAD']))

model = RNNModel(ntokens,
                 100,
                 100,
                 dropout=0.0,
                 pad_token=tokenizer.term2id['PAD'])
model = cudalize(model)
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.0)

cross_entropy = nn.CrossEntropyLoss()


def loss_function(preds, labels, lens):
    # TODO: delete padding
    new_preds, new_labels = [], []
    for pred, label, l in zip(preds, labels, lens):
        new_preds.append(pred[:l])
        new_labels.append(label[:l])
    preds = torch.cat(new_preds, dim=0)
    labels = torch.cat(new_labels, dim=0)
    return cross_entropy(preds, labels)
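
An equivalent way to drop the padded positions is to build a boolean mask over the time dimension instead of concatenating per-sequence slices; a minimal sketch, reusing the cross_entropy criterion above and assuming preds is (batch, max_len, n_classes), labels is (batch, max_len), and lens is a 1-D LongTensor:

def loss_function_masked(preds, labels, lens):
    # mask[b, t] is True for the first lens[b] time steps of sequence b
    mask = torch.arange(labels.size(1), device=labels.device)[None, :] < lens[:, None]
    return cross_entropy(preds[mask], labels[mask])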

Example #13
hidden_dim = 200
num_layers = 4
lr = 1e-3
log_dir = './ckpt'
model_name = 'model.pth'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(f"Found {device} ...")
print("Instantiating RNN Model")

if not os.path.exists(log_dir):
    os.mkdir(log_dir)
model_save_path = os.path.join(log_dir, model_name)
model = RNNModel(x_train.shape[-1], hidden_dim, num_layers,
                 y_train.shape[-1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

print("< Training starts >")
model = train(model, dataloader_train, dataloader_val, device, criterion,
              optimizer, n_epochs, model_save_path)

print("Testing on test data-set ")
log_dir = './ckpt'
model_name = 'model.pth'
model_save_path = os.path.join(log_dir, model_name)
output_dim = 4
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = RNNModel(x_test.shape[-1], hidden_dim, num_layers,
                 output_dim).to(device)
y_test_pred = test(x_test, model, model_save_path, device)
Example #14
File: main.py Project: Phlix1/exps
D = args.emsize if args.proj else args.nhid
ss = SampledSoftmax(ntokens, nsampled, D, tied_weight=twht)

net.add_module("encoder", encoder)
net.add_module("decoder", ss)
net.cuda()
tmp_net = net
if world_size >= 1:
    tmp_net = DDP(net)
tmp_net.init_hidden = net.init_hidden
net = tmp_net

print("Batch Size:", args.batch_size * args.scale, "Initial LR:",
      args.lr * args.scale)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), args.lr * args.scale, betas=(0.9, 0.999))
scheduler = LinearLR(optimizer,
                     base_lr=args.lr * args.scale,
                     max_iters=train_corpus.batch_num * args.epochs,
                     last_iter=-1,
                     min_lr=1e-8)

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h, device_id=0):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if isinstance(h, Variable):
        return Variable(h.data).cuda(device_id)
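
The helper is truncated here; a sketch of how the non-tensor (tuple) case is usually handled, using a hypothetical name so as not to claim it matches the original:

def repackage_hidden_full(h, device_id=0):
    """Detach hidden states (a tensor or a nested tuple of tensors) from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach().cuda(device_id)
    return tuple(repackage_hidden_full(v, device_id) for v in h)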
Example #15
    # Set dataloader
    train_loader = DataLoader(dataset=TensorDataset(torch.FloatTensor(x_train),
                                                    torch.LongTensor(y_train)),
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=4)

    valid_loader = DataLoader(dataset=TensorDataset(torch.FloatTensor(x_valid),
                                                    torch.LongTensor(y_valid)),
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=4)

    print('Initial RNN model.')
    model = RNNModel(input_dim, output_dim).cuda()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001, alpha=0.9)
    loss_func = torch.nn.CrossEntropyLoss()

    print('Start training.')

    best_ed = 999
    early_stop_cnt = 0

    for epoch in range(1, epochs + 1):

        print('Epoch: {}/{}'.format(epoch, epochs))

        total_loss, total_acc, nonzeros = 0, 0, 0

        widgets = [
            FormatLabel(''), ' ',
Example #16
train_loader, test_loader = data_generator(root, batch_size)

permute = torch.Tensor(np.random.permutation(784).astype(np.float64)).long()
model = RNNModel(rnn_type="LSTM",
                 ntoken=n_classes,
                 ninp=n_inputs,
                 nhid=nhid,
                 nlayers=args.num_layers)

if args.cuda:
    model.cuda()
    permute = permute.cuda()

lr = args.lr
#optimizer = getattr(optim, args.optim)(model.parameters(), lr=lr)
optimizer = optim.RMSprop(model.parameters(), lr=lr, momentum=0.9)


def train(ep):
    global steps
    train_loss = 0
    model.train()

    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda: data, target = data.cuda(), target.cuda()
        data = data.view(-1, 1, seq_length)
        if args.permute:
            data = data[:, :, permute]
        # Data should be seq_len, batch, input_size,
        data = data.permute(2, 0, 1)
        data, target = Variable(data), Variable(target)
Example #17
def train():
    # Load the data and configure the model
    print("Loading data...")
    corpus = Corpus(train_dir)
    print(corpus)

    config = Config()
    config.vocab_size = len(corpus.dictionary)
    train_data = batchify(corpus.train, config.batch_size)
    train_len = train_data.size(0)
    seq_len = config.seq_len

    print("Configuring model...")
    model = RNNModel(config)
    if use_cuda:
        model.cuda()
    print(model)

    criterion = nn.CrossEntropyLoss()
    lr = config.learning_rate  # initial learning rate
    start_time = time.time()

    print("Training and generating...")
    for epoch in range(1, config.num_epochs + 1):  # train for multiple epochs
        total_loss = 0.0
        model.train()  # dropout is only active in training mode
        hidden = model.init_hidden(config.batch_size)  # initialize the hidden state

        for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
            data, targets = get_batch(train_data, i, seq_len)  # fetch one batch of data
            # Before each batch, detach the hidden state from how it was previously produced.
            # Otherwise the model would try to backpropagate all the way to the start of the dataset.
            hidden = repackage_hidden(hidden)
            model.zero_grad()

            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, config.vocab_size), targets)
            loss.backward()  # backpropagation

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs/LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip)
            for p in model.parameters():  # manual SGD parameter update
                p.data.add_(p.grad.data, alpha=-lr)

            total_loss += loss.item()  # accumulate the loss

            if ibatch % config.log_interval == 0 and ibatch > 0:  # report status every log_interval batches
                cur_loss = total_loss / config.log_interval
                elapsed = get_time_dif(start_time)
                print(
                    "Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}"
                    .format(epoch, ibatch, train_len // seq_len, lr, cur_loss,
                            math.exp(cur_loss), elapsed))
                total_loss = 0.0
        lr /= 4.0  # shrink the learning rate after each full epoch

        # save the model parameters every save_interval epochs
        if epoch % config.save_interval == 0:
            torch.save(model.state_dict(),
                       os.path.join(save_dir, model_name.format(epoch)))

        print(''.join(generate(model, corpus.dictionary.idx2word)))
Example #18
    config = Config()
    config.vocab_size = len(corpus.dictionary)
    train_data = batchify(corpus.train, config.batch_size)
    train_len = train_data.size(0)
    seq_len = config.seq_len

    print("Configuring model...")
    model = RNNModel(config)
    if use_cuda:
        model.cuda()
    print(model)

    criterion = nn.CrossEntropyLoss()
    lr = config.learning_rate  # initial learning rate
    best_train_loss = None
    optimizer = torch.optim.Adam(model.parameters())

    print("Training and generating...")
    try:
        for epoch in range(1, config.num_epochs + 1):  # train for multiple epochs
            epoch_start_time = time.time()
            train_loss = train()

            print('-' * 89)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} | '
                'train ppl {:8.2f}'.format(epoch,
                                           (time.time() - epoch_start_time),
                                           train_loss, math.exp(train_loss)))
            print('-' * 89)
            # Save the model if the validation loss is the best we've seen so far.
Example #19
    def __init__(self, save_path, seed=111, val_interval=20, val_times=1, controller=None, batch_size=128, grad_clip=0.1, config='eval'):

        args = {'emsize':850, 'nhid':850, 'nhidlast':850, 'dropoute':0.1, 'wdecay':8e-7}
        args['config'] = config

        args['data'] = '../data/penn'
        args['lr'] = 20
        args['clip'] = grad_clip
        args['batch_size'] = batch_size
        args['search_batch_size'] = 256*4
        args['small_batch_size'] = batch_size
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = seed
        args['nonmono'] = 5
        args['log_interval'] = val_interval
        args['val_times'] = val_times
        args['save'] = save_path
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args = AttrDict(args)
        self.args = args
        self.seed = seed
        self.controller = controller
        
        
        
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled=True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        eval_batch_size = 64
        test_batch_size = 1
        args.eval_batch_size = eval_batch_size
        
        
        
        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        
#         self.val_data = batchify(corpus.train[464794:], eval_batch_size, args)
#         self.test_data = batchify(corpus.test, test_batch_size, args)
#         raw_data = batchify(corpus.train, batch_size, None)
#         indx = np.arange(14524)
#         random.shuffle(indx)
        
#         self.train_data = raw_data[indx[0:int(14524/2)],:]
#         self.val_data = raw_data[indx[int(14524/2):],:]
        
        raw_data = batchify(corpus.valid, 1, None)
        val_data = []
        for i in range(len(raw_data)-1-args.bptt):
            val_data.append(raw_data[i:i+args.bptt+1])
        val_data = torch.cat(val_data,1)
        self.val_data = val_data
        
        
        print(self.train_data.shape)
        print(self.search_data.shape)
        print(self.val_data.shape)
        
        self.batch = 0
        self.steps = 0
        self.epochs = 0
        self.total_loss = 0
        self.start_time = time.time()


        ntokens = len(corpus.dictionary)
        #if args.continue_train:
        #    model = torch.load(os.path.join(args.save, 'model.pt'))
#         try:
#             model = torch.load(os.path.join(args.save, 'model.pt'))
#             print('Loaded model from checkpoint')
#         except Exception as e:
#             print(e)
        model = RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast,
               args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute, genotype=genotypes.DARTS)

        size = 0
        for p in model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(model.rnns[0].genotype)

        total_params = sum(x.data.nelement() for x in model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = model.cuda()
        self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
Example #20
def train():
    best_val_loss = 100

    ntokens = len(corpus.dictionary)
    train_data = batchify(corpus.train, args.batch_size)  # num_batches, batch_size
    val_data = batchify(corpus.valid, args.batch_size)
    model = RNNModel(rnn_type=args.model,
                     ntoken=ntokens,
                     ninp=args.emsize,
                     nfeat=args.nfeat,
                     nhid=args.nhid,
                     nlayers=args.nlayers,
                     font_path=args.font_path,
                     font_size=args.font_size,
                     dropout=args.dropout,
                     tie_weights=args.tied,
                     ).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    print('start training...')
    hidden = model.init_hidden(args.batch_size)
    epoch_start_time = time.time()

    for epoch in range(args.epochs):

        model.eval()  # evaluate on the validation set
        total_loss = 0.
        with torch.no_grad():
            for idx in range(0, val_data.size(0) - 1, args.bptt):
                data, targets = get_batch(val_data, idx)
                output, hidden = model(data, hidden)
                output_flat = output.view(-1, ntokens)  # (seq_len, batch, ntokens) -> (seq_len*batch, ntokens)
                total_loss += len(data) * criterion(output_flat, targets.view(-1)).item()
                hidden = repackage_hidden(hidden)
        val_loss = total_loss / len(val_data)
        best_val_loss = min(best_val_loss, val_loss)
        print('-' * 100)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f} | best valid ppl {:8.2f}'
              .format(epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), math.exp(best_val_loss)))
        print('-' * 100)
        epoch_start_time = time.time()
        if val_loss == best_val_loss:  # Save the model if the validation loss is best so far.
            torch.save(model, os.path.join(args.save, 'model.pkl'))
        else:
            args.lr /= 4.0

        model.train()  # train on the training set
        total_loss = 0.
        start_time = time.time()
        for i, idx in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
            data, targets = get_batch(train_data, idx)
            hidden = repackage_hidden(hidden)
            model.zero_grad()  # compute the loss and gradients
            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, ntokens), targets.view(-1))
            loss.backward()
            total_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)  # clip gradients, then update the parameters
            optimizer.step()
            # for p in model.parameters():
            #     p.data.add_(-args.lr, p.grad.data)

            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} |loss {:5.2f} | ppl {:8.2f}'
                      .format(epoch + 1, i, len(train_data) // args.bptt, args.lr, elapsed * 1000 / args.log_interval,
                              cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()
""" ----------- Model Creation ------------"""
number_tokens = len(corpus.dictionary)  # Number of unique word in our corpus

model = RNNModel(rnn_type = rt,
                ntoken = number_tokens,
                ninp = embedding_size,
                nhid = number_hidden,
                nlayers = number_layer,
                drop_rate = dropout,
                tie_weights = tied)

if cuda and torch.cuda.is_available():
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr = learning_rate)

""" ----------- Training Code ------------"""
def detach_hidden(h): # detach from distant history
    if type(h) == V:
        return V(h.data)
    else:
        return tuple(detach_hidden(v) for v in h)


def get_batch(source, i, sequence_length):
    seq_len = min(sequence_length, len(source) - 1 - i)
    # torch.cat([data.data.view(-1).unsqueeze(-1), target.data.unsqueeze(-1)], dim=1)
    data = V(source[i:i+seq_len]).type(LongType)
    target = V(source[i+1:i+1+seq_len].view(-1)).type(LongType)
    return data, target
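
A minimal sketch of how get_batch and detach_hidden are typically combined in the training loop that follows this setup (the loop structure and variable names below are assumptions, not taken from the source):

hidden = model.init_hidden(batch_size)              # assumed helper on the model
for i in range(0, train_source.size(0) - 1, sequence_length):
    data, targets = get_batch(train_source, i, sequence_length)
    hidden = detach_hidden(hidden)                  # stop backprop at the window boundary
    optimizer.zero_grad()
    output, hidden = model(data, hidden)
    loss = criterion(output.view(-1, number_tokens), targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
    optimizer.step()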
Example #22
class DartsTrainer():
    def __init__(self, arm):
        # Default params for eval network
        args = {
            'emsize': 850,
            'nhid': 850,
            'nhidlast': 850,
            'dropoute': 0.1,
            'wdecay': 8e-7
        }

        args['data'] = '/home/liamli4465/darts/data/penn'
        args['lr'] = 20
        args['clip'] = 0.25
        args['batch_size'] = 64
        args['search_batch_size'] = 256 * 4
        args['small_batch_size'] = 64
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = arm['seed']
        args['nonmono'] = 5
        args['log_interval'] = 50
        args['save'] = arm['dir']
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args['genotype'] = arm['genotype']
        args = AttrDict(args)
        self.args = args
        self.epoch = 0

        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        self.eval_batch_size = 10
        self.test_batch_size = 1

        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        self.val_data = batchify(corpus.valid, self.eval_batch_size, args)
        self.test_data = batchify(corpus.test, self.test_batch_size, args)

        self.ntokens = len(corpus.dictionary)

    def model_save(self, fn, to_save):
        if self.epoch % 150 == 0:
            with open(
                    os.path.join(self.args.save,
                                 "checkpoint-incumbent-%d" % self.epoch),
                    'wb') as f:
                torch.save(to_save, f)

        with open(fn, 'wb') as f:
            torch.save(to_save, f)

    def model_load(self, fn):
        with open(fn, 'rb') as f:
            self.model, self.optimizer, rng_state, cuda_state = torch.load(f)
            torch.set_rng_state(rng_state)
            torch.cuda.set_rng_state(cuda_state)

    def model_resume(self, filename):
        logging.info('Resuming model from %s' % filename)
        self.model_load(filename)
        self.optimizer.param_groups[0]['lr'] = self.args.lr
        for rnn in self.model.rnns:
            rnn.genotype = self.args.genotype

    def train_epochs(self, epochs):
        args = self.args
        resume_filename = os.path.join(self.args.save, "checkpoint.incumbent")
        if os.path.exists(resume_filename):
            self.model_resume(resume_filename)
            logging.info('Loaded model from checkpoint')
        else:
            self.model = RNNModel(self.ntokens,
                                  args.emsize,
                                  args.nhid,
                                  args.nhidlast,
                                  args.dropout,
                                  args.dropouth,
                                  args.dropoutx,
                                  args.dropouti,
                                  args.dropoute,
                                  genotype=args.genotype)
            self.optimizer = torch.optim.SGD(self.model.parameters(),
                                             lr=args.lr,
                                             weight_decay=args.wdecay)

        size = 0
        for p in self.model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(self.model.rnns[0].genotype)

        total_params = sum(x.data.nelement() for x in self.model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = self.model.cuda()
        # Loop over epochs.
        lr = args.lr
        best_val_loss = []
        stored_loss = 100000000

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            for epoch in range(epochs):
                epoch_start_time = time.time()
                self.train()
                if 't0' in self.optimizer.param_groups[0]:
                    tmp = {}
                    for prm in self.model.parameters():
                        tmp[prm] = prm.data.clone()
                        prm.data = self.optimizer.state[prm]['ax'].clone()

                    val_loss2 = self.evaluate(self.val_data)
                    logging.info('-' * 89)
                    logging.info(
                        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                            self.epoch,
                            (time.time() - epoch_start_time), val_loss2,
                            math.exp(val_loss2), val_loss2 / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss2 < stored_loss:
                        self.model_save(
                            os.path.join(args.save, 'checkpoint.incumbent'), [
                                self.model, self.optimizer,
                                torch.get_rng_state(),
                                torch.cuda.get_rng_state()
                            ])
                        logging.info('Saving Averaged!')
                        stored_loss = val_loss2

                    for prm in self.model.parameters():
                        prm.data = tmp[prm].clone()

                else:
                    val_loss = self.evaluate(self.val_data,
                                             self.eval_batch_size)
                    logging.info('-' * 89)
                    logging.info(
                        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                            self.epoch,
                            (time.time() - epoch_start_time), val_loss,
                            math.exp(val_loss), val_loss / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss < stored_loss:
                        self.model_save(
                            os.path.join(args.save, 'checkpoint.incumbent'), [
                                self.model, self.optimizer,
                                torch.get_rng_state(),
                                torch.cuda.get_rng_state()
                            ])
                        logging.info('Saving model (new best validation)')
                        stored_loss = val_loss

                    if (self.epoch > 75
                            and 't0' not in self.optimizer.param_groups[0] and
                        (len(best_val_loss) > args.nonmono
                         and val_loss > min(best_val_loss[:-args.nonmono]))):
                        logging.info('Switching to ASGD')
                        self.optimizer = torch.optim.ASGD(
                            self.model.parameters(),
                            lr=args.lr,
                            t0=0,
                            lambd=0.,
                            weight_decay=args.wdecay)

                    best_val_loss.append(val_loss)

        except Exception as e:
            logging.info('-' * 89)
            logging.info(e)
            logging.info('Exiting from training early')
            return 0, 10000, 10000

        # Load the best saved model.
        self.model_load(os.path.join(args.save, 'checkpoint.incumbent'))

        # Run on test data.
        val_loss = self.evaluate(self.val_data, self.eval_batch_size)
        logging.info(math.exp(val_loss))
        test_loss = self.evaluate(self.test_data, self.test_batch_size)
        logging.info('=' * 89)
        logging.info(
            '| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'
            .format(test_loss, math.exp(test_loss), test_loss / math.log(2)))
        logging.info('=' * 89)

        return 0, math.exp(val_loss), math.exp(test_loss)

    def train(self):
        args = self.args
        corpus = self.corpus
        total_loss = 0
        start_time = time.time()
        hidden = [
            self.model.init_hidden(args.small_batch_size)
            for _ in range(args.batch_size // args.small_batch_size)
        ]
        batch, i = 0, 0

        while i < self.train_data.size(0) - 1 - 1:
            bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
            # Prevent excessively small or negative sequence lengths
            seq_len = max(5, int(np.random.normal(bptt, 5)))
            # There's a very small chance that it could select a very long sequence length resulting in OOM
            seq_len = min(seq_len, args.bptt + args.max_seq_length_delta)

            lr2 = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            self.model.train()
            data, targets = get_batch(self.train_data,
                                      i,
                                      args,
                                      seq_len=seq_len)

            self.optimizer.zero_grad()

            start, end, s_id = 0, args.small_batch_size, 0
            while start < args.batch_size:
                cur_data, cur_targets = data[:, start:
                                             end], targets[:, start:
                                                           end].contiguous(
                                                           ).view(-1)

                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden[s_id] = repackage_hidden(hidden[s_id])

                log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = self.model(
                    cur_data, hidden[s_id], return_h=True)
                raw_loss = nn.functional.nll_loss(
                    log_prob.view(-1, log_prob.size(2)), cur_targets)

                loss = raw_loss
                # Activation Regularization
                if args.alpha > 0:
                    loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                                      for dropped_rnn_h in dropped_rnn_hs[-1:])
                # Temporal Activation Regularization (slowness)
                loss = loss + sum(args.beta *
                                  (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                                  for rnn_h in rnn_hs[-1:])
                loss *= args.small_batch_size / args.batch_size
                total_loss += raw_loss.data * args.small_batch_size / args.batch_size
                loss.backward()

                s_id += 1
                start = end
                end = start + args.small_batch_size

                gc.collect()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.clip)
            self.optimizer.step()

            # total_loss += raw_loss.data
            self.optimizer.param_groups[0]['lr'] = lr2

            if np.isnan(total_loss.item()):
                raise ValueError('training loss became NaN')

            #if batch % args.log_interval == 0 and batch > 0:
            #    cur_loss = total_loss[0] / args.log_interval
            #    elapsed = time.time() - start_time
            #    logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
            #            'loss {:5.2f} | ppl {:8.2f}'.format(
            #        self.epoch, batch, len(self.train_data) // args.bptt, self.optimizer.param_groups[0]['lr'],
            #        elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            #    total_loss = 0
            #    start_time = time.time()
            batch += 1
            i += seq_len
        self.epoch += 1

    def evaluate(self, data_source, batch_size=10):
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0
        hidden = self.model.init_hidden(batch_size)
        for i in range(0, data_source.size(0) - 1, self.args.bptt):
            data, targets = get_batch(data_source,
                                      i,
                                      self.args,
                                      evaluation=True)
            targets = targets.view(-1)

            log_prob, hidden = self.model(data, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)),
                                          targets).data

            total_loss += loss * len(data)

            hidden = repackage_hidden(hidden)
        return total_loss[0] / len(data_source)
Example #23
class WordLanguageModelTrial(PyTorchTrial):
    def __init__(self, context: PyTorchTrialContext):
        self.context = context
        data_config = self.context.get_data_config()
        hparams = self.context.get_hparams()
        using_bind_mount = data_config["use_bind_mount"]
        use_cache = data_config["use_cache"]
        self.eval_batch_size = hparams["eval_batch_size"]

        download_directory = (
            Path(data_config["bind_mount_path"]) if using_bind_mount else
            Path("/data")) / f"data-rank{self.context.distributed.get_rank()}"

        self.corpus = data.load_and_cache_dataset(download_directory,
                                                  use_cache)
        self.model_cls = hparams["model_cls"]
        emsize = hparams["word_embeddings_size"]
        num_hidden = hparams["num_hidden"]
        num_layers = hparams["num_layers"]
        dropout = hparams["dropout"]
        self.bptt = hparams["bptt"]

        if self.model_cls.lower() == "transformer":
            num_heads = hparams["num_heads"]
            self.model = TransformerModel(self.corpus.ntokens, emsize,
                                          num_heads, num_hidden, num_layers,
                                          dropout)
        else:
            tied = hparams["tied"]
            self.model = RNNModel(
                self.model_cls,
                self.corpus.ntokens,
                emsize,
                num_hidden,
                num_layers,
                dropout,
                tied,
            )

        self.model = self.context.wrap_model(self.model)
        self.criterion = nn.NLLLoss()

        lr = hparams["lr"]
        optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
        self.optimizer = self.context.wrap_optimizer(optimizer)

        self.lr_scheduler = self.context.wrap_lr_scheduler(
            torch.optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                factor=0.25,
                patience=0,
                threshold=0.001,
                threshold_mode="abs",
                verbose=True,
            ),
            LRScheduler.StepMode.MANUAL_STEP,
        )

    def build_training_data_loader(self) -> DataLoader:
        train_dataset = data.WikiTextDataset(
            self.corpus,
            batch_size=self.context.get_per_slot_batch_size(),
        )
        batch_samp = data.BatchSamp(train_dataset, self.bptt)
        return DataLoader(train_dataset, batch_sampler=batch_samp)

    def build_validation_data_loader(self) -> DataLoader:
        val_dataset = data.WikiTextDataset(
            self.corpus,
            batch_size=self.eval_batch_size,
            valid=True,
        )
        self.val_data_len = len(val_dataset) - 1
        batch_samp = data.BatchSamp(val_dataset, self.bptt)
        return DataLoader(val_dataset, batch_sampler=batch_samp)

    def train_batch(self, batch: TorchData, epoch_idx: int,
                    batch_idx: int) -> Dict[str, Union[torch.Tensor, float]]:
        if batch_idx == 0 and self.model_cls.lower() != "transformer":
            self.hidden = self.model.init_hidden(
                self.context.get_per_slot_batch_size())
        inputs = batch[:-1]
        labels = batch[1:].view(-1)
        if self.model_cls.lower() == "transformer":
            output = self.model(inputs)
            output = output.view(-1, self.corpus.ntokens)
        else:
            self.hidden = self.model.repackage_hidden(self.hidden)
            output, self.hidden = self.model(inputs, self.hidden)
        loss = self.criterion(output, labels)

        self.context.backward(loss)
        self.context.step_optimizer(
            self.optimizer,
            clip_grads=lambda params: torch.nn.utils.clip_grad_norm_(
                params, self.context.get_hparam("max_grad_norm")),
        )
        return {
            "loss": loss,
            "lr": float(self.optimizer.param_groups[0]["lr"])
        }

    def evaluate_full_dataset(
            self, data_loader: DataLoader) -> Dict[str, torch.Tensor]:
        validation_loss = 0.0
        if self.model_cls.lower() != "transformer":
            self.hidden = self.model.init_hidden(self.eval_batch_size)
        for batch in data_loader:
            batch = self.context.to_device(batch)
            if self.model_cls.lower() == "transformer":
                output = self.model(batch[:-1])
                output = output.view(-1, self.corpus.ntokens)
            else:
                output, self.hidden = self.model(batch[:-1], self.hidden)
                self.hidden = self.model.repackage_hidden(self.hidden)
            validation_loss += (
                len(batch[:-1]) *
                self.criterion(output, batch[1:].view(-1)).item())

        validation_loss /= len(data_loader.dataset) - 1
        self.lr_scheduler.step(validation_loss)
        if self.model_cls.lower() != "transformer":
            self.hidden = self.model.init_hidden(
                self.context.get_per_slot_batch_size())
        return {"validation_loss": validation_loss}
Example #24
    def __init__(self, save_path, seed, batch_size, grad_clip, config='eval'):
        if config == 'search':
            args = {
                'emsize': 300,
                'nhid': 300,
                'nhidlast': 300,
                'dropoute': 0,
                'wdecay': 5e-7
            }
        elif config == 'eval':
            args = {
                'emsize': 850,
                'nhid': 850,
                'nhidlast': 850,
                'dropoute': 0.1,
                'wdecay': 8e-7
            }
        args['config'] = config

        args['data'] = '/home/liamli4465/darts/data/penn'
        args['lr'] = 20
        args['clip'] = grad_clip
        args['batch_size'] = batch_size
        args['search_batch_size'] = 256 * 4
        args['small_batch_size'] = batch_size
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = seed
        args['nonmono'] = 5
        args['log_interval'] = 50
        args['save'] = save_path
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args = AttrDict(args)
        self.args = args
        self.seed = seed

        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        eval_batch_size = 10
        test_batch_size = 1

        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        self.val_data = batchify(corpus.valid, eval_batch_size, args)
        self.test_data = batchify(corpus.test, test_batch_size, args)
        self.batch = 0
        self.steps = 0
        self.epochs = 0
        self.total_loss = 0
        self.start_time = time.time()

        ntokens = len(corpus.dictionary)
        # if args.continue_train:
        #    model = torch.load(os.path.join(args.save, 'model.pt'))
        try:
            model = torch.load(os.path.join(args.save, 'model.pt'))
            print('Loaded model from checkpoint')
        except Exception as e:
            print(e)
            model = RNNModel(ntokens,
                             args.emsize,
                             args.nhid,
                             args.nhidlast,
                             args.dropout,
                             args.dropouth,
                             args.dropoutx,
                             args.dropouti,
                             args.dropoute,
                             genotype=genotypes.DARTS)

        size = 0
        for p in model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(model.rnns[0].genotype)

        total_params = sum(x.data.nelement() for x in model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = model.cuda()
        self.optimizer = torch.optim.SGD(model.parameters(),
                                         lr=args.lr,
                                         weight_decay=args.wdecay)
Example #25
                 ntokens,
                 args.emsize,
                 args.nhid,
                 args.nlayers,
                 args.dropout,
                 args.rnn_dropout,
                 args.output_dropout,
                 args.tied,
                 adasoft=args.adasoft,
                 cutoff=cutoff)

if torch.cuda.is_available():
    model.cuda()

if args.optim == 'SGD':
    optimizer = torch.optim.SGD(params=model.parameters(), lr=args.lr)
elif args.optim == 'rms':
    optimizer = torch.optim.RMSprop(params=model.parameters(),
                                    lr=args.lr,
                                    weight_decay=0.00001)
else:
    raise Exception

criterion = None
if args.adasoft:
    criterion = AdaptiveLoss([*cutoff, ntokens + 1])
else:
    criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
Example #26
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
else:
    logging.basicConfig(format='%(asctime)s: %(message)s',
                        datefmt='%H:%M:%S',
                        filename=os.path.join(args.out, 'train.log'),
                        level=logging.INFO)
tb.configure(args.out)
random.seed(1024)
torch.manual_seed(1024)
torch.cuda.manual_seed_all(1024)

model = RNNModel(123, 62, 250, 3, args.dropout, bidirectional=args.bi)
if args.init: model.load_state_dict(torch.load(args.init))
else:
    for param in model.parameters():
        torch.nn.init.uniform_(param, -0.1, 0.1)
if args.cuda: model.cuda()

optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=.9)
criterion = CTCLoss()

# data set
trainset = SequentialLoader('train', args.batch_size)
devset = SequentialLoader('dev', args.batch_size)

tri = cvi = 0


def eval():
    global cvi