Example #1
def build_model():
    """Build the model according to CLI arguments

    Global Dependencies:
        - corpus
        - args
    """
    # noise distribution for noise sampling in NCE
    noise = build_unigram_noise(torch.FloatTensor(corpus.vocab.idx2count))

    # setting up NCELoss modules
    if args.index_module == 'linear':
        criterion = IndexLinear(
            args.nhid,
            ntoken,
            noise=noise,
            noise_ratio=args.noise_ratio,
            norm_term=args.norm_term,
            loss_type=args.loss,
            reduction='none',
        )
        model = RNNModel(
            ntoken,
            args.emsize,
            args.nhid,
            args.nlayers,
            criterion=criterion,
            dropout=args.dropout,
        )
    elif args.index_module == 'gru':
        if args.nlayers != 1:
            logger.warning(
                'Falling back to a one-layer GRU because IndexGRU only supports a single layer')
        nce_criterion = IndexGRU(
            ntoken,
            args.nhid,
            args.nhid,
            args.dropout,
            noise=noise,
            noise_ratio=args.noise_ratio,
            norm_term=args.norm_term,
        )
        model = GenModel(criterion=nce_criterion)
    else:
        logger.error('The index module [%s] is not supported yet' %
                     args.index_module)
        raise NotImplementedError('index module not supported')

    if args.cuda:
        model.cuda()

    logger.info('model definition:\n %s', model)
    return model
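For reference, a minimal sketch of what the build_unigram_noise helper used above might look like, under the assumption that it simply normalizes raw token counts into a unigram probability distribution for NCE noise sampling (the real helper may add smoothing or flattening):

import torch

def build_unigram_noise(freq):
    """Sketch: turn a tensor of raw token counts into a unigram noise distribution."""
    total = freq.sum()
    noise = freq / total  # probability of each token under the noise distribution
    assert abs(noise.sum().item() - 1) < 1e-3
    return noise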
Example #2
def main():
    args = parse_arguments()
    use_cuda = torch.cuda.is_available()

    print("[!] preparing dataset...")
    TEXT = data.Field()
    train_data, val_data, test_data = datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_data, min_freq=10)
    train_iter, val_iter, test_iter = data.BPTTIterator.splits(
        (train_data, val_data, test_data),
        batch_size=args.batch_size,
        bptt_len=30,
        repeat=False)
    vocab_size = len(TEXT.vocab)
    print("[TRAIN]:%d\t[VALID]:%d\t[TEST]:%d\t[VOCAB]%d" %
          (len(train_iter), len(val_iter), len(test_iter), vocab_size))

    print("[!] Instantiating models...")
    model = RNNModel('LSTM',
                     ntoken=vocab_size,
                     ninp=600,
                     nhid=600,
                     nlayers=2,
                     dropout=0.5)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    if use_cuda:
        model.cuda()
    print(model)

    best_val_loss = None
    for e in range(1, args.epochs + 1):
        train(model, optimizer, train_iter, vocab_size, args.grad_clip,
              args.log_interval, use_cuda)
        val_loss = evaluate(model, val_iter, vocab_size, use_cuda)
        print("[Epoch: %d] val-loss:%5.2f | val-pp:%5.2f" %
              (e, val_loss, math.exp(val_loss)))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            print("[!] saving model")
            if not os.path.isdir(args.save):
                os.makedirs(args.save)
            torch.save(model, './%s/lm_%d.pt' % (args.save, e))
            best_val_loss = val_loss
    test_loss = evaluate(model, test_iter, vocab_size, use_cuda)
    print("[Epoch: %d] test-loss:%5.2f | test-pp:%5.2f" %
          (e, test_loss, math.exp(test_loss)))
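The train() and evaluate() helpers are not shown in this example. Below is a hedged sketch of evaluate() for the BPTT iterator above, assuming the usual word_language_model-style interface (a model.init_hidden(batch_size) method and a forward pass returning (output, hidden)); the hidden state is re-initialized per batch for simplicity:

import torch
import torch.nn.functional as F

def evaluate(model, data_iter, vocab_size, use_cuda):
    """Sketch: mean per-batch cross-entropy over a torchtext BPTTIterator."""
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in data_iter:
            text, target = batch.text, batch.target
            if use_cuda:
                text, target = text.cuda(), target.cuda()
            hidden = model.init_hidden(text.size(1))  # fresh state each batch (simplification)
            output, hidden = model(text, hidden)
            loss = F.cross_entropy(output.view(-1, vocab_size), target.view(-1))
            total_loss += loss.item()
    return total_loss / len(data_iter)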
Example #3
def build_model(args, ntokens: int):
    """
    Returns model and loss function.
    """
    print('INFO: Building model')
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    if args.cuda:
        print('INFO: Moving model to GPU')
        model.cuda()
    total_params = sum(x.size()[0] *
                       x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in model.parameters())
    print('INFO: Model total parameters:', total_params)

    criterion = nn.CrossEntropyLoss()

    return model, criterion
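The parameter count above only handles 1-D and 2-D tensors; that is enough for these models, but the same number can be obtained for parameters of any rank with numel():

# Equivalent count that works for parameters of arbitrary rank:
total_params = sum(p.numel() for p in model.parameters())
print('INFO: Model total parameters:', total_params)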
Example #4
def build_model(args, corpus):
    criterion = None
    ntokens = len(corpus.dictionary)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    ###
    if args.resume:
        logging.info('Resuming model ...')
        model, criterion, optimizer = model_load(args.resume_path)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropouti, model.dropouth, model.dropout, model.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
        if args.wdrop:
            from weight_drop import WeightDrop
            for rnn in model.rnns:
                if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
                elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
    ###
    if not criterion:
        splits = []
        if ntokens > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif ntokens > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        logging.info(f'Using {splits}')
        criterion = SplitCrossEntropyLoss(args.emsize,
                                          splits=splits,
                                          verbose=False)
    ###
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] *
                       x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    logging.info(f'Args: {args}')
    logging.info(f'Model total parameters: {total_params}')

    if args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    return model, criterion
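model_load() is not defined in this snippet; based on how its return value is unpacked above, a plausible sketch (assuming checkpoints were written as torch.save([model, criterion, optimizer], f)) is:

import torch

def model_load(fn):
    """Sketch: load a (model, criterion, optimizer) triple saved with torch.save."""
    with open(fn, 'rb') as f:
        model, criterion, optimizer = torch.load(f)
    return model, criterion, optimizer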
Example #5
ntokens = len(corpus.dictionary)
cutoff = [2000]
model = RNNModel(args.model,
                 ntokens,
                 args.emsize,
                 args.nhid,
                 args.nlayers,
                 args.dropout,
                 args.rnn_dropout,
                 args.output_dropout,
                 args.tied,
                 adasoft=args.adasoft,
                 cutoff=cutoff)

if torch.cuda.is_available():
    model.cuda()

if args.optim == 'SGD':
    optimizer = torch.optim.SGD(params=model.parameters(), lr=args.lr)
elif args.optim == 'rms':
    optimizer = torch.optim.RMSprop(params=model.parameters(),
                                    lr=args.lr,
                                    weight_decay=0.00001)
else:
    raise ValueError('Unsupported optimizer: %s' % args.optim)

criterion = None
if args.adasoft:
    criterion = AdaptiveLoss([*cutoff, ntokens + 1])
else:
    criterion = nn.CrossEntropyLoss()
Example #6
train_data = create_batch(corpus.train, batch_size)

""" ----------- Model Creation ------------"""
number_tokens = len(corpus.dictionary)  # Number of unique word in our corpus

model = RNNModel(rnn_type = rt,
                ntoken = number_tokens,
                ninp = embedding_size,
                nhid = number_hidden,
                nlayers = number_layer,
                drop_rate = dropout,
                tie_weights = tied)

if cuda and torch.cuda.is_available():
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr = learning_rate)

""" ----------- Training Code ------------"""
def detach_hidden(h): # detach from distant history
    if type(h) == V:
        return V(h.data)
    else:
        return tuple(detach_hidden(v) for v in h)


def get_batch(source, i, sequence_length):
    seq_len = min(sequence_length, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    # torch.cat([data.data.view(-1).unsqueeze(-1), target.data.unsqueeze(-1)], dim=1)
    return data, target
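A minimal sketch of how these helpers could be wired into a simple training loop for the model above, assuming the usual (output, hidden) forward signature and a model.init_hidden() method; sequence_length stands for whatever BPTT length the script uses:

hidden = model.init_hidden(batch_size)
for i in range(0, train_data.size(0) - 1, sequence_length):
    data, targets = get_batch(train_data, i, sequence_length)
    hidden = detach_hidden(hidden)  # truncate backprop at the batch boundary
    model.zero_grad()
    output, hidden = model(data, hidden)
    loss = criterion(output.view(-1, number_tokens), targets)
    loss.backward()
    optimizer.step()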
Example #7
def run(args):

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    ###############################################################################
    # Load data
    ###############################################################################

    def model_save(fn):
        with open(fn, 'wb') as f:
            torch.save([model, optimizer], f)

    def model_load(fn):
        global model, criterion, optimizer
        with open(fn, 'rb') as f:
            model, optimizer = torch.load(f)

    import os
    import hashlib
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    # get token frequencies and eos_tokens
    frequencies, eos_tokens = None, None
    if not args.uni_freq: frequencies = corpus.frequencies
    if args.reinit_h: eos_tokens = corpus.reset_idxs

    # batchify
    eval_batch_size = 1
    test_batch_size = 1
    print(corpus.dictionary)
    if args.reinit_h:
        ntokens = len(corpus.dictionary) + 1 if args.batch_size > 1 else len(corpus.dictionary)
        train_data, seq_lens = batchify_padded(corpus.train, args.batch_size, args, ntokens, eos_tokens)    
    else:
        ntokens = len(corpus.dictionary)
        train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    ###############################################################################
    # Build the model
    ###############################################################################

    model = RNNModel(ntokens, args.emsize, args.nhid, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.nsamples,
                    args.temperature, frequencies, args.no_bias, args.bias_reg, args.dist_fn, args.activation_fn)
    ###
    if args.resume:
        print('Resuming model ...')
        model_load(args.resume)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropouti, model.dropouth, model.dropout, model.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute

    ###
    if args.cuda:
        model = model.cuda()

    ###
    params = list(model.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
    print('Args:', args)
    print('Model total parameters:', total_params)

    ###############################################################################
    # Training code
    ###############################################################################

    def evaluate(data_source, epoch, batch_size=1):
        # Turn on evaluation mode which disables dropout.
        model.eval()

        if args.dump_hiddens:
            loss, entropy, hiddens = model.evaluate(data_source, eos_tokens, args.dump_hiddens)
            dump_hiddens(hiddens, 'hiddens_' + str(epoch))
        else:
            loss, entropy = model.evaluate(data_source, eos_tokens)
        
        if args.dump_words:
            dump_words(model.encoder.weight.detach().cpu().numpy(), 'words_' + str(epoch))

        if args.dump_entropy is not None:
            dump(entropy, args.dump_entropy + str(epoch))

        return loss


    def train():
        # Turn on training mode which enables dropout.
        total_loss, avrg_loss = 0, 0
        start_time = time.time()
        ntokens = len(corpus.dictionary)
        batch, i = 0, 0
        hidden = model.init_hidden(args.batch_size)
        while i < train_data.size(0)-1:

            if args.reinit_h:
                seq_len = seq_lens[batch] - 1
            else:
                bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
                # Prevent excessively small or negative sequence lengths
                seq_len = max(5, int(np.random.normal(bptt, 5)))
                # prevent negative sequence lengths
                # There's a very small chance that it could select a very long sequence length resulting in OOM
                # seq_len = min(seq_len, args.bptt + 10)

            lr2 = optimizer.param_groups[0]['lr']
            optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            model.train()
            data = get_batch(train_data, i, args, seq_len=seq_len)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            reset_hidden = args.reinit_h
            if reset_hidden:
                hidden = model.init_hidden(args.batch_size)

            hidden = repackage_hidden(hidden)
            optimizer.zero_grad()

            #raw_loss = model.train_crossentropy(data, eos_tokens)
            raw_loss, hidden = model(data, hidden)

            loss = raw_loss
            '''
            See what we can do here! We don't need the regularization as it is implicit!

            # Activation Regularization
            if args.alpha: loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            if args.beta: loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            '''
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
            optimizer.step()

            total_loss += loss.data
            optimizer.param_groups[0]['lr'] = lr2
            if batch % args.log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                    epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
                avrg_loss = avrg_loss + total_loss
                total_loss = 0
                start_time = time.time()
            ###
            batch += 1
            i += seq_len + 1

        return avrg_loss / train_data.size(0)

    # Loop over epochs.
    lr = args.lr
    best_val_loss = []
    valid_loss = []
    stored_loss = 100000000

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        optimizer = None
        # Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. Adaptive Softmax)
        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
        if args.optimizer == 'adam':
            optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
        for epoch in range(1, args.epochs+1):
            epoch_start_time = time.time()
            train_loss = train()
            _, s, _= np.linalg.svd(model.rnn.module.weight_hh_l0.cpu().detach().numpy())
            print(s[0])
            #dump(model.decoder.bias.cpu().detach().numpy(), 'bias_' + str(epoch) +'.out')
            
            # skip to beginning if not in evaluation mode
            if epoch % args.evaluate_every > 0:
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} |'.format(
                        epoch, (time.time() - epoch_start_time), train_loss))
                print('-' * 89) 
                continue

            # evaluate validation loss 
            if 't0' in optimizer.param_groups[0]:
                tmp = {}
                for prm in model.parameters():
                    #if 'ax' in optimizer.state[prm]:
                    tmp[prm] = prm.data.clone()
                    if 'ax' in optimizer.state[prm]:
                        prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(val_data, epoch)
                valid_loss.append(val_loss2)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
                print('-' * 89)

                if val_loss2 < stored_loss:
                    model_save(args.save)
                    print('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()

            else:
                val_loss = evaluate(val_data, epoch, eval_batch_size)
                valid_loss.append(val_loss)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                  epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2)))
                print('-' * 89)

                if val_loss < stored_loss:
                    model_save(args.save)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[0] and (len(best_val_loss)>args.nonmono and val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)

                if epoch in args.when:
                    print('Saving model before learning rate decreased')
                    model_save('{}.e{}'.format(args.save, epoch))
                    print('Dividing learning rate by 10')
                    optimizer.param_groups[0]['lr'] /= 10.

                best_val_loss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    model_load(args.save)

    # Run on test data.
    test_loss = evaluate(test_data, args.epochs+1, test_batch_size)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'.format(
        test_loss, math.exp(test_loss), test_loss / math.log(2)))
    print('=' * 89)

    return np.array(valid_loss), test_loss
Example #8
                       tieweights=args.tieweights)
    LMModel.load_state_dict(LMModel_start.state_dict())

# LMModel = torch.load(args.save).cpu()

model_size = sum(p.nelement() for p in LMModel.parameters())
logging('-' * 30, f_log=f_log)
logging(f'Model total parameters: {model_size}', f_log=f_log)
logging('-' * 30, f_log=f_log)

# print('-' * 30)
# print(f'Model total parameters: {model_size}')
# print('-' * 30)

if torch.cuda.is_available() and cuda_device != 'cpu':
    LMModel = LMModel.cuda(cuda_device)

LMModel_parallel = None
if torch.cuda.is_available() and args.devids != 'off':
    LMModel_parallel = torch.nn.DataParallel(LMModel,
                                             device_ids=device_ids,
                                             output_device=output_device,
                                             dim=1)
    # .cuda() is necessary if LMModel was not on any GPU device
#     LMModel_parallel._modules['module'].lstm.flatten_parameters()

if args.optim == 'SGD':
    optimizer = optim.SGD(LMModel.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.wd)
Example #9
    def __init__(self, save_path, seed, batch_size, grad_clip, config='eval'):
        if config == 'search':
            args = {
                'emsize': 300,
                'nhid': 300,
                'nhidlast': 300,
                'dropoute': 0,
                'wdecay': 5e-7
            }
        elif config == 'eval':
            args = {
                'emsize': 850,
                'nhid': 850,
                'nhidlast': 850,
                'dropoute': 0.1,
                'wdecay': 8e-7
            }
        args['config'] = config

        args['data'] = '/home/liamli4465/darts/data/penn'
        args['lr'] = 20
        args['clip'] = grad_clip
        args['batch_size'] = batch_size
        args['search_batch_size'] = 256 * 4
        args['small_batch_size'] = batch_size
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = seed
        args['nonmono'] = 5
        args['log_interval'] = 50
        args['save'] = save_path
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args = AttrDict(args)
        self.args = args
        self.seed = seed

        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        eval_batch_size = 10
        test_batch_size = 1

        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        self.val_data = batchify(corpus.valid, eval_batch_size, args)
        self.test_data = batchify(corpus.test, test_batch_size, args)
        self.batch = 0
        self.steps = 0
        self.epochs = 0
        self.total_loss = 0
        self.start_time = time.time()

        ntokens = len(corpus.dictionary)
        # if args.continue_train:
        #    model = torch.load(os.path.join(args.save, 'model.pt'))
        try:
            model = torch.load(os.path.join(args.save, 'model.pt'))
            print('Loaded model from checkpoint')
        except Exception as e:
            print(e)
            model = RNNModel(ntokens,
                             args.emsize,
                             args.nhid,
                             args.nhidlast,
                             args.dropout,
                             args.dropouth,
                             args.dropoutx,
                             args.dropouti,
                             args.dropoute,
                             genotype=genotypes.DARTS)

        size = 0
        for p in model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(model.rnns[0].genotype)

        total_params = sum(x.data.nelement() for x in model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = model.cuda()
        self.optimizer = torch.optim.SGD(model.parameters(),
                                         lr=args.lr,
                                         weight_decay=args.wdecay)
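batchify() appears throughout these examples without being shown; a sketch of the standard version used by the word_language_model / AWD-LSTM codebases (trim the flat token stream to a multiple of the batch size, reshape to one sequence per column, optionally move to GPU). The args-is-None guard matches the batchify(corpus.valid, 1, None) call in Example #14:

def batchify(data, bsz, args):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that would not cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches (one sequence per column).
    data = data.view(bsz, -1).t().contiguous()
    if args is not None and args.cuda:
        data = data.cuda()
    return data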
Example #10
class DartsTrainer():
    def __init__(self, arm):
        # Default params for eval network
        args = {
            'emsize': 850,
            'nhid': 850,
            'nhidlast': 850,
            'dropoute': 0.1,
            'wdecay': 8e-7
        }

        args['data'] = '/home/liamli4465/darts/data/penn'
        args['lr'] = 20
        args['clip'] = 0.25
        args['batch_size'] = 64
        args['search_batch_size'] = 256 * 4
        args['small_batch_size'] = 64
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = arm['seed']
        args['nonmono'] = 5
        args['log_interval'] = 50
        args['save'] = arm['dir']
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args['genotype'] = arm['genotype']
        args = AttrDict(args)
        self.args = args
        self.epoch = 0

        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        self.eval_batch_size = 10
        self.test_batch_size = 1

        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        self.val_data = batchify(corpus.valid, self.eval_batch_size, args)
        self.test_data = batchify(corpus.test, self.test_batch_size, args)

        self.ntokens = len(corpus.dictionary)

    def model_save(self, fn, to_save):
        if self.epoch % 150 == 0:
            with open(
                    os.path.join(self.args.save,
                                 "checkpoint-incumbent-%d" % self.epoch),
                    'wb') as f:
                torch.save(to_save, f)

        with open(fn, 'wb') as f:
            torch.save(to_save, f)

    def model_load(self, fn):
        with open(fn, 'rb') as f:
            self.model, self.optimizer, rng_state, cuda_state = torch.load(f)
            torch.set_rng_state(rng_state)
            torch.cuda.set_rng_state(cuda_state)

    def model_resume(self, filename):
        logging.info('Resuming model from %s' % filename)
        self.model_load(filename)
        self.optimizer.param_groups[0]['lr'] = self.args.lr
        for rnn in self.model.rnns:
            rnn.genotype = self.args.genotype

    def train_epochs(self, epochs):
        args = self.args
        resume_filename = os.path.join(self.args.save, "checkpoint.incumbent")
        if os.path.exists(resume_filename):
            self.model_resume(resume_filename)
            logging.info('Loaded model from checkpoint')
        else:
            self.model = RNNModel(self.ntokens,
                                  args.emsize,
                                  args.nhid,
                                  args.nhidlast,
                                  args.dropout,
                                  args.dropouth,
                                  args.dropoutx,
                                  args.dropouti,
                                  args.dropoute,
                                  genotype=args.genotype)
            self.optimizer = torch.optim.SGD(self.model.parameters(),
                                             lr=args.lr,
                                             weight_decay=args.wdecay)

        size = 0
        for p in self.model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(self.model.rnns[0].genotype)

        total_params = sum(x.data.nelement() for x in self.model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = self.model.cuda()
        # Loop over epochs.
        lr = args.lr
        best_val_loss = []
        stored_loss = 100000000

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            for epoch in range(epochs):
                epoch_start_time = time.time()
                self.train()
                if 't0' in self.optimizer.param_groups[0]:
                    tmp = {}
                    for prm in self.model.parameters():
                        tmp[prm] = prm.data.clone()
                        prm.data = self.optimizer.state[prm]['ax'].clone()

                    val_loss2 = self.evaluate(self.val_data)
                    logging.info('-' * 89)
                    logging.info(
                        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                            self.epoch,
                            (time.time() - epoch_start_time), val_loss2,
                            math.exp(val_loss2), val_loss2 / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss2 < stored_loss:
                        self.model_save(
                            os.path.join(args.save, 'checkpoint.incumbent'), [
                                self.model, self.optimizer,
                                torch.get_rng_state(),
                                torch.cuda.get_rng_state()
                            ])
                        logging.info('Saving Averaged!')
                        stored_loss = val_loss2

                    for prm in self.model.parameters():
                        prm.data = tmp[prm].clone()

                else:
                    val_loss = self.evaluate(self.val_data,
                                             self.eval_batch_size)
                    logging.info('-' * 89)
                    logging.info(
                        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                            self.epoch,
                            (time.time() - epoch_start_time), val_loss,
                            math.exp(val_loss), val_loss / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss < stored_loss:
                        self.model_save(
                            os.path.join(args.save, 'checkpoint.incumbent'), [
                                self.model, self.optimizer,
                                torch.get_rng_state(),
                                torch.cuda.get_rng_state()
                            ])
                        logging.info('Saving model (new best validation)')
                        stored_loss = val_loss

                    if (self.epoch > 75
                            and 't0' not in self.optimizer.param_groups[0] and
                        (len(best_val_loss) > args.nonmono
                         and val_loss > min(best_val_loss[:-args.nonmono]))):
                        logging.info('Switching to ASGD')
                        self.optimizer = torch.optim.ASGD(
                            self.model.parameters(),
                            lr=args.lr,
                            t0=0,
                            lambd=0.,
                            weight_decay=args.wdecay)

                    best_val_loss.append(val_loss)

        except Exception as e:
            logging.info('-' * 89)
            logging.info(e)
            logging.info('Exiting from training early')
            return 0, 10000, 10000

        # Load the best saved model.
        self.model_load(os.path.join(args.save, 'checkpoint.incumbent'))

        # Run on test data.
        val_loss = self.evaluate(self.val_data, self.eval_batch_size)
        logging.info(math.exp(val_loss))
        test_loss = self.evaluate(self.test_data, self.test_batch_size)
        logging.info('=' * 89)
        logging.info(
            '| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'
            .format(test_loss, math.exp(test_loss), test_loss / math.log(2)))
        logging.info('=' * 89)

        return 0, math.exp(val_loss), math.exp(test_loss)

    def train(self):
        args = self.args
        corpus = self.corpus
        total_loss = 0
        start_time = time.time()
        hidden = [
            self.model.init_hidden(args.small_batch_size)
            for _ in range(args.batch_size // args.small_batch_size)
        ]
        batch, i = 0, 0

        while i < self.train_data.size(0) - 1 - 1:
            bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
            # Prevent excessively small or negative sequence lengths
            seq_len = max(5, int(np.random.normal(bptt, 5)))
            # There's a very small chance that it could select a very long sequence length resulting in OOM
            seq_len = min(seq_len, args.bptt + args.max_seq_length_delta)

            lr2 = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            self.model.train()
            data, targets = get_batch(self.train_data,
                                      i,
                                      args,
                                      seq_len=seq_len)

            self.optimizer.zero_grad()

            start, end, s_id = 0, args.small_batch_size, 0
            while start < args.batch_size:
                cur_data = data[:, start:end]
                cur_targets = targets[:, start:end].contiguous().view(-1)

                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden[s_id] = repackage_hidden(hidden[s_id])

                log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = self.model(
                    cur_data, hidden[s_id], return_h=True)
                raw_loss = nn.functional.nll_loss(
                    log_prob.view(-1, log_prob.size(2)), cur_targets)

                loss = raw_loss
                # Activation Regularization
                if args.alpha > 0:
                    loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                                      for dropped_rnn_h in dropped_rnn_hs[-1:])
                # Temporal Activation Regularization (slowness)
                loss = loss + sum(args.beta *
                                  (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                                  for rnn_h in rnn_hs[-1:])
                loss *= args.small_batch_size / args.batch_size
                total_loss += raw_loss.data * args.small_batch_size / args.batch_size
                loss.backward()

                s_id += 1
                start = end
                end = start + args.small_batch_size

                gc.collect()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.clip)
            self.optimizer.step()

            # total_loss += raw_loss.data
            self.optimizer.param_groups[0]['lr'] = lr2

            if np.isnan(total_loss[0]):
                raise ValueError('Training loss is NaN')

            #if batch % args.log_interval == 0 and batch > 0:
            #    cur_loss = total_loss[0] / args.log_interval
            #    elapsed = time.time() - start_time
            #    logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
            #            'loss {:5.2f} | ppl {:8.2f}'.format(
            #        self.epoch, batch, len(self.train_data) // args.bptt, self.optimizer.param_groups[0]['lr'],
            #        elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            #    total_loss = 0
            #    start_time = time.time()
            batch += 1
            i += seq_len
        self.epoch += 1

    def evaluate(self, data_source, batch_size=10):
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0
        hidden = self.model.init_hidden(batch_size)
        for i in range(0, data_source.size(0) - 1, self.args.bptt):
            data, targets = get_batch(data_source,
                                      i,
                                      self.args,
                                      evaluation=True)
            targets = targets.view(-1)

            log_prob, hidden = self.model(data, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)),
                                          targets).data

            total_loss += loss * len(data)

            hidden = repackage_hidden(hidden)
        return total_loss[0] / len(data_source)
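repackage_hidden(), used in the training and evaluation loops above, is the same detach trick as the detach_hidden helper shown earlier; a sketch in current PyTorch style:

import torch

def repackage_hidden(h):
    """Detach hidden states from their history so backprop stops at the batch boundary."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)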
Example #11
def train():
    # Load the data and configure the model
    print("Loading data...")
    corpus = Corpus(train_dir)
    print(corpus)

    config = Config()
    config.vocab_size = len(corpus.dictionary)
    train_data = batchify(corpus.train, config.batch_size)
    train_len = train_data.size(0)
    seq_len = config.seq_len

    print("Configuring model...")
    model = RNNModel(config)
    if use_cuda:
        model.cuda()
    print(model)

    criterion = nn.CrossEntropyLoss()
    lr = config.learning_rate  # initial learning rate
    start_time = time.time()

    print("Training and generating...")
    for epoch in range(1, config.num_epochs + 1):  # train for multiple epochs
        total_loss = 0.0
        model.train()  # dropout is only active in training mode
        hidden = model.init_hidden(config.batch_size)  # initialize the hidden state

        for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
            data, targets = get_batch(train_data, i, seq_len)  # fetch one batch of data
            # Before each batch, detach the hidden state from how it was previously produced.
            # Otherwise the model would try to backpropagate all the way to the start of the dataset.
            hidden = repackage_hidden(hidden)
            model.zero_grad()

            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, config.vocab_size), targets)
            loss.backward()  # backpropagate

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip)
            for p in model.parameters():  # manual SGD parameter update
                p.data.add_(p.grad.data, alpha=-lr)

            total_loss += loss.data  # accumulate the loss

            if ibatch % config.log_interval == 0 and ibatch > 0:  # report progress every log_interval batches
                cur_loss = total_loss[0] / config.log_interval
                elapsed = get_time_dif(start_time)
                print(
                    "Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}"
                    .format(epoch, ibatch, train_len // seq_len, lr, cur_loss,
                            math.exp(cur_loss), elapsed))
                total_loss = 0.0
        lr /= 4.0  # anneal the learning rate after each epoch

        # save the model parameters every save_interval epochs
        if epoch % config.save_interval == 0:
            torch.save(model.state_dict(),
                       os.path.join(save_dir, model_name.format(epoch)))

        print(''.join(generate(model, corpus.dictionary.idx2word)))
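The explicit p.data.add_(p.grad.data, alpha=-lr) loop above is plain SGD written by hand; an equivalent formulation with torch.optim (shown only as an alternative to the manual update, not as part of this example) would be:

optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate)

# inside the batch loop, replacing the manual parameter update:
optimizer.zero_grad()
output, hidden = model(data, hidden)
loss = criterion(output.view(-1, config.vocab_size), targets)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip)
optimizer.step()

The per-epoch annealing lr /= 4.0 would then become optimizer.param_groups[0]['lr'] /= 4.0.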
Example #12
def build_model(resume):
    """Build the model according to CLI arguments

    Global Dependencies:
        - corpus
        - args
    """
    if resume != "":
        model = torch.load(resume)
        for param in model.parameters():
            param.requires_grad = False
            if param.shape[0] == ntoken and param.shape[1] >= 1:
                param.requires_grad = True
            print(param.shape, param.requires_grad)
        return model

    # noise distribution for noise sampling in NCE
    noise = build_unigram_noise(torch.FloatTensor(corpus.vocab.idx2count))

    norm_term = 'auto' if args.norm_term == -1 else args.norm_term
    # setting up NCELoss modules

    if args.index_module == 'linear':
        criterion = IndexLinear(
            args.nhid,
            ntoken,
            args.trick,
            noise=noise,
            noise_ratio=args.noise_ratio,
            norm_term=norm_term,
            theta=args.theta,
            loss_type=args.loss,
            reduction='none',
            sample_with_replacement=args.sample_with_replacement,
            grouping=args.sample_with_grouping)
        model = RNNModel(
            ntoken,
            args.emsize,
            args.nhid,
            args.nlayers,
            criterion=criterion,
            dropout=args.dropout,
        )
    elif args.index_module == 'gru':
        if args.nlayers != 1:
            logger.warning(
                'Falling back to a one-layer GRU because IndexGRU only supports a single layer')
        nce_criterion = IndexGRU(
            ntoken,
            args.nhid,
            args.nhid,
            args.dropout,
            noise=noise,
            noise_ratio=args.noise_ratio,
            norm_term=norm_term,
        )
        model = GenModel(criterion=nce_criterion)
    else:
        logger.error('The index module [%s] is not supported yet' %
                     args.index_module)
        raise NotImplementedError('index module not supported')

    if args.cuda:
        model.cuda()

    logger.info('model definition:\n %s', model)
    return model
Example #13
def main():
    ''' Main function'''
    torch_num_threads = 25
    torch.set_num_threads(torch_num_threads)
    parser = argparse.ArgumentParser()

    #parser.add_argument('-data', required=True)
    parser.add_argument('-torch_threads', type=int, default=25)
    
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=8)
    
    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=8)
    parser.add_argument('-d_inner_hid', type=int, default=8)

    parser.add_argument('-n_warmup_steps', type=int, default=3)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='model')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')

    parser.add_argument('-network', type=int, default=0) # use social network; need features or deepwalk embeddings as initial input
    parser.add_argument('-pos_emb', type=int, default=1)
    parser.add_argument('-warmup', type=int, default=3) # warmup epochs
    parser.add_argument('-notes', default='')
    parser.add_argument('-data_name', default='twitter')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.network = (opt.network == 1)
    opt.pos_emb = (opt.pos_emb == 1)
    print(opt.notes)
    

    #========= Preparing DataLoader =========#
    train_data = DataLoader(opt.data_name, data=0, load_dict=True, batch_size=opt.batch_size, cuda=opt.cuda, loadNE=opt.network)
    valid_data = DataLoader(opt.data_name, data=1, batch_size=opt.batch_size, cuda=opt.cuda, loadNE=opt.network)
    test_data = DataLoader(opt.data_name, data=2, batch_size=opt.batch_size, cuda=opt.cuda, loadNE=opt.network)

    opt.user_size = train_data.user_size
    if opt.network:
        opt.net = train_data._adj_list
        opt.net_dict = train_data._adj_dict_list
        opt.embeds = train_data._embeds

    #========= Preparing Model =========#
    #print(opt)

    decoder = RNNModel('GRUCell', opt)
    RLLearner = RRModel(decoder)
    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(
            RLLearner.parameters(),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)


    def get_criterion(user_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(user_size)
        weight[Constants.PAD] = 0
        weight[Constants.EOS] = 1
        return nn.CrossEntropyLoss(weight, reduction='sum')

    crit = get_criterion(train_data.user_size)

    if opt.cuda:
        decoder = decoder.cuda()
        RLLearner = RLLearner.cuda()
        crit = crit.cuda()

    train(RLLearner, train_data, valid_data, test_data, crit, optimizer, opt)
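ScheduledOptim is not defined in this snippet; below is a sketch assuming the usual "Attention Is All You Need"-style warm-up wrapper (linear warm-up followed by inverse-square-root decay, scaled by d_model); the method names here are illustrative and may differ from the real class:

class ScheduledOptim:
    """Sketch of a warm-up learning-rate wrapper around an optimizer."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        # Update the learning rate, then take an optimizer step.
        self.n_steps += 1
        scale = (self.d_model ** -0.5) * min(
            self.n_steps ** -0.5, self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = scale
        self.optimizer.step()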
Example #14
    def __init__(self, save_path, seed=111, val_interval=20, val_times=1, controller=None, batch_size=128, grad_clip=0.1, config='eval'):

        args = {'emsize':850, 'nhid':850, 'nhidlast':850, 'dropoute':0.1, 'wdecay':8e-7}
        args['config'] = config

        args['data'] = '../data/penn'
        args['lr'] = 20
        args['clip'] = grad_clip
        args['batch_size'] = batch_size
        args['search_batch_size'] = 256*4
        args['small_batch_size'] = batch_size
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = seed
        args['nonmono'] = 5
        args['log_interval'] = val_interval
        args['val_times'] = val_times
        args['save'] = save_path
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args = AttrDict(args)
        self.args = args
        self.seed = seed
        self.controller = controller
        
        
        
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled=True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        eval_batch_size = 64
        test_batch_size = 1
        args.eval_batch_size = eval_batch_size
        
        
        
        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        
#         self.val_data = batchify(corpus.train[464794:], eval_batch_size, args)
#         self.test_data = batchify(corpus.test, test_batch_size, args)
#         raw_data = batchify(corpus.train, batch_size, None)
#         indx = np.arange(14524)
#         random.shuffle(indx)
        
#         self.train_data = raw_data[indx[0:int(14524/2)],:]
#         self.val_data = raw_data[indx[int(14524/2):],:]
        
        raw_data = batchify(corpus.valid, 1, None)
        val_data = []
        for i in range(len(raw_data)-1-args.bptt):
            val_data.append(raw_data[i:i+args.bptt+1])
        val_data = torch.cat(val_data,1)
        self.val_data = val_data
        
        
        print(self.train_data.shape)
        print(self.search_data.shape)
        print(self.val_data.shape)
        
        self.batch = 0
        self.steps = 0
        self.epochs = 0
        self.total_loss = 0
        self.start_time = time.time()


        ntokens = len(corpus.dictionary)
        #if args.continue_train:
        #    model = torch.load(os.path.join(args.save, 'model.pt'))
#         try:
#             model = torch.load(os.path.join(args.save, 'model.pt'))
#             print('Loaded model from checkpoint')
#         except Exception as e:
#             print(e)
        model = RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast,
               args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute, genotype=genotypes.DARTS)

        size = 0
        for p in model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(model.rnns[0].genotype)

        total_params = sum(x.data.nelement() for x in model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = model.cuda()
        self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
Example #15
else:
    logging.basicConfig(format='%(asctime)s: %(message)s',
                        datefmt='%H:%M:%S',
                        filename=os.path.join(args.out, 'train.log'),
                        level=logging.INFO)
tb.configure(args.out)
random.seed(1024)
torch.manual_seed(1024)
torch.cuda.manual_seed_all(1024)

model = RNNModel(123, 62, 250, 3, args.dropout, bidirectional=args.bi)
if args.init: model.load_state_dict(torch.load(args.init))
else:
    for param in model.parameters():
        torch.nn.init.uniform_(param, -0.1, 0.1)
if args.cuda: model.cuda()

optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=.9)
criterion = CTCLoss()

# data set
trainset = SequentialLoader('train', args.batch_size)
devset = SequentialLoader('dev', args.batch_size)

tri = cvi = 0


def eval():
    global cvi
    losses = []
    tacc = TokenAcc()
Example #16
def main():
    # Add ckp
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument(
        '--data',
        type=str,
        default='/input',  # /input
        help='location of the data corpus')
    parser.add_argument('--checkpoint',
                        type=str,
                        default='',
                        help='model checkpoint to use')
    parser.add_argument(
        '--model',
        type=str,
        default='LSTM',
        help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
    parser.add_argument('--emsize',
                        type=int,
                        default=200,
                        help='size of word embeddings')
    parser.add_argument('--nhid',
                        type=int,
                        default=200,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers',
                        type=int,
                        default=2,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs',
                        type=int,
                        default=40,
                        help='upper epoch limit')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--tied',
                        action='store_true',
                        help='tie the word embedding and softmax weights')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--log-interval',
                        type=int,
                        default=200,
                        metavar='N',
                        help='report interval')
    parser.add_argument(
        '--save',
        type=str,
        default='/output/model.pt',  # /output
        help='path to save the final model')
    args = parser.parse_args()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    # Load checkpoint
    build_vocab = False
    if args.checkpoint != '' and os.path.exists(args.checkpoint):
        print(f'Loading field from {args.checkpoint}')
        save_dict = torch.load(args.checkpoint)
        field = save_dict['field']
        start_epoch = save_dict['start_epoch']
    else:
        save_dict = None
        field = Field(tokenize=split_tokenize, init_token='<init>')
        build_vocab = True
        start_epoch = 0

    ###############################################################################
    # Load data
    ###############################################################################

    train_data, val_data, test_data = TabularDataset.splits(
        path=args.data,
        train='train.txt',
        validation='valid.txt',
        test='test.txt',
        format='tsv',
        fields=[('text', field)])
    print(train_data, len(train_data), val_data, len(val_data), test_data,
          len(test_data))
    if build_vocab:
        field.eos_token = '<eos>'
        field.build_vocab(train_data, val_data, min_freq=1000)
        field.eos_token = None
    eos_id = field.vocab.stoi['<eos>']
    pad_id = field.vocab.stoi[field.pad_token]

    train_iter = BucketIterator(train_data,
                                args.batch_size,
                                train=True,
                                repeat=False,
                                device='cuda:0' if args.cuda else 'cpu:0')
    val_iter = Iterator(val_data,
                        args.batch_size,
                        repeat=False,
                        device='cuda:0' if args.cuda else 'cpu:0')
    test_iter = Iterator(test_data,
                         args.batch_size,
                         repeat=False,
                         device='cuda:0' if args.cuda else 'cpu:0')
    print(train_iter, len(train_iter), val_iter, len(val_iter), test_iter,
          len(test_iter))

    ###############################################################################
    # Build the model
    ###############################################################################

    ntokens = len(field.vocab)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.tied)

    if save_dict is not None:
        model.load_state_dict(save_dict['model'])

    if args.cuda:
        model.cuda()
    else:
        model.cpu()
    print(model)

    if save_dict:
        opt = save_dict['optimizer']
    else:
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.checkpoint:
        torch.save(
            dict(field=field,
                 model=model.state_dict(),
                 optimizer=opt,
                 start_epoch=start_epoch), args.checkpoint)

    ###############################################################################
    # Training code
    ###############################################################################

    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    def make_target(text):
        batch_size = text.size()[1]
        eos_vector = torch.full((1, batch_size),
                                eos_id,
                                dtype=text.dtype,
                                device='cuda:0' if args.cuda else 'cpu:0')
        target = torch.cat((text[1:], eos_vector), dim=0)
        return target

    def compute_loss(output, text):
        output_flat = output.view(-1, ntokens)
        target = make_target(text)
        target_flat = target.view(-1)

        return criterion(output_flat, target_flat)

    def evaluate(data_source):
        # Turn on evaluation mode which disables dropout.
        with torch.no_grad():
            model.eval()
            total_loss = 0
            for batch in data_source:
                output, hidden = model(batch.text)
                loss = compute_loss(output, batch.text)

                total_loss += loss.item()
            return total_loss / len(data_source)

    def train():
        # Turn on training mode which enables dropout.
        model.train()
        total_loss = 0
        start_time = time.time()
        for i, batch in enumerate(train_iter):
            model.zero_grad()

            output, hidden = model(batch.text)
            target = make_target(batch.text)

            loss = compute_loss(output, batch.text)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            opt.step()

            total_loss += loss.item()

            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, i, len(train_iter),
                        elapsed * 1000 / args.log_interval, cur_loss,
                        math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    # Loop over epochs.
    best_val_loss = None

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(start_epoch, args.epochs):
            epoch_start_time = time.time()
            train()
            val_loss = evaluate(val_iter)
            print('-' * 89)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch,
                                           (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
            print('-' * 89)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                if args.checkpoint:
                    torch.save(
                        dict(field=field,
                             model=model.state_dict(),
                             optimizer=opt,
                             start_epoch=epoch), args.checkpoint)
                best_val_loss = val_loss
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    torch.save(
        dict(vocab=field.vocab.itos,
             model=model.state_dict(),
             settings=dict(rnn_type=args.model,
                           emsize=args.emsize,
                           nhid=args.nhid,
                           nlayers=args.nlayers)), args.save)

    # Load the best saved model.
    #with open(args.save, 'rb') as f:
    #    save_dict = torch.load(f)
    #    field = save_dict['field']
    #    if save_dict is not None:
    #        model.load_state_dict(save_dict['model'])
    #
    #    if args.cuda:
    #        model.cuda()
    #    else:
    #        model.cpu()

    # Run on test data.
    test_loss = evaluate(test_iter)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
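The final torch.save above stores the vocabulary, the model weights, and the architecture settings; a hedged sketch of reloading that artifact for inference (assuming dropout and weight tying can be left at the constructor's defaults, since they are not stored in the settings dict):

saved = torch.load(args.save, map_location='cpu')
settings = saved['settings']
itos = saved['vocab']  # list mapping index -> token string
model = RNNModel(settings['rnn_type'], len(itos), settings['emsize'],
                 settings['nhid'], settings['nlayers'])
model.load_state_dict(saved['model'])
model.eval()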
Example #17
File: main.py  Project: Phlix1/exps
encoder = nn.Embedding(ntokens, args.emsize, sparse=False)
util.initialize(encoder.weight)

twht = None
if args.tied:
    if args.nhid != args.emsize and not args.proj:
        raise ValueError(
            'When using the tied flag, hidden must be equal to embedding size')
    twht = encoder.weight

D = args.emsize if args.proj else args.nhid
ss = SampledSoftmax(ntokens, nsampled, D, tied_weight=twht)

net.add_module("encoder", encoder)
net.add_module("decoder", ss)
net.cuda()
tmp_net = net
if world_size >= 1:
    tmp_net = DDP(net)
tmp_net.init_hidden = net.init_hidden
net = tmp_net

print("Batch Size:", args.batch_size * args.scale, "Initial LR:",
      args.lr * args.scale)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), args.lr * args.scale, betas=(0.9, 0.999))
scheduler = LinearLR(optimizer,
                     base_lr=args.lr * args.scale,
                     max_iters=train_corpus.batch_num * args.epochs,
                     last_iter=-1,
                     min_lr=1e-8)