Example #1
def build_model(corpus, model_name, emsize, nhid, nlayers, dropout, dropouth,
                dropouti, dropoute, wdrop, lr, tied, resume, cuda):
    criterion = None

    ntokens = len(corpus.dictionary)
    model = model_module.RNNModel(model_name, ntokens, emsize, nhid, nlayers,
                                  dropout, dropouth, dropouti, dropoute, wdrop,
                                  tied)
    ###
    if resume:
        print('Resuming model ...')
        model, criterion, optimizer = model_load(resume)
        optimizer.param_groups[0]['lr'] = lr
        model.dropouti, model.dropouth, model.dropout, model.dropoute = dropouti, dropouth, dropout, dropoute
        if wdrop:
            from weight_drop import WeightDrop
            for rnn in model.rnns:
                if type(rnn) == WeightDrop: rnn.dropout = wdrop
                elif rnn.zoneout > 0: rnn.zoneout = wdrop
    ###
    if not criterion:
        splits = []
        if ntokens > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif ntokens > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        print('Using', splits)
        criterion = SplitCrossEntropyLoss(emsize, splits=splits, verbose=False)
    ###
    if cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    ###
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] *
                       x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    # `args` is not defined in this function; report the explicit hyperparameters instead
    print('Model config:', model_name, '| emsize', emsize, '| nhid', nhid, '| nlayers', nlayers)
    print('Model total parameters:', total_params)

    return model, criterion, None
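
For context, here is a hedged usage sketch of the function above. The hyperparameter values mirror the argparse defaults shown in Example #9; `data.Corpus` and the module-level names used inside `build_model` (`model_module`, `SplitCrossEntropyLoss`, `model_load`) are assumptions about the surrounding awd-lstm-style project.

# Illustrative sketch only, not part of the original example.
import data

corpus = data.Corpus('data/penn/')
model, criterion, _ = build_model(
    corpus, model_name='LSTM', emsize=400, nhid=1150, nlayers=3,
    dropout=0.4, dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.0,
    lr=30, tied=True, resume='', cuda=True)
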
Example #2
def build_model(args, corpus):
    criterion = None
    ntokens = len(corpus.dictionary)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    ###
    if args.resume:
        logging.info('Resuming model ...')
        model, criterion, optimizer = model_load(args.resume_path)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropouti, model.dropouth, model.dropout, model.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
        if args.wdrop:
            from weight_drop import WeightDrop
            for rnn in model.rnns:
                if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
                elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
    ###
    if not criterion:
        splits = []
        if ntokens > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif ntokens > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        logging.info(f'Using {splits}')
        criterion = SplitCrossEntropyLoss(args.emsize,
                                          splits=splits,
                                          verbose=False)
    ###
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] *
                       x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    logging.info(f'Args: {args}')
    logging.info(f'Model total parameters: {total_params}')

    if args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    return model, criterion
Example #3
 def build_criterion(self):
     splits = []
     if self.ninp > 500000:
         # One Billion
         # This produces fairly even matrix mults for the buckets:
         # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
         splits = [4200, 35000, 180000]
     elif self.ninp > 75000:
         # WikiText-103
         splits = [2800, 20000, 76000]
     logging.info('Using splits: {}'.format(splits))
     self.criterion = SplitCrossEntropyLoss(self.ninp, splits=splits, verbose=False)
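
The vocabulary-size thresholds above recur in every example on this page. A small helper that factors them out could look like the sketch below; the function name is illustrative and does not appear in any of the snippets.

def vocab_splits(ntokens):
    """Return vocabulary bucket boundaries for SplitCrossEntropyLoss."""
    if ntokens > 500000:
        # One Billion Word benchmark
        return [4200, 35000, 180000]
    if ntokens > 75000:
        # WikiText-103
        return [2800, 20000, 76000]
    return []  # small vocabularies (e.g. Penn Treebank) keep a single bucket

# e.g. criterion = SplitCrossEntropyLoss(emsize, splits=vocab_splits(ntokens), verbose=False)
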
Example #4
    #    for rnn in model.rnns:
    #        if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
    #        elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
if False: # or args.jit:
    print('Jitting ...')
    model.eval()
    model.lmr = torch.jit.trace(model.lmr, (torch.rand([args.bptt, args.batch_size, args.emsize]).cuda(), torch.rand([1, args.batch_size, args.emsize]).cuda()))
#model = torch.jit.trace_module(model, torch.zeros((args.bptt, args.batch_size), dtype=torch.long))
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)
Example #5

criterion = None
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize,
                                      splits=splits,
                                      verbose=False)


def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
Example #6
    #             rnn.zoneout = args.whhdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    elif ntokens > 20000:
        splits = [5000, 20000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.nhid, splits=splits,
                                      tied_weights=args.tied, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
print('{:-^60}'.format(''))
print('Args:', args)
print('{:-^60}'.format(''))
print('Model parameters:', count_parameters(model))
print('Criterion parameters:', count_parameters(criterion))


###############################################################################
# Training code
Example #7
    if args.wdrop:
        for rnn in model.rnn.cells:
            rnn.hh.dropout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    tools.print_log(args.save, splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
    if args.mode == 'GPT':
        criterion_gpt = CrossEntropyLoss(ignore_index=-1)

###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(filter(lambda x: x.requires_grad, model.parameters())) + list(criterion.parameters())
total_params = sum(p.data.nelement() for p in params if p.requires_grad)
if args.mode == 'GPT':
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'ln_'] # Add 'ln_1' to test if it's better
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and 'transformer' in n],
Example #8
        for rnn in model.rnns:
            if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
            elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)

###############################################################################
# Training code
###############################################################################

def evaluate(data_source, batch_size=10):
Example #9
def main():

    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data',
                        type=str,
                        default='data/penn/',
                        help='location of the data corpus')
    parser.add_argument('--model',
                        type=str,
                        default='LSTM',
                        help='type of recurrent net (LSTM, QRNN, GRU)')
    parser.add_argument('--emsize',
                        type=int,
                        default=400,
                        help='size of word embeddings')
    parser.add_argument('--nhid',
                        type=int,
                        default=1150,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers',
                        type=int,
                        default=3,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=30,
                        help='initial learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs',
                        type=int,
                        default=8000,
                        help='upper epoch limit')
    parser.add_argument('--max-steps-per-epoch',
                        type=int,
                        default=-1,
                        help='upper limit on steps per epoch')
    parser.add_argument('--batch-size',
                        type=int,
                        default=80,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--bptt', type=int, default=70, help='sequence length')
    parser.add_argument('--warmup',
                        type=int,
                        default=4000,
                        help='warmup for learning rate')
    parser.add_argument('--cooldown',
                        type=int,
                        default=None,
                        help='cooldown for learning rate')
    parser.add_argument(
        '--accumulate',
        type=int,
        default=1,
        help='number of batches to accumulate before gradient update')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.4,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--dropouth',
                        type=float,
                        default=0.3,
                        help='dropout for rnn layers (0 = no dropout)')
    parser.add_argument(
        '--dropouti',
        type=float,
        default=0.65,
        help='dropout for input embedding layers (0 = no dropout)')
    parser.add_argument(
        '--dropoute',
        type=float,
        default=0.1,
        help='dropout to remove words from embedding layer (0 = no dropout)')
    parser.add_argument(
        '--wdrop',
        type=float,
        default=0.0,
        help=
        'amount of weight dropout to apply to the RNN hidden to hidden matrix')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--nonmono', type=int, default=5, help='non-monotone trigger: epochs without validation improvement before switching to ASGD')
    parser.add_argument('--cuda', action='store_false', help='use CUDA (on by default; the flag disables it)')
    parser.add_argument('--log-interval',
                        type=int,
                        default=200,
                        metavar='N',
                        help='report interval')
    randomhash = ''.join(str(time.time()).split('.'))
    parser.add_argument('--save',
                        type=str,
                        default=randomhash + '.pt',
                        help='path to save the final model')
    parser.add_argument(
        '--alpha',
        type=float,
        default=2,
        help=
        'alpha L2 regularization on RNN activation (alpha = 0 means no regularization)'
    )
    parser.add_argument(
        '--beta',
        type=float,
        default=1,
        help=
        'beta slowness regularization applied on RNN activation (beta = 0 means no regularization)'
    )
    parser.add_argument('--wdecay',
                        type=float,
                        default=1.2e-6,
                        help='weight decay applied to all weights')
    parser.add_argument('--resume',
                        type=str,
                        default='',
                        help='path of model to resume')
    parser.add_argument('--optimizer',
                        type=str,
                        default='sgd',
                        help='optimizer to use (sgd, adam)')
    parser.add_argument(
        '--when',
        nargs="+",
        type=int,
        default=[-1],
        help=
        'When (which epochs) to divide the learning rate by 10 - accepts multiple'
    )
    parser.add_argument(
        '--discard-highest-losses',
        type=float,
        default=0.0,
        help=
        'discard highest percentage of prediction losses before executing an optimizer step'
    )
    parser.add_argument(
        '--enlarge-model-every-n-epochs',
        type=int,
        default=-1,
        help='enlarge model (hidden and embedding dims) after every n epochs')

    args = parser.parse_args()
    args.tied = True

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, but CUDA was disabled "
                "(note that passing --cuda turns it off in this script)"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    ###############################################################################
    # Load data
    ###############################################################################

    import os
    import hashlib
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    eval_batch_size = min(100, args.batch_size)
    print('Eval batch size of', eval_batch_size)
    test_batch_size = 8
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    ###############################################################################
    # Build the model
    ###############################################################################

    from splitcross import SplitCrossEntropyLoss
    criterion = None

    ntokens = len(corpus.dictionary)
    print('Total number of tokens:', ntokens)
    #model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.BoomRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    if args.enlarge_model_every_n_epochs <= 0:
        model = SHARNN(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.dropouth,
                       args.dropouti, args.dropoute, args.wdrop, args.tied)
    else:
        model = None
    #model = model.AttnRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.RecAttn(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.LNRNN(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.LNRR(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    ###

    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)

    if model is not None:
        if args.resume and args.epochs > 0:
            print('Resuming model ...')
            criterion = model_load(args.resume, model)
            #optimizer.param_groups[0]['lr'] = args.lr
            model.dropouti, model.dropouth, model.dropout, model.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
            #if args.wdrop:
            #    from weight_drop import WeightDrop
            #    for rnn in model.rnns:
            #        if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
            #        elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
        ###
        if not criterion:

            criterion = SplitCrossEntropyLoss(args.emsize,
                                              splits=splits,
                                              verbose=False)
        ###
        if args.cuda:
            model = model.cuda()
            criterion = criterion.cuda()
        if False:  # or args.jit:
            print('Jitting ...')
            model.eval()
            model.lmr = torch.jit.trace(model.lmr, (torch.rand([
                args.bptt, args.batch_size, args.emsize
            ]).cuda(), torch.rand([1, args.batch_size, args.emsize]).cuda()))
        #model = torch.jit.trace_module(model, torch.zeros((args.bptt, args.batch_size), dtype=torch.long))
        ###

    ###############################################################################
    # Training code
    ###############################################################################

    # Loop over epochs.
    #lr = args.lr
    best_val_loss = []
    stored_loss = 100000000

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        if model is not None:
            model, optimizer, params = init_optimizer(args, model, criterion)

        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            discard_highest_losses = args.discard_highest_losses * (
                args.epochs - epoch + 1) / args.epochs
            if args.enlarge_model_every_n_epochs > 0 and (
                    epoch - 1) % args.enlarge_model_every_n_epochs == 0:
                prev_model = model
                current_factor = (args.enlarge_model_every_n_epochs + epoch -
                                  1) / (args.enlarge_model_every_n_epochs +
                                        args.epochs)
                emsize = int(args.emsize * current_factor)
                nhid = int(args.nhid * current_factor)
                print(
                    f'enlarge model: emsize={emsize}, nhid={nhid} (discard_highest_losses={discard_highest_losses})'
                )
                model = SHARNN(args.model, ntokens, emsize, nhid, args.nlayers,
                               args.dropout, args.dropouth, args.dropouti,
                               args.dropoute, args.wdrop, args.tied)
                criterion = SplitCrossEntropyLoss(emsize,
                                                  splits=splits,
                                                  verbose=False)
                if args.cuda:
                    model = model.cuda()
                    criterion = criterion.cuda()
                if prev_model is not None:
                    model.load_from_smaller_and_freeze(prev_model)
                model, optimizer, params = init_optimizer(
                    args, model, criterion)

            train(model,
                  optimizer,
                  criterion,
                  args,
                  train_data,
                  params,
                  epoch=epoch - 1,
                  max_steps=args.max_steps_per_epoch,
                  discard_highest_losses=discard_highest_losses)
            if 't0' in optimizer.param_groups[0]:
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(model, criterion, args, val_data)
                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss2,
                        math.exp(val_loss2), val_loss2 / math.log(2)))
                print('-' * 89)

                if val_loss2 < stored_loss:
                    model_save(args.save, model, criterion)
                    print('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()

            else:
                val_loss = evaluate(model, criterion, args, val_data,
                                    eval_batch_size)
                print('-' * 89)
                print(
                    '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                        epoch, (time.time() - epoch_start_time), val_loss,
                        math.exp(val_loss), val_loss / math.log(2)))
                print('-' * 89)

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[
                        0] and (len(best_val_loss) > args.nonmono and
                                val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(),
                                                 lr=args.lr,
                                                 t0=0,
                                                 lambd=0.,
                                                 weight_decay=args.wdecay)

                if epoch in args.when:
                    print('Saving model before learning rate decreased')
                    model_save('{}.e{}'.format(args.save, epoch), model,
                               criterion)
                    print('Dividing learning rate by 10')
                    optimizer.param_groups[0]['lr'] /= 10.

                best_val_loss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    criterion = model_load(args.save, model)

    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] *
                       x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    print('Model total parameters:', total_params)

    # Run on test data.
    test_loss = evaluate(model, criterion, args, test_data, test_batch_size)
    print('=' * 89)
    print(
        '| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'
        .format(test_loss, math.exp(test_loss), test_loss / math.log(2)))
    print('=' * 89)
Example #10
if args.resume:
    print('Resuming model ...')
    model_load(args.resume)
    optimizer.param_groups[0]['lr'] = args.lr
    model.dropouti, model.dropouth, model.dropout, model.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
    if args.wdrop:
        from weight_drop import WeightDrop
        for rnn in model.rnns:
            if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
            elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
###
if not criterion:
    if args.sampling_loss:
        criterion = SamplingLoss(args.emsize, k=args.k, obj=args.obj, Z=args.Z, noise=unigram, q=args.q, b=args.b, g=args.g)  
    else:
        criterion = SplitCrossEntropyLoss(args.emsize, q=args.q, b=args.b, g=args.g)
###

if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###

params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)

###############################################################################
# Training code
###############################################################################
Example #11
                    default=False,
                    help='Calculate model performance on the test set')
parser.add_argument(
    '--significance_testing',
    action='store_true',
    default=False,
    help='Performance significance testing on baseline and MTS model')
parser.add_argument(
    '--unit_ablation',
    action='store_true',
    default=False,
    help='Evaluate model performance with unit ablation for layer 2')

args = parser.parse_args()

criterion = SplitCrossEntropyLoss(400, splits=[], verbose=False)
#entropy_calc = Entropy_calculation(400, splits=[], verbose=False)

seed = 141
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)


def model_load(fn):
    #global model
    with open(fn, 'rb') as f:
        # optionally pass map_location=torch.device('cpu') to torch.load when loading on CPU
        model, criterion_m, optim = torch.load(f)
    return model
Example #12
            elif rnn.zoneout > 0:
                rnn.zoneout = model.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize,
                                      splits=splits,
                                      verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###

params, parser_params = [], []
for n, p in model.named_parameters():
    if "_att_" in n:
        parser_params.append(p)
    else:
        params.append(p)
for n, p in criterion.named_parameters():
    if "_att_" in n:
Example #13
                               verbose=False)

    if args.kd:
        model_t = load_teacher(main_args.logs_path, args, ntokens)
        if model_t is None:
            raise Exception("Teacher model not found")
        model_t.eval()

    log_stats = vars(args)
    log_stats['experiment_id'] = main_args.experiment_id
    log_stats['init_time'] = init_time
    log_stats['num_params'] = sum(
        x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
        for x in custom_model.parameters() if x.size())

    criterion = SplitCrossEntropyLoss(args.emsize, splits=[], verbose=False)
    # criterion = torch.nn.CrossEntropyLoss()
    criterion_kd = DistillKL(args.kd_tau)

    if args.cuda:
        custom_model = custom_model.to(cuda)
        criterion = criterion.to(cuda)
        criterion_kd = criterion_kd.to(cuda)
        if args.kd:
            model_t = model_t.to(cuda)

    params = list(custom_model.parameters()) + list(
        criterion.parameters()) + list(criterion_kd.parameters())

    optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
Example #14
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    else:
        splits = [1400, 10000, 32000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize,
                                      splits=splits,
                                      verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
    # model_r = model_r.cuda()
    model_mlp = model_mlp.cuda()
###
params = list(model.parameters()) + list(model_mlp.parameters()) + list(
    criterion.parameters())
total_params = sum(x.size()[0] *
                   x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)
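
None of the snippets above show the criterion actually being applied. As a rough sketch, assuming the splitcross.py implementation that ships with the awd-lstm-lm codebase (whose forward takes the decoder weight, the decoder bias, the flattened RNN outputs, and the targets), a single training step would look roughly like this; `model`, `criterion`, `optimizer`, `params`, `get_batch`, `train_data`, `hidden`, and `args` are assumed to exist as in the snippets above.

# Sketch only; variable names follow the snippets above and are not guaranteed by any one project.
i = 0  # offset of the current BPTT window into train_data
data_batch, targets = get_batch(train_data, i, args)   # (seq_len x batch) token ids
output, hidden = model(data_batch, hidden)              # flattened RNN outputs
raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)
optimizer.zero_grad()
raw_loss.backward()
torch.nn.utils.clip_grad_norm_(params, args.clip)
optimizer.step()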