Example #1
def train():
    best_val = None
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here gradient is not divided by batch_size yet.
            # So we multiply max_norm by batch_size to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.batch_size / args.bptt / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))
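The loops above rely on two small helpers defined elsewhere in the script. A minimal sketch of what they typically look like in this kind of word-language-model example (exact signatures are an assumption based on the call sites):

def detach(hidden):
    # Cut the autograd graph so backpropagation stops at the current BPTT window.
    if isinstance(hidden, (tuple, list)):
        return [detach(h) for h in hidden]
    return hidden.detach()

def get_batch(source, i):
    # Slice a (seq_len, batch_size) chunk of inputs plus the shifted-by-one targets.
    seq_len = min(args.bptt, source.shape[0] - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len]
    return data, target.reshape((-1,))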
Example #2
def train():
    total_loss = 0.0
    start_time = time.time()
    hidden = model.begin_state(func=mx.nd.zeros,
                               batch_size=args.batch_size,
                               ctx=context)
    for batch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
        data, target = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = detach(hidden)
        with autograd.record():
            output, hidden = model(data, hidden)
            loss = criterion(output, target)
        loss.backward()

        grads = [p.grad(context) for p in model.collect_params().values()]
        # grad clipping helps prevent the exploding gradient problem in RNNs / LSTMs.
        gluon.utils.clip_global_norm(grads,
                                     args.clip * args.bptt * args.batch_size)
        trainer.step(args.bptt * args.batch_size)
        total_loss += mx.nd.sum(loss).asscalar() / loss.shape[0]

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} '
                  '| ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch + 1, batch,
                      len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss,
                      math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
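Example #2 also reads several globals (context, criterion, trainer, lr, epoch) that are set up outside the function. A hedged sketch of a typical setup; the concrete values and the pre-built model variable are assumptions:

import mxnet as mx
from mxnet import gluon

context = mx.cpu()  # or mx.gpu(0)
lr = 20.0           # assumed initial learning rate
criterion = gluon.loss.SoftmaxCrossEntropyLoss()
# 'model' is assumed to be an already-initialized Gluon block exposing begin_state()
# (one possible definition is sketched after Example #5).
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': lr, 'momentum': 0, 'wd': 0})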
Example #3
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)

        # add time checkpoint for logging
        before_time = start_time

        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt size
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:

                # log interval latency print
                batch_latency = time.time() - before_time
                before_time = time.time()

                cur_L = total_L / args.log_interval
                print(
                    '[Epoch %d Batch %d] loss %.2f, ppl %.2f, batch_latency %.4f'
                    % (epoch, i, cur_L, math.exp(cur_L), batch_latency))
                total_L = 0.0

            if args.export_model:
                model.export('model')
                return

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' %
              (epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer.set_learning_rate(args.lr)
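Examples #3 and #9 call model.export('model') inside the training loop. export() only works on a HybridBlock that has been hybridized and run at least once, so a sketch of the minimum required around it (assuming the model is a HybridBlock):

model.hybridize()                     # must be called before the forward pass being traced
output, hidden = model(data, hidden)  # one forward pass builds the cached graph
model.export('model')                 # writes model-symbol.json and model-0000.params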
Example #4
def eval(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
    for i in range(0, data_source.shape[0] - 1, args.bptt):
        data, target = get_batch(data_source, i)
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return total_L / ntotal
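eval() strides over data_source.shape[0] in steps of args.bptt, which assumes the token stream was already reshaped into (nbatch, batch_size) columns. A sketch of that batchify step as it usually appears alongside get_batch:

def batchify(data, batch_size):
    # Drop the tail that does not fit, then lay the stream out column-wise.
    nbatch = data.shape[0] // batch_size
    data = data[:nbatch * batch_size]
    return data.reshape((batch_size, nbatch)).T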
Example #5
def eval(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
    for i, (data, target) in enumerate(data_source):
        data = data.as_in_context(context).T
        target = target.as_in_context(context).T.reshape((-1, 1))
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return total_L / ntotal
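Every train()/eval() above assumes a model with begin_state() and a forward pass that returns (output, hidden). A hedged sketch of such a block, in the usual embedding -> LSTM -> decoder shape (hyperparameter names are assumptions):

from mxnet import gluon
from mxnet.gluon import nn, rnn

class RNNModel(gluon.Block):
    """Embedding, multi-layer LSTM, and a dense decoder over the vocabulary."""
    def __init__(self, vocab_size, num_embed, num_hidden, num_layers,
                 dropout=0.5, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.encoder = nn.Embedding(vocab_size, num_embed)
            self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout,
                                input_size=num_embed)
            self.decoder = nn.Dense(vocab_size, in_units=num_hidden)
            self.num_hidden = num_hidden

    def forward(self, inputs, hidden):
        emb = self.drop(self.encoder(inputs))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.reshape((-1, self.num_hidden)))
        return decoded, hidden

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)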
Example #7
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1,
                                         args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(
                grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f' %
                      (epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' %
              (epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {
                'learning_rate': args.lr,
                'momentum': 0,
                'wd': 0
            })
            model.collect_params().load(args.save, context)
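trainer._init_optimizer() above is a private Trainer method. Because the optimizer here is plain SGD with zero momentum (no state to reset), the same learning-rate annealing can be written with the public API, as Examples #3 and #9 do:

args.lr = args.lr * 0.25
trainer.set_learning_rate(args.lr)                 # public equivalent of re-initializing SGD here
model.collect_params().load(args.save, context)    # roll back to the best checkpoint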
Example #8
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer._init_optimizer('sgd',
                                    {'learning_rate': args.lr,
                                     'momentum': 0,
                                     'wd': 0})
            model.collect_params().load(args.save, context)
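Examples #7 and #8 checkpoint through the ParameterDict (collect_params().save/load), while #3 and #9 use the Block-level methods. In recent MXNet releases the Block-level API is the recommended spelling:

model.save_parameters(args.save)                 # rather than model.collect_params().save(args.save)
model.load_parameters(args.save, ctx=context)    # rather than model.collect_params().load(args.save, context)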
Example #9
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt size
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

            if args.export_model:
                model.export('model')
                return

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer.set_learning_rate(args.lr)
Example #10
def evaluate(data_source, batch_size):
    '''https://mxnet.incubator.apache.org/api/python/autograd/autograd.html#train-mode-and-predict-mode'''
    tic = time.time()
    total_loss = 0
    N = 0
    states = model.begin_state(batch_size, ctx=ctxs[0])
    for cursor in range(0, data_source.shape[0] - 1, args.bptt):
        Xs, Ys = get_batch(data_source, cursor, args)
        # By default, MXNet is in predict_mode
        output, states, _, _ = model(
            Xs, states)  # state(num_layers, bsz, hidden_size)
        states = detach(states)
        total_loss += nd.sum(batch_size *
                             loss(output, Ys)).asscalar()  # loss (seq_len,)
        N += batch_size * len(output)

    return (total_loss / N), time.time() - tic
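The docstring in evaluate() points at MXNet's train/predict modes: outside autograd.record() the network runs in predict mode, so dropout is a no-op during evaluation without any explicit eval switch. A small self-contained illustration:

from mxnet import autograd, nd
from mxnet.gluon import nn

drop = nn.Dropout(0.5)
x = nd.ones((2, 4))
print(drop(x))                      # predict mode (default): returns x unchanged
with autograd.record(train_mode=True):
    print(drop(x))                  # train mode: elements zeroed, the rest scaled by 1/(1-p)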
Example #11
def eval(data_source, ctx):
    total_L = 0.0
    ntotal = 0
    hidden_states = [
        model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size // len(ctx), ctx=ctx[i])
        for i in range(len(ctx))
    ]
    for i in range(0, data_source.shape[0] - 1, args.bptt):
        data_batch, target_batch = get_batch(data_source, i)
        data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
        target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
        for (d, t) in zip(data, target):
            hidden = hidden_states[d.context.device_id]
            output, hidden = model(d, hidden)
            L = loss(output, t.reshape((-1,)))
            total_L += mx.nd.sum(L).asscalar()
            ntotal += L.size
    return total_L / ntotal
Example #12
def eval(data_source, ctx):
    total_L = 0.0
    ntotal = 0
    hidden_states = [
        model.begin_state(func=mx.nd.zeros, batch_size=int(args.batch_size/len(ctx)), ctx=ctx[i])
        for i in range(len(ctx))
    ]
    for i in range(0, data_source.shape[0] - 1, args.bptt):
        data_batch, target_batch = get_batch(data_source, i)
        data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
        target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
        for (d, t) in zip(data, target):
            hidden = hidden_states[d.context.device_id]
            output, hidden = model(d, hidden)
            L = loss(output, t.reshape((-1,)))
            total_L += mx.nd.sum(L).asscalar()
            ntotal += L.size
    return total_L / ntotal
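The multi-device eval() variants split each (bptt, batch_size) batch along axis 1 and keep one hidden state per device, indexed by device_id. A minimal illustration of that splitting (the two-GPU context list and shapes are assumptions):

import mxnet as mx
from mxnet import gluon

ctx = [mx.gpu(0), mx.gpu(1)]                 # assumed device list
data_batch = mx.nd.zeros((35, 32))           # (bptt, batch_size)
shards = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
for d in shards:
    # each shard is (35, 16) and lives on its own device;
    # d.context.device_id is what indexes hidden_states in the loops above
    print(d.shape, d.context, d.context.device_id)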
Example #13
def train(epochs, ctx):
    best_val = float("Inf")

    for epoch in range(epochs):
        total_L = 0.0
        cur_L = 0.0
        tic = time.time()
        hidden_states = [
            model.begin_state(func=mx.nd.zeros,
                              batch_size=args.batch_size // len(ctx),
                              ctx=ctx[i]) for i in range(len(ctx))
        ]
        btic = time.time()
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1,
                                         args.bptt)):
            # get data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)
            # For RNN we can do within batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch,
                                              ctx_list=ctx,
                                              batch_axis=1)
            target = gluon.utils.split_and_load(target_batch,
                                                ctx_list=ctx,
                                                batch_axis=1)
            Ls = []
            for (d, t) in zip(data, target):
                # get corresponding hidden state then update hidden
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1, )))
                    L.backward()
                    Ls.append(L)
                # write back to the record
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [p.grad(c) for p in model.collect_params().values()]
                # Here gradient is for the whole batch.
                # So we multiply max_norm by batch_size and bptt size to balance it.
                # Also this utility function needs to be applied within the same context
                gluon.utils.clip_global_norm(
                    grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)
            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info(
                    '[Epoch %d Batch %d] Speed: %f samples/sec loss %.2f, ppl %.2f'
                    % (epoch, ibatch, args.batch_size /
                       (time.time() - btic), cur_L, math.exp(cur_L)))
                total_L = 0.0
            btic = time.time()

        logging.info('[Epoch %d] train loss %.2f, train ppl %.2f' %
                     (epoch, cur_L, math.exp(cur_L)))
        logging.info('[Epoch %d] time cost %.2f' % (epoch, time.time() - tic))
        val_L = eval(val_data, ctx)
        logging.info('[Epoch %d] valid loss %.2f, valid ppl %.2f' %
                     (epoch, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            # test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            # logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {
                'learning_rate': args.lr,
                'momentum': 0,
                'wd': 0
            })
            model.collect_params().load('model.params', ctx)
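Examples #13 and #14 report progress through the logging module rather than print; a hypothetical one-line configuration for that:

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')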
Example #14
def train(epochs, ctx):
    best_val = float("Inf")

    for epoch in range(epochs):
        total_L = 0.0
        start_time = time.time()
        hidden_states = [
            model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size // len(ctx), ctx=ctx[i])
            for i in range(len(ctx))
        ]
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            # get data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)
            # For RNN we can do within batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
            target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
            Ls = []
            for (d, t) in zip(data, target):
                # get corresponding hidden state then update hidden
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1,)))
                    L.backward()
                    Ls.append(L)
                # write back to the record
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [p.grad(c) for p in model.collect_params().values()]
                # Here gradient is for the whole batch.
                # So we multiply max_norm by batch_size and bptt size to balance it.
                # Also this utility function needs to be applied within the same context
                gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)
            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data, ctx)

        logging.info('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd',
                {
                    'learning_rate': args.lr,
                    'momentum': 0,
                    'wd': 0
                }
            )
            model.collect_params().load('model.params', ctx)
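Throughout these examples args.clip is scaled by bptt and/or batch_size because backward() runs on a summed, unnormalized per-token loss. For reference, a simplified sketch of what gluon.utils.clip_global_norm does (the real utility additionally warns on non-finite norms):

import math

def clip_global_norm_sketch(arrays, max_norm):
    # Joint L2 norm over all gradient arrays.
    total_norm = math.sqrt(sum(float((a ** 2).sum().asscalar()) for a in arrays))
    scale = max_norm / (total_norm + 1e-8)
    if scale < 1.0:
        for a in arrays:
            a *= scale               # rescale in place, preserving direction
    return total_norm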
Example #15
def train_one_epoch(epoch, cur_lr):
    '''Train all the batches within one epoch.
    costs is a container created once and reused for efficiency.'''

    total_loss = 0
    states = [model.begin_state(batch_size=m, ctx=ctx) for ctx in ctxs]

    # Loop all batches
    batch, cursor = 0, 0
    tic_log_interval = time.time()
    while cursor < train_data.shape[0] - 1 - 1:
        #######################################################################
        # Control seq_len cited from origin paper
        random_bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Normal distribution (mean, std): prevent extreme sequence lengths
        seq_len = max(5, int(np.random.normal(random_bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        # Rescale learning rate depending on the variable length w.r.t bptt
        trainer.set_learning_rate(cur_lr * seq_len / args.bptt)
        ########################################################################
        '''Each batch has shape (seq_len, batch_size); split data across devices.
        m is the number of samples per device, divided along the batch_size axis.'''
        Xs, Ys = get_batch(train_data, cursor, args, seq_len=seq_len)
        assert args.batch_size == Xs.shape[1], 'data shape[1] should be batch_size'
        Xs = gluon.utils.split_and_load(Xs, ctxs, 1)
        Ys = gluon.utils.split_and_load(Ys, ctxs, 1)
        tic_b = time.time()

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        states = detach(states)
        loss_list = []
        with autograd.record():  # train_mode
            for i, X in enumerate(Xs):
                output, states[i], encoded_raw, encoded_dropped = model(
                    X, states[i])  # state(num_layers, bsz, hidden_size)
                device_loss = joint_loss(output, Ys[i], encoded_raw,
                                         encoded_dropped)
                loss_list.append(device_loss.as_in_context(ctxs[0]) / X.size)
        for l in loss_list:
            l.backward()
        ''' trainer.allreduce_grads()
            For each parameter, reduce the gradients from different contexts.
            Should be called after autograd.backward(), outside of record() scope, and before trainer.update().
            For normal parameter updates, step() should be used, which internally calls allreduce_grads() and then update().
            However, in gradient clipping, manually call allreduce_grads() and update() separately.
        '''
        # trainer.allreduce_grads()
        # grads = [p.grad(ctxs[0]) for p in parameters]
        grads = [p.grad(ctx) for ctx in ctxs for p in parameters]
        gluon.utils.clip_global_norm(grads, args.clipping_theta)
        trainer.step(1)
        # trainer.update(1)

        batch_loss = sum([nd.sum(l).asscalar() for l in loss_list]) / len(ctxs)
        toc_b = time.time()
        batch_info.append([
            epoch, batch, trainer.learning_rate, seq_len,
            (toc_b - tic_b) * 1000,
            args.batch_size * seq_len // (toc_b - tic_b), batch_loss,
            math.exp(batch_loss)
        ])

        total_loss += batch_loss

        if batch % args.log_interval == 0 and batch > 0:
            utils.save_info(batch_info, batch_file)

            toc_log_interval = time.time()
            total_loss = total_loss / args.log_interval

            logging.info(
                '| epoch {:4d} ({:5.2f}%)| batch {:4d} | lr {:7.4f} | seq_len {:2d} | {:4.0f} ms/batch | '
                '{:5d} tokens/s | loss {:6.3f} | ppl {:5.2f}'.format(
                    epoch, cursor / train_data.shape[0] * 100, batch,
                    trainer.learning_rate, seq_len,
                    (toc_log_interval - tic_log_interval) * 1000 /
                    args.log_interval,
                    int(args.batch_size * args.log_interval * seq_len /
                        (toc_log_interval - tic_log_interval)), total_loss,
                    math.exp(total_loss)))

            total_loss = 0
            tic_log_interval = time.time()

        batch += 1
        cursor += seq_len

        global parameters_count
        if not parameters_count:
            logging.info('Parameters (except embedding): {}'.format(
                sum(p.data(ctxs[0]).size for p in parameters)))
            parameters_count = 1

    nd.waitall()  # synchronize batch data
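Examples #10 and #15 call get_batch with extra arguments and a variable seq_len, unlike the two-argument version sketched after Example #1. A hedged guess at that variant, inferred purely from the call sites:

def get_batch(source, cursor, args, seq_len=None):
    # Honor the requested (possibly randomized) sequence length, clipped to what remains.
    seq_len = min(seq_len if seq_len else args.bptt, source.shape[0] - 1 - cursor)
    data = source[cursor:cursor + seq_len]
    target = source[cursor + 1:cursor + 1 + seq_len]
    return data, target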