Example #1
0
def main():
    """Train a character-level LSTM language model (Chainer) on a text file.

    Parses command-line options, slices the corpus into truncated-BPTT
    mini-batches, and runs a manual training loop with a linearly decayed
    Adam learning rate.  Every ``interval`` iterations the model is
    checkpointed and training/validation losses (bits per character) are
    appended to ``<out>/log``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--bproplen', '-l', type=int, default=200,
                        help='Number of words in each mini-batch '
                             '(= length of truncated BPTT)')
    parser.add_argument('--epoch', '-e', type=int, default=40,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--file', default="enwik8",
                        help='path to text file for training')
    parser.add_argument('--unit', '-u', type=int, default=2800,
                        help='Number of LSTM units')
    parser.add_argument('--embd', type=int, default=400,
                        help='Number of embedding units')
    parser.add_argument('--hdrop', type=float, default=0.2,
                        help='hidden state dropout (variational)')
    parser.add_argument('--edrop', type=float, default=0.5,
                        help='embedding dropout')
    args = parser.parse_args()

    nembd = args.embd
    # Number of training iterations per model save, log write, and
    # validation-set evaluation.
    interval = 100

    pdrop = args.hdrop    # variational hidden-state dropout probability
    pdrope = args.edrop   # embedding dropout probability

    # Learning-rate schedule: linear decay from alpha0 towards 0 with
    # inverse rate dec_it, clamped below at alpha_min.
    alpha0 = .001
    dec_it = 12 * 9000
    alpha_min = .00007

    # First ntrain characters of the dataset are used for training.
    ntrain = 90000000

    seqlen = args.bproplen
    nbatch = args.batchsize

    text, mapping = get_char(args.file)
    sequence = np.array(text).astype(np.int32)

    # Inputs are characters 0..ntrain-1; targets are the same stream
    # shifted one position ahead (next-character prediction).
    itrain = sequence[0:ntrain]
    ttrain = sequence[1:ntrain + 1]
    fullseql = ntrain // nbatch

    itrain = itrain.reshape(nbatch, fullseql)
    ttrain = ttrain.reshape(nbatch, fullseql)

    # Doesn't use the full validation set: a fixed 500k-character slice,
    # reshaped into rows of 1000 characters for batched evaluation.
    nval = 500000
    ival = sequence[ntrain:ntrain + nval]
    tval = sequence[ntrain + 1:ntrain + nval + 1]
    ival = ival.reshape(ival.shape[0] // 1000, 1000)
    tval = tval.reshape(tval.shape[0] // 1000, 1000)

    nvocab = max(sequence) + 1  # train is just an array of integers
    print('#vocab =', nvocab)

    # Prepare an RNNLM model
    rnn = RNNForLM(nvocab, args.unit, args.embd)
    model = L.Classifier(rnn)
    model.compute_accuracy = False  # we only want the perplexity
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()  # make the GPU current
        model.to_gpu()

    # Set up an optimizer
    optimizer = Adam(alpha=alpha0)
    optimizer.setup(model)
    resultdir = args.out

    print('starting')
    nepoch = args.epoch

    start = 0
    loss_sum = 0

    # makedirs(exist_ok=True) also creates missing parents and avoids the
    # check-then-create race of the isdir()+mkdir() pattern.
    os.makedirs(resultdir, exist_ok=True)

    # 1.4427 ~= 1/ln(2): converts nat-based cross-entropy to bits.
    vloss = 1.4427 * test(rnn, ival, tval)
    with open(os.path.join(resultdir, 'log'), 'w') as f:
        f.write("Initial Validation loss (bits/char): " + str(vloss) + '\n')

    i = 0
    epoch_num = 0
    it_num = 0

    while True:
        fin = start + seqlen

        # Wrap around at the end of the corpus (dropping the final partial
        # window); stop once nepoch full sweeps have completed.
        if fin > itrain.shape[1]:
            start = 0
            fin = start + seqlen
            epoch_num = epoch_num + 1
            if epoch_num == nepoch:
                break

        inputs = itrain[:, start:fin]
        targets = ttrain[:, start:fin]
        start = fin

        inputs = Variable(inputs)
        targets = Variable(targets)
        targets.to_gpu()
        inputs.to_gpu()
        it_num += 1
        loss = 0
        # Re-apply weight normalization before the forward pass.
        rnn.applyWN()

        # Variational dropout: one hidden-state mask per BPTT window,
        # scaled by 1/(1-p) so expected activations are unchanged.
        mask = cp.zeros((inputs.shape[0], args.unit), dtype=cp.float32)
        ind = cp.nonzero(cp.random.rand(inputs.shape[0], args.unit) > pdrop)
        mask[ind] = 1 / (1 - pdrop)

        # Embedding dropout mask, same inverted-dropout scaling.
        mask2 = cp.zeros((inputs.shape[0], nembd), dtype=cp.float32)
        ind = cp.nonzero(cp.random.rand(inputs.shape[0], nembd) > pdrope)
        mask2[ind] = 1 / (1 - pdrope)

        # Truncated BPTT: accumulate the per-step loss over the window.
        for j in range(seqlen):
            output = rnn(inputs[:, j], mask, mask2)
            loss = loss + F.softmax_cross_entropy(output, targets[:, j])
        loss = loss / seqlen

        # Zero all gradients before updating them.
        rnn.zerograds()
        loss_sum += loss.data

        # Calculate and update all gradients.
        loss.backward()

        # Use the optimizer to move all parameters of the network
        # to values which will reduce the loss.
        optimizer.update()
        # Decay the learning rate linearly, but never below alpha_min.
        # (Set after update(), so it takes effect from the next iteration.)
        optimizer.alpha = alpha0 * (dec_it - it_num) / float(dec_it)
        if optimizer.alpha < alpha_min:
            optimizer.alpha = alpha_min

        # Cut the computation graph so gradients never flow past this window.
        loss.unchain_backward()

        if (i + 1) % interval == 0:
            rnn.reset_state()
            # Convert both losses from nats to bits (binary entropy).
            vloss = 1.4427 * test(rnn, ival, tval)
            loss_sum = 1.4427 * loss_sum / interval

            serializers.save_npz(os.path.join(resultdir, 'model'), rnn)

            # This is a character-level model, so both losses are reported
            # in bits per character.
            outstring = ("Training iteration: " + str(i + 1)
                         + " Training loss (bits/char): " + str(loss_sum)
                         + " Validation loss (bits/char): " + str(vloss)
                         + '\n')
            with open(os.path.join(resultdir, 'log'), 'a') as f:
                f.write(outstring)
            print("Training iteration: " + str(i + 1))
            print('training loss: ' + str(loss_sum))
            print('validation loss: ' + str(vloss))
            loss_sum = 0

        i += 1
Example #2
0
def train(source_bpe, target_bpe, source_glove, target_glove, chunk_length,
          batch_size, warmup_steps, save_decimation, num_steps, gpu_id, out,
          log_level):
    """Run the Transformer training loop.

    Configures console + file logging under *out*, builds the vocabularies
    and dataset, restores any previously saved training state, then trains
    for up to *num_steps* mini-batches, checkpointing every
    *save_decimation* steps and once more on exit (even on error).
    """
    if not os.path.exists(out):
        os.makedirs(out)

    level = getattr(logging, log_level)
    plain = logging.Formatter('%(message)s')

    console = logging.StreamHandler(sys.stdout)
    console.setLevel(level)
    console.setFormatter(plain)

    logfile = logging.FileHandler(filename=os.path.join(out, 'training.log'),
                                  mode='a')
    logfile.setLevel(level)
    logfile.setFormatter(plain)

    logger.addHandler(console)
    logger.addHandler(logfile)
    logger.setLevel(level)

    if gpu_id is None:
        gpu_id = -1

    # Negative GPU id means CPU (Intel64 backend); otherwise use CuPy.
    device = f'@cupy:{gpu_id}' if gpu_id >= 0 else '@intel64'

    with chainer.using_device(device):
        source_vocab = make_vocab(source_glove)
        target_vocab = make_vocab(target_glove)
        output_model_dim = target_vocab.embedding_size
        dataset = make_dataset(source_bpe, target_bpe, source_vocab,
                               target_vocab, chunk_length)
        iterator = MultithreadIterator(dataset, batch_size)
        state = TrainingState()
        model = Transformer(source_vocab, target_vocab)
        model.to_gpu(gpu_id)
        optimizer = Adam(beta1=0.99, beta2=0.98, eps=1e-9).setup(model)

        load_training(out, model, optimizer, state)

        try:
            for step, batch in enumerate(iterator):
                if step >= num_steps:
                    break

                # Periodic checkpoint, taken before this step's update.
                if (step + 1) % save_decimation == 0:
                    save_training(out, model, optimizer, state)

                model.cleargrads()
                gc.collect()

                src, tgt = stack_nested(batch)

                for piece in (src.token_ids, src.masks,
                              tgt.token_ids, tgt.masks):
                    piece.to_gpu(gpu_id)

                output_probs = model.train_forward(src.token_ids,
                                                   tgt.token_ids,
                                                   input_masks=src.masks,
                                                   output_masks=tgt.masks)

                # Flatten (batch, time, vocab) -> (batch*time, vocab) and
                # take an unreduced, per-token cross-entropy.
                flat_probs = F.reshape(
                    output_probs,
                    (output_probs.shape[0] * output_probs.shape[1],
                     output_probs.shape[2]))
                flat_targets = F.reshape(
                    tgt.token_ids,
                    (tgt.token_ids.shape[0] * tgt.token_ids.shape[1], ))
                per_token_loss = F.softmax_cross_entropy(flat_probs,
                                                         flat_targets,
                                                         reduce='no')

                # Average only over non-masked target positions.
                keep = xp.reshape(
                    xp.logical_not(tgt.masks.array).astype(xp.float32),
                    (tgt.masks.shape[0] * tgt.masks.shape[1], ))
                loss = F.sum(per_token_loss * keep) / F.sum(keep)
                loss.backward()

                # Warmup-then-inverse-sqrt schedule scaled by the model dim.
                learning_rate = (output_model_dim**-0.5) * min(
                    (state.step**-0.5), state.step * (warmup_steps**-1.5))
                optimizer.alpha = learning_rate
                optimizer.update()

                logger.info(
                    f'time = {int(time.time())} | step = {state.step} | loss = {float(loss.array)} | lr = {learning_rate}'
                )

                state.step += 1
        finally:
            # Always leave a final checkpoint behind.
            save_training(out, model, optimizer, state)