Ejemplo n.º 1
0
 def test_constraint(self):
     """Check that L2Constraint rescales every gradient entry as expected.

     The constraint is applied to the fixture tensors ``self.W`` / ``self.g``
     and each resulting entry is compared with the numpy reference
     ``self.np_g`` scaled by ``threshold / norm``, where ``norm`` is the
     Euclidean norm of ``self.np_g`` divided by its element count.
     """
     limit = 0.02
     constraint = opt.L2Constraint(limit)
     constraint.apply(0, self.W, self.g)

     # Gradient after the constraint has been applied, as a numpy array.
     constrained = tensor.to_numpy(self.g)

     # Reference norm computed from the untouched numpy copy of the gradient.
     reference_norm = np.linalg.norm(self.np_g) / self.np_g.size

     for pos in range(constrained.size):
         expected = self.np_g[pos] * limit / reference_norm
         self.assertAlmostEqual(constrained[pos], expected)
Ejemplo n.º 2
0
    def train(self, data_path, max_epoch, model_path='model'):
        """Train the embedding -> LSTM -> dense model and checkpoint it.

        Every epoch does three things:
          1. one pass over the training batches, running forward/backward one
             sample at a time, accumulating the parameter gradients over the
             batch and applying them once per batch;
          2. one pass over (at most 5000) validation batches, processed a
             whole batch at a time;
          3. on every second epoch and on the last epoch, pickling the
             parameters to '<model_path>_<epoch>.bin'.

        This is Python 2 code (print statements, integer '/' division).

        Args:
            data_path: only referenced by the commented-out load_corpus()
                call; the active code loads its data with load_sample().
            max_epoch: number of epochs to run.
            model_path: prefix of the checkpoint file names.
        """
        # RMSProp optimizer constructed with an L2Constraint(5) gradient
        # constraint (the SGD alternative below is disabled).
        opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
        #opt = optimizer.SGD(momentum=0.9, weight_decay=5e-4)

        # initialize embedding layer
        embed_w = self.embed.param_values()[0]
        embed_b = self.embed.param_values()[1]
        #initializer.uniform(embed_w, 0, embed_w.shape[1])
        embed_w.uniform(-0.08, 0.08)
        embed_b.set_value(0)
        print 'embed weight l1 = %f' % (embed_w.l1())
        print 'embed b l1 = %f' % (embed_b.l1())

        # initialize lstm layer
        lstm_w = self.lstm.param_values()[0]
        lstm_w.uniform(-0.08, 0.08)  # init all lstm parameters
        print 'lstm weight l1 = %f' % (lstm_w.l1())

        # initialize dense layer
        dense_w = self.dense.param_values()[0]
        dense_b = self.dense.param_values()[1]
        dense_w.uniform(-0.1, 0.1)
        dense_b.set_value(0)
        print 'dense w ', dense_w.shape
        print 'dense b ', dense_b.shape
        print 'dense weight l1 = %f' % (dense_w.l1())
        print 'dense b l1 = %f' % (dense_b.l1())

        # Load the data and report how long loading took.
        start = time()
        train_dat, train_label, val_dat, val_label = load_sample() 
        #train_dat, train_label, val_dat, val_label = load_corpus(data_path)
        # presumably turns the labels into 2-column one-hot rows (column 1 is
        # read as the ground truth below) -- helper not shown, TODO confirm
        train_label = word2onehot(train_label, 2)
        val_label = word2onehot(val_label, 2)
        print 'loading time:', time() - start
        print "train data shape:", train_dat.shape, "train label shape:", train_label.shape
        print "val data shape:", val_dat.shape, "val label shape:", val_label.shape
        for epoch in range(max_epoch):
            train_loss = 0
            # Integer division under Python 2; a trailing partial batch is
            # never visited.
            num_train_batch = train_dat.shape[0] / self.batchsize
            glb_acc = 0  # correctly classified training samples this epoch
            for b in range(num_train_batch):
                start = time()
                # load training data
                inputs_arr = train_dat[b * self.batchsize: (b + 1) * self.batchsize]
                labels = train_label[b * self.batchsize: (b + 1) * self.batchsize]
                # lens[i] is used below as the 1-based index of the last valid
                # time step of sample i (rm_padding itself is not shown).
                lens = rm_padding(inputs_arr)
                acc = 0
                batch_loss = 0.0
                # Zeroed gradient accumulators, recreated for every batch.
                g_dense_w = tensor.Tensor(dense_w.shape, self.dev)
                g_dense_w.set_value(0)
                g_dense_b = tensor.Tensor(dense_b.shape, self.dev)
                g_dense_b.set_value(0)
                g_lstm_w = tensor.Tensor(lstm_w.shape, self.dev)
                g_lstm_w.set_value(0)
                g_embed_w = tensor.Tensor(embed_w.shape, self.dev)
                g_embed_w.set_value(0)
                # Samples are pushed through the network one at a time.
                for idx_sam in range(len(inputs_arr)):
                    sam_arr = inputs_arr[idx_sam]
                    sam_arr = convert_sample(sam_arr, sam_arr.shape[0], self.vocab_size, self.dev)
                    sample = tensor.from_numpy(sam_arr)
                    sample.to_device(self.dev)
                    #print sample.shape
                    embed = self.embed.forward(model_pb2.kTrain, sample)
                    #print embed.shape is (53, 128)
                    # embed.shape[0] means the sequence length of the sample
                    # Build the LSTM input list: one (1, embed.shape[1]) tensor
                    # per time step, and an empty Tensor for every step beyond
                    # the rows available in embed.
                    embeded = []
                    for idx_seq in range(self.seq_length):
                        if idx_seq >= embed.shape[0]:
                            embeded.append(tensor.Tensor())
                        else:
                            seq = tensor.Tensor((1,embed.shape[1]), self.dev)
                            # copy row idx_seq of embed into seq; arguments are
                            # presumably (dst, src, count, dst_offset,
                            # src_offset) -- TODO confirm against the API
                            tensor.copy_data_to_from(seq, embed, embed.shape[1], 0, idx_seq * embed.shape[1])
                            embeded.append(seq)
                    embeded.append(tensor.Tensor()) # hx
                    embeded.append(tensor.Tensor()) # cx
                    #print 'forward embedding time:', time() -start
                    #print tensor.to_numpy(embeded[self.seq_length-1])
                   
                    # forward lstm layer
                    hidden = self.lstm.forward(model_pb2.kTrain, embeded)
                    # outputs are [y1, ..., yn, hx, cx], only need the last output as the predicted latent vector
                    #print len(hidden), hidden[embed.shape[0]-1]
                    #print [hidden[i].l1() for i in range(len(hidden))]
                    # forward dense and loss layer
                    # Only the output at the sample's last valid step is
                    # classified.
                    act = self.dense.forward(model_pb2.kTrain, hidden[lens[idx_sam]-1])
                    label = tensor.from_numpy(labels[idx_sam])
                    label.to_device(self.dev)
                    lvalue = self.loss.forward(model_pb2.kTrain, act, label)
                    #print 'forward dense time:', time() - start
                    # self.sft is presumably a softmax layer; it is run in
                    # eval mode purely to obtain scores for the accuracy count.
                    regularized_act = self.sft.forward(model_pb2.kEval, act)
                    pred = tensor.to_numpy(regularized_act)
                    gt = labels[idx_sam][1]
                    # Correct when class 1 scores higher and gt is set, or
                    # class 1 does not score higher and gt is 0.
                    if (gt and pred[0,1] > pred[0,0]) or (gt == 0 and pred[0,1] <= pred[0,0]):
                        acc += 1
                
                    grads = []
                    batch_loss += lvalue.l1() / self.batchsize
                    #print batch_loss
                    start = time()
                    # backward loss and dense layer
                    # Each per-sample gradient is divided by the batch size
                    # before being summed into the accumulators.
                    grad = self.loss.backward() / self.batchsize
                    grad, gwb = self.dense.backward(model_pb2.kTrain, grad)
                    g_dense_w += gwb[0]
                    g_dense_b += gwb[1]
                    #print 'dense_w l1 = %f' % (gwb[0].l1())
                    # Output gradients for the LSTM: the dense-layer gradient
                    # at the last valid step, zeros at every other step, then
                    # two empty tensors (mirroring the hx/cx input slots).
                    for i in range(self.seq_length):
                        if i == lens[idx_sam] - 1:
                            grads.append(grad)
                        else:
                            emp = tensor.Tensor(grad.shape, self.dev)
                            emp.set_value(0)
                            grads.append(emp)
                    grads.append(tensor.Tensor())
                    grads.append(tensor.Tensor())
                    # backward lstm layer
                    lstm_input_grad, lstm_param_grad = self.lstm.backward(model_pb2.kTrain, grads)
                    g_lstm_w += lstm_param_grad[0] 
                    #print 'lstm_input l1 = %f' % (lstm_input_grad[0].l1())
                    #print 'backward lstm'  
                    # Pack the per-step input gradients (all entries except
                    # the last two) row by row into a tensor shaped like embed.
                    # NOTE(review): embed_grad is not zero-initialised and the
                    # loop covers every step, including those past
                    # embed.shape[0] -- confirm this cannot write beyond the
                    # tensor for samples shorter than seq_length.
                    embed_grad = tensor.Tensor(embed.shape, self.dev)
                    for idx in range(len(lstm_input_grad)-2):
                        tensor.copy_data_to_from(embed_grad, lstm_input_grad[idx], embed.shape[1],
					idx * embed.shape[1], 0)
                    _, grad_w = self.embed.backward(model_pb2.kTrain, embed_grad)
                    #print 'backward embedding time:', time() - start
                    #print 'embed weight l1 = %f' % (grad_w[0].l1())
                    g_embed_w += grad_w[0]

                train_loss += batch_loss
                glb_acc += acc

                utils.update_progress(
                    b * 1.0 / num_train_batch, 'training loss = %f, acc = %f' %
                    (batch_loss, acc * 1.0 / self.batchsize))
                # One optimizer step per batch using the accumulated
                # gradients; the embedding bias is not updated (call disabled).
                opt.apply_with_lr(epoch, get_lr(epoch), g_lstm_w, lstm_w, 'lstm_w')
                opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w, 'dense_w')
                opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b, 'dense_b')
                opt.apply_with_lr(epoch, get_lr(epoch), g_embed_w, embed_w, 'embed_w')
                #opt.apply_with_lr(epoch, get_lr(epoch), grad_w[1], embed_b, 'embed_b')
            print '\nEpoch %d, train loss is %f, acc = %f' % \
                    (epoch, train_loss / num_train_batch, glb_acc * 1. / (self.batchsize * num_train_batch))

            # evaluation
            eval_loss = 0
            val_acc = 0
            # At most 5000 validation batches; integer division under Python 2.
            num_test_batch = min(5000, val_dat.shape[0] / self.batchsize)
            for b in range(num_test_batch):
                acc = 0
                val_arr = val_dat[b * self.batchsize: (b + 1) * self.batchsize]
                labels = val_label[b * self.batchsize: (b + 1) * self.batchsize]
                lens = rm_padding(val_arr)
                # Unlike training, the whole batch is converted at once; the
                # first two axes are swapped and the result flattened to
                # (batchsize * seq_length, vocab_size) rows.
                val_arr = convert(val_arr, self.batchsize, self.seq_length,
                                  self.vocab_size, self.dev)
                val_arr = np.swapaxes(val_arr, 0, 1).reshape((
			self.batchsize * self.seq_length, self.vocab_size)) 
                inputs = tensor.from_numpy(val_arr)
                inputs.to_device(self.dev) # shape (128*53, 33366)
                embed = self.embed.forward(model_pb2.kEval, inputs)
                # NOTE(review): the return value is discarded, so this only
                # has an effect if reshape works in place -- confirm.
                embed.reshape((self.seq_length, self.batchsize, self.embed_size))
                # One (batchsize, embed_size) tensor per time step, copied
                # out of consecutive chunks of embed.
                embeded = []
                for idx in range(self.seq_length):
                    embed_seq = tensor.Tensor((self.batchsize, self.embed_size), self.dev)
                    tensor.copy_data_to_from(embed_seq, embed, 
			self.batchsize * self.embed_size, 0, idx * self.batchsize * self.embed_size)
                    embeded.append(embed_seq)
                embeded.append(tensor.Tensor()) # hx
                embeded.append(tensor.Tensor()) # cx

                hidden = self.lstm.forward(model_pb2.kEval, embeded)
                # For sample idx, take its chunk of hidden_size values from
                # the LSTM output at that sample's last valid step and place
                # it at the same offset in hidden_batch.
                hidden_batch = tensor.Tensor((self.batchsize, self.hidden_size), self.dev)
                for idx in range(self.batchsize):
                    tensor.copy_data_to_from(hidden_batch, hidden[lens[idx]-1],
			self.hidden_size, idx * self.hidden_size, idx* self.hidden_size)

                act = self.dense.forward(model_pb2.kEval, hidden_batch)
                labels = tensor.from_numpy(labels)
                labels.to_device(self.dev)
                eval_loss += self.loss.forward(model_pb2.kEval, act, labels).l1()
                regularized_act = self.sft.forward(model_pb2.kEval, act)
                pred = tensor.to_numpy(regularized_act)
                gt = tensor.to_numpy(labels)[:,1]
                # Same correctness rule as in training, applied per row.
                for i in range(self.batchsize):
                    if (gt[i] and pred[i,1] > pred[i,0]) or (gt[i] == 0 and pred[i,1] <= pred[i,0]):
                        acc += 1
                #print 'acc = %f' % (acc * 1. / self.batchsize)
                val_acc += acc
  
            print 'Epoch %d, evaluation loss is %f, acc = %f' % \
                (epoch, eval_loss / num_test_batch, val_acc * 1. / (num_test_batch * self.batchsize))

            # model saving
            # Checkpoint on every second epoch and always on the last one.
            if (epoch + 1) % 2 == 0 or epoch + 1 == max_epoch:
                print 'dense weight l1 = %f' % (dense_w.l1())
                print 'dense bias l1 = %f' % (dense_b.l1())
                print 'lstm weight l1 = %f' % (lstm_w.l1())
                print 'embed weight l1 = %f' % (embed_w.l1())
                # checkpoint the file model
                with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                    # NOTE(review): the message shows the prefix only, not the
                    # '<model_path>_<epoch>.bin' file actually written.
                    print 'saving model to %s' % model_path
                    d = {}
                    for name, w in zip(
                        ['embed_w','embed_b', 'lstm_w', 'dense_w', 'dense_b'],
                        [embed_w, embed_b, lstm_w, dense_w, dense_b]):
                        # Move to host for the numpy conversion, then back to
                        # the training device.
                        w.to_host()
                        d[name] = tensor.to_numpy(w)
                        w.to_device(self.dev)
                    # The triple-quoted string below is disabled code kept as
                    # a no-op expression; nothing in it is executed.
                    '''d['idx_to_char'] = data.idx_to_char
                    d['char_to_idx'] = data.char_to_idx
                    d['hidden_size'] = hidden_size
                    d['num_stacks'] = num_stacks
                    d['dropout'] = dropout'''
                    pickle.dump(d, fd)
Ejemplo n.º 3
0
def train(data,
          max_epoch,
          hidden_size=100,
          seq_length=100,
          batch_size=16,
          num_stacks=1,
          dropout=0.5,
          model_path='model'):
    """Train an LSTM followed by a dense layer and checkpoint the weights.

    Each epoch runs one pass over the training batches (one optimizer step
    per batch), then one pass over the validation batches to report the
    evaluation loss. The parameters are pickled every 30th epoch and also
    after the last epoch, so a run whose ``max_epoch`` is not a multiple of
    30 still leaves a saved model behind.

    Args:
        data: dataset object; this function reads ``vocab_size``,
            ``train_dat``, ``val_dat``, ``num_train_batch``,
            ``num_test_batch``, ``idx_to_char`` and ``char_to_idx`` from it.
        max_epoch: number of epochs to run.
        hidden_size: hidden size passed to the LSTM layer; also the input
            sample size of the dense layer.
        seq_length: sequence length passed to ``convert`` and used to
            normalise the reported losses.
        batch_size: number of rows taken from the data per batch; the loss
            gradient is divided by it.
        num_stacks: number of stacked LSTM layers.
        dropout: dropout value passed to the LSTM layer.
        model_path: prefix of the checkpoint files, which are written as
            ``'<model_path>_<epoch>.bin'``.

    Returns:
        None. Progress is printed and checkpoints are written to disk.
    """
    # RMSProp optimizer constructed with an L2Constraint(5) gradient
    # constraint.
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu()

    # Recurrent layer; all of its parameters live in one tensor.
    rnn = layer.LSTM(name='lstm',
                     hidden_size=hidden_size,
                     num_stacks=num_stacks,
                     dropout=dropout,
                     input_sample_shape=(data.vocab_size, ))
    rnn.to_device(cuda)
    print('created rnn')
    rnn_w = rnn.param_values()[0]
    rnn_w.uniform(-0.08, 0.08)  # init all rnn parameters
    print('rnn weight l1 = %f' % rnn_w.l1())

    # Output layer mapping a hidden state to vocab_size scores.
    dense = layer.Dense('dense',
                        data.vocab_size,
                        input_sample_shape=(hidden_size, ))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    # Written with '%s' so the output is the same text on Python 2 and 3.
    print('dense w  %s' % (dense_w.shape, ))
    print('dense b  %s' % (dense_b.shape, ))
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    print('dense weight l1 = %f' % dense_w.l1())
    dense_b.set_value(0)
    print('dense b l1 = %f' % dense_b.l1())

    # Gradient accumulators for the dense layer, reused across batches and
    # zeroed at the start of each one.
    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)

    lossfun = loss.SoftmaxCrossEntropy()
    for epoch in range(max_epoch):
        # ---- training pass ----
        train_loss = 0
        for b in range(data.num_train_batch):
            batch = data.train_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            # Two empty tensors fill the trailing hidden/cell-state slots of
            # the LSTM input list.
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())

            # Drop the last two outputs, keeping one output per time step.
            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
            grads = []
            batch_loss = 0
            g_dense_w.set_value(0.0)
            g_dense_b.set_value(0.0)
            for output, label in zip(outputs, labels):
                act = dense.forward(model_pb2.kTrain, output)
                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
                batch_loss += lvalue.l1()
                grad = lossfun.backward()
                grad /= batch_size
                grad, gwb = dense.backward(model_pb2.kTrain, grad)
                grads.append(grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
            utils.update_progress(
                b * 1.0 / data.num_train_batch,
                'training loss = %f' % (batch_loss / seq_length))
            train_loss += batch_loss

            # Trailing empty gradients mirror the two extra input slots.
            grads.append(tensor.Tensor())
            grads.append(tensor.Tensor())
            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
            dense_w, dense_b = dense.param_values()
            lr = get_lr(epoch)
            opt.apply_with_lr(epoch, lr, g_rnn_w, rnn_w, 'rnnw')
            opt.apply_with_lr(epoch, lr, g_dense_w, dense_w, 'dense_w')
            opt.apply_with_lr(epoch, lr, g_dense_b, dense_b, 'dense_b')
        print('\nEpoch %d, train loss is %f' %
              (epoch, train_loss / data.num_train_batch / seq_length))

        # ---- evaluation pass ----
        eval_loss = 0
        for b in range(data.num_test_batch):
            batch = data.val_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
            for output, label in zip(outputs, labels):
                output = dense.forward(model_pb2.kEval, output)
                eval_loss += lossfun.forward(model_pb2.kEval, output,
                                             label).l1()
        print('Epoch %d, evaluation loss is %f' %
              (epoch, eval_loss / data.num_test_batch / seq_length))

        # ---- checkpoint ----
        # Save every 30th epoch and always after the final epoch; without
        # the second condition a short run would never write a model.
        if (epoch + 1) % 30 == 0 or epoch + 1 == max_epoch:
            ckpt_path = '%s_%d.bin' % (model_path, epoch)
            with open(ckpt_path, 'wb') as fd:
                # Report the file actually written, not just the prefix.
                print('saving model to %s' % ckpt_path)
                d = {}
                for name, w in zip(['rnn_w', 'dense_w', 'dense_b'],
                                   [rnn_w, dense_w, dense_b]):
                    # Move to host for the numpy conversion, then back to
                    # the training device.
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(cuda)
                d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout

                pickle.dump(d, fd)
Ejemplo n.º 4
0
from singa import initializer
from singa.proto import model_pb2
from tqdm import tnrange
from word2tensor import load_data, numpy2tensors, convert, labelconvert

import time


def get_lr(epoch):
    """Return the learning rate for ``epoch``.

    The rate starts at 0.001 and is halved every 50 epochs
    (epochs 0-49 -> 0.001, 50-99 -> 0.0005, 100-149 -> 0.00025, ...).

    Args:
        epoch: zero-based, non-negative integer epoch index.

    Returns:
        The learning rate as a float.
    """
    # Floor division keeps the shift count an integer on both Python 2 and
    # Python 3; with true division `1 << (epoch / 50)` raises TypeError on
    # Python 3 because a float cannot be used as a shift count.
    return 0.001 / float(1 << (epoch // 50))


if __name__ == "__main__":
    # SGD with L2 gradient normalization
    vocab_size = 7000
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu_on(1)
    encoder = layer.LSTM(name='lstm1',
                         hidden_size=64,
                         num_stacks=5,
                         dropout=0.5,
                         input_sample_shape=(vocab_size, ))
    decoder = layer.LSTM(name='lstm2',
                         hidden_size=64,
                         num_stacks=5,
                         dropout=0.5,
                         input_sample_shape=(vocab_size, ))
    encoder.to_device(cuda)
    decoder.to_device(cuda)
    encoder_w = encoder.param_values()[0]
    encoder_w.uniform(-0.08, 0.08)