def create_net(self, dropout=0.5, nclass=2):
    """Build the classification net: embed -> stacked LSTM -> dense -> softmax.

    Layers are stored as attributes and moved to ``self.dev`` as they are
    created.  Expects ``self.embed_size``, ``self.vocab_size``,
    ``self.hidden_size``, ``self.num_stack_layers`` and ``self.dev`` to be
    set beforehand (presumably by ``__init__`` — confirm in the full class).

    Args:
        dropout (float): dropout ratio for the stacked LSTM.
        nclass (int): number of output classes; defaults to 2, matching the
            previously hard-coded binary output.  Parameterized so the dense
            output shape and the softmax input shape can never disagree.
    """
    # One-hot vocab vector -> dense embedding.
    self.embed = layer.Dense('embed', self.embed_size,
                             input_sample_shape=(self.vocab_size, ))
    self.embed.to_device(self.dev)
    self.lstm = layer.LSTM(name='lstm',
                           hidden_size=self.hidden_size,
                           num_stacks=self.num_stack_layers,
                           dropout=dropout,
                           input_sample_shape=(self.embed_size, ))
    self.lstm.to_device(self.dev)
    # Final hidden state -> class scores; nclass replaces the hard-coded 2.
    self.dense = layer.Dense('dense', nclass,
                             input_sample_shape=(self.hidden_size, ))
    self.dense.to_device(self.dev)
    self.sft = layer.Softmax('softmax', input_sample_shape=(nclass, ))
    self.sft.to_device(self.dev)
    self.loss = loss.SoftmaxCrossEntropy()
def __init__(self, vocab_size, hidden_size=32):
    """Character-level RNN: an LSTM over one-hot characters followed by a
    linear projection back onto the vocabulary.

    Args:
        vocab_size (int): number of distinct characters.
        hidden_size (int): LSTM hidden dimension (default 32).
    """
    super(CharRNN, self).__init__()
    # Remember the model dimensions for use elsewhere in the class.
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    # Recurrent core plus the output projection.
    self.rnn = layer.LSTM(vocab_size, hidden_size)
    self.dense = layer.Linear(hidden_size, vocab_size)
    # Shape-plumbing layers used when wiring the forward pass.
    self.cat = layer.Cat()
    self.reshape1 = layer.Reshape()
    self.reshape2 = layer.Reshape()
    # Training objective and optimizer.
    self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
    self.optimizer = opt.SGD(0.01)
        # NOTE(review): fragment — the enclosing function and loop headers
        # are outside this chunk; the indentation below is a best guess.
        # Presumably maps an index sequence ``x`` to words via ``dic``.
        a = x[i]
        sen.append(dic[a])
    return sen


if __name__ == "__main__":
    # Restore a seq2seq checkpoint ('71.bin') and rebuild the model on GPU 1.
    model_file = open('71.bin', 'rb')
    param = pickle.load(model_file)
    model_file.close()
    decoderw = param['decoder_w']
    densew, denseb = param['dense_w'], param['dense_b']
    hiddensize = param['hidden_size']
    numstacks = param['num_stacks']
    drop_out = param['dropout']
    vocab_size = 7000  # fixed vocabulary size; must match the checkpoint
    cuda = device.create_cuda_gpu_on(1)
    encoder = layer.LSTM(name='lstm1', hidden_size=hiddensize,
                         num_stacks=numstacks, dropout=drop_out,
                         input_sample_shape=(vocab_size,))
    decoder = layer.LSTM(name='lstm2', hidden_size=hiddensize,
                         num_stacks=numstacks, dropout=drop_out,
                         input_sample_shape=(vocab_size,))
    encoder.to_device(cuda)
    decoder.to_device(cuda)
    # NOTE(review): the encoder weights are randomly re-initialized here
    # while decoder and dense weights are restored from the checkpoint —
    # verify the checkpoint really lacks an 'encoder_w' entry.
    encoder_w = encoder.param_values()[0]
    encoder_w.uniform(-0.08, 0.08)
    decoder.param_values()[0].copy_from_numpy(decoderw, offset=0)
    dense = layer.Dense('dense', vocab_size,
                        input_sample_shape=(hiddensize,))
    dense.to_device(cuda)
    dense.param_values()[0].copy_from_numpy(densew, offset=0)
    dense.param_values()[1].copy_from_numpy(denseb, offset=0)
    # Load the Q/A corpus metadata; idx2w maps word index -> word string.
    metadata, idx_q, idx_a = load_data()
    idx2w = metadata['idx2w']
def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
          num_stacks=1, dropout=0.5, model_path='model'):
    """Train a stacked-LSTM character model and checkpoint it periodically.

    Args:
        data: corpus object; this function reads ``vocab_size``,
            ``train_dat``, ``num_train_batch``, ``val_dat``,
            ``num_test_batch``, ``idx_to_char`` and ``char_to_idx`` from it.
        max_epoch (int): number of training epochs.
        hidden_size (int): LSTM hidden dimension.
        seq_length (int): characters per unrolled sequence.
        batch_size (int): sequences per mini-batch.
        num_stacks (int): number of stacked LSTM layers.
        dropout (float): LSTM dropout ratio.
        model_path (str): prefix for checkpoint files
            ('<model_path>_<epoch>.bin', written every 30 epochs).
    """
    # SGD with L2 gradient normalization
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
                     num_stacks=num_stacks, dropout=dropout,
                     input_sample_shape=(data.vocab_size, ))
    rnn.to_device(cuda)
    print 'created rnn'
    rnn_w = rnn.param_values()[0]
    # init all rnn parameters
    rnn_w.uniform(-0.08, 0.08)
    print 'rnn weight l1 = %f' % (rnn_w.l1())
    # Output projection: hidden state -> per-character logits.
    dense = layer.Dense('dense', data.vocab_size,
                        input_sample_shape=(hidden_size, ))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    print 'dense w ', dense_w.shape
    print 'dense b ', dense_b.shape
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    print 'dense weight l1 = %f' % (dense_w.l1())
    dense_b.set_value(0)
    print 'dense b l1 = %f' % (dense_b.l1())
    # Accumulators for the dense-layer gradients, summed over time steps.
    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)
    lossfun = loss.SoftmaxCrossEntropy()
    for epoch in range(max_epoch):
        train_loss = 0
        for b in range(data.num_train_batch):
            batch = data.train_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            # Two empty tensors appended as the initial hidden/cell states —
            # presumably the SINGA LSTM placeholder convention; confirm
            # against the layer.LSTM API.
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            # Drop the trailing hx/cx outputs; keep per-step hidden outputs.
            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
            grads = []
            batch_loss = 0
            g_dense_w.set_value(0.0)
            g_dense_b.set_value(0.0)
            # Forward/backward through the shared dense layer at every step,
            # accumulating its weight/bias gradients across the sequence.
            for output, label in zip(outputs, labels):
                act = dense.forward(model_pb2.kTrain, output)
                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
                batch_loss += lvalue.l1()
                grad = lossfun.backward()
                grad /= batch_size
                grad, gwb = dense.backward(model_pb2.kTrain, grad)
                grads.append(grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                # print output.l1(), act.l1()
            utils.update_progress(
                b * 1.0 / data.num_train_batch,
                'training loss = %f' % (batch_loss / seq_length))
            train_loss += batch_loss
            # Empty placeholders for the hidden/cell state gradients, then
            # backprop through time to get the LSTM weight gradient.
            grads.append(tensor.Tensor())
            grads.append(tensor.Tensor())
            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
            dense_w, dense_b = dense.param_values()
            # Per-batch parameter updates for all three weight tensors.
            opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w,
                              'dense_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b,
                              'dense_b')
        print '\nEpoch %d, train loss is %f' % \
            (epoch, train_loss / data.num_train_batch / seq_length)
        # Evaluation pass (no parameter updates).
        # NOTE(review): iterates num_test_batch but slices val_dat —
        # confirm the data object treats these as the same split.
        eval_loss = 0
        for b in range(data.num_test_batch):
            batch = data.val_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
            for output, label in zip(outputs, labels):
                output = dense.forward(model_pb2.kEval, output)
                eval_loss += lossfun.forward(model_pb2.kEval, output,
                                             label).l1()
        print 'Epoch %d, evaluation loss is %f' % \
            (epoch, eval_loss / data.num_test_batch / seq_length)
        if (epoch + 1) % 30 == 0:
            # checkpoint the file model
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print 'saving model to %s' % model_path
                d = {}
                # Weights must be on the host before converting to numpy;
                # move each back to the GPU afterwards.
                for name, w in zip(['rnn_w', 'dense_w', 'dense_b'],
                                   [rnn_w, dense_w, dense_b]):
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(cuda)
                d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout
                pickle.dump(d, fd)
def sample(model_path, nsamples=100, seed_text='', do_sample=True):
    """Generate ``nsamples`` characters from a trained char-RNN checkpoint.

    Args:
        model_path (str): pickle file written by ``train`` containing the
            weights, the char/index maps and the hyper-parameters.
        nsamples (int): number of characters to emit to stdout.
        seed_text (str): optional prefix fed through the net to warm up the
            hidden state before sampling.
        do_sample (bool): sample from the softmax distribution when True,
            otherwise take the argmax (greedy decoding).
    """
    with open(model_path, 'rb') as fd:
        d = pickle.load(fd)
    rnn_w = tensor.from_numpy(d['rnn_w'])
    idx_to_char = d['idx_to_char']
    char_to_idx = d['char_to_idx']
    vocab_size = len(idx_to_char)
    dense_w = tensor.from_numpy(d['dense_w'])
    dense_b = tensor.from_numpy(d['dense_b'])
    hidden_size = d['hidden_size']
    num_stacks = d['num_stacks']
    dropout = d['dropout']

    # Rebuild the net with the checkpointed hyper-parameters and weights.
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
                     num_stacks=num_stacks, dropout=dropout,
                     input_sample_shape=(len(idx_to_char), ))
    rnn.to_device(cuda)
    rnn.param_values()[0].copy_data(rnn_w)
    dense = layer.Dense('dense', vocab_size,
                        input_sample_shape=(hidden_size, ))
    dense.to_device(cuda)
    dense.param_values()[0].copy_data(dense_w)
    dense.param_values()[1].copy_data(dense_b)

    # Zeroed initial hidden and cell states.
    hx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
    cx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
    hx.set_value(0.0)
    cx.set_value(0.0)

    def _step(cur, hx, cx):
        # One decode step: one-hot encode character index ``cur``, run it
        # through the LSTM and dense layers, and softmax the logits.
        # Returns (char distribution, new hx, new cx).  Extracted because
        # the seed loop and the sampling loop previously duplicated it.
        x = np.zeros((1, vocab_size), dtype=np.float32)
        x[0, cur] = 1
        tx = tensor.from_numpy(x)
        tx.to_device(cuda)
        outputs = rnn.forward(False, [tx, hx, cx])
        y = dense.forward(False, outputs[0])
        y = tensor.softmax(y)
        return y, outputs[1], outputs[2]

    if len(seed_text) > 0:
        # Warm up the hidden state on the seed; keep only the last y.
        for c in seed_text:
            y, hx, cx = _step(char_to_idx[c], hx, cx)
        sys.stdout.write(seed_text)
    else:
        # No seed: start from the uniform distribution over characters.
        y = tensor.Tensor((1, vocab_size), cuda)
        y.set_value(1.0 / vocab_size)

    for i in range(nsamples):
        y.to_host()
        prob = tensor.to_numpy(y)[0]
        if do_sample:
            cur = np.random.choice(vocab_size, 1, p=prob)[0]
        else:
            cur = np.argmax(prob)
        sys.stdout.write(idx_to_char[cur])
        y, hx, cx = _step(cur, hx, cx)
    print('')
import time


def get_lr(epoch):
    """Learning-rate schedule: start at 1e-3, halve every 50 epochs.

    Uses floor division for the shift amount: on Python 3, ``epoch / 50``
    is a float and ``1 << float`` raises TypeError; ``//`` keeps the
    original Python 2 integer behavior on both versions.
    """
    return 0.001 / float(1 << (epoch // 50))


if __name__ == "__main__":
    # SGD with L2 gradient normalization
    vocab_size = 7000
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu_on(1)
    # Encoder/decoder pair of stacked LSTMs over one-hot word vectors.
    encoder = layer.LSTM(name='lstm1', hidden_size=64, num_stacks=5,
                         dropout=0.5, input_sample_shape=(vocab_size, ))
    decoder = layer.LSTM(name='lstm2', hidden_size=64, num_stacks=5,
                         dropout=0.5, input_sample_shape=(vocab_size, ))
    encoder.to_device(cuda)
    decoder.to_device(cuda)
    encoder_w = encoder.param_values()[0]
    encoder_w.uniform(-0.08, 0.08)
    decoder_w = decoder.param_values()[0]
    decoder_w.uniform(-0.08, 0.08)
    # Output projection from decoder hidden state (64) back to vocab.
    dense = layer.Dense('dense', vocab_size, input_sample_shape=(64, ))
    # NOTE(review): the script continues past this chunk.