def __fit_one(self, link, content_layers, style_patches):
    """Run one optimization step on the image held by ``link``.

    Combines content MSE terms, nearest-neighbor patch style terms and a
    total-variation regularizer, backpropagates, and steps the optimizer.
    Returns a list of (label, loss-value) pairs for logging.
    """
    xp = self.xp
    link.zerograds()
    feature_maps = self.model(link.x)
    # When color is preserved, style features come from a grayscale pass.
    style_features = self.model(util.gray(link.x)) if self.keep_color else feature_maps
    report = []
    total = Variable(xp.zeros((), dtype=np.float32))
    # Content terms: MSE against the fixed content activations.
    for name, target in content_layers:
        term = self.content_weight * F.mean_squared_error(
            feature_maps[name], Variable(target.data))
        report.append(('content_' + name, float(term.data)))
        total += term
    # Style terms: nearest-neighbor patch matching.
    for name, patches, patches_norm in style_patches:
        current = style_features[name]
        near, size, size2 = util.nearest_neighbor_patch(current, patches, patches_norm)
        term = self.style_weight * (
            F.sum(F.square(current)) * size2 / size - 2 * F.sum(near) / size)
        report.append(('style_' + name, float(term.data)))
        total += term
    # Total-variation regularizer on the image itself.
    tv_term = self.tv_weight * util.total_variation(link.x)
    report.append(('tv', float(tv_term.data)))
    total += tv_term
    total.backward()
    self.optimizer.update()
    return report
def __fit_one(self, link, content_layers, style_grams):
    """Run one optimization step: content MSE + Gram-matrix style MSE + TV.

    Backpropagates the combined loss and steps the optimizer; returns a list
    of (label, loss-value) pairs for logging.
    """
    xp = self.xp
    link.zerograds()
    feature_maps = self.model(link.x)
    # Style statistics come from a grayscale pass when color is preserved.
    style_features = self.model(util.gray(link.x)) if self.keep_color else feature_maps
    report = []
    total = Variable(xp.zeros((), dtype=np.float32))
    # Content terms against the fixed target activations.
    for name, target in content_layers:
        term = self.content_weight * F.mean_squared_error(
            feature_maps[name], Variable(target.data))
        report.append(('content_' + name, float(term.data)))
        total += term
    # Style terms: match Gram matrices of the current activations.
    for name, target_gram in style_grams:
        gram = util.gram_matrix(style_features[name])
        term = self.style_weight * F.mean_squared_error(gram, Variable(target_gram.data))
        report.append(('style_' + name, float(term.data)))
        total += term
    tv_term = self.tv_weight * util.total_variation(link.x)
    report.append(('tv', float(tv_term.data)))
    total += tv_term
    total.backward()
    self.optimizer.update()
    return report
def optimizeCRNN(iterNum, maxIndex, indicies):
    """Train an EvalCRNN language model with RMSpropGraves and return its cRNN.

    iterNum  -- number of minibatches per epoch
    maxIndex -- vocabulary size passed to EvalCRNN
    indicies -- flat sequence of token indices, consumed in chunks of 1000

    Runs 3 epochs, halving the learning rate after each, then evaluates on the
    10 batches following the training range before returning the inner cRNN.
    """
    batchSize = 1000
    model = EvalCRNN(maxIndex, 500)
    print(len(indicies), computeEntropy(maxIndex, indicies))
    learningRate = 0.001
    epoch = 3
    for j in range(epoch):
        # A fresh optimizer per epoch picks up the decayed learning rate.
        my_optimizer = optimizers.RMSpropGraves(lr=learningRate)
        my_optimizer.setup(model)
        my_optimizer.add_hook(optimizer.GradientClipping(1))
        model.cRNN.reset()
        # BUG FIX: the dummy loss accumulator must be float32 — Chainer only
        # backpropagates through float variables; the original int array was
        # fragile. Shape (1, 1) kept from the original.
        loss = Variable(np.zeros((1, 1), dtype=np.float32))
        for i in range(iterNum):
            # BUG FIX: time.clock() was removed in Python 3.8; use wall time.
            t1 = time.time()
            model.zerograds()
            # Truncated BPTT: cut the graph of the previous step's loss.
            loss.unchain_backward()
            loss = model(indicies[batchSize * i:batchSize * (i + 1)], iterNum * batchSize)
            loss.backward()
            t2 = time.time()
            msg = "iter: " + str(i + iterNum * j + 1) + "/" + str(iterNum * epoch)
            msgLoss = "loss: " + str(loss.data / batchSize)
            msgNorm = "grad: " + str(my_optimizer.compute_grads_norm())
            msgTime = "time: " + str(t2 - t1) + " seconds"
            print(msgLoss, msgNorm, msg, msgTime)
            my_optimizer.update()
        learningRate *= 0.50
    # NOTE(review): this evaluation call passes one argument while the training
    # calls pass two — confirm EvalCRNN.__call__ has a default second argument.
    print(model(indicies[batchSize * (iterNum):batchSize * (iterNum + 10)]).data / (batchSize * 10))
    return model.cRNN
def __fit_one(self, link, content4_2, style3_2,style4_2): xp = self.xp link.zerograds() layer3_2,layer4_2 = self.model(link.x) if self.keep_color: #trans_layers = self.model(util.gray(link.x)) print "don't keep color!" loss_info = [] loss = Variable(xp.zeros((), dtype=np.float32)) #layer = layers[name] content_loss = self.content_weight * F.mean_squared_error(layer4_2, Variable(content4_2)) loss_info.append(('content_', float(content_loss.data))) loss += content_loss style_patch, style_patch_norm = style3_2 near,size,size2 = util.nearest_neighbor_patch(layer3_2, style_patch, style_patch_norm) style_loss = self.style_weight * (F.sum(F.square(layer3_2))*size2/size-2*F.sum(near)/size) loss_info.append(('style_', float(style_loss.data))) loss+=style_loss style_patch, style_patch_norm = style4_2 near,size,size2 = util.nearest_neighbor_patch(layer4_2, style_patch, style_patch_norm) style_loss = self.style_weight *1.5* (F.sum(F.square(layer4_2))*size2/size-2*F.sum(near)/size) loss_info.append(('style_', float(style_loss.data))) loss+= style_loss tv_loss = self.tv_weight * util.total_variation(link.x) loss_info.append(('tv', float(tv_loss.data))) loss+=tv_loss loss.backward() self.optimizer.update() return loss_info
def step(self, perm, batch_index, mode, epoch):
    """Process one batch: encode the image, unroll the decoder LSTM over the
    caption with teacher forcing, and (in train mode) update the decoder.

    Returns loss/accuracy averaged over the number of unrolled steps.
    """
    train = mode == 'train'
    source = self.train_data if train else self.test_data
    data, first_words, label = self.read_batch(perm, batch_index, source, mode)
    data = Variable(cuda.to_gpu(data))
    state = {name: Variable(self.xp.zeros((self.batchsize, 1024), dtype=self.xp.float32))
             for name in ('c1', 'h1')}
    loss = Variable(cuda.cupy.asarray(0.0).astype(np.float32))
    acc = 0.0
    # Encode the image, then re-wrap the array to cut the graph so the
    # encoder receives no gradient.
    h = self.enc(data, train=train, test=not train)
    h = Variable(h.data)
    # Prime the LSTM with the image features.
    state, _ = self.dec(h, state, train=train, test=not train, image=True)
    # Feed <SOS> and score the first real word.
    state, y = self.dec(Variable(cuda.to_gpu(first_words)), state,
                        train=train, test=not train)
    gold = Variable(cuda.to_gpu(label.T[1]))
    loss += F.softmax_cross_entropy(y, gold)
    acc += F.accuracy(y, gold, ignore_label=-1).data.get()
    # Teacher-forced unroll over the remainder of the caption.
    for cur_word, next_word in zip(label.T[1:-1], label.T[2:]):
        state, y = self.dec(Variable(cuda.to_gpu(cur_word)), state,
                            train=train, test=not train)
        gold = Variable(cuda.to_gpu(next_word))
        loss += F.softmax_cross_entropy(y, gold)
        acc += F.accuracy(y, gold, ignore_label=-1).data.get()
    if train:
        self.dec.cleargrads()
        loss.backward()
        self.o_dec.update()
    n_steps = label.T.shape[0]
    return {
        "prediction": 0,
        "current_loss": loss.data.get() / n_steps,
        "current_accuracy": acc / n_steps,
    }
def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None):
    """Gradient-descend a batch of images toward img_orig's content and
    img_style's style, saving snapshots every 50 iterations and a final image
    per batch element at the end.
    """
    batch_size = img_orig.shape[0]
    # Fixed targets: content activations and style Gram matrices.
    mid_orig = nn.forward(Variable(img_orig, volatile=True))
    style_mats = [get_matrix(y) for y in nn.forward(Variable(img_style, volatile=True))]
    if img_gen is None:
        # Same random start replicated across the whole batch.
        if args.gpu >= 0:
            seed = xp.random.uniform(-20, 20, (3, width, width), dtype=np.float32)
            img_gen = xp.random.uniform(-20, 20, (batch_size, 3, width, width), dtype=np.float32)
        else:
            seed = np.random.uniform(-20, 20, (3, width, width)).astype(np.float32)
            img_gen = np.random.uniform(-20, 20, (batch_size, 3, width, width)).astype(np.float32)
        img_gen[:, :, :, :] = seed
    x = Variable(img_gen)
    xg = xp.zeros_like(x.data)
    optimizer = optimizers.Adam(alpha=lr)
    optimizer.setup((img_gen, xg))
    for i in range(max_iter):
        x = Variable(img_gen)
        y = nn.forward(x)
        optimizer.zero_grads()
        L = Variable(xp.zeros((), dtype=np.float32))
        for l in range(len(y)):
            gogh_matrix = get_matrix(y[l])
            L1 = np.float32(args.lam) * np.float32(nn.alpha[l]) * F.mean_squared_error(y[l], Variable(mid_orig[l].data))
            L2 = np.float32(nn.beta[l]) * F.mean_squared_error(gogh_matrix, Variable(style_mats[l].data)) / np.float32(len(y))
            L += L1 + L2
            if i % 100 == 0:
                print(i, l, L1.data, L2.data)
        L.backward()
        xg += x.grad
        optimizer.update()
        # Clip pixel values in place (GPU path uses the Clip op).
        tmp_shape = img_gen.shape
        if args.gpu >= 0:
            img_gen += Clip().forward(img_gen).reshape(tmp_shape) - img_gen
        else:
            def clip(v):
                return -120 if v < -120 else (136 if v > 136 else v)
            img_gen += np.vectorize(clip)(img_gen).reshape(tmp_shape) - img_gen
        if i % 50 == 0:
            for j in range(img_gen.shape[0]):
                save_image(img_gen[j], W, nw[j], nh[j], args.out_dir + "_%d/im_%05d.png" % (j, i))
    for j in range(img_gen.shape[0]):
        save_image(img_gen[j], W, nw[j], nh[j], args.out_dir + "_last/im_%d.png" % (j))
def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None): batch_size = img_orig.shape[0] mid_orig = nn.forward(Variable(img_orig, volatile=True)) style_mats = [get_matrix(y) for y in nn.forward(Variable(img_style, volatile=True))] if img_gen is None: if args.gpu >= 0: img_gen_ = xp.random.uniform(-20,20,(3,width,width),dtype=np.float32) img_gen = xp.random.uniform(-20,20,(batch_size,3,width,width),dtype=np.float32) img_gen[:,:,:,:] = img_gen_ else: img_gen_ = np.random.uniform(-20,20,(3,width,width)).astype(np.float32) img_gen = np.random.uniform(-20,20,(batch_size,3,width,width)).astype(np.float32) img_gen[:,:,:,:] = img_gen_ x = Variable(img_gen) xg = xp.zeros_like(x.data) optimizer = optimizers.Adam(alpha=lr) optimizer.setup((img_gen,xg)) for i in range(max_iter): x = Variable(img_gen) y = nn.forward(x) optimizer.zero_grads() L = Variable(xp.zeros((), dtype=np.float32)) for l in range(len(y)): gogh_matrix = get_matrix(y[l]) L1 = np.float32(args.lam) * np.float32(nn.alpha[l])*F.mean_squared_error(y[l], Variable(mid_orig[l].data)) L2 = np.float32(nn.beta[l])*F.mean_squared_error(gogh_matrix, Variable(style_mats[l].data))/np.float32(len(y)) L += L1+L2 if i%100==0: print i,l,L1.data,L2.data L.backward() xg += x.grad optimizer.update() tmp_shape = img_gen.shape if args.gpu >= 0: img_gen += Clip().forward(img_gen).reshape(tmp_shape) - img_gen else: def clip(x): return -120 if x<-120 else (136 if x>136 else x) img_gen += np.vectorize(clip)(img_gen).reshape(tmp_shape) - img_gen if i%50==0: for j in range(img_gen.shape[0]): save_image(img_gen[j], W, nw[j], nh[j], args.out_dir+"_%d/im_%05d.png"%(j,i)) for j in range(img_gen.shape[0]): save_image(img_gen[j], W, nw[j], nh[j], args.out_dir+"_last/im_%d.png"%(j))
class StatefulAgent(Agent):
    """Agent with persistent recurrent state, trained with truncated BPTT.

    Accumulates loss across steps and backpropagates every ``cutoff`` steps
    (or after the final batch). With ``last=True`` only the loss at the end
    of each window is used for the update.
    """

    def __init__(self, model, optimizer=None, gpu=-1, cutoff=None, last=False):
        super(StatefulAgent, self).__init__(model, optimizer=optimizer, gpu=gpu,
                                            last=last, cutoff=cutoff)
        # cutoff: number of steps between truncated-BPTT updates (BPTT window)
        self.cutoff = cutoff
        # last: whether to update from the loss of the last step of a window only
        self.last = last
        # running loss accumulated between truncated-BPTT updates
        self.loss = Variable(self.xp.zeros((), 'float32'))

    def run(self, data, train=True, idx=None, final=False):
        """Process one batch; returns the (float) loss of this step."""
        # Reset the recurrent state at window boundaries.
        # BUG FIX: guard against cutoff=None (the constructor default), which
        # previously raised TypeError here; the backprop site below already
        # guards with `self.cutoff and ...`.
        if self.cutoff and idx % self.cutoff == 0:
            self.reset()
        # BUG FIX: propagate the caller's train flag instead of hardcoding
        # train=True, so evaluation runs the model in test mode.
        loss = self.model(map(lambda x: Variable(self.xp.asarray(x)), data),
                          train=train)
        if self.last:
            # used in case we propagate back at end of trials only
            if ((idx + 1) % self.cutoff) == 0:
                self.loss = loss
            else:
                loss = Variable(self.xp.zeros((), 'float32'))
        else:
            self.loss += loss
        # normalize by number of datapoints in minibatch
        _loss = float(loss.data)
        # backpropagate if we reach the cutoff for truncated backprop or if we
        # processed the last batch
        if train and ((self.cutoff and ((idx + 1) % self.cutoff) == 0) or final):
            self.optimizer.zero_grads()
            self.loss.backward()
            self.loss.unchain_backward()
            self.optimizer.update()
            self.loss = Variable(self.xp.zeros((), 'float32'))
        if not train:
            # Evaluation never backpropagates; keep the graph from growing.
            self.loss.unchain_backward()
        return _loss
def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None):
    """Synthesize an image matching img_orig's content and img_style's style.

    Optimizes a Parameter image with Adam for max_iter steps, clipping pixel
    values each step and saving a snapshot every 50 iterations.
    """
    # BUG FIX: the reference activations are fixed targets. The pre-v2 code
    # wrapped them in volatile Variables (no graph); the Chainer v2 equivalent
    # is enable_backprop=False. The original used True, which built a useless
    # backprop graph and wasted memory.
    with chainer.using_config('enable_backprop', False):
        mid_orig = nn.forward(Variable(img_orig))
        style_mats = [get_matrix(y) for y in nn.forward(Variable(img_style))]
    if img_gen is None:
        if args.gpu >= 0:
            img_gen = xp.random.uniform(-20, 20, (1, 3, width, width), dtype=np.float32)
        else:
            img_gen = np.random.uniform(-20, 20, (1, 3, width, width)).astype(np.float32)
    # Wrap the image as a trainable Parameter so Adam can update it directly.
    img_gen = chainer.links.Parameter(img_gen)
    optimizer = optimizers.Adam(alpha=lr)
    optimizer.setup(img_gen)
    for i in range(max_iter):
        img_gen.zerograds()
        x = img_gen.W
        y = nn.forward(x)
        L = Variable(xp.zeros((), dtype=np.float32))
        for l in range(len(y)):
            # Gram matrix of the current activations (style statistics).
            ch = y[l].data.shape[1]
            wd = y[l].data.shape[2]
            gogh_y = F.reshape(y[l], (ch, wd ** 2))
            gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True) / np.float32(ch * wd ** 2)
            L1 = np.float32(args.lam) * np.float32(nn.alpha[l]) * F.mean_squared_error(y[l], Variable(mid_orig[l].data))
            L2 = np.float32(nn.beta[l]) * F.mean_squared_error(gogh_matrix, Variable(style_mats[l].data)) / np.float32(len(y))
            L += L1 + L2
            if i % 100 == 0:
                print(i, l, L1.data, L2.data)
        L.backward()
        # x *is* img_gen.W, so its gradient is already in place; kept from the
        # original for clarity.
        img_gen.W.grad = x.grad
        optimizer.update()
        # Clip pixel values into the valid display range.
        tmp_shape = x.data.shape
        if args.gpu >= 0:
            img_gen.W.data += Clip().forward(img_gen.W.data).reshape(tmp_shape) - img_gen.W.data
        else:
            def clip(v):
                return -120 if v < -120 else (136 if v > 136 else v)
            img_gen.W.data += np.vectorize(clip)(img_gen.W.data).reshape(tmp_shape) - img_gen.W.data
        if i % 50 == 0:
            save_image(img_gen.W.data, W, nw, nh, i)
def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None): mid_orig = nn.forward(Variable(img_orig, volatile=True)) style_mats = [get_matrix(y) for y in nn.forward(Variable(img_style, volatile=True))] if img_gen is None: if args.gpu >= 0: img_gen = xp.random.uniform(-20,20,(1,3,width,width),dtype=np.float32) else: img_gen = np.random.uniform(-20,20,(1,3,width,width)).astype(np.float32) x = Variable(img_gen) xg = xp.zeros_like(x.data) optimizer = optimizers.Adam(alpha=lr) optimizer.setup((img_gen,xg)) for i in range(max_iter): x = Variable(img_gen) y = nn.forward(x) optimizer.zero_grads() L = Variable(xp.zeros((), dtype=np.float32)) for l in range(len(y)): ch = y[l].data.shape[1] wd = y[l].data.shape[2] gogh_y = F.reshape(y[l], (ch,wd**2)) gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True)/np.float32(ch*wd**2) L1 = np.float32(args.lam) * np.float32(nn.alpha[l])*F.mean_squared_error(y[l], Variable(mid_orig[l].data)) L2 = np.float32(nn.beta[l])*F.mean_squared_error(gogh_matrix, Variable(style_mats[l].data))/np.float32(len(y)) L += L1+L2 if i%100==0: print i,l,L1.data,L2.data L.backward() xg += x.grad optimizer.update() tmp_shape = img_gen.shape if args.gpu >= 0: img_gen += Clip().forward(img_gen).reshape(tmp_shape) - img_gen else: def clip(x): return -120 if x<-120 else (136 if x>136 else x) img_gen += np.vectorize(clip)(img_gen).reshape(tmp_shape) - img_gen if i%3000==0: save_image(img_gen, W, nw, nh, i)
def step(self, perm, batch_index, mode, epoch):
    """Run one batch: encode the image, unroll the decoder LSTM over the
    caption, and update the decoder when mode == 'train'.

    Returns a dict with loss and accuracy averaged over unrolled steps.
    """
    is_train = (mode == 'train')
    dataset = self.train_data if is_train else self.test_data
    data, first_words, label = self.read_batch(perm, batch_index, dataset, mode)
    data = Variable(cuda.to_gpu(data))
    state = {
        k: Variable(self.xp.zeros((self.batchsize, 1024), dtype=self.xp.float32))
        for k in ('c1', 'h1')
    }
    loss = Variable(cuda.cupy.asarray(0.0).astype(np.float32))
    acc = 0.0
    # Image encoder; re-wrapping the raw array cuts the graph so the encoder
    # gets no gradient.
    h = self.enc(data, train=is_train, test=not is_train)
    h = Variable(h.data)
    # First LSTM step consumes the image features.
    state, _ = self.dec(h, state, train=is_train, test=not is_train, image=True)
    # <SOS> token predicts the first word.
    state, y = self.dec(Variable(cuda.to_gpu(first_words)), state,
                        train=is_train, test=not is_train)
    loss += F.softmax_cross_entropy(y, Variable(cuda.to_gpu(label.T[1])))
    acc += F.accuracy(y, Variable(cuda.to_gpu(label.T[1])), ignore_label=-1).data.get()
    # Teacher-forced unroll over the rest of the caption.
    for cur_word, next_word in zip(label.T[1:-1], label.T[2:]):
        state, y = self.dec(Variable(cuda.to_gpu(cur_word)), state,
                            train=is_train, test=not is_train)
        loss += F.softmax_cross_entropy(y, Variable(cuda.to_gpu(next_word)))
        acc += F.accuracy(y, Variable(cuda.to_gpu(next_word)), ignore_label=-1).data.get()
    if is_train:
        self.dec.cleargrads()
        loss.backward()
        self.o_dec.update()
    steps = label.T.shape[0]
    return {"prediction": 0,
            "current_loss": loss.data.get() / steps,
            "current_accuracy": acc / steps,
            }
def train(self, data):
    # Train for one pass over `data`, updating with truncated BPTT every
    # `cutoff` batches (or once per epoch when no cutoff is configured).
    # Returns the accumulated loss normalized by data.batch_ind.shape[1].
    if not self.cutoff:
        cutoff = data.nbatches
    else:
        cutoff = self.cutoff
    self.model.predictor.reset_state()
    cumloss = self.xp.zeros((), 'float32')
    loss = Variable(self.xp.zeros((), 'float32'))
    # check if we are in train or test mode (used e.g. for dropout)
    self.model.predictor.test = False
    self.model.predictor.train = True
    for _x, _t in data:
        x = Variable(_x)
        t = Variable(_t)
        # Advance the predictor's recurrent state on every batch.
        self.model.predictor(x)
        # backpropagate if we reach the cutoff for truncated backprop or if we
        # processed the last batch
        if data.step % cutoff == 0 or data.step == data.nbatches:
            # NOTE(review): the loss is added only at the cutoff step, so only
            # that batch's loss is backpropagated — presumably the predictor's
            # state carries the earlier forward passes into this loss; confirm
            # this is intended rather than accumulating loss on every batch.
            loss += self.model(x, t)
            self.optimizer.zero_grads()
            loss.backward()
            loss.unchain_backward()
            self.optimizer.update()
            #self.model.predictor[0][0].U.W.data[10:,:]=0
            cumloss += loss.data
            loss = Variable(self.xp.zeros((), 'float32'))
            self.model.predictor.reset_state()
    return float(cumloss / (data.batch_ind.shape[1]))
def experience_replay(self, time):
    # Replay stored episodes once initial exploration has finished: sample a
    # mix of goal-reaching and ordinary episodes, take one gradient step over
    # their summed losses, hard-sync the target network, then restore the
    # model's recurrent state to match the current episode.
    if self.initial_exploration < time:
        replay_goal = min(len(self.goal_history), self.goal_replay_size)
        replay_all = min(replay_goal, self.replay_size - self.goal_replay_size)
        # print "REPLAYING {} good and {} all".format(replay_goal, replay_all)
        replay_index = random.sample(range(len(self.history)), replay_all)
        goal_replay_index = random.sample(range(len(self.goal_history)), replay_goal)
        r_episodes = [deepcopy(self.history[id]) for id in replay_index] + \
            [deepcopy(self.goal_history[id]) for id in goal_replay_index]
        # # Can be harmful
        # # randomly decide length of episodes
        # for episode in r_episodes:
        #     length = random.randint(1, len(episode.actions))
        #     episode.actions = episode.actions[:length]
        #     episode.rewards = episode.rewards[:length]
        # update target model
        # NOTE(review): `initial_exploration == time + 1` can never hold inside
        # the `initial_exploration < time` guard above, so this whole update
        # branch appears unreachable — confirm whether the comparison (or the
        # reconstructed indentation of this collapsed source) was meant
        # differently.
        if self.initial_exploration == time + 1:
            self.optimizer.zero_grads()
            loss = Variable(np.asarray(np.float32(0.0)))
            for episode in r_episodes:
                loss += self.get_loss(episode)
            loss.backward()
            self.optimizer.update()
            self.target_model_update(time, soft_update=False)
        # set model to original state
        if self.history[-1].ended:
            self.model.set_state([-1])
        else:
            self.model.set_state([-1] + self.history[-1].actions)
def train(self, data):
    """Run one epoch over ``data`` with a parameter update per batch.

    Returns the mean loss per batch as a float.
    """
    self.model.predictor.reset_state()
    total = self.xp.zeros((), 'float32')
    # Switch the predictor into training mode (affects e.g. dropout).
    self.model.predictor.test = False
    self.model.predictor.train = True
    for raw_x, raw_t in data:
        batch_loss = self.model(Variable(self.xp.asarray(raw_x)),
                                Variable(self.xp.asarray(raw_t)))
        total += batch_loss.data
        self.optimizer.zero_grads()
        batch_loss.backward()
        self.optimizer.update()
    return float(total / data.nbatches)
def main():
    # Train a character-level RNN on a text corpus with truncated BPTT,
    # periodically checkpointing the model and decaying the learning rate.
    # NOTE(review): this function mixes Python 2 `xrange` with Python 3
    # `print(...)` calls — confirm which interpreter this file targets.
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='data/dazai')
    parser.add_argument('--checkpoint_dir', type=str, default='model')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--rnn_size', type=int, default=128)
    parser.add_argument('--learning_rate', type=float, default=2e-3)
    parser.add_argument('--learning_rate_decay', type=float, default=0.97)
    parser.add_argument('--learning_rate_decay_after', type=int, default=10)
    parser.add_argument('--decay_rate', type=float, default=0.95)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--seq_length', type=int, default=50)
    parser.add_argument('--batchsize', type=int, default=50)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--grad_clip', type=int, default=5)
    parser.add_argument('--init_from', type=str, default='')
    parser.add_argument('--enable_checkpoint', type=bool, default=True)
    parser.add_argument('--file_name', type=str, default='input.txt')
    args = parser.parse_args()
    if not os.path.exists(args.checkpoint_dir):
        os.mkdir(args.checkpoint_dir)
    n_epochs = args.epochs
    n_units = args.rnn_size
    batchsize = args.batchsize
    bprop_len = args.seq_length
    grad_clip = args.grad_clip
    xp = cuda.cupy if args.gpu >= 0 else np
    train_data, words, vocab = load_data(args.data_dir, args.file_name)
    pickle.dump(vocab, open('%s/vocab.bin' % args.data_dir, 'wb'))
    # Either resume from a pickled model or build a fresh one.
    if len(args.init_from) > 0:
        model = pickle.load(open(args.init_from, 'rb'))
    else:
        model = CharRNN(len(vocab), n_units)
    if args.gpu >= 0:
        cuda.get_device(args.gpu).use()
        model.to_gpu()
    optimizer = optimizers.RMSprop(lr=args.learning_rate, alpha=args.decay_rate, eps=1e-8)
    #optimizer = chainer.optimizers.SGD(lr=1.0)
    optimizer.setup(model)
    optimizer.add_hook(
        chainer.optimizer.GradientClipping(grad_clip))  # cap the gradient norm
    whole_len = train_data.shape[0]
    #jump = whole_len / batchsize
    jump = int(whole_len / batchsize)
    epoch = 0
    start_at = time.time()
    cur_at = start_at
    state = make_initial_state(n_units, batchsize=batchsize)
    if args.gpu >= 0:
        accum_loss = Variable(xp.zeros(()).astype(np.float32))
        for key, value in state.items():
            value.data = cuda.to_gpu(value.data)
    else:
        accum_loss = Variable(xp.zeros(()).astype(np.float32))
    print('going to train {} iterations'.format(jump * n_epochs / bprop_len))
    sum_perp = 0
    count = 0
    iteration = 0
    for i in range(jump * n_epochs):
        # Strided batching: batch row j reads corpus position jump*j + i.
        x_batch = xp.array([
            train_data[(jump * j + i) % whole_len] for j in xrange(batchsize)
        ])
        y_batch = xp.array([
            train_data[(jump * j + i + 1) % whole_len] for j in xrange(batchsize)
        ])
        if args.gpu >= 0:
            x_batch = cuda.to_gpu(x_batch)
            y_batch = cuda.to_gpu(y_batch)
        state, loss_i = model.forward_one_step(x_batch, y_batch, state,
                                               dropout_ratio=args.dropout)
        accum_loss += loss_i
        count += 1
        if (i + 1) % bprop_len == 0:  # Run truncated BPTT
            iteration += 1
            sum_perp += accum_loss.data
            now = time.time()
            #print('{}/{}, train_loss = {}, time = {:.2f}'.format((i+1)/bprop_len, jump, accum_loss.data / bprop_len, now-cur_at))
            print('{}/{}, train_loss = {}, time = {:.2f}'.format(
                (i + 1) / bprop_len, jump * n_epochs / bprop_len,
                accum_loss.data / bprop_len, now - cur_at))
            cur_at = now
            model.cleargrads()
            #optimizer.zero_grads()
            accum_loss.backward()
            accum_loss.unchain_backward()  # truncate
            #accum_loss = Variable(xp.zeros(()).astype(np.float32))
            if args.gpu >= 0:
                accum_loss = Variable(xp.zeros(()).astype(np.float32))
                #accum_loss = Variable(cuda.zeros(()))
            else:
                accum_loss = Variable(np.zeros((), dtype=np.float32))
            #optimizer.clip_grads(grad_clip)
            optimizer.update()
        if (i + 1) % 1000 == 0:
            # Periodic progress report; perplexity over the last window.
            print('epoch: ', epoch)
            print('iteration: ', iteration)
            print('training perplexity: ', np.exp(float(sum_perp) / count))
            sum_perp = 0
            count = 0
        if args.enable_checkpoint:
            if (i + 1) % 10000 == 0:
                # Checkpoint both an epoch-stamped copy and 'latest'.
                fn = ('%s/charrnn_epoch_%.2f.chainermodel' %
                      (args.checkpoint_dir, float(i) / jump))
                pickle.dump(copy.deepcopy(model).to_cpu(), open(fn, 'wb'))
                pickle.dump(
                    copy.deepcopy(model).to_cpu(),
                    open('%s/latest.chainermodel' % (args.checkpoint_dir), 'wb'))
        if (i + 1) % jump == 0:
            epoch += 1
            if epoch >= args.learning_rate_decay_after:
                optimizer.lr *= args.learning_rate_decay
                print('decayed learning rate by a factor {} to {}'.format(
                    args.learning_rate_decay, optimizer.lr))
        sys.stdout.flush()
def train(self, words, steps, batchsize=100, sequence_length=10): """ Train the Predictor's model on words for steps number of steps. """ whole_len = len(words) train_data = np.ndarray(whole_len, dtype=np.int32) jumps = steps * sequence_length # Initialize training data and maybe vocab. if self.vocab is None: vocab_initializing = True self.vocab = {} for i, word in enumerate(words): if vocab_initializing: if word not in self.vocab: self.vocab[word] = len(self.vocab) train_data[i] = self.vocab[word] vocab_initializing = False print 'corpus length:', len(words) print 'self.vocab size:', len(self.vocab) # Initialize base model (if we need to) if self.model is None: self.model = BaseRNN(len(self.vocab), self.units) if self.gpu >= 0: cuda.get_device(self.gpu).use() self.model.to_self.gpu() optimizer = optimizers.RMSprop(lr=self.settings.learning_rate, alpha=self.settings.decay_rate, eps=1e-8) optimizer.setup(self.model) jumpsPerEpoch = whole_len / batchsize epoch = 0 start_at = time.time() cur_at = start_at state = make_initial_state(self.units, batchsize=batchsize) if self.gpu >= 0: accum_loss = Variable(cuda.zeros(())) for _, value in state.items(): value.data = cuda.to_self.gpu(value.data) else: accum_loss = Variable(np.zeros((), dtype=np.float32)) print 'going to train {} iterations'.format(steps) for i in xrange(jumps): x_batch = np.array([ train_data[(jumpsPerEpoch * j + i) % whole_len] for j in xrange(batchsize) ]) y_batch = np.array([ train_data[(jumpsPerEpoch * j + i + 1) % whole_len] for j in xrange(batchsize) ]) if self.gpu >= 0: x_batch = cuda.to_self.gpu(x_batch) y_batch = cuda.to_self.gpu(y_batch) state, loss_i = self.model.forward_one_step( x_batch, y_batch, state, dropout_ratio=self.settings.dropout) accum_loss += loss_i if (i + 1) % sequence_length == 0: now = time.time() print '{}/{}, train_loss = {}, time = {:.2f}'.format( (i + 1) / sequence_length, steps, accum_loss.data / sequence_length, now - cur_at) cur_at = now optimizer.zero_grads() 
accum_loss.backward() accum_loss.unchain_backward() # truncate if self.gpu >= 0: accum_loss = Variable(cuda.zeros(())) else: accum_loss = Variable(np.zeros((), dtype=np.float32)) optimizer.clip_grads(self.settings.grad_clip) optimizer.update() if (i + 1) % jumpsPerEpoch == 0: epoch += 1 if epoch >= self.settings.learning_rate_decay_after: optimizer.lr *= self.settings.learning_rate_decay print 'decayed self.settings.learning rate by a factor {} to {}'.format( self.settings.learning_rate_decay, optimizer.lr)
def update_core(self):
    # One iteration of iterative CT reconstruction with a deep-image-prior
    # style setup: a trainable seed image, an encoder/decoder generator, a
    # discriminator, and a sinogram-consistency term applied through sparse
    # projection (system) matrices, with OSEM-style subset cycling.
    optimizer_sd = self.get_optimizer('main')
    optimizer_enc = self.get_optimizer('enc')
    optimizer_dec = self.get_optimizer('dec')
    optimizer_dis = self.get_optimizer('dis')
    xp = self.seed.xp
    step = self.iteration % self.args.iter
    osem_step = step % self.args.osem
    # At the start of each reconstruction, fetch a new projection batch and
    # (re)initialize the seed image from a file or randomly.
    if step == 0:
        batch = self.get_iterator('main').next()
        self.prImg, self.rev, self.patient_id, self.slice = self.converter(batch, self.device)
        print(self.prImg.shape)
        self.n_reconst += 1
        self.recon_freq = 1
        if ".npy" in self.args.model_image:
            self.seed.W.array = xp.reshape(xp.load(self.args.model_image),(1,1,self.args.crop_height,self.args.crop_width))
        elif ".dcm" in self.args.model_image:
            # Seed from a DICOM image, rescaled from HU into [-1, 1].
            ref_dicom = dicom.read_file(self.args.model_image, force=True)
            img = xp.array(ref_dicom.pixel_array+ref_dicom.RescaleIntercept)
            img = (2*(xp.clip(img,self.args.HU_base,self.args.HU_base+self.args.HU_range)-self.args.HU_base)/self.args.HU_range-1.0).astype(np.float32)
            self.seed.W.array = xp.reshape(img,(1,1,self.args.crop_height,self.args.crop_width))
        else:
            # initializers.Uniform(scale=0.5)(self.seed.W.array)
            initializers.HeNormal()(self.seed.W.array)
        self.initial_seed = self.seed.W.array.copy()
        # print(xp.min(self.initial_seed),xp.max(self.initial_seed),xp.mean(self.initial_seed))
    ## for seed array
    arr = self.seed()
    HU = self.var2HU(arr)
    raw = self.HU2raw(HU)
    self.seed.cleargrads()
    loss_seed = Variable(xp.array([0.0],dtype=np.float32))
    # conjugate correction using system matrix
    if self.args.lambda_sd > 0:
        self.seed.W.grad = xp.zeros_like(self.seed.W.array)
        loss_sd = 0
        for i in range(len(self.prImg)):
            # Forward-project the seed; reversed slices are flipped first.
            if self.rev[i]:
                rec_sd = F.exp(-F.sparse_matmul(self.prMats[osem_step],F.reshape(raw[i,:,::-1,::-1],(-1,1)))) ##
            else:
                rec_sd = F.exp(-F.sparse_matmul(self.prMats[osem_step],F.reshape(raw[i],(-1,1)))) ##
            if self.args.log:
                loss_sd += F.mean_squared_error(F.log(rec_sd),F.log(self.prImg[i][osem_step]))
            else:
                loss_sd += F.mean_squared_error(rec_sd,self.prImg[i][osem_step])
            if self.args.system_matrix:
                # Write the conjugate (back-projected) correction straight
                # into the seed's gradient instead of using autograd.
                gd = F.sparse_matmul( self.conjMats[osem_step], rec_sd-self.prImg[i][osem_step], transa=True)
                if self.rev[i]:
                    self.seed.W.grad[i] -= self.args.lambda_sd * F.reshape(gd, (1,self.args.crop_height,self.args.crop_width)).array[:,::-1,::-1] # / logrep.shape[0] ?
                else:
                    self.seed.W.grad[i] -= self.args.lambda_sd * F.reshape(gd, (1,self.args.crop_height,self.args.crop_width)).array # / logrep.shape[0] ?
        if not self.args.system_matrix:
            (self.args.lambda_sd *loss_sd).backward()
        chainer.report({'loss_sd': loss_sd/len(self.prImg)}, self.seed)
    # Total-variation and adversarial regularizers on the seed itself.
    if self.args.lambda_tvs > 0:
        loss_tvs = losses.total_variation(arr, tau=self.args.tv_tau, method=self.args.tv_method)
        loss_seed += self.args.lambda_tvs * loss_tvs
        chainer.report({'loss_tvs': loss_tvs}, self.seed)
    if self.args.lambda_advs>0:
        L_advs = F.average( (self.dis(arr)-1.0)**2 )
        loss_seed += self.args.lambda_advs * L_advs
        chainer.report({'loss_advs': L_advs}, self.seed)
    ## generator output
    arr_n = losses.add_noise(arr,self.args.noise_gen)
    if self.args.no_train_seed:
        arr_n.unchain()
    if not self.args.decoder_only:
        arr_n = self.encoder(arr_n)
    gen = self.decoder(arr_n) # range = [-1,1]
    ## generator loss
    loss_gen = Variable(xp.array([0.0],dtype=np.float32))
    plan, plan_ae = None, None
    # Autoencoder losses against planning-CT images.
    if self.args.lambda_ae1>0 or self.args.lambda_ae2>0:
        plan = losses.add_noise(Variable(self.converter(self.get_iterator('planct').next(), self.device)), self.args.noise_dis)
        plan_enc = self.encoder(plan)
        plan_ae = self.decoder(plan_enc)
        loss_ae1 = F.mean_absolute_error(plan,plan_ae)
        loss_ae2 = F.mean_squared_error(plan,plan_ae)
        if self.args.lambda_reg>0:
            loss_reg_ae = losses.loss_func_reg(plan_enc[-1],'l2')
            chainer.report({'loss_reg_ae': loss_reg_ae}, self.seed)
            loss_gen += self.args.lambda_reg * loss_reg_ae
        loss_gen += self.args.lambda_ae1 * loss_ae1 + self.args.lambda_ae2 * loss_ae2
        chainer.report({'loss_ae1': loss_ae1}, self.seed)
        chainer.report({'loss_ae2': loss_ae2}, self.seed)
    if self.args.lambda_tv > 0:
        L_tv = losses.total_variation(gen, tau=self.args.tv_tau, method=self.args.tv_method)
        loss_gen += self.args.lambda_tv * L_tv
        chainer.report({'loss_tv': L_tv}, self.seed)
    if self.args.lambda_adv>0:
        L_adv = F.average( (self.dis(gen)-1.0)**2 )
        loss_gen += self.args.lambda_adv * L_adv
        chainer.report({'loss_adv': L_adv}, self.seed)
    ## regularisation on the latent space
    if self.args.lambda_reg>0:
        loss_reg = losses.loss_func_reg(arr_n[-1],'l2')
        chainer.report({'loss_reg': loss_reg}, self.seed)
        loss_gen += self.args.lambda_reg * loss_reg
    self.encoder.cleargrads()
    self.decoder.cleargrads()
    loss_gen.backward()
    loss_seed.backward()
    chainer.report({'loss_gen': loss_gen}, self.seed)
    optimizer_enc.update()
    optimizer_dec.update()
    optimizer_sd.update()
    chainer.report({'grad_sd': F.average(F.absolute(self.seed.W.grad))}, self.seed)
    if hasattr(self.decoder, 'latent_fc'):
        chainer.report({'grad_gen': F.average(F.absolute(self.decoder.latent_fc.W.grad))}, self.seed)
    # reconstruction consistency for NN
    # Periodically enforce that the generator's output also matches the
    # measured projections, seeding the gradient of `gen` manually.
    if (step % self.recon_freq == 0) and self.args.lambda_nn>0:
        self.encoder.cleargrads()
        self.decoder.cleargrads()
        self.seed.cleargrads()
        gen.grad = xp.zeros_like(gen.array)
        HU_nn = self.var2HU(gen)
        raw_nn = self.HU2raw(HU_nn)
        loss_nn = 0
        for i in range(len(self.prImg)):
            if self.rev[i]:
                rec_nn = F.exp(-F.sparse_matmul(self.prMats[osem_step],F.reshape(raw_nn[i,:,::-1,::-1],(-1,1))))
            else:
                rec_nn = F.exp(-F.sparse_matmul(self.prMats[osem_step],F.reshape(raw_nn[i],(-1,1))))
            loss_nn += F.mean_squared_error(rec_nn,self.prImg[i][osem_step])
            if self.args.system_matrix:
                gd_nn = F.sparse_matmul( rec_nn-self.prImg[i][osem_step], self.conjMats[osem_step], transa=True )
                if self.rev[i]:
                    gen.grad[i] -= self.args.lambda_nn * F.reshape(gd_nn, (1,self.args.crop_height,self.args.crop_width)).array[:,::-1,::-1]
                else:
                    gen.grad[i] -= self.args.lambda_nn * F.reshape(gd_nn, (1,self.args.crop_height,self.args.crop_width)).array
        chainer.report({'loss_nn': loss_nn/len(self.prImg)}, self.seed)
        if self.args.system_matrix:
            gen.backward()
        else:
            (self.args.lambda_nn * loss_nn).backward()
        if not self.args.no_train_seed:
            optimizer_sd.update()
        if not self.args.no_train_enc:
            optimizer_enc.update()
        if not self.args.no_train_dec:
            optimizer_dec.update()
        if self.seed.W.grad is not None:
            chainer.report({'grad_sd_consistency': F.average(F.absolute(self.seed.W.grad))}, self.seed)
        if hasattr(self.decoder, 'latent_fc'):
            chainer.report({'grad_gen_consistency': F.average(F.absolute(self.decoder.latent_fc.W.grad))}, self.seed)
        elif hasattr(self.decoder, 'ul'):
            chainer.report({'grad_gen_consistency': F.average(F.absolute(self.decoder.ul.c1.c.W.grad))}, self.seed)
    chainer.report({'seed_diff': F.mean_absolute_error(self.initial_seed,self.seed.W)/F.mean_absolute_error(self.initial_seed,xp.zeros_like(self.initial_seed))}, self.seed)
    # clip seed to [-1,1]
    if self.args.clip:
        self.seed.W.array = xp.clip(self.seed.W.array,a_min=-1.0, a_max=1.0)
    # adjust consistency loss update frequency
    self.recon_freq = max(1,int(round(self.args.max_reconst_freq * (step-self.args.reconst_freq_decay_start) / (self.args.iter+1-self.args.reconst_freq_decay_start))))
    ## for discriminator
    fake = None
    if self.args.dis_freq > 0 and ( (step+1) % self.args.dis_freq == 0) and (self.args.lambda_gan+self.args.lambda_adv+self.args.lambda_advs>0):
        # get mini-batch
        if plan is None:
            plan = self.converter(self.get_iterator('planct').next(), self.device)
            plan = losses.add_noise(Variable(plan),self.args.noise_dis)
        # create fake
        if self.args.lambda_gan>0:
            if self.args.decoder_only:
                fake_seed = xp.random.uniform(-1,1,(1,self.args.latent_dim)).astype(np.float32)
            else:
                fake_seed = self.encoder(xp.random.uniform(-1,1,(1,1,self.args.crop_height,self.args.crop_width)).astype(np.float32))
            fake = self.decoder(fake_seed)
            # decoder
            self.decoder.cleargrads()
            loss_gan = F.average( (self.dis(fake)-1.0)**2 )
            chainer.report({'loss_gan': loss_gan}, self.seed)
            loss_gan *= self.args.lambda_gan
            loss_gan.backward()
            # NOTE(review): Chainer's Optimizer.update() normally takes a loss
            # *function*; passing loss=... here is unusual — confirm this
            # optimizer subclass accepts it.
            optimizer_dec.update(loss=loss_gan)
            fake_copy = self._buffer.query(fake.array)
        if self.args.lambda_nn>0:
            fake_copy = self._buffer.query(self.converter(self.get_iterator('mvct').next(), self.device))
        # NOTE(review): this tests truthiness of the remainder, i.e. it fires
        # on every step EXCEPT multiples of iter//30 — confirm a `== 0` was
        # not intended here.
        if (step+1) % (self.args.iter // 30):
            fake_copy = Variable(self._buffer.query(gen.array))
        # discriminator
        L_real = F.average( (self.dis(plan)-1.0)**2 )
        L_fake = F.average( self.dis(fake_copy)**2 )
        loss_dis = 0.5*(L_real+L_fake)
        self.dis.cleargrads()
        loss_dis.backward()
        optimizer_dis.update()
        chainer.report({'loss_dis': (L_real+L_fake)/2}, self.seed)
    # Periodic visualization: dump seed / generator / autoencoder / fake
    # outputs as JPEG (and optionally DICOM) files.
    if ((self.iteration+1) % self.args.vis_freq == 0) or ((step+1)==self.args.iter):
        for i in range(self.args.batchsize):
            outlist=[]
            if not self.args.no_train_seed and not self.args.decoder_only:
                outlist.append((self.seed()[i],"0sd"))
            if plan_ae is not None:
                outlist.append((plan[i],'2pl'))
                outlist.append((plan_ae[i],'3ae'))
            if self.args.lambda_nn>0 or self.args.lambda_adv>0:
                if self.args.decoder_only:
                    gen_img = self.decoder([self.seed()])
                else:
                    gen_img = self.decoder(self.encoder(self.seed()))
                outlist.append((gen_img[i],'1gn'))
            if fake is not None:
                outlist.append((fake[i],'4fa'))
            for out,typ in outlist:
                out.to_cpu()
                HU = (((out+1)/2 * self.args.HU_range)+self.args.HU_base).array # [-1000=air,0=water,>1000=bone]
                print("type: ",typ,"HU:",np.min(HU),np.mean(HU),np.max(HU))
                #visimg = np.clip((out.array+1)/2,0,1) * 255.0
                b,r = -self.args.HU_range_vis//2,self.args.HU_range_vis
                visimg = (np.clip(HU,b,b+r)-b)/r * 255.0
                fn = 'n{:0>5}_iter{:0>6}_p{}_z{}_{}'.format(self.n_reconst,step+1,self.patient_id[i],self.slice[i],typ)
                write_image(np.uint8(visimg),os.path.join(self.args.out,fn+'.jpg'))
                if (step+1)==self.args.iter or (not self.args.no_save_dcm):
                    #np.save(os.path.join(self.args.out,fn+'.npy'),HU[0])
                    write_dicom(os.path.join(self.args.out,fn+'.dcm'),HU[0])
# Minibatch training loop: shuffle the sentences, accumulate the loss over
# BATCH_SIZE sentences, backpropagate once per minibatch, and update the two
# optimizers (opt1 updates only the embedding rows actually touched).
list_sentences = [np.array(row, np.int32) for row in list_sentences]
opt1 = SGD_Embedid()  # stochastic gradient descent (embedding-id variant)
opt2 = SGD()  # stochastic gradient descent
opt1.setup(model1)  # initialize the optimizer
opt2.setup(model2)  # initialize the optimizer
opt1.tuples[0][1].fill(0)
opt2.zero_grads()
random.shuffle(list_sentences)
list_minibatch = []
for i, sentence in enumerate(list_sentences):
    list_minibatch.append(sentence)
    if len(list_minibatch) == BATCH_SIZE:
        accum_loss_total = Variable(np.zeros((), dtype=np.float32))  # initial accumulated loss
        # NOTE(review): seeding with a scalar 0 means word id 0 is always in
        # the unique-id set below — confirm this is intentional.
        uniq_sentence = np.zeros((), np.int32)
        for batch_sentence in list_minibatch:
            accum_loss_total += forward(batch_sentence)  # compute the loss
            uniq_sentence = np.append(uniq_sentence, batch_sentence)
        accum_loss_total.backward()  # backpropagation
        opt1.clip_grads(10)  # suppress overly large gradients
        opt2.clip_grads(10)  # suppress overly large gradients
        uniq_sentence = np.unique(uniq_sentence)
        opt1.update(uniq_sentence)  # update parameters (touched ids only)
        opt2.update()  # update parameters
        opt1.zero_grads(uniq_sentence)  # reset gradients
        opt2.zero_grads()  # reset gradients
        list_minibatch = []
    if i % 1000 == 999:
        break
def _train(self, **kwargs):
    """Train self.model on self.dataset with RMSprop and truncated BPTT.

    Optional kwargs: gpu (device id, -1 = CPU), lr, lr_decay,
    lr_decay_after (epoch at which decay starts), decay_rate (RMSprop
    alpha), dropout, bprop_len (BPTT window length), batchsize,
    grad_clip, epochs.  The model is periodically pickled to
    self.model_path.
    """
    # Pull hyper-parameters out of kwargs, falling back to defaults.
    gpu = -1 if "gpu" not in kwargs else kwargs["gpu"]
    lr = 2e-3 if "lr" not in kwargs else kwargs["lr"]
    lr_decay = 0.97 if "lr_decay" not in kwargs else kwargs["lr_decay"]
    lr_decay_after = 10 if "lr_decay_after" not in kwargs else kwargs["lr_decay_after"]
    decay_rate = 0.95 if "decay_rate" not in kwargs else kwargs["decay_rate"]
    dropout = 0.0 if "dropout" not in kwargs else kwargs["dropout"]
    bprop_len = 50 if "bprop_len" not in kwargs else kwargs["bprop_len"]
    batchsize = 50 if "batchsize" not in kwargs else kwargs["batchsize"]
    grad_clip = 5 if "grad_clip" not in kwargs else kwargs["grad_clip"]
    n_epochs = 5 if "epochs" not in kwargs else kwargs["epochs"]
    if gpu >= 0:
        cuda.get_device(gpu).use()
        self.model.to_gpu()
    optimizer = optimizers.RMSprop(lr=lr, alpha=decay_rate, eps=1e-8)
    optimizer.setup(self.model)
    train_data = self.dataset
    whole_len = train_data.shape[0]
    jump = whole_len // batchsize  # iterations per epoch
    epoch = 0
    start_at = time.time()
    cur_at = start_at
    state = self.model.make_initial_state(batchsize=batchsize)
    # Zero-dimensional accumulator for the truncated-BPTT loss window.
    if gpu >= 0:
        accum_loss = Variable(cuda.zeros(()))
        for key, value in state.items():
            value.data = cuda.to_gpu(value.data)  # plist
    else:
        accum_loss = Variable(np.zeros((), dtype=np.float32))
    print('going to train {} iterations'.format(jump * n_epochs))
    for i in range(jump * n_epochs):
        # Each batch row j reads the stream at a different offset (jump * j),
        # so the batch covers batchsize parallel positions in the corpus.
        x_batch = np.array([train_data[(jump * j + i) % whole_len]
                            for j in range(batchsize)])
        y_batch = np.array([train_data[(jump * j + i + 1) % whole_len]
                            for j in range(batchsize)])
        if gpu >= 0:
            x_batch = cuda.to_gpu(x_batch)
            y_batch = cuda.to_gpu(y_batch)
        state, loss_i = self.model.forward_one_step(x_batch, y_batch, state,
                                                    dropout_ratio=dropout)
        accum_loss += loss_i
        if (i + 1) % bprop_len == 0:  # Run truncated BPTT
            now = time.time()
            sys.stderr.write('\r{}/{}, train_loss = {}, time = {:.2f}'.format(
                (i+1)//bprop_len, (jump*n_epochs)//bprop_len,
                accum_loss.data / bprop_len, now-cur_at))
            sys.stderr.flush()
            cur_at = now
            optimizer.zero_grads()
            accum_loss.backward()
            accum_loss.unchain_backward()  # truncate the computation history
            if gpu >= 0:
                accum_loss = Variable(cuda.zeros(()))
            else:
                accum_loss = Variable(np.zeros((), dtype=np.float32))
            optimizer.clip_grads(grad_clip)
            optimizer.update()
        if (i + 1) % 10000 == 0:
            # Periodic checkpoint (deepcopy so the live model stays on GPU).
            pickle.dump(copy.deepcopy(self.model).to_cpu(),
                        open(self.model_path, 'wb'))
        if (i + 1) % jump == 0:
            epoch += 1
            if epoch >= lr_decay_after:
                optimizer.lr *= lr_decay
                print('decayed learning rate by a factor {} to {}'.format(
                    lr_decay, optimizer.lr))
        sys.stdout.flush()
    # final checkpoint after training
    pickle.dump(copy.deepcopy(self.model).to_cpu(), open(self.model_path, 'wb'))
def train_encoder(model,
                  dictionary: corpora.Dictionary,
                  sentence_file: str,
                  model_dir: str,
                  epoch_size: int = 100,
                  batch_size: int = 30,
                  dropout: bool = True,
                  gpu: bool = False) -> None:
    """Train a sentence encoder language model with gradient-clipped SGD.

    Reads sentences from sentence_file, converts them to id sequences via
    dictionary, and periodically saves checkpoints into model_dir.
    NOTE(review): gpu is annotated bool but compared with >= 0 below — it
    is actually a device id (-1 = CPU); confirm against callers.
    """
    if gpu >= 0:
        model.to_gpu()
    print(model.xp)
    # setup SGD optimizer
    opt = optimizers.SGD()
    opt.setup(model)
    # optimizer hooks
    clip_threshold = 5.0
    print("set optimizer clip threshold: {}".format(clip_threshold))
    opt.add_hook(chainer.optimizer.GradientClipping(clip_threshold))
    # load conversation sentences
    sentences = load_sentence(sentence_file)
    data_size = len(sentences)
    print("data size: {}".format(data_size))
    for epoch in range(epoch_size):
        print("epoch {}".format(epoch))
        indexes = np.random.permutation(data_size)  # shuffle sample order
        epoch_loss = 0  # int
        for bat_i in range(0, data_size, batch_size):
            forward_start_time = datetime.now()
            batch_loss = Variable(model.xp.zeros((), dtype=model.xp.float32))
            for index in indexes[bat_i:bat_i + batch_size]:
                input_words = sentences[index]
                # convert to a list of ids
                input_words_with_s = tokens2ids(input_words, dictionary,
                                                verbose=False)
                # forward pass
                try:
                    new_loss = model(input_words_with_s, dropout=dropout,
                                     state=None, train=True)
                    # abort outright on NaN loss rather than corrupt weights
                    if model.xp.isnan(new_loss.data):
                        sys.exit(1)
                    batch_loss += new_loss
                except Exception:
                    # best-effort: log the offending sample and continue
                    print(index, input_words_with_s)
                    import traceback
                    traceback.print_exc()
            # average over the (nominal) batch size
            batch_size_array = model.xp.array(batch_size, dtype=model.xp.float32)
            # if gpu:
            #     batch_size_array = cuda.to_gpu(batch_size_array)
            batch_loss = batch_loss / Variable(batch_size_array)
            epoch_loss += batch_loss.data
            # timing
            forward_end_time = datetime.now()
            # optimization
            opt_start_time = datetime.now()
            model.zerograds()
            batch_loss.backward()
            opt.update()
            opt_end_time = datetime.now()
            forward_delta = forward_end_time - forward_start_time
            opt_delta = opt_end_time - opt_start_time
            print_fmt = ("epoch {} batch {}: "
                         "loss {}, grad L2 norm: {}, forward {}, optimizer {}")
            print(print_fmt.format(
                epoch,
                int(bat_i / batch_size),
                batch_loss.data,
                opt.compute_grads_norm(),
                forward_delta,
                opt_delta,
            ))
            # save rolling / timestamped checkpoints every 100 / 1000 batches
            if ((bat_i / batch_size) + 1) % 100 == 0:
                serializers.save_npz(os.path.join(model_dir, "model.npz"), model)
            if ((bat_i / batch_size) + 1) % 1000 == 0:
                serializers.save_npz(
                    os.path.join(
                        model_dir,
                        "model_{}_{}_{}.npz".format(
                            epoch,
                            int(bat_i / batch_size) + 1,
                            datetime.now().strftime("%Y%m%d-%H%M%S"))), model)
        # NOTE(review): divides by epoch_size rather than the number of
        # batches — compare train_encoder_decoder which divides by
        # ceil(data_size / batch_size); verify which was intended.
        print("finish epoch {}, loss {}".format(epoch, epoch_loss / epoch_size))
    # final save
    serializers.save_npz(os.path.join(model_dir, "model.npz"), model)
    serializers.save_npz(
        os.path.join(
            model_dir,
            "model_{}_{}_{}.npz".format(
                epoch,
                int(bat_i / batch_size) + 1,
                datetime.now().strftime("%Y%m%d-%H%M%S"))), model)
def main():
    """Train an RNN language model with SGD; track train/dev perplexity per
    epoch and report test perplexity at the end.

    NOTE(review): bp_len, grad_clip, xp, USE_GPU and forward/evaluate are
    module-level names defined elsewhere in this file.
    """
    args = parse_args()
    init_program_state(args)
    vocab = make_vocab()
    data, batched_data = load_data(args.train, vocab, args.batch_size)
    dev, batched_dev = load_data(args.dev, vocab, 1)
    test, batched_test = load_data(args.test, vocab, 1)
    model = init_model(input_size=len(vocab),
                       embed_size=args.embed_size,
                       hidden_size=args.hidden_size,
                       output_size=len(vocab))
    optimizer = optimizers.SGD(lr=args.lr)
    # Begin Training
    UF.init_model_parameters(model)
    model = UF.convert_to_GPU(USE_GPU, model)
    optimizer.setup(model)
    batchsize = args.batch_size
    epoch = args.epoch
    # zero-dimensional accumulator for the truncated-BPTT loss window
    accum_loss = Variable(xp.zeros((), dtype=np.float32))
    counter = 0
    # For each epoch..
    for ep in range(epoch):
        UF.trace("Training Epoch %d" % ep)
        total_tokens = 0
        log_ppl = 0.0
        # For each batch, do forward & backward computations
        for i, batch in enumerate(batched_data):
            loss, nwords = forward(model, batch)
            accum_loss += loss
            log_ppl += loss.data.reshape(())
            # Tracing...
            total_tokens += nwords
            # UF.trace(' %d/%d = %.5f' % (min(i*batchsize, len(data)), len(data), loss.data.reshape(())*batchsize))
            # Counting: truncated-BPTT update every bp_len batches
            if (counter+1) % bp_len == 0:
                optimizer.zero_grads()
                accum_loss.backward()
                accum_loss.unchain_backward()
                accum_loss = Variable(xp.zeros((), dtype=np.float32))
                optimizer.clip_grads(grad_clip)
                optimizer.update()
            counter += 1
        # Counting Perplexity
        log_ppl /= total_tokens
        UF.trace(" PPL (Train) = %.10f" % math.exp(UF.to_cpu(USE_GPU, log_ppl)))
        dev_ppl = evaluate(model, batched_dev)
        UF.trace(" PPL (Dev) = %.10f" % math.exp(UF.to_cpu(USE_GPU, dev_ppl)))
        # Reducing learning rate after epoch 6
        if ep > 6:
            optimizer.lr /= 1.2
            UF.trace("Reducing LR:", optimizer.lr)
    # Begin Testing
    UF.trace("Begin Testing...")
    test_ppl = evaluate(model, batched_test)
    UF.trace(" log(PPL) = %.10f" % test_ppl)
    UF.trace(" PPL = %.10f" % math.exp(UF.to_cpu(USE_GPU, test_ppl)))
def train_encoder_decoder(model,
                          dictionary: corpora.Dictionary,
                          conversation_file: str,
                          decoder_model_dir: str,
                          epoch_size: int = 100,
                          batch_size: int = 30,
                          dropout: bool = False,
                          gpu: bool = False) -> None:
    """Train the decoder of an encoder-decoder conversation model with Adam.

    Each sample is an (original, reply) sentence pair: the encoder consumes
    the original, its final state seeds the decoder, and the decoder's loss
    over the reply is optimized.  Checkpoints go to decoder_model_dir.
    NOTE(review): gpu is annotated bool but compared with >= 0 below — it
    is actually a device id (-1 = CPU); confirm against callers.
    """
    if gpu >= 0:
        model.to_gpu()
    print(model.xp)
    # setup SGD optimizer
    # opt = optimizers.SGD()
    opt = optimizers.Adam()
    opt.setup(model)
    # optimizer hooks
    clip_threshold = 5.0
    print("set optimizer clip threshold: {}".format(clip_threshold))
    opt.add_hook(chainer.optimizer.GradientClipping(clip_threshold))
    # load conversation sentences
    conversation = load_conversation(conversation_file, dictionary)
    data_size = len(conversation)
    print("data size: {}".format(data_size))
    for epoch in range(epoch_size):
        print("running epoch {}".format(epoch))
        indexes = np.random.permutation(range(data_size))
        epoch_loss = 0  # int
        for bat_i in range(0, data_size, batch_size):
            forward_start_time = datetime.now()
            for index in indexes[bat_i:bat_i + batch_size]:
                pair_words = conversation[index]
                # encoder input words
                orig_words = pair_words[0][:-1]  # remove END_SYMBOL
                reply_words = pair_words[1]
                if orig_words:
                    assert orig_words[-1] is not config.END_SYMBOL
                input_words_with_s = tokens2ids(orig_words, dictionary)
                # run the encoder one token at a time; keep its final state
                ys, state = model.predictor.forward([
                    Variable(model.xp.array([word], dtype=model.xp.int32))
                    for word in input_words_with_s
                ], state=None, dropout=dropout, train=True)
                # decode
                assert reply_words[0] == config.END_SYMBOL
                assert reply_words[-1] == config.END_SYMBOL
                output_words_with_s = tokens2ids(reply_words, dictionary)
                # NOTE(review): batch_loss is re-created for every sample,
                # so only the last sample's loss is backpropagated and then
                # divided by batch_size below — looks unintended; compare
                # train_encoder which accumulates across the whole batch.
                batch_loss = Variable(model.xp.zeros((), dtype=np.float32))
                try:
                    new_loss = model(
                        output_words_with_s,
                        state=state,  # use the encoder state as init_state
                        dropout=dropout,
                        train=True)
                    batch_loss += new_loss
                except Exception:
                    # best-effort: log the offending sample and continue
                    print(index, input_words_with_s)
                    import traceback
                    traceback.print_exc()
            # average over the (nominal) batch size
            batch_size_array = model.xp.array(batch_size, dtype=model.xp.float32)
            batch_loss = batch_loss / Variable(batch_size_array)
            epoch_loss += batch_loss.data
            # timing
            forward_end_time = datetime.now()
            # optimization
            opt_start_time = datetime.now()
            model.zerograds()
            batch_loss.backward()
            opt.update()
            opt_end_time = datetime.now()
            forward_delta = forward_end_time - forward_start_time
            opt_delta = opt_end_time - opt_start_time
            # print(
            #     ("decoder epoch {} batch {}: loss {}, "
            #      "forward {}, optimizer {},").format(
            #         epoch,
            #         int(bat_i / batch_size),
            #         batch_loss.data,
            #         forward_delta,
            #         opt_delta,
            #     )
            # )
            print_fmt = ("epoch {} batch {}: "
                         "loss {}, grad L2 norm: {}, forward {}, optimizer {}")
            print(print_fmt.format(
                epoch,
                int(bat_i / batch_size),
                batch_loss.data,
                opt.compute_grads_norm(),
                forward_delta,
                opt_delta,
            ))
            # save rolling / timestamped checkpoints every 100 / 1000 batches
            if ((bat_i / batch_size) + 1) % 100 == 0:
                serializers.save_npz(
                    os.path.join(decoder_model_dir, "model.npz"), model)
            if ((bat_i / batch_size) + 1) % 1000 == 0:
                serializers.save_npz(
                    os.path.join(
                        decoder_model_dir,
                        "model_{}_{}_{}.npz".format(
                            epoch,
                            int(bat_i / batch_size) + 1,
                            datetime.now().strftime("%Y%m%d-%H%M%S"))), model)
        print("finish epoch {}, loss {}".format(
            epoch, epoch_loss / math.ceil(data_size / batch_size)))
    # final save
    serializers.save_npz(os.path.join(decoder_model_dir, "model.npz"), model)
    serializers.save_npz(
        os.path.join(
            decoder_model_dir,
            "model_{}_{}_{}.npz".format(
                epoch,
                int(bat_i / batch_size) + 1,
                datetime.now().strftime("%Y%m%d-%H%M%S"))), model)
def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None):
    """Neural style transfer: optimize img_gen so its network activations
    match img_orig (content) and its Gram matrices match img_style (style).

    Saves intermediate images every 50 iterations and the final image as
    'final_img.npz'.  Relies on module-level nn, args, xp, W, Clip.
    """
    # arrays which are in the neural net for the original image
    mid_orig = nn.forward(Variable(img_orig))
    # shape of style_mats is (4, ch, ch)
    style_mats = [get_matrix(y) for y in nn.forward(Variable(img_style))]
    # set the initial image
    if args.initial_img != 'nothing':
        img_gen = image_resize(args.initial_img, W)
    if img_gen is None:
        # start from random noise when no initial image is given
        img_gen = np.random.uniform(-20, 20,
                                    (1, 3, width, width)).astype(np.float32)
    # wrap the image as a learnable parameter link
    img_gen = chainer.links.Parameter(img_gen)
    optimizer = optimizers.Adam(alpha=lr)
    # optimize the img_gen
    optimizer.setup(img_gen)
    for i in range(max_iter):
        img_gen.zerograds()
        x = img_gen.W
        # y is the list of model activations whose input is img_gen
        y = nn.forward(x)
        L = Variable(xp.zeros((), dtype=np.float32))
        for l in range(len(y)):
            ch = y[l].data.shape[1]
            wd = y[l].data.shape[2]
            gogh_y = F.reshape(y[l], (ch, wd**2))
            # correlation (Gram) matrix of y
            gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True) / np.float32(
                ch * wd**2)
            # content term: difference between y and mid_orig;
            # deep layers may be important for figuring shapes of objects,
            # hence the (l + 1) * 0.3 depth weighting
            L1 = (l + 1) * 0.3 * np.float32(args.lam) * np.float32(
                nn.alpha[l]) * F.mean_squared_error(y[l],
                                                    Variable(mid_orig[l].data))
            # style term: difference between gogh_matrix and style_mats
            # <- these two mats are correlations
            L2 = np.float32(nn.beta[l]) * F.mean_squared_error(
                gogh_matrix, Variable(style_mats[l].data)) / np.float32(len(y))
            L += L1 + L2
            if i % 100 == 0:
                print(i, l, L1.data, L2.data)
        L.backward()
        # copy the image gradient onto the Parameter link before the update
        img_gen.W.grad = x.grad
        optimizer.update()
        tmp_shape = x.data.shape
        # clamp pixel values into the displayable range after each step
        if args.gpu >= 0:
            img_gen.W.data += Clip().forward(
                img_gen.W.data).reshape(tmp_shape) - img_gen.W.data
        else:
            def clip(x):
                return -120 if x < -120 else (136 if x > 136 else x)
            img_gen.W.data += np.vectorize(clip)(
                img_gen.W.data).reshape(tmp_shape) - img_gen.W.data
        if i % 50 == 0:
            save_image(img_gen.W.data, W, nw, nh, i)
        if int(i) == int(max_iter - 1):
            chainer.serializers.save_npz('final_img.npz', img_gen)
# --- train the RNN over filtered band batches, plotting the running loss ---
# NOTE(review): flat Python 2 script chunk; model, rnn, optimizer, state and
# the upper-case constants are defined elsewhere in the file.
pg0 = get_batch6.GET_BATCH6(band_num=BAND_BUNKATU0, seq_num=SEQUENCE_LEN0,
                            n_delay=NDELAY, npoint=NPOINT0, fs0=FS0, fc0=FC1,
                            gain0=GAIN1, q0=Q1)
a1 = pg0.a1  # coefficients exposed by the batch generator
b1 = pg0.b1
loss = Variable(np.zeros((), dtype=np.float32))
losses = []
NUMBER_ITERATION = 501
for i in range(NUMBER_ITERATION):
    x, y = pg0.get1()  # get train data
    loss, state = rnn.compute_loss(model, x, y, state)  # do one sequence while batch bands
    model.cleargrads()
    loss.backward()
    optimizer.update()
    losses.append(loss.data / (SEQUENCE_LEN0 * 1.0))  # total loss while one BAND_BUNKATU0
    state = rnn.make_initial_state(
        batchsize=BAND_BUNKATU0)  # clear for next batch-sequence-input
    if i % 20 == 0:
        # live loss curve on a log scale, refreshed every 20 iterations
        plt.plot(losses, "b")
        plt.yscale('log')
        plt.title('loss')
        plt.pause(1.0)
        print "loss.data (%06d)="%i, loss.data / (SEQUENCE_LEN0 * 1.0)
    ##if i%100==0:  # save model parameter in the directory model20 every 100
    ##    serializers.save_npz('model20/%06d_my.model.npz'%i, model)
def train(self, train_data, train_input, test_data, test_input, n_epochs,
          filename=None, KL_loss=False, Add_training=False):
    """Train self.model and validate once per epoch, keeping the best model.

    :param train_data: data in the form n_batches x batch_size x n_steps x n_outputs
    :param train_input: inputs with the same batch/step layout as train_data
    :param test_data: data in the form n_batches x batch_size x n_steps x n_outputs
    :param test_input: inputs for the validation pass
    :param n_epochs: nr of training epochs
    :param filename: subdirectory of saved_models/ used for checkpoints
    :param KL_loss: if True use self.KL_divergence instead of MSE
    :param Add_training: if True only self.model.slow is optimized
    :return: (train_loss per epoch, test_loss per epoch, per-batch losses)
    """
    # keep track of loss
    train_loss = np.zeros(n_epochs)
    test_loss = np.zeros(n_epochs)
    batches_loss = np.zeros(train_data.shape[0] * n_epochs)
    # keep track of learned alphas (time constants) across training
    # NOTE(review): "is not 'Static'" is an identity comparison, not
    # equality — it usually works via string interning but should be !=.
    if self.model.mode is not 'Static':
        learning_alphasS = np.empty(
            (n_epochs + 1, self.model.hidden.alphaS.alpha.size))
        learning_alphasR = np.empty(
            (n_epochs + 1, self.model.hidden.alphaR.alpha.size))
        learning_alphasS[0, :] = self.model.hidden.alphaS.alpha
        learning_alphasR[0, :] = self.model.hidden.alphaR.alpha
    # saved_U_fast = np.zeros((n_epochs,self.model.n_fast, self.model.n_slow+self.model.n_inout))
    # saved_U_inout= np.zeros((n_epochs, self.model.n_inout, self.model.n_fast))
    # saved_W_fast = np.zeros((n_epochs, self.model.n_fast, self.model.n_fast))
    # saved_W_inout= np.zeros((n_epochs, self.model.n_inout, self.model.n_inout))
    index = 0  # for batches_wise loss
    best_loss = 4000  # large sentinel; any real validation loss should beat it
    if Add_training:
        # re-setup the optimizer so only the slow subnetwork is updated
        self.optimizer.setup(self.model.slow)
        # self.model.inout.W.W.data = np.zeros((25,25))  # NO RECURRENT CONNECTION IN OUTPUT LAYER!
    for epoch in tqdm.tqdm(xrange(n_epochs)):
        # for epoch in xrange(n_epochs):
        with chainer.using_config('train', True):
            n_batches = train_data.shape[0]
            batch_size = train_data.shape[1]
            n_steps = train_data.shape[2]
            for i in range(n_batches):
                # print('Sample number %i' %i)
                loss = Variable(self.xp.array(0, 'float32'))
                self.model.reset_state()  # initialization for this batch
                data0 = Variable(train_data[i, :, 0, :])
                self.model.hidden.initialize_state(batch_size)
                # self.model.readout.initialize_state(batch_size)
                for t in xrange(0, n_steps, 1):
                    x = Variable(train_input[i, :, t, :])
                    data = self.xp.array(train_data[i, :, t, :])
                    _loss = mean_squared_error(self.model(x), data)  # prediction mode
                    if KL_loss:
                        _loss = self.KL_divergence(self.model(), data)
                    # print _loss
                    train_loss[epoch] += cuda.to_cpu(_loss.data)
                    loss += _loss
                batches_loss[index] = loss.data
                index = index + 1
                self.model.cleargrads()  # look into this function to clear grad of a whole link
                loss.backward()
                loss.unchain_backward()
                # self.model.inout.W.disable_update()  # NO RECURRENT CONNECTIONS IN OUTPUT LAYER!
                # if self.model.mode == 'Static':
                #     self.model.hidden.alphaS.disable_update()
                #     self.model.hidden.alphaR.disable_update()
                if Add_training:
                    # delete grads to be deleted! or use enable_update()
                    # freeze everything except the slow subnetwork
                    self.model.fast.U1.disable_update()
                    self.model.fast.W.disable_update()
                    self.model.inout.disable_update()
                    self.model.slow.W.disable_update()
                self.optimizer.update()
                # print 'UPDATE'
            # saved_U_fast[epoch,:,:] = self.model.fast.U.W.data
            # saved_U_inout[epoch,:,:]= self.model.inout.U.W.data
            # saved_W_fast[epoch,:,:] = self.model.fast.W.W.data
            # saved_W_inout[epoch, :,:]= self.model.inout.W.W.data
            #
            # save learning of time constants
            if self.model.mode is not 'Static':
                learning_alphasS[epoch + 1, :] = self.model.hidden.alphaS.alpha.data
                learning_alphasR[epoch + 1, :] = self.model.hidden.alphaR.alpha.data
            # compute loss per epoch
            train_loss[epoch] /= (n_batches * batch_size * self.model.n_out)
            # save model at some epoch
            # epochs_save = np.linspace(0, n_epochs-n_epochs/10, num=10, dtype=int)
            # if epoch in epochs_save:
            #     thisname = 'model_at_epoch_%i' %epoch
            #     self.save('saved_models/'+filename+'/'+thisname)
        # validation
        with chainer.using_config('train', False):
            n_batches = test_data.shape[0]
            batch_size = test_data.shape[1]
            n_steps = test_data.shape[2]
            # assert(n_steps == n_clamp+n_pred)
            for i in range(n_batches):
                self.model.reset_state()
                # NOTE(review): t here is the stale value left over from the
                # previous loop; the training branch uses step 0 — probably
                # 0 was intended here too.  Verify.
                data0 = Variable(test_data[i, :, t, :])
                self.model.hidden.initialize_state(batch_size)
                # self.model.readout.initialize_state(batch_size)
                for t in xrange(0, n_steps, 1):
                    x = Variable(test_input[i, :, t, :])
                    data = self.xp.array(test_data[i, :, t, :])
                    _loss = mean_squared_error(self.model(x), data)  # prediction mode
                    if KL_loss:
                        _loss = self.KL_divergence(self.model(), data)
                    test_loss[epoch] += cuda.to_cpu(_loss.data)
            # compute loss per epoch
            test_loss[epoch] /= (n_batches * batch_size * self.model.n_out)
        # method do avoid overfitting: keep the best-validation checkpoint
        if test_loss[epoch] < best_loss:
            best_loss = test_loss[epoch]
            self.save('saved_models/' + filename + '/best')
            np.save('saved_models/' + filename + '/conv_epoch', epoch)
    # end of training cycle
    np.save('saved_models/' + filename + '/best_loss', best_loss)
    if self.model.mode is not 'Static':
        np.save('saved_models/' + filename + '/learning_alphaS', learning_alphasS)
        np.save('saved_models/' + filename + '/learning_alphaR', learning_alphasR)
    # np.save('saved_U_fast', saved_U_fast)
    # np.save('saved_W_fast', saved_W_fast)
    # np.save('saved_U_inout', saved_U_inout)
    # np.save('saved_W_inout', saved_W_inout)
    # np.save('saved_models/'+filename+'/saved_alphas_fast', learning_alphas_fast)
    # np.save('saved_models/'+filename+'/saved_alphas_slow', learning_alphas_slow)
    # np.save('saved_models/'+filename+'/saved_alphas_inout', learning_alphas_inout)
    #
    return train_loss, test_loss, batches_loss
def train(self, words, steps, batchsize=100, sequence_length=10):
    """
    Train the Predictor's model on words for steps number of steps.

    words is a token sequence; the vocabulary is built on first use.
    batchsize rows read the corpus at staggered offsets, and every
    sequence_length steps a truncated-BPTT update is applied.
    """
    whole_len = len(words)
    train_data = np.ndarray(whole_len, dtype=np.int32)
    jumps = steps * sequence_length  # total forward steps
    # Initialize training data and maybe vocab.
    # NOTE(review): vocab_initializing is only bound when self.vocab is
    # None; with a pre-existing vocab the loop below would raise NameError
    # on its first iteration — confirm callers always start with vocab None.
    if self.vocab is None:
        vocab_initializing = True
        self.vocab = {}
    for i, word in enumerate(words):
        if vocab_initializing:
            if word not in self.vocab:
                self.vocab[word] = len(self.vocab)
        train_data[i] = self.vocab[word]
    vocab_initializing = False
    print 'corpus length:', len(words)
    print 'self.vocab size:', len(self.vocab)
    # Initialize base model (if we need to)
    if self.model is None:
        self.model = BaseRNN(len(self.vocab), self.units)
    if self.gpu >= 0:
        cuda.get_device(self.gpu).use()
        # NOTE(review): 'to_self.gpu' (also 'cuda.to_self.gpu' below) looks
        # like a broken mechanical rename of 'to_gpu' — verify this runs.
        self.model.to_self.gpu()
    optimizer = optimizers.RMSprop(lr=self.settings.learning_rate,
                                   alpha=self.settings.decay_rate, eps=1e-8)
    optimizer.setup(self.model)
    jumpsPerEpoch = whole_len / batchsize
    epoch = 0
    start_at = time.time()
    cur_at = start_at
    state = make_initial_state(self.units, batchsize=batchsize)
    # zero-dimensional accumulator for the truncated-BPTT loss window
    if self.gpu >= 0:
        accum_loss = Variable(cuda.zeros(()))
        for _, value in state.items():
            value.data = cuda.to_self.gpu(value.data)
    else:
        accum_loss = Variable(np.zeros((), dtype=np.float32))
    print 'going to train {} iterations'.format(steps)
    for i in xrange(jumps):
        # batch rows read the corpus at staggered offsets (jumpsPerEpoch * j)
        x_batch = np.array([train_data[(jumpsPerEpoch * j + i) % whole_len]
                            for j in xrange(batchsize)])
        y_batch = np.array([train_data[(jumpsPerEpoch * j + i + 1) % whole_len]
                            for j in xrange(batchsize)])
        if self.gpu >= 0:
            x_batch = cuda.to_self.gpu(x_batch)
            y_batch = cuda.to_self.gpu(y_batch)
        state, loss_i = self.model.forward_one_step(
            x_batch, y_batch, state, dropout_ratio=self.settings.dropout)
        accum_loss += loss_i
        if (i + 1) % sequence_length == 0:
            # Run truncated BPTT
            now = time.time()
            print '{}/{}, train_loss = {}, time = {:.2f}'.format(
                (i+1)/sequence_length, steps,
                accum_loss.data / sequence_length, now-cur_at)
            cur_at = now
            optimizer.zero_grads()
            accum_loss.backward()
            accum_loss.unchain_backward()  # truncate
            if self.gpu >= 0:
                accum_loss = Variable(cuda.zeros(()))
            else:
                accum_loss = Variable(np.zeros((), dtype=np.float32))
            optimizer.clip_grads(self.settings.grad_clip)
            optimizer.update()
        if (i + 1) % jumpsPerEpoch == 0:
            epoch += 1
            if epoch >= self.settings.learning_rate_decay_after:
                optimizer.lr *= self.settings.learning_rate_decay
                print 'decayed self.settings.learning rate by a factor {} to {}'.format(
                    self.settings.learning_rate_decay, optimizer.lr)
def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None):
    """Neural style transfer (Python 2 variant): iteratively optimize
    img_gen so its activations match img_orig (content) and its Gram
    matrices match img_style (style).  Saves an image every 3000 steps.
    Relies on module-level nn, args, xp, W, Clip.
    """
    # content activations of the original image (no backprop needed)
    mid_orig = nn.forward(Variable(img_orig, volatile=True))
    # Gram (correlation) matrices of the style image, one per layer
    style_mats = [
        get_matrix(y) for y in nn.forward(Variable(img_style, volatile=True))
    ]
    if img_gen is None:
        # start from random noise on the active device
        if args.gpu >= 0:
            img_gen = xp.random.uniform(-20, 20, (1, 3, width, width),
                                        dtype=np.float32)
        else:
            img_gen = np.random.uniform(-20, 20, (1, 3, width, width)).astype(
                np.float32)
    x = Variable(img_gen)
    xg = xp.zeros_like(x.data)  # manually-managed gradient buffer for the image
    optimizer = optimizers.Adam(alpha=lr)
    # NOTE(review): setup() receives an (array, grad) tuple, the old
    # chainer v1 "parameter tuple" interface — confirm the chainer version.
    optimizer.setup((img_gen, xg))
    for i in range(max_iter):
        x = Variable(img_gen)
        y = nn.forward(x)
        optimizer.zero_grads()
        L = Variable(xp.zeros((), dtype=np.float32))
        for l in range(len(y)):
            ch = y[l].data.shape[1]
            wd = y[l].data.shape[2]
            gogh_y = F.reshape(y[l], (ch, wd**2))
            # correlation (Gram) matrix of the generated image's activations
            gogh_matrix = F.matmul(gogh_y, gogh_y, transb=True) / np.float32(
                ch * wd**2)
            # content term: distance between y and mid_orig
            L1 = np.float32(args.lam) * np.float32(
                nn.alpha[l]) * F.mean_squared_error(y[l],
                                                    Variable(mid_orig[l].data))
            # style term: distance between the two correlation matrices
            L2 = np.float32(nn.beta[l]) * F.mean_squared_error(
                gogh_matrix, Variable(style_mats[l].data)) / np.float32(len(y))
            L += L1 + L2
            if i % 100 == 0:
                print i, l, L1.data, L2.data
        L.backward()
        xg += x.grad  # accumulate the image gradient for the optimizer
        optimizer.update()
        tmp_shape = img_gen.shape
        # clamp pixel values into the displayable range after each step
        if args.gpu >= 0:
            img_gen += Clip().forward(img_gen).reshape(tmp_shape) - img_gen
        else:
            def clip(x):
                return -120 if x < -120 else (136 if x > 136 else x)
            img_gen += np.vectorize(clip)(img_gen).reshape(tmp_shape) - img_gen
        if i % 3000 == 0:
            save_image(img_gen, W, nw, nh, i)
y_t = cuda.to_gpu(y_t) state, loss_i = model.forward_one_step(x_t, y_t, state, dropout_ratio=args.dropout) loss += loss_i now = time.time() end_time += now - cur_at iterations_count += 1 print 'loss_all=' + str(loss.data) print '{}, train_loss = {}, time = {:.4f}'.format( iterations_count, loss.data / (len(train_data[i % whole_len]) - 1), now - cur_at) cur_at = now optimizer.zero_grads() loss.backward() loss.unchain_backward() optimizer.clip_grads(grad_clip) optimizer.update() if (i + 1) == (whole_len * n_epochs): cuda.cupy.save('l1_x_W.npy', model.l1_x.W) cuda.cupy.save('l1_x_b.npy', model.l1_x.b) cuda.cupy.save('l1_h_W.npy', model.l1_h.W) cuda.cupy.save('l1_h_b.npy', model.l1_h.b) cuda.cupy.save('l6_W.npy', model.l6.W) cuda.cupy.save('l6_b.npy', model.l6.b) if ((i + 1) % whole_len) == 0: epoch += 1 train_loss_all.append(loss.data.get() / len(train_data[i % whole_len])) for k in xrange(whole_val_len): val_state = make_initial_state(n_units)
class RNNCharEstimator(ChainerClassifier):
    """Character-level RNN classifier (LSTM or IRNN) trained with
    truncated BPTT behind the generic ChainerClassifier interface."""

    def __init__(self, net_type='lstm', net_hidden=100, vocab_size=1000,
                 dropout_ratio=0.0, seq_size=70, grad_clip=100.0, **params):
        # net_type: 'lstm' or 'irnn'; net_hidden: hidden-unit count;
        # seq_size: BPTT truncation length; grad_clip: gradient clip threshold.
        ChainerClassifier.__init__(self, **params)
        self.net_hidden = net_hidden
        self.net_type = net_type
        self.vocab_size = vocab_size
        self.dropout_ratio = dropout_ratio
        self.seq_size = seq_size
        self.grad_clip = grad_clip
        self.param_names.append('vocab_size')
        self.param_names.append('net_type')
        self.param_names.append('net_hidden')
        self.param_names.append('dropout_ratio')

    def setup_network(self, n_features):
        # Build the concrete recurrent network; called by the base class.
        if self.net_type == 'lstm':
            self.network = CharLSTM(self.vocab_size, self.net_hidden,
                                    self.batch_size)
        elif self.net_type == 'irnn':
            self.network = CharIRNN(self.vocab_size, self.net_hidden,
                                    self.batch_size)
        else:
            error("Unknown net_type")
        self.reset_accum_loss()

    def reset_accum_loss(self):
        # Fresh zero-dimensional loss accumulator on the active device.
        if self.gpu >= 0:
            self.accum_loss = Variable(cuda.zeros(()))
        else:
            # FIX: explicit float32 to match cuda.zeros and the float32
            # losses accumulated into it (np.zeros defaults to float64).
            self.accum_loss = Variable(np.zeros((), dtype=np.float32))

    def forward_train(self, x, t):
        return self.network.train(x, t, dropout_ratio=self.dropout_ratio)

    def predict(self, x_data):
        # Predict one row at a time, concatenating the per-step outputs,
        # then take the argmax class per step.
        self.network.reset_state(1)
        if self.gpu >= 0:
            self.network.to_gpu()
            x_data = cuda.to_gpu(x_data)
        results = None
        for i in xrange(x_data.shape[0]):
            x = Variable(x_data[i, :])
            y = self.network.predict(x)
            # FIX: 'is None' instead of '== None' — once results is a NumPy
            # array, == yields an elementwise array, not a boolean.
            if results is None:
                results = cuda.to_cpu(y.data)
            else:
                results = np.concatenate([results, cuda.to_cpu(y.data)])
        results = results.argmax(1)
        return results

    def fit_update(self, loss, batch_id):
        # Accumulate per-step loss; every seq_size steps run truncated BPTT.
        self.accum_loss += loss
        if ((batch_id + 1) % self.seq_size) == 0:
            # Run Truncated BPTT
            self.optimizer.zero_grads()
            self.accum_loss.backward()
            self.accum_loss.unchain_backward()  # truncate
            self.optimizer.clip_grads(self.grad_clip)
            self.optimizer.update()
            self.reset_accum_loss()

    def make_batch(self, x_data, y_data, batch_id):
        # Stride through the data so batch row j advances by batch_num.
        batch_num = self.n_samples / self.batch_size
        x_batch = np.array([
            x_data[(batch_id + batch_num * j) % self.n_samples]
            for j in xrange(self.batch_size)
        ]).reshape(self.batch_size)
        y_batch = np.array([
            y_data[(batch_id + batch_num * j) % self.n_samples]
            for j in xrange(self.batch_size)
        ])
        return x_batch, y_batch
def train_encoder(
        model,
        dictionary: corpora.Dictionary,
        sentence_file: str,
        model_dir: str,
        epoch_size: int=100,
        batch_size: int=30,
        dropout: bool=True,
        gpu: bool=False
) -> None:
    """Train a sentence encoder language model with gradient-clipped SGD.

    Reads sentences from sentence_file, converts them to id sequences via
    dictionary, and periodically saves checkpoints into model_dir.
    NOTE(review): gpu is annotated bool but compared with >= 0 below — it
    is actually a device id (-1 = CPU); confirm against callers.
    """
    if gpu >= 0:
        model.to_gpu()
    print(model.xp)
    # setup SGD optimizer
    opt = optimizers.SGD()
    opt.setup(model)
    # optimizer hooks
    clip_threshold = 5.0
    print("set optimizer clip threshold: {}".format(clip_threshold))
    opt.add_hook(chainer.optimizer.GradientClipping(clip_threshold))
    # load conversation sentences
    sentences = load_sentence(sentence_file)
    data_size = len(sentences)
    print("data size: {}".format(data_size))
    for epoch in range(epoch_size):
        print("epoch {}".format(epoch))
        indexes = np.random.permutation(data_size)  # shuffle sample order
        epoch_loss = 0  # int
        for bat_i in range(0, data_size, batch_size):
            forward_start_time = datetime.now()
            batch_loss = Variable(model.xp.zeros((), dtype=model.xp.float32))
            for index in indexes[bat_i:bat_i + batch_size]:
                input_words = sentences[index]
                # convert to a list of ids
                input_words_with_s = tokens2ids(
                    input_words, dictionary, verbose=False
                )
                # forward pass
                try:
                    new_loss = model(
                        input_words_with_s,
                        dropout=dropout,
                        state=None,
                        train=True
                    )
                    # abort outright on NaN loss rather than corrupt weights
                    if model.xp.isnan(new_loss.data):
                        sys.exit(1)
                    batch_loss += new_loss
                except Exception:
                    # best-effort: log the offending sample and continue
                    print(index, input_words_with_s)
                    import traceback
                    traceback.print_exc()
            # average over the (nominal) batch size
            batch_size_array = model.xp.array(
                batch_size, dtype=model.xp.float32
            )
            # if gpu:
            #     batch_size_array = cuda.to_gpu(batch_size_array)
            batch_loss = batch_loss / Variable(batch_size_array)
            epoch_loss += batch_loss.data
            # timing
            forward_end_time = datetime.now()
            # optimization
            opt_start_time = datetime.now()
            model.zerograds()
            batch_loss.backward()
            opt.update()
            opt_end_time = datetime.now()
            forward_delta = forward_end_time - forward_start_time
            opt_delta = opt_end_time - opt_start_time
            print_fmt = (
                "epoch {} batch {}: "
                "loss {}, grad L2 norm: {}, forward {}, optimizer {}"
            )
            print(print_fmt.format(
                epoch,
                int(bat_i / batch_size),
                batch_loss.data,
                opt.compute_grads_norm(),
                forward_delta,
                opt_delta,
            ))
            # save rolling / timestamped checkpoints every 100 / 1000 batches
            if ((bat_i / batch_size) + 1) % 100 == 0:
                serializers.save_npz(
                    os.path.join(
                        model_dir, "model.npz"
                    ), model
                )
            if ((bat_i / batch_size) + 1) % 1000 == 0:
                serializers.save_npz(
                    os.path.join(
                        model_dir,
                        "model_{}_{}_{}.npz".format(
                            epoch,
                            int(bat_i / batch_size) + 1,
                            datetime.now().strftime("%Y%m%d-%H%M%S")
                        )
                    ), model
                )
        # NOTE(review): divides by epoch_size rather than the number of
        # batches — compare train_encoder_decoder which divides by
        # ceil(data_size / batch_size); verify which was intended.
        print("finish epoch {}, loss {}".format(
            epoch, epoch_loss / epoch_size
        ))
    # final save
    serializers.save_npz(
        os.path.join(
            model_dir, "model.npz"
        ), model
    )
    serializers.save_npz(
        os.path.join(
            model_dir,
            "model_{}_{}_{}.npz".format(
                epoch,
                int(bat_i / batch_size) + 1,
                datetime.now().strftime("%Y%m%d-%H%M%S")
            )
        ), model
    )
# --- minibatch SGD training over id-encoded sentences loaded from CSV ---
# NOTE(review): flat script chunk; model1/model2, forward(), BATCH_SIZE,
# SGD_Embedid and SGD are defined elsewhere in the file.
list_sentences = readcsv("./files/list_id20151207.csv")
list_sentences = [np.array(row, np.int32) for row in list_sentences]
opt1 = SGD_Embedid()  # stochastic gradient descent (embedding-id variant)
opt2 = SGD()  # stochastic gradient descent
opt1.setup(model1)  # initialize optimizer for model1
opt2.setup(model2)  # initialize optimizer for model2
opt1.tuples[0][1].fill(0)  # zero the first parameter's gradient buffer
opt2.zero_grads()
random.shuffle(list_sentences)
list_minibatch = []
for i, sentence in enumerate(list_sentences):
    list_minibatch.append(sentence)
    if len(list_minibatch) == BATCH_SIZE:
        # initial value of the accumulated loss
        accum_loss_total = Variable(np.zeros((), dtype=np.float32))
        # collects every word id seen in this minibatch
        # (note: starts with a dummy 0 entry from np.zeros)
        uniq_sentence = np.zeros((), np.int32)
        for batch_sentence in list_minibatch:
            accum_loss_total += forward(batch_sentence)  # compute loss
            uniq_sentence = np.append(uniq_sentence, batch_sentence)
        accum_loss_total.backward()  # backpropagation
        opt1.clip_grads(10)  # clip overly large gradients
        opt2.clip_grads(10)  # clip overly large gradients
        uniq_sentence = np.unique(uniq_sentence)
        # presumably updates/zeroes only the embedding rows actually touched
        # in this minibatch — TODO confirm against SGD_Embedid
        opt1.update(uniq_sentence)  # update parameters
        opt2.update()  # update parameters
        opt1.zero_grads(uniq_sentence)  # reset gradients
        opt2.zero_grads()  # reset gradients
        list_minibatch = []
    if i % 1000 == 999:  # stop after 1000 sentences
        break
def train_encoder_decoder(
        model,
        dictionary: corpora.Dictionary,
        conversation_file: str,
        decoder_model_dir: str,
        epoch_size: int=100,
        batch_size: int=30,
        dropout: bool=False,
        gpu: bool=False
) -> None:
    """Train the decoder of an encoder-decoder conversation model with Adam.

    Each sample is an (original, reply) sentence pair: the encoder consumes
    the original, its final state seeds the decoder, and the decoder's loss
    over the reply is optimized.  Checkpoints go to decoder_model_dir.
    NOTE(review): gpu is annotated bool but compared with >= 0 below — it
    is actually a device id (-1 = CPU); confirm against callers.
    """
    if gpu >= 0:
        model.to_gpu()
    print(model.xp)
    # setup SGD optimizer
    # opt = optimizers.SGD()
    opt = optimizers.Adam()
    opt.setup(model)
    # optimizer hooks
    clip_threshold = 5.0
    print("set optimizer clip threshold: {}".format(clip_threshold))
    opt.add_hook(chainer.optimizer.GradientClipping(clip_threshold))
    # load conversation sentences
    conversation = load_conversation(conversation_file, dictionary)
    data_size = len(conversation)
    print("data size: {}".format(data_size))
    for epoch in range(epoch_size):
        print("running epoch {}".format(epoch))
        indexes = np.random.permutation(range(data_size))
        epoch_loss = 0  # int
        for bat_i in range(0, data_size, batch_size):
            forward_start_time = datetime.now()
            for index in indexes[bat_i:bat_i + batch_size]:
                pair_words = conversation[index]
                # encoder input words
                orig_words = pair_words[0][:-1]  # remove END_SYMBOL
                reply_words = pair_words[1]
                if orig_words:
                    assert orig_words[-1] is not config.END_SYMBOL
                input_words_with_s = tokens2ids(orig_words, dictionary)
                # run the encoder one token at a time; keep its final state
                ys, state = model.predictor.forward(
                    [Variable(
                        model.xp.array(
                            [word],
                            dtype=model.xp.int32
                        )
                    ) for word in input_words_with_s],
                    state=None,
                    dropout=dropout,
                    train=True
                )
                # decode
                assert reply_words[0] == config.END_SYMBOL
                assert reply_words[-1] == config.END_SYMBOL
                output_words_with_s = tokens2ids(reply_words, dictionary)
                # NOTE(review): batch_loss is re-created for every sample,
                # so only the last sample's loss is backpropagated and then
                # divided by batch_size below — looks unintended; compare
                # train_encoder which accumulates across the whole batch.
                batch_loss = Variable(model.xp.zeros((), dtype=np.float32))
                try:
                    new_loss = model(
                        output_words_with_s,
                        state=state,  # use the encoder state as init_state
                        dropout=dropout,
                        train=True
                    )
                    batch_loss += new_loss
                except Exception:
                    # best-effort: log the offending sample and continue
                    print(index, input_words_with_s)
                    import traceback
                    traceback.print_exc()
            # average over the (nominal) batch size
            batch_size_array = model.xp.array(
                batch_size, dtype=model.xp.float32
            )
            batch_loss = batch_loss / Variable(batch_size_array)
            epoch_loss += batch_loss.data
            # timing
            forward_end_time = datetime.now()
            # optimization
            opt_start_time = datetime.now()
            model.zerograds()
            batch_loss.backward()
            opt.update()
            opt_end_time = datetime.now()
            forward_delta = forward_end_time - forward_start_time
            opt_delta = opt_end_time - opt_start_time
            # print(
            #     ("decoder epoch {} batch {}: loss {}, "
            #      "forward {}, optimizer {},").format(
            #         epoch,
            #         int(bat_i / batch_size),
            #         batch_loss.data,
            #         forward_delta,
            #         opt_delta,
            #     )
            # )
            print_fmt = (
                "epoch {} batch {}: "
                "loss {}, grad L2 norm: {}, forward {}, optimizer {}"
            )
            print(print_fmt.format(
                epoch,
                int(bat_i / batch_size),
                batch_loss.data,
                opt.compute_grads_norm(),
                forward_delta,
                opt_delta,
            ))
            # save rolling / timestamped checkpoints every 100 / 1000 batches
            if ((bat_i / batch_size) + 1) % 100 == 0:
                serializers.save_npz(
                    os.path.join(
                        decoder_model_dir, "model.npz"
                    ), model
                )
            if ((bat_i / batch_size) + 1) % 1000 == 0:
                serializers.save_npz(
                    os.path.join(
                        decoder_model_dir,
                        "model_{}_{}_{}.npz".format(
                            epoch,
                            int(bat_i / batch_size) + 1,
                            datetime.now().strftime("%Y%m%d-%H%M%S")
                        )
                    ), model
                )
        print("finish epoch {}, loss {}".format(
            epoch, epoch_loss / math.ceil(data_size / batch_size)
        ))
    # final save
    serializers.save_npz(
        os.path.join(
            decoder_model_dir, "model.npz"
        ), model
    )
    serializers.save_npz(
        os.path.join(
            decoder_model_dir,
            "model_{}_{}_{}.npz".format(
                epoch,
                int(bat_i / batch_size) + 1,
                datetime.now().strftime("%Y%m%d-%H%M%S")
            )
        ), model
    )
if args.gpu >=0: x_batch = cuda.to_gpu(x_batch) y_batch = cuda.to_gpu(y_batch) state, loss_i = model.forward_one_step(x_batch, y_batch, state, dropout_ratio=args.dropout) accum_loss += loss_i if (i + 1) % bprop_len == 0: # Run truncated BPTT now = time.time() print '{}/{}, train_loss = {}, time = {:.2f}'.format((i+1)/bprop_len, jump, accum_loss.data / bprop_len, now-cur_at) loss_file.write('{}\n'.format(accum_loss.data / bprop_len)) cur_at = now optimizer.zero_grads() accum_loss.backward() accum_loss.unchain_backward() # truncate if args.gpu >= 0: accum_loss = Variable(cuda.zeros(())) else: accum_loss = Variable(np.zeros(()).astype(np.float32)) optimizer.clip_grads(grad_clip) optimizer.update() if args.enable_checkpoint: if (i + 1) % 10000 == 0: fn = ('%s/charrnn_epoch_%.2f.chainermodel' % (args.checkpoint_dir, float(i)/jump)) pickle.dump(copy.deepcopy(model).to_cpu(), open(fn, 'wb')) if (i + 1) % jump == 0:
class RNNCharEstimator(ChainerClassifier):
    """Character-level RNN classifier (LSTM or IRNN backend) trained with
    truncated BPTT.

    Args:
        net_type: recurrent cell to use, ``'lstm'`` or ``'irnn'``.
        net_hidden: number of hidden units.
        vocab_size: size of the character vocabulary.
        dropout_ratio: dropout applied during training.
        seq_size: truncated-BPTT window length (in batches).
        grad_clip: gradient-clipping threshold.
        **params: forwarded to ``ChainerClassifier``.
    """

    def __init__(self, net_type='lstm', net_hidden=100, vocab_size=1000,
                 dropout_ratio=0.0, seq_size=70, grad_clip=100.0, **params):
        ChainerClassifier.__init__(self, **params)
        self.net_hidden = net_hidden
        self.net_type = net_type
        self.vocab_size = vocab_size
        self.dropout_ratio = dropout_ratio
        self.seq_size = seq_size
        self.grad_clip = grad_clip
        # Register hyper-parameters with the base class's bookkeeping.
        self.param_names.append('vocab_size')
        self.param_names.append('net_type')
        self.param_names.append('net_hidden')
        self.param_names.append('dropout_ratio')

    def setup_network(self, n_features):
        """Instantiate the recurrent network selected by ``net_type``.

        Raises:
            ValueError: if ``net_type`` is neither ``'lstm'`` nor ``'irnn'``.
        """
        if self.net_type == 'lstm':
            self.network = CharLSTM(self.vocab_size, self.net_hidden,
                                    self.batch_size)
        elif self.net_type == 'irnn':
            self.network = CharIRNN(self.vocab_size, self.net_hidden,
                                    self.batch_size)
        else:
            # Original called the undefined name ``error`` (a NameError at
            # runtime); raise an explicit, catchable exception instead.
            raise ValueError("Unknown net_type: %r" % (self.net_type,))
        self.reset_accum_loss()

    def reset_accum_loss(self):
        """Reset the truncated-BPTT loss accumulator to a scalar zero."""
        if self.gpu >= 0:
            self.accum_loss = Variable(cuda.zeros(()))
        else:
            # dtype must be float32: np.zeros defaults to float64, which
            # cannot be accumulated with Chainer's float32 losses.  This
            # matches the float32 accumulators used elsewhere in this file.
            self.accum_loss = Variable(np.zeros((), dtype=np.float32))

    def forward_train(self, x, t):
        """Run one training forward pass through the network."""
        return self.network.train(x, t, dropout_ratio=self.dropout_ratio)

    def predict(self, x_data):
        """Predict a class index for each row of ``x_data``.

        Steps the network one sample at a time (batch size 1), concatenates
        the per-step output scores, and returns their row-wise argmax.
        Returns an empty int array for empty input.
        """
        self.network.reset_state(1)
        if self.gpu >= 0:
            self.network.to_gpu()
            x_data = cuda.to_gpu(x_data)
        results = None
        for i in range(x_data.shape[0]):
            x = Variable(x_data[i, :])
            y = self.network.predict(x)
            # ``is None`` (not ``== None``): equality on numpy arrays is
            # element-wise and unreliable as a truth value.
            if results is None:
                results = cuda.to_cpu(y.data)
            else:
                results = np.concatenate([results, cuda.to_cpu(y.data)])
        if results is None:
            # Empty input: original would crash on ``None.argmax``.
            return np.zeros(0, dtype=np.int64)
        return results.argmax(1)

    def fit_update(self, loss, batch_id):
        """Accumulate ``loss``; every ``seq_size`` batches run truncated BPTT."""
        self.accum_loss += loss
        if ((batch_id + 1) % self.seq_size) == 0:
            # Truncated BPTT: backprop through the accumulated window, then
            # cut the graph so history does not grow without bound.
            self.optimizer.zero_grads()
            self.accum_loss.backward()
            self.accum_loss.unchain_backward()  # truncate history
            self.optimizer.clip_grads(self.grad_clip)
            self.optimizer.update()
            self.reset_accum_loss()

    def make_batch(self, x_data, y_data, batch_id):
        """Build the ``batch_id``-th minibatch by striding through the data.

        Sample ``j`` of the batch is taken from position
        ``(batch_id + batch_num * j) % n_samples`` so that each of the
        ``batch_size`` parallel streams walks a different region of the data.
        """
        # Floor division: a float stride (true ``/`` under Python 3 or
        # ``from __future__ import division``) would corrupt the indices.
        batch_num = self.n_samples // self.batch_size
        idx = [(batch_id + batch_num * j) % self.n_samples
               for j in range(self.batch_size)]
        x_batch = np.array([x_data[k] for k in idx]).reshape(self.batch_size)
        y_batch = np.array([y_data[k] for k in idx])
        return x_batch, y_batch
for j in range(batchsize)]) if args.gpu >=0: x_batch = cuda.to_gpu(x_batch) y_batch = cuda.to_gpu(y_batch) state, loss_i = model.forward_one_step(x_batch, y_batch, state, dropout_ratio=args.dropout) accum_loss += loss_i if (i + 1) % bprop_len == 0: # Run truncated BPTT now = time.time() print('{}/{}, train_loss = {}, time = {:.2f}'.format((i+1)/bprop_len, jump, accum_loss.data / bprop_len, now-cur_at)) cur_at = now optimizer.zero_grads() accum_loss.backward() accum_loss.unchain_backward() # truncate if args.gpu >= 0: accum_loss = Variable(cuda.zeros(())) else: accum_loss = Variable(np.zeros((), dtype=np.float32)) optimizer.clip_grads(grad_clip) optimizer.update() if (i + 1) % 10000 == 0: fn = ('%s/charrnn_epoch_%.2f.chainermodel' % (args.checkpoint_dir, float(i)/jump)) pickle.dump(copy.deepcopy(model).to_cpu(), open(fn, 'wb')) if (i + 1) % jump == 0: epoch += 1
# Minimal demo: a 4 -> 3 -> 2 two-layer linear network, run forward once and
# backpropagated once so that the layers and the input receive gradients.
l1 = L.Linear(4, 3)
l2 = L.Linear(3, 2)  # unit counts: 4 -> 3 -> 2


def my_forward(x):
    """Apply the two linear layers in sequence and return the output."""
    h = l1(x)
    return l2(h)


x = Variable(np.array([[1, 2, 3, 4], [4, 5, 6, 7]], dtype=np.float32))
y = my_forward(x)
# Seed the output gradient, then backprop FROM the output.  The original
# called ``x.backward()``, but ``x`` is an input (leaf) variable with no
# creator, so that call propagates nothing; ``y.backward()`` is what makes
# the seeded ``y.grad`` flow back through l2 and l1 to ``x``.
y.grad = np.ones((2, 2), dtype=np.float32)
y.backward()
print("x.data = ")
print(x.data)
print()
print("l1.W = ")
print(l1.W.data)
print("l1.b = ")
print(l1.b.data)
print()
print("l2.W = ")
print(l2.W.data)
print("l2.b = ")
print(l2.b.data)
print()