def optimizer(opt_str):
    """ Infer the optimizer from the input string """
    if opt_str.lower() == 'adam':
        opt = O.Adam(amsgrad=True)
    elif opt_str.lower() == 'ada_d':
        opt = O.AdaDelta()
    elif opt_str.lower() == 'ada_g':
        opt = O.AdaGrad()
    elif opt_str.lower() == 'm_sgd':
        opt = O.MomentumSGD()
    elif opt_str.lower() == 'n_ag':
        opt = O.NesterovAG()
    elif opt_str.lower() == 'rmsp':
        opt = O.RMSprop()
    elif opt_str.lower() == 'rmsp_g':
        opt = O.RMSpropGraves()
    elif opt_str.lower() == 'sgd':
        opt = O.SGD()
    elif opt_str.lower() == 'smorms':
        opt = O.SMORMS3()
    else:
        opt = O.Adam(amsgrad=True)
        logger.warning('{}->{}'.format(opt_str, opt.__doc__.split('.')[0]))

    logger.debug('Optimizer: {}'.format(opt.__doc__.split('.')[0]))
    return opt
def optimizer(opt_str):
    """ Infer the optimizer from the input string """
    if opt_str.lower() == 'adam':
        opt = O.Adam(amsgrad=True)
    elif opt_str.lower() == 'ada_d':
        opt = O.AdaDelta()
    elif opt_str.lower() == 'ada_g':
        opt = O.AdaGrad()
    elif opt_str.lower() == 'm_sgd':
        opt = O.MomentumSGD()
    elif opt_str.lower() == 'n_ag':
        opt = O.NesterovAG()
    elif opt_str.lower() == 'rmsp':
        opt = O.RMSprop()
    elif opt_str.lower() == 'rmsp_g':
        opt = O.RMSpropGraves()
    elif opt_str.lower() == 'sgd':
        opt = O.SGD()
    elif opt_str.lower() == 'smorms':
        opt = O.SMORMS3()
    else:
        opt = O.Adam(amsgrad=True)
        print('\n[Warning] {0}\n\t{1}->{2}\n'.format(
            fileFuncLine(), opt_str, opt.__doc__.split('.')[0]))

    print('Optimizer:', opt.__doc__.split('.')[0])
    return opt
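# A minimal usage sketch for the factory above (names here are illustrative
# assumptions, not from the source: `O` is bound to chainer.optimizers and
# `model` is any chainer.Link):
import chainer.links as L
from chainer import optimizers as O

model = L.Linear(10, 2)   # toy link standing in for a real network
opt = optimizer('adam')   # resolves to O.Adam(amsgrad=True)
opt.setup(model)          # bind the optimizer to the model's parameters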
def get_opt(args):
    if args.opt_model == "SGD":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        return optimizers.SGD(lr=alpha0)
    if args.opt_model == "AdaGrad":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        return optimizers.AdaGrad(lr=alpha0)
    if args.opt_model == "AdaDelta":
        alpha0 = 0.95 if args.alpha0 == 0 else args.alpha0
        alpha1 = 1e-06 if args.alpha1 == 0 else args.alpha1
        return optimizers.AdaDelta(rho=alpha0, eps=alpha1)
    if args.opt_model == "Momentum":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        alpha1 = 0.9 if args.alpha1 == 0 else args.alpha1
        return optimizers.MomentumSGD(lr=alpha0, momentum=alpha1)
    if args.opt_model == "NAG":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        alpha1 = 0.9 if args.alpha1 == 0 else args.alpha1
        return optimizers.NesterovAG(lr=alpha0, momentum=alpha1)
    if args.opt_model == "RMS":
        return optimizers.RMSpropGraves()
    if args.opt_model == "SM":
        return optimizers.SMORMS3()
    if args.opt_model == "Adam":  # default case
        alpha0 = 0.001 if args.alpha0 == 0 else args.alpha0
        alpha1 = 0.9 if args.alpha1 == 0 else args.alpha1
        alpha2 = 0.999 if args.alpha2 == 0 else args.alpha2
        alpha3 = 1e-08 if args.alpha3 == 0 else args.alpha3
        return optimizers.Adam(alpha=alpha0, beta1=alpha1, beta2=alpha2, eps=alpha3)
    print('no such optimization method', args.opt_model)
    sys.exit(1)
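# Hypothetical invocation of get_opt() above, with argparse.Namespace standing
# in for parsed CLI flags; by the sentinel convention, passing 0 for alphaN
# selects the default shown in each branch:
import argparse

args = argparse.Namespace(opt_model="Adam", alpha0=0, alpha1=0, alpha2=0, alpha3=0)
opt = get_opt(args)  # -> optimizers.Adam(alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-08)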
def get_optimizer(self, name, lr, momentum=0.9):
    if name.lower() == "adam":
        return optimizers.Adam(alpha=lr, beta1=momentum)
    if name.lower() == "smorms3":
        return optimizers.SMORMS3(lr=lr)
    if name.lower() == "adagrad":
        return optimizers.AdaGrad(lr=lr)
    if name.lower() == "adadelta":
        return optimizers.AdaDelta(rho=momentum)
    if name.lower() == "nesterov" or name.lower() == "nesterovag":
        return optimizers.NesterovAG(lr=lr, momentum=momentum)
    if name.lower() == "rmsprop":
        return optimizers.RMSprop(lr=lr, alpha=momentum)
    if name.lower() == "momentumsgd":
        # fixed: original read `mommentum=mommentum`, a typo that would raise
        # both a TypeError (unknown keyword) and a NameError (undefined name)
        return optimizers.MomentumSGD(lr=lr, momentum=momentum)
    if name.lower() == "sgd":
        return optimizers.SGD(lr=lr)
def get_optimizer(name):
    """
    :type name: str
    :rtype: chainer.Optimizer
    """
    if name == "adadelta":
        opt = optimizers.AdaDelta()
    elif name == "adagrad":
        opt = optimizers.AdaGrad()
    elif name == "adam":
        opt = optimizers.Adam()
    elif name == "rmsprop":
        opt = optimizers.RMSprop()
    elif name == "smorms3":
        opt = optimizers.SMORMS3()
    else:
        raise ValueError("Unknown optimizer_name=%s" % name)
    return opt
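# Usage sketch: mapped names return a fresh chainer.Optimizer, anything else
# raises (note that plain "sgd" is not in this table):
opt = get_optimizer("smorms3")   # -> optimizers.SMORMS3()
# get_optimizer("sgd")           # would raise ValueError: Unknown optimizer_name=sgd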
def setOptimizer(model, method, params):
    # `dict.has_key` is Python-2-only; `in` works everywhere
    learningRate = params['learningRate'] if 'learningRate' in params else 0.001
    alpha = params['alpha'] if 'alpha' in params else 0.001
    if method == 'adam':
        optimizer = optimizers.Adam(alpha=alpha)
    elif method == 'smorms3':
        optimizer = optimizers.SMORMS3(lr=learningRate)
    elif method == 'rmsprop':
        optimizer = optimizers.RMSprop(lr=learningRate)
    elif method == 'sgd':
        optimizer = optimizers.SGD(lr=learningRate)
    elif method == 'momentum':
        optimizer = optimizers.MomentumSGD(lr=learningRate)
    elif method == 'adagrad':
        optimizer = optimizers.AdaGrad(lr=learningRate)
    elif method == 'adadelta':
        optimizer = optimizers.AdaDelta()
    else:
        # without this branch an unknown method would hit an UnboundLocalError below
        raise ValueError("Unknown method: %s" % method)
    optimizer.setup(model)
    return optimizer
def create(self):
    return optimizers.SMORMS3(0.1)
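# Note: SMORMS3's first positional argument is the learning rate, so the call
# above is equivalent to the explicit keyword form:
#     optimizers.SMORMS3(lr=0.1)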
def main(args):
    gpu = args.gpu
    path_config = args.config
    mode = args.mode
    path_word2vec = args.word2vec
    curriculum = False if args.curriculum == 0 else True

    # Hyper parameters (const)
    MAX_EPOCH = 10000000000
    MAX_PATIENCE = 20
    EVAL = 10000
    if curriculum:
        LENGTH_LIMITS = [10, 20, 30, 40, 50]  # NOTE: experimental
    else:
        LENGTH_LIMITS = [50]

    config = utils.Config(path_config)

    # Preparation
    path_corpus_train = config.getpath("prep_corpus") + ".train"
    path_corpus_val = config.getpath("prep_corpus") + ".val"
    basename = "won.%s.%s" % (
        os.path.basename(path_corpus_train),
        os.path.splitext(os.path.basename(path_config))[0])
    path_snapshot = os.path.join(config.getpath("snapshot"), basename + ".model")
    path_snapshot_vectors = os.path.join(config.getpath("snapshot"), basename + ".vectors.txt")
    if mode == "train":
        path_log = os.path.join(config.getpath("log"), basename + ".log")
        utils.set_logger(path_log)
    elif mode == "evaluation":
        path_evaluation = os.path.join(config.getpath("evaluation"), basename + ".txt")
        utils.set_logger(path_evaluation)
    elif mode == "analysis":
        path_analysis = os.path.join(config.getpath("analysis"), basename)

    utils.logger.debug("[info] TRAINING CORPUS: %s" % path_corpus_train)
    utils.logger.debug("[info] VALIDATION CORPUS: %s" % path_corpus_val)
    utils.logger.debug("[info] CONFIG: %s" % path_config)
    utils.logger.debug("[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec)
    utils.logger.debug("[info] SNAPSHOT (MODEL): %s " % path_snapshot)
    utils.logger.debug("[info] SNAPSHOT (WORD EMBEDDINGS): %s " % path_snapshot_vectors)
    if mode == "train":
        utils.logger.debug("[info] LOG: %s" % path_log)
    elif mode == "evaluation":
        utils.logger.debug("[info] EVALUATION: %s" % path_evaluation)
    elif mode == "analysis":
        utils.logger.debug("[info] ANALYSIS: %s" % path_analysis)

    # Hyper parameters
    word_dim = config.getint("word_dim")
    state_dim = config.getint("state_dim")
    aggregation = config.getstr("aggregation")
    attention = config.getstr("attention")
    retrofitting = config.getbool("retrofitting")
    alpha = config.getfloat("alpha")
    scale = config.getfloat("scale")
    identity_penalty = config.getbool("identity_penalty")
    lmd = config.getfloat("lambda")
    grad_clip = config.getfloat("grad_clip")
    weight_decay = config.getfloat("weight_decay")
    batch_size = config.getint("batch_size")
    utils.logger.debug("[info] WORD DIM: %d" % word_dim)
    utils.logger.debug("[info] STATE DIM: %d" % state_dim)
    utils.logger.debug("[info] AGGREGATION METHOD: %s" % aggregation)
    utils.logger.debug("[info] ATTENTION METHOD: %s" % attention)
    utils.logger.debug("[info] RETROFITTING: %s" % retrofitting)
    utils.logger.debug("[info] ALPHA = %f" % alpha)
    utils.logger.debug("[info] SCALE: %f" % scale)
    utils.logger.debug("[info] IDENTITY PENALTY: %s" % identity_penalty)
    utils.logger.debug("[info] LAMBDA: %f" % lmd)
    utils.logger.debug("[info] GRADIENT CLIPPING: %f" % grad_clip)
    utils.logger.debug("[info] WEIGHT DECAY: %f" % weight_decay)
    utils.logger.debug("[info] BATCH SIZE: %d" % batch_size)

    if retrofitting:
        assert path_word2vec is not None

    # Data preparation
    corpus_train_list = [
        load_corpus(
            path_corpus_train,
            vocab=path_corpus_train + ".vocab",
            max_length=length_limit)
        for length_limit in LENGTH_LIMITS]
    corpus_val = load_corpus(
        path_corpus_val,
        vocab=corpus_train_list[0].vocab,
        max_length=LENGTH_LIMITS[-1])

    # Model preparation
    if (mode == "train") and (path_word2vec is not None):
        initialW_data = utils.load_word2vec_weight_matrix(
            path_word2vec, word_dim, corpus_train_list[0].vocab, scale)
    else:
        initialW_data = None
    cuda.get_device(gpu).use()
    model = models.WON(
        vocab_size=len(corpus_train_list[0].vocab),
        word_dim=word_dim,
        state_dim=state_dim,
        aggregation=aggregation,
        attention=attention,
        initialW=initialW_data,
        EOS_ID=corpus_train_list[0].vocab["<EOS>"])
    if mode != "train":
        serializers.load_npz(path_snapshot, model)
    model.to_gpu(gpu)

    # Training/Evaluation/Analysis
    if mode == "train":
        length_index = 0
        utils.logger.debug("[info] Evaluating on the validation set ...")
        loss, acc = evaluate(model, corpus_val, lmd, identity_penalty)
        utils.logger.debug("[validation] iter=0, epoch=0, max_length=%d, loss=%.03f, accuracy=%.2f%%" % \
                (LENGTH_LIMITS[length_index], loss, acc*100))
        for _ in np.random.randint(0, len(corpus_val), 10):
            s = corpus_val.random_sample()
            batch_sents = [s]
            batch_labels = make_labels(batch_sents)
            _, order_pred = model.forward(batch_sents, train=False)
            order_pred = [a[0] for a in order_pred]
            order_gold = batch_labels[0]
            s = [corpus_val.ivocab[w] for w in s]
            s_pred = utils.reorder(s, order_pred)
            s_gold = utils.reorder(s, order_gold)
            s_pred = " ".join(s_pred).encode("utf-8")
            s_gold = " ".join(s_gold).encode("utf-8")
            utils.logger.debug("[check] <Gold> %s" % s_gold)
            utils.logger.debug("[check] <Pred> %s" % s_pred)
            utils.logger.debug("[check] <Gold:order> %s" % order_gold)
            utils.logger.debug("[check] <Pred:order> %s" % order_pred)

        # training & validation
        opt = optimizers.SMORMS3()
        opt.setup(model)
        opt.add_hook(chainer.optimizer.GradientClipping(grad_clip))
        opt.add_hook(chainer.optimizer.WeightDecay(weight_decay))

        # best_acc = -1.0
        best_acc = acc
        patience = 0
        it = 0
        n_train = len(corpus_train_list[0])  # TODO
        finish_training = False
        for epoch in xrange(1, MAX_EPOCH+1):
            if finish_training:
                break
            for data_i in xrange(0, n_train, batch_size):
                if data_i + batch_size > n_train:
                    break
                # data preparation
                batch_sents = corpus_train_list[length_index].next_batch(size=batch_size)
                batch_labels = make_labels(batch_sents)
                # forward
                loss, acc = forward(model, batch_sents, batch_labels, lmd, identity_penalty, train=True)
                # TODO: BEGIN
                if retrofitting:
                    part_indices_data = np.asarray(list(
                        set([w for s_ in batch_sents for w in s_])))
                    part_initialW_data = initialW_data[part_indices_data]
                    part_indices = Variable(cuda.cupy.asarray(part_indices_data, dtype=np.int32), volatile=False)
                    part_initialW = Variable(cuda.cupy.asarray(part_initialW_data, dtype=np.float32), volatile=False)
                    loss_ret = frobenius_squared_error(model.embed(part_indices), part_initialW)
                else:
                    loss_ret = 0.0
                loss = loss + alpha * loss_ret
                # TODO: END
                # backward & update
                model.zerograds()
                loss.backward()
                loss.unchain_backward()
                opt.update()
                it += 1
                # log
                loss = float(cuda.to_cpu(loss.data))
                acc = float(cuda.to_cpu(acc.data))
                utils.logger.debug("[training] iter=%d, epoch=%d (%d/%d=%.03f%%), max_length=%d, loss=%.03f, accuracy=%.2f%%" % \
                        (it, epoch, data_i+batch_size, n_train,
                         float(data_i+batch_size)/n_train * 100,
                         LENGTH_LIMITS[length_index], loss, acc*100))
                if it % EVAL == 0:
                    # validation
                    utils.logger.debug("[info] Evaluating on the validation set ...")
                    loss, acc = evaluate(model, corpus_val, lmd, identity_penalty)
                    utils.logger.debug("[validation] iter=%d, epoch=%d, max_length=%d, loss=%.03f, accuracy=%.2f%%" % \
                            (it, epoch, LENGTH_LIMITS[length_index], loss, acc*100))
                    for _ in np.random.randint(0, len(corpus_val), 10):
                        s = corpus_val.random_sample()
                        batch_sents = [s]
                        batch_labels = make_labels(batch_sents)
                        _, order_pred = model.forward(batch_sents, train=False)
                        order_pred = [a[0] for a in order_pred]
                        order_gold = batch_labels[0]
                        s = [corpus_val.ivocab[w] for w in s]
                        s_pred = utils.reorder(s, order_pred)
                        s_gold = utils.reorder(s, order_gold)
                        s_pred = " ".join(s_pred).encode("utf-8")
                        s_gold = " ".join(s_gold).encode("utf-8")
                        utils.logger.debug("[check] <Gold> %s" % s_gold)
                        utils.logger.debug("[check] <Pred> %s" % s_pred)
                        utils.logger.debug("[check] <Gold:order> %s" % order_gold)
                        utils.logger.debug("[check] <Pred:order> %s" % order_pred)
                    if best_acc < acc:
                        # save
                        utils.logger.debug("[info] Best accuracy is updated: %.2f%% => %.2f%%" % (best_acc*100.0, acc*100.0))
                        best_acc = acc
                        patience = 0
                        serializers.save_npz(path_snapshot, model)
                        serializers.save_npz(path_snapshot + ".opt", opt)
                        save_word2vec(path_snapshot_vectors, extract_word2vec(model, corpus_train_list[length_index].vocab))
                        utils.logger.debug("[info] Saved.")
                    else:
                        patience += 1
                        utils.logger.debug("[info] Patience: %d (best accuracy: %.2f%%)" % (patience, best_acc*100.0))
                        if patience >= MAX_PATIENCE:
                            if curriculum and (length_index != len(LENGTH_LIMITS)-1):
                                length_index += 1
                                break
                            else:
                                utils.logger.debug("[info] Patience %d is over. Training finished." \
                                        % patience)
                                finish_training = True
                                break
    elif mode == "evaluation":
        pass
    elif mode == "analysis":
        utils.mkdir(path_analysis)
        f = open(os.path.join(path_analysis, "dump.txt"), "w")
        data_i = 0
        for s in pyprind.prog_bar(corpus_val):
            # NOTE: in analysis mode, process every sentence regardless of length
            batch_sents = [s]
            batch_labels = make_labels(batch_sents)
            _, order_pred = model.forward(batch_sents, train=False)
            order_pred = [a[0] for a in order_pred]
            order_gold = batch_labels[0]
            s = [corpus_val.ivocab[w] for w in s]
            s_pred = utils.reorder(s, order_pred)
            s_gold = utils.reorder(s, order_gold)
            s_pred = " ".join(s_pred).encode("utf-8")
            s_gold = " ".join(s_gold).encode("utf-8")
            f.write("[%d] <Gold> %s\n" % (data_i+1, s_gold))
            f.write("[%d] <Pred> %s\n" % (data_i+1, s_pred))
            f.write("[%d] <Gold:order> %s\n" % (data_i+1, order_gold))
            f.write("[%d] <Pred:order> %s\n" % (data_i+1, order_pred))
            data_i += 1
            f.flush()
        f.close()
        utils.logger.debug("[info] Done.")
    t1[done, :] = 0
    tt = r1 + (gamma * cp.max(t1, axis=1))
    t[a0[:, None] == cp.arange(t.shape[1])] = tt
    # foo[ind[:,None] == range(foo.shape[1])] = bar
    train = [(s0[i], t[i]) for i in range(t.shape[0])]
    return train


env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
n_actions = env.action_space.n

q_func = MLP(32, n_actions)
q_func.to_gpu(0)
model = QClassifier(q_func)
optimizer = optimizers.SMORMS3(1e-2)
optimizer.setup(model)
# optimizer.add_hook(chainer.optimizer.WeightDecay(0.001))
# optimizer.add_hook(chainer.optimizer.GradientNoise(0.001))

replay_buffer = []
s0_stack = cp.array([], dtype=cp.float32).reshape(0, state_size)
a0_stack = cp.array([], dtype=cp.int32)
r1_stack = cp.array([], dtype=cp.float32)
s1_stack = cp.array([], dtype=cp.float32).reshape(0, state_size)
done_stack = cp.array([], dtype=cp.bool_)

for i in range(1, TRAINING_EPISODES + 1):
    s0 = env.reset()
    s0 = cp.array(s0, dtype=DTYPE)
    r0 = 0
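# The scatter assignment t[a0[:, None] == cp.arange(t.shape[1])] = tt above is
# worth unpacking; a NumPy-only illustration with the same semantics (the
# broadcast comparison builds a one-hot boolean mask selecting column a0[i]
# in row i):
import numpy as np

t = np.zeros((3, 4))
a0 = np.array([1, 3, 0])        # one column index (action) per row
tt = np.array([10., 20., 30.])  # one target value per row
t[a0[:, None] == np.arange(t.shape[1])] = tt
# t is now [[0,10,0,0], [0,0,0,20], [30,0,0,0]]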
elif args.adagrad:
    optimizer = optimizers.AdaGrad()
elif args.amsgrad:
    optimizer = optimizers.AMSGrad()
elif args.amsbound:
    optimizer = optimizers.AMSBound()
elif args.correctedmomentsgd:
    optimizer = optimizers.CorrectedMomentumSGD()
elif args.nesterovag:
    optimizer = optimizers.NesterovAG()
elif args.msvag:
    optimizer = optimizers.MSVAG()
elif args.rmspropgraves:
    optimizer = optimizers.RMSpropGraves()
elif args.smorms3:
    optimizer = optimizers.SMORMS3()
else:
    optimizer = optimizers.AdaDelta()
optimizer.setup(net)

if args.lasso:
    # Sparsify with Lasso (L1) regularization
    from chainer.optimizer_hooks import Lasso
    for param in net.params():
        if param.name != 'b':
            param.update_rule.add_hook(Lasso(decay))
else:
    # Suppress overfitting with Ridge (L2) regularization
    from chainer.optimizer_hooks import WeightDecay
    for param in net.params():
        # loop body was truncated in the source; inferred by symmetry
        # with the Lasso branch above
        if param.name != 'b':
            param.update_rule.add_hook(WeightDecay(decay))
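# For contrast with the per-parameter hooks above: the same regularizer can be
# registered globally on the optimizer, which then applies it to every
# parameter, biases included (a sketch reusing the `optimizer` and `decay`
# names from the example):
from chainer.optimizer_hooks import WeightDecay
optimizer.add_hook(WeightDecay(decay))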
def main(gpu, path_corpus, path_config, path_word2vec):
    MAX_EPOCH = 50
    EVAL = 200
    MAX_LENGTH = 70

    config = utils.Config(path_config)
    model_name = config.getstr("model")
    word_dim = config.getint("word_dim")
    state_dim = config.getint("state_dim")
    grad_clip = config.getfloat("grad_clip")
    weight_decay = config.getfloat("weight_decay")
    batch_size = config.getint("batch_size")

    print "[info] CORPUS: %s" % path_corpus
    print "[info] CONFIG: %s" % path_config
    print "[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec
    print "[info] MODEL: %s" % model_name
    print "[info] WORD DIM: %d" % word_dim
    print "[info] STATE DIM: %d" % state_dim
    print "[info] GRADIENT CLIPPING: %f" % grad_clip
    print "[info] WEIGHT DECAY: %f" % weight_decay
    print "[info] BATCH SIZE: %d" % batch_size

    path_save_head = os.path.join(
        config.getpath("snapshot"),
        "rnnlm.%s.%s" % (os.path.basename(path_corpus),
                         os.path.splitext(os.path.basename(path_config))[0]))
    print "[info] SNAPSHOT: %s" % path_save_head

    sents_train, sents_val, vocab, ivocab = \
        utils.load_corpus(path_corpus=path_corpus, max_length=MAX_LENGTH)

    if path_word2vec is not None:
        word2vec = utils.load_word2vec(path_word2vec, word_dim)
        initialW = utils.create_word_embeddings(vocab, word2vec, dim=word_dim, scale=0.001)
    else:
        initialW = None

    cuda.get_device(gpu).use()
    if model_name == "rnn":
        model = models.RNN(vocab_size=len(vocab), word_dim=word_dim,
                           state_dim=state_dim, initialW=initialW,
                           EOS_ID=vocab["<EOS>"])
    elif model_name == "lstm":
        model = models.LSTM(vocab_size=len(vocab), word_dim=word_dim,
                            state_dim=state_dim, initialW=initialW,
                            EOS_ID=vocab["<EOS>"])
    elif model_name == "gru":
        model = models.GRU(vocab_size=len(vocab), word_dim=word_dim,
                           state_dim=state_dim, initialW=initialW,
                           EOS_ID=vocab["<EOS>"])
    elif model_name == "bd_lstm":
        model = models.BD_LSTM(vocab_size=len(vocab), word_dim=word_dim,
                               state_dim=state_dim, initialW=initialW,
                               EOS_ID=vocab["<EOS>"])
    else:
        print "[error] Unknown model name: %s" % model_name
        sys.exit(-1)
    model.to_gpu(gpu)

    opt = optimizers.SMORMS3()
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(grad_clip))
    opt.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    print "[info] Evaluating on the validation sentences ..."
    loss_data, acc_data = evaluate(model, model_name, sents_val, ivocab)
    perp = math.exp(loss_data)
    print "[validation] iter=0, epoch=0, perplexity=%f, accuracy=%.2f%%" \
        % (perp, acc_data*100)

    it = 0
    n_train = len(sents_train)
    vocab_size = model.vocab_size
    for epoch in xrange(1, MAX_EPOCH + 1):
        perm = np.random.permutation(n_train)
        for data_i in xrange(0, n_train, batch_size):
            if data_i + batch_size > n_train:
                break
            words = sents_train[perm[data_i:data_i + batch_size]]
            if model_name == "bd_lstm":
                xs, ms = utils.make_batch(words, train=True, tail=False, mask=True)
                ys = model.forward(xs=xs, ms=ms, train=True)
            else:
                xs = utils.make_batch(words, train=True, tail=False)
                ys = model.forward(ts=xs, train=True)
            ys = F.concat(ys, axis=0)
            ts = F.concat(xs, axis=0)
            ys = F.reshape(ys, (-1, vocab_size))  # (TN, |V|)
            ts = F.reshape(ts, (-1, ))  # (TN,)
            loss = F.softmax_cross_entropy(ys, ts)
            acc = F.accuracy(ys, ts, ignore_label=-1)
            model.zerograds()
            loss.backward()
            loss.unchain_backward()
            opt.update()
            it += 1
            loss_data = float(cuda.to_cpu(loss.data))
            perp = math.exp(loss_data)
            acc_data = float(cuda.to_cpu(acc.data))
            print "[training] iter=%d, epoch=%d (%d/%d=%.03f%%), perplexity=%f, accuracy=%.2f%%" \
                % (it, epoch, data_i+batch_size, n_train,
                   float(data_i+batch_size)/n_train*100, perp, acc_data*100)
            if it % EVAL == 0:
                print "[info] Evaluating on the validation sentences ..."
                loss_data, acc_data = evaluate(model, model_name, sents_val, ivocab)
                perp = math.exp(loss_data)
                print "[validation] iter=%d, epoch=%d, perplexity=%f, accuracy=%.2f%%" \
                    % (it, epoch, perp, acc_data*100)
                serializers.save_npz(
                    path_save_head + ".iter_%d.epoch_%d.model" % (it, epoch),
                    model)
                utils.save_word2vec(
                    path_save_head + ".iter_%d.epoch_%d.vectors.txt" % (it, epoch),
                    utils.extract_word2vec(model, vocab))
                print "[info] Saved."
    print "[info] Done."
def main(gpu, path_corpus, path_config, path_word2vec):
    MAX_EPOCH = 50
    EVAL = 200
    MAX_LENGTH = 70
    COUNTS_CACHE = "./cache/counts.pkl"

    config = utils.Config(path_config)
    word_dim = config.getint("word_dim")
    state_dim = config.getint("state_dim")
    grad_clip = config.getfloat("grad_clip")
    weight_decay = config.getfloat("weight_decay")
    batch_size = config.getint("batch_size")
    sample_size = config.getint("sample_size")

    print "[info] CORPUS: %s" % path_corpus
    print "[info] CONFIG: %s" % path_config
    print "[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec
    print "[info] WORD DIM: %d" % word_dim
    print "[info] STATE DIM: %d" % state_dim
    print "[info] GRADIENT CLIPPING: %f" % grad_clip
    print "[info] WEIGHT DECAY: %f" % weight_decay
    print "[info] BATCH SIZE: %d" % batch_size

    path_save_head = os.path.join(config.getpath("snapshot"), "rnnlm.%s.%s" % (
        os.path.basename(path_corpus),
        os.path.splitext(os.path.basename(path_config))[0]))
    print "[info] SNAPSHOT: %s" % path_save_head

    sents_train, sents_val, vocab, ivocab = \
        utils.load_corpus(path_corpus=path_corpus, max_length=MAX_LENGTH)

    # counts = None
    # print("[info] Load word counter")
    # if os.path.exists(COUNTS_CACHE):
    #     print("[info] Found cache of counter")
    #     counts = pickle.load(open(COUNTS_CACHE, "rb"))
    #     if len(counts) != len(vocab):
    #         counts = None
    # if counts is None:
    #     counts = Counter()
    #     for sent in list(sents_train) + list(sents_val):
    #         counts += Counter(sent)
    #     pickle.dump(counts, open(COUNTS_CACHE, "wb"))
    # cs = [counts[w] for w in range(len(counts))]

    if path_word2vec is not None:
        word2vec = utils.load_word2vec(path_word2vec, word_dim)
        initialW = utils.create_word_embeddings(vocab, word2vec, dim=word_dim, scale=0.001)
    else:
        initialW = None

    cuda.get_device(gpu).use()
    model = models.CXT_BLSTM(
        vocab_size=len(vocab),
        word_dim=word_dim,
        state_dim=state_dim,
        initialW=initialW,
        EOS_ID=vocab["<EOS>"])
    model.to_gpu(gpu)

    opt = optimizers.SMORMS3()
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(grad_clip))
    opt.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    # sampler = utils.RandomSampler(cs, sample_size)

    # print "[info] Evaluating on the validation sentences ..."
    # loss_data = evaluate(model, sents_val, ivocab, word_dim, sampler)
    # print "[validation] iter=0, epoch=0, loss=%f" \
    #     % (loss_data)

    it = 0
    n_train = len(sents_train)
    vocab_size = model.vocab_size
    for epoch in xrange(1, MAX_EPOCH+1):
        perm = np.random.permutation(n_train)
        for data_i in xrange(0, n_train, batch_size):
            if data_i + batch_size > n_train:
                break
            words = sents_train[perm[data_i:data_i+batch_size]]
            xs, ms = utils.make_batch(words, train=True, tail=False, mask=True)
            ys = model.forward(xs=xs, ms=ms, train=True)
            words_without_edge = [w[1:-1] for w in words]
            xs_without_edge, ms_without_edge = utils.make_batch(words_without_edge, train=True, tail=False, mask=True)
            masked_ys = []
            for y, m in zip(ys, ms_without_edge):
                m_ext = F.broadcast_to(F.reshape(m, (batch_size, 1)), (batch_size, vocab_size))
                masked_ys.append(y*m_ext)
            # ts = model.embed_words(xs_without_edge, ms_without_edge, train=True)  # exclude BOS and EOS
            # T  : maximum sentence length within the batch
            # N  : batch size
            # |D|: word_dim
            ys = F.concat(masked_ys, axis=0)  # (TN, |V|)
            ts = F.concat(xs_without_edge, axis=0)  # (TN,)
            ys = F.reshape(ys, (-1, vocab_size))  # (TN, |V|)
            ts = F.reshape(ts, (-1,))  # (TN,)
            loss = F.softmax_cross_entropy(ys, ts)
            acc = F.accuracy(ys, ts, ignore_label=-1)
            model.zerograds()
            loss.backward()
            loss.unchain_backward()
            opt.update()
            it += 1
            loss_data = float(cuda.to_cpu(loss.data))
            perp = math.exp(loss_data)
            acc_data = float(cuda.to_cpu(acc.data))
            print "[training] iter=%d, epoch=%d (%d/%d=%.03f%%), perplexity=%f, accuracy=%.2f%%" \
                % (it, epoch, data_i+batch_size, n_train,
                   float(data_i+batch_size)/n_train*100, perp, acc_data*100)
            if it % EVAL == 0:
                print "[info] Evaluating on the validation sentences ..."
                loss_data, acc_data = evaluate(model, sents_val, ivocab, word_dim)
                perp = math.exp(loss_data)
                print "[validation] iter=%d, epoch=%d, perplexity=%f, accuracy=%.2f%%" \
                    % (it, epoch, perp, acc_data*100)
                serializers.save_npz(path_save_head + ".iter_%d.epoch_%d.model" % (it, epoch), model)
                # utils.save_word2vec(path_save_head + ".iter_%d.epoch_%d.vectors.txt" % (it, epoch),
                #                     utils.extract_word2vec(model, vocab))
                print "[info] Saved."
    print "[info] Done."
                             jvi_order, device=gpu_id)
    elbo = model.ELBOObjective(encoder, decoder, zcount)
elif vae_type == "is":
    vae = model.ISObjective(encoder, decoder, zcount)
else:
    sys.exit("Unsupported VAE type (%s)." % vae_type)

lr = float(args['--lr'])
print "Using initial learning rate %f" % lr

opt_type = args['--opt']
if opt_type == "adam":
    opt = optimizers.Adam(alpha=lr)
    opt_elbo = optimizers.Adam(alpha=lr)
elif opt_type == "smorms3":
    opt = optimizers.SMORMS3(lr=lr)
    opt_elbo = optimizers.SMORMS3(lr=lr)
elif opt_type == "sgd":
    opt = optimizers.SGD(lr=lr)
    opt_elbo = optimizers.SGD(lr=lr)
else:
    sys.exit("Unsupported optimizer type (%s)." % opt_type)

opt.setup(vae)
opt.add_hook(chainer.optimizer.GradientClipping(4.0))
if elbo:
    opt_elbo.setup(elbo)
    opt_elbo.add_hook(chainer.optimizer.GradientClipping(4.0))

# Move to GPU